1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead, but unlike
186// the similar limit for operand ordering this is used less frequently, so the
187// impact of a higher value is less noticeable.
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194    cl::desc("The minimum number of loads that should be considered strided, "
195             "if the stride is > 1 or is a runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199    cl::desc("The maximum stride considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit on the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important thing to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
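/// e.g., i32 and float are accepted, while x86_fp80 is rejected.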
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand; for a Cmp, the type of the compare
262/// operands; and for an insertelement, the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
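/// e.g., for "store i32 %x, ptr %p" the returned type is i32, and for
/// "icmp eq i64 %a, %b" it is i64.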
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
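/// e.g., getWidenedType(i32, 4) is <4 x i32>; with a vector scalar type (as
/// used by REVEC), getWidenedType(<2 x i32>, 4) is <8 x i32>.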
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
288
289/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
290/// that forms a type which \p TTI splits into whole vector types during
291/// legalization.
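/// e.g., if \p Sz is 6 and the widened type legalizes into 2 parts, the result
/// is bit_ceil(divideCeil(6, 2)) * 2 == 8.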
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
302
303/// Returns the number of elements of the given type \p Ty, not greater than \p
304/// Sz, that forms a type which \p TTI splits into whole vector types during
305/// legalization.
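/// e.g., if \p Sz is 6 and the widened type legalizes into 2 parts, RegVF is
/// bit_ceil(divideCeil(6, 2)) == 4, so the result is (6 / 4) * 4 == 4.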
306static unsigned
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323  // The ShuffleBuilder implementation uses shufflevector to splat an "element".
324  // But the element has a different meaning for SLP (scalar) and REVEC
325  // (vector). We need to expand Mask into masks which shufflevector can use
326  // directly.
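  // e.g., with VecTyNumElements == 2, the scalar mask {1, 0} expands to the
  // vector mask {2, 3, 0, 1}.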
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
335
336/// \returns the number of groups of shufflevectors.
337/// A group has the following features:
338/// 1. All values in a group are shufflevectors.
339/// 2. The mask of each shufflevector is an extract-subvector mask.
340/// 3. Together, the shufflevector masks use all elements of the source.
341/// e.g., it is 1 group (%0)
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// it is 2 groups (%3 and %4)
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// it is 0 groups
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361 if (VL.empty())
362 return 0;
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
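/// e.g., getPartNumElems(11, 3) returns bit_ceil(divideCeil(11, 3)) == 4.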
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns the correct remaining number of elements, considering the total
461/// amount \p Size, the (power-of-2) number of elements in a single register
462/// \p PartNumElems and the current register (part) \p Part.
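/// e.g., with Size == 11 and PartNumElems == 4, parts 0 and 1 hold 4 elements
/// each and part 2 holds the remaining 3.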
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions).
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
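/// e.g., (sub x, y) whose only users are "icmp eq/ne (sub x, y), 0" comparisons
/// is treated as commutative, since x - y == 0 iff y - x == 0.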
540static bool isCommutative(Instruction *I, Value *ValWithUses,
541 bool IsCopyable = false) {
542 if (auto *Cmp = dyn_cast<CmpInst>(I))
543 return Cmp->isCommutative();
544 if (auto *BO = dyn_cast<BinaryOperator>(I))
545 return BO->isCommutative() ||
546 (BO->getOpcode() == Instruction::Sub &&
547 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
548 all_of(
549 ValWithUses->uses(),
550 [&](const Use &U) {
551 // Commutative, if icmp eq/ne sub, 0
552 CmpPredicate Pred;
553 if (match(U.getUser(),
554 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
555 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
556 return true;
557 // Commutative, if abs(sub nsw, true) or abs(sub, false).
558 ConstantInt *Flag;
559 auto *I = dyn_cast<BinaryOperator>(U.get());
560 return match(U.getUser(),
561 m_Intrinsic<Intrinsic::abs>(
562 m_Specific(U.get()), m_ConstantInt(Flag))) &&
563 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
564 Flag->isOne());
565 })) ||
566 (BO->getOpcode() == Instruction::FSub &&
567 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
568 all_of(ValWithUses->uses(), [](const Use &U) {
569 return match(U.getUser(),
570 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
571 }));
572 return I->isCommutative();
573}
574
575/// This is a helper function to check whether \p I is commutative.
576/// This is a convenience wrapper that calls the two-parameter version of
577/// isCommutative with the same instruction for both parameters. This is
578/// the common case where the instruction being checked for commutativity
579/// is the same as the instruction whose uses are analyzed for special
580/// patterns (see the two-parameter version above for details).
581/// \param I The instruction to check for commutativity
582/// \returns true if the instruction is commutative, false otherwise
583static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
584
585/// \returns the number of operands of \p I, considering commutativity. Returns 2
586/// for commutative intrinsics.
587/// \param I The instruction to check for commutativity
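/// e.g., for a call to a commutative intrinsic such as llvm.smax this returns 2
/// rather than the call's full operand count (which also includes the callee).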
590 // IntrinsicInst::isCommutative returns true if swapping the first "two"
591 // arguments to the intrinsic produces the same result.
592 constexpr unsigned IntrinsicNumOperands = 2;
593 return IntrinsicNumOperands;
594 }
595 return I->getNumOperands();
596}
597
598template <typename T>
599static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
600 unsigned Offset) {
601 static_assert(std::is_same_v<T, InsertElementInst> ||
602 std::is_same_v<T, ExtractElementInst>,
603 "unsupported T");
604 int Index = Offset;
605 if (const auto *IE = dyn_cast<T>(Inst)) {
606 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
607 if (!VT)
608 return std::nullopt;
609 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
610 if (!CI)
611 return std::nullopt;
612 if (CI->getValue().uge(VT->getNumElements()))
613 return std::nullopt;
614 Index *= VT->getNumElements();
615 Index += CI->getZExtValue();
616 return Index;
617 }
618 return std::nullopt;
619}
620
621/// \returns inserting or extracting index of InsertElement, ExtractElement or
622/// InsertValue instruction, using Offset as base offset for index.
623/// \returns std::nullopt if the index is not an immediate.
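/// e.g., for "insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2" the returned
/// index is 1 * 3 + 2 == 5.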
624static std::optional<unsigned> getElementIndex(const Value *Inst,
625 unsigned Offset = 0) {
626 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
627 return Index;
629 return Index;
630
631 int Index = Offset;
632
633 const auto *IV = dyn_cast<InsertValueInst>(Inst);
634 if (!IV)
635 return std::nullopt;
636
637 Type *CurrentType = IV->getType();
638 for (unsigned I : IV->indices()) {
639 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
640 Index *= ST->getNumElements();
641 CurrentType = ST->getElementType(I);
642 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
643 Index *= AT->getNumElements();
644 CurrentType = AT->getElementType();
645 } else {
646 return std::nullopt;
647 }
648 Index += I;
649 }
650 return Index;
651}
652
653/// \returns true if all of the values in \p VL use the same opcode.
654/// For comparison instructions, also checks if predicates match.
655/// PoisonValues are considered matching.
656/// Interchangeable instructions are not considered.
658 auto *It = find_if(VL, IsaPred<Instruction>);
659 if (It == VL.end())
660 return true;
661 Instruction *MainOp = cast<Instruction>(*It);
662 unsigned Opcode = MainOp->getOpcode();
663 bool IsCmpOp = isa<CmpInst>(MainOp);
664 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
666 return std::all_of(It, VL.end(), [&](Value *V) {
667 if (auto *CI = dyn_cast<CmpInst>(V))
668 return BasePred == CI->getPredicate();
669 if (auto *I = dyn_cast<Instruction>(V))
670 return I->getOpcode() == Opcode;
671 return isa<PoisonValue>(V);
672 });
673}
674
675namespace {
676/// Specifies the way the mask should be analyzed for undefs/poisonous elements
677/// in the shuffle mask.
678enum class UseMask {
679 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
680 ///< check for the mask elements for the first argument (mask
681 ///< indices are in range [0:VF)).
682 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
683 ///< for the mask elements for the second argument (mask indices
684 ///< are in range [VF:2*VF))
685 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
686 ///< future shuffle elements and mark them as ones as being used
687 ///< in future. Non-undef elements are considered as unused since
688 ///< they're already marked as used in the mask.
689};
690} // namespace
691
692/// Prepares a use bitset for the given mask either for the first argument or
693/// for the second.
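/// e.g., for VF == 4, Mask == {0, 5, poison, 1} and UseMask::FirstArg, bits 0
/// and 1 are cleared (those lanes of the first vector are consumed by the mask)
/// while bits 2 and 3 remain set.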
695 UseMask MaskArg) {
696 SmallBitVector UseMask(VF, true);
697 for (auto [Idx, Value] : enumerate(Mask)) {
698 if (Value == PoisonMaskElem) {
699 if (MaskArg == UseMask::UndefsAsMask)
700 UseMask.reset(Idx);
701 continue;
702 }
703 if (MaskArg == UseMask::FirstArg && Value < VF)
704 UseMask.reset(Value);
705 else if (MaskArg == UseMask::SecondArg && Value >= VF)
706 UseMask.reset(Value - VF);
707 }
708 return UseMask;
709}
710
711/// Checks if the given value is actually an undefined constant vector.
712/// Also, if the \p UseMask is not empty, tries to check if the non-masked
713/// elements actually mask the insertelement buildvector, if any.
714template <bool IsPoisonOnly = false>
716 const SmallBitVector &UseMask = {}) {
717 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
718 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
719 if (isa<T>(V))
720 return Res;
721 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
722 if (!VecTy)
723 return Res.reset();
724 auto *C = dyn_cast<Constant>(V);
725 if (!C) {
726 if (!UseMask.empty()) {
727 const Value *Base = V;
728 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
729 Base = II->getOperand(0);
730 if (isa<T>(II->getOperand(1)))
731 continue;
732 std::optional<unsigned> Idx = getElementIndex(II);
733 if (!Idx) {
734 Res.reset();
735 return Res;
736 }
737 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
738 Res.reset(*Idx);
739 }
740 // TODO: Add analysis for shuffles here too.
741 if (V == Base) {
742 Res.reset();
743 } else {
744 SmallBitVector SubMask(UseMask.size(), false);
745 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
746 }
747 } else {
748 Res.reset();
749 }
750 return Res;
751 }
752 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
753 if (Constant *Elem = C->getAggregateElement(I))
754 if (!isa<T>(Elem) &&
755 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
756 Res.reset(I);
757 }
758 return Res;
759}
760
761/// Checks if the vector of instructions can be represented as a shuffle, like:
762/// %x0 = extractelement <4 x i8> %x, i32 0
763/// %x3 = extractelement <4 x i8> %x, i32 3
764/// %y1 = extractelement <4 x i8> %y, i32 1
765/// %y2 = extractelement <4 x i8> %y, i32 2
766/// %x0x0 = mul i8 %x0, %x0
767/// %x3x3 = mul i8 %x3, %x3
768/// %y1y1 = mul i8 %y1, %y1
769/// %y2y2 = mul i8 %y2, %y2
770/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
771/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
772/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
773/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
774/// ret <4 x i8> %ins4
775/// can be transformed into:
776/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
777/// i32 6>
778/// %2 = mul <4 x i8> %1, %1
779/// ret <4 x i8> %2
780/// Mask will return the Shuffle Mask equivalent to the extracted elements.
781/// TODO: Can we split off and reuse the shuffle mask detection from
782/// ShuffleVectorInst/getShuffleCost?
783static std::optional<TargetTransformInfo::ShuffleKind>
785 AssumptionCache *AC) {
786 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
787 if (It == VL.end())
788 return std::nullopt;
789 unsigned Size =
790 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
791 auto *EI = dyn_cast<ExtractElementInst>(V);
792 if (!EI)
793 return S;
794 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
795 if (!VTy)
796 return S;
797 return std::max(S, VTy->getNumElements());
798 });
799
800 Value *Vec1 = nullptr;
801 Value *Vec2 = nullptr;
802 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
803 auto *EE = dyn_cast<ExtractElementInst>(V);
804 if (!EE)
805 return false;
806 Value *Vec = EE->getVectorOperand();
807 if (isa<UndefValue>(Vec))
808 return false;
809 return isGuaranteedNotToBePoison(Vec, AC);
810 });
811 enum ShuffleMode { Unknown, Select, Permute };
812 ShuffleMode CommonShuffleMode = Unknown;
813 Mask.assign(VL.size(), PoisonMaskElem);
814 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
815 // Undef can be represented as an undef element in a vector.
816 if (isa<UndefValue>(VL[I]))
817 continue;
818 auto *EI = cast<ExtractElementInst>(VL[I]);
819 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
820 return std::nullopt;
821 auto *Vec = EI->getVectorOperand();
822 // We can extractelement from undef or poison vector.
824 continue;
825 // All vector operands must have the same number of vector elements.
826 if (isa<UndefValue>(Vec)) {
827 Mask[I] = I;
828 } else {
829 if (isa<UndefValue>(EI->getIndexOperand()))
830 continue;
831 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
832 if (!Idx)
833 return std::nullopt;
834 // Undefined behavior if Idx is negative or >= Size.
835 if (Idx->getValue().uge(Size))
836 continue;
837 unsigned IntIdx = Idx->getValue().getZExtValue();
838 Mask[I] = IntIdx;
839 }
840 if (isUndefVector(Vec).all() && HasNonUndefVec)
841 continue;
842 // For correct shuffling we have to have at most 2 different vector operands
843 // in all extractelement instructions.
844 if (!Vec1 || Vec1 == Vec) {
845 Vec1 = Vec;
846 } else if (!Vec2 || Vec2 == Vec) {
847 Vec2 = Vec;
848 Mask[I] += Size;
849 } else {
850 return std::nullopt;
851 }
852 if (CommonShuffleMode == Permute)
853 continue;
854 // If the extract index is not the same as the operation number, it is a
855 // permutation.
856 if (Mask[I] % Size != I) {
857 CommonShuffleMode = Permute;
858 continue;
859 }
860 CommonShuffleMode = Select;
861 }
862 // If we're not crossing lanes in different vectors, consider it as blending.
863 if (CommonShuffleMode == Select && Vec2)
865 // If Vec2 was never used, we have a permutation of a single vector, otherwise
866 // we have permutation of 2 vectors.
869}
870
871/// \returns True if Extract{Value,Element} instruction extracts element Idx.
872static std::optional<unsigned> getExtractIndex(const Instruction *E) {
873 unsigned Opcode = E->getOpcode();
874 assert((Opcode == Instruction::ExtractElement ||
875 Opcode == Instruction::ExtractValue) &&
876 "Expected extractelement or extractvalue instruction.");
877 if (Opcode == Instruction::ExtractElement) {
878 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
879 if (!CI)
880 return std::nullopt;
881 return CI->getZExtValue();
882 }
883 auto *EI = cast<ExtractValueInst>(E);
884 if (EI->getNumIndices() != 1)
885 return std::nullopt;
886 return *EI->idx_begin();
887}
888
889/// Checks if the provided value does not require scheduling. It does not
890/// require scheduling if this is not an instruction or it is an instruction
891/// that does not read/write memory and all operands are either not instructions
892/// or phi nodes or instructions from different blocks.
893static bool areAllOperandsNonInsts(Value *V);
894/// Checks if the provided value does not require scheduling. It does not
895/// require scheduling if this is not an instruction or it is an instruction
896/// that does not read/write memory and all users are phi nodes or instructions
897/// from different blocks.
898static bool isUsedOutsideBlock(Value *V);
899/// Checks if the specified value does not require scheduling. It does not
900/// require scheduling if all operands and all users do not need to be scheduled
901/// in the current basic block.
902static bool doesNotNeedToBeScheduled(Value *V);
903
904/// \returns true if \p Opcode is allowed as part of the main/alternate
905/// instruction for SLP vectorization.
906///
907/// Example of unsupported opcode is SDIV that can potentially cause UB if the
908/// "shuffled out" lane would result in division by zero.
909static bool isValidForAlternation(unsigned Opcode) {
910 return !Instruction::isIntDivRem(Opcode);
911}
912
913namespace {
914
915/// Helper class that determines whether VL can use the same opcode.
916/// Alternate instructions are supported. In addition, it supports
917/// interchangeable instructions. An interchangeable instruction is an
918/// instruction that can be converted to another instruction with the same
919/// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
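/// e.g., for VL [x << 1, y * 3] the helper reports Mul as the common opcode,
/// with x << 1 rewritten as x * 2.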
920class BinOpSameOpcodeHelper {
921 using MaskType = std::uint_fast16_t;
922  /// SupportedOp must stay sorted because it is searched with binary_search.
923 constexpr static std::initializer_list<unsigned> SupportedOp = {
924 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
925 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
926 enum : MaskType {
927 ShlBIT = 0b1,
928 AShrBIT = 0b10,
929 MulBIT = 0b100,
930 AddBIT = 0b1000,
931 SubBIT = 0b10000,
932 AndBIT = 0b100000,
933 OrBIT = 0b1000000,
934 XorBIT = 0b10000000,
935 MainOpBIT = 0b100000000,
937 };
938  /// Return a non-null ConstantInt if either operand of I is a ConstantInt.
939  /// The second return value represents the operand position. We check the
940  /// right-hand side first (1). If the right-hand side is not a ConstantInt and
941  /// the instruction is neither Sub, Shl, nor AShr, we then check the left-hand
942  /// side (0).
943 static std::pair<ConstantInt *, unsigned>
944 isBinOpWithConstantInt(const Instruction *I) {
945 unsigned Opcode = I->getOpcode();
946 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
947 (void)SupportedOp;
948 auto *BinOp = cast<BinaryOperator>(I);
949 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
950 return {CI, 1};
951 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
952 Opcode == Instruction::AShr)
953 return {nullptr, 0};
954 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
955 return {CI, 0};
956 return {nullptr, 0};
957 }
958 struct InterchangeableInfo {
959 const Instruction *I = nullptr;
960    /// Each bit that is set represents an opcode that MainOp can be converted to.
961 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
962 MulBIT | AShrBIT | ShlBIT;
963 /// We cannot create an interchangeable instruction that does not exist in
964 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
965 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
966 /// 1]. SeenBefore is used to know what operations have been seen before.
967 MaskType SeenBefore = 0;
968 InterchangeableInfo(const Instruction *I) : I(I) {}
969    /// Returning false allows BinOpSameOpcodeHelper to find an alternate
970    /// instruction. Directly setting the mask would destroy the mask state,
971    /// preventing us from determining which instruction it should convert to.
972 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
973 if (Mask & InterchangeableMask) {
974 SeenBefore |= OpcodeInMaskForm;
975 Mask &= InterchangeableMask;
976 return true;
977 }
978 return false;
979 }
980 bool equal(unsigned Opcode) {
981 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
982 }
983 unsigned getOpcode() const {
984 MaskType Candidate = Mask & SeenBefore;
985 if (Candidate & MainOpBIT)
986 return I->getOpcode();
987 if (Candidate & ShlBIT)
988 return Instruction::Shl;
989 if (Candidate & AShrBIT)
990 return Instruction::AShr;
991 if (Candidate & MulBIT)
992 return Instruction::Mul;
993 if (Candidate & AddBIT)
994 return Instruction::Add;
995 if (Candidate & SubBIT)
996 return Instruction::Sub;
997 if (Candidate & AndBIT)
998 return Instruction::And;
999 if (Candidate & OrBIT)
1000 return Instruction::Or;
1001 if (Candidate & XorBIT)
1002 return Instruction::Xor;
1003 llvm_unreachable("Cannot find interchangeable instruction.");
1004 }
1005
1006 /// Return true if the instruction can be converted to \p Opcode.
1007 bool hasCandidateOpcode(unsigned Opcode) const {
1008 MaskType Candidate = Mask & SeenBefore;
1009 switch (Opcode) {
1010 case Instruction::Shl:
1011 return Candidate & ShlBIT;
1012 case Instruction::AShr:
1013 return Candidate & AShrBIT;
1014 case Instruction::Mul:
1015 return Candidate & MulBIT;
1016 case Instruction::Add:
1017 return Candidate & AddBIT;
1018 case Instruction::Sub:
1019 return Candidate & SubBIT;
1020 case Instruction::And:
1021 return Candidate & AndBIT;
1022 case Instruction::Or:
1023 return Candidate & OrBIT;
1024 case Instruction::Xor:
1025 return Candidate & XorBIT;
1026 case Instruction::LShr:
1027 case Instruction::FAdd:
1028 case Instruction::FSub:
1029 case Instruction::FMul:
1030 case Instruction::SDiv:
1031 case Instruction::UDiv:
1032 case Instruction::FDiv:
1033 case Instruction::SRem:
1034 case Instruction::URem:
1035 case Instruction::FRem:
1036 return false;
1037 default:
1038 break;
1039 }
1040 llvm_unreachable("Cannot find interchangeable instruction.");
1041 }
1042
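    /// Return the operands of I as if it used the opcode of \p To;
    /// e.g., converting x << 3 to Mul yields the operands {x, 8}.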
1043 SmallVector<Value *> getOperand(const Instruction *To) const {
1044 unsigned ToOpcode = To->getOpcode();
1045 unsigned FromOpcode = I->getOpcode();
1046 if (FromOpcode == ToOpcode)
1047 return SmallVector<Value *>(I->operands());
1048 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1049 auto [CI, Pos] = isBinOpWithConstantInt(I);
1050 const APInt &FromCIValue = CI->getValue();
1051 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1052 APInt ToCIValue;
1053 switch (FromOpcode) {
1054 case Instruction::Shl:
1055 if (ToOpcode == Instruction::Mul) {
1056 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1057 FromCIValue.getZExtValue());
1058 } else {
1059 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1060 ToCIValue = ToOpcode == Instruction::And
1061 ? APInt::getAllOnes(FromCIValueBitWidth)
1062 : APInt::getZero(FromCIValueBitWidth);
1063 }
1064 break;
1065 case Instruction::Mul:
1066 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1067 if (ToOpcode == Instruction::Shl) {
1068 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1069 } else {
1070 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1071 ToCIValue = ToOpcode == Instruction::And
1072 ? APInt::getAllOnes(FromCIValueBitWidth)
1073 : APInt::getZero(FromCIValueBitWidth);
1074 }
1075 break;
1076 case Instruction::Add:
1077 case Instruction::Sub:
1078 if (FromCIValue.isZero()) {
1079 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1080 } else {
1081 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1082 "Cannot convert the instruction.");
1083 ToCIValue = FromCIValue;
1084 ToCIValue.negate();
1085 }
1086 break;
1087 case Instruction::And:
1088 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1089 ToCIValue = ToOpcode == Instruction::Mul
1090 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1091 : APInt::getZero(FromCIValueBitWidth);
1092 break;
1093 default:
1094 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1095 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1096 break;
1097 }
1098 Value *LHS = I->getOperand(1 - Pos);
1099 Constant *RHS =
1100 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1101 // constant + x cannot be -constant - x
1102 // instead, it should be x - -constant
1103 if (Pos == 1 ||
1104 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1105 FromOpcode == Instruction::Xor) &&
1106 ToOpcode == Instruction::Sub))
1107 return SmallVector<Value *>({LHS, RHS});
1108 return SmallVector<Value *>({RHS, LHS});
1109 }
1110 };
1111 InterchangeableInfo MainOp;
1112 InterchangeableInfo AltOp;
1113 bool isValidForAlternation(const Instruction *I) const {
1114 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1115 ::isValidForAlternation(I->getOpcode());
1116 }
1117 bool initializeAltOp(const Instruction *I) {
1118 if (AltOp.I)
1119 return true;
1121 return false;
1122 AltOp.I = I;
1123 return true;
1124 }
1125
1126public:
1127 BinOpSameOpcodeHelper(const Instruction *MainOp,
1128 const Instruction *AltOp = nullptr)
1129 : MainOp(MainOp), AltOp(AltOp) {
1130 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1131 }
1132 bool add(const Instruction *I) {
1134 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1135 unsigned Opcode = I->getOpcode();
1136 MaskType OpcodeInMaskForm;
1137 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1138 switch (Opcode) {
1139 case Instruction::Shl:
1140 OpcodeInMaskForm = ShlBIT;
1141 break;
1142 case Instruction::AShr:
1143 OpcodeInMaskForm = AShrBIT;
1144 break;
1145 case Instruction::Mul:
1146 OpcodeInMaskForm = MulBIT;
1147 break;
1148 case Instruction::Add:
1149 OpcodeInMaskForm = AddBIT;
1150 break;
1151 case Instruction::Sub:
1152 OpcodeInMaskForm = SubBIT;
1153 break;
1154 case Instruction::And:
1155 OpcodeInMaskForm = AndBIT;
1156 break;
1157 case Instruction::Or:
1158 OpcodeInMaskForm = OrBIT;
1159 break;
1160 case Instruction::Xor:
1161 OpcodeInMaskForm = XorBIT;
1162 break;
1163 default:
1164 return MainOp.equal(Opcode) ||
1165 (initializeAltOp(I) && AltOp.equal(Opcode));
1166 }
1167 MaskType InterchangeableMask = OpcodeInMaskForm;
1168 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1169 if (CI) {
1170 constexpr MaskType CanBeAll =
1171 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1172 const APInt &CIValue = CI->getValue();
1173 switch (Opcode) {
1174 case Instruction::Shl:
1175 if (CIValue.ult(CIValue.getBitWidth()))
1176 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1177 break;
1178 case Instruction::Mul:
1179 if (CIValue.isOne()) {
1180 InterchangeableMask = CanBeAll;
1181 break;
1182 }
1183 if (CIValue.isPowerOf2())
1184 InterchangeableMask = MulBIT | ShlBIT;
1185 break;
1186 case Instruction::Add:
1187 case Instruction::Sub:
1188 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1189 break;
1190 case Instruction::And:
1191 if (CIValue.isAllOnes())
1192 InterchangeableMask = CanBeAll;
1193 break;
1194 case Instruction::Xor:
1195 if (CIValue.isZero())
1196 InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
1197 break;
1198 default:
1199 if (CIValue.isZero())
1200 InterchangeableMask = CanBeAll;
1201 break;
1202 }
1203 }
1204 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1205 (initializeAltOp(I) &&
1206 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1207 }
1208 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1209 /// Checks if the list of potential opcodes includes \p Opcode.
1210 bool hasCandidateOpcode(unsigned Opcode) const {
1211 return MainOp.hasCandidateOpcode(Opcode);
1212 }
1213 bool hasAltOp() const { return AltOp.I; }
1214 unsigned getAltOpcode() const {
1215 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1216 }
1217 SmallVector<Value *> getOperand(const Instruction *I) const {
1218 return MainOp.getOperand(I);
1219 }
1220};
1221
1222/// Main data required for vectorization of instructions.
1223class InstructionsState {
1224 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1225 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1226 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1227 /// isAltShuffle).
1228 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1229 /// from getMainAltOpsNoStateVL.
1230 /// For those InstructionsState that use alternate instructions, the resulting
1231 /// vectorized output ultimately comes from a shufflevector. For example,
1232 /// given a vector list (VL):
1233 /// VL[0] = add i32 a, e
1234 /// VL[1] = sub i32 b, f
1235 /// VL[2] = add i32 c, g
1236 /// VL[3] = sub i32 d, h
1237 /// The vectorized result would be:
1238 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1239 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1240 /// result = shufflevector <4 x i32> intermediated_0,
1241 /// <4 x i32> intermediated_1,
1242 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1243 /// Since shufflevector is used in the final result, when calculating the cost
1244 /// (getEntryCost), we must account for the usage of shufflevector in
1245 /// GetVectorCost.
1246 Instruction *MainOp = nullptr;
1247 Instruction *AltOp = nullptr;
1248  /// Whether the instruction state represents copyable instructions.
1249 bool HasCopyables = false;
1250
1251public:
1252 Instruction *getMainOp() const {
1253 assert(valid() && "InstructionsState is invalid.");
1254 return MainOp;
1255 }
1256
1257 Instruction *getAltOp() const {
1258 assert(valid() && "InstructionsState is invalid.");
1259 return AltOp;
1260 }
1261
1262 /// The main/alternate opcodes for the list of instructions.
1263 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1264
1265 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1266
1267 /// Some of the instructions in the list have alternate opcodes.
1268 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1269
1270 /// Checks if the instruction matches either the main or alternate opcode.
1271 /// \returns
1272  /// - MainOp if \p I matches MainOp's opcode directly or can be converted
1273  ///   to it
1274  /// - AltOp if \p I matches AltOp's opcode directly or can be converted to
1275  ///   it
1276  /// - nullptr if \p I cannot be matched or converted to either opcode
1277 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1278 assert(MainOp && "MainOp cannot be nullptr.");
1279 if (I->getOpcode() == MainOp->getOpcode())
1280 return MainOp;
1281 // Prefer AltOp instead of interchangeable instruction of MainOp.
1282 assert(AltOp && "AltOp cannot be nullptr.");
1283 if (I->getOpcode() == AltOp->getOpcode())
1284 return AltOp;
1285 if (!I->isBinaryOp())
1286 return nullptr;
1287 BinOpSameOpcodeHelper Converter(MainOp);
1288 if (!Converter.add(I) || !Converter.add(MainOp))
1289 return nullptr;
1290 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1291 BinOpSameOpcodeHelper AltConverter(AltOp);
1292 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1293 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1294 return AltOp;
1295 }
1296 if (Converter.hasAltOp() && !isAltShuffle())
1297 return nullptr;
1298 return Converter.hasAltOp() ? AltOp : MainOp;
1299 }
1300
1301 /// Checks if main/alt instructions are shift operations.
1302 bool isShiftOp() const {
1303 return getMainOp()->isShift() && getAltOp()->isShift();
1304 }
1305
1306 /// Checks if main/alt instructions are bitwise logic operations.
1307 bool isBitwiseLogicOp() const {
1308 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1309 }
1310
1311 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1312 bool isMulDivLikeOp() const {
1313 constexpr std::array<unsigned, 8> MulDiv = {
1314 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1315 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1316 Instruction::URem, Instruction::FRem};
1317 return is_contained(MulDiv, getOpcode()) &&
1318 is_contained(MulDiv, getAltOpcode());
1319 }
1320
1321 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1322 bool isAddSubLikeOp() const {
1323 constexpr std::array<unsigned, 4> AddSub = {
1324 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1325 Instruction::FSub};
1326 return is_contained(AddSub, getOpcode()) &&
1327 is_contained(AddSub, getAltOpcode());
1328 }
1329
1330 /// Checks if main/alt instructions are cmp operations.
1331 bool isCmpOp() const {
1332 return (getOpcode() == Instruction::ICmp ||
1333 getOpcode() == Instruction::FCmp) &&
1334 getAltOpcode() == getOpcode();
1335 }
1336
1337  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1338 bool valid() const { return MainOp && AltOp; }
1339
1340 explicit operator bool() const { return valid(); }
1341
1342 InstructionsState() = delete;
1343 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1344 bool HasCopyables = false)
1345 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1346 static InstructionsState invalid() { return {nullptr, nullptr}; }
1347
1348 /// Checks if the value is a copyable element.
1349 bool isCopyableElement(Value *V) const {
1350 assert(valid() && "InstructionsState is invalid.");
1351 if (!HasCopyables)
1352 return false;
1353 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1354 return false;
1355 auto *I = dyn_cast<Instruction>(V);
1356 if (!I)
1357 return !isa<PoisonValue>(V);
1358 if (I->getParent() != MainOp->getParent() &&
1361 return true;
1362 if (I->getOpcode() == MainOp->getOpcode())
1363 return false;
1364 if (!I->isBinaryOp())
1365 return true;
1366 BinOpSameOpcodeHelper Converter(MainOp);
1367 return !Converter.add(I) || !Converter.add(MainOp) ||
1368 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1369 }
1370
1371 /// Checks if the value is non-schedulable.
1372 bool isNonSchedulable(Value *V) const {
1373 assert(valid() && "InstructionsState is invalid.");
1374 auto *I = dyn_cast<Instruction>(V);
1375 if (!HasCopyables)
1376 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1378    // MainOp for copyables is always schedulable, to correctly identify
1379    // non-schedulable copyables.
1380 if (getMainOp() == V)
1381 return false;
1382 if (isCopyableElement(V)) {
1383 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1384 auto *I = dyn_cast<Instruction>(V);
1385 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1387                // If the copyable instruction comes after MainOp
1388                // (non-schedulable, but used in the block), we cannot vectorize
1389                // it, as it could generate a use before a def.
1390 !MainOp->comesBefore(I));
1391 };
1392
1393 return IsNonSchedulableCopyableElement(V);
1394 }
1395 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1397 }
1398
1399 /// Checks if the state represents copyable instructions.
1400 bool areInstructionsWithCopyableElements() const {
1401 assert(valid() && "InstructionsState is invalid.");
1402 return HasCopyables;
1403 }
1404};
1405
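/// Convert \p I into the form selected by the state \p S (its main or alternate
/// opcode), returning the selected instruction together with the operands \p I
/// would have under that opcode.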
1406std::pair<Instruction *, SmallVector<Value *>>
1407convertTo(Instruction *I, const InstructionsState &S) {
1408 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1409 assert(SelectedOp && "Cannot convert the instruction.");
1410 if (I->isBinaryOp()) {
1411 BinOpSameOpcodeHelper Converter(I);
1412 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1413 }
1414 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1415}
1416
1417} // end anonymous namespace
1418
1419static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1420 const TargetLibraryInfo &TLI);
1421
1422/// Find an instruction with a specific opcode in VL.
1423/// \param VL Array of values to search through. Must contain only Instructions
1424/// and PoisonValues.
1425/// \param Opcode The instruction opcode to search for
1426/// \returns
1427/// - The first instruction found with matching opcode
1428/// - nullptr if no matching instruction is found
1430 unsigned Opcode) {
1431 for (Value *V : VL) {
1432 if (isa<PoisonValue>(V))
1433 continue;
1434 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1435 auto *Inst = cast<Instruction>(V);
1436 if (Inst->getOpcode() == Opcode)
1437 return Inst;
1438 }
1439 return nullptr;
1440}
1441
1442/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1443/// compatible instructions or constants, or just some other regular values.
1444static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1445 Value *Op1, const TargetLibraryInfo &TLI) {
1446 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1447 (isConstant(BaseOp1) && isConstant(Op1)) ||
1448 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1449 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1450 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1451 getSameOpcode({BaseOp0, Op0}, TLI) ||
1452 getSameOpcode({BaseOp1, Op1}, TLI);
1453}
1454
1455/// \returns true if a compare instruction \p CI has similar "look" and
1456/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1457/// swapped, false otherwise.
1458static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1459 const TargetLibraryInfo &TLI) {
1460 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1461 "Assessing comparisons of different types?");
1462 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1463 CmpInst::Predicate Pred = CI->getPredicate();
1465
1466 Value *BaseOp0 = BaseCI->getOperand(0);
1467 Value *BaseOp1 = BaseCI->getOperand(1);
1468 Value *Op0 = CI->getOperand(0);
1469 Value *Op1 = CI->getOperand(1);
1470
1471 return (BasePred == Pred &&
1472 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1473 (BasePred == SwappedPred &&
1474 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1475}
1476
1477/// \returns an analysis of the instructions in \p VL, described as an
1478/// InstructionsState: the opcode under which we suppose the whole list
1479/// could be vectorized even if its structure is diverse.
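/// e.g., for VL [a + b, c - d, e + f, g - h] the returned state has an add
/// MainOp and a sub AltOp (an alternate shuffle).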
1480static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1481 const TargetLibraryInfo &TLI) {
1482 // Make sure these are all Instructions.
1484 return InstructionsState::invalid();
1485
1486 auto *It = find_if(VL, IsaPred<Instruction>);
1487 if (It == VL.end())
1488 return InstructionsState::invalid();
1489
1490 Instruction *MainOp = cast<Instruction>(*It);
1491 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1492 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1493 (VL.size() == 2 && InstCnt < 2))
1494 return InstructionsState::invalid();
1495
1496 bool IsCastOp = isa<CastInst>(MainOp);
1497 bool IsBinOp = isa<BinaryOperator>(MainOp);
1498 bool IsCmpOp = isa<CmpInst>(MainOp);
1499 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1501 Instruction *AltOp = MainOp;
1502 unsigned Opcode = MainOp->getOpcode();
1503 unsigned AltOpcode = Opcode;
1504
1505 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1506 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1507 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1508 UniquePreds.insert(BasePred);
1509 UniqueNonSwappedPreds.insert(BasePred);
1510 for (Value *V : VL) {
1511 auto *I = dyn_cast<CmpInst>(V);
1512 if (!I)
1513 return false;
1514 CmpInst::Predicate CurrentPred = I->getPredicate();
1515 CmpInst::Predicate SwappedCurrentPred =
1516 CmpInst::getSwappedPredicate(CurrentPred);
1517 UniqueNonSwappedPreds.insert(CurrentPred);
1518 if (!UniquePreds.contains(CurrentPred) &&
1519 !UniquePreds.contains(SwappedCurrentPred))
1520 UniquePreds.insert(CurrentPred);
1521 }
1522    // If the total number of predicates is > 2, but only 2 remain once swapped
1523    // predicates are treated as equivalent, consider the swappable predicates
1524    // as compatible opcodes rather than alternates.
1525 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1526 }();
1527 // Check for one alternate opcode from another BinaryOperator.
1528 // TODO - generalize to support all operators (types, calls etc.).
1529 Intrinsic::ID BaseID = 0;
1530 SmallVector<VFInfo> BaseMappings;
1531 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1532 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1533 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1534 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1535 return InstructionsState::invalid();
1536 }
1537 bool AnyPoison = InstCnt != VL.size();
1538 // Check MainOp too to be sure that it matches the requirements for the
1539 // instructions.
1540 for (Value *V : iterator_range(It, VL.end())) {
1541 auto *I = dyn_cast<Instruction>(V);
1542 if (!I)
1543 continue;
1544
1545 // Cannot combine poison and divisions.
1546 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1547 // intrinsics/functions only.
1548 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1549 return InstructionsState::invalid();
1550 unsigned InstOpcode = I->getOpcode();
1551 if (IsBinOp && isa<BinaryOperator>(I)) {
1552 if (BinOpHelper.add(I))
1553 continue;
1554 } else if (IsCastOp && isa<CastInst>(I)) {
1555 Value *Op0 = MainOp->getOperand(0);
1556 Type *Ty0 = Op0->getType();
1557 Value *Op1 = I->getOperand(0);
1558 Type *Ty1 = Op1->getType();
1559 if (Ty0 == Ty1) {
1560 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1561 continue;
1562 if (Opcode == AltOpcode) {
1563 assert(isValidForAlternation(Opcode) &&
1564 isValidForAlternation(InstOpcode) &&
1565 "Cast isn't safe for alternation, logic needs to be updated!");
1566 AltOpcode = InstOpcode;
1567 AltOp = I;
1568 continue;
1569 }
1570 }
1571 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1572 auto *BaseInst = cast<CmpInst>(MainOp);
1573 Type *Ty0 = BaseInst->getOperand(0)->getType();
1574 Type *Ty1 = Inst->getOperand(0)->getType();
1575 if (Ty0 == Ty1) {
1576 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1577 assert(InstOpcode == AltOpcode &&
1578 "Alternate instructions are only supported by BinaryOperator "
1579 "and CastInst.");
1580 // Check for compatible operands. If the corresponding operands are not
1581 // compatible - need to perform alternate vectorization.
1582 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1583 CmpInst::Predicate SwappedCurrentPred =
1584 CmpInst::getSwappedPredicate(CurrentPred);
1585
1586 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1587 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1588 continue;
1589
1590 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1591 continue;
1592 auto *AltInst = cast<CmpInst>(AltOp);
1593 if (MainOp != AltOp) {
1594 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1595 continue;
1596 } else if (BasePred != CurrentPred) {
1597 assert(
1598 isValidForAlternation(InstOpcode) &&
1599 "CmpInst isn't safe for alternation, logic needs to be updated!");
1600 AltOp = I;
1601 continue;
1602 }
1603 CmpInst::Predicate AltPred = AltInst->getPredicate();
1604 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1605 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1606 continue;
1607 }
1608 } else if (InstOpcode == Opcode) {
1609 assert(InstOpcode == AltOpcode &&
1610 "Alternate instructions are only supported by BinaryOperator and "
1611 "CastInst.");
1612 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1613 if (Gep->getNumOperands() != 2 ||
1614 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1615 return InstructionsState::invalid();
1616 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1618 return InstructionsState::invalid();
1619 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1620 auto *BaseLI = cast<LoadInst>(MainOp);
1621 if (!LI->isSimple() || !BaseLI->isSimple())
1622 return InstructionsState::invalid();
1623 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1624 auto *CallBase = cast<CallInst>(MainOp);
1625 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1626 return InstructionsState::invalid();
1627 if (Call->hasOperandBundles() &&
1629 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1630 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1631 CallBase->op_begin() +
1633 return InstructionsState::invalid();
1634 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1635 if (ID != BaseID)
1636 return InstructionsState::invalid();
1637 if (!ID) {
1638 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1639 if (Mappings.size() != BaseMappings.size() ||
1640 Mappings.front().ISA != BaseMappings.front().ISA ||
1641 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1642 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1643 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1644 Mappings.front().Shape.Parameters !=
1645 BaseMappings.front().Shape.Parameters)
1646 return InstructionsState::invalid();
1647 }
1648 }
1649 continue;
1650 }
1651 return InstructionsState::invalid();
1652 }
1653
1654 if (IsBinOp) {
1655 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1656 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1657 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1658 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1659 }
1660 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1661 "Incorrect implementation of allSameOpcode.");
1662 InstructionsState S(MainOp, AltOp);
1663 assert(all_of(VL,
1664 [&](Value *V) {
1665 return isa<PoisonValue>(V) ||
1666 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1667 }) &&
1668 "Invalid InstructionsState.");
1669 return S;
1670}
1671
1672/// \returns true if all of the values in \p VL have the same type or false
1673/// otherwise.
1674 static bool allSameType(ArrayRef<Value *> VL) {
1675 Type *Ty = VL.consume_front()->getType();
1676 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1677}
1678
1679 /// \returns True if the in-tree use also needs an extract. This refers to a
1680 /// possible scalar operand in a vectorized instruction.
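/// For example (illustrative): if \p UserInst is a vectorized store and
/// \p Scalar feeds its pointer operand, the address stays scalar and has to be
/// re-extracted from the vector; if \p Scalar is only the stored value, the
/// vectorized store consumes the vector directly and no extract is needed.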
1681static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1682 TargetLibraryInfo *TLI,
1683 const TargetTransformInfo *TTI) {
1684 if (!UserInst)
1685 return false;
1686 unsigned Opcode = UserInst->getOpcode();
1687 switch (Opcode) {
1688 case Instruction::Load: {
1689 LoadInst *LI = cast<LoadInst>(UserInst);
1690 return (LI->getPointerOperand() == Scalar);
1691 }
1692 case Instruction::Store: {
1693 StoreInst *SI = cast<StoreInst>(UserInst);
1694 return (SI->getPointerOperand() == Scalar);
1695 }
1696 case Instruction::Call: {
1697 CallInst *CI = cast<CallInst>(UserInst);
1698 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1699 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1700 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1701 Arg.value().get() == Scalar;
1702 });
1703 }
1704 default:
1705 return false;
1706 }
1707}
1708
1709 /// \returns the AA location that is being accessed by the instruction.
1710 static MemoryLocation getLocation(Instruction *I) {
1711 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1712 return MemoryLocation::get(SI);
1713 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1714 return MemoryLocation::get(LI);
1715 return MemoryLocation();
1716}
1717
1718/// \returns True if the instruction is not a volatile or atomic load/store.
1719static bool isSimple(Instruction *I) {
1720 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1721 return LI->isSimple();
1722 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1723 return SI->isSimple();
1724 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1725 return !MI->isVolatile();
1726 return true;
1727}
1728
1729/// Shuffles \p Mask in accordance with the given \p SubMask.
1730/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1731/// one but two input vectors.
1732static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1733 bool ExtendingManyInputs = false) {
1734 if (SubMask.empty())
1735 return;
1736 assert(
1737 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1738 // Check if input scalars were extended to match the size of other node.
1739 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1740 "SubMask with many inputs support must be larger than the mask.");
1741 if (Mask.empty()) {
1742 Mask.append(SubMask.begin(), SubMask.end());
1743 return;
1744 }
1745 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1746 int TermValue = std::min(Mask.size(), SubMask.size());
1747 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1748 if (SubMask[I] == PoisonMaskElem ||
1749 (!ExtendingManyInputs &&
1750 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1751 continue;
1752 NewMask[I] = Mask[SubMask[I]];
1753 }
1754 Mask.swap(NewMask);
1755}
1756
1757 /// Order may have elements assigned the special value (size), which is out of
1758 /// bounds. Such indices appear only at positions that correspond to undef
1759 /// values (see canReuseExtract for details) and are used to keep undef values
1760 /// from affecting the ordering of the operands.
1761 /// The first loop below simply finds all unused indices and the next loop nest
1762 /// then assigns these indices to the undef-value positions.
1763/// As an example below Order has two undef positions and they have assigned
1764/// values 3 and 7 respectively:
1765/// before: 6 9 5 4 9 2 1 0
1766/// after: 6 3 5 4 7 2 1 0
1768 const size_t Sz = Order.size();
1769 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1770 SmallBitVector MaskedIndices(Sz);
1771 for (unsigned I = 0; I < Sz; ++I) {
1772 if (Order[I] < Sz)
1773 UnusedIndices.reset(Order[I]);
1774 else
1775 MaskedIndices.set(I);
1776 }
1777 if (MaskedIndices.none())
1778 return;
1779 assert(UnusedIndices.count() == MaskedIndices.count() &&
1780 "Non-synced masked/available indices.");
1781 int Idx = UnusedIndices.find_first();
1782 int MIdx = MaskedIndices.find_first();
1783 while (MIdx >= 0) {
1784 assert(Idx >= 0 && "Indices must be synced.");
1785 Order[MIdx] = Idx;
1786 Idx = UnusedIndices.find_next(Idx);
1787 MIdx = MaskedIndices.find_next(MIdx);
1788 }
1789}
1790
1791/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1792/// Opcode1.
1794 unsigned Opcode0, unsigned Opcode1) {
1795 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1796 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1797 for (unsigned Lane : seq<unsigned>(VL.size())) {
1798 if (isa<PoisonValue>(VL[Lane]))
1799 continue;
1800 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1801 OpcodeMask.set(Lane * ScalarTyNumElements,
1802 Lane * ScalarTyNumElements + ScalarTyNumElements);
1803 }
1804 return OpcodeMask;
1805}
1806
1807/// Replicates the given \p Val \p VF times.
1809 unsigned VF) {
1810 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1811 "Expected scalar constants.");
1812 SmallVector<Constant *> NewVal(Val.size() * VF);
1813 for (auto [I, V] : enumerate(Val))
1814 std::fill_n(NewVal.begin() + I * VF, VF, V);
1815 return NewVal;
1816}
1817
1819 SmallVectorImpl<int> &Mask) {
1820 Mask.clear();
1821 const unsigned E = Indices.size();
1822 Mask.resize(E, PoisonMaskElem);
1823 for (unsigned I = 0; I < E; ++I)
1824 Mask[Indices[I]] = I;
1825}
1826
1827/// Reorders the list of scalars in accordance with the given \p Mask.
1829 ArrayRef<int> Mask) {
1830 assert(!Mask.empty() && "Expected non-empty mask.");
1831 SmallVector<Value *> Prev(Scalars.size(),
1832 PoisonValue::get(Scalars.front()->getType()));
1833 Prev.swap(Scalars);
1834 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1835 if (Mask[I] != PoisonMaskElem)
1836 Scalars[Mask[I]] = Prev[I];
1837}
1838
1839/// Checks if the provided value does not require scheduling. It does not
1840/// require scheduling if this is not an instruction or it is an instruction
1841 /// that does not read/write memory and all of its operands are either not
1842 /// instructions, or are phi nodes or instructions from other blocks.
1844 auto *I = dyn_cast<Instruction>(V);
1845 if (!I)
1846 return true;
1847 return !mayHaveNonDefUseDependency(*I) &&
1848 all_of(I->operands(), [I](Value *V) {
1849 auto *IO = dyn_cast<Instruction>(V);
1850 if (!IO)
1851 return true;
1852 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1853 });
1854}
1855
1856/// Checks if the provided value does not require scheduling. It does not
1857/// require scheduling if this is not an instruction or it is an instruction
1858/// that does not read/write memory and all users are phi nodes or instructions
1859 /// from other blocks.
1860static bool isUsedOutsideBlock(Value *V) {
1861 auto *I = dyn_cast<Instruction>(V);
1862 if (!I)
1863 return true;
1864 // Limits the number of uses to save compile time.
1865 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1866 all_of(I->users(), [I](User *U) {
1867 auto *IU = dyn_cast<Instruction>(U);
1868 if (!IU)
1869 return true;
1870 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1871 });
1872}
1873
1874/// Checks if the specified value does not require scheduling. It does not
1875/// require scheduling if all operands and all users do not need to be scheduled
1876/// in the current basic block.
1879}
1880
1881/// Checks if the specified array of instructions does not require scheduling.
1882 /// This is the case if, for each instruction, either its operands do not
1883 /// require scheduling or its users do not require scheduling because they are
1884 /// phis or live in other basic blocks.
1886 return !VL.empty() &&
1888}
1889
1890 /// Returns true if the widened type of \p Ty elements with size \p Sz
1891 /// represents a full vector type, i.e. adding an extra element results in
1892 /// extra parts upon type legalization.
1894 unsigned Sz) {
1895 if (Sz <= 1)
1896 return false;
1898 return false;
1899 if (has_single_bit(Sz))
1900 return true;
1901 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1902 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1903 Sz % NumParts == 0;
1904}
1905
1906 /// Returns the number of parts the type \p VecTy will be split into at the
1907 /// codegen phase. If the type is going to be scalarized or does not use whole
1908 /// registers, returns 1.
1909static unsigned
1911 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1912 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1913 if (NumParts == 0 || NumParts >= Limit)
1914 return 1;
1915 unsigned Sz = getNumElements(VecTy);
1916 if (NumParts >= Sz || Sz % NumParts != 0 ||
1917 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1918 return 1;
1919 return NumParts;
1920}
1921
1922/// Bottom Up SLP Vectorizer.
1924 class TreeEntry;
1925 class ScheduleEntity;
1926 class ScheduleData;
1927 class ScheduleCopyableData;
1928 class ScheduleBundle;
1931
1932 /// If we decide to generate a strided load / store, this struct contains all
1933 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1934 /// and analyzeConstantStrideCandidate. Note that the stride can be given either
1935 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1936 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1937 /// the element size of the FixedVectorType.
1938 struct StridedPtrInfo {
1939 Value *StrideVal = nullptr;
1940 const SCEV *StrideSCEV = nullptr;
1941 FixedVectorType *Ty = nullptr;
1942 };
1943 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1944
1945public:
1946 /// Tracks the state we can represent the loads in the given sequence.
1954
1961
1963 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1965 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1966 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1967 AC(AC), DB(DB), DL(DL), ORE(ORE),
1968 Builder(Se->getContext(), TargetFolder(*DL)) {
1969 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1970 // Use the vector register size specified by the target unless overridden
1971 // by a command-line option.
1972 // TODO: It would be better to limit the vectorization factor based on
1973 // data type rather than just register size. For example, x86 AVX has
1974 // 256-bit registers, but it does not support integer operations
1975 // at that width (that requires AVX2).
1976 if (MaxVectorRegSizeOption.getNumOccurrences())
1977 MaxVecRegSize = MaxVectorRegSizeOption;
1978 else
1979 MaxVecRegSize =
1980 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1981 .getFixedValue();
1982
1983 if (MinVectorRegSizeOption.getNumOccurrences())
1984 MinVecRegSize = MinVectorRegSizeOption;
1985 else
1986 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1987 }
1988
1989 /// Vectorize the tree that starts with the elements in \p VL.
1990 /// Returns the vectorized root.
1992
1993 /// Vectorize the tree but with the list of externally used values \p
1994 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1995 /// generated extractvalue instructions.
1997 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1998 Instruction *ReductionRoot = nullptr,
1999 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2000
2001 /// \returns the cost incurred by unwanted spills and fills, caused by
2002 /// holding live values over call sites.
2004
2005 /// \returns the vectorization cost of the subtree that starts at \p VL.
2006 /// A negative number means that this is profitable.
2007 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2008 InstructionCost ReductionCost = TTI::TCC_Free);
2009
2010 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2011 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2012 void buildTree(ArrayRef<Value *> Roots,
2013 const SmallDenseSet<Value *> &UserIgnoreLst);
2014
2015 /// Construct a vectorizable tree that starts at \p Roots.
2016 void buildTree(ArrayRef<Value *> Roots);
2017
2018 /// Return the scalars of the root node.
2020 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2021 return VectorizableTree.front()->Scalars;
2022 }
2023
2024 /// Returns the type/is-signed info for the root node in the graph without
2025 /// casting.
2026 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2027 const TreeEntry &Root = *VectorizableTree.front();
2028 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2029 !Root.Scalars.front()->getType()->isIntegerTy())
2030 return std::nullopt;
2031 auto It = MinBWs.find(&Root);
2032 if (It != MinBWs.end())
2033 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2034 It->second.first),
2035 It->second.second);
2036 if (Root.getOpcode() == Instruction::ZExt ||
2037 Root.getOpcode() == Instruction::SExt)
2038 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2039 Root.getOpcode() == Instruction::SExt);
2040 return std::nullopt;
2041 }
2042
2043 /// Checks if the root graph node can be emitted with narrower bitwidth at
2044 /// codegen and returns its signedness, if so.
2046 return MinBWs.at(VectorizableTree.front().get()).second;
2047 }
2048
2049 /// Returns the reduction type after minbitwidth analysis.
2051 if (ReductionBitWidth == 0 ||
2052 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2053 ReductionBitWidth >=
2054 DL->getTypeSizeInBits(
2055 VectorizableTree.front()->Scalars.front()->getType()))
2056 return getWidenedType(
2057 VectorizableTree.front()->Scalars.front()->getType(),
2058 VectorizableTree.front()->getVectorFactor());
2059 return getWidenedType(
2061 VectorizableTree.front()->Scalars.front()->getContext(),
2062 ReductionBitWidth),
2063 VectorizableTree.front()->getVectorFactor());
2064 }
2065
2066 /// Builds external uses of the vectorized scalars, i.e. the list of
2067 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2068 /// ExternallyUsedValues contains an additional list of external uses to handle
2069 /// vectorization of reductions.
2070 void
2071 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2072
2073 /// Transforms graph nodes to target specific representations, if profitable.
2074 void transformNodes();
2075
2076 /// Clear the internal data structures that are created by 'buildTree'.
2077 void deleteTree() {
2078 VectorizableTree.clear();
2079 ScalarToTreeEntries.clear();
2080 OperandsToTreeEntry.clear();
2081 ScalarsInSplitNodes.clear();
2082 MustGather.clear();
2083 NonScheduledFirst.clear();
2084 EntryToLastInstruction.clear();
2085 LastInstructionToPos.clear();
2086 LoadEntriesToVectorize.clear();
2087 IsGraphTransformMode = false;
2088 GatheredLoadsEntriesFirst.reset();
2089 CompressEntryToData.clear();
2090 ExternalUses.clear();
2091 ExternalUsesAsOriginalScalar.clear();
2092 ExternalUsesWithNonUsers.clear();
2093 for (auto &Iter : BlocksSchedules) {
2094 BlockScheduling *BS = Iter.second.get();
2095 BS->clear();
2096 }
2097 MinBWs.clear();
2098 ReductionBitWidth = 0;
2099 BaseGraphSize = 1;
2100 CastMaxMinBWSizes.reset();
2101 ExtraBitWidthNodes.clear();
2102 InstrElementSize.clear();
2103 UserIgnoreList = nullptr;
2104 PostponedGathers.clear();
2105 ValueToGatherNodes.clear();
2106 TreeEntryToStridedPtrInfoMap.clear();
2107 }
2108
2109 unsigned getTreeSize() const { return VectorizableTree.size(); }
2110
2111 /// Returns the base graph size, before any transformations.
2112 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2113
2114 /// Perform LICM and CSE on the newly generated gather sequences.
2116
2117 /// Does this non-empty order represent an identity order? Identity
2118 /// should be represented as an empty order, so this is used to
2119 /// decide if we can canonicalize a computed order. Undef elements
2120 /// (represented as size) are ignored.
2122 assert(!Order.empty() && "expected non-empty order");
2123 const unsigned Sz = Order.size();
2124 return all_of(enumerate(Order), [&](const auto &P) {
2125 return P.value() == P.index() || P.value() == Sz;
2126 });
2127 }
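// For example (illustrative): with Sz == 4, the order {0, 1, 4, 3} counts as
// an identity order, because the out-of-bounds value 4 marks an ignored
// (undef) element.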
2128
2129 /// Checks if the specified gather tree entry \p TE can be represented as a
2130 /// shuffled vector entry + (possibly) permutation with other gathers. It
2131 /// implements the checks only for possibly ordered scalars (Loads,
2132 /// ExtractElement, ExtractValue), which can be part of the graph.
2133 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2134 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2135 /// node might be ignored.
2136 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2137 bool TopToBottom,
2138 bool IgnoreReorder);
2139
2140 /// Sort loads into increasing pointers offsets to allow greater clustering.
2141 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2142
2143 /// Gets reordering data for the given tree entry. If the entry is vectorized
2144 /// - just return ReorderIndices, otherwise check if the scalars can be
2145 /// reordered and return the most optimal order.
2146 /// \return std::nullopt if ordering is not important, empty order, if
2147 /// identity order is important, or the actual order.
2148 /// \param TopToBottom If true, include the order of vectorized stores and
2149 /// insertelement nodes, otherwise skip them.
2150 /// \param IgnoreReorder true, if the root node order can be ignored.
2151 std::optional<OrdersType>
2152 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2153
2154 /// Checks if it is profitable to reorder the current tree.
2155 /// If the tree does not contain many profitable reorderable nodes, it is
2156 /// better to skip it to save compile time.
2157 bool isProfitableToReorder() const;
2158
2159 /// Reorders the current graph to the most profitable order starting from the
2160 /// root node to the leaf nodes. The best order is chosen only from the nodes
2161 /// of the same size (vectorization factor). Smaller nodes are considered
2162 /// parts of subgraph with smaller VF and they are reordered independently. We
2163 /// can make it because we still need to extend smaller nodes to the wider VF
2164 /// and we can merge reordering shuffles with the widening shuffles.
2165 void reorderTopToBottom();
2166
2167 /// Reorders the current graph to the most profitable order starting from
2168 /// leaves to the root. It allows rotating small subgraphs and reduces the
2169 /// number of reshuffles if the leaf nodes use the same order. In this case we
2170 /// can merge the orders and just shuffle the user node instead of shuffling
2171 /// its operands. Plus, even if the leaf nodes have different orders, it allows
2172 /// sinking the reordering in the graph closer to the root node and merging it
2173 /// later during analysis.
2174 void reorderBottomToTop(bool IgnoreReorder = false);
2175
2176 /// \return The vector element size in bits to use when vectorizing the
2177 /// expression tree ending at \p V. If V is a store, the size is the width of
2178 /// the stored value. Otherwise, the size is the width of the largest loaded
2179 /// value reaching V. This method is used by the vectorizer to calculate
2180 /// vectorization factors.
2181 unsigned getVectorElementSize(Value *V);
2182
2183 /// Compute the minimum type sizes required to represent the entries in a
2184 /// vectorizable tree.
2186
2187 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2188 unsigned getMaxVecRegSize() const {
2189 return MaxVecRegSize;
2190 }
2191
2192 // \returns minimum vector register size as set by cl::opt.
2193 unsigned getMinVecRegSize() const {
2194 return MinVecRegSize;
2195 }
2196
2197 unsigned getMinVF(unsigned Sz) const {
2198 return std::max(2U, getMinVecRegSize() / Sz);
2199 }
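// For example (illustrative): with a 128-bit minimum vector register size and
// Sz == 32 (32-bit elements), the minimum VF is 128 / 32 == 4; the result is
// clamped below at 2 so a single-element "vector" is never requested.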
2200
2201 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2202 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2203 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2204 return MaxVF ? MaxVF : UINT_MAX;
2205 }
2206
2207 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2208 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2209 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2210 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2211 ///
2212 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2213 unsigned canMapToVector(Type *T) const;
2214
2215 /// \returns True if the VectorizableTree is both tiny and not fully
2216 /// vectorizable. We do not vectorize such trees.
2217 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2218
2219 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2220 /// It may happen, if all gather nodes are loads and they cannot be
2221 /// "clusterized". In this case even subgraphs cannot be vectorized more
2222 /// effectively than the base graph.
2223 bool isTreeNotExtendable() const;
2224
2225 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2226 /// can be load combined in the backend. Load combining may not be allowed in
2227 /// the IR optimizer, so we do not want to alter the pattern. For example,
2228 /// partially transforming a scalar bswap() pattern into vector code is
2229 /// effectively impossible for the backend to undo.
2230 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2231 /// may not be necessary.
2232 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2233
2234 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2235 /// can be load combined in the backend. Load combining may not be allowed in
2236 /// the IR optimizer, so we do not want to alter the pattern. For example,
2237 /// partially transforming a scalar bswap() pattern into vector code is
2238 /// effectively impossible for the backend to undo.
2239 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2240 /// may not be necessary.
2241 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2242 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2243 Align Alignment, const int64_t Diff,
2244 const size_t Sz) const;
2245
2246 /// Return true if an array of scalar loads can be replaced with a strided
2247 /// load (with constant stride).
2248 ///
2249 /// It is possible that the load gets "widened". Suppose that originally each
2250 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2251 /// constant):
/// %b + 0 * %s + 0
/// %b + 0 * %s + 1
/// %b + 0 * %s + 2
2252 /// ...
2253 /// %b + 0 * %s + (w - 1)
2254 ///
2255 /// %b + 1 * %s + 0
2256 /// %b + 1 * %s + 1
2257 /// %b + 1 * %s + 2
2258 /// ...
2259 /// %b + 1 * %s + (w - 1)
2260 /// ...
2261 ///
2262 /// %b + (n - 1) * %s + 0
2263 /// %b + (n - 1) * %s + 1
2264 /// %b + (n - 1) * %s + 2
2265 /// ...
2266 /// %b + (n - 1) * %s + (w - 1)
2267 ///
2268 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2269 ///
2270 /// \param PointerOps list of pointer arguments of loads.
2271 /// \param ElemTy original scalar type of loads.
2272 /// \param Alignment alignment of the first load.
2273 /// \param SortedIndices is the order of PointerOps as returned by
2274 /// `sortPtrAccesses`
2275 /// \param Diff Pointer difference between the lowest and the highest pointer
2276 /// in `PointerOps` as returned by `getPointersDiff`.
2277 /// \param Ptr0 first pointer in `PointerOps`.
2278 /// \param PtrN last pointer in `PointerOps`.
2279 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2280 /// of `SPtrInfo` necessary to generate the strided load later.
2282 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2283 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2284 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
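// A concrete instance of the widening described above (numbers illustrative
// only): with i32 loads (k == 4), w == 2 and n == 4, the pointers group as
// {%b, %b + 4}, {%b + %s, %b + %s + 4}, ..., {%b + 3 * %s, %b + 3 * %s + 4},
// and the whole sequence is covered by a single strided access of n == 4
// elements, each k * w == 8 bytes wide.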
2285
2286 /// Return true if an array of scalar loads can be replaced with a strided
2287 /// load (with run-time stride).
2288 /// \param PointerOps list of pointer arguments of loads.
2289 /// \param ScalarTy type of loads.
2290 /// \param CommonAlignment common alignment of loads as computed by
2291 /// `computeCommonAlignment<LoadInst>`.
2292 /// \param SortedIndices is a list of indices computed by this function such
2293 /// that the sequence `PointerOps[SortedIndices[0]],
2294 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2295 /// ordered by the coefficient of the stride. For example, if PointerOps is
2296 /// `%base + %stride, %base, %base + 2 * %stride` the `SortedIndices` will be
2297 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` would be
2298 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2299 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2300 /// of `SPtrInfo` necessary to generate the strided load later.
2301 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2302 Align CommonAlignment,
2303 SmallVectorImpl<unsigned> &SortedIndices,
2304 StridedPtrInfo &SPtrInfo) const;
2305
2306 /// Checks if the given array of loads can be represented as a vectorized,
2307 /// scatter or just simple gather.
2308 /// \param VL list of loads.
2309 /// \param VL0 main load value.
2310 /// \param Order returned order of load instructions.
2311 /// \param PointerOps returned list of pointer operands.
2312 /// \param BestVF return best vector factor, if recursive check found better
2313 /// vectorization sequences rather than masked gather.
2314 /// \param TryRecursiveCheck used to check if long masked gather can be
2315 /// represented as a series of loads/insert-subvector operations, if profitable.
2318 SmallVectorImpl<Value *> &PointerOps,
2319 StridedPtrInfo &SPtrInfo,
2320 unsigned *BestVF = nullptr,
2321 bool TryRecursiveCheck = true) const;
2322
2323 /// Registers non-vectorizable sequence of loads
2324 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2325 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2326 }
2327
2328 /// Checks if the given load sequence is known to be not vectorizable
2329 template <typename T>
2331 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2332 }
2333
2335
2336 /// This structure holds any data we need about the edges being traversed
2337 /// during buildTreeRec(). We keep track of:
2338 /// (i) the user TreeEntry index, and
2339 /// (ii) the index of the edge.
2340 struct EdgeInfo {
2341 EdgeInfo() = default;
2342 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2344 /// The user TreeEntry.
2345 TreeEntry *UserTE = nullptr;
2346 /// The operand index of the use.
2347 unsigned EdgeIdx = UINT_MAX;
2348#ifndef NDEBUG
2350 const BoUpSLP::EdgeInfo &EI) {
2351 EI.dump(OS);
2352 return OS;
2353 }
2354 /// Debug print.
2355 void dump(raw_ostream &OS) const {
2356 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2357 << " EdgeIdx:" << EdgeIdx << "}";
2358 }
2359 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2360#endif
2361 bool operator == (const EdgeInfo &Other) const {
2362 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2363 }
2364
2365 operator bool() const { return UserTE != nullptr; }
2366 };
2367 friend struct DenseMapInfo<EdgeInfo>;
2368
2369 /// A helper class used for scoring candidates for two consecutive lanes.
2371 const TargetLibraryInfo &TLI;
2372 const DataLayout &DL;
2373 ScalarEvolution &SE;
2374 const BoUpSLP &R;
2375 int NumLanes; // Total number of lanes (aka vectorization factor).
2376 int MaxLevel; // The maximum recursion depth for accumulating score.
2377
2378 public:
2380 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2381 int MaxLevel)
2382 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2383 MaxLevel(MaxLevel) {}
2384
2385 // The hard-coded scores listed here are not very important, though it shall
2386 // be higher for better matches to improve the resulting cost. When
2387 // computing the scores of matching one sub-tree with another, we are
2388 // basically counting the number of values that are matching. So even if all
2389 // scores are set to 1, we would still get a decent matching result.
2390 // However, sometimes we have to break ties. For example we may have to
2391 // choose between matching loads vs matching opcodes. This is what these
2392 // scores are helping us with: they provide the order of preference. Also,
2393 // this is important if the scalar is externally used or used in another
2394 // tree entry node in the different lane.
2395
2396 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2397 static const int ScoreConsecutiveLoads = 4;
2398 /// The same load multiple times. This should have a better score than
2399 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2400 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2401 /// for a vector load plus 1.0 for a broadcast.
2402 static const int ScoreSplatLoads = 3;
2403 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2404 static const int ScoreReversedLoads = 3;
2405 /// A load candidate for masked gather.
2406 static const int ScoreMaskedGatherCandidate = 1;
2407 /// ExtractElementInst from same vector and consecutive indexes.
2408 static const int ScoreConsecutiveExtracts = 4;
2409 /// ExtractElementInst from same vector and reversed indices.
2410 static const int ScoreReversedExtracts = 3;
2411 /// Constants.
2412 static const int ScoreConstants = 2;
2413 /// Instructions with the same opcode.
2414 static const int ScoreSameOpcode = 2;
2415 /// Instructions with alt opcodes (e.g, add + sub).
2416 static const int ScoreAltOpcodes = 1;
2417 /// Identical instructions (a.k.a. splat or broadcast).
2418 static const int ScoreSplat = 1;
2419 /// Matching with an undef is preferable to failing.
2420 static const int ScoreUndef = 1;
2421 /// Score for failing to find a decent match.
2422 static const int ScoreFail = 0;
2423 /// Score if all users are vectorized.
2424 static const int ScoreAllUserVectorized = 1;
2425
2426 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2427 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2428 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2429 /// MainAltOps.
2431 ArrayRef<Value *> MainAltOps) const {
2432 if (!isValidElementType(V1->getType()) ||
2435
2436 if (V1 == V2) {
2437 if (isa<LoadInst>(V1)) {
2438 // Returns true if the users of V1 and V2 won't need to be extracted.
2439 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2440 // Bail out if we have too many uses to save compilation time.
2441 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2442 return false;
2443
2444 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2445 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2446 return U == U1 || U == U2 || R.isVectorized(U);
2447 });
2448 };
2449 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2450 };
2451 // A broadcast of a load can be cheaper on some targets.
2452 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2453 ElementCount::getFixed(NumLanes)) &&
2454 ((int)V1->getNumUses() == NumLanes ||
2455 AllUsersAreInternal(V1, V2)))
2457 }
2459 }
2460
2461 auto CheckSameEntryOrFail = [&]() {
2462 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2464 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2465 !TEs2.empty() &&
2466 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2468 }
2470 };
2471
2472 auto *LI1 = dyn_cast<LoadInst>(V1);
2473 auto *LI2 = dyn_cast<LoadInst>(V2);
2474 if (LI1 && LI2) {
2475 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2476 !LI2->isSimple())
2477 return CheckSameEntryOrFail();
2478
2479 std::optional<int64_t> Dist = getPointersDiff(
2480 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2481 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2482 if (!Dist || *Dist == 0) {
2483 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2484 getUnderlyingObject(LI2->getPointerOperand()) &&
2485 R.TTI->isLegalMaskedGather(
2486 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2488 return CheckSameEntryOrFail();
2489 }
2490 // The distance is too large - still may be profitable to use masked
2491 // loads/gathers.
2492 if (std::abs(*Dist) > NumLanes / 2)
2494 // This will still detect consecutive loads, but we might have "holes"
2495 // in some cases. It is ok for non-power-of-2 vectorization and may produce
2496 // better results. It should not affect current vectorization.
2499 }
2500
2501 auto *C1 = dyn_cast<Constant>(V1);
2502 auto *C2 = dyn_cast<Constant>(V2);
2503 if (C1 && C2)
2505
2506 // Consider constants and buildvector compatible.
2507 if ((C1 && isa<InsertElementInst>(V2)) ||
2508 (C2 && isa<InsertElementInst>(V1)))
2510
2511 // Extracts from consecutive indexes of the same vector better score as
2512 // the extracts could be optimized away.
2513 Value *EV1;
2514 ConstantInt *Ex1Idx;
2515 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2516 // Undefs are always profitable for extractelements.
2517 // Compiler can easily combine poison and extractelement <non-poison> or
2518 // undef and extractelement <poison>. But combining undef +
2519 // extractelement <non-poison-but-may-produce-poison> requires some
2520 // extra operations.
2521 if (isa<UndefValue>(V2))
2522 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2525 Value *EV2 = nullptr;
2526 ConstantInt *Ex2Idx = nullptr;
2527 if (match(V2,
2529 m_Undef())))) {
2530 // Undefs are always profitable for extractelements.
2531 if (!Ex2Idx)
2533 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2535 if (EV2 == EV1) {
2536 int Idx1 = Ex1Idx->getZExtValue();
2537 int Idx2 = Ex2Idx->getZExtValue();
2538 int Dist = Idx2 - Idx1;
2539 // The distance is too large - still may be profitable to use
2540 // shuffles.
2541 if (std::abs(Dist) == 0)
2543 if (std::abs(Dist) > NumLanes / 2)
2547 }
2549 }
2550 return CheckSameEntryOrFail();
2551 }
2552
2553 auto *I1 = dyn_cast<Instruction>(V1);
2554 auto *I2 = dyn_cast<Instruction>(V2);
2555 if (I1 && I2) {
2556 if (I1->getParent() != I2->getParent())
2557 return CheckSameEntryOrFail();
2558 SmallVector<Value *, 4> Ops(MainAltOps);
2559 Ops.push_back(I1);
2560 Ops.push_back(I2);
2561 InstructionsState S = getSameOpcode(Ops, TLI);
2562 // Note: Only consider instructions with <= 2 operands to avoid
2563 // complexity explosion.
2564 if (S &&
2565 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2566 !S.isAltShuffle()) &&
2567 all_of(Ops, [&S](Value *V) {
2568 return isa<PoisonValue>(V) ||
2569 cast<Instruction>(V)->getNumOperands() ==
2570 S.getMainOp()->getNumOperands();
2571 }))
2572 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2574 }
2575
2576 if (I1 && isa<PoisonValue>(V2))
2578
2579 if (isa<UndefValue>(V2))
2581
2582 return CheckSameEntryOrFail();
2583 }
2584
2585 /// Go through the operands of \p LHS and \p RHS recursively until
2586 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2587 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2588 /// of \p U1 and \p U2), except at the beginning of the recursion where
2589 /// these are set to nullptr.
2590 ///
2591 /// For example:
2592 /// \verbatim
2593 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2594 /// \ / \ / \ / \ /
2595 /// + + + +
2596 /// G1 G2 G3 G4
2597 /// \endverbatim
2598 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2599 /// each level recursively, accumulating the score. It starts from matching
2600 /// the additions at level 0, then moves on to the loads (level 1). The
2601 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2602 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2603 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2604 /// Please note that the order of the operands does not matter, as we
2605 /// evaluate the score of all profitable combinations of operands. In
2606 /// other words the score of G1 and G4 is the same as G1 and G2. This
2607 /// heuristic is based on ideas described in:
2608 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2609 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2610 /// Luís F. W. Góes
2612 Instruction *U2, int CurrLevel,
2613 ArrayRef<Value *> MainAltOps) const {
2614
2615 // Get the shallow score of V1 and V2.
2616 int ShallowScoreAtThisLevel =
2617 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2618
2619 // If reached MaxLevel,
2620 // or if V1 and V2 are not instructions,
2621 // or if they are SPLAT,
2622 // or if they are not consecutive,
2623 // or if profitable to vectorize loads or extractelements, early return
2624 // the current cost.
2625 auto *I1 = dyn_cast<Instruction>(LHS);
2626 auto *I2 = dyn_cast<Instruction>(RHS);
2627 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2628 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2629 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2630 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2632 ShallowScoreAtThisLevel))
2633 return ShallowScoreAtThisLevel;
2634 assert(I1 && I2 && "Should have early exited.");
2635
2636 // Contains the I2 operand indexes that got matched with I1 operands.
2637 SmallSet<unsigned, 4> Op2Used;
2638
2639 // Recursion towards the operands of I1 and I2. We are trying all possible
2640 // operand pairs, and keeping track of the best score.
2641 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2642 OpIdx1 != NumOperands1; ++OpIdx1) {
2643 // Try to pair op1I with the best operand of I2.
2644 int MaxTmpScore = 0;
2645 unsigned MaxOpIdx2 = 0;
2646 bool FoundBest = false;
2647 // If I2 is commutative try all combinations.
2648 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2649 unsigned ToIdx = isCommutative(I2)
2650 ? I2->getNumOperands()
2651 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2652 assert(FromIdx <= ToIdx && "Bad index");
2653 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2654 // Skip operands already paired with OpIdx1.
2655 if (Op2Used.count(OpIdx2))
2656 continue;
2657 // Recursively calculate the cost at each level
2658 int TmpScore =
2659 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2660 I1, I2, CurrLevel + 1, {});
2661 // Look for the best score.
2662 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2663 TmpScore > MaxTmpScore) {
2664 MaxTmpScore = TmpScore;
2665 MaxOpIdx2 = OpIdx2;
2666 FoundBest = true;
2667 }
2668 }
2669 if (FoundBest) {
2670 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2671 Op2Used.insert(MaxOpIdx2);
2672 ShallowScoreAtThisLevel += MaxTmpScore;
2673 }
2674 }
2675 return ShallowScoreAtThisLevel;
2676 }
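// A rough walk-through of the example from the comment above (using the score
// constants defined earlier; the exact totals may differ once splat/external
// bonuses apply): matching G1 = A[0] + B[0] against G2 = A[1] + B[1] scores
// ScoreSameOpcode for the two additions plus ScoreConsecutiveLoads for each of
// the operand pairs {A[0], A[1]} and {B[0], B[1]}, i.e. roughly 2 + 4 + 4,
// whereas matching G1 against G3 = C[0] + D[0] only scores the additions,
// because the load pairs come from unrelated arrays and score ScoreFail.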
2677 };
2678 /// A helper data structure to hold the operands of a vector of instructions.
2679 /// This supports a fixed vector length for all operand vectors.
2681 /// For each operand we need (i) the value, and (ii) the opcode that it
2682 /// would be attached to if the expression was in a left-linearized form.
2683 /// This is required to avoid illegal operand reordering.
2684 /// For example:
2685 /// \verbatim
2686 /// 0 Op1
2687 /// |/
2688 /// Op1 Op2 Linearized + Op2
2689 /// \ / ----------> |/
2690 /// - -
2691 ///
2692 /// Op1 - Op2 (0 + Op1) - Op2
2693 /// \endverbatim
2694 ///
2695 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2696 ///
2697 /// Another way to think of this is to track all the operations across the
2698 /// path from the operand all the way to the root of the tree and to
2699 /// calculate the operation that corresponds to this path. For example, the
2700 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2701 /// corresponding operation is a '-' (which matches the one in the
2702 /// linearized tree, as shown above).
2703 ///
2704 /// For lack of a better term, we refer to this operation as Accumulated
2705 /// Path Operation (APO).
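/// As a further illustration of the same linearization idea: in A - (B - C),
/// the path from C to the root crosses the RHS of two '-' operations, so the
/// accumulated operation for C is '+' (APO == false), while B, which crosses
/// only one '-' RHS, gets APO == true.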
2706 struct OperandData {
2707 OperandData() = default;
2708 OperandData(Value *V, bool APO, bool IsUsed)
2709 : V(V), APO(APO), IsUsed(IsUsed) {}
2710 /// The operand value.
2711 Value *V = nullptr;
2712 /// TreeEntries only allow a single opcode, or an alternate sequence of
2713 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2714 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2715 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2716 /// (e.g., Add/Mul)
2717 bool APO = false;
2718 /// Helper data for the reordering function.
2719 bool IsUsed = false;
2720 };
2721
2722 /// During operand reordering, we are trying to select the operand at lane
2723 /// that matches best with the operand at the neighboring lane. Our
2724 /// selection is based on the type of value we are looking for. For example,
2725 /// if the neighboring lane has a load, we need to look for a load that is
2726 /// accessing a consecutive address. These strategies are summarized in the
2727 /// 'ReorderingMode' enumerator.
2728 enum class ReorderingMode {
2729 Load, ///< Matching loads to consecutive memory addresses
2730 Opcode, ///< Matching instructions based on opcode (same or alternate)
2731 Constant, ///< Matching constants
2732 Splat, ///< Matching the same instruction multiple times (broadcast)
2733 Failed, ///< We failed to create a vectorizable group
2734 };
2735
2736 using OperandDataVec = SmallVector<OperandData, 2>;
2737
2738 /// A vector of operand vectors.
2740 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2741 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2742 unsigned ArgSize = 0;
2743
2744 const TargetLibraryInfo &TLI;
2745 const DataLayout &DL;
2746 ScalarEvolution &SE;
2747 const BoUpSLP &R;
2748 const Loop *L = nullptr;
2749
2750 /// \returns the operand data at \p OpIdx and \p Lane.
2751 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2752 return OpsVec[OpIdx][Lane];
2753 }
2754
2755 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2756 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2757 return OpsVec[OpIdx][Lane];
2758 }
2759
2760 /// Clears the used flag for all entries.
2761 void clearUsed() {
2762 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2763 OpIdx != NumOperands; ++OpIdx)
2764 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2765 ++Lane)
2766 OpsVec[OpIdx][Lane].IsUsed = false;
2767 }
2768
2769 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2770 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2771 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2772 }
2773
2774 /// \param Lane lane of the operands under analysis.
2775 /// \param OpIdx operand index in lane \p Lane we're looking for the best
2776 /// candidate for.
2777 /// \param Idx operand index of the current candidate value.
2778 /// \returns The additional score due to possible broadcasting of the
2779 /// elements in the lane. It is more profitable to have power-of-2 unique
2780 /// elements in the lane, as it will be vectorized with higher probability
2781 /// after removing duplicates. Currently the SLP vectorizer supports only
2782 /// vectorization of a power-of-2 number of unique scalars.
2783 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2784 const SmallBitVector &UsedLanes) const {
2785 Value *IdxLaneV = getData(Idx, Lane).V;
2786 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2787 isa<ExtractElementInst>(IdxLaneV))
2788 return 0;
2790 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2791 if (Ln == Lane)
2792 continue;
2793 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2794 if (!isa<Instruction>(OpIdxLnV))
2795 return 0;
2796 Uniques.try_emplace(OpIdxLnV, Ln);
2797 }
2798 unsigned UniquesCount = Uniques.size();
2799 auto IdxIt = Uniques.find(IdxLaneV);
2800 unsigned UniquesCntWithIdxLaneV =
2801 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2802 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2803 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2804 unsigned UniquesCntWithOpIdxLaneV =
2805 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2806 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2807 return 0;
2808 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2809 UniquesCntWithOpIdxLaneV,
2810 UniquesCntWithOpIdxLaneV -
2811 bit_floor(UniquesCntWithOpIdxLaneV)) -
2812 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2813 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2814 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2815 }
2816
2817 /// \param Lane lane of the operands under analysis.
2818 /// \param OpIdx operand index in lane \p Lane we're looking for the best
2819 /// candidate for.
2820 /// \param Idx operand index of the current candidate value.
2821 /// \returns The additional score for the scalar whose users are all
2822 /// vectorized.
2823 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2824 Value *IdxLaneV = getData(Idx, Lane).V;
2825 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2826 // Do not care about number of uses for vector-like instructions
2827 // (extractelement/extractvalue with constant indices), they are extracts
2828 // themselves and already externally used. Vectorization of such
2829 // instructions does not add extra extractelement instruction, just may
2830 // remove it.
2831 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2832 isVectorLikeInstWithConstOps(OpIdxLaneV))
2834 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2835 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2836 return 0;
2837 return R.areAllUsersVectorized(IdxLaneI)
2839 : 0;
2840 }
2841
2842 /// Score scaling factor for fully compatible instructions but with
2843 /// different number of external uses. Allows better selection of the
2844 /// instructions with less external uses.
2845 static const int ScoreScaleFactor = 10;
2846
2847 /// \Returns the look-ahead score, which tells us how much the sub-trees
2848 /// rooted at \p LHS and \p RHS match: the more they match, the higher the
2849 /// score. This helps break ties in an informed way when we cannot decide on
2850 /// the order of the operands by just considering the immediate
2851 /// predecessors.
2852 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2853 int Lane, unsigned OpIdx, unsigned Idx,
2854 bool &IsUsed, const SmallBitVector &UsedLanes) {
2855 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2857 // Keep track of the instruction stack as we recurse into the operands
2858 // during the look-ahead score exploration.
2859 int Score =
2860 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2861 /*CurrLevel=*/1, MainAltOps);
2862 if (Score) {
2863 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2864 if (Score <= -SplatScore) {
2865 // Failed score.
2866 Score = 0;
2867 } else {
2868 Score += SplatScore;
2869 // Scale score to see the difference between different operands
2870 // and similar operands but all vectorized/not all vectorized
2871 // uses. It does not affect actual selection of the best
2872 // compatible operand in general, just allows to select the
2873 // operand with all vectorized uses.
2874 Score *= ScoreScaleFactor;
2875 Score += getExternalUseScore(Lane, OpIdx, Idx);
2876 IsUsed = true;
2877 }
2878 }
2879 return Score;
2880 }
2881
2882 /// Best defined scores per lanes between the passes. Used to choose the
2883 /// best operand (with the highest score) between the passes.
2884 /// The key - {Operand Index, Lane}.
2885 /// The value - the best score between the passes for the lane and the
2886 /// operand.
2888 BestScoresPerLanes;
2889
2890 // Search all operands in Ops[*][Lane] for the one that best matches
2891 // Ops[OpIdx][LastLane] and return its operand index.
2892 // If no good match can be found, return std::nullopt.
2893 std::optional<unsigned>
2894 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2895 ArrayRef<ReorderingMode> ReorderingModes,
2896 ArrayRef<Value *> MainAltOps,
2897 const SmallBitVector &UsedLanes) {
2898 unsigned NumOperands = getNumOperands();
2899
2900 // The operand of the previous lane at OpIdx.
2901 Value *OpLastLane = getData(OpIdx, LastLane).V;
2902
2903 // Our strategy mode for OpIdx.
2904 ReorderingMode RMode = ReorderingModes[OpIdx];
2905 if (RMode == ReorderingMode::Failed)
2906 return std::nullopt;
2907
2908 // The linearized opcode of the operand at OpIdx, Lane.
2909 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2910
2911 // The best operand index and its score.
2912 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2913 // are using the score to differentiate between the two.
2914 struct BestOpData {
2915 std::optional<unsigned> Idx;
2916 unsigned Score = 0;
2917 } BestOp;
2918 BestOp.Score =
2919 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2920 .first->second;
2921
2922 // Track if the operand must be marked as used. If the operand is set to
2923 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2924 // we may want to reestimate the operands again on the following iterations.
2925 bool IsUsed = RMode == ReorderingMode::Splat ||
2926 RMode == ReorderingMode::Constant ||
2927 RMode == ReorderingMode::Load;
2928 // Iterate through all unused operands and look for the best.
2929 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2930 // Get the operand at Idx and Lane.
2931 OperandData &OpData = getData(Idx, Lane);
2932 Value *Op = OpData.V;
2933 bool OpAPO = OpData.APO;
2934
2935 // Skip already selected operands.
2936 if (OpData.IsUsed)
2937 continue;
2938
2939 // Skip if we are trying to move the operand to a position with a
2940 // different opcode in the linearized tree form. This would break the
2941 // semantics.
2942 if (OpAPO != OpIdxAPO)
2943 continue;
2944
2945 // Look for an operand that matches the current mode.
2946 switch (RMode) {
2947 case ReorderingMode::Load:
2948 case ReorderingMode::Opcode: {
2949 bool LeftToRight = Lane > LastLane;
2950 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2951 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2952 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2953 OpIdx, Idx, IsUsed, UsedLanes);
2954 if (Score > static_cast<int>(BestOp.Score) ||
2955 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2956 Idx == OpIdx)) {
2957 BestOp.Idx = Idx;
2958 BestOp.Score = Score;
2959 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2960 }
2961 break;
2962 }
2963 case ReorderingMode::Constant:
2964 if (isa<Constant>(Op) ||
2965 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2966 BestOp.Idx = Idx;
2967 if (isa<Constant>(Op)) {
2969 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2971 }
2973 IsUsed = false;
2974 }
2975 break;
2976 case ReorderingMode::Splat:
2977 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2978 IsUsed = Op == OpLastLane;
2979 if (Op == OpLastLane) {
2980 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2981 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2983 }
2984 BestOp.Idx = Idx;
2985 }
2986 break;
2987 case ReorderingMode::Failed:
2988 llvm_unreachable("Not expected Failed reordering mode.");
2989 }
2990 }
2991
2992 if (BestOp.Idx) {
2993 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2994 return BestOp.Idx;
2995 }
2996 // If we could not find a good match return std::nullopt.
2997 return std::nullopt;
2998 }
2999
3000 /// Helper for reorderOperandVecs.
3001 /// \returns the lane that we should start reordering from. This is the one
3002 /// which has the least number of operands that can freely move about or is
3003 /// less profitable because it already has the most optimal set of operands.
3004 unsigned getBestLaneToStartReordering() const {
3005 unsigned Min = UINT_MAX;
3006 unsigned SameOpNumber = 0;
3007 // std::pair<unsigned, unsigned> is used to implement a simple voting
3008 // algorithm and choose the lane with the least number of operands that
3009 // can freely move about or is less profitable because it already has the
3010 // most optimal set of operands. The first unsigned is a counter for
3011 // voting, the second unsigned is the counter of lanes with instructions
3012 // with same/alternate opcodes and same parent basic block.
3014 // Try to be closer to the original results, if we have multiple lanes
3015 // with the same cost. If 2 lanes have the same cost, use the one with the
3016 // highest index.
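 // For illustration (hypothetical lanes, not taken from any particular IR):
 // if lanes 3 and 1 both hit the current minimum NumOfAPOs with an equal
 // SameOpNumber and hash to the same value, the descending walk below first
 // records {1, lane 3} for that hash and then bumps it to {2, lane 3}; the
 // final scan then chooses the lane stored in the entry with the smallest
 // vote count.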
3017 for (int I = getNumLanes(); I > 0; --I) {
3018 unsigned Lane = I - 1;
3019 OperandsOrderData NumFreeOpsHash =
3020 getMaxNumOperandsThatCanBeReordered(Lane);
3021 // Compare the number of operands that can move and choose the one with
3022 // the least number.
3023 if (NumFreeOpsHash.NumOfAPOs < Min) {
3024 Min = NumFreeOpsHash.NumOfAPOs;
3025 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3026 HashMap.clear();
3027 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3028 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3029 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3030 // Select the most optimal lane in terms of number of operands that
3031 // should be moved around.
3032 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3033 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3034 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3035 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3036 auto [It, Inserted] =
3037 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3038 if (!Inserted)
3039 ++It->second.first;
3040 }
3041 }
3042 // Select the lane with the minimum counter.
3043 unsigned BestLane = 0;
3044 unsigned CntMin = UINT_MAX;
3045 for (const auto &Data : reverse(HashMap)) {
3046 if (Data.second.first < CntMin) {
3047 CntMin = Data.second.first;
3048 BestLane = Data.second.second;
3049 }
3050 }
3051 return BestLane;
3052 }
3053
3054 /// Data structure that helps to reorder operands.
3055 struct OperandsOrderData {
3056 /// The best number of operands with the same APOs, which can be
3057 /// reordered.
3058 unsigned NumOfAPOs = UINT_MAX;
3059 /// Number of operands with the same/alternate instruction opcode and
3060 /// parent.
3061 unsigned NumOpsWithSameOpcodeParent = 0;
3062 /// Hash for the actual operands ordering.
3063 /// Used to count operands, actually their position id and opcode
3064 /// value. It is used in the voting mechanism to find the lane with the
3065 /// least number of operands that can freely move about or is less profitable
3066 /// because it already has the most optimal set of operands. Can be
3067 /// replaced with SmallVector<unsigned> instead but hash code is faster
3068 /// and requires less memory.
3069 unsigned Hash = 0;
3070 };
3071 /// \returns the maximum number of operands that are allowed to be reordered
3072 /// for \p Lane and the number of compatible instructions (with the same
3073 /// parent/opcode). This is used as a heuristic for selecting the first lane
3074 /// to start operand reordering.
3075 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3076 unsigned CntTrue = 0;
3077 unsigned NumOperands = getNumOperands();
3078 // Operands with the same APO can be reordered. We therefore need to count
3079 // how many of them we have for each APO, like this: Cnt[APO] = x.
3080 // Since we only have two APOs, namely true and false, we can avoid using
3081 // a map. Instead we can simply count the number of operands that
3082 // correspond to one of them (in this case the 'true' APO), and calculate
3083 // the other by subtracting it from the total number of operands.
3084 // Operands with the same instruction opcode and parent are more
3085 // profitable since we don't need to move them in many cases, with a high
3086 // probability such lane already can be vectorized effectively.
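 // For illustration (hypothetical APO values): with per-operand APOs
 // {true, false, false} in this lane, CntTrue ends up as 1 and NumOfAPOs is
 // computed below as max(1, 3 - 1) == 2, i.e. at most two operands share an
 // APO and are therefore free to be swapped with each other.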
3087 bool AllUndefs = true;
3088 unsigned NumOpsWithSameOpcodeParent = 0;
3089 Instruction *OpcodeI = nullptr;
3090 BasicBlock *Parent = nullptr;
3091 unsigned Hash = 0;
3092 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3093 const OperandData &OpData = getData(OpIdx, Lane);
3094 if (OpData.APO)
3095 ++CntTrue;
3096 // Use Boyer-Moore majority voting for finding the majority opcode and
3097 // the number of times it occurs.
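 // For illustration (hypothetical opcodes, all assumed to be in the same
 // basic block): for operands {add, mul, add, add} in this lane the counter
 // below evolves 1 -> 0 -> 1 -> 2, leaving the add as the surviving majority
 // candidate with NumOpsWithSameOpcodeParent == 2.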
3098 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3099 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3100 I->getParent() != Parent) {
3101 if (NumOpsWithSameOpcodeParent == 0) {
3102 NumOpsWithSameOpcodeParent = 1;
3103 OpcodeI = I;
3104 Parent = I->getParent();
3105 } else {
3106 --NumOpsWithSameOpcodeParent;
3107 }
3108 } else {
3109 ++NumOpsWithSameOpcodeParent;
3110 }
3111 }
3112 Hash = hash_combine(
3113 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3114 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3115 }
3116 if (AllUndefs)
3117 return {};
3118 OperandsOrderData Data;
3119 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3120 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3121 Data.Hash = Hash;
3122 return Data;
3123 }
3124
3125 /// Go through the instructions in VL and append their operands.
3126 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3127 const InstructionsState &S) {
3128 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3129 assert((empty() || all_of(Operands,
3130 [this](const ValueList &VL) {
3131 return VL.size() == getNumLanes();
3132 })) &&
3133 "Expected same number of lanes");
3134 assert(S.valid() && "InstructionsState is invalid.");
3135 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3136 // arguments to the intrinsic produces the same result.
3137 Instruction *MainOp = S.getMainOp();
3138 unsigned NumOperands = MainOp->getNumOperands();
3139 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3140 OpsVec.resize(ArgSize);
3141 unsigned NumLanes = VL.size();
3142 for (OperandDataVec &Ops : OpsVec)
3143 Ops.resize(NumLanes);
3144 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3145 // Our tree has just 3 nodes: the root and two operands.
3146 // It is therefore trivial to get the APO. We only need to check the
3147 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3148 // operand. The LHS operand of both add and sub is never attached to an
3149 // inversese operation in the linearized form, therefore its APO is
3150 // false. The RHS is true only if V is an inverse operation.
3151
3152 // Since operand reordering is performed on groups of commutative
3153 // operations or alternating sequences (e.g., +, -), we can safely tell
3154 // the inverse operations by checking commutativity.
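 // For illustration (hypothetical scalars): for the bundle {a0 + b0, a1 - b1}
 // the LHS operand gets APO == false in both lanes, while the RHS operand
 // gets APO == false for the commutative add and APO == true for the
 // non-commutative sub, so only operands with matching APO may later be
 // swapped across lanes.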
3155 auto *I = dyn_cast<Instruction>(VL[Lane]);
3156 if (!I && isa<PoisonValue>(VL[Lane])) {
3157 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3158 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3159 continue;
3160 }
3161 bool IsInverseOperation = false;
3162 if (S.isCopyableElement(VL[Lane])) {
3163 // The value is a copyable element.
3164 IsInverseOperation =
3165 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3166 } else {
3167 assert(I && "Expected instruction");
3168 auto [SelectedOp, Ops] = convertTo(I, S);
3169 // We cannot check commutativity by the converted instruction
3170 // (SelectedOp) because isCommutative also examines def-use
3171 // relationships.
3172 IsInverseOperation = !isCommutative(SelectedOp, I);
3173 }
3174 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3175 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3176 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3177 }
3178 }
3179 }
3180
3181 /// \returns the number of operands.
3182 unsigned getNumOperands() const { return ArgSize; }
3183
3184 /// \returns the number of lanes.
3185 unsigned getNumLanes() const { return OpsVec[0].size(); }
3186
3187 /// \returns the operand value at \p OpIdx and \p Lane.
3188 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3189 return getData(OpIdx, Lane).V;
3190 }
3191
3192 /// \returns true if the data structure is empty.
3193 bool empty() const { return OpsVec.empty(); }
3194
3195 /// Clears the data.
3196 void clear() { OpsVec.clear(); }
3197
3198 /// \Returns true if there are enough operands identical to \p Op to fill
3199 /// the whole vector (possibly mixed with constants or loop invariant values).
3200 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
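 /// For illustration (hypothetical operands, four lanes): if this operand is
 /// {%x, 7, %x, %x} across the lanes, every other lane contains either %x
 /// itself or a constant that can be moved into place, so the operand is
 /// treated as broadcastable and its reordering mode becomes Splat.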
3201 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3202 assert(Op == getValue(OpIdx, Lane) &&
3203 "Op is expected to be getValue(OpIdx, Lane).");
3204 // Small number of loads - try load matching.
3205 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3206 return false;
3207 bool OpAPO = getData(OpIdx, Lane).APO;
3208 bool IsInvariant = L && L->isLoopInvariant(Op);
3209 unsigned Cnt = 0;
3210 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3211 if (Ln == Lane)
3212 continue;
3213 // This is set to true if we found a candidate for broadcast at Lane.
3214 bool FoundCandidate = false;
3215 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3216 OperandData &Data = getData(OpI, Ln);
3217 if (Data.APO != OpAPO || Data.IsUsed)
3218 continue;
3219 Value *OpILane = getValue(OpI, Lane);
3220 bool IsConstantOp = isa<Constant>(OpILane);
3221 // Consider the broadcast candidate if:
3222 // 1. Same value is found in one of the operands.
3223 if (Data.V == Op ||
3224 // 2. The operand in the given lane is not constant but there is a
3225 // constant operand in another lane (which can be moved to the
3226 // given lane). In this case we can represent it as a simple
3227 // permutation of constant and broadcast.
3228 (!IsConstantOp &&
3229 ((Lns > 2 && isa<Constant>(Data.V)) ||
3230 // 2.1. If we have only 2 lanes, need to check that value in the
3231 // next lane does not build same opcode sequence.
3232 (Lns == 2 &&
3233 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3234 isa<Constant>(Data.V)))) ||
3235 // 3. The operand in the current lane is loop invariant (can be
3236 // hoisted out) and another operand is also a loop invariant
3237 // (though not a constant). In this case the whole vector can be
3238 // hoisted out.
3239 // FIXME: need to teach the cost model about this case for better
3240 // estimation.
3241 (IsInvariant && !isa<Constant>(Data.V) &&
3242 !getSameOpcode({Op, Data.V}, TLI) &&
3243 L->isLoopInvariant(Data.V))) {
3244 FoundCandidate = true;
3245 Data.IsUsed = Data.V == Op;
3246 if (Data.V == Op)
3247 ++Cnt;
3248 break;
3249 }
3250 }
3251 if (!FoundCandidate)
3252 return false;
3253 }
3254 return getNumLanes() == 2 || Cnt > 1;
3255 }
3256
3257 /// Checks if there is at least a single operand in lanes other than
3258 /// \p Lane that is compatible with the operand \p Op.
3259 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3260 assert(Op == getValue(OpIdx, Lane) &&
3261 "Op is expected to be getValue(OpIdx, Lane).");
3262 bool OpAPO = getData(OpIdx, Lane).APO;
3263 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3264 if (Ln == Lane)
3265 continue;
3266 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3267 const OperandData &Data = getData(OpI, Ln);
3268 if (Data.APO != OpAPO || Data.IsUsed)
3269 return true;
3270 Value *OpILn = getValue(OpI, Ln);
3271 return (L && L->isLoopInvariant(OpILn)) ||
3272 (getSameOpcode({Op, OpILn}, TLI) &&
3273 allSameBlock({Op, OpILn}));
3274 }))
3275 return true;
3276 }
3277 return false;
3278 }
3279
3280 public:
3281 /// Initialize with all the operands of the instruction vector \p RootVL.
3282 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3283 const InstructionsState &S, const BoUpSLP &R)
3284 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3285 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3286 // Append all the operands of RootVL.
3287 appendOperands(RootVL, Operands, S);
3288 }
3289
3290 /// \Returns a value vector with the operands across all lanes for the
3291 /// operand at \p OpIdx.
3292 ValueList getVL(unsigned OpIdx) const {
3293 ValueList OpVL(OpsVec[OpIdx].size());
3294 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3295 "Expected same num of lanes across all operands");
3296 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3297 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3298 return OpVL;
3299 }
3300
3301 // Performs operand reordering for 2 or more operands.
3302 // The original operands are in OrigOps[OpIdx][Lane].
3303 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3304 void reorder() {
3305 unsigned NumOperands = getNumOperands();
3306 unsigned NumLanes = getNumLanes();
3307 // Each operand has its own mode. We are using this mode to help us select
3308 // the instructions for each lane, so that they match best with the ones
3309 // we have selected so far.
3310 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3311
3312 // This is a greedy single-pass algorithm. We are going over each lane
3313 // once and deciding on the best order right away with no back-tracking.
3314 // However, in order to increase its effectiveness, we start with the lane
3315 // that has operands that can move the least. For example, given the
3316 // following lanes:
3317 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3318 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3319 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3320 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3321 // we will start at Lane 1, since the operands of the subtraction cannot
3322 // be reordered. Then we will visit the rest of the lanes in a circular
3323 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3324
3325 // Find the first lane that we will start our search from.
3326 unsigned FirstLane = getBestLaneToStartReordering();
3327
3328 // Initialize the modes.
3329 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3330 Value *OpLane0 = getValue(OpIdx, FirstLane);
3331 // Keep track if we have instructions with all the same opcode on one
3332 // side.
3333 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3334 // Check if OpLane0 should be broadcast.
3335 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3336 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3337 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3338 else if (isa<LoadInst>(OpILane0))
3339 ReorderingModes[OpIdx] = ReorderingMode::Load;
3340 else
3341 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3342 } else if (isa<Constant>(OpLane0)) {
3343 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3344 } else if (isa<Argument>(OpLane0)) {
3345 // Our best hope is a Splat. It may save some cost in some cases.
3346 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3347 } else {
3348 llvm_unreachable("Unexpected value kind.");
3349 }
3350 }
3351
3352 // Check that we don't have the same operands. No need to reorder if operands
3353 // are just perfect diamond or shuffled diamond match. Do not do it only
3354 // for possible broadcasts or non-power of 2 number of scalars (just for
3355 // now).
3356 auto &&SkipReordering = [this]() {
3357 SmallPtrSet<Value *, 4> UniqueValues;
3358 ArrayRef<OperandData> Op0 = OpsVec.front();
3359 for (const OperandData &Data : Op0)
3360 UniqueValues.insert(Data.V);
3361 for (ArrayRef<OperandData> Op :
3362 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3363 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3364 return !UniqueValues.contains(Data.V);
3365 }))
3366 return false;
3367 }
3368 // TODO: Check if we can remove a check for non-power-2 number of
3369 // scalars after full support of non-power-2 vectorization.
3370 return UniqueValues.size() != 2 &&
3371 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3372 UniqueValues.size());
3373 };
3374
3375 // If the initial strategy fails for any of the operand indexes, then we
3376 // perform reordering again in a second pass. This helps avoid assigning
3377 // high priority to the failed strategy, and should improve reordering for
3378 // the non-failed operand indexes.
3379 for (int Pass = 0; Pass != 2; ++Pass) {
3380 // Check if there is no need to reorder operands since they form a perfect
3381 // or shuffled diamond match.
3382 // Need to do it to avoid extra external use cost counting for
3383 // shuffled matches, which may cause regressions.
3384 if (SkipReordering())
3385 break;
3386 // Skip the second pass if the first pass did not fail.
3387 bool StrategyFailed = false;
3388 // Mark all operand data as free to use.
3389 clearUsed();
3390 // We keep the original operand order for the FirstLane, so reorder the
3391 // rest of the lanes. We are visiting the nodes in a circular fashion,
3392 // using FirstLane as the center point and increasing the radius
3393 // distance.
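 // For illustration (hypothetical lane count): with FirstLane == 1 and four
 // lanes, the Distance/Direction loops below visit lanes 2, 0 and then 3,
 // i.e. the immediate right and left neighbours first and the remaining lane
 // further out afterwards.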
3394 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3395 for (unsigned I = 0; I < NumOperands; ++I)
3396 MainAltOps[I].push_back(getData(I, FirstLane).V);
3397
3398 SmallBitVector UsedLanes(NumLanes);
3399 UsedLanes.set(FirstLane);
3400 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3401 // Visit the lane on the right and then the lane on the left.
3402 for (int Direction : {+1, -1}) {
3403 int Lane = FirstLane + Direction * Distance;
3404 if (Lane < 0 || Lane >= (int)NumLanes)
3405 continue;
3406 UsedLanes.set(Lane);
3407 int LastLane = Lane - Direction;
3408 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3409 "Out of bounds");
3410 // Look for a good match for each operand.
3411 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3412 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3413 std::optional<unsigned> BestIdx =
3414 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3415 MainAltOps[OpIdx], UsedLanes);
3416 // By not selecting a value, we allow the operands that follow to
3417 // select a better matching value. We will get a non-null value in
3418 // the next run of getBestOperand().
3419 if (BestIdx) {
3420 // Swap the current operand with the one returned by
3421 // getBestOperand().
3422 swap(OpIdx, *BestIdx, Lane);
3423 } else {
3424 // Enable the second pass.
3425 StrategyFailed = true;
3426 }
3427 // Try to get the alternate opcode and follow it during analysis.
3428 if (MainAltOps[OpIdx].size() != 2) {
3429 OperandData &AltOp = getData(OpIdx, Lane);
3430 InstructionsState OpS =
3431 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3432 if (OpS && OpS.isAltShuffle())
3433 MainAltOps[OpIdx].push_back(AltOp.V);
3434 }
3435 }
3436 }
3437 }
3438 // Skip second pass if the strategy did not fail.
3439 if (!StrategyFailed)
3440 break;
3441 }
3442 }
3443
3444#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3445 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3446 switch (RMode) {
3447 case ReorderingMode::Load:
3448 return "Load";
3449 case ReorderingMode::Opcode:
3450 return "Opcode";
3451 case ReorderingMode::Constant:
3452 return "Constant";
3453 case ReorderingMode::Splat:
3454 return "Splat";
3455 case ReorderingMode::Failed:
3456 return "Failed";
3457 }
3458 llvm_unreachable("Unimplemented Reordering Type");
3459 }
3460
3461 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3462 raw_ostream &OS) {
3463 return OS << getModeStr(RMode);
3464 }
3465
3466 /// Debug print.
3467 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3468 printMode(RMode, dbgs());
3469 }
3470
3471 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3472 return printMode(RMode, OS);
3473 }
3474
3475 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3476 const unsigned Indent = 2;
3477 unsigned Cnt = 0;
3478 for (const OperandDataVec &OpDataVec : OpsVec) {
3479 OS << "Operand " << Cnt++ << "\n";
3480 for (const OperandData &OpData : OpDataVec) {
3481 OS.indent(Indent) << "{";
3482 if (Value *V = OpData.V)
3483 OS << *V;
3484 else
3485 OS << "null";
3486 OS << ", APO:" << OpData.APO << "}\n";
3487 }
3488 OS << "\n";
3489 }
3490 return OS;
3491 }
3492
3493 /// Debug print.
3494 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3495#endif
3496 };
3497
3498 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3499 /// of the pair with the highest score, deemed to have the best chance to form
3500 /// the root of a profitable tree to vectorize. Return std::nullopt if no
3501 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
3502 /// limit of the cost that is considered a good enough score.
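 /// A minimal usage sketch (hypothetical candidate pairs; the helper name
 /// tryVectorizePair is made up for the example):
 /// \code
 /// SmallVector<std::pair<Value *, Value *>> Candidates = {{A, B}, {C, D}};
 /// if (std::optional<int> Best = R.findBestRootPair(Candidates))
 /// tryVectorizePair(Candidates[*Best].first, Candidates[*Best].second);
 /// \endcode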
3503 std::optional<int>
3504 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3505 int Limit = LookAheadHeuristics::ScoreFail) const {
3506 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3507 /*MaxLevel=*/RootLookAheadMaxDepth);
3508 int BestScore = Limit;
3509 std::optional<int> Index;
3510 for (int I : seq<int>(0, Candidates.size())) {
3511 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3512 Candidates[I].second,
3513 /*U1=*/nullptr, /*U2=*/nullptr,
3514 /*CurrLevel=*/1, {});
3515 if (Score > BestScore) {
3516 BestScore = Score;
3517 Index = I;
3518 }
3519 }
3520 return Index;
3521 }
3522
3523 /// Checks if the instruction is marked for deletion.
3524 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3525
3526 /// Removes an instruction from its block and eventually deletes it.
3527 /// It's like Instruction::eraseFromParent() except that the actual deletion
3528 /// is delayed until BoUpSLP is destructed.
3529 void eraseInstruction(Instruction *I) {
3530 DeletedInstructions.insert(I);
3531 }
3532
3533 /// Remove instructions from the parent function and clear the operands of \p
3534 /// DeadVals instructions, marking for deletion trivially dead operands.
3535 template <typename T>
3536 void removeInstructionsAndOperands(
3537 ArrayRef<T *> DeadVals,
3538 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3539 SmallVector<WeakTrackingVH> DeadInsts;
3540 for (T *V : DeadVals) {
3541 auto *I = cast<Instruction>(V);
3542 DeletedInstructions.insert(I);
3543 }
3544 DenseSet<Value *> Processed;
3545 for (T *V : DeadVals) {
3546 if (!V || !Processed.insert(V).second)
3547 continue;
3548 auto *I = cast<Instruction>(V);
3549 salvageDebugInfo(*I);
3550 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3551 for (Use &U : I->operands()) {
3552 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3553 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3554 isInstructionTriviallyDead(OpI, TLI) &&
3555 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3556 return Entry->VectorizedValue == OpI;
3557 })))
3558 DeadInsts.push_back(OpI);
3559 }
3560 I->dropAllReferences();
3561 }
3562 for (T *V : DeadVals) {
3563 auto *I = cast<Instruction>(V);
3564 if (!I->getParent())
3565 continue;
3566 assert((I->use_empty() || all_of(I->uses(),
3567 [&](Use &U) {
3568 return isDeleted(
3569 cast<Instruction>(U.getUser()));
3570 })) &&
3571 "trying to erase instruction with users.");
3572 I->removeFromParent();
3573 SE->forgetValue(I);
3574 }
3575 // Process the dead instruction list until empty.
3576 while (!DeadInsts.empty()) {
3577 Value *V = DeadInsts.pop_back_val();
3578 Instruction *VI = cast_or_null<Instruction>(V);
3579 if (!VI || !VI->getParent())
3580 continue;
3581 assert(isInstructionTriviallyDead(VI, TLI) &&
3582 "Live instruction found in dead worklist!");
3583 assert(VI->use_empty() && "Instructions with uses are not dead.");
3584
3585 // Don't lose the debug info while deleting the instructions.
3586 salvageDebugInfo(*VI);
3587
3588 // Null out all of the instruction's operands to see if any operand
3589 // becomes dead as we go.
3590 for (Use &OpU : VI->operands()) {
3591 Value *OpV = OpU.get();
3592 if (!OpV)
3593 continue;
3594 OpU.set(nullptr);
3595
3596 if (!OpV->use_empty())
3597 continue;
3598
3599 // If the operand is an instruction that became dead as we nulled out
3600 // the operand, and if it is 'trivially' dead, delete it in a future
3601 // loop iteration.
3602 if (auto *OpI = dyn_cast<Instruction>(OpV))
3603 if (!DeletedInstructions.contains(OpI) &&
3604 (!OpI->getType()->isVectorTy() ||
3605 none_of(VectorValuesAndScales,
3606 [&](const std::tuple<Value *, unsigned, bool> &V) {
3607 return std::get<0>(V) == OpI;
3608 })) &&
3609 isInstructionTriviallyDead(OpI, TLI))
3610 DeadInsts.push_back(OpI);
3611 }
3612
3613 VI->removeFromParent();
3614 eraseInstruction(VI);
3615 SE->forgetValue(VI);
3616 }
3617 }
3618
3619 /// Checks if the instruction was already analyzed for being a possible
3620 /// reduction root.
3621 bool isAnalyzedReductionRoot(Instruction *I) const {
3622 return AnalyzedReductionsRoots.count(I);
3623 }
3624 /// Register given instruction as already analyzed for being a possible
3625 /// reduction root.
3626 void analyzedReductionRoot(Instruction *I) {
3627 AnalyzedReductionsRoots.insert(I);
3628 }
3629 /// Checks if the provided list of reduced values was already checked for
3630 /// vectorization.
3631 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3632 return AnalyzedReductionVals.contains(hash_value(VL));
3633 }
3634 /// Adds the list of reduced values to the list of already checked values for
3635 /// vectorization.
3636 void analyzedReductionVals(ArrayRef<Value *> VL) {
3637 AnalyzedReductionVals.insert(hash_value(VL));
3638 }
3639 /// Clear the list of the analyzed reduction root instructions.
3640 void clearReductionData() {
3641 AnalyzedReductionsRoots.clear();
3642 AnalyzedReductionVals.clear();
3643 AnalyzedMinBWVals.clear();
3644 }
3645 /// Checks if the given value is gathered in one of the nodes.
3646 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3647 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3648 }
3649 /// Checks if the given value is gathered in one of the nodes.
3650 bool isGathered(const Value *V) const {
3651 return MustGather.contains(V);
3652 }
3653 /// Checks if the specified value was not scheduled.
3654 bool isNotScheduled(const Value *V) const {
3655 return NonScheduledFirst.contains(V);
3656 }
3657
3658 /// Check if the value is vectorized in the tree.
3659 bool isVectorized(const Value *V) const {
3660 assert(V && "V cannot be nullptr.");
3661 return ScalarToTreeEntries.contains(V);
3662 }
3663
3664 ~BoUpSLP();
3665
3666private:
3667 /// Determine if a node \p E can be demoted to a smaller type with a
3668 /// truncation. We collect the entries that will be demoted in ToDemote.
3669 /// \param E Node for analysis
3670 /// \param ToDemote indices of the nodes to be demoted.
3671 bool collectValuesToDemote(
3672 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3673 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3674 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3675 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3676
3677 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3678 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3679 /// they have only one user and are reorderable).
3680 /// \param ReorderableGathers List of all gather nodes that require reordering
3681 /// (e.g., gather of extractelements or partially vectorizable loads).
3682 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3683 /// reordering, subset of \p NonVectorized.
3684 void buildReorderableOperands(
3685 TreeEntry *UserTE,
3686 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3687 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3688 SmallVectorImpl<TreeEntry *> &GatherOps);
3689
3690 /// Checks if the given \p TE is a gather node with clustered reused scalars
3691 /// and reorders it per given \p Mask.
3692 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3693
3694 /// Checks if all users of \p I are the part of the vectorization tree.
3695 bool areAllUsersVectorized(
3696 Instruction *I,
3697 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3698
3699 /// Return information about the vector formed for the specified index
3700 /// of a vector of (the same) instruction.
3701 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3702
3703 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3704 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3705 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3706 return const_cast<TreeEntry *>(
3707 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3708 }
3709
3710 /// Gets the root instruction for the given node. If the node is a strided
3711 /// load/store node with the reverse order, the root instruction is the last
3712 /// one.
3713 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3714
3715 /// \returns Cast context for the given graph node.
3716 TargetTransformInfo::CastContextHint
3717 getCastContextHint(const TreeEntry &TE) const;
3718
3719 /// \returns the cost of the vectorizable entry.
3720 InstructionCost getEntryCost(const TreeEntry *E,
3721 ArrayRef<Value *> VectorizedVals,
3722 SmallPtrSetImpl<Value *> &CheckedExtracts);
3723
3724 /// Checks if it is legal and profitable to build SplitVectorize node for the
3725 /// given \p VL.
3726 /// \param Op1 first homogeneous scalars.
3727 /// \param Op2 second homogeneous scalars.
3728 /// \param ReorderIndices indices to reorder the scalars.
3729 /// \returns true if the node was successfully built.
3730 bool canBuildSplitNode(ArrayRef<Value *> VL,
3731 const InstructionsState &LocalState,
3732 SmallVectorImpl<Value *> &Op1,
3733 SmallVectorImpl<Value *> &Op2,
3734 OrdersType &ReorderIndices) const;
3735
3736 /// This is the recursive part of buildTree.
3737 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3738 unsigned InterleaveFactor = 0);
3739
3740 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3741 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3742 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3743 /// returns false, setting \p CurrentOrder to either an empty vector or a
3744 /// non-identity permutation that allows reusing the extract instructions.
3745 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3746 /// extract order.
3747 bool canReuseExtract(ArrayRef<Value *> VL,
3748 SmallVectorImpl<unsigned> &CurrentOrder,
3749 bool ResizeAllowed = false) const;
3750
3751 /// Vectorize a single entry in the tree.
3752 Value *vectorizeTree(TreeEntry *E);
3753
3754 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3755 /// \p E.
3756 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3757
3758 /// Create a new vector from a list of scalar values. Produces a sequence
3759 /// which exploits values reused across lanes, and arranges the inserts
3760 /// for ease of later optimization.
3761 template <typename BVTy, typename ResTy, typename... Args>
3762 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3763
3764 /// Create a new vector from a list of scalar values. Produces a sequence
3765 /// which exploits values reused across lanes, and arranges the inserts
3766 /// for ease of later optimization.
3767 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3768
3769 /// Returns the instruction in the bundle, which can be used as a base point
3770 /// for scheduling. Usually it is the last instruction in the bundle, except
3771 /// for the case when all operands are external (in this case, it is the first
3772 /// instruction in the list).
3773 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3774
3775 /// Tries to find extractelement instructions with constant indices from fixed
3776 /// vector type and gather such instructions into a bunch, which is highly
3777 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3778 /// was successful, the matched scalars are replaced by poison values in \p VL
3779 /// for future analysis.
3780 std::optional<TargetTransformInfo::ShuffleKind>
3781 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3782 SmallVectorImpl<int> &Mask) const;
3783
3784 /// Tries to find extractelement instructions with constant indices from fixed
3785 /// vector type and gather such instructions into a bunch, which is highly
3786 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3787 /// was successful, the matched scalars are replaced by poison values in \p VL
3788 /// for future analysis.
3789 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3790 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3791 SmallVectorImpl<int> &Mask,
3792 unsigned NumParts) const;
3793
3794 /// Checks if the gathered \p VL can be represented as a single register
3795 /// shuffle(s) of previous tree entries.
3796 /// \param TE Tree entry checked for permutation.
3797 /// \param VL List of scalars (a subset of the TE scalar), checked for
3798 /// permutations. Must form single-register vector.
3799 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3800 /// commands to build the mask using the original vector value, without
3801 /// relying on the potential reordering.
3802 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3803 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3804 std::optional<TargetTransformInfo::ShuffleKind>
3805 isGatherShuffledSingleRegisterEntry(
3806 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3807 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3808 bool ForOrder);
3809
3810 /// Checks if the gathered \p VL can be represented as multi-register
3811 /// shuffle(s) of previous tree entries.
3812 /// \param TE Tree entry checked for permutation.
3813 /// \param VL List of scalars (a subset of the TE scalar), checked for
3814 /// permutations.
3815 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3816 /// commands to build the mask using the original vector value, without
3817 /// relying on the potential reordering.
3818 /// \returns per-register series of ShuffleKind, if gathered values can be
3819 /// represented as shuffles of previous tree entries. \p Mask is filled with
3820 /// the shuffle mask (also on per-register base).
3821 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3822 isGatherShuffledEntry(
3823 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3824 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3825 unsigned NumParts, bool ForOrder = false);
3826
3827 /// \returns the cost of gathering (inserting) the values in \p VL into a
3828 /// vector.
3829 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3830 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3831 Type *ScalarTy) const;
3832
3833 /// Set the Builder insert point to one after the last instruction in
3834 /// the bundle
3835 void setInsertPointAfterBundle(const TreeEntry *E);
3836
3837 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3838 /// specified, the starting vector value is poison.
3839 Value *
3840 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3841 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3842
3843 /// \returns whether the VectorizableTree is fully vectorizable and will
3844 /// be beneficial even the tree height is tiny.
3845 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3846
3847 /// Run through the list of all gathered loads in the graph and try to find
3848 /// vector loads/masked gathers instead of regular gathers. Later these loads
3849 /// are reshuffled to build final gathered nodes.
3850 void tryToVectorizeGatheredLoads(
3851 const SmallMapVector<
3852 std::tuple<BasicBlock *, Value *, Type *>,
3853 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3854 &GatheredLoads);
3855
3856 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3857 /// users of \p TE and collects the stores. It returns the map from the store
3858 /// pointers to the collected stores.
3860 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3861
3862 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3863 /// stores in \p StoresVec can form a vector instruction. If so it returns
3864 /// true and populates \p ReorderIndices with the shuffle indices of the
3865 /// stores when compared to the sorted vector.
3866 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3867 OrdersType &ReorderIndices) const;
3868
3869 /// Iterates through the users of \p TE, looking for scalar stores that can be
3870 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3871 /// their order and builds an order index vector for each store bundle. It
3872 /// returns all these order vectors found.
3873 /// We run this after the tree has formed, otherwise we may come across user
3874 /// instructions that are not yet in the tree.
3875 SmallVector<OrdersType, 1>
3876 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3877
3878 /// Tries to reorder the gathering node for better vectorization
3879 /// opportunities.
3880 void reorderGatherNode(TreeEntry &TE);
3881
3882 class TreeEntry {
3883 public:
3884 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3885 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3886
3887 /// \returns Common mask for reorder indices and reused scalars.
3888 SmallVector<int> getCommonMask() const {
3889 if (State == TreeEntry::SplitVectorize)
3890 return {};
3891 SmallVector<int> Mask;
3892 inversePermutation(ReorderIndices, Mask);
3893 ::addMask(Mask, ReuseShuffleIndices);
3894 return Mask;
3895 }
3896
3897 /// \returns The mask for split nodes.
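 /// For illustration (hypothetical split): with 6 scalars split as 2 + 4,
 /// CommonVF below becomes 4, and with identity ReorderIndices the resulting
 /// mask is {0, 1, 4, 5, 6, 7}: the first two lanes select from the first
 /// sub-node and the remaining lanes select from the second sub-node placed
 /// after the common width.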
3898 SmallVector<int> getSplitMask() const {
3899 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3900 "Expected only split vectorize node.");
3901 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3902 unsigned CommonVF = std::max<unsigned>(
3903 CombinedEntriesWithIndices.back().second,
3904 Scalars.size() - CombinedEntriesWithIndices.back().second);
3905 for (auto [Idx, I] : enumerate(ReorderIndices))
3906 Mask[I] =
3907 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3908 ? CommonVF - CombinedEntriesWithIndices.back().second
3909 : 0);
3910 return Mask;
3911 }
3912
3913 /// Updates (reorders) SplitVectorize node according to the given mask \p
3914 /// Mask and order \p MaskOrder.
3915 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3916 ArrayRef<int> MaskOrder);
3917
3918 /// \returns true if the scalars in VL are equal to this entry.
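 /// For illustration (hypothetical scalars): with Scalars == {a, b} and
 /// ReorderIndices == {1, 0}, the query VL == {b, a} is accepted because the
 /// inverse permutation maps VL back onto the stored scalars.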
3919 bool isSame(ArrayRef<Value *> VL) const {
3920 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3921 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3922 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3923 return VL.size() == Mask.size() &&
3924 std::equal(VL.begin(), VL.end(), Mask.begin(),
3925 [Scalars](Value *V, int Idx) {
3926 return (isa<UndefValue>(V) &&
3927 Idx == PoisonMaskElem) ||
3928 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3929 });
3930 };
3931 if (!ReorderIndices.empty()) {
3932 // TODO: implement matching if the nodes are just reordered, still can
3933 // treat the vector as the same if the list of scalars matches VL
3934 // directly, without reordering.
3935 SmallVector<int> Mask;
3936 inversePermutation(ReorderIndices, Mask);
3937 if (VL.size() == Scalars.size())
3938 return IsSame(Scalars, Mask);
3939 if (VL.size() == ReuseShuffleIndices.size()) {
3940 ::addMask(Mask, ReuseShuffleIndices);
3941 return IsSame(Scalars, Mask);
3942 }
3943 return false;
3944 }
3945 return IsSame(Scalars, ReuseShuffleIndices);
3946 }
3947
3948 /// \returns true if current entry has same operands as \p TE.
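 /// For illustration (hypothetical operand lists): entries whose operands are
 /// {X, Y} and {Y, X} compare equal here, since every operand list of \p TE
 /// is matched against a distinct, not yet used operand list of this entry.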
3949 bool hasEqualOperands(const TreeEntry &TE) const {
3950 if (TE.getNumOperands() != getNumOperands())
3951 return false;
3952 SmallBitVector Used(getNumOperands());
3953 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3954 unsigned PrevCount = Used.count();
3955 for (unsigned K = 0; K < E; ++K) {
3956 if (Used.test(K))
3957 continue;
3958 if (getOperand(K) == TE.getOperand(I)) {
3959 Used.set(K);
3960 break;
3961 }
3962 }
3963 // Check if we actually found the matching operand.
3964 if (PrevCount == Used.count())
3965 return false;
3966 }
3967 return true;
3968 }
3969
3970 /// \return Final vectorization factor for the node. Defined by the total
3971 /// number of vectorized scalars, including those used several times in the
3972 /// entry and counted in the \a ReuseShuffleIndices, if any.
3973 unsigned getVectorFactor() const {
3974 if (!ReuseShuffleIndices.empty())
3975 return ReuseShuffleIndices.size();
3976 return Scalars.size();
3977 };
3978
3979 /// Checks if the current node is a gather node.
3980 bool isGather() const { return State == NeedToGather; }
3981
3982 /// A vector of scalars.
3983 ValueList Scalars;
3984
3985 /// The Scalars are vectorized into this value. It is initialized to Null.
3986 WeakTrackingVH VectorizedValue = nullptr;
3987
3988 /// Do we need to gather this sequence or vectorize it
3989 /// (either with vector instruction or with scatter/gather
3990 /// intrinsics for store/load)?
3991 enum EntryState {
3992 Vectorize, ///< The node is regularly vectorized.
3993 ScatterVectorize, ///< Masked scatter/gather node.
3994 StridedVectorize, ///< Strided loads (and stores)
3995 CompressVectorize, ///< (Masked) load with compress.
3996 NeedToGather, ///< Gather/buildvector node.
3997 CombinedVectorize, ///< Vectorized node, combined with its user into more
3998 ///< complex node like select/cmp to minmax, mul/add to
3999 ///< fma, etc. Must be used for the following nodes in
4000 ///< the pattern, not the very first one.
4001 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4002 ///< independently and then combines back.
4003 };
4004 EntryState State;
4005
4006 /// List of combined opcodes supported by the vectorizer.
4007 enum CombinedOpcode {
4008 NotCombinedOp = -1,
4009 MinMax = Instruction::OtherOpsEnd + 1,
4010 FMulAdd,
4011 };
4012 CombinedOpcode CombinedOp = NotCombinedOp;
4013
4014 /// Does this sequence require some shuffling?
4015 SmallVector<int, 4> ReuseShuffleIndices;
4016
4017 /// Does this entry require reordering?
4018 SmallVector<unsigned, 4> ReorderIndices;
4019
4020 /// Points back to the VectorizableTree.
4021 ///
4022 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4023 /// to be a pointer and needs to be able to initialize the child iterator.
4024 /// Thus we need a reference back to the container to translate the indices
4025 /// to entries.
4026 VecTreeTy &Container;
4027
4028 /// The TreeEntry index containing the user of this entry.
4029 EdgeInfo UserTreeIndex;
4030
4031 /// The index of this treeEntry in VectorizableTree.
4032 unsigned Idx = 0;
4033
4034 /// For gather/buildvector/alt opcode nodes, which are combined from
4035 /// other nodes as a series of insertvector instructions.
4036 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4037
4038 private:
4039 /// The operands of each instruction in each lane Operands[op_index][lane].
4040 /// Note: This helps avoid the replication of the code that performs the
4041 /// reordering of operands during buildTreeRec() and vectorizeTree().
4042 SmallVector<ValueList, 2> Operands;
4043
4044 /// Copyable elements of the entry node.
4045 SmallPtrSet<const Value *, 4> CopyableElements;
4046
4047 /// MainOp and AltOp are recorded inside. S should be obtained from
4048 /// newTreeEntry.
4049 InstructionsState S = InstructionsState::invalid();
4050
4051 /// Interleaving factor for interleaved loads Vectorize nodes.
4052 unsigned InterleaveFactor = 0;
4053
4054 /// True if the node does not require scheduling.
4055 bool DoesNotNeedToSchedule = false;
4056
4057 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4058 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4059 if (Operands.size() < OpIdx + 1)
4060 Operands.resize(OpIdx + 1);
4061 assert(Operands[OpIdx].empty() && "Already resized?");
4062 assert(OpVL.size() <= Scalars.size() &&
4063 "Number of operands is greater than the number of scalars.");
4064 Operands[OpIdx].resize(OpVL.size());
4065 copy(OpVL, Operands[OpIdx].begin());
4066 }
4067
4068 public:
4069 /// Returns interleave factor for interleave nodes.
4070 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4071 /// Sets interleaving factor for the interleaving nodes.
4072 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4073
4074 /// Marks the node as one that does not require scheduling.
4075 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4076 /// Returns true if the node is marked as one that does not require
4077 /// scheduling.
4078 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4079
4080 /// Set this bundle's operands from \p Operands.
4081 void setOperands(ArrayRef<ValueList> Operands) {
4082 for (unsigned I : seq<unsigned>(Operands.size()))
4083 setOperand(I, Operands[I]);
4084 }
4085
4086 /// Reorders operands of the node to the given mask \p Mask.
4087 void reorderOperands(ArrayRef<int> Mask) {
4088 for (ValueList &Operand : Operands)
4089 reorderScalars(Operand, Mask);
4090 }
4091
4092 /// \returns the \p OpIdx operand of this TreeEntry.
4093 ValueList &getOperand(unsigned OpIdx) {
4094 assert(OpIdx < Operands.size() && "Off bounds");
4095 return Operands[OpIdx];
4096 }
4097
4098 /// \returns the \p OpIdx operand of this TreeEntry.
4099 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4100 assert(OpIdx < Operands.size() && "Off bounds");
4101 return Operands[OpIdx];
4102 }
4103
4104 /// \returns the number of operands.
4105 unsigned getNumOperands() const { return Operands.size(); }
4106
4107 /// \return the single \p OpIdx operand.
4108 Value *getSingleOperand(unsigned OpIdx) const {
4109 assert(OpIdx < Operands.size() && "Off bounds");
4110 assert(!Operands[OpIdx].empty() && "No operand available");
4111 return Operands[OpIdx][0];
4112 }
4113
4114 /// Some of the instructions in the list have alternate opcodes.
4115 bool isAltShuffle() const { return S.isAltShuffle(); }
4116
4117 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4118 return S.getMatchingMainOpOrAltOp(I);
4119 }
4120
4121 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4122 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
4123 /// key is the main operation.
4124 Value *isOneOf(Value *Op) const {
4125 auto *I = dyn_cast<Instruction>(Op);
4126 if (I && getMatchingMainOpOrAltOp(I))
4127 return Op;
4128 return S.getMainOp();
4129 }
4130
4131 void setOperations(const InstructionsState &S) {
4132 assert(S && "InstructionsState is invalid.");
4133 this->S = S;
4134 }
4135
4136 Instruction *getMainOp() const { return S.getMainOp(); }
4137
4138 Instruction *getAltOp() const { return S.getAltOp(); }
4139
4140 /// The main/alternate opcodes for the list of instructions.
4141 unsigned getOpcode() const { return S.getOpcode(); }
4142
4143 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4144
4145 bool hasState() const { return S.valid(); }
4146
4147 /// Add \p V to the list of copyable elements.
4148 void addCopyableElement(Value *V) {
4149 assert(S.isCopyableElement(V) && "Not a copyable element.");
4150 CopyableElements.insert(V);
4151 }
4152
4153 /// Returns true if \p V is a copyable element.
4154 bool isCopyableElement(Value *V) const {
4155 return CopyableElements.contains(V);
4156 }
4157
4158 /// Returns true if any scalar in the list is a copyable element.
4159 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4160
4161 /// Returns the state of the operations.
4162 const InstructionsState &getOperations() const { return S; }
4163
4164 /// When ReuseReorderShuffleIndices is empty it just returns the position of
4165 /// \p V within the vector of Scalars. Otherwise, tries to remap it via its reuse index.
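 /// For illustration (hypothetical indices): with Scalars == {a, b, a}, empty
 /// ReorderIndices and ReuseShuffleIndices == {1, 0, 2, 1}, looking up b finds
 /// position 1 in Scalars and then remaps it to lane 0, the first reuse slot
 /// that refers to position 1.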
4166 unsigned findLaneForValue(Value *V) const {
4167 unsigned FoundLane = getVectorFactor();
4168 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4169 std::advance(It, 1)) {
4170 if (*It != V)
4171 continue;
4172 FoundLane = std::distance(Scalars.begin(), It);
4173 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4174 if (!ReorderIndices.empty())
4175 FoundLane = ReorderIndices[FoundLane];
4176 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4177 if (ReuseShuffleIndices.empty())
4178 break;
4179 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4180 RIt != ReuseShuffleIndices.end()) {
4181 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4182 break;
4183 }
4184 }
4185 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4186 return FoundLane;
4187 }
4188
4189 /// Build a shuffle mask for graph entry which represents a merge of main
4190 /// and alternate operations.
4191 void
4192 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4193 SmallVectorImpl<int> &Mask,
4194 SmallVectorImpl<Value *> *OpScalars = nullptr,
4195 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4196
4197 /// Return true if this is a non-power-of-2 node.
4198 bool isNonPowOf2Vec() const {
4199 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4200 return IsNonPowerOf2;
4201 }
4202
4203 /// Return true if this is a node, which tries to vectorize number of
4204 /// elements, forming whole vectors.
4205 bool
4206 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4207 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4208 TTI, getValueType(Scalars.front()), Scalars.size());
4209 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4210 "Reshuffling not supported with non-power-of-2 vectors yet.");
4211 return IsNonPowerOf2;
4212 }
4213
4214 Value *getOrdered(unsigned Idx) const {
4215 assert(isGather() && "Must be used only for buildvectors/gathers.");
4216 if (ReorderIndices.empty())
4217 return Scalars[Idx];
4218 SmallVector<int> Mask;
4219 inversePermutation(ReorderIndices, Mask);
4220 return Scalars[Mask[Idx]];
4221 }
4222
4223#ifndef NDEBUG
4224 /// Debug printer.
4225 LLVM_DUMP_METHOD void dump() const {
4226 dbgs() << Idx << ".\n";
4227 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4228 dbgs() << "Operand " << OpI << ":\n";
4229 for (const Value *V : Operands[OpI])
4230 dbgs().indent(2) << *V << "\n";
4231 }
4232 dbgs() << "Scalars: \n";
4233 for (Value *V : Scalars)
4234 dbgs().indent(2) << *V << "\n";
4235 dbgs() << "State: ";
4236 if (S && hasCopyableElements())
4237 dbgs() << "[[Copyable]] ";
4238 switch (State) {
4239 case Vectorize:
4240 if (InterleaveFactor > 0) {
4241 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4242 << "\n";
4243 } else {
4244 dbgs() << "Vectorize\n";
4245 }
4246 break;
4247 case ScatterVectorize:
4248 dbgs() << "ScatterVectorize\n";
4249 break;
4250 case StridedVectorize:
4251 dbgs() << "StridedVectorize\n";
4252 break;
4253 case CompressVectorize:
4254 dbgs() << "CompressVectorize\n";
4255 break;
4256 case NeedToGather:
4257 dbgs() << "NeedToGather\n";
4258 break;
4259 case CombinedVectorize:
4260 dbgs() << "CombinedVectorize\n";
4261 break;
4262 case SplitVectorize:
4263 dbgs() << "SplitVectorize\n";
4264 break;
4265 }
4266 if (S) {
4267 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4268 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4269 } else {
4270 dbgs() << "MainOp: NULL\n";
4271 dbgs() << "AltOp: NULL\n";
4272 }
4273 dbgs() << "VectorizedValue: ";
4274 if (VectorizedValue)
4275 dbgs() << *VectorizedValue << "\n";
4276 else
4277 dbgs() << "NULL\n";
4278 dbgs() << "ReuseShuffleIndices: ";
4279 if (ReuseShuffleIndices.empty())
4280 dbgs() << "Empty";
4281 else
4282 for (int ReuseIdx : ReuseShuffleIndices)
4283 dbgs() << ReuseIdx << ", ";
4284 dbgs() << "\n";
4285 dbgs() << "ReorderIndices: ";
4286 for (unsigned ReorderIdx : ReorderIndices)
4287 dbgs() << ReorderIdx << ", ";
4288 dbgs() << "\n";
4289 dbgs() << "UserTreeIndex: ";
4290 if (UserTreeIndex)
4291 dbgs() << UserTreeIndex;
4292 else
4293 dbgs() << "<invalid>";
4294 dbgs() << "\n";
4295 if (!CombinedEntriesWithIndices.empty()) {
4296 dbgs() << "Combined entries: ";
4297 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4298 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4299 });
4300 dbgs() << "\n";
4301 }
4302 }
4303#endif
4304 };
4305
4306#ifndef NDEBUG
4307 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4308 InstructionCost VecCost, InstructionCost ScalarCost,
4309 StringRef Banner) const {
4310 dbgs() << "SLP: " << Banner << ":\n";
4311 E->dump();
4312 dbgs() << "SLP: Costs:\n";
4313 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4314 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4315 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4316 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4317 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4318 }
4319#endif
4320
4321 /// Create a new gather TreeEntry
4322 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4323 const InstructionsState &S,
4324 const EdgeInfo &UserTreeIdx,
4325 ArrayRef<int> ReuseShuffleIndices = {}) {
4326 auto Invalid = ScheduleBundle::invalid();
4327 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4328 }
4329
4330 /// Create a new VectorizableTree entry.
4331 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4332 const InstructionsState &S,
4333 const EdgeInfo &UserTreeIdx,
4334 ArrayRef<int> ReuseShuffleIndices = {},
4335 ArrayRef<unsigned> ReorderIndices = {},
4336 unsigned InterleaveFactor = 0) {
4337 TreeEntry::EntryState EntryState =
4338 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4339 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4340 ReuseShuffleIndices, ReorderIndices);
4341 if (E && InterleaveFactor > 0)
4342 E->setInterleave(InterleaveFactor);
4343 return E;
4344 }
4345
4346 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4347 TreeEntry::EntryState EntryState,
4348 ScheduleBundle &Bundle, const InstructionsState &S,
4349 const EdgeInfo &UserTreeIdx,
4350 ArrayRef<int> ReuseShuffleIndices = {},
4351 ArrayRef<unsigned> ReorderIndices = {}) {
4352 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4353 EntryState == TreeEntry::SplitVectorize)) ||
4354 (Bundle && EntryState != TreeEntry::NeedToGather &&
4355 EntryState != TreeEntry::SplitVectorize)) &&
4356 "Need to vectorize gather entry?");
4357 // Gathered loads still gathered? Do not create entry, use the original one.
4358 if (GatheredLoadsEntriesFirst.has_value() &&
4359 EntryState == TreeEntry::NeedToGather && S &&
4360 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4361 !UserTreeIdx.UserTE)
4362 return nullptr;
4363 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4364 TreeEntry *Last = VectorizableTree.back().get();
4365 Last->Idx = VectorizableTree.size() - 1;
4366 Last->State = EntryState;
4367 if (UserTreeIdx.UserTE)
4368 OperandsToTreeEntry.try_emplace(
4369 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4370 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4371 // for non-power-of-two vectors.
4372 assert(
4373 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4374 ReuseShuffleIndices.empty()) &&
4375 "Reshuffling scalars not yet supported for nodes with padding");
4376 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4377 ReuseShuffleIndices.end());
4378 if (ReorderIndices.empty()) {
4379 Last->Scalars.assign(VL.begin(), VL.end());
4380 if (S)
4381 Last->setOperations(S);
4382 } else {
4383 // Reorder scalars and build final mask.
4384 Last->Scalars.assign(VL.size(), nullptr);
4385 transform(ReorderIndices, Last->Scalars.begin(),
4386 [VL](unsigned Idx) -> Value * {
4387 if (Idx >= VL.size())
4388 return UndefValue::get(VL.front()->getType());
4389 return VL[Idx];
4390 });
4391 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4392 if (S)
4393 Last->setOperations(S);
4394 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4395 }
4396 if (EntryState == TreeEntry::SplitVectorize) {
4397 assert(S && "Split nodes must have operations.");
4398 Last->setOperations(S);
4399 SmallPtrSet<Value *, 4> Processed;
4400 for (Value *V : VL) {
4401 auto *I = dyn_cast<Instruction>(V);
4402 if (!I)
4403 continue;
4404 auto It = ScalarsInSplitNodes.find(V);
4405 if (It == ScalarsInSplitNodes.end()) {
4406 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4407 (void)Processed.insert(V);
4408 } else if (Processed.insert(V).second) {
4409 assert(!is_contained(It->getSecond(), Last) &&
4410 "Value already associated with the node.");
4411 It->getSecond().push_back(Last);
4412 }
4413 }
4414 } else if (!Last->isGather()) {
4415 if (isa<PHINode>(S.getMainOp()) ||
4416 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4417 (!S.areInstructionsWithCopyableElements() &&
4418 doesNotNeedToSchedule(VL)) ||
4419 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4420 Last->setDoesNotNeedToSchedule();
4421 SmallPtrSet<Value *, 4> Processed;
4422 for (Value *V : VL) {
4423 if (isa<PoisonValue>(V))
4424 continue;
4425 if (S.isCopyableElement(V)) {
4426 Last->addCopyableElement(V);
4427 continue;
4428 }
4429 auto It = ScalarToTreeEntries.find(V);
4430 if (It == ScalarToTreeEntries.end()) {
4431 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4432 (void)Processed.insert(V);
4433 } else if (Processed.insert(V).second) {
4434 assert(!is_contained(It->getSecond(), Last) &&
4435 "Value already associated with the node.");
4436 It->getSecond().push_back(Last);
4437 }
4438 }
4439 // Update the scheduler bundle to point to this TreeEntry.
4440 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4441 "Bundle and VL out of sync");
4442 if (!Bundle.getBundle().empty()) {
4443#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4444 auto *BundleMember = Bundle.getBundle().begin();
4445 SmallPtrSet<Value *, 4> Processed;
4446 for (Value *V : VL) {
4447 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4448 continue;
4449 ++BundleMember;
4450 }
4451 assert(BundleMember == Bundle.getBundle().end() &&
4452 "Bundle and VL out of sync");
4453#endif
4454 Bundle.setTreeEntry(Last);
4455 }
4456 } else {
4457 // Build a map for gathered scalars to the nodes where they are used.
4458 bool AllConstsOrCasts = true;
4459 for (Value *V : VL) {
4460 if (S && S.areInstructionsWithCopyableElements() &&
4461 S.isCopyableElement(V))
4462 Last->addCopyableElement(V);
4463 if (!isConstant(V)) {
4464 auto *I = dyn_cast<CastInst>(V);
4465 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4466 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4467 !UserTreeIdx.UserTE->isGather())
4468 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4469 }
4470 }
4471 if (AllConstsOrCasts)
4472 CastMaxMinBWSizes =
4473 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4474 MustGather.insert_range(VL);
4475 }
4476
4477 if (UserTreeIdx.UserTE)
4478 Last->UserTreeIndex = UserTreeIdx;
4479 return Last;
4480 }
4481
4482 /// -- Vectorization State --
4483 /// Holds all of the tree entries.
4484 TreeEntry::VecTreeTy VectorizableTree;
4485
4486#ifndef NDEBUG
4487 /// Debug printer.
4488 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4489 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4490 VectorizableTree[Id]->dump();
4491 dbgs() << "\n";
4492 }
4493 }
4494#endif
4495
4496 /// Get list of vector entries, associated with the value \p V.
4497 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4498 assert(V && "V cannot be nullptr.");
4499 auto It = ScalarToTreeEntries.find(V);
4500 if (It == ScalarToTreeEntries.end())
4501 return {};
4502 return It->getSecond();
4503 }
4504
4505 /// Get list of split vector entries, associated with the value \p V.
4506 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4507 assert(V && "V cannot be nullptr.");
4508 auto It = ScalarsInSplitNodes.find(V);
4509 if (It == ScalarsInSplitNodes.end())
4510 return {};
4511 return It->getSecond();
4512 }
4513
4514 /// Returns first vector node for value \p V, matching values \p VL.
4515 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4516 bool SameVF = false) const {
4517 assert(V && "V cannot be nullptr.");
4518 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4519 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4520 return TE;
4521 return nullptr;
4522 }
4523
4524 /// Checks that the operand node of an alternate node does not generate a
4525 /// buildvector sequence. If it does, then it is probably not worth building
4526 /// an alternate shuffle, if the number of buildvector operands plus the
4527 /// alternate instruction exceeds the number of buildvector instructions.
4528 /// \param S the instructions state of the analyzed values.
4529 /// \param VL list of the instructions with alternate opcodes.
4530 bool areAltOperandsProfitable(const InstructionsState &S,
4531 ArrayRef<Value *> VL) const;
4532
4533 /// Contains all the outputs of legality analysis for a list of values to
4534 /// vectorize.
4535 class ScalarsVectorizationLegality {
4536 InstructionsState S;
4537 bool IsLegal;
4538 bool TryToFindDuplicates;
4539 bool TrySplitVectorize;
4540
4541 public:
4542 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4543 bool TryToFindDuplicates = true,
4544 bool TrySplitVectorize = false)
4545 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4546 TrySplitVectorize(TrySplitVectorize) {
4547 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4548 "Inconsistent state");
4549 }
4550 const InstructionsState &getInstructionsState() const { return S; };
4551 bool isLegal() const { return IsLegal; }
4552 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4553 bool trySplitVectorize() const { return TrySplitVectorize; }
4554 };
4555
4556 /// Checks if the specified list of the instructions/values can be vectorized
4557 /// in general.
4558 ScalarsVectorizationLegality
4559 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4560 const EdgeInfo &UserTreeIdx,
4561 bool TryCopyableElementsVectorization) const;
4562
4563 /// Checks if the specified list of the instructions/values can be vectorized
4564 /// and fills required data before actual scheduling of the instructions.
4565 TreeEntry::EntryState getScalarsVectorizationState(
4566 const InstructionsState &S, ArrayRef<Value *> VL,
4567 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4568 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4569
4570 /// Maps a specific scalar to its tree entry(ies).
4571 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4572
4573 /// Maps the operand index and entry to the corresponding tree entry.
4574 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4575 OperandsToTreeEntry;
4576
4577 /// Scalars, used in split vectorize nodes.
4578 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4579
4580 /// Maps a value to the proposed vectorizable size.
4581 SmallDenseMap<Value *, unsigned> InstrElementSize;
4582
4583 /// A list of scalars that we found that we need to keep as scalars.
4584 ValueSet MustGather;
4585
4586 /// A set of first non-schedulable values.
4587 ValueSet NonScheduledFirst;
4588
4589 /// A map between the vectorized entries and the last instructions in the
4590 /// bundles. The bundles are built in use order, not in the def order of the
4591 /// instructions, so we cannot rely on the last instruction in the bundle
4592 /// being the last instruction in program order during the vectorization
4593 /// process, since the basic blocks are modified; these instructions need to
4594 /// be pre-gathered beforehand.
4595 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4596
4597 /// Keeps the mapping between the last instructions and their insertion
4598 /// points (the instruction right after the last instruction).
4599 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4600
4601 /// List of gather nodes that depend on other gather/vector nodes and should
4602 /// be emitted after the vector instruction emission process to correctly
4603 /// handle the order of the vector instructions and shuffles.
4604 SetVector<const TreeEntry *> PostponedGathers;
4605
4606 using ValueToGatherNodesMap =
4607 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4608 ValueToGatherNodesMap ValueToGatherNodes;
4609
4610 /// A list of the load entries (node indices) that can be vectorized using a
4611 /// strided or masked gather approach, but which are first attempted to be
4612 /// represented as contiguous loads.
4613 SetVector<unsigned> LoadEntriesToVectorize;
4614
4615 /// True if the graph-nodes transforming mode is on.
4616 bool IsGraphTransformMode = false;
4617
4618 /// The index of the first gathered load entry in the VectorizeTree.
4619 std::optional<unsigned> GatheredLoadsEntriesFirst;
4620
4621 /// Maps compress entries to their mask data for the final codegen.
4622 SmallDenseMap<const TreeEntry *,
4623 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4624 CompressEntryToData;
4625
4626 /// This POD struct describes one external user in the vectorized tree.
4627 struct ExternalUser {
4628 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4629 : Scalar(S), User(U), E(E), Lane(L) {}
4630
4631 /// Which scalar in our function.
4632 Value *Scalar = nullptr;
4633
4634 /// Which user that uses the scalar.
4635 llvm::User *User = nullptr;
4636
4637 /// Vector node the value is part of.
4638 const TreeEntry &E;
4639
4640 /// Which lane does the scalar belong to.
4641 unsigned Lane;
4642 };
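/// A list of external users of the vectorized scalars.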
4643 using UserList = SmallVector<ExternalUser, 16>;
4644
4645 /// Checks if two instructions may access the same memory.
4646 ///
4647 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4648 /// is invariant in the calling loop.
4649 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4650 Instruction *Inst2) {
4651 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4652 // First check if the result is already in the cache.
4653 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4654 auto Res = AliasCache.try_emplace(Key);
4655 if (!Res.second)
4656 return Res.first->second;
4657 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4658 // Store the result in the cache.
4659 Res.first->getSecond() = Aliased;
4660 return Aliased;
4661 }
4662
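/// Key for the alias cache: the pair of instructions whose aliasing was
/// queried.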
4663 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4664
4665 /// Cache for alias results.
4666 /// TODO: consider moving this to the AliasAnalysis itself.
4667 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4668
4669 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4670 // globally through SLP because we don't perform any action which
4671 // invalidates capture results.
4672 BatchAAResults BatchAA;
4673
4674 /// Temporary store for deleted instructions. Instructions will be deleted
4675 /// eventually when the BoUpSLP is destructed. The deferral is required to
4676 /// ensure that there are no incorrect collisions in the AliasCache, which
4677 /// can happen if a new instruction is allocated at the same address as a
4678 /// previously deleted instruction.
4679 DenseSet<Instruction *> DeletedInstructions;
4680
4681 /// Set of instructions already analyzed for reductions.
4682 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4683
4684 /// Set of hashes for the list of reduction values already being analyzed.
4685 DenseSet<size_t> AnalyzedReductionVals;
4686
4687 /// Values already analyzed for minimal bitwidth and found to be
4688 /// non-profitable.
4689 DenseSet<Value *> AnalyzedMinBWVals;
4690
4691 /// A list of values that need to be extracted out of the tree.
4692 /// This list holds pairs of (Internal Scalar : External User). External User
4693 /// can be nullptr, it means that this Internal Scalar will be used later,
4694 /// after vectorization.
4695 UserList ExternalUses;
4696
4697 /// A list of GEPs which can be replaced by scalar GEPs instead of
4698 /// extractelement instructions.
4699 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4700
4701 /// A list of scalars to be extracted without a specific user because of too
4702 /// many uses.
4703 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4704
4705 /// Values used only by @llvm.assume calls.
4706 SmallPtrSet<const Value *, 32> EphValues;
4707
4708 /// Holds all of the instructions that we gathered, shuffle instructions and
4709 /// extractelements.
4710 SetVector<Instruction *> GatherShuffleExtractSeq;
4711
4712 /// A list of blocks that we are going to CSE.
4713 DenseSet<BasicBlock *> CSEBlocks;
4714
4715 /// List of hashes of load vectors that are known to be non-vectorizable.
4716 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4717
4718 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4719 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4720 /// single instruction, while ScheduleBundle represents a batch of
4721 /// instructions that are going to be grouped together. ScheduleCopyableData
4722 /// models an extra user for "copyable" instructions.
4723 class ScheduleEntity {
4724 friend class ScheduleBundle;
4725 friend class ScheduleData;
4726 friend class ScheduleCopyableData;
4727
4728 protected:
4729 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4730 Kind getKind() const { return K; }
4731 ScheduleEntity(Kind K) : K(K) {}
4732
4733 private:
4734 /// Used for getting a "good" final ordering of instructions.
4735 int SchedulingPriority = 0;
4736 /// True if this instruction (or bundle) is scheduled (or considered as
4737 /// scheduled in the dry-run).
4738 bool IsScheduled = false;
4739 /// The kind of the ScheduleEntity.
4740 const Kind K = Kind::ScheduleData;
4741
4742 public:
4743 ScheduleEntity() = delete;
4744 /// Gets/sets the scheduling priority.
4745 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4746 int getSchedulingPriority() const { return SchedulingPriority; }
4747 bool isReady() const {
4748 if (const auto *SD = dyn_cast<ScheduleData>(this))
4749 return SD->isReady();
4750 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4751 return CD->isReady();
4752 return cast<ScheduleBundle>(this)->isReady();
4753 }
4754 /// Returns true if the dependency information has been calculated.
4755 /// Note that dependency validity can vary between instructions within
4756 /// a single bundle.
4757 bool hasValidDependencies() const {
4758 if (const auto *SD = dyn_cast<ScheduleData>(this))
4759 return SD->hasValidDependencies();
4760 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4761 return CD->hasValidDependencies();
4762 return cast<ScheduleBundle>(this)->hasValidDependencies();
4763 }
4764 /// Gets the number of unscheduled dependencies.
4765 int getUnscheduledDeps() const {
4766 if (const auto *SD = dyn_cast<ScheduleData>(this))
4767 return SD->getUnscheduledDeps();
4768 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4769 return CD->getUnscheduledDeps();
4770 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4771 }
4772 /// Increments the number of unscheduled dependencies.
4773 int incrementUnscheduledDeps(int Incr) {
4774 if (auto *SD = dyn_cast<ScheduleData>(this))
4775 return SD->incrementUnscheduledDeps(Incr);
4776 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4777 }
4778 /// Gets the number of dependencies.
4779 int getDependencies() const {
4780 if (const auto *SD = dyn_cast<ScheduleData>(this))
4781 return SD->getDependencies();
4782 return cast<ScheduleCopyableData>(this)->getDependencies();
4783 }
4784 /// Gets the instruction.
4785 Instruction *getInst() const {
4786 if (const auto *SD = dyn_cast<ScheduleData>(this))
4787 return SD->getInst();
4788 return cast<ScheduleCopyableData>(this)->getInst();
4789 }
4790
4791 /// Gets/sets if the bundle is scheduled.
4792 bool isScheduled() const { return IsScheduled; }
4793 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4794
4795 static bool classof(const ScheduleEntity *) { return true; }
4796
4797#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4798 void dump(raw_ostream &OS) const {
4799 if (const auto *SD = dyn_cast<ScheduleData>(this))
4800 return SD->dump(OS);
4801 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4802 return CD->dump(OS);
4803 return cast<ScheduleBundle>(this)->dump(OS);
4804 }
4805
4806 LLVM_DUMP_METHOD void dump() const {
4807 dump(dbgs());
4808 dbgs() << '\n';
4809 }
4810#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4811 };
4812
4813#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4814 friend inline raw_ostream &operator<<(raw_ostream &OS,
4815 const BoUpSLP::ScheduleEntity &SE) {
4816 SE.dump(OS);
4817 return OS;
4818 }
4819#endif
4820
4821 /// Contains all scheduling relevant data for an instruction.
4822 /// A ScheduleData either represents a single instruction or a member of an
4823 /// instruction bundle (= a group of instructions which is combined into a
4824 /// vector instruction).
4825 class ScheduleData final : public ScheduleEntity {
4826 public:
4827 // The initial value for the dependency counters. It means that the
4828 // dependencies are not calculated yet.
4829 enum { InvalidDeps = -1 };
4830
4831 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4832 static bool classof(const ScheduleEntity *Entity) {
4833 return Entity->getKind() == Kind::ScheduleData;
4834 }
4835
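/// (Re-)initializes this ScheduleData for instruction \p I in the scheduling
/// region identified by \p BlockSchedulingRegionID.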
4836 void init(int BlockSchedulingRegionID, Instruction *I) {
4837 NextLoadStore = nullptr;
4838 IsScheduled = false;
4839 SchedulingRegionID = BlockSchedulingRegionID;
4840 clearDependencies();
4841 Inst = I;
4842 }
4843
4844 /// Verify basic self consistency properties
4845 void verify() {
4846 if (hasValidDependencies()) {
4847 assert(UnscheduledDeps <= Dependencies && "invariant");
4848 } else {
4849 assert(UnscheduledDeps == Dependencies && "invariant");
4850 }
4851
4852 if (IsScheduled) {
4853 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4854 "unexpected scheduled state");
4855 }
4856 }
4857
4858 /// Returns true if the dependency information has been calculated.
4859 /// Note that dependency validity can vary between instructions within
4860 /// a single bundle.
4861 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4862
4863 /// Returns true if it is ready for scheduling, i.e. it has no more
4864 /// unscheduled depending instructions/bundles.
4865 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4866
4867 /// Modifies the number of unscheduled dependencies for this instruction,
4868 /// and returns the number of remaining dependencies for the containing
4869 /// bundle.
4870 int incrementUnscheduledDeps(int Incr) {
4871 assert(hasValidDependencies() &&
4872 "increment of unscheduled deps would be meaningless");
4873 UnscheduledDeps += Incr;
4874 assert(UnscheduledDeps >= 0 &&
4875 "Expected valid number of unscheduled deps");
4876 return UnscheduledDeps;
4877 }
4878
4879 /// Sets the number of unscheduled dependencies to the number of
4880 /// dependencies.
4881 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4882
4883 /// Clears all dependency information.
4884 void clearDependencies() {
4885 clearDirectDependencies();
4886 MemoryDependencies.clear();
4887 ControlDependencies.clear();
4888 }
4889
4890 /// Clears all direct dependencies only, except for control and memory
4891 /// dependencies.
4892 /// Required for copyable elements to correctly handle control/memory deps
4893 /// and avoid extra recalculation of such deps.
4894 void clearDirectDependencies() {
4895 Dependencies = InvalidDeps;
4896 resetUnscheduledDeps();
4897 IsScheduled = false;
4898 }
4899
4900 /// Gets the number of unscheduled dependencies.
4901 int getUnscheduledDeps() const { return UnscheduledDeps; }
4902 /// Gets the number of dependencies.
4903 int getDependencies() const { return Dependencies; }
4904 /// Initializes the number of dependencies.
4905 void initDependencies() { Dependencies = 0; }
4906 /// Increments the number of dependencies.
4907 void incDependencies() { Dependencies++; }
4908
4909 /// Gets scheduling region ID.
4910 int getSchedulingRegionID() const { return SchedulingRegionID; }
4911
4912 /// Gets the instruction.
4913 Instruction *getInst() const { return Inst; }
4914
4915 /// Gets the list of memory dependencies.
4916 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4917 return MemoryDependencies;
4918 }
4919 /// Adds a memory dependency.
4920 void addMemoryDependency(ScheduleData *Dep) {
4921 MemoryDependencies.push_back(Dep);
4922 }
4923 /// Gets the list of control dependencies.
4924 ArrayRef<ScheduleData *> getControlDependencies() const {
4925 return ControlDependencies;
4926 }
4927 /// Adds a control dependency.
4928 void addControlDependency(ScheduleData *Dep) {
4929 ControlDependencies.push_back(Dep);
4930 }
4931 /// Gets/sets the next load/store instruction in the block.
4932 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4933 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4934
4935 void dump(raw_ostream &OS) const { OS << *Inst; }
4936
4937 LLVM_DUMP_METHOD void dump() const {
4938 dump(dbgs());
4939 dbgs() << '\n';
4940 }
4941
4942 private:
4943 Instruction *Inst = nullptr;
4944
4945 /// Single linked list of all memory instructions (e.g. load, store, call)
4946 /// in the block - until the end of the scheduling region.
4947 ScheduleData *NextLoadStore = nullptr;
4948
4949 /// The dependent memory instructions.
4950 /// This list is derived on demand in calculateDependencies().
4951 SmallVector<ScheduleData *> MemoryDependencies;
4952
4953 /// List of instructions which this instruction could be control dependent
4954 /// on. Allowing such nodes to be scheduled below this one could introduce
4955 /// a runtime fault which didn't exist in the original program.
4956 /// E.g., this is a load or udiv following a readonly call which infinitely loops.
4957 SmallVector<ScheduleData *> ControlDependencies;
4958
4959 /// This ScheduleData is in the current scheduling region if this matches
4960 /// the current SchedulingRegionID of BlockScheduling.
4961 int SchedulingRegionID = 0;
4962
4963 /// The number of dependencies. Consists of the number of users of the
4964 /// instruction plus the number of dependent memory instructions (if any).
4965 /// This value is calculated on demand.
4966 /// If InvalidDeps, the number of dependencies is not calculated yet.
4967 int Dependencies = InvalidDeps;
4968
4969 /// The number of dependencies minus the number of dependencies of scheduled
4970 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4971 /// for scheduling.
4972 /// Note that this is negative as long as Dependencies is not calculated.
4973 int UnscheduledDeps = InvalidDeps;
4974 };
4975
4976#ifndef NDEBUG
4977 friend inline raw_ostream &operator<<(raw_ostream &OS,
4978 const BoUpSLP::ScheduleData &SD) {
4979 SD.dump(OS);
4980 return OS;
4981 }
4982#endif
4983
4984 class ScheduleBundle final : public ScheduleEntity {
4985 /// The schedule data for the instructions in the bundle.
4986 SmallVector<ScheduleEntity *> Bundle;
4987 /// True if this bundle is valid.
4988 bool IsValid = true;
4989 /// The TreeEntry that this instruction corresponds to.
4990 TreeEntry *TE = nullptr;
4991 ScheduleBundle(bool IsValid)
4992 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4993
4994 public:
4995 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4996 static bool classof(const ScheduleEntity *Entity) {
4997 return Entity->getKind() == Kind::ScheduleBundle;
4998 }
4999
5000 /// Verify basic self consistency properties
5001 void verify() const {
5002 for (const ScheduleEntity *SD : Bundle) {
5003 if (SD->hasValidDependencies()) {
5004 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5005 "invariant");
5006 } else {
5007 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5008 "invariant");
5009 }
5010
5011 if (isScheduled()) {
5012 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5013 "unexpected scheduled state");
5014 }
5015 }
5016 }
5017
5018 /// Returns the number of unscheduled dependencies in the bundle.
5019 int unscheduledDepsInBundle() const {
5020 assert(*this && "bundle must not be empty");
5021 int Sum = 0;
5022 for (const ScheduleEntity *BundleMember : Bundle) {
5023 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5024 return ScheduleData::InvalidDeps;
5025 Sum += BundleMember->getUnscheduledDeps();
5026 }
5027 return Sum;
5028 }
5029
5030 /// Returns true if the dependency information has been calculated.
5031 /// Note that dependency validity can vary between instructions within
5032 /// a single bundle.
5033 bool hasValidDependencies() const {
5034 return all_of(Bundle, [](const ScheduleEntity *SD) {
5035 return SD->hasValidDependencies();
5036 });
5037 }
5038
5039 /// Returns true if it is ready for scheduling, i.e. it has no more
5040 /// unscheduled depending instructions/bundles.
5041 bool isReady() const {
5042 assert(*this && "bundle must not be empty");
5043 return unscheduledDepsInBundle() == 0 && !isScheduled();
5044 }
5045
5046 /// Returns the bundle of scheduling data, associated with the current
5047 /// instruction.
5048 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5049 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5050 /// Adds an instruction to the bundle.
5051 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5052
5053 /// Gets/sets the associated tree entry.
5054 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5055 TreeEntry *getTreeEntry() const { return TE; }
5056
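/// Creates an invalid sentinel bundle (IsValid == false).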
5057 static ScheduleBundle invalid() { return {false}; }
5058
5059 operator bool() const { return IsValid; }
5060
5061#ifndef NDEBUG
5062 void dump(raw_ostream &OS) const {
5063 if (!*this) {
5064 OS << "[]";
5065 return;
5066 }
5067 OS << '[';
5068 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5070 OS << "<Copyable>";
5071 OS << *SD->getInst();
5072 });
5073 OS << ']';
5074 }
5075
5076 LLVM_DUMP_METHOD void dump() const {
5077 dump(dbgs());
5078 dbgs() << '\n';
5079 }
5080#endif // NDEBUG
5081 };
5082
5083#ifndef NDEBUG
5084 friend inline raw_ostream &operator<<(raw_ostream &OS,
5085 const BoUpSLP::ScheduleBundle &Bundle) {
5086 Bundle.dump(OS);
5087 return OS;
5088 }
5089#endif
5090
5091 /// Contains all scheduling relevant data for the copyable instruction.
5092 /// It models the virtual instructions, supposed to replace the original
5093 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5094 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5095 /// instruction %virt = add %0, 0.
5096 class ScheduleCopyableData final : public ScheduleEntity {
5097 /// The source schedule data for the instruction.
5098 Instruction *Inst = nullptr;
5099 /// The edge information for the instruction.
5100 const EdgeInfo EI;
5101 /// This ScheduleData is in the current scheduling region if this matches
5102 /// the current SchedulingRegionID of BlockScheduling.
5103 int SchedulingRegionID = 0;
5104 /// Bundle this data is part of.
5105 ScheduleBundle &Bundle;
5106
5107 public:
5108 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5109 const EdgeInfo &EI, ScheduleBundle &Bundle)
5110 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5111 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5112 static bool classof(const ScheduleEntity *Entity) {
5113 return Entity->getKind() == Kind::ScheduleCopyableData;
5114 }
5115
5116 /// Verify basic self consistency properties
5117 void verify() {
5118 if (hasValidDependencies()) {
5119 assert(UnscheduledDeps <= Dependencies && "invariant");
5120 } else {
5121 assert(UnscheduledDeps == Dependencies && "invariant");
5122 }
5123
5124 if (IsScheduled) {
5125 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5126 "unexpected scheduled state");
5127 }
5128 }
5129
5130 /// Returns true if the dependency information has been calculated.
5131 /// Note that dependency validity can vary between instructions within
5132 /// a single bundle.
5133 bool hasValidDependencies() const {
5134 return Dependencies != ScheduleData::InvalidDeps;
5135 }
5136
5137 /// Returns true if it is ready for scheduling, i.e. it has no more
5138 /// unscheduled depending instructions/bundles.
5139 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5140
5141 /// Modifies the number of unscheduled dependencies for this instruction,
5142 /// and returns the number of remaining dependencies for the containing
5143 /// bundle.
5144 int incrementUnscheduledDeps(int Incr) {
5145 assert(hasValidDependencies() &&
5146 "increment of unscheduled deps would be meaningless");
5147 UnscheduledDeps += Incr;
5148 assert(UnscheduledDeps >= 0 && "invariant");
5149 return UnscheduledDeps;
5150 }
5151
5152 /// Sets the number of unscheduled dependencies to the number of
5153 /// dependencies.
5154 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5155
5156 /// Gets the number of unscheduled dependencies.
5157 int getUnscheduledDeps() const { return UnscheduledDeps; }
5158 /// Gets the number of dependencies.
5159 int getDependencies() const { return Dependencies; }
5160 /// Initializes the number of dependencies.
5161 void initDependencies() { Dependencies = 0; }
5162 /// Increments the number of dependencies.
5163 void incDependencies() { Dependencies++; }
5164
5165 /// Gets scheduling region ID.
5166 int getSchedulingRegionID() const { return SchedulingRegionID; }
5167
5168 /// Gets the instruction.
5169 Instruction *getInst() const { return Inst; }
5170
5171 /// Clears all dependency information.
5172 void clearDependencies() {
5173 Dependencies = ScheduleData::InvalidDeps;
5174 UnscheduledDeps = ScheduleData::InvalidDeps;
5175 IsScheduled = false;
5176 }
5177
5178 /// Gets the edge information.
5179 const EdgeInfo &getEdgeInfo() const { return EI; }
5180
5181 /// Gets the bundle.
5182 ScheduleBundle &getBundle() { return Bundle; }
5183 const ScheduleBundle &getBundle() const { return Bundle; }
5184
5185#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5186 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5187
5188 LLVM_DUMP_METHOD void dump() const {
5189 dump(dbgs());
5190 dbgs() << '\n';
5191 }
5192#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5193
5194 private:
5195 /// The number of dependencies. These nodes always have only a single
5196 /// dependency. If equal to InvalidDeps, it is not calculated yet.
5197 int Dependencies = ScheduleData::InvalidDeps;
5198
5199 /// The number of dependencies minus the number of dependencies of scheduled
5200 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5201 /// for scheduling.
5202 /// Note that this is negative as long as Dependencies is not calculated.
5203 int UnscheduledDeps = ScheduleData::InvalidDeps;
5204 };
5205
5206#ifndef NDEBUG
5207 friend inline raw_ostream &
5208 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5209 SD.dump(OS);
5210 return OS;
5211 }
5212#endif
5213
5214 friend struct GraphTraits<BoUpSLP *>;
5215 friend struct DOTGraphTraits<BoUpSLP *>;
5216
5217 /// Contains all scheduling data for a basic block.
5218 /// It does not schedule instructions that are not memory read/write
5219 /// instructions and whose operands are either constants, or arguments, or
5220 /// phis, or instructions from other blocks, or whose users are phis or from
5221 /// other blocks. The resulting vector instructions can be placed at the
5222 /// beginning of the basic block without scheduling (if the operands do not
5223 /// need to be scheduled) or at the end of the block (if the users are outside
5224 /// of the block). This saves some compile time and memory used by the
5225 /// compiler.
5226 /// ScheduleData is assigned to each instruction between the boundaries of
5227 /// the tree entry, even to those that are not part of the graph. It is
5228 /// required to correctly follow the dependencies between the instructions and
5229 /// to schedule them correctly. ScheduleData is not allocated for
5230 /// instructions that do not require scheduling, such as phis, nodes with
5231 /// only extractelements/insertelements, or nodes whose instructions have
5232 /// uses/operands outside of the block.
5233 struct BlockScheduling {
5234 BlockScheduling(BasicBlock *BB)
5235 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5236
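/// Clears the per-region bookkeeping and starts a new scheduling region by
/// bumping SchedulingRegionID.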
5237 void clear() {
5238 ScheduledBundles.clear();
5239 ScheduledBundlesList.clear();
5240 ScheduleCopyableDataMap.clear();
5241 ScheduleCopyableDataMapByInst.clear();
5242 ScheduleCopyableDataMapByInstUser.clear();
5243 ScheduleCopyableDataMapByUsers.clear();
5244 ReadyInsts.clear();
5245 ScheduleStart = nullptr;
5246 ScheduleEnd = nullptr;
5247 FirstLoadStoreInRegion = nullptr;
5248 LastLoadStoreInRegion = nullptr;
5249 RegionHasStackSave = false;
5250
5251 // Reduce the maximum schedule region size by the size of the
5252 // previous scheduling run.
5253 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5254 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5255 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5256 ScheduleRegionSize = 0;
5257
5258 // Make a new scheduling region, i.e. all existing ScheduleData is not
5259 // in the new region yet.
5260 ++SchedulingRegionID;
5261 }
5262
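/// Returns the ScheduleData for instruction \p I if it belongs to the current
/// scheduling region, otherwise null.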
5263 ScheduleData *getScheduleData(Instruction *I) {
5264 if (!I)
5265 return nullptr;
5266 if (BB != I->getParent())
5267 // Avoid lookup if can't possibly be in map.
5268 return nullptr;
5269 ScheduleData *SD = ScheduleDataMap.lookup(I);
5270 if (SD && isInSchedulingRegion(*SD))
5271 return SD;
5272 return nullptr;
5273 }
5274
5275 ScheduleData *getScheduleData(Value *V) {
5276 return getScheduleData(dyn_cast<Instruction>(V));
5277 }
5278
5279 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5280 /// operand number) and value.
5281 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5282 const Value *V) const {
5283 if (ScheduleCopyableDataMap.empty())
5284 return nullptr;
5285 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5286 if (It == ScheduleCopyableDataMap.end())
5287 return nullptr;
5288 ScheduleCopyableData *SD = It->getSecond().get();
5289 if (!isInSchedulingRegion(*SD))
5290 return nullptr;
5291 return SD;
5292 }
5293
5294 /// Returns the ScheduleCopyableData for the given user \p User, operand
5295 /// number and operand \p V.
5296 SmallVector<ScheduleCopyableData *>
5297 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5298 const Value *V) {
5299 if (ScheduleCopyableDataMapByInstUser.empty())
5300 return {};
5301 const auto It = ScheduleCopyableDataMapByInstUser.find(
5302 std::make_pair(std::make_pair(User, OperandIdx), V));
5303 if (It == ScheduleCopyableDataMapByInstUser.end())
5304 return {};
5305 SmallVector<ScheduleCopyableData *> Res;
5306 for (ScheduleCopyableData *SD : It->getSecond()) {
5307 if (isInSchedulingRegion(*SD))
5308 Res.push_back(SD);
5309 }
5310 return Res;
5311 }
5312
5313 /// Returns true if all operands of the given instruction \p User are
5314 /// replaced by copyable data.
5315 /// \param User The user instruction.
5316 /// \param Op The operand, which might be replaced by the copyable data.
5317 /// \param SLP The SLP tree.
5318 /// \param NumOps The number of operands used. If the instruction uses the
5319 /// same operand several times, check for the first use, then the second,
5320 /// etc.
5321 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5322 Instruction *Op, BoUpSLP &SLP,
5323 unsigned NumOps) const {
5324 assert(NumOps > 0 && "No operands");
5325 if (ScheduleCopyableDataMap.empty())
5326 return false;
5327 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5328 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5329 for (const Use &U : User->operands()) {
5330 if (U.get() != Op)
5331 continue;
5332 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5333 if (Entries.empty())
5334 return false;
5335 // Check all tree entries, if they have operands replaced by copyable
5336 // data.
5337 for (TreeEntry *TE : Entries) {
5338 unsigned Inc = 0;
5339 bool IsNonSchedulableWithParentPhiNode =
5340 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5341 TE->UserTreeIndex.UserTE->hasState() &&
5342 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5343 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5344 // Count the number of unique phi nodes, which are the parents of the
5345 // parent entry, and exit if all the unique phis are processed.
5346 if (IsNonSchedulableWithParentPhiNode) {
5347 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5348 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5349 for (Value *V : ParentTE->Scalars) {
5350 auto *PHI = dyn_cast<PHINode>(V);
5351 if (!PHI)
5352 continue;
5353 if (ParentsUniqueUsers.insert(PHI).second &&
5354 is_contained(PHI->incoming_values(), User))
5355 ++Inc;
5356 }
5357 } else {
5358 Inc = 1;
5359 }
5360
5361 // Check if the user is commutative.
5362 // Commutative users are handled later, as their operands can be
5363 // reordered.
5364 // The same applies even to non-commutative cmps, because we can
5365 // potentially invert their predicate and, thus, reorder the operands.
5366 bool IsCommutativeUser =
5367 ::isCommutative(User) ||
5368 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5369 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5370 unsigned &OpCnt =
5371 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5372 EdgeInfo EI(TE, U.getOperandNo());
5373 if (!getScheduleCopyableData(EI, Op))
5374 continue;
5375 // Found copyable operand - continue.
5376 OpCnt += Inc;
5377 continue;
5378 }
5379 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5380 .first->getSecond() += Inc;
5381 }
5382 }
5383 if (PotentiallyReorderedEntriesCount.empty())
5384 return all_of(OrderedEntriesCount,
5385 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5386 return P.second == NumOps;
5387 });
5388 // Check the commutative/cmp entries.
5389 for (auto &P : PotentiallyReorderedEntriesCount) {
5390 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5391 bool IsNonSchedulableWithParentPhiNode =
5392 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5393 P.first->UserTreeIndex.UserTE->hasState() &&
5394 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5395 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5396 auto *It = find(P.first->Scalars, User);
5397 do {
5398 assert(It != P.first->Scalars.end() &&
5399 "User is not in the tree entry");
5400 int Lane = std::distance(P.first->Scalars.begin(), It);
5401 assert(Lane >= 0 && "Lane is not found");
5402 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5403 Lane = P.first->ReorderIndices[Lane];
5404 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5405 "Couldn't find extract lane");
5406 // Count the number of unique phi nodes, which are the parents of the
5407 // parent entry, and exit if all the unique phis are processed.
5408 if (IsNonSchedulableWithParentPhiNode) {
5409 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5410 Value *User = ParentTE->Scalars[Lane];
5411 if (!ParentsUniqueUsers.insert(User).second) {
5412 It =
5413 find(make_range(std::next(It), P.first->Scalars.end()), User);
5414 continue;
5415 }
5416 }
5417 for (unsigned OpIdx :
5419 P.first->getMainOp()))) {
5420 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5421 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5422 --P.getSecond();
5423 }
5424 // If parent node is schedulable, it will be handled correctly.
5425 if (!IsNonSchedulableWithParentPhiNode)
5426 break;
5427 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5428 } while (It != P.first->Scalars.end());
5429 }
5430 return all_of(PotentiallyReorderedEntriesCount,
5431 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5432 return P.second == NumOps - 1;
5433 }) &&
5434 all_of(OrderedEntriesCount,
5435 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5436 return P.second == NumOps;
5437 });
5438 }
5439
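/// Returns all ScheduleCopyableData entries of the current scheduling region
/// that model copies of the instruction \p I.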
5440 SmallVector<ScheduleCopyableData *>
5441 getScheduleCopyableData(const Instruction *I) const {
5442 if (ScheduleCopyableDataMapByInst.empty())
5443 return {};
5444 const auto It = ScheduleCopyableDataMapByInst.find(I);
5445 if (It == ScheduleCopyableDataMapByInst.end())
5446 return {};
5447 SmallVector<ScheduleCopyableData *> Res;
5448 for (ScheduleCopyableData *SD : It->getSecond()) {
5449 if (isInSchedulingRegion(*SD))
5450 Res.push_back(SD);
5451 }
5452 return Res;
5453 }
5454
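/// Returns the ScheduleCopyableData entries of the current scheduling region,
/// associated with users of the instruction \p User (see
/// ScheduleCopyableDataMapByUsers).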
5455 SmallVector<ScheduleCopyableData *>
5456 getScheduleCopyableDataUsers(const Instruction *User) const {
5457 if (ScheduleCopyableDataMapByUsers.empty())
5458 return {};
5459 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5460 if (It == ScheduleCopyableDataMapByUsers.end())
5461 return {};
5462 SmallVector<ScheduleCopyableData *> Res;
5463 for (ScheduleCopyableData *SD : It->getSecond()) {
5464 if (isInSchedulingRegion(*SD))
5465 Res.push_back(SD);
5466 }
5467 return Res;
5468 }
5469
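/// Creates a ScheduleCopyableData for the copyable element \p I used through
/// the edge \p EI and registers it in the copyable-data lookup maps.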
5470 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5471 Instruction *I,
5472 int SchedulingRegionID,
5473 ScheduleBundle &Bundle) {
5474 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5475 ScheduleCopyableData *CD =
5476 ScheduleCopyableDataMap
5477 .try_emplace(std::make_pair(EI, I),
5478 std::make_unique<ScheduleCopyableData>(
5479 SchedulingRegionID, I, EI, Bundle))
5480 .first->getSecond()
5481 .get();
5482 ScheduleCopyableDataMapByInst[I].push_back(CD);
5483 if (EI.UserTE) {
5484 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5485 const auto *It = find(Op, I);
5486 assert(It != Op.end() && "Lane not set");
5487 SmallPtrSet<Instruction *, 4> Visited;
5488 do {
5489 int Lane = std::distance(Op.begin(), It);
5490 assert(Lane >= 0 && "Lane not set");
5491 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5492 !EI.UserTE->ReorderIndices.empty())
5493 Lane = EI.UserTE->ReorderIndices[Lane];
5494 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5495 "Couldn't find extract lane");
5496 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5497 if (!Visited.insert(In).second) {
5498 It = find(make_range(std::next(It), Op.end()), I);
5499 continue;
5500 }
5501 ScheduleCopyableDataMapByInstUser
5502 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5503 .first->getSecond()
5504 .push_back(CD);
5505 ScheduleCopyableDataMapByUsers.try_emplace(I)
5506 .first->getSecond()
5507 .insert(CD);
5508 // Remove extra deps for users that become non-immediate users of the
5509 // instruction. This may happen if a chain of the same copyable elements
5510 // appears in the tree.
5511 if (In == I) {
5512 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5513 if (ScheduleCopyableData *UserCD =
5514 getScheduleCopyableData(UserEI, In))
5515 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5516 }
5517 It = find(make_range(std::next(It), Op.end()), I);
5518 } while (It != Op.end());
5519 } else {
5520 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5521 CD);
5522 }
5523 return *CD;
5524 }
5525
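/// Returns all schedule bundles that contain the instruction \p V (empty if
/// \p V is not an instruction or is not bundled).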
5526 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5527 auto *I = dyn_cast<Instruction>(V);
5528 if (!I)
5529 return {};
5530 auto It = ScheduledBundles.find(I);
5531 if (It == ScheduledBundles.end())
5532 return {};
5533 return It->getSecond();
5534 }
5535
5536 /// Returns true if the entity is in the scheduling region.
5537 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5538 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5539 return Data->getSchedulingRegionID() == SchedulingRegionID;
5540 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5541 return CD->getSchedulingRegionID() == SchedulingRegionID;
5542 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5543 [&](const ScheduleEntity *BundleMember) {
5544 return isInSchedulingRegion(*BundleMember);
5545 });
5546 }
5547
5548 /// Marks an instruction as scheduled and puts all dependent ready
5549 /// instructions into the ready-list.
5550 template <typename ReadyListType>
5551 void schedule(const BoUpSLP &R, const InstructionsState &S,
5552 const EdgeInfo &EI, ScheduleEntity *Data,
5553 ReadyListType &ReadyList) {
5554 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5555 ArrayRef<ScheduleBundle *> Bundles) {
5556 // Handle the def-use chain dependencies.
5557
5558 // Decrement the unscheduled counter and insert to ready list if ready.
5559 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5560 if ((IsControl || Data->hasValidDependencies()) &&
5561 Data->incrementUnscheduledDeps(-1) == 0) {
5562 // There are no more unscheduled dependencies after
5563 // decrementing, so we can put the dependent instruction
5564 // into the ready list.
5565 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5566 ArrayRef<ScheduleBundle *> Bundles;
5567 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5568 CopyableBundle.push_back(&CD->getBundle());
5569 Bundles = CopyableBundle;
5570 } else {
5571 Bundles = getScheduleBundles(Data->getInst());
5572 }
5573 if (!Bundles.empty()) {
5574 for (ScheduleBundle *Bundle : Bundles) {
5575 if (Bundle->unscheduledDepsInBundle() == 0) {
5576 assert(!Bundle->isScheduled() &&
5577 "already scheduled bundle gets ready");
5578 ReadyList.insert(Bundle);
5580 << "SLP: gets ready: " << *Bundle << "\n");
5581 }
5582 }
5583 return;
5584 }
5585 assert(!Data->isScheduled() &&
5586 "already scheduled bundle gets ready");
5588 "Expected non-copyable data");
5589 ReadyList.insert(Data);
5590 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5591 }
5592 };
5593
5594 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5595 Instruction *I) {
5596 if (!ScheduleCopyableDataMap.empty()) {
5597 SmallVector<ScheduleCopyableData *> CopyableData =
5598 getScheduleCopyableData(User, OpIdx, I);
5599 for (ScheduleCopyableData *CD : CopyableData)
5600 DecrUnsched(CD, /*IsControl=*/false);
5601 if (!CopyableData.empty())
5602 return;
5603 }
5604 if (ScheduleData *OpSD = getScheduleData(I))
5605 DecrUnsched(OpSD, /*IsControl=*/false);
5606 };
5607
5608 // If BundleMember is a vector bundle, its operands may have been
5609 // reordered during buildTree(). We therefore need to get its operands
5610 // through the TreeEntry.
5611 if (!Bundles.empty()) {
5612 auto *In = BundleMember->getInst();
5613 // Count uses of each instruction operand.
5614 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5615 unsigned TotalOpCount = 0;
5616 if (isa<ScheduleCopyableData>(BundleMember)) {
5617 // Copyable data is used only once (uses itself).
5618 TotalOpCount = OperandsUses[In] = 1;
5619 } else {
5620 for (const Use &U : In->operands()) {
5621 if (auto *I = dyn_cast<Instruction>(U.get())) {
5622 auto Res = OperandsUses.try_emplace(I, 0);
5623 ++Res.first->getSecond();
5624 ++TotalOpCount;
5625 }
5626 }
5627 }
5628 // Decrement the unscheduled counter and insert to ready list if
5629 // ready.
5630 auto DecrUnschedForInst =
5631 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5632 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5633 &Checked) {
5634 if (!ScheduleCopyableDataMap.empty()) {
5635 const EdgeInfo EI = {UserTE, OpIdx};
5636 if (ScheduleCopyableData *CD =
5637 getScheduleCopyableData(EI, I)) {
5638 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5639 return;
5640 DecrUnsched(CD, /*IsControl=*/false);
5641 return;
5642 }
5643 }
5644 auto It = OperandsUses.find(I);
5645 assert(It != OperandsUses.end() && "Operand not found");
5646 if (It->second > 0) {
5647 --It->getSecond();
5648 assert(TotalOpCount > 0 && "No more operands to decrement");
5649 --TotalOpCount;
5650 if (ScheduleData *OpSD = getScheduleData(I)) {
5651 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5652 return;
5653 DecrUnsched(OpSD, /*IsControl=*/false);
5654 }
5655 }
5656 };
5657
5658 for (ScheduleBundle *Bundle : Bundles) {
5659 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5660 break;
5661 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5662 // Need to search for the lane since the tree entry can be
5663 // reordered.
5664 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5665 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5666 bool IsNonSchedulableWithParentPhiNode =
5667 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5668 Bundle->getTreeEntry()->UserTreeIndex &&
5669 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5670 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5671 TreeEntry::SplitVectorize &&
5672 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5673 Instruction::PHI;
5674 do {
5675 int Lane =
5676 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5677 assert(Lane >= 0 && "Lane not set");
5678 if (isa<StoreInst>(In) &&
5679 !Bundle->getTreeEntry()->ReorderIndices.empty())
5680 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5681 assert(Lane < static_cast<int>(
5682 Bundle->getTreeEntry()->Scalars.size()) &&
5683 "Couldn't find extract lane");
5684
5685 // Since the vectorization tree is built recursively, this
5686 // assertion ensures that the tree entry has all operands set
5687 // before reaching this code. A couple of exceptions known at the
5688 // moment are extracts, whose second (immediate) operand is
5689 // not added. Since immediates do not affect scheduler behavior,
5690 // this is considered okay.
5691 assert(In &&
5693 In->getNumOperands() ==
5694 Bundle->getTreeEntry()->getNumOperands() ||
5695 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5696 "Missed TreeEntry operands?");
5697
5698 // Count the number of unique phi nodes, which are the parents of the
5699 // parent entry, and exit if all the unique phis are processed.
5700 if (IsNonSchedulableWithParentPhiNode) {
5701 const TreeEntry *ParentTE =
5702 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5703 Value *User = ParentTE->Scalars[Lane];
5704 if (!ParentsUniqueUsers.insert(User).second) {
5705 It = std::find(std::next(It),
5706 Bundle->getTreeEntry()->Scalars.end(), In);
5707 continue;
5708 }
5709 }
5710
5711 for (unsigned OpIdx :
5712 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5713 if (auto *I = dyn_cast<Instruction>(
5714 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5715 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5716 << *I << "\n");
5717 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5718 }
5719 // If parent node is schedulable, it will be handled correctly.
5720 if (!IsNonSchedulableWithParentPhiNode)
5721 break;
5722 It = std::find(std::next(It),
5723 Bundle->getTreeEntry()->Scalars.end(), In);
5724 } while (It != Bundle->getTreeEntry()->Scalars.end());
5725 }
5726 } else {
5727 // If BundleMember is a stand-alone instruction, no operand reordering
5728 // has taken place, so we directly access its operands.
5729 for (Use &U : BundleMember->getInst()->operands()) {
5730 if (auto *I = dyn_cast<Instruction>(U.get())) {
5732 << "SLP: check for readiness (def): " << *I << "\n");
5733 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5734 }
5735 }
5736 }
5737 // Handle the memory dependencies.
5738 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5739 if (!SD)
5740 return;
5741 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5742 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5743 if (!VisitedMemory.insert(MemoryDep).second)
5744 continue;
5745 // There are no more unscheduled dependencies after decrementing,
5746 // so we can put the dependent instruction into the ready list.
5747 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5748 << *MemoryDep << "\n");
5749 DecrUnsched(MemoryDep);
5750 }
5751 // Handle the control dependencies.
5752 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5753 for (ScheduleData *Dep : SD->getControlDependencies()) {
5754 if (!VisitedControl.insert(Dep).second)
5755 continue;
5756 // There are no more unscheduled dependencies after decrementing,
5757 // so we can put the dependent instruction into the ready list.
5759 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5760 DecrUnsched(Dep, /*IsControl=*/true);
5761 }
5762 };
5763 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5764 SD->setScheduled(/*Scheduled=*/true);
5765 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5766 SmallVector<ScheduleBundle *> Bundles;
5767 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5768 Instruction *In = SD->getInst();
5769 if (R.isVectorized(In)) {
5770 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5771 for (TreeEntry *TE : Entries) {
5773 In->getNumOperands() != TE->getNumOperands())
5774 continue;
5775 auto &BundlePtr =
5776 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5777 BundlePtr->setTreeEntry(TE);
5778 BundlePtr->add(SD);
5779 Bundles.push_back(BundlePtr.get());
5780 }
5781 }
5782 ProcessBundleMember(SD, Bundles);
5783 } else {
5784 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5785 Bundle.setScheduled(/*Scheduled=*/true);
5786 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5787 auto AreAllBundlesScheduled =
5788 [&](const ScheduleEntity *SD,
5789 ArrayRef<ScheduleBundle *> SDBundles) {
5790 if (isa<ScheduleCopyableData>(SD))
5791 return true;
5792 return !SDBundles.empty() &&
5793 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5794 return SDBundle->isScheduled();
5795 });
5796 };
5797 for (ScheduleEntity *SD : Bundle.getBundle()) {
5800 SDBundles = getScheduleBundles(SD->getInst());
5801 if (AreAllBundlesScheduled(SD, SDBundles)) {
5802 SD->setScheduled(/*Scheduled=*/true);
5803 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5804 : SDBundles);
5805 }
5806 }
5807 }
5808 }
5809
5810 /// Verify basic self consistency properties of the data structure.
5811 void verify() {
5812 if (!ScheduleStart)
5813 return;
5814
5815 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5816 ScheduleStart->comesBefore(ScheduleEnd) &&
5817 "Not a valid scheduling region?");
5818
5819 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5820 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5821 if (!Bundles.empty()) {
5822 for (ScheduleBundle *Bundle : Bundles) {
5823 assert(isInSchedulingRegion(*Bundle) &&
5824 "primary schedule data not in window?");
5825 Bundle->verify();
5826 }
5827 continue;
5828 }
5829 auto *SD = getScheduleData(I);
5830 if (!SD)
5831 continue;
5832 assert(isInSchedulingRegion(*SD) &&
5833 "primary schedule data not in window?");
5834 SD->verify();
5835 }
5836
5837 assert(all_of(ReadyInsts,
5838 [](const ScheduleEntity *Bundle) {
5839 return Bundle->isReady();
5840 }) &&
5841 "item in ready list not ready?");
5842 }
5843
5844 /// Put all instructions into the ReadyList which are ready for scheduling.
5845 template <typename ReadyListType>
5846 void initialFillReadyList(ReadyListType &ReadyList) {
5847 SmallPtrSet<ScheduleBundle *, 16> Visited;
5848 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5849 ScheduleData *SD = getScheduleData(I);
5850 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5851 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5852 !Bundles.empty()) {
5853 for (ScheduleBundle *Bundle : Bundles) {
5854 if (!Visited.insert(Bundle).second)
5855 continue;
5856 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5857 ReadyList.insert(Bundle);
5858 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5859 << *Bundle << "\n");
5860 }
5861 }
5862 continue;
5863 }
5864 ReadyList.insert(SD);
5866 << "SLP: initially in ready list: " << *SD << "\n");
5867 }
5868 }
5869 }
5870
5871 /// Build a bundle from the ScheduleData nodes corresponding to the
5872 /// scalar instruction for each lane.
5873 /// \param VL The list of scalar instructions.
5874 /// \param S The state of the instructions.
5875 /// \param EI The edge in the SLP graph or the user node/operand number.
5876 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5877 const InstructionsState &S, const EdgeInfo &EI);
5878
5879 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5880 /// cyclic dependencies. This is only a dry-run, no instructions are
5881 /// actually moved at this stage.
5882 /// \returns the scheduling bundle. The returned Optional value is not
5883 /// std::nullopt if \p VL is allowed to be scheduled.
5884 std::optional<ScheduleBundle *>
5885 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5886 const InstructionsState &S, const EdgeInfo &EI);
5887
5888 /// Allocates schedule data chunk.
5889 ScheduleData *allocateScheduleDataChunks();
5890
5891 /// Extends the scheduling region so that V is inside the region.
5892 /// \returns true if the region size is within the limit.
5893 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5894
5895 /// Initialize the ScheduleData structures for new instructions in the
5896 /// scheduling region.
5897 void initScheduleData(Instruction *FromI, Instruction *ToI,
5898 ScheduleData *PrevLoadStore,
5899 ScheduleData *NextLoadStore);
5900
5901 /// Updates the dependency information of a bundle and of all instructions/
5902 /// bundles which depend on the original bundle.
5903 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5904 BoUpSLP *SLP,
5905 ArrayRef<ScheduleData *> ControlDeps = {});
5906
5907 /// Sets all instructions in the scheduling region to un-scheduled.
5908 void resetSchedule();
5909
5910 BasicBlock *BB;
5911
5912 /// Simple memory allocation for ScheduleData.
5913 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5914
5915 /// The size of a ScheduleData array in ScheduleDataChunks.
5916 int ChunkSize;
5917
5918 /// The allocator position in the current chunk, which is the last entry
5919 /// of ScheduleDataChunks.
5920 int ChunkPos;
5921
5922 /// Attaches ScheduleData to Instruction.
5923 /// Note that the mapping survives during all vectorization iterations, i.e.
5924 /// ScheduleData structures are recycled.
5925 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5926
5927 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5928 /// number) and the operand instruction, represented as copyable element.
5929 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5930 std::unique_ptr<ScheduleCopyableData>>
5931 ScheduleCopyableDataMap;
5932
5933 /// Represents mapping between instruction and all related
5934 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5935 /// element). The SLP tree may contain several representations of the same
5936 /// instruction.
5937 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5938 ScheduleCopyableDataMapByInst;
5939
5940 /// Represents mapping between user value and operand number, the operand
5941 /// value and all related ScheduleCopyableData. The relation is 1:n, because
5942    /// the same user may reference the same operand in different tree entries
5943    /// and the operand may be modeled by different copyable data elements.
5944 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5946 ScheduleCopyableDataMapByInstUser;
5947
5948 /// Represents mapping between instruction and all related
5949 /// ScheduleCopyableData. It represents the mapping between the actual
5950 /// instruction and the last copyable data element in the chain. E.g., if
5951 /// the graph models the following instructions:
5952 /// %0 = non-add instruction ...
5953 /// ...
5954 /// %4 = add %3, 1
5955 /// %5 = add %4, 1
5956 /// %6 = insertelement poison, %0, 0
5957 /// %7 = insertelement %6, %5, 1
5958 /// And the graph is modeled as:
5959 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5960 /// -> [1, 0] -> [%1, 0]
5961 ///
5962    /// this map maps %0 only to the copyable element <1>, which is the last
5963    /// user (the direct user of the actual instruction). <0> uses <1>, so <1>
5964    /// keeps the mapping to <0>, not %0.
5965 SmallDenseMap<const Instruction *,
5966 SmallSetVector<ScheduleCopyableData *, 4>>
5967 ScheduleCopyableDataMapByUsers;
5968
5969 /// Attaches ScheduleBundle to Instruction.
5970 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5971 ScheduledBundles;
5972 /// The list of ScheduleBundles.
5973 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5974
5975 /// The ready-list for scheduling (only used for the dry-run).
5976 SetVector<ScheduleEntity *> ReadyInsts;
5977
5978 /// The first instruction of the scheduling region.
5979 Instruction *ScheduleStart = nullptr;
5980
5981 /// The first instruction _after_ the scheduling region.
5982 Instruction *ScheduleEnd = nullptr;
5983
5984 /// The first memory accessing instruction in the scheduling region
5985 /// (can be null).
5986 ScheduleData *FirstLoadStoreInRegion = nullptr;
5987
5988 /// The last memory accessing instruction in the scheduling region
5989 /// (can be null).
5990 ScheduleData *LastLoadStoreInRegion = nullptr;
5991
5992 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5993 /// region? Used to optimize the dependence calculation for the
5994 /// common case where there isn't.
5995 bool RegionHasStackSave = false;
5996
5997 /// The current size of the scheduling region.
5998 int ScheduleRegionSize = 0;
5999
6000 /// The maximum size allowed for the scheduling region.
6001 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6002
6003 /// The ID of the scheduling region. For a new vectorization iteration this
6004    /// is incremented, which "removes" all ScheduleData from the region.
6005 /// Make sure that the initial SchedulingRegionID is greater than the
6006 /// initial SchedulingRegionID in ScheduleData (which is 0).
6007 int SchedulingRegionID = 1;
6008 };
6009
6010 /// Attaches the BlockScheduling structures to basic blocks.
6011 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6012
6013 /// Performs the "real" scheduling. Done before vectorization is actually
6014 /// performed in a basic block.
6015 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6016
6017 /// List of users to ignore during scheduling and that don't need extracting.
6018 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6019
6020 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6021 /// sorted SmallVectors of unsigned.
6022 struct OrdersTypeDenseMapInfo {
6023 static OrdersType getEmptyKey() {
6024 OrdersType V;
6025 V.push_back(~1U);
6026 return V;
6027 }
6028
6029 static OrdersType getTombstoneKey() {
6030 OrdersType V;
6031 V.push_back(~2U);
6032 return V;
6033 }
6034
6035 static unsigned getHashValue(const OrdersType &V) {
6036 return static_cast<unsigned>(hash_combine_range(V));
6037 }
6038
6039 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6040 return LHS == RHS;
6041 }
6042 };
6043
6044 // Analysis and block reference.
6045 Function *F;
6046 ScalarEvolution *SE;
6047 TargetTransformInfo *TTI;
6048 TargetLibraryInfo *TLI;
6049 LoopInfo *LI;
6050 DominatorTree *DT;
6051 AssumptionCache *AC;
6052 DemandedBits *DB;
6053 const DataLayout *DL;
6054 OptimizationRemarkEmitter *ORE;
6055
6056 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6057 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6058
6059 /// Instruction builder to construct the vectorized tree.
6060 IRBuilder<TargetFolder> Builder;
6061
6062 /// A map of scalar integer values to the smallest bit width with which they
6063 /// can legally be represented. The values map to (width, signed) pairs,
6064 /// where "width" indicates the minimum bit width and "signed" is True if the
6065  /// value must be sign-extended, rather than zero-extended, back to its
6066 /// original width.
6067 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6068
6069 /// Final size of the reduced vector, if the current graph represents the
6070 /// input for the reduction and it was possible to narrow the size of the
6071 /// reduction.
6072 unsigned ReductionBitWidth = 0;
6073
6074 /// Canonical graph size before the transformations.
6075 unsigned BaseGraphSize = 1;
6076
6077 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6078 /// type sizes, used in the tree.
6079 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6080
6081  /// Indices of the vectorized nodes, which are supposed to be the roots of
6082  /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
6083 DenseSet<unsigned> ExtraBitWidthNodes;
6084};
6085
6086template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6089  static BoUpSLP::EdgeInfo getEmptyKey() {
6090    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6091 SecondInfo::getEmptyKey());
6092 }
6093
6094  static BoUpSLP::EdgeInfo getTombstoneKey() {
6095    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6096 SecondInfo::getTombstoneKey());
6097 }
6098
6099 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6100 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6101 SecondInfo::getHashValue(Val.EdgeIdx));
6102 }
6103
6104 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6105 const BoUpSLP::EdgeInfo &RHS) {
6106 return LHS == RHS;
6107 }
6108};
6109
6110template <> struct llvm::GraphTraits<BoUpSLP *> {
6111 using TreeEntry = BoUpSLP::TreeEntry;
6112
6113 /// NodeRef has to be a pointer per the GraphWriter.
6114  using NodeRef = TreeEntry *;
6115
6116 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6117
6118 /// Add the VectorizableTree to the index iterator to be able to return
6119 /// TreeEntry pointers.
6120  struct ChildIteratorType
6121      : public iterator_adaptor_base<
6122 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6124
6128
6129 NodeRef operator*() { return I->UserTE; }
6130 };
6131
6132  static NodeRef getEntryNode(BoUpSLP &R) {
6133    return R.VectorizableTree[0].get();
6134 }
6135
6136  static ChildIteratorType child_begin(NodeRef N) {
6137    return {&N->UserTreeIndex, N->Container};
6138 }
6139
6140  static ChildIteratorType child_end(NodeRef N) {
6141    return {&N->UserTreeIndex + 1, N->Container};
6142 }
6143
6144 /// For the node iterator we just need to turn the TreeEntry iterator into a
6145 /// TreeEntry* iterator so that it dereferences to NodeRef.
6146  class nodes_iterator {
6147    using ItTy = ContainerTy::iterator;
6148 ItTy It;
6149
6150 public:
6151 nodes_iterator(const ItTy &It2) : It(It2) {}
6152 NodeRef operator*() { return It->get(); }
6153    nodes_iterator operator++() {
6154      ++It;
6155 return *this;
6156 }
6157 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6158 };
6159
6160  static nodes_iterator nodes_begin(BoUpSLP *R) {
6161    return nodes_iterator(R->VectorizableTree.begin());
6162 }
6163
6164  static nodes_iterator nodes_end(BoUpSLP *R) {
6165    return nodes_iterator(R->VectorizableTree.end());
6166 }
6167
6168 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6169};
6170
6171template <>
6172struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6173  using TreeEntry = BoUpSLP::TreeEntry;
6174
6175 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6176
6177 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6178 std::string Str;
6179 raw_string_ostream OS(Str);
6180 OS << Entry->Idx << ".\n";
6181 if (isSplat(Entry->Scalars))
6182 OS << "<splat> ";
6183 for (auto *V : Entry->Scalars) {
6184 OS << *V;
6185 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6186 return EU.Scalar == V;
6187 }))
6188 OS << " <extract>";
6189 OS << "\n";
6190 }
6191 return Str;
6192 }
6193
6194 static std::string getNodeAttributes(const TreeEntry *Entry,
6195 const BoUpSLP *) {
6196 if (Entry->isGather())
6197 return "color=red";
6198 if (Entry->State == TreeEntry::ScatterVectorize ||
6199 Entry->State == TreeEntry::StridedVectorize ||
6200 Entry->State == TreeEntry::CompressVectorize)
6201 return "color=blue";
6202 return "";
6203 }
6204};
6205
6206BoUpSLP::~BoUpSLP() {
6207  SmallVector<WeakTrackingVH> DeadInsts;
6208  for (auto *I : DeletedInstructions) {
6209 if (!I->getParent()) {
6210      // Temporarily insert the instruction back so it can be erased from its
6211      // parent and from memory later.
6212 if (isa<PHINode>(I))
6213 // Phi nodes must be the very first instructions in the block.
6214 I->insertBefore(F->getEntryBlock(),
6215 F->getEntryBlock().getFirstNonPHIIt());
6216 else
6217 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6218 continue;
6219 }
6220 for (Use &U : I->operands()) {
6221 auto *Op = dyn_cast<Instruction>(U.get());
6222 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6223          wouldInstructionBeTriviallyDead(Op, TLI))
6224        DeadInsts.emplace_back(Op);
6225 }
6226 I->dropAllReferences();
6227 }
6228 for (auto *I : DeletedInstructions) {
6229 assert(I->use_empty() &&
6230 "trying to erase instruction with users.");
6231 I->eraseFromParent();
6232 }
6233
6234  // Clean up any dead scalar code feeding the vectorized instructions.
6235  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6236
6237#ifdef EXPENSIVE_CHECKS
6238 // If we could guarantee that this call is not extremely slow, we could
6239 // remove the ifdef limitation (see PR47712).
6240 assert(!verifyFunction(*F, &dbgs()));
6241#endif
6242}
6243
6244/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6245/// contains the original mask for the scalars reused in the node. The
6246/// procedure transforms this mask in accordance with the given \p Mask.
6248 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6249 "Expected non-empty mask.");
6250 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6251 Prev.swap(Reuses);
6252 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6253 if (Mask[I] != PoisonMaskElem)
6254 Reuses[Mask[I]] = Prev[I];
6255}
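// A worked example for the reuse-mask transform above (hypothetical values,
// assuming 4 lanes and PoisonMaskElem == -1):
//   Reuses = {0, 1, 2, 3}, Mask = {2, 0, 1, 3}, so Prev = {0, 1, 2, 3};
//   Reuses[Mask[I]] = Prev[I] for each I gives
//   Reuses[2] = 0, Reuses[0] = 1, Reuses[1] = 2, Reuses[3] = 3,
//   i.e. Reuses ends up as {1, 2, 0, 3}.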
6256
6257/// Reorders the given \p Order according to the given \p Mask. \p Order is
6258/// the original order of the scalars. The procedure transforms the provided
6259/// order in accordance with the given \p Mask. If the resulting \p Order is an
6260/// identity order, \p Order is cleared.
6262 bool BottomOrder = false) {
6263 assert(!Mask.empty() && "Expected non-empty mask.");
6264 unsigned Sz = Mask.size();
6265 if (BottomOrder) {
6266 SmallVector<unsigned> PrevOrder;
6267 if (Order.empty()) {
6268 PrevOrder.resize(Sz);
6269 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6270 } else {
6271 PrevOrder.swap(Order);
6272 }
6273 Order.assign(Sz, Sz);
6274 for (unsigned I = 0; I < Sz; ++I)
6275 if (Mask[I] != PoisonMaskElem)
6276 Order[I] = PrevOrder[Mask[I]];
6277 if (all_of(enumerate(Order), [&](const auto &Data) {
6278 return Data.value() == Sz || Data.index() == Data.value();
6279 })) {
6280 Order.clear();
6281 return;
6282 }
6283 fixupOrderingIndices(Order);
6284 return;
6285 }
6286 SmallVector<int> MaskOrder;
6287 if (Order.empty()) {
6288 MaskOrder.resize(Sz);
6289 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6290 } else {
6291 inversePermutation(Order, MaskOrder);
6292 }
6293 reorderReuses(MaskOrder, Mask);
6294 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6295 Order.clear();
6296 return;
6297 }
6298 Order.assign(Sz, Sz);
6299 for (unsigned I = 0; I < Sz; ++I)
6300 if (MaskOrder[I] != PoisonMaskElem)
6301 Order[MaskOrder[I]] = I;
6302 fixupOrderingIndices(Order);
6303}
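// A worked example for the order transform above (hypothetical values, the
// top-down path with BottomOrder == false): starting from an empty \p Order
// (identity) and Mask == {2, 0, 1, 3}, MaskOrder becomes {1, 2, 0, 3} after
// reorderReuses, it is not an identity mask, and the final Order is
// {2, 0, 1, 3}. An identity Mask keeps MaskOrder as an identity and clears
// \p Order instead.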
6304
6305std::optional<BoUpSLP::OrdersType>
6306BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6307 bool TopToBottom, bool IgnoreReorder) {
6308 assert(TE.isGather() && "Expected gather node only.");
6309 // Try to find subvector extract/insert patterns and reorder only such
6310 // patterns.
6311 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6312 Type *ScalarTy = GatheredScalars.front()->getType();
6313 size_t NumScalars = GatheredScalars.size();
6314 if (!isValidElementType(ScalarTy))
6315 return std::nullopt;
6316 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6317 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6318 SmallVector<int> ExtractMask;
6319 SmallVector<int> Mask;
6322 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6324 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6325 /*ForOrder=*/true);
6326 // No shuffled operands - ignore.
6327 if (GatherShuffles.empty() && ExtractShuffles.empty())
6328 return std::nullopt;
6329 OrdersType CurrentOrder(NumScalars, NumScalars);
6330 if (GatherShuffles.size() == 1 &&
6331 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6332 Entries.front().front()->isSame(TE.Scalars)) {
6333    // If the node is fully matched during whole-tree rotation - no need to
6334    // consider the matching order, the whole tree is rotated instead.
6335 if (TopToBottom)
6336 return std::nullopt;
6337 // No need to keep the order for the same user node.
6338 if (Entries.front().front()->UserTreeIndex.UserTE ==
6339 TE.UserTreeIndex.UserTE)
6340 return std::nullopt;
6341 // No need to keep the order for the matched root node, if it can be freely
6342 // reordered.
6343 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6344 return std::nullopt;
6345    // If only 2 elements are shuffled and the matching node has reversed
6346    // reuses - no need to keep the order, both orders work fine.
6347 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6348 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6349 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6350 [](const auto &P) {
6351 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6352 }))
6353 return std::nullopt;
6354
6355 // Perfect match in the graph, will reuse the previously vectorized
6356 // node. Cost is 0.
6357 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6358 return CurrentOrder;
6359 }
6360 auto IsSplatMask = [](ArrayRef<int> Mask) {
6361 int SingleElt = PoisonMaskElem;
6362 return all_of(Mask, [&](int I) {
6363 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6364 SingleElt = I;
6365 return I == PoisonMaskElem || I == SingleElt;
6366 });
6367 };
6368 // Exclusive broadcast mask - ignore.
6369 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6370 (Entries.size() != 1 ||
6371 Entries.front().front()->ReorderIndices.empty())) ||
6372 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6373 return std::nullopt;
6374 SmallBitVector ShuffledSubMasks(NumParts);
6375 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6376 ArrayRef<int> Mask, int PartSz, int NumParts,
6377 function_ref<unsigned(unsigned)> GetVF) {
6378 for (int I : seq<int>(0, NumParts)) {
6379 if (ShuffledSubMasks.test(I))
6380 continue;
6381 const int VF = GetVF(I);
6382 if (VF == 0)
6383 continue;
6384 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6385 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6386 // Shuffle of at least 2 vectors - ignore.
6387 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6388 llvm::fill(Slice, NumScalars);
6389 ShuffledSubMasks.set(I);
6390 continue;
6391 }
6392      // Try to include as many elements from the mask as possible.
6393 int FirstMin = INT_MAX;
6394      bool SecondVecFound = false;
6395 for (int K : seq<int>(Limit)) {
6396 int Idx = Mask[I * PartSz + K];
6397 if (Idx == PoisonMaskElem) {
6398 Value *V = GatheredScalars[I * PartSz + K];
6399 if (isConstant(V) && !isa<PoisonValue>(V)) {
6400 SecondVecFound = true;
6401 break;
6402 }
6403 continue;
6404 }
6405 if (Idx < VF) {
6406 if (FirstMin > Idx)
6407 FirstMin = Idx;
6408 } else {
6409 SecondVecFound = true;
6410 break;
6411 }
6412 }
6413 FirstMin = (FirstMin / PartSz) * PartSz;
6414 // Shuffle of at least 2 vectors - ignore.
6415 if (SecondVecFound) {
6416 llvm::fill(Slice, NumScalars);
6417 ShuffledSubMasks.set(I);
6418 continue;
6419 }
6420 for (int K : seq<int>(Limit)) {
6421 int Idx = Mask[I * PartSz + K];
6422 if (Idx == PoisonMaskElem)
6423 continue;
6424 Idx -= FirstMin;
6425 if (Idx >= PartSz) {
6426 SecondVecFound = true;
6427 break;
6428 }
6429 if (CurrentOrder[I * PartSz + Idx] >
6430 static_cast<unsigned>(I * PartSz + K) &&
6431 CurrentOrder[I * PartSz + Idx] !=
6432 static_cast<unsigned>(I * PartSz + Idx))
6433 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6434 }
6435 // Shuffle of at least 2 vectors - ignore.
6436 if (SecondVecFound) {
6437 llvm::fill(Slice, NumScalars);
6438 ShuffledSubMasks.set(I);
6439 continue;
6440 }
6441 }
6442 };
6443 int PartSz = getPartNumElems(NumScalars, NumParts);
6444 if (!ExtractShuffles.empty())
6445 TransformMaskToOrder(
6446 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6447 if (!ExtractShuffles[I])
6448 return 0U;
6449 unsigned VF = 0;
6450 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6451 for (unsigned Idx : seq<unsigned>(Sz)) {
6452 int K = I * PartSz + Idx;
6453 if (ExtractMask[K] == PoisonMaskElem)
6454 continue;
6455 if (!TE.ReuseShuffleIndices.empty())
6456 K = TE.ReuseShuffleIndices[K];
6457 if (K == PoisonMaskElem)
6458 continue;
6459 if (!TE.ReorderIndices.empty())
6460 K = std::distance(TE.ReorderIndices.begin(),
6461 find(TE.ReorderIndices, K));
6462 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6463 if (!EI)
6464 continue;
6465 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6466 ->getElementCount()
6467 .getKnownMinValue());
6468 }
6469 return VF;
6470 });
6471 // Check special corner case - single shuffle of the same entry.
6472 if (GatherShuffles.size() == 1 && NumParts != 1) {
6473 if (ShuffledSubMasks.any())
6474 return std::nullopt;
6475 PartSz = NumScalars;
6476 NumParts = 1;
6477 }
6478 if (!Entries.empty())
6479 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6480 if (!GatherShuffles[I])
6481 return 0U;
6482 return std::max(Entries[I].front()->getVectorFactor(),
6483 Entries[I].back()->getVectorFactor());
6484 });
6485 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6486 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6487 return std::nullopt;
6488 return std::move(CurrentOrder);
6489}
6490
6491static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6492 const TargetLibraryInfo &TLI,
6493 bool CompareOpcodes = true) {
6496 return false;
6497 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6498 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6499 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6500 (!GEP2 || GEP2->getNumOperands() == 2) &&
6501 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6502 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6503 !CompareOpcodes ||
6504 (GEP1 && GEP2 &&
6505 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6506}
6507
6508/// Calculates minimal alignment as a common alignment.
6509template <typename T>
6511 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6512 for (Value *V : VL)
6513 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6514 return CommonAlignment;
6515}
6516
6517/// Check if \p Order represents reverse order.
6519 assert(!Order.empty() &&
6520 "Order is empty. Please check it before using isReverseOrder.");
6521 unsigned Sz = Order.size();
6522 return all_of(enumerate(Order), [&](const auto &Pair) {
6523 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6524 });
6525}
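// Illustrative values: for Sz == 4, Order == {3, 2, 1, 0} is a reverse order,
// and so is {4, 2, 1, 0}, since entries equal to Sz mark unused lanes and are
// accepted by the check above.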
6526
6527/// Checks if the provided list of pointers \p PointerOps represents strided
6528/// pointers for type ElemTy. If they are not, nullptr is returned.
6529/// Otherwise, the SCEV of the stride value is returned.
6530static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6531 const DataLayout &DL, ScalarEvolution &SE,
6532 SmallVectorImpl<unsigned> &SortedIndices) {
6534 const SCEV *PtrSCEVLowest = nullptr;
6535 const SCEV *PtrSCEVHighest = nullptr;
6536 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6537 // addresses).
6538 for (Value *Ptr : PointerOps) {
6539 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6540 if (!PtrSCEV)
6541 return nullptr;
6542 SCEVs.push_back(PtrSCEV);
6543 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6544 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6545 continue;
6546 }
6547 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6548 if (isa<SCEVCouldNotCompute>(Diff))
6549 return nullptr;
6550 if (Diff->isNonConstantNegative()) {
6551 PtrSCEVLowest = PtrSCEV;
6552 continue;
6553 }
6554 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6555 if (isa<SCEVCouldNotCompute>(Diff1))
6556 return nullptr;
6557 if (Diff1->isNonConstantNegative()) {
6558 PtrSCEVHighest = PtrSCEV;
6559 continue;
6560 }
6561 }
6562 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6563 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6564 if (isa<SCEVCouldNotCompute>(Dist))
6565 return nullptr;
6566 int Size = DL.getTypeStoreSize(ElemTy);
6567 auto TryGetStride = [&](const SCEV *Dist,
6568 const SCEV *Multiplier) -> const SCEV * {
6569 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6570 if (M->getOperand(0) == Multiplier)
6571 return M->getOperand(1);
6572 if (M->getOperand(1) == Multiplier)
6573 return M->getOperand(0);
6574 return nullptr;
6575 }
6576 if (Multiplier == Dist)
6577 return SE.getConstant(Dist->getType(), 1);
6578 return SE.getUDivExactExpr(Dist, Multiplier);
6579 };
6580  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6581 const SCEV *Stride = nullptr;
6582 if (Size != 1 || SCEVs.size() > 2) {
6583 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6584 Stride = TryGetStride(Dist, Sz);
6585 if (!Stride)
6586 return nullptr;
6587 }
6588 if (!Stride || isa<SCEVConstant>(Stride))
6589 return nullptr;
6590 // Iterate through all pointers and check if all distances are
6591  // unique multiples of Stride.
6592 using DistOrdPair = std::pair<int64_t, int>;
6593 auto Compare = llvm::less_first();
6594 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6595 int Cnt = 0;
6596 bool IsConsecutive = true;
6597 for (const SCEV *PtrSCEV : SCEVs) {
6598 unsigned Dist = 0;
6599 if (PtrSCEV != PtrSCEVLowest) {
6600 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6601 const SCEV *Coeff = TryGetStride(Diff, Stride);
6602 if (!Coeff)
6603 return nullptr;
6604 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6605 if (!SC || isa<SCEVCouldNotCompute>(SC))
6606 return nullptr;
6607 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6608 SE.getMulExpr(Stride, SC)))
6609 ->isZero())
6610 return nullptr;
6611 Dist = SC->getAPInt().getZExtValue();
6612 }
6613 // If the strides are not the same or repeated, we can't vectorize.
6614 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6615 return nullptr;
6616 auto Res = Offsets.emplace(Dist, Cnt);
6617 if (!Res.second)
6618 return nullptr;
6619 // Consecutive order if the inserted element is the last one.
6620 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6621 ++Cnt;
6622 }
6623 if (Offsets.size() != SCEVs.size())
6624 return nullptr;
6625 SortedIndices.clear();
6626 if (!IsConsecutive) {
6627 // Fill SortedIndices array only if it is non-consecutive.
6628 SortedIndices.resize(PointerOps.size());
6629 Cnt = 0;
6630 for (const std::pair<int64_t, int> &Pair : Offsets) {
6631 SortedIndices[Cnt] = Pair.second;
6632 ++Cnt;
6633 }
6634 }
6635 return Stride;
6636}
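// A worked example (hypothetical values): for i8 pointers %p, %p + %s,
// %p + 2 * %s and %p + 3 * %s, where %s is only known at run time,
// Dist == 3 * %s and TryGetStride(Dist, 3) recovers %s. Every pointer offset
// is then a distinct in-range multiple of %s, the offsets {0, 1, 2, 3} are
// already consecutive, so SortedIndices stays empty and the SCEV of %s is
// returned. A stride that folds to a compile-time constant is rejected here
// on purpose and left to the constant-stride analysis.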
6637
6638static std::pair<InstructionCost, InstructionCost>
6640 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6641 Type *ScalarTy, VectorType *VecTy);
6642
6643/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6644/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6645/// subvector pattern.
6646static InstructionCost
6648 VectorType *Tp, ArrayRef<int> Mask = {},
6650 int Index = 0, VectorType *SubTp = nullptr,
6652 VectorType *DstTy = Tp;
6653 if (!Mask.empty())
6654 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6655
6656 if (Kind != TTI::SK_PermuteTwoSrc)
6657 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6658 Args);
6659 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6660 int NumSubElts;
6662 Mask, NumSrcElts, NumSubElts, Index)) {
6663 if (Index + NumSubElts > NumSrcElts &&
6664 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6665 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6666 TTI::TCK_RecipThroughput, Index, Tp);
6667 }
6668 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6669 Args);
6670}
6671
6672/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6673/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6674/// instead of a scalar.
6675static InstructionCost
6677 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6678 bool Extract, TTI::TargetCostKind CostKind,
6679 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6681 "ScalableVectorType is not supported.");
6682 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6683 getNumElements(Ty) &&
6684 "Incorrect usage.");
6685 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6686 assert(SLPReVec && "Only supported by REVEC.");
6687 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6688 // of CreateInsertElement.
6689 unsigned ScalarTyNumElements = VecTy->getNumElements();
6690 InstructionCost Cost = 0;
6691 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6692 if (!DemandedElts[I])
6693 continue;
6694 if (Insert)
6696 I * ScalarTyNumElements, VecTy);
6697 if (Extract)
6699 I * ScalarTyNumElements, VecTy);
6700 }
6701 return Cost;
6702 }
6703 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6704 CostKind, ForPoisonSrc, VL);
6705}
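// Illustrative REVEC case (hypothetical types): with ScalarTy == <2 x float>,
// Ty == <8 x float> and DemandedElts == 0b0101, the loop above charges one
// subvector insert (or extract) of a <2 x float> at lane offsets 0 and 4
// instead of per-scalar insert/extract costs.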
6706
6707/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6708/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6710 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6711 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6712 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6713 if (Opcode == Instruction::ExtractElement) {
6714 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6715 assert(SLPReVec && "Only supported by REVEC.");
6716 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6718 cast<VectorType>(Val), {}, CostKind,
6719 Index * VecTy->getNumElements(), VecTy);
6720 }
6721 }
6722 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6723 ScalarUserAndIdx);
6724}
6725
6726/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6727/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6729 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6730 VectorType *VecTy, unsigned Index,
6732 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6733 assert(SLPReVec && "Only supported by REVEC.");
6734 auto *SubTp =
6735 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6737 Index * ScalarTy->getNumElements(), SubTp) +
6738 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6739 CostKind);
6740 }
6741 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6742}
6743
6744/// Creates a subvector insert. Generates the shuffle using \p Generator or a
6745/// default shuffle.
6747 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6748 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6749 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6750 return Vec;
6751 const unsigned SubVecVF = getNumElements(V->getType());
6752  // Create a shuffle; insertvector requires that the index is a multiple of
6753  // the subvector length.
6754 const unsigned VecVF = getNumElements(Vec->getType());
6755  SmallVector<int> Mask(VecVF, PoisonMaskElem);
6756  if (isa<PoisonValue>(Vec)) {
6757 auto *Begin = std::next(Mask.begin(), Index);
6758 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6759 Vec = Builder.CreateShuffleVector(V, Mask);
6760 return Vec;
6761 }
6762 std::iota(Mask.begin(), Mask.end(), 0);
6763 std::iota(std::next(Mask.begin(), Index),
6764 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6765 if (Generator)
6766 return Generator(Vec, V, Mask);
6767 // 1. Resize V to the size of Vec.
6768 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6769 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6770 V = Builder.CreateShuffleVector(V, ResizeMask);
6771 // 2. Insert V into Vec.
6772 return Builder.CreateShuffleVector(Vec, V, Mask);
6773}
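// A worked example for the subvector-insert helper above (hypothetical types):
// with Vec of type <8 x i32> (non-poison), V of type <4 x i32> and Index == 4,
// the mask becomes {0, 1, 2, 3, 8, 9, 10, 11}; V is first widened to 8 lanes
// by the resize shuffle, and the final two-source shuffle keeps lanes 0-3 of
// Vec while placing V into lanes 4-7.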
6774
6775/// Generates subvector extract using \p Generator or using default shuffle.
6777 unsigned SubVecVF, unsigned Index) {
6778 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6779 std::iota(Mask.begin(), Mask.end(), Index);
6780 return Builder.CreateShuffleVector(Vec, Mask);
6781}
6782
6783/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6784/// with \p Order.
6785/// \return true if the mask represents strided access, false otherwise.
6787 ArrayRef<unsigned> Order, Type *ScalarTy,
6788 const DataLayout &DL, ScalarEvolution &SE,
6789 SmallVectorImpl<int> &CompressMask) {
6790 const unsigned Sz = PointerOps.size();
6791 CompressMask.assign(Sz, PoisonMaskElem);
6792  // The first element is always set.
6793 CompressMask[0] = 0;
6794 // Check if the mask represents strided access.
6795 std::optional<unsigned> Stride = 0;
6796 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6797 for (unsigned I : seq<unsigned>(1, Sz)) {
6798 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6799 std::optional<int64_t> OptPos =
6800 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6801 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6802 return false;
6803 unsigned Pos = static_cast<unsigned>(*OptPos);
6804 CompressMask[I] = Pos;
6805 if (!Stride)
6806 continue;
6807 if (*Stride == 0) {
6808 *Stride = Pos;
6809 continue;
6810 }
6811 if (Pos != *Stride * I)
6812 Stride.reset();
6813 }
6814 return Stride.has_value();
6815}
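// A worked example (hypothetical offsets): for element offsets {0, 2, 4, 6}
// relative to the first pointer, CompressMask becomes {0, 2, 4, 6} and the
// function returns true (stride 2). For offsets {0, 1, 3, 6} the mask is
// {0, 1, 3, 6} and the function returns false, since the distances are not a
// constant multiple of the element index.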
6816
6817/// Checks if the \p VL can be transformed to a (masked)load + compress or
6818/// (masked) interleaved load.
6820 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6823 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6824 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6825 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6826 VectorType *&LoadVecTy) {
6827 InterleaveFactor = 0;
6828 Type *ScalarTy = VL.front()->getType();
6829 const size_t Sz = VL.size();
6830 auto *VecTy = getWidenedType(ScalarTy, Sz);
6832 SmallVector<int> Mask;
6833 if (!Order.empty())
6834 inversePermutation(Order, Mask);
6835 // Check external uses.
6836 for (const auto [I, V] : enumerate(VL)) {
6837 if (AreAllUsersVectorized(V))
6838 continue;
6839 InstructionCost ExtractCost =
6840 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6841 Mask.empty() ? I : Mask[I]);
6842 InstructionCost ScalarCost =
6843 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6844 if (ExtractCost <= ScalarCost)
6845 return false;
6846 }
6847 Value *Ptr0;
6848 Value *PtrN;
6849 if (Order.empty()) {
6850 Ptr0 = PointerOps.front();
6851 PtrN = PointerOps.back();
6852 } else {
6853 Ptr0 = PointerOps[Order.front()];
6854 PtrN = PointerOps[Order.back()];
6855 }
6856 std::optional<int64_t> Diff =
6857 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6858 if (!Diff)
6859 return false;
6860 const size_t MaxRegSize =
6862 .getFixedValue();
6863 // Check for very large distances between elements.
6864 if (*Diff / Sz >= MaxRegSize / 8)
6865 return false;
6866 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6867 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6868 Align CommonAlignment = LI->getAlign();
6869 IsMasked = !isSafeToLoadUnconditionally(
6870 Ptr0, LoadVecTy, CommonAlignment, DL,
6871 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6872 &TLI);
6873 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6874 LI->getPointerAddressSpace()))
6875 return false;
6876 // TODO: perform the analysis of each scalar load for better
6877 // safe-load-unconditionally analysis.
6878 bool IsStrided =
6879 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6880 assert(CompressMask.size() >= 2 && "At least two elements are required");
6881 SmallVector<Value *> OrderedPointerOps(PointerOps);
6882 if (!Order.empty())
6883 reorderScalars(OrderedPointerOps, Mask);
6884 auto [ScalarGEPCost, VectorGEPCost] =
6885 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6886 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6887 // The cost of scalar loads.
6888 InstructionCost ScalarLoadsCost =
6889 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6890 [&](InstructionCost C, Value *V) {
6891 return C + TTI.getInstructionCost(cast<Instruction>(V),
6892 CostKind);
6893 }) +
6894 ScalarGEPCost;
6895 APInt DemandedElts = APInt::getAllOnes(Sz);
6896 InstructionCost GatherCost =
6897 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6898 /*Insert=*/true,
6899 /*Extract=*/false, CostKind) +
6900 ScalarLoadsCost;
6901 InstructionCost LoadCost = 0;
6902 if (IsMasked) {
6903 LoadCost = TTI.getMaskedMemoryOpCost({Intrinsic::masked_load, LoadVecTy,
6904 CommonAlignment,
6905 LI->getPointerAddressSpace()},
6906 CostKind);
6907 } else {
6908 LoadCost =
6909 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6910 LI->getPointerAddressSpace(), CostKind);
6911 }
6912 if (IsStrided && !IsMasked && Order.empty()) {
6913 // Check for potential segmented(interleaved) loads.
6914 VectorType *AlignedLoadVecTy = getWidenedType(
6915 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6916 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6917 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6918 &TLI))
6919 AlignedLoadVecTy = LoadVecTy;
6920 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6921 CommonAlignment,
6922 LI->getPointerAddressSpace())) {
6923 InstructionCost InterleavedCost =
6924 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6925 Instruction::Load, AlignedLoadVecTy,
6926 CompressMask[1], {}, CommonAlignment,
6927 LI->getPointerAddressSpace(), CostKind, IsMasked);
6928 if (InterleavedCost < GatherCost) {
6929 InterleaveFactor = CompressMask[1];
6930 LoadVecTy = AlignedLoadVecTy;
6931 return true;
6932 }
6933 }
6934 }
6935 InstructionCost CompressCost = ::getShuffleCost(
6936 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6937 if (!Order.empty()) {
6938 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6939 for (unsigned I : seq<unsigned>(Sz)) {
6940 NewMask[I] = CompressMask[Mask[I]];
6941 }
6942 CompressMask.swap(NewMask);
6943 }
6944 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6945 return TotalVecCost < GatherCost;
6946}
6947
6948/// Checks if the \p VL can be transformed to a (masked)load + compress or
6949/// (masked) interleaved load.
6950static bool
6953 const DataLayout &DL, ScalarEvolution &SE,
6954 AssumptionCache &AC, const DominatorTree &DT,
6955 const TargetLibraryInfo &TLI,
6956 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6957 bool IsMasked;
6958 unsigned InterleaveFactor;
6959 SmallVector<int> CompressMask;
6960 VectorType *LoadVecTy;
6961 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6962 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6963 CompressMask, LoadVecTy);
6964}
6965
6966/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6967/// PointerOps:
6968/// 1. Target with strided load support is detected.
6969/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6970/// potential stride <= MaxProfitableLoadStride and the potential stride is
6971/// power-of-2 (to avoid perf regressions for the very small number of loads)
6972/// and max distance > number of loads, or potential stride is -1.
6973/// 3. The loads are ordered, or number of unordered loads <=
6974/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6975/// to avoid extra costs for very expensive shuffles).
6976/// 4. Any pointer operand is an instruction with users outside of the current
6977///    graph (for masked gathers extra extractelement instructions might be
6978///    required); see the worked example after the function body below.
6979bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
6980                            Align Alignment, const int64_t Diff,
6981 const size_t Sz) const {
6982 if (Diff % (Sz - 1) != 0)
6983 return false;
6984
6985 // Try to generate strided load node.
6986 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6987 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6988 return !isVectorized(U) && !MustGather.contains(U);
6989 });
6990 });
6991
6992 const uint64_t AbsoluteDiff = std::abs(Diff);
6993 auto *VecTy = getWidenedType(ScalarTy, Sz);
6994 if (IsAnyPointerUsedOutGraph ||
6995 (AbsoluteDiff > Sz &&
6997 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6998 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6999 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7000 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7001 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7002 return false;
7003 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7004 return false;
7005 return true;
7006 }
7007 return false;
7008}
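// A worked example (hypothetical values): for Sz == 5 loads at element offsets
// {0, 5, 10, 15, 20}, Diff == 20 is divisible by Sz - 1, AbsoluteDiff / Sz == 4
// is a power of two, and the implied stride is 5 elements; provided the
// distance also stays within MaxProfitableLoadStride * Sz and the target
// reports the strided load/store as legal, the candidate is accepted. Loads in
// exactly reversed consecutive order (Diff == -(Sz - 1)) are accepted by the
// same path with stride -1.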
7009
7010bool BoUpSLP::analyzeConstantStrideCandidate(
7011    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7012 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7013 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7014 const size_t Sz = PointerOps.size();
7015 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7016 // Go through `PointerOps` in sorted order and record offsets from `Ptr0`.
7017 for (unsigned I : seq<unsigned>(Sz)) {
7018 Value *Ptr =
7019 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7020 SortedOffsetsFromBase[I] =
7021 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
7022 }
7023
7024 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7025 // ```
7026 // [
7027 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7028  // (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7029 // ...
7030 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7031 // GroupSize - 1}), // last group
7032 // ]
7033 // ```
7034  // The distances between consecutive elements within each group should all
7035  // be the same `StrideWithinGroup`. The distances between the first elements
7036  // of consecutive groups should all be the same `StrideBetweenGroups`.
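  // For example (hypothetical offsets), sorted element offsets
  // {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27} give
  // StrideWithinGroup == 1, GroupSize == 4 and StrideBetweenGroups == 8:
  // four groups of four consecutive narrow loads that may be widened into
  // four strided loads of a 4x wider integer element type, subject to the
  // legality and profitability checks below.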
7037
7038 int64_t StrideWithinGroup =
7039 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7040 // Determine size of the first group. Later we will check that all other
7041 // groups have the same size.
7042 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7043 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7044 StrideWithinGroup;
7045 };
7046 auto Indices = seq<unsigned>(1, Sz);
7047 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7048 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7049
7050 unsigned VecSz = Sz;
7051 Type *NewScalarTy = ScalarTy;
7052
7053 // Quick detour: at this point we can say what the type of strided load would
7054 // be if all the checks pass. Check if this type is legal for the target.
7055 bool NeedsWidening = Sz != GroupSize;
7056 if (NeedsWidening) {
7057 if (Sz % GroupSize != 0)
7058 return false;
7059
7060 if (StrideWithinGroup != 1)
7061 return false;
7062 VecSz = Sz / GroupSize;
7063 NewScalarTy = Type::getIntNTy(
7064 SE->getContext(),
7065 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7066 }
7067
7068 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7069 return false;
7070
7071 int64_t StrideIntVal = StrideWithinGroup;
7072 if (NeedsWidening) {
7073 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7074 // Check that the strides between groups are all the same.
7075 unsigned CurrentGroupStartIdx = GroupSize;
7076 int64_t StrideBetweenGroups =
7077 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7078 StrideIntVal = StrideBetweenGroups;
7079 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7080 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7081 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7082 StrideBetweenGroups)
7083 return false;
7084 }
7085
7086 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7087 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7088 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7089 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7090 return GroupEndIdx - StartIdx == GroupSize;
7091 };
7092 for (unsigned I = 0; I < Sz; I += GroupSize) {
7093 if (!CheckGroup(I))
7094 return false;
7095 }
7096 }
7097
7098 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7099 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
7100 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7101 return true;
7102}
7103
7104bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
7105                                       Type *ScalarTy, Align CommonAlignment,
7106 SmallVectorImpl<unsigned> &SortedIndices,
7107 StridedPtrInfo &SPtrInfo) const {
7108 const unsigned Sz = PointerOps.size();
7109 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
7110 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7111 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7112 return false;
7113 if (const SCEV *Stride =
7114 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
7115 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
7116 SPtrInfo.StrideSCEV = Stride;
7117 return true;
7118 }
7119 return false;
7120}
7121
7122BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
7123    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7124 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7125 unsigned *BestVF, bool TryRecursiveCheck) const {
7126 // Check that a vectorized load would load the same memory as a scalar
7127 // load. For example, we don't want to vectorize loads that are smaller
7128  // than 8-bit. Even if we have a packed struct {<i2, i2, i2, i2>}, LLVM
7129  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7130 // from such a struct, we read/write packed bits disagreeing with the
7131 // unvectorized version.
7132 if (BestVF)
7133 *BestVF = 0;
7135 return LoadsState::Gather;
7136 Type *ScalarTy = VL0->getType();
7137
7138 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7139 return LoadsState::Gather;
7140
7141 // Make sure all loads in the bundle are simple - we can't vectorize
7142 // atomic or volatile loads.
7143 PointerOps.clear();
7144 const size_t Sz = VL.size();
7145 PointerOps.resize(Sz);
7146 auto *POIter = PointerOps.begin();
7147 for (Value *V : VL) {
7148 auto *L = dyn_cast<LoadInst>(V);
7149 if (!L || !L->isSimple())
7150 return LoadsState::Gather;
7151 *POIter = L->getPointerOperand();
7152 ++POIter;
7153 }
7154
7155 Order.clear();
7156 // Check the order of pointer operands or that all pointers are the same.
7157 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7158
7159 auto *VecTy = getWidenedType(ScalarTy, Sz);
7160 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7161 if (!IsSorted) {
7162 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7163 SPtrInfo))
7164      return LoadsState::StridedVectorize;
7165
7166 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7167 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7168 return LoadsState::Gather;
7169
7170 if (!all_of(PointerOps, [&](Value *P) {
7171 return arePointersCompatible(P, PointerOps.front(), *TLI);
7172 }))
7173 return LoadsState::Gather;
7174
7175 } else {
7176 Value *Ptr0;
7177 Value *PtrN;
7178 if (Order.empty()) {
7179 Ptr0 = PointerOps.front();
7180 PtrN = PointerOps.back();
7181 } else {
7182 Ptr0 = PointerOps[Order.front()];
7183 PtrN = PointerOps[Order.back()];
7184 }
7185 std::optional<int64_t> Diff =
7186 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7187 // Check that the sorted loads are consecutive.
7188 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7189 return LoadsState::Vectorize;
7190 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7191 *TLI, [&](Value *V) {
7192 return areAllUsersVectorized(
7193 cast<Instruction>(V), UserIgnoreList);
7194 }))
7195      return LoadsState::CompressVectorize;
7196    Align Alignment =
7197 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7198 ->getAlign();
7199 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7200 *Diff, Ptr0, PtrN, SPtrInfo))
7201      return LoadsState::StridedVectorize;
7202  }
7203 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7204 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7205 return LoadsState::Gather;
7206  // Compare the cost of loads + shuffles against strided/masked gather loads.
7207  // Returns true if the vectorized loads + shuffles representation is better
7208  // than just a gather.
7209 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7210 unsigned *BestVF,
7211 bool ProfitableGatherPointers) {
7212 if (BestVF)
7213 *BestVF = 0;
7214 // Compare masked gather cost and loads + insert subvector costs.
7216 auto [ScalarGEPCost, VectorGEPCost] =
7217 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7218 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7219 // Estimate the cost of masked gather GEP. If not a splat, roughly
7220 // estimate as a buildvector, otherwise estimate as splat.
7221 APInt DemandedElts = APInt::getAllOnes(Sz);
7222 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7223 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7224 if (static_cast<unsigned>(count_if(
7225 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7226 any_of(PointerOps, [&](Value *V) {
7227 return getUnderlyingObject(V) !=
7228 getUnderlyingObject(PointerOps.front());
7229 }))
7230 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7231 DemandedElts, /*Insert=*/true,
7232 /*Extract=*/false, CostKind);
7233 else
7234 VectorGEPCost +=
7236 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7237 /*Insert=*/true, /*Extract=*/false, CostKind) +
7238 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7239 // The cost of scalar loads.
7240 InstructionCost ScalarLoadsCost =
7241 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7242 [&](InstructionCost C, Value *V) {
7243 return C + TTI.getInstructionCost(
7245 }) +
7246 ScalarGEPCost;
7247 // The cost of masked gather.
7248 InstructionCost MaskedGatherCost =
7249 TTI.getGatherScatterOpCost(
7250 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7251 /*VariableMask=*/false, CommonAlignment, CostKind) +
7252 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7253 InstructionCost GatherCost =
7254 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7255 /*Insert=*/true,
7256 /*Extract=*/false, CostKind) +
7257 ScalarLoadsCost;
7258    // The list of loads is small, or we already performed a partial check -
7259    // directly compare masked gather cost and gather cost.
7260 constexpr unsigned ListLimit = 4;
7261 if (!TryRecursiveCheck || VL.size() < ListLimit)
7262 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7263
7264 // FIXME: The following code has not been updated for non-power-of-2
7265 // vectors (and not whole registers). The splitting logic here does not
7266 // cover the original vector if the vector factor is not a power of two.
7267 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7268 return false;
7269
7270 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7271 unsigned MinVF = getMinVF(2 * Sz);
7272 DemandedElts.clearAllBits();
7273 // Iterate through possible vectorization factors and check if vectorized +
7274 // shuffles is better than just gather.
7275 for (unsigned VF =
7276 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7277 VF >= MinVF;
7278 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7280 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7281 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7283 SmallVector<Value *> PointerOps;
7284 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7285 PointerOps, SPtrInfo, BestVF,
7286 /*TryRecursiveCheck=*/false);
7287 // Check that the sorted loads are consecutive.
7288 if (LS == LoadsState::Gather) {
7289 if (BestVF) {
7290 DemandedElts.setAllBits();
7291 break;
7292 }
7293 DemandedElts.setBits(Cnt, Cnt + VF);
7294 continue;
7295 }
7296        // If the reorder is needed - consider it as a high-cost masked gather
7297        // for now.
7297 if ((LS == LoadsState::Vectorize ||
7300 !Order.empty() && !isReverseOrder(Order))
7302 States.push_back(LS);
7303 }
7304 if (DemandedElts.isAllOnes())
7305 // All loads gathered - try smaller VF.
7306 continue;
7307      // Can be vectorized later as a series of loads/insertelements.
7308 InstructionCost VecLdCost = 0;
7309 if (!DemandedElts.isZero()) {
7310 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7311 /*Insert=*/true,
7312 /*Extract=*/false, CostKind) +
7313 ScalarGEPCost;
7314 for (unsigned Idx : seq<unsigned>(VL.size()))
7315 if (DemandedElts[Idx])
7316 VecLdCost +=
7317 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7318 }
7319 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7320 for (auto [I, LS] : enumerate(States)) {
7321 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7322 InstructionCost VectorGEPCost =
7323 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7324 ? 0
7325 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7326 LI0->getPointerOperand(),
7327 Instruction::GetElementPtr, CostKind, ScalarTy,
7328 SubVecTy)
7329 .second;
7330 if (LS == LoadsState::ScatterVectorize) {
7331 if (static_cast<unsigned>(
7332 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7333 PointerOps.size() - 1 ||
7334 any_of(PointerOps, [&](Value *V) {
7335 return getUnderlyingObject(V) !=
7336 getUnderlyingObject(PointerOps.front());
7337 }))
7338 VectorGEPCost += getScalarizationOverhead(
7339 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7340 /*Insert=*/true, /*Extract=*/false, CostKind);
7341 else
7342 VectorGEPCost +=
7344 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7345 /*Insert=*/true, /*Extract=*/false, CostKind) +
7346 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7347 CostKind);
7348 }
7349 switch (LS) {
7351 VecLdCost +=
7352 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7353 LI0->getPointerAddressSpace(), CostKind,
7355 VectorGEPCost;
7356 break;
7358 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7359 LI0->getPointerOperand(),
7360 /*VariableMask=*/false,
7361 CommonAlignment, CostKind) +
7362 VectorGEPCost;
7363 break;
7365 VecLdCost += TTI.getMaskedMemoryOpCost(
7366 {Intrinsic::masked_load, SubVecTy, CommonAlignment,
7367 LI0->getPointerAddressSpace()},
7368 CostKind) +
7369 VectorGEPCost +
7371 {}, CostKind);
7372 break;
7374 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7375 LI0->getPointerOperand(),
7376 /*VariableMask=*/false,
7377 CommonAlignment, CostKind) +
7378 VectorGEPCost;
7379 break;
7380 case LoadsState::Gather:
7381 // Gathers are already calculated - ignore.
7382 continue;
7383 }
7384 SmallVector<int> ShuffleMask(VL.size());
7385 for (int Idx : seq<int>(0, VL.size()))
7386 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7387 if (I > 0)
7388 VecLdCost +=
7389 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7390 CostKind, I * VF, SubVecTy);
7391 }
7392 // If masked gather cost is higher - better to vectorize, so
7393 // consider it as a gather node. It will be better estimated
7394 // later.
7395 if (MaskedGatherCost >= VecLdCost &&
7396 VecLdCost - GatherCost < -SLPCostThreshold) {
7397 if (BestVF)
7398 *BestVF = VF;
7399 return true;
7400 }
7401 }
7402 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7403 };
7404 // TODO: need to improve analysis of the pointers, if not all of them are
7405 // GEPs or have > 2 operands, we end up with a gather node, which just
7406 // increases the cost.
7407 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7408 bool ProfitableGatherPointers =
7409 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7410 return L->isLoopInvariant(V);
7411 })) <= Sz / 2;
7412 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7414 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7415 (GEP && GEP->getNumOperands() == 2 &&
7416 isa<Constant, Instruction>(GEP->getOperand(1)));
7417 })) {
7418 // Check if potential masked gather can be represented as series
7419 // of loads + insertsubvectors.
7420 // If masked gather cost is higher - better to vectorize, so
7421 // consider it as a gather node. It will be better estimated
7422 // later.
7423 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7424 ProfitableGatherPointers))
7426 }
7427
7428 return LoadsState::Gather;
7429}
7430
7432 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7433 const DataLayout &DL, ScalarEvolution &SE,
7434 SmallVectorImpl<unsigned> &SortedIndices) {
7435 assert(
7436 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7437 "Expected list of pointer operands.");
7438  // Map from bases to a vector of (Ptr, Offset, OrigIdx). We insert each Ptr
7439  // into it, sort, and return the sorted indices so that related values end
7440  // up next to one another.
7442 std::pair<BasicBlock *, Value *>,
7444 Bases;
7445 Bases
7446 .try_emplace(std::make_pair(
7448 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7449
7450 SortedIndices.clear();
7451 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7452 auto Key = std::make_pair(BBs[Cnt + 1],
7454 bool Found = any_of(Bases.try_emplace(Key).first->second,
7455 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7456 std::optional<int64_t> Diff =
7457 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7458 ElemTy, Ptr, DL, SE,
7459 /*StrictCheck=*/true);
7460 if (!Diff)
7461 return false;
7462
7463 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7464 return true;
7465 });
7466
7467 if (!Found) {
7468 // If we haven't found enough to usefully cluster, return early.
7469 if (Bases.size() > VL.size() / 2 - 1)
7470 return false;
7471
7472 // Not found already - add a new Base
7473 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7474 }
7475 }
7476
7477 if (Bases.size() == VL.size())
7478 return false;
7479
7480 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7481 Bases.front().second.size() == VL.size()))
7482 return false;
7483
7484 // For each of the bases sort the pointers by Offset and check if any of the
7485  // bases become consecutively allocated.
7486 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7487 SmallPtrSet<Value *, 13> FirstPointers;
7488 SmallPtrSet<Value *, 13> SecondPointers;
7489 Value *P1 = Ptr1;
7490 Value *P2 = Ptr2;
7491 unsigned Depth = 0;
7492 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7493 if (P1 == P2 || Depth > RecursionMaxDepth)
7494 return false;
7495 FirstPointers.insert(P1);
7496 SecondPointers.insert(P2);
7497 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7498 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7499 ++Depth;
7500 }
7501 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7502 "Unable to find matching root.");
7503 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7504 };
7505 for (auto &Base : Bases) {
7506 for (auto &Vec : Base.second) {
7507 if (Vec.size() > 1) {
7509 int64_t InitialOffset = std::get<1>(Vec[0]);
7510 bool AnyConsecutive =
7511 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7512 return std::get<1>(P.value()) ==
7513 int64_t(P.index()) + InitialOffset;
7514 });
7515        // Fill SortedIndices array only if it looks worthwhile to sort the
7516 // ptrs.
7517 if (!AnyConsecutive)
7518 return false;
7519 }
7520 }
7521 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7522 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7523 });
7524 }
7525
7526 for (auto &T : Bases)
7527 for (const auto &Vec : T.second)
7528 for (const auto &P : Vec)
7529 SortedIndices.push_back(std::get<2>(P));
7530
7531 assert(SortedIndices.size() == VL.size() &&
7532 "Expected SortedIndices to be the size of VL");
7533 return true;
7534}
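// Illustrative sketch of the clustering performed above (assuming two
// hypothetical, distinct underlying objects %p and %q and all pointers in one
// basic block):
//   VL = { %p, %q, %p+1, %q+1, %p+2, %q+2, %p+3, %q+3 }
// Each base forms a single cluster with consecutive offsets {0,1,2,3}, and the
// resulting SortedIndices is {0,2,4,6, 1,3,5,7}, i.e. all %p-based pointers
// first, followed by all %q-based ones.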
7535
7536std::optional<BoUpSLP::OrdersType>
7537BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7538 assert(TE.isGather() && "Expected gather node only.");
7539 Type *ScalarTy = TE.Scalars[0]->getType();
7540
7541   SmallVector<Value *> Ptrs;
7542   Ptrs.reserve(TE.Scalars.size());
7543   SmallVector<BasicBlock *> BBs;
7544   BBs.reserve(TE.Scalars.size());
7545 for (Value *V : TE.Scalars) {
7546 auto *L = dyn_cast<LoadInst>(V);
7547 if (!L || !L->isSimple())
7548 return std::nullopt;
7549 Ptrs.push_back(L->getPointerOperand());
7550 BBs.push_back(L->getParent());
7551 }
7552
7553 BoUpSLP::OrdersType Order;
7554 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7555 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7556 return std::move(Order);
7557 return std::nullopt;
7558}
7559
7560/// Check if two insertelement instructions are from the same buildvector.
7561 static bool areTwoInsertFromSameBuildVector(
7562     InsertElementInst *VU, InsertElementInst *V,
7563     function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7564 // Instructions must be from the same basic blocks.
7565 if (VU->getParent() != V->getParent())
7566 return false;
7567 // Checks if 2 insertelements are from the same buildvector.
7568 if (VU->getType() != V->getType())
7569 return false;
7570 // Multiple used inserts are separate nodes.
7571 if (!VU->hasOneUse() && !V->hasOneUse())
7572 return false;
7573 auto *IE1 = VU;
7574 auto *IE2 = V;
7575 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7576 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7577 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7578 return false;
7579 // Go through the vector operand of insertelement instructions trying to find
7580 // either VU as the original vector for IE2 or V as the original vector for
7581 // IE1.
7582 SmallBitVector ReusedIdx(
7583 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7584 bool IsReusedIdx = false;
7585 do {
7586 if (IE2 == VU && !IE1)
7587 return VU->hasOneUse();
7588 if (IE1 == V && !IE2)
7589 return V->hasOneUse();
7590 if (IE1 && IE1 != V) {
7591 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7592 IsReusedIdx |= ReusedIdx.test(Idx1);
7593 ReusedIdx.set(Idx1);
7594 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7595 IE1 = nullptr;
7596 else
7597 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7598 }
7599 if (IE2 && IE2 != VU) {
7600 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7601 IsReusedIdx |= ReusedIdx.test(Idx2);
7602 ReusedIdx.set(Idx2);
7603 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7604 IE2 = nullptr;
7605 else
7606 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7607 }
7608 } while (!IsReusedIdx && (IE1 || IE2));
7609 return false;
7610}
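// Illustrative sketch of the buildvector walk above, using a hypothetical
// insertelement chain (GetBaseOperand is assumed to return the vector operand
// of each insertelement):
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i32 1
//   %v2 = insertelement <4 x float> %v1,    float %c, i32 2
// Walking the vector operands from %v2 reaches %v1, so the two inserts belong
// to the same buildvector sequence; an insert with extra uses, or one that
// rewrites an already-seen index, terminates the walk with a negative answer.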
7611
7612/// Checks if the specified instruction \p I is an alternate operation for
7613/// the given \p MainOp and \p AltOp instructions.
7614static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7615 Instruction *AltOp,
7616 const TargetLibraryInfo &TLI);
7617
7618std::optional<BoUpSLP::OrdersType>
7619BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7620 bool IgnoreReorder) {
7621   // No need to reorder if we need to shuffle reuses; the node still needs to
7622   // be shuffled.
7623 if (!TE.ReuseShuffleIndices.empty()) {
7624 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7625 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7626 "Reshuffling scalars not yet supported for nodes with padding");
7627
7628 if (isSplat(TE.Scalars))
7629 return std::nullopt;
7630 // Check if reuse shuffle indices can be improved by reordering.
7631     // For this, check that reuse mask is "clustered", i.e. each scalar value
7632 // is used once in each submask of size <number_of_scalars>.
7633 // Example: 4 scalar values.
7634 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7635 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7636 // element 3 is used twice in the second submask.
7637 unsigned Sz = TE.Scalars.size();
7638 if (TE.isGather()) {
7639 if (std::optional<OrdersType> CurrentOrder =
7640 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7641 SmallVector<int> Mask;
7642 fixupOrderingIndices(*CurrentOrder);
7643 inversePermutation(*CurrentOrder, Mask);
7644 ::addMask(Mask, TE.ReuseShuffleIndices);
7645 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7646 unsigned Sz = TE.Scalars.size();
7647 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7648 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7649 if (Idx != PoisonMaskElem)
7650 Res[Idx + K * Sz] = I + K * Sz;
7651 }
7652 return std::move(Res);
7653 }
7654 }
7655 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7656 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7657 2 * TE.getVectorFactor())) == 1)
7658 return std::nullopt;
7659 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7660 return std::nullopt;
7661 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7662 Sz)) {
7663 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7664 if (TE.ReorderIndices.empty())
7665 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7666 else
7667 inversePermutation(TE.ReorderIndices, ReorderMask);
7668 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7669 unsigned VF = ReorderMask.size();
7670 OrdersType ResOrder(VF, VF);
7671 unsigned NumParts = divideCeil(VF, Sz);
7672 SmallBitVector UsedVals(NumParts);
7673 for (unsigned I = 0; I < VF; I += Sz) {
7674 int Val = PoisonMaskElem;
7675 unsigned UndefCnt = 0;
7676 unsigned Limit = std::min(Sz, VF - I);
7677 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7678 [&](int Idx) {
7679 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7680 Val = Idx;
7681 if (Idx == PoisonMaskElem)
7682 ++UndefCnt;
7683 return Idx != PoisonMaskElem && Idx != Val;
7684 }) ||
7685 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7686 UndefCnt > Sz / 2)
7687 return std::nullopt;
7688 UsedVals.set(Val);
7689 for (unsigned K = 0; K < NumParts; ++K) {
7690 unsigned Idx = Val + Sz * K;
7691 if (Idx < VF && I + K < VF)
7692 ResOrder[Idx] = I + K;
7693 }
7694 }
7695 return std::move(ResOrder);
7696 }
7697 unsigned VF = TE.getVectorFactor();
7698     // Try to build the correct order for extractelement instructions.
7699 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7700 TE.ReuseShuffleIndices.end());
7701 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7702 all_of(TE.Scalars, [Sz](Value *V) {
7703 if (isa<PoisonValue>(V))
7704 return true;
7705 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7706 return Idx && *Idx < Sz;
7707 })) {
7708 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7709 "by BinaryOperator and CastInst.");
7710 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7711 if (TE.ReorderIndices.empty())
7712 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7713 else
7714 inversePermutation(TE.ReorderIndices, ReorderMask);
7715 for (unsigned I = 0; I < VF; ++I) {
7716 int &Idx = ReusedMask[I];
7717 if (Idx == PoisonMaskElem)
7718 continue;
7719 Value *V = TE.Scalars[ReorderMask[Idx]];
7720 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7721 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7722 }
7723 }
7724     // Build the order of the VF size; reuse shuffles need to be reordered, as
7725     // they are always of VF size.
7726 OrdersType ResOrder(VF);
7727 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7728 auto *It = ResOrder.begin();
7729 for (unsigned K = 0; K < VF; K += Sz) {
7730 OrdersType CurrentOrder(TE.ReorderIndices);
7731 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7732 if (SubMask.front() == PoisonMaskElem)
7733 std::iota(SubMask.begin(), SubMask.end(), 0);
7734 reorderOrder(CurrentOrder, SubMask);
7735 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7736 std::advance(It, Sz);
7737 }
7738 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7739 return Data.index() == Data.value();
7740 }))
7741 return std::nullopt; // No need to reorder.
7742 return std::move(ResOrder);
7743 }
7744 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7745 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7746 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7747 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7748 return std::nullopt;
7749 if (TE.State == TreeEntry::SplitVectorize ||
7750 ((TE.State == TreeEntry::Vectorize ||
7751 TE.State == TreeEntry::StridedVectorize ||
7752 TE.State == TreeEntry::CompressVectorize) &&
7753        (isa<LoadInst>(TE.getMainOp()) ||
7754         (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7755 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7756 "Alternate instructions are only supported by "
7757 "BinaryOperator and CastInst.");
7758 return TE.ReorderIndices;
7759 }
7760 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7761 TE.isAltShuffle()) {
7762 assert(TE.ReuseShuffleIndices.empty() &&
7763 "ReuseShuffleIndices should be "
7764 "empty for alternate instructions.");
7765 SmallVector<int> Mask;
7766 TE.buildAltOpShuffleMask(
7767 [&](Instruction *I) {
7768 assert(TE.getMatchingMainOpOrAltOp(I) &&
7769 "Unexpected main/alternate opcode");
7770 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7771 },
7772 Mask);
7773 const int VF = TE.getVectorFactor();
7774 OrdersType ResOrder(VF, VF);
7775 for (unsigned I : seq<unsigned>(VF)) {
7776 if (Mask[I] == PoisonMaskElem)
7777 continue;
7778 ResOrder[Mask[I] % VF] = I;
7779 }
7780 return std::move(ResOrder);
7781 }
7782 if (!TE.ReorderIndices.empty())
7783 return TE.ReorderIndices;
7784 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7785 if (!TE.ReorderIndices.empty())
7786 return TE.ReorderIndices;
7787
7788 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7789 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7790 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7791 continue;
7792 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7793 if (!II)
7794 continue;
7795 Instruction *BVHead = nullptr;
7796 BasicBlock *BB = II->getParent();
7797 while (II && II->hasOneUse() && II->getParent() == BB) {
7798 BVHead = II;
7799 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7800 }
7801 I = BVHead;
7802 }
7803
7804 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7805 assert(BB1 != BB2 && "Expected different basic blocks.");
7806 if (!DT->isReachableFromEntry(BB1))
7807 return false;
7808 if (!DT->isReachableFromEntry(BB2))
7809 return true;
7810 auto *NodeA = DT->getNode(BB1);
7811 auto *NodeB = DT->getNode(BB2);
7812 assert(NodeA && "Should only process reachable instructions");
7813 assert(NodeB && "Should only process reachable instructions");
7814 assert((NodeA == NodeB) ==
7815 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7816 "Different nodes should have different DFS numbers");
7817 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7818 };
7819 auto PHICompare = [&](unsigned I1, unsigned I2) {
7820 Value *V1 = TE.Scalars[I1];
7821 Value *V2 = TE.Scalars[I2];
7822 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7823 return false;
7824 if (isa<PoisonValue>(V1))
7825 return true;
7826 if (isa<PoisonValue>(V2))
7827 return false;
7828 if (V1->getNumUses() < V2->getNumUses())
7829 return true;
7830 if (V1->getNumUses() > V2->getNumUses())
7831 return false;
7832 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7833 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7834 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7835 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7836 FirstUserOfPhi2->getParent());
7837 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7838 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7839 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7840 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7841 if (IE1 && !IE2)
7842 return true;
7843 if (!IE1 && IE2)
7844 return false;
7845 if (IE1 && IE2) {
7846 if (UserBVHead[I1] && !UserBVHead[I2])
7847 return true;
7848 if (!UserBVHead[I1])
7849 return false;
7850 if (UserBVHead[I1] == UserBVHead[I2])
7851 return getElementIndex(IE1) < getElementIndex(IE2);
7852 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7853 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7854 UserBVHead[I2]->getParent());
7855 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7856 }
7857 if (EE1 && !EE2)
7858 return true;
7859 if (!EE1 && EE2)
7860 return false;
7861 if (EE1 && EE2) {
7862 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7863 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7864 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7865 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7866 if (!Inst2 && !P2)
7867 return Inst1 || P1;
7868 if (EE1->getOperand(0) == EE2->getOperand(0))
7869 return getElementIndex(EE1) < getElementIndex(EE2);
7870 if (!Inst1 && Inst2)
7871 return false;
7872 if (Inst1 && Inst2) {
7873 if (Inst1->getParent() != Inst2->getParent())
7874 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7875 return Inst1->comesBefore(Inst2);
7876 }
7877 if (!P1 && P2)
7878 return false;
7879 assert(P1 && P2 &&
7880 "Expected either instructions or arguments vector operands.");
7881 return P1->getArgNo() < P2->getArgNo();
7882 }
7883 return false;
7884 };
7885 OrdersType Phis(TE.Scalars.size());
7886 std::iota(Phis.begin(), Phis.end(), 0);
7887 stable_sort(Phis, PHICompare);
7888 if (isIdentityOrder(Phis))
7889 return std::nullopt; // No need to reorder.
7890 return std::move(Phis);
7891 }
7892 if (TE.isGather() &&
7893 (!TE.hasState() || !TE.isAltShuffle() ||
7894 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7895 allSameType(TE.Scalars)) {
7896 // TODO: add analysis of other gather nodes with extractelement
7897 // instructions and other values/instructions, not only undefs.
7898 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7899          (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7900           any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7901 all_of(TE.Scalars, [](Value *V) {
7902 auto *EE = dyn_cast<ExtractElementInst>(V);
7903 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7904 })) {
7905 // Check that gather of extractelements can be represented as
7906 // just a shuffle of a single vector.
7907 OrdersType CurrentOrder;
7908 bool Reuse =
7909 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7910 if (Reuse || !CurrentOrder.empty())
7911 return std::move(CurrentOrder);
7912 }
7913 // If the gather node is <undef, v, .., poison> and
7914 // insertelement poison, v, 0 [+ permute]
7915 // is cheaper than
7916 // insertelement poison, v, n - try to reorder.
7917 // If rotating the whole graph, exclude the permute cost, the whole graph
7918 // might be transformed.
7919 int Sz = TE.Scalars.size();
7920 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7921 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7922 const auto *It = find_if_not(TE.Scalars, isConstant);
7923 if (It == TE.Scalars.begin())
7924 return OrdersType();
7925 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7926 if (It != TE.Scalars.end()) {
7927 OrdersType Order(Sz, Sz);
7928 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7929 Order[Idx] = 0;
7930 fixupOrderingIndices(Order);
7931 SmallVector<int> Mask;
7932 inversePermutation(Order, Mask);
7933 InstructionCost PermuteCost =
7934 TopToBottom
7935 ? 0
7936 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7937 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7938 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7939 PoisonValue::get(Ty), *It);
7940 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7941 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7942 PoisonValue::get(Ty), *It);
7943 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7944 OrdersType Order(Sz, Sz);
7945 Order[Idx] = 0;
7946 return std::move(Order);
7947 }
7948 }
7949 }
7950 if (isSplat(TE.Scalars))
7951 return std::nullopt;
7952 if (TE.Scalars.size() >= 3)
7953 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7954 return Order;
7955     // Check if we can include the order of vectorized loads. For masked gathers
7956     // do extra analysis later, so include such nodes into a special list.
7957 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7958 SmallVector<Value *> PointerOps;
7959 StridedPtrInfo SPtrInfo;
7960 OrdersType CurrentOrder;
7961 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7962 CurrentOrder, PointerOps, SPtrInfo);
7963     if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7964         Res == LoadsState::CompressVectorize)
7965       return std::move(CurrentOrder);
7966 }
7967 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7968     // has been audited for correctness with non-power-of-two vectors.
7969 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7970 if (std::optional<OrdersType> CurrentOrder =
7971 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7972 return CurrentOrder;
7973 }
7974 return std::nullopt;
7975}
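// A small worked example of the order/mask convention used by the reordering
// code above (assuming the usual inversePermutation helper defined earlier in
// this file, i.e. Mask[Order[I]] = I, and that reorderScalars moves element I
// to position Mask[I]): for Order = {1, 2, 3, 0} the inverse mask is
// {3, 0, 1, 2}, so position I of the reordered node receives scalar Order[I]
// and the reordered scalars become {s1, s2, s3, s0}.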
7976
7977/// Checks if the given mask is a "clustered" mask with the same clusters of
7978/// size \p Sz, which are not identity submasks.
7979 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7980                                                unsigned Sz) {
7981 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7982 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7983 return false;
7984 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7985 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7986 if (Cluster != FirstCluster)
7987 return false;
7988 }
7989 return true;
7990}
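// Illustrative example for the check above: with Sz == 4 the mask
// {1,0,3,2, 1,0,3,2} qualifies (every 4-wide cluster equals the first one and
// the first cluster is not the identity), while {0,1,2,3, 0,1,2,3} does not,
// because its first cluster is an identity submask.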
7991
7992void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7993 // Reorder reuses mask.
7994 reorderReuses(TE.ReuseShuffleIndices, Mask);
7995 const unsigned Sz = TE.Scalars.size();
7996   // For vectorized nodes and non-clustered reuses no need to do anything else.
7997 if (!TE.isGather() ||
7998       !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7999                                                    Sz) ||
8000 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8001 return;
8002 SmallVector<int> NewMask;
8003 inversePermutation(TE.ReorderIndices, NewMask);
8004 addMask(NewMask, TE.ReuseShuffleIndices);
8005 // Clear reorder since it is going to be applied to the new mask.
8006 TE.ReorderIndices.clear();
8007 // Try to improve gathered nodes with clustered reuses, if possible.
8008 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8009 SmallVector<unsigned> NewOrder(Slice);
8010 inversePermutation(NewOrder, NewMask);
8011 reorderScalars(TE.Scalars, NewMask);
8012 // Fill the reuses mask with the identity submasks.
8013 for (auto *It = TE.ReuseShuffleIndices.begin(),
8014 *End = TE.ReuseShuffleIndices.end();
8015 It != End; std::advance(It, Sz))
8016 std::iota(It, std::next(It, Sz), 0);
8017}
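// Rough sketch of the effect of reorderNodeWithReuses on a gather node,
// assuming Sz == 4, a vector factor of 8 and a clustered reuse mask such as
// {1,0,3,2, 1,0,3,2}: the per-cluster permutation is folded into the gathered
// scalars themselves, and ReuseShuffleIndices is rewritten to the identity
// submasks {0,1,2,3, 0,1,2,3}.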
8018
8019 static void combineOrders(MutableArrayRef<unsigned> Order,
8020                           ArrayRef<unsigned> SecondaryOrder) {
8021 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8022 "Expected same size of orders");
8023 size_t Sz = Order.size();
8024 SmallBitVector UsedIndices(Sz);
8025 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8026 if (Order[Idx] != Sz)
8027 UsedIndices.set(Order[Idx]);
8028 }
8029 if (SecondaryOrder.empty()) {
8030 for (unsigned Idx : seq<unsigned>(0, Sz))
8031 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8032 Order[Idx] = Idx;
8033 } else {
8034 for (unsigned Idx : seq<unsigned>(0, Sz))
8035 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8036 !UsedIndices.test(SecondaryOrder[Idx]))
8037 Order[Idx] = SecondaryOrder[Idx];
8038 }
8039}
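// Worked example for the routine above, with Sz == 4 and the value 4 marking
// an unset slot: for Order = {4, 1, 4, 3} and an empty SecondaryOrder, slots 0
// and 2 are filled with their own indices (both are still unused), giving the
// identity {0, 1, 2, 3}; with a non-empty SecondaryOrder the unset slots are
// instead taken from it, but only if the proposed value does not already
// appear in Order.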
8040
8043 return false;
8044
8045 constexpr unsigned TinyVF = 2;
8046 constexpr unsigned TinyTree = 10;
8047 constexpr unsigned PhiOpsLimit = 12;
8048 constexpr unsigned GatherLoadsLimit = 2;
8049 if (VectorizableTree.size() <= TinyTree)
8050 return true;
8051 if (VectorizableTree.front()->hasState() &&
8052 !VectorizableTree.front()->isGather() &&
8053 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8054 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8055 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8056 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8057 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8058 VectorizableTree.front()->ReorderIndices.empty()) {
8059     // Check if the tree has only a single store and a single (unordered) load
8060     // node, while the other nodes are phis or geps/binops combined with phis,
8061     // and/or a single gather load node.
8062 if (VectorizableTree.front()->hasState() &&
8063 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8064 VectorizableTree.front()->Scalars.size() == TinyVF &&
8065 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8066 return false;
8067     // Single node which requires reordering - skip.
8068 if (VectorizableTree.front()->hasState() &&
8069 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8070 VectorizableTree.front()->ReorderIndices.empty()) {
8071 const unsigned ReorderedSplitsCnt =
8072 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8073 return TE->State == TreeEntry::SplitVectorize &&
8074 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8075 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8076 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8077 });
8078 if (ReorderedSplitsCnt <= 1 &&
8079 static_cast<unsigned>(count_if(
8080 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8081 return ((!TE->isGather() &&
8082 (TE->ReorderIndices.empty() ||
8083 (TE->UserTreeIndex.UserTE &&
8084 TE->UserTreeIndex.UserTE->State ==
8085 TreeEntry::Vectorize &&
8086 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8087 .empty()))) ||
8088 (TE->isGather() && TE->ReorderIndices.empty() &&
8089 (!TE->hasState() || TE->isAltShuffle() ||
8090 TE->getOpcode() == Instruction::Load ||
8091 TE->getOpcode() == Instruction::ZExt ||
8092 TE->getOpcode() == Instruction::SExt))) &&
8093 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8094 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8095 return !isConstant(V) && isVectorized(V);
8096 }));
8097 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8098 return false;
8099 }
8100 bool HasPhis = false;
8101 bool HasLoad = true;
8102 unsigned GatherLoads = 0;
8103 for (const std::unique_ptr<TreeEntry> &TE :
8104 ArrayRef(VectorizableTree).drop_front()) {
8105 if (TE->State == TreeEntry::SplitVectorize)
8106 continue;
8107 if (!TE->hasState()) {
8108 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8110 continue;
8111 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8113 continue;
8114 return true;
8115 }
8116 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8117 if (!TE->isGather()) {
8118 HasLoad = false;
8119 continue;
8120 }
8121 if (HasLoad)
8122 return true;
8123 ++GatherLoads;
8124 if (GatherLoads >= GatherLoadsLimit)
8125 return true;
8126 }
8127 if (TE->getOpcode() == Instruction::GetElementPtr ||
8128 Instruction::isBinaryOp(TE->getOpcode()))
8129 continue;
8130 if (TE->getOpcode() != Instruction::PHI &&
8131 (!TE->hasCopyableElements() ||
8132 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8133 TE->Scalars.size() / 2))
8134 return true;
8135 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8136 TE->getNumOperands() > PhiOpsLimit)
8137 return false;
8138 HasPhis = true;
8139 }
8140 return !HasPhis;
8141 }
8142 return true;
8143}
8144
8145void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8146 ArrayRef<int> MaskOrder) {
8147 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8148 SmallVector<int> NewMask(getVectorFactor());
8149 SmallVector<int> NewMaskOrder(getVectorFactor());
8150 std::iota(NewMask.begin(), NewMask.end(), 0);
8151 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8152 if (Idx == 0) {
8153 copy(Mask, NewMask.begin());
8154 copy(MaskOrder, NewMaskOrder.begin());
8155 } else {
8156 assert(Idx == 1 && "Expected either 0 or 1 index.");
8157 unsigned Offset = CombinedEntriesWithIndices.back().second;
8158 for (unsigned I : seq<unsigned>(Mask.size())) {
8159 NewMask[I + Offset] = Mask[I] + Offset;
8160 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8161 }
8162 }
8163 reorderScalars(Scalars, NewMask);
8164 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8165 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8166 ReorderIndices.clear();
8167}
8168
8169 void BoUpSLP::reorderTopToBottom() {
8170   // Maps VF to the graph nodes.
8171   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8172   // ExtractElement gather nodes which can be vectorized and need to handle
8173   // their ordering.
8174   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8175 
8176   // Phi nodes can have preferred ordering based on their result users
8177   DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8178 
8179 // AltShuffles can also have a preferred ordering that leads to fewer
8180 // instructions, e.g., the addsub instruction in x86.
8181 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8182
8183 // Maps a TreeEntry to the reorder indices of external users.
8184   DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8185       ExternalUserReorderMap;
8186 // Find all reorderable nodes with the given VF.
8187   // Currently these are vectorized stores, loads, extracts + some gathering of
8188   // extracts.
8189 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8190 const std::unique_ptr<TreeEntry> &TE) {
8191 // Look for external users that will probably be vectorized.
8192 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8193 findExternalStoreUsersReorderIndices(TE.get());
8194 if (!ExternalUserReorderIndices.empty()) {
8195 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8196 ExternalUserReorderMap.try_emplace(TE.get(),
8197 std::move(ExternalUserReorderIndices));
8198 }
8199
8200 // Patterns like [fadd,fsub] can be combined into a single instruction in
8201 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8202 // to take into account their order when looking for the most used order.
8203 if (TE->hasState() && TE->isAltShuffle() &&
8204 TE->State != TreeEntry::SplitVectorize) {
8205 Type *ScalarTy = TE->Scalars[0]->getType();
8206 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8207 unsigned Opcode0 = TE->getOpcode();
8208 unsigned Opcode1 = TE->getAltOpcode();
8209 SmallBitVector OpcodeMask(
8210 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8211 // If this pattern is supported by the target then we consider the order.
8212 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8213 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8214 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8215 }
8216 // TODO: Check the reverse order too.
8217 }
8218
8219 bool IgnoreReorder =
8220 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8221 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8222 VectorizableTree.front()->getOpcode() == Instruction::Store);
8223 if (std::optional<OrdersType> CurrentOrder =
8224 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8225 // Do not include ordering for nodes used in the alt opcode vectorization,
8226       // better to reorder them during bottom-to-top stage. If we follow the order
8227       // here, it causes reordering of the whole graph, though actually it is
8228 // profitable just to reorder the subgraph that starts from the alternate
8229 // opcode vectorization node. Such nodes already end-up with the shuffle
8230 // instruction and it is just enough to change this shuffle rather than
8231 // rotate the scalars for the whole graph.
8232 unsigned Cnt = 0;
8233 const TreeEntry *UserTE = TE.get();
8234 while (UserTE && Cnt < RecursionMaxDepth) {
8235 if (!UserTE->UserTreeIndex)
8236 break;
8237 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8238 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8239 UserTE->UserTreeIndex.UserTE->Idx != 0)
8240 return;
8241 UserTE = UserTE->UserTreeIndex.UserTE;
8242 ++Cnt;
8243 }
8244 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8245 if (!(TE->State == TreeEntry::Vectorize ||
8246 TE->State == TreeEntry::StridedVectorize ||
8247 TE->State == TreeEntry::SplitVectorize ||
8248 TE->State == TreeEntry::CompressVectorize) ||
8249 !TE->ReuseShuffleIndices.empty())
8250 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8251 if (TE->State == TreeEntry::Vectorize &&
8252 TE->getOpcode() == Instruction::PHI)
8253 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8254 }
8255 });
8256
8257 // Reorder the graph nodes according to their vectorization factor.
8258 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8259 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8260 auto It = VFToOrderedEntries.find(VF);
8261 if (It == VFToOrderedEntries.end())
8262 continue;
8263     // Try to find the most profitable order. We are just looking for the most
8264     // used order and reorder scalar elements in the nodes according to this
8265     // most used order.
8266 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8267 // Delete VF entry upon exit.
8268 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8269
8270 // All operands are reordered and used only in this node - propagate the
8271 // most used order to the user node.
8272     MapVector<OrdersType, unsigned,
8273               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8274         OrdersUses;
8275 for (const TreeEntry *OpTE : OrderedEntries) {
8276 // No need to reorder this nodes, still need to extend and to use shuffle,
8277 // just need to merge reordering shuffle and the reuse shuffle.
8278 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8279 OpTE->State != TreeEntry::SplitVectorize)
8280 continue;
8281 // Count number of orders uses.
8282 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8283 &PhisToOrders]() -> const OrdersType & {
8284 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8285 auto It = GathersToOrders.find(OpTE);
8286 if (It != GathersToOrders.end())
8287 return It->second;
8288 }
8289 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8290 auto It = AltShufflesToOrders.find(OpTE);
8291 if (It != AltShufflesToOrders.end())
8292 return It->second;
8293 }
8294 if (OpTE->State == TreeEntry::Vectorize &&
8295 OpTE->getOpcode() == Instruction::PHI) {
8296 auto It = PhisToOrders.find(OpTE);
8297 if (It != PhisToOrders.end())
8298 return It->second;
8299 }
8300 return OpTE->ReorderIndices;
8301 }();
8302 // First consider the order of the external scalar users.
8303 auto It = ExternalUserReorderMap.find(OpTE);
8304 if (It != ExternalUserReorderMap.end()) {
8305 const auto &ExternalUserReorderIndices = It->second;
8306 // If the OpTE vector factor != number of scalars - use natural order,
8307 // it is an attempt to reorder node with reused scalars but with
8308 // external uses.
8309 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8310 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8311 ExternalUserReorderIndices.size();
8312 } else {
8313 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8314 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8315 }
8316 // No other useful reorder data in this entry.
8317 if (Order.empty())
8318 continue;
8319 }
8320 // Stores actually store the mask, not the order, need to invert.
8321 if (OpTE->State == TreeEntry::Vectorize &&
8322 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8323 assert(!OpTE->isAltShuffle() &&
8324 "Alternate instructions are only supported by BinaryOperator "
8325 "and CastInst.");
8326 SmallVector<int> Mask;
8327 inversePermutation(Order, Mask);
8328 unsigned E = Order.size();
8329 OrdersType CurrentOrder(E, E);
8330 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8331 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8332 });
8333 fixupOrderingIndices(CurrentOrder);
8334 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8335 } else {
8336 ++OrdersUses.try_emplace(Order, 0).first->second;
8337 }
8338 }
8339 if (OrdersUses.empty())
8340 continue;
8341 // Choose the most used order.
8342 unsigned IdentityCnt = 0;
8343 unsigned FilledIdentityCnt = 0;
8344 OrdersType IdentityOrder(VF, VF);
8345 for (auto &Pair : OrdersUses) {
8346 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8347 if (!Pair.first.empty())
8348 FilledIdentityCnt += Pair.second;
8349 IdentityCnt += Pair.second;
8350 combineOrders(IdentityOrder, Pair.first);
8351 }
8352 }
8353 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8354 unsigned Cnt = IdentityCnt;
8355 for (auto &Pair : OrdersUses) {
8356 // Prefer identity order. But, if filled identity found (non-empty order)
8357 // with same number of uses, as the new candidate order, we can choose
8358 // this candidate order.
8359 if (Cnt < Pair.second ||
8360 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8361 Cnt == Pair.second && !BestOrder.empty() &&
8362 isIdentityOrder(BestOrder))) {
8363 combineOrders(Pair.first, BestOrder);
8364 BestOrder = Pair.first;
8365 Cnt = Pair.second;
8366 } else {
8367 combineOrders(BestOrder, Pair.first);
8368 }
8369 }
8370 // Set order of the user node.
8371 if (isIdentityOrder(BestOrder))
8372 continue;
8373 fixupOrderingIndices(BestOrder);
8374 SmallVector<int> Mask;
8375 inversePermutation(BestOrder, Mask);
8376 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8377 unsigned E = BestOrder.size();
8378 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8379 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8380 });
8381 // Do an actual reordering, if profitable.
8382 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8383 // Just do the reordering for the nodes with the given VF.
8384 if (TE->Scalars.size() != VF) {
8385 if (TE->ReuseShuffleIndices.size() == VF) {
8386 assert(TE->State != TreeEntry::SplitVectorize &&
8387 "Split vectorized not expected.");
8388 // Need to reorder the reuses masks of the operands with smaller VF to
8389 // be able to find the match between the graph nodes and scalar
8390 // operands of the given node during vectorization/cost estimation.
8391 assert(
8392 (!TE->UserTreeIndex ||
8393 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8394 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8395 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8396 "All users must be of VF size.");
8397 if (SLPReVec) {
8398 assert(SLPReVec && "Only supported by REVEC.");
8399 // ShuffleVectorInst does not do reorderOperands (and it should not
8400 // because ShuffleVectorInst supports only a limited set of
8401 // patterns). Only do reorderNodeWithReuses if the user is not
8402 // ShuffleVectorInst.
8403 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8404 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8405 continue;
8406 }
8407 // Update ordering of the operands with the smaller VF than the given
8408 // one.
8409 reorderNodeWithReuses(*TE, Mask);
8410 // Update orders in user split vectorize nodes.
8411 if (TE->UserTreeIndex &&
8412 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8413 TE->UserTreeIndex.UserTE->reorderSplitNode(
8414 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8415 }
8416 continue;
8417 }
8418 if ((TE->State == TreeEntry::SplitVectorize &&
8419 TE->ReuseShuffleIndices.empty()) ||
8420 ((TE->State == TreeEntry::Vectorize ||
8421 TE->State == TreeEntry::StridedVectorize ||
8422 TE->State == TreeEntry::CompressVectorize) &&
8423          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8424               InsertElementInst>(TE->getMainOp()) ||
8425 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8426 assert(
8427 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8428 TE->ReuseShuffleIndices.empty())) &&
8429 "Alternate instructions are only supported by BinaryOperator "
8430 "and CastInst.");
8431 // Build correct orders for extract{element,value}, loads,
8432 // stores and alternate (split) nodes.
8433 reorderOrder(TE->ReorderIndices, Mask);
8434 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8435 TE->reorderOperands(Mask);
8436 } else {
8437 // Reorder the node and its operands.
8438 TE->reorderOperands(Mask);
8439 assert(TE->ReorderIndices.empty() &&
8440 "Expected empty reorder sequence.");
8441 reorderScalars(TE->Scalars, Mask);
8442 }
8443 if (!TE->ReuseShuffleIndices.empty()) {
8444 // Apply reversed order to keep the original ordering of the reused
8445 // elements to avoid extra reorder indices shuffling.
8446 OrdersType CurrentOrder;
8447 reorderOrder(CurrentOrder, MaskOrder);
8448 SmallVector<int> NewReuses;
8449 inversePermutation(CurrentOrder, NewReuses);
8450 addMask(NewReuses, TE->ReuseShuffleIndices);
8451 TE->ReuseShuffleIndices.swap(NewReuses);
8452 } else if (TE->UserTreeIndex &&
8453 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8454 // Update orders in user split vectorize nodes.
8455 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8456 Mask, MaskOrder);
8457 }
8458 }
8459}
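// Illustrative sketch of the vote above: if, for a given VF, three entries
// report the order {1,0,3,2} and one reports the identity, OrdersUses ends up
// with counts {{1,0,3,2}: 3, identity: 1}; the non-identity order wins, its
// shuffle mask is built via inversePermutation, and every entry of that VF is
// updated accordingly (scalars, operands, or just ReorderIndices, depending
// on the node kind).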
8460
8461void BoUpSLP::buildReorderableOperands(
8462 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8463 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8464 SmallVectorImpl<TreeEntry *> &GatherOps) {
8465 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8466 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8467 return OpData.first == I &&
8468 (OpData.second->State == TreeEntry::Vectorize ||
8469 OpData.second->State == TreeEntry::StridedVectorize ||
8470 OpData.second->State == TreeEntry::CompressVectorize ||
8471 OpData.second->State == TreeEntry::SplitVectorize);
8472 }))
8473 continue;
8474 // Do not request operands, if they do not exist.
8475 if (UserTE->hasState()) {
8476 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8477 UserTE->getOpcode() == Instruction::ExtractValue)
8478 continue;
8479 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8480 continue;
8481 if (UserTE->getOpcode() == Instruction::Store &&
8482 UserTE->State == TreeEntry::Vectorize && I == 1)
8483 continue;
8484 if (UserTE->getOpcode() == Instruction::Load &&
8485 (UserTE->State == TreeEntry::Vectorize ||
8486 UserTE->State == TreeEntry::StridedVectorize ||
8487 UserTE->State == TreeEntry::CompressVectorize))
8488 continue;
8489 }
8490 TreeEntry *TE = getOperandEntry(UserTE, I);
8491 assert(TE && "Expected operand entry.");
8492 if (!TE->isGather()) {
8493 // Add the node to the list of the ordered nodes with the identity
8494 // order.
8495 Edges.emplace_back(I, TE);
8496 // Add ScatterVectorize nodes to the list of operands, where just
8497 // reordering of the scalars is required. Similar to the gathers, so
8498 // simply add to the list of gathered ops.
8499 // If there are reused scalars, process this node as a regular vectorize
8500 // node, just reorder reuses mask.
8501 if (TE->State == TreeEntry::ScatterVectorize &&
8502 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8503 GatherOps.push_back(TE);
8504 continue;
8505 }
8506 if (ReorderableGathers.contains(TE))
8507 GatherOps.push_back(TE);
8508 }
8509}
8510
8511void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8512 struct TreeEntryCompare {
8513 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8514 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8515 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8516 return LHS->Idx < RHS->Idx;
8517 }
8518 };
8519   PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8520   DenseSet<const TreeEntry *> GathersToOrders;
8521 // Find all reorderable leaf nodes with the given VF.
8522   // Currently these are vectorized loads, extracts without alternate operands
8523   // + some gathering of extracts.
8524   SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8525   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8526 if (TE->State != TreeEntry::Vectorize &&
8527 TE->State != TreeEntry::StridedVectorize &&
8528 TE->State != TreeEntry::CompressVectorize &&
8529 TE->State != TreeEntry::SplitVectorize)
8530 NonVectorized.insert(TE.get());
8531 if (std::optional<OrdersType> CurrentOrder =
8532 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8533 Queue.push(TE.get());
8534 if (!(TE->State == TreeEntry::Vectorize ||
8535 TE->State == TreeEntry::StridedVectorize ||
8536 TE->State == TreeEntry::CompressVectorize ||
8537 TE->State == TreeEntry::SplitVectorize) ||
8538 !TE->ReuseShuffleIndices.empty())
8539 GathersToOrders.insert(TE.get());
8540 }
8541 }
8542
8543 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8544   // I.e., if the node has operands that are reordered, try to make at least
8545 // one operand order in the natural order and reorder others + reorder the
8546 // user node itself.
8547 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8548 while (!Queue.empty()) {
8549 // 1. Filter out only reordered nodes.
8550 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8551 TreeEntry *TE = Queue.top();
8552 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8553 Queue.pop();
8554 SmallVector<TreeEntry *> OrderedOps(1, TE);
8555 while (!Queue.empty()) {
8556 TE = Queue.top();
8557 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8558 break;
8559 Queue.pop();
8560 OrderedOps.push_back(TE);
8561 }
8562 for (TreeEntry *TE : OrderedOps) {
8563 if (!(TE->State == TreeEntry::Vectorize ||
8564 TE->State == TreeEntry::StridedVectorize ||
8565 TE->State == TreeEntry::CompressVectorize ||
8566 TE->State == TreeEntry::SplitVectorize ||
8567 (TE->isGather() && GathersToOrders.contains(TE))) ||
8568 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8569 !Visited.insert(TE).second)
8570 continue;
8571       // Build a map between user nodes and their operand order to speed up the
8572 // search. The graph currently does not provide this dependency directly.
8573 Users.first = TE->UserTreeIndex.UserTE;
8574 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8575 }
8576 if (Users.first) {
8577 auto &Data = Users;
8578 if (Data.first->State == TreeEntry::SplitVectorize) {
8579 assert(
8580 Data.second.size() <= 2 &&
8581 "Expected not greater than 2 operands for split vectorize node.");
8582 if (any_of(Data.second,
8583 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8584 continue;
8585 // Update orders in user split vectorize nodes.
8586 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8587 "Expected exactly 2 entries.");
8588 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8589 TreeEntry &OpTE = *VectorizableTree[P.first];
8590 OrdersType Order = OpTE.ReorderIndices;
8591 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8592 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8593 continue;
8594 const auto BestOrder =
8595 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8596 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8597 continue;
8598 Order = *BestOrder;
8599 }
8600 fixupOrderingIndices(Order);
8601 SmallVector<int> Mask;
8602 inversePermutation(Order, Mask);
8603 const unsigned E = Order.size();
8604 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8605 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8606 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8607 });
8608 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8609 // Clear ordering of the operand.
8610 if (!OpTE.ReorderIndices.empty()) {
8611 OpTE.ReorderIndices.clear();
8612 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8613 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8614 } else {
8615 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8616 reorderScalars(OpTE.Scalars, Mask);
8617 }
8618 }
8619 if (Data.first->ReuseShuffleIndices.empty() &&
8620 !Data.first->ReorderIndices.empty()) {
8621 // Insert user node to the list to try to sink reordering deeper in
8622 // the graph.
8623 Queue.push(Data.first);
8624 }
8625 continue;
8626 }
8627 // Check that operands are used only in the User node.
8628 SmallVector<TreeEntry *> GatherOps;
8629 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8630 GatherOps);
8631 // All operands are reordered and used only in this node - propagate the
8632 // most used order to the user node.
8633       MapVector<OrdersType, unsigned,
8634                 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8635           OrdersUses;
8636 // Do the analysis for each tree entry only once, otherwise the order of
8637       // the same node may be considered several times, though it might not be
8638       // profitable.
8639       SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8640       SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8641 for (const auto &Op : Data.second) {
8642 TreeEntry *OpTE = Op.second;
8643 if (!VisitedOps.insert(OpTE).second)
8644 continue;
8645 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8646 continue;
8647 const auto Order = [&]() -> const OrdersType {
8648 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8649 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8650 IgnoreReorder)
8651 .value_or(OrdersType(1));
8652 return OpTE->ReorderIndices;
8653 }();
8654 // The order is partially ordered, skip it in favor of fully non-ordered
8655 // orders.
8656 if (Order.size() == 1)
8657 continue;
8658
8659         // Check that the reordering does not increase the number of shuffles,
8660         // i.e. same-values nodes have same parents or their parents have same parents.
8661 if (!Order.empty() && !isIdentityOrder(Order)) {
8662 Value *Root = OpTE->hasState()
8663 ? OpTE->getMainOp()
8664 : *find_if_not(OpTE->Scalars, isConstant);
8665 auto GetSameNodesUsers = [&](Value *Root) {
8666           SmallSetVector<TreeEntry *, 4> Res;
8667           for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8668 if (TE != OpTE && TE->UserTreeIndex &&
8669 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8670 TE->Scalars.size() == OpTE->Scalars.size() &&
8671 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8672 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8673 Res.insert(TE->UserTreeIndex.UserTE);
8674 }
8675 for (const TreeEntry *TE : getTreeEntries(Root)) {
8676 if (TE != OpTE && TE->UserTreeIndex &&
8677 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8678 TE->Scalars.size() == OpTE->Scalars.size() &&
8679 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8680 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8681 Res.insert(TE->UserTreeIndex.UserTE);
8682 }
8683 return Res.takeVector();
8684 };
8685 auto GetNumOperands = [](const TreeEntry *TE) {
8686 if (TE->State == TreeEntry::SplitVectorize)
8687 return TE->getNumOperands();
8688 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8689 return CI->arg_size();
8690 return TE->getNumOperands();
8691 };
8692 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8693 const TreeEntry *TE) {
8694           Intrinsic::ID ID = Intrinsic::not_intrinsic;
8695           if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8696             ID = getVectorIntrinsicIDForCall(CI, TLI);
8697           for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8698             if (ID != Intrinsic::not_intrinsic &&
8699                 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8700               continue;
8701 const TreeEntry *Op = getOperandEntry(TE, Idx);
8702 if (Op->isGather() && Op->hasState()) {
8703 const TreeEntry *VecOp =
8704 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8705 if (VecOp)
8706 Op = VecOp;
8707 }
8708 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8709 return false;
8710 }
8711 return true;
8712 };
8713 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8714 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8715 if (!RevisitedOps.insert(UTE).second)
8716 return false;
8717 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8718 !UTE->ReuseShuffleIndices.empty() ||
8719 (UTE->UserTreeIndex &&
8720 UTE->UserTreeIndex.UserTE == Data.first) ||
8721 (Data.first->UserTreeIndex &&
8722 Data.first->UserTreeIndex.UserTE == UTE) ||
8723 (IgnoreReorder && UTE->UserTreeIndex &&
8724 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8725 NodeShouldBeReorderedWithOperands(UTE);
8726 }))
8727 continue;
8728 for (TreeEntry *UTE : Users) {
8729           Intrinsic::ID ID = Intrinsic::not_intrinsic;
8730           if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8731             ID = getVectorIntrinsicIDForCall(CI, TLI);
8732           for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8733             if (ID != Intrinsic::not_intrinsic &&
8734                 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8735               continue;
8736 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8737 Visited.erase(Op);
8738 Queue.push(const_cast<TreeEntry *>(Op));
8739 }
8740 }
8741 }
8742 unsigned NumOps = count_if(
8743 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8744 return P.second == OpTE;
8745 });
8746 // Stores actually store the mask, not the order, need to invert.
8747 if (OpTE->State == TreeEntry::Vectorize &&
8748 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8749 assert(!OpTE->isAltShuffle() &&
8750 "Alternate instructions are only supported by BinaryOperator "
8751 "and CastInst.");
8752 SmallVector<int> Mask;
8753 inversePermutation(Order, Mask);
8754 unsigned E = Order.size();
8755 OrdersType CurrentOrder(E, E);
8756 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8757 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8758 });
8759 fixupOrderingIndices(CurrentOrder);
8760 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8761 } else {
8762 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8763 }
8764 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8765 const auto AllowsReordering = [&](const TreeEntry *TE) {
8766 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8767 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8768 (IgnoreReorder && TE->Idx == 0))
8769 return true;
8770 if (TE->isGather()) {
8771 if (GathersToOrders.contains(TE))
8772 return !getReorderingData(*TE, /*TopToBottom=*/false,
8773 IgnoreReorder)
8774 .value_or(OrdersType(1))
8775 .empty();
8776 return true;
8777 }
8778 return false;
8779 };
8780 if (OpTE->UserTreeIndex) {
8781 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8782 if (!VisitedUsers.insert(UserTE).second)
8783 continue;
8784 // May reorder user node if it requires reordering, has reused
8785 // scalars, is an alternate op vectorize node or its op nodes require
8786 // reordering.
8787 if (AllowsReordering(UserTE))
8788 continue;
8789 // Check if users allow reordering.
8790 // Currently look up just 1 level of operands to avoid increase of
8791 // the compile time.
8792 // Profitable to reorder if definitely more operands allow
8793 // reordering rather than those with natural order.
8794           ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
8795           if (static_cast<unsigned>(count_if(
8796 Ops, [UserTE, &AllowsReordering](
8797 const std::pair<unsigned, TreeEntry *> &Op) {
8798 return AllowsReordering(Op.second) &&
8799 Op.second->UserTreeIndex.UserTE == UserTE;
8800 })) <= Ops.size() / 2)
8801 ++Res.first->second;
8802 }
8803 }
8804 if (OrdersUses.empty()) {
8805 Visited.insert_range(llvm::make_second_range(Data.second));
8806 continue;
8807 }
8808 // Choose the most used order.
8809 unsigned IdentityCnt = 0;
8810 unsigned VF = Data.second.front().second->getVectorFactor();
8811 OrdersType IdentityOrder(VF, VF);
8812 for (auto &Pair : OrdersUses) {
8813 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8814 IdentityCnt += Pair.second;
8815 combineOrders(IdentityOrder, Pair.first);
8816 }
8817 }
8818 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8819 unsigned Cnt = IdentityCnt;
8820 for (auto &Pair : OrdersUses) {
8821 // Prefer identity order. But, if filled identity found (non-empty
8822 // order) with same number of uses, as the new candidate order, we can
8823 // choose this candidate order.
8824 if (Cnt < Pair.second) {
8825 combineOrders(Pair.first, BestOrder);
8826 BestOrder = Pair.first;
8827 Cnt = Pair.second;
8828 } else {
8829 combineOrders(BestOrder, Pair.first);
8830 }
8831 }
8832 // Set order of the user node.
8833 if (isIdentityOrder(BestOrder)) {
8834 Visited.insert_range(llvm::make_second_range(Data.second));
8835 continue;
8836 }
8837 fixupOrderingIndices(BestOrder);
8838 // Erase operands from OrderedEntries list and adjust their orders.
8839 VisitedOps.clear();
8840 SmallVector<int> Mask;
8841 inversePermutation(BestOrder, Mask);
8842 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8843 unsigned E = BestOrder.size();
8844 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8845 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8846 });
8847 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8848 TreeEntry *TE = Op.second;
8849 if (!VisitedOps.insert(TE).second)
8850 continue;
8851 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8852 reorderNodeWithReuses(*TE, Mask);
8853 continue;
8854 }
8855 // Gathers are processed separately.
8856 if (TE->State != TreeEntry::Vectorize &&
8857 TE->State != TreeEntry::StridedVectorize &&
8858 TE->State != TreeEntry::CompressVectorize &&
8859 TE->State != TreeEntry::SplitVectorize &&
8860 (TE->State != TreeEntry::ScatterVectorize ||
8861 TE->ReorderIndices.empty()))
8862 continue;
8863 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8864 TE->ReorderIndices.empty()) &&
8865 "Non-matching sizes of user/operand entries.");
8866 reorderOrder(TE->ReorderIndices, Mask);
8867 if (IgnoreReorder && TE == VectorizableTree.front().get())
8868 IgnoreReorder = false;
8869 }
8870 // For gathers just need to reorder its scalars.
8871 for (TreeEntry *Gather : GatherOps) {
8872 assert(Gather->ReorderIndices.empty() &&
8873 "Unexpected reordering of gathers.");
8874 if (!Gather->ReuseShuffleIndices.empty()) {
8875 // Just reorder reuses indices.
8876 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8877 continue;
8878 }
8879 reorderScalars(Gather->Scalars, Mask);
8880 Visited.insert(Gather);
8881 }
8882 // Reorder operands of the user node and set the ordering for the user
8883 // node itself.
8884 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8885 return TE.isAltShuffle() &&
8886 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8887 TE.ReorderIndices.empty());
8888 };
8889 if (Data.first->State != TreeEntry::Vectorize ||
8890           !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8891               Data.first->getMainOp()) ||
8892 IsNotProfitableAltCodeNode(*Data.first))
8893 Data.first->reorderOperands(Mask);
8894 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8895 IsNotProfitableAltCodeNode(*Data.first) ||
8896 Data.first->State == TreeEntry::StridedVectorize ||
8897 Data.first->State == TreeEntry::CompressVectorize) {
8898 reorderScalars(Data.first->Scalars, Mask);
8899 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8900 /*BottomOrder=*/true);
8901 if (Data.first->ReuseShuffleIndices.empty() &&
8902 !Data.first->ReorderIndices.empty() &&
8903 !IsNotProfitableAltCodeNode(*Data.first)) {
8904 // Insert user node to the list to try to sink reordering deeper in
8905 // the graph.
8906 Queue.push(Data.first);
8907 }
8908 } else {
8909 reorderOrder(Data.first->ReorderIndices, Mask);
8910 }
8911 }
8912 }
8913 // If the reordering is unnecessary, just remove the reorder.
8914 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8915 VectorizableTree.front()->ReuseShuffleIndices.empty())
8916 VectorizableTree.front()->ReorderIndices.clear();
8917}
8918
8919Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8920 if (Entry.hasState() &&
8921 (Entry.getOpcode() == Instruction::Store ||
8922 Entry.getOpcode() == Instruction::Load) &&
8923 Entry.State == TreeEntry::StridedVectorize &&
8924 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8925 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8926 return dyn_cast<Instruction>(Entry.Scalars.front());
8927}
8928
8929 void BoUpSLP::buildExternalUses(
8930     const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8931 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8932 DenseMap<Value *, unsigned> ScalarToExtUses;
8933 // Collect the values that we need to extract from the tree.
8934 for (auto &TEPtr : VectorizableTree) {
8935 TreeEntry *Entry = TEPtr.get();
8936
8937 // No need to handle users of gathered values.
8938 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8939 continue;
8940
8941 // For each lane:
8942 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8943 Value *Scalar = Entry->Scalars[Lane];
8944 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8945 continue;
8946
8947 // All uses must be replaced already? No need to do it again.
8948 auto It = ScalarToExtUses.find(Scalar);
8949 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8950 continue;
8951
8952 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8953 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8954 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8955 << " from " << *Scalar << "for many users.\n");
8956 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8957 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8958 ExternalUsesWithNonUsers.insert(Scalar);
8959 continue;
8960 }
8961
8962 // Check if the scalar is externally used as an extra arg.
8963 const auto ExtI = ExternallyUsedValues.find(Scalar);
8964 if (ExtI != ExternallyUsedValues.end()) {
8965 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8966 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8967 << FoundLane << " from " << *Scalar << ".\n");
8968 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8969 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8970 continue;
8971 }
8972 for (User *U : Scalar->users()) {
8973 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8974
8975 Instruction *UserInst = dyn_cast<Instruction>(U);
8976 if (!UserInst || isDeleted(UserInst))
8977 continue;
8978
8979 // Ignore users in the user ignore list.
8980 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8981 continue;
8982
8983 // Skip in-tree scalars that become vectors
8984 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8985 !UseEntries.empty()) {
8986 // Some in-tree scalars will remain as scalar in vectorized
8987 // instructions. If that is the case, the one in FoundLane will
8988 // be used.
8989 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8990 isa<LoadInst, StoreInst>(UserInst)) ||
8991 isa<CallInst>(UserInst)) ||
8992 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8993 return UseEntry->State == TreeEntry::ScatterVectorize ||
8994 !doesInTreeUserNeedToExtract(
8995 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8996 TTI);
8997 })) {
8998 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8999 << ".\n");
9000 assert(none_of(UseEntries,
9001 [](TreeEntry *UseEntry) {
9002 return UseEntry->isGather();
9003 }) &&
9004 "Bad state");
9005 continue;
9006 }
9007 U = nullptr;
9008 if (It != ScalarToExtUses.end()) {
9009 ExternalUses[It->second].User = nullptr;
9010 break;
9011 }
9012 }
9013
9014 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9015 U = nullptr;
9016 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9017 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9018 << " from lane " << FoundLane << " from " << *Scalar
9019 << ".\n");
9020 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9021 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9022 ExternalUsesWithNonUsers.insert(Scalar);
9023 if (!U)
9024 break;
9025 }
9026 }
9027 }
9028}
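// Example (illustrative, hypothetical values): if %x is vectorized in lane 1 of
// its tree entry and also feeds an out-of-tree instruction %cmp, an ExternalUses
// record (%x, %cmp, Entry, 1) is created so an extract can be emitted later; if
// %x has at least NumVectScalars uses, a single record with a null user is
// created instead and the per-user scan for %x stops.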
9029
9030 SmallVector<SmallVector<StoreInst *>>
9031 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9032 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
9033 SmallVector<StoreInst *>, 8>
9034 PtrToStoresMap;
9035 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9036 Value *V = TE->Scalars[Lane];
9037 // Don't iterate over the users of constant data.
9038 if (!isa<Instruction>(V))
9039 continue;
9040 // To save compilation time we don't visit if we have too many users.
9041 if (V->hasNUsesOrMore(UsesLimit))
9042 break;
9043
9044 // Collect stores per pointer object.
9045 for (User *U : V->users()) {
9046 auto *SI = dyn_cast<StoreInst>(U);
9047 // Test whether we can handle the store. V might be a global, which could
9048 // be used in a different function.
9049 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9050 !isValidElementType(SI->getValueOperand()->getType()))
9051 continue;
9052 // Skip the entry if it is already vectorized.
9053 if (isVectorized(U))
9054 continue;
9055
9056 Value *Ptr =
9057 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9058 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9059 SI->getValueOperand()->getType(), Ptr}];
9060 // For now just keep one store per pointer object per lane.
9061 // TODO: Extend this to support multiple stores per pointer per lane
9062 if (StoresVec.size() > Lane)
9063 continue;
9064 if (!StoresVec.empty()) {
9065 std::optional<int64_t> Diff = getPointersDiff(
9066 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9067 SI->getValueOperand()->getType(),
9068 StoresVec.front()->getPointerOperand(), *DL, *SE,
9069 /*StrictCheck=*/true);
9070 // We failed to compare the pointers so just abandon this store.
9071 if (!Diff)
9072 continue;
9073 }
9074 StoresVec.push_back(SI);
9075 }
9076 }
9077 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9078 unsigned I = 0;
9079 for (auto &P : PtrToStoresMap) {
9080 Res[I].swap(P.second);
9081 ++I;
9082 }
9083 return Res;
9084}
9085
9086bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9087 OrdersType &ReorderIndices) const {
9088 // We check whether the stores in StoreVec can form a vector by sorting them
9089 // and checking whether they are consecutive.
9090
9091 // To avoid calling getPointersDiff() while sorting we create a vector of
9092 // pairs {store, offset from first} and sort this instead.
9093 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9094 StoreInst *S0 = StoresVec[0];
9095 StoreOffsetVec.emplace_back(0, 0);
9096 Type *S0Ty = S0->getValueOperand()->getType();
9097 Value *S0Ptr = S0->getPointerOperand();
9098 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9099 StoreInst *SI = StoresVec[Idx];
9100 std::optional<int64_t> Diff =
9101 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9102 SI->getPointerOperand(), *DL, *SE,
9103 /*StrictCheck=*/true);
9104 StoreOffsetVec.emplace_back(*Diff, Idx);
9105 }
9106
9107 // Check if the stores are consecutive by checking if their difference is 1.
9108 if (StoreOffsetVec.size() != StoresVec.size())
9109 return false;
9110 sort(StoreOffsetVec, llvm::less_first());
9111 unsigned Idx = 0;
9112 int64_t PrevDist = 0;
9113 for (const auto &P : StoreOffsetVec) {
9114 if (Idx > 0 && P.first != PrevDist + 1)
9115 return false;
9116 PrevDist = P.first;
9117 ++Idx;
9118 }
9119
9120 // Calculate the shuffle indices according to their offset against the sorted
9121 // StoreOffsetVec.
9122 ReorderIndices.assign(StoresVec.size(), 0);
9123 bool IsIdentity = true;
9124 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9125 ReorderIndices[P.second] = I;
9126 IsIdentity &= P.second == I;
9127 }
9128 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9129 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9130 // same convention here.
9131 if (IsIdentity)
9132 ReorderIndices.clear();
9133
9134 return true;
9135}
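// Worked example (illustrative): for four stores whose element offsets from
// StoresVec[0] are {0, 3, 1, 2} in bundle order, the sorted offsets 0, 1, 2, 3
// are consecutive, so the stores can form a vector. Because the offsets are
// exactly 0..3, ReorderIndices[I] ends up equal to the offset of store I, i.e.
// ReorderIndices = {0, 3, 1, 2}; offsets {0, 1, 2, 3} give the identity order,
// which is canonicalized to an empty ReorderIndices.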
9136
9137#ifndef NDEBUG
9139 for (unsigned Idx : Order)
9140 dbgs() << Idx << ", ";
9141 dbgs() << "\n";
9142}
9143#endif
9144
9145 SmallVector<BoUpSLP::OrdersType, 1>
9146 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9147 unsigned NumLanes = TE->Scalars.size();
9148
9149 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9150
9151 // Holds the reorder indices for each candidate store vector that is a user of
9152 // the current TreeEntry.
9153 SmallVector<OrdersType, 1> ExternalReorderIndices;
9154
9155 // Now inspect the stores collected per pointer and look for vectorization
9156 // candidates. For each candidate calculate the reorder index vector and push
9157 // it into `ExternalReorderIndices`
9158 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9159 // If we have fewer than NumLanes stores, then we can't form a vector.
9160 if (StoresVec.size() != NumLanes)
9161 continue;
9162
9163 // If the stores are not consecutive then abandon this StoresVec.
9164 OrdersType ReorderIndices;
9165 if (!canFormVector(StoresVec, ReorderIndices))
9166 continue;
9167
9168 // We now know that the scalars in StoresVec can form a vector instruction,
9169 // so set the reorder indices.
9170 ExternalReorderIndices.push_back(ReorderIndices);
9171 }
9172 return ExternalReorderIndices;
9173}
9174
9175 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9176 const SmallDenseSet<Value *> &UserIgnoreLst) {
9177 deleteTree();
9178 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9179 "TreeEntryToStridedPtrInfoMap is not cleared");
9180 UserIgnoreList = &UserIgnoreLst;
9181 if (!allSameType(Roots))
9182 return;
9183 buildTreeRec(Roots, 0, EdgeInfo());
9184}
9185
9186 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9187 deleteTree();
9188 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9189 "TreeEntryToStridedPtrInfoMap is not cleared");
9190 if (!allSameType(Roots))
9191 return;
9192 buildTreeRec(Roots, 0, EdgeInfo());
9193}
9194
9195 /// Tries to find a subvector of loads and builds a new vector of only loads, if
9196 /// it can be profitable.
9197 static void gatherPossiblyVectorizableLoads(
9198 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9199 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9200 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9201 bool AddNew = true) {
9202 if (VL.empty())
9203 return;
9204 Type *ScalarTy = getValueType(VL.front());
9205 if (!isValidElementType(ScalarTy))
9206 return;
9208 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9209 for (Value *V : VL) {
9210 auto *LI = dyn_cast<LoadInst>(V);
9211 if (!LI)
9212 continue;
9213 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9214 continue;
9215 bool IsFound = false;
9216 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9217 assert(LI->getParent() == Data.front().first->getParent() &&
9218 LI->getType() == Data.front().first->getType() &&
9219 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9220 getUnderlyingObject(Data.front().first->getPointerOperand(),
9222 "Expected loads with the same type, same parent and same "
9223 "underlying pointer.");
9224 std::optional<int64_t> Dist = getPointersDiff(
9225 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9226 Data.front().first->getPointerOperand(), DL, SE,
9227 /*StrictCheck=*/true);
9228 if (!Dist)
9229 continue;
9230 auto It = Map.find(*Dist);
9231 if (It != Map.end() && It->second != LI)
9232 continue;
9233 if (It == Map.end()) {
9234 Data.emplace_back(LI, *Dist);
9235 Map.try_emplace(*Dist, LI);
9236 }
9237 IsFound = true;
9238 break;
9239 }
9240 if (!IsFound) {
9241 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9242 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9243 }
9244 }
9245 auto FindMatchingLoads =
9246 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9247 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9248 &GatheredLoads,
9249 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9250 int64_t &Offset, unsigned &Start) {
9251 if (Loads.empty())
9252 return GatheredLoads.end();
9253 LoadInst *LI = Loads.front().first;
9254 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9255 if (Idx < Start)
9256 continue;
9257 ToAdd.clear();
9258 if (LI->getParent() != Data.front().first->getParent() ||
9259 LI->getType() != Data.front().first->getType())
9260 continue;
9261 std::optional<int64_t> Dist =
9262 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9263 Data.front().first->getType(),
9264 Data.front().first->getPointerOperand(), DL, SE,
9265 /*StrictCheck=*/true);
9266 if (!Dist)
9267 continue;
9268 SmallSet<int64_t, 4> DataDists;
9269 SmallPtrSet<LoadInst *, 4> DataLoads;
9270 for (std::pair<LoadInst *, int64_t> P : Data) {
9271 DataDists.insert(P.second);
9272 DataLoads.insert(P.first);
9273 }
9274 // Found matching gathered loads - check if all loads are unique or
9275 // can be effectively vectorized.
9276 unsigned NumUniques = 0;
9277 for (auto [Cnt, Pair] : enumerate(Loads)) {
9278 bool Used = DataLoads.contains(Pair.first);
9279 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9280 ++NumUniques;
9281 ToAdd.insert(Cnt);
9282 } else if (Used) {
9283 Repeated.insert(Cnt);
9284 }
9285 }
9286 if (NumUniques > 0 &&
9287 (Loads.size() == NumUniques ||
9288 (Loads.size() - NumUniques >= 2 &&
9289 Loads.size() - NumUniques >= Loads.size() / 2 &&
9290 (has_single_bit(Data.size() + NumUniques) ||
9291 bit_ceil(Data.size()) <
9292 bit_ceil(Data.size() + NumUniques))))) {
9293 Offset = *Dist;
9294 Start = Idx + 1;
9295 return std::next(GatheredLoads.begin(), Idx);
9296 }
9297 }
9298 ToAdd.clear();
9299 return GatheredLoads.end();
9300 };
9301 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9302 unsigned Start = 0;
9303 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9304 int64_t Offset = 0;
9305 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9306 Offset, Start);
9307 while (It != GatheredLoads.end()) {
9308 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9309 for (unsigned Idx : LocalToAdd)
9310 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9311 ToAdd.insert_range(LocalToAdd);
9312 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9313 Start);
9314 }
9315 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9316 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9317 })) {
9318 auto AddNewLoads =
9319 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9320 for (unsigned Idx : seq<unsigned>(Data.size())) {
9321 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9322 continue;
9323 Loads.push_back(Data[Idx]);
9324 }
9325 };
9326 if (!AddNew) {
9327 LoadInst *LI = Data.front().first;
9328 It = find_if(
9329 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9330 return PD.front().first->getParent() == LI->getParent() &&
9331 PD.front().first->getType() == LI->getType();
9332 });
9333 while (It != GatheredLoads.end()) {
9334 AddNewLoads(*It);
9335 It = std::find_if(
9336 std::next(It), GatheredLoads.end(),
9337 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9338 return PD.front().first->getParent() == LI->getParent() &&
9339 PD.front().first->getType() == LI->getType();
9340 });
9341 }
9342 }
9343 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9344 AddNewLoads(GatheredLoads.emplace_back());
9345 }
9346 }
9347}
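// Example (illustrative, hypothetical values): for VL = {load %p, load %p+2,
// load %p+1, load %q} with the same type and parent block, the three %p-based
// loads form one cluster with distances {0, 2, 1} relative to the first of them,
// while the %q-based load starts its own cluster; each cluster is then folded
// into an existing GatheredLoads group when FindMatchingLoads can line it up by
// a common offset, and otherwise recorded separately.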
9348
9349void BoUpSLP::tryToVectorizeGatheredLoads(
9350 const SmallMapVector<
9351 std::tuple<BasicBlock *, Value *, Type *>,
9352 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9353 &GatheredLoads) {
9354 GatheredLoadsEntriesFirst = VectorizableTree.size();
9355
9356 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9357 LoadEntriesToVectorize.size());
9358 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9359 Set.insert_range(VectorizableTree[Idx]->Scalars);
9360
9361 // Sort loads by distance.
9362 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9363 const std::pair<LoadInst *, int64_t> &L2) {
9364 return L1.second > L2.second;
9365 };
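// Example (illustrative): distances {0, 3, 1, 2} are sorted by LoadSorter to
// {3, 2, 1, 0}; the consecutive-run scan in ProcessGatheredLoads below then
// recognizes a single run of length 4 and sets MaxConsecutiveDistance to 4.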
9366
9367 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9368 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9369 Loads.size());
9370 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9371 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9372 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9373 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9374 };
9375
9376 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9377 BoUpSLP::ValueSet &VectorizedLoads,
9378 SmallVectorImpl<LoadInst *> &NonVectorized,
9379 bool Final, unsigned MaxVF) {
9380 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9381 unsigned StartIdx = 0;
9382 SmallVector<int> CandidateVFs;
9383 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9384 CandidateVFs.push_back(MaxVF);
9385 for (int NumElts = getFloorFullVectorNumberOfElements(
9386 *TTI, Loads.front()->getType(), MaxVF);
9387 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9388 *TTI, Loads.front()->getType(), NumElts - 1)) {
9389 CandidateVFs.push_back(NumElts);
9390 if (VectorizeNonPowerOf2 && NumElts > 2)
9391 CandidateVFs.push_back(NumElts - 1);
9392 }
9393
9394 if (Final && CandidateVFs.empty())
9395 return Results;
9396
9397 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9398 for (unsigned NumElts : CandidateVFs) {
9399 if (Final && NumElts > BestVF)
9400 continue;
9401 SmallVector<unsigned> MaskedGatherVectorized;
9402 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9403 ++Cnt) {
9404 ArrayRef<LoadInst *> Slice =
9405 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9406 if (VectorizedLoads.count(Slice.front()) ||
9407 VectorizedLoads.count(Slice.back()) ||
9408 areKnownNonVectorizableLoads(Slice))
9409 continue;
9410 // Check if it is profitable to try vectorizing gathered loads. It is
9411 // profitable if we have at least 3 consecutive loads or if we have
9412 // fewer but all of their users are vectorized or deleted.
9413 bool AllowToVectorize = false;
9414 // Check if it is profitable to vectorize 2-element loads.
9415 if (NumElts == 2) {
9416 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9417 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9418 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9419 for (LoadInst *LI : Slice) {
9420 // If single use/user - allow to vectorize.
9421 if (LI->hasOneUse())
9422 continue;
9423 // 1. Check if number of uses equals number of users.
9424 // 2. All users are deleted.
9425 // 3. The load broadcasts are not allowed or the load is not
9426 // broadcasted.
9427 if (static_cast<unsigned int>(std::distance(
9428 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9429 return false;
9430 if (!IsLegalBroadcastLoad)
9431 continue;
9432 if (LI->hasNUsesOrMore(UsesLimit))
9433 return false;
9434 for (User *U : LI->users()) {
9435 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9436 continue;
9437 for (const TreeEntry *UTE : getTreeEntries(U)) {
9438 for (int I : seq<int>(UTE->getNumOperands())) {
9439 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9440 return V == LI || isa<PoisonValue>(V);
9441 }))
9442 // Found legal broadcast - do not vectorize.
9443 return false;
9444 }
9445 }
9446 }
9447 }
9448 return true;
9449 };
9450 AllowToVectorize = CheckIfAllowed(Slice);
9451 } else {
9452 AllowToVectorize =
9453 (NumElts >= 3 ||
9454 any_of(ValueToGatherNodes.at(Slice.front()),
9455 [=](const TreeEntry *TE) {
9456 return TE->Scalars.size() == 2 &&
9457 ((TE->Scalars.front() == Slice.front() &&
9458 TE->Scalars.back() == Slice.back()) ||
9459 (TE->Scalars.front() == Slice.back() &&
9460 TE->Scalars.back() == Slice.front()));
9461 })) &&
9462 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9463 Slice.size());
9464 }
9465 if (AllowToVectorize) {
9466 SmallVector<Value *> PointerOps;
9467 OrdersType CurrentOrder;
9468 // Try to build vector load.
9469 ArrayRef<Value *> Values(
9470 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9471 StridedPtrInfo SPtrInfo;
9472 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9473 PointerOps, SPtrInfo, &BestVF);
9474 if (LS != LoadsState::Gather ||
9475 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9476 if (LS == LoadsState::ScatterVectorize) {
9477 if (MaskedGatherVectorized.empty() ||
9478 Cnt >= MaskedGatherVectorized.back() + NumElts)
9479 MaskedGatherVectorized.push_back(Cnt);
9480 continue;
9481 }
9482 if (LS != LoadsState::Gather) {
9483 Results.emplace_back(Values, LS);
9484 VectorizedLoads.insert_range(Slice);
9485 // If we vectorized initial block, no need to try to vectorize it
9486 // again.
9487 if (Cnt == StartIdx)
9488 StartIdx += NumElts;
9489 }
9490 // Check if the whole array was vectorized already - exit.
9491 if (StartIdx >= Loads.size())
9492 break;
9493 // Erase last masked gather candidate, if another candidate within
9494 // the range is found to be better.
9495 if (!MaskedGatherVectorized.empty() &&
9496 Cnt < MaskedGatherVectorized.back() + NumElts)
9497 MaskedGatherVectorized.pop_back();
9498 Cnt += NumElts - 1;
9499 continue;
9500 }
9501 }
9502 if (!AllowToVectorize || BestVF == 0)
9503 registerNonVectorizableLoads(Slice);
9504 }
9505 // Mark masked gathers candidates as vectorized, if any.
9506 for (unsigned Cnt : MaskedGatherVectorized) {
9507 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9508 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9509 ArrayRef<Value *> Values(
9510 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9511 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9512 VectorizedLoads.insert_range(Slice);
9513 // If we vectorized initial block, no need to try to vectorize it again.
9514 if (Cnt == StartIdx)
9515 StartIdx += NumElts;
9516 }
9517 }
9518 for (LoadInst *LI : Loads) {
9519 if (!VectorizedLoads.contains(LI))
9520 NonVectorized.push_back(LI);
9521 }
9522 return Results;
9523 };
9524 auto ProcessGatheredLoads =
9525 [&, &TTI = *TTI](
9526 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9527 bool Final = false) {
9528 SmallVector<LoadInst *> NonVectorized;
9529 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9530 GatheredLoads) {
9531 if (LoadsDists.size() <= 1) {
9532 NonVectorized.push_back(LoadsDists.back().first);
9533 continue;
9534 }
9535 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9536 LoadsDists);
9537 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9538 stable_sort(LocalLoadsDists, LoadSorter);
9539 SmallVector<LoadInst *> Loads;
9540 unsigned MaxConsecutiveDistance = 0;
9541 unsigned CurrentConsecutiveDist = 1;
9542 int64_t LastDist = LocalLoadsDists.front().second;
9543 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9544 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9545 if (isVectorized(L.first))
9546 continue;
9547 assert(LastDist >= L.second &&
9548 "Expected first distance always not less than second");
9549 if (static_cast<uint64_t>(LastDist - L.second) ==
9550 CurrentConsecutiveDist) {
9551 ++CurrentConsecutiveDist;
9552 MaxConsecutiveDistance =
9553 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9554 Loads.push_back(L.first);
9555 continue;
9556 }
9557 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9558 !Loads.empty())
9559 Loads.pop_back();
9560 CurrentConsecutiveDist = 1;
9561 LastDist = L.second;
9562 Loads.push_back(L.first);
9563 }
9564 if (Loads.size() <= 1)
9565 continue;
9566 if (AllowMaskedGather)
9567 MaxConsecutiveDistance = Loads.size();
9568 else if (MaxConsecutiveDistance < 2)
9569 continue;
9570 BoUpSLP::ValueSet VectorizedLoads;
9571 SmallVector<LoadInst *> SortedNonVectorized;
9572 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9573 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9574 Final, MaxConsecutiveDistance);
9575 if (!Results.empty() && !SortedNonVectorized.empty() &&
9576 OriginalLoads.size() == Loads.size() &&
9577 MaxConsecutiveDistance == Loads.size() &&
9578 all_of(Results,
9579 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9580 return P.second == LoadsState::ScatterVectorize;
9581 })) {
9582 VectorizedLoads.clear();
9583 SmallVector<LoadInst *> UnsortedNonVectorized;
9584 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9585 UnsortedResults =
9586 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9587 UnsortedNonVectorized, Final,
9588 OriginalLoads.size());
9589 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9590 SortedNonVectorized.swap(UnsortedNonVectorized);
9591 Results.swap(UnsortedResults);
9592 }
9593 }
9594 for (auto [Slice, _] : Results) {
9595 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9596 << Slice.size() << ")\n");
9597 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9598 for (Value *L : Slice)
9599 if (!isVectorized(L))
9600 SortedNonVectorized.push_back(cast<LoadInst>(L));
9601 continue;
9602 }
9603
9604 // Select maximum VF as a maximum of user gathered nodes and
9605 // distance between scalar loads in these nodes.
9606 unsigned MaxVF = Slice.size();
9607 unsigned UserMaxVF = 0;
9608 unsigned InterleaveFactor = 0;
9609 if (MaxVF == 2) {
9610 UserMaxVF = MaxVF;
9611 } else {
9612 // Find the distance between segments of the interleaved loads.
9613 std::optional<unsigned> InterleavedLoadsDistance = 0;
9614 unsigned Order = 0;
9615 std::optional<unsigned> CommonVF = 0;
9616 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9617 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9618 for (auto [Idx, V] : enumerate(Slice)) {
9619 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9620 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9621 unsigned Pos =
9622 EntryToPosition.try_emplace(E, Idx).first->second;
9623 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9624 if (CommonVF) {
9625 if (*CommonVF == 0) {
9626 CommonVF = E->Scalars.size();
9627 continue;
9628 }
9629 if (*CommonVF != E->Scalars.size())
9630 CommonVF.reset();
9631 }
9632 // Check if the load is the part of the interleaved load.
9633 if (Pos != Idx && InterleavedLoadsDistance) {
9634 if (!DeinterleavedNodes.contains(E) &&
9635 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9636 if (isa<Constant>(V))
9637 return false;
9638 if (isVectorized(V))
9639 return true;
9640 const auto &Nodes = ValueToGatherNodes.at(V);
9641 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9642 !is_contained(Slice, V);
9643 })) {
9644 InterleavedLoadsDistance.reset();
9645 continue;
9646 }
9647 DeinterleavedNodes.insert(E);
9648 if (*InterleavedLoadsDistance == 0) {
9649 InterleavedLoadsDistance = Idx - Pos;
9650 continue;
9651 }
9652 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9653 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9654 InterleavedLoadsDistance.reset();
9655 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9656 }
9657 }
9658 }
9659 DeinterleavedNodes.clear();
9660 // Check if the large load represents interleaved load operation.
9661 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9662 CommonVF.value_or(0) != 0) {
9663 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9664 unsigned VF = *CommonVF;
9665 OrdersType Order;
9666 SmallVector<Value *> PointerOps;
9667 StridedPtrInfo SPtrInfo;
9668 // Segmented load detected - vectorize at maximum vector factor.
9669 if (InterleaveFactor <= Slice.size() &&
9670 TTI.isLegalInterleavedAccessType(
9671 getWidenedType(Slice.front()->getType(), VF),
9672 InterleaveFactor,
9673 cast<LoadInst>(Slice.front())->getAlign(),
9674 cast<LoadInst>(Slice.front())
9675 ->getPointerAddressSpace()) &&
9676 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9677 SPtrInfo) == LoadsState::Vectorize) {
9678 UserMaxVF = InterleaveFactor * VF;
9679 } else {
9680 InterleaveFactor = 0;
9681 }
9682 }
9683 // Cannot represent the loads as consecutive vectorizable nodes -
9684 // just exit.
9685 unsigned ConsecutiveNodesSize = 0;
9686 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9687 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9688 [&, Slice = Slice](const auto &P) {
9689 const auto *It = find_if(Slice, [&](Value *V) {
9690 return std::get<1>(P).contains(V);
9691 });
9692 if (It == Slice.end())
9693 return false;
9694 const TreeEntry &TE =
9695 *VectorizableTree[std::get<0>(P)];
9696 ArrayRef<Value *> VL = TE.Scalars;
9697 OrdersType Order;
9698 SmallVector<Value *> PointerOps;
9699 StridedPtrInfo SPtrInfo;
9700 LoadsState State = canVectorizeLoads(
9701 VL, VL.front(), Order, PointerOps, SPtrInfo);
9702 if (State == LoadsState::ScatterVectorize ||
9704 return false;
9705 ConsecutiveNodesSize += VL.size();
9706 size_t Start = std::distance(Slice.begin(), It);
9707 size_t Sz = Slice.size() - Start;
9708 return Sz < VL.size() ||
9709 Slice.slice(Start, VL.size()) != VL;
9710 }))
9711 continue;
9712 // Try to build long masked gather loads.
9713 UserMaxVF = bit_ceil(UserMaxVF);
9714 if (InterleaveFactor == 0 &&
9715 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9716 [&, Slice = Slice](unsigned Idx) {
9717 OrdersType Order;
9718 SmallVector<Value *> PointerOps;
9719 StridedPtrInfo SPtrInfo;
9720 return canVectorizeLoads(
9721 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9722 Slice[Idx * UserMaxVF], Order, PointerOps,
9723 SPtrInfo) == LoadsState::ScatterVectorize;
9724 }))
9725 UserMaxVF = MaxVF;
9726 if (Slice.size() != ConsecutiveNodesSize)
9727 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9728 }
9729 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9730 bool IsVectorized = true;
9731 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9732 ArrayRef<Value *> SubSlice =
9733 Slice.slice(I, std::min(VF, E - I));
9734 if (isVectorized(SubSlice.front()))
9735 continue;
9736 // Check if the subslice is a to-be-vectorized entry, which is not
9737 // equal to this entry.
9738 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9739 [&](const auto &P) {
9740 return !SubSlice.equals(
9741 VectorizableTree[std::get<0>(P)]
9742 ->Scalars) &&
9743 set_is_subset(SubSlice, std::get<1>(P));
9744 }))
9745 continue;
9746 unsigned Sz = VectorizableTree.size();
9747 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9748 if (Sz == VectorizableTree.size()) {
9749 IsVectorized = false;
9750 // Try non-interleaved vectorization with smaller vector
9751 // factor.
9752 if (InterleaveFactor > 0) {
9753 VF = 2 * (MaxVF / InterleaveFactor);
9754 InterleaveFactor = 0;
9755 }
9756 continue;
9757 }
9758 }
9759 if (IsVectorized)
9760 break;
9761 }
9762 }
9763 NonVectorized.append(SortedNonVectorized);
9764 }
9765 return NonVectorized;
9766 };
9767 for (const auto &GLs : GatheredLoads) {
9768 const auto &Ref = GLs.second;
9769 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9770 if (!Ref.empty() && !NonVectorized.empty() &&
9771 std::accumulate(
9772 Ref.begin(), Ref.end(), 0u,
9773 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9774 -> unsigned { return S + LoadsDists.size(); }) !=
9775 NonVectorized.size() &&
9776 IsMaskedGatherSupported(NonVectorized)) {
9777 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9778 FinalGatheredLoads;
9779 for (LoadInst *LI : NonVectorized) {
9780 // Reinsert non-vectorized loads to other list of loads with the same
9781 // base pointers.
9782 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9783 FinalGatheredLoads,
9784 /*AddNew=*/false);
9785 }
9786 // Final attempt to vectorize non-vectorized loads.
9787 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9788 }
9789 }
9790 // Try to vectorize postponed load entries, previously marked as gathered.
9791 for (unsigned Idx : LoadEntriesToVectorize) {
9792 const TreeEntry &E = *VectorizableTree[Idx];
9793 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9794 // Avoid reordering, if possible.
9795 if (!E.ReorderIndices.empty()) {
9796 // Build a mask out of the reorder indices and reorder scalars per this
9797 // mask.
9798 SmallVector<int> ReorderMask;
9799 inversePermutation(E.ReorderIndices, ReorderMask);
9800 reorderScalars(GatheredScalars, ReorderMask);
9801 }
9802 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9803 }
9804 // If no new entries were created, consider that no gathered-load entries need
9805 // to be handled.
9806 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9807 VectorizableTree.size())
9808 GatheredLoadsEntriesFirst.reset();
9809}
9810
9811/// Generates key/subkey pair for the given value to provide effective sorting
9812/// of the values and better detection of the vectorizable values sequences. The
9813/// keys/subkeys can be used for better sorting of the values themselves (keys)
9814/// and in values subgroups (subkeys).
9815static std::pair<size_t, size_t> generateKeySubkey(
9816 Value *V, const TargetLibraryInfo *TLI,
9817 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9818 bool AllowAlternate) {
9819 hash_code Key = hash_value(V->getValueID() + 2);
9820 hash_code SubKey = hash_value(0);
9821 // Sort the loads by the distance between the pointers.
9822 if (auto *LI = dyn_cast<LoadInst>(V)) {
9823 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9824 if (LI->isSimple())
9825 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9826 else
9827 Key = SubKey = hash_value(LI);
9828 } else if (isVectorLikeInstWithConstOps(V)) {
9829 // Sort extracts by the vector operands.
9830 if (isa<ExtractElementInst, UndefValue>(V))
9831 Key = hash_value(Value::UndefValueVal + 1);
9832 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9833 if (!isUndefVector(EI->getVectorOperand()).all() &&
9834 !isa<UndefValue>(EI->getIndexOperand()))
9835 SubKey = hash_value(EI->getVectorOperand());
9836 }
9837 } else if (auto *I = dyn_cast<Instruction>(V)) {
9838 // Sort other instructions just by the opcodes except for CMPInst.
9839 // For CMP also sort by the predicate kind.
9840 if ((isa<BinaryOperator, CastInst>(I)) &&
9841 isValidForAlternation(I->getOpcode())) {
9842 if (AllowAlternate)
9843 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9844 else
9845 Key = hash_combine(hash_value(I->getOpcode()), Key);
9846 SubKey = hash_combine(
9847 hash_value(I->getOpcode()), hash_value(I->getType()),
9848 hash_value(isa<BinaryOperator>(I)
9849 ? I->getType()
9850 : cast<CastInst>(I)->getOperand(0)->getType()));
9851 // For casts, look through the only operand to improve compile time.
9852 if (isa<CastInst>(I)) {
9853 std::pair<size_t, size_t> OpVals =
9854 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9855 /*AllowAlternate=*/true);
9856 Key = hash_combine(OpVals.first, Key);
9857 SubKey = hash_combine(OpVals.first, SubKey);
9858 }
9859 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9860 CmpInst::Predicate Pred = CI->getPredicate();
9861 if (CI->isCommutative())
9862 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9863 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9864 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9865 hash_value(SwapPred),
9866 hash_value(CI->getOperand(0)->getType()));
9867 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9868 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9869 if (isTriviallyVectorizable(ID)) {
9870 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9871 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9872 SubKey = hash_combine(hash_value(I->getOpcode()),
9873 hash_value(Call->getCalledFunction()));
9874 } else {
9876 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9877 }
9878 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9879 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9880 hash_value(Op.Tag), SubKey);
9881 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9882 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9883 SubKey = hash_value(Gep->getPointerOperand());
9884 else
9885 SubKey = hash_value(Gep);
9886 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9887 !isa<ConstantInt>(I->getOperand(1))) {
9888 // Do not try to vectorize instructions with potentially high cost.
9889 SubKey = hash_value(I);
9890 } else {
9891 SubKey = hash_value(I->getOpcode());
9892 }
9893 Key = hash_combine(hash_value(I->getParent()), Key);
9894 }
9895 return std::make_pair(Key, SubKey);
9896}
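// Example (illustrative): two simple loads of the same type get the same Key
// (type plus Load opcode) and SubKeys produced by LoadsSubkeyGenerator, so
// nearby loads can be bucketed together; a non-simple (volatile/atomic) load
// uses its own identity for both key and subkey, and an integer division by a
// non-constant gets a per-instruction subkey, keeping such potentially
// expensive instructions out of any group.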
9897
9898 /// Checks if the specified instruction \p I is a main operation for the given
9899/// \p MainOp and \p AltOp instructions.
9900static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9901 Instruction *AltOp, const TargetLibraryInfo &TLI);
9902
9903bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9904 ArrayRef<Value *> VL) const {
9905 Type *ScalarTy = S.getMainOp()->getType();
9906 unsigned Opcode0 = S.getOpcode();
9907 unsigned Opcode1 = S.getAltOpcode();
9908 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9909 // If this pattern is supported by the target then consider it profitable.
9910 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9911 Opcode1, OpcodeMask))
9912 return true;
9913 SmallVector<ValueList> Operands;
9914 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9915 Operands.emplace_back();
9916 // Prepare the operand vector.
9917 for (Value *V : VL) {
9918 if (isa<PoisonValue>(V)) {
9919 Operands.back().push_back(
9920 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9921 continue;
9922 }
9923 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9924 }
9925 }
9926 if (Operands.size() == 2) {
9927 // Try to find the best operand candidates.
9928 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9930 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9931 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9932 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9933 std::optional<int> Res = findBestRootPair(Candidates);
9934 switch (Res.value_or(0)) {
9935 case 0:
9936 break;
9937 case 1:
9938 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9939 break;
9940 case 2:
9941 std::swap(Operands[0][I], Operands[1][I]);
9942 break;
9943 default:
9944 llvm_unreachable("Unexpected index.");
9945 }
9946 }
9947 }
9948 DenseSet<unsigned> UniqueOpcodes;
9949 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9950 unsigned NonInstCnt = 0;
9951 // Estimate number of instructions, required for the vectorized node and for
9952 // the buildvector node.
9953 unsigned UndefCnt = 0;
9954 // Count the number of extra shuffles, required for vector nodes.
9955 unsigned ExtraShuffleInsts = 0;
9956 // Check that operands do not contain same values and create either perfect
9957 // diamond match or shuffled match.
9958 if (Operands.size() == 2) {
9959 // Do not count same operands twice.
9960 if (Operands.front() == Operands.back()) {
9961 Operands.erase(Operands.begin());
9962 } else if (!allConstant(Operands.front()) &&
9963 all_of(Operands.front(), [&](Value *V) {
9964 return is_contained(Operands.back(), V);
9965 })) {
9966 Operands.erase(Operands.begin());
9967 ++ExtraShuffleInsts;
9968 }
9969 }
9970 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9971 // Vectorize the node if:
9972 // 1. At least a single operand is constant or splat.
9973 // 2. Operands have many loop invariants (the instructions are not loop
9974 // invariants).
9975 // 3. At least a single unique operand is supposed to be vectorized.
9976 return none_of(Operands,
9977 [&](ArrayRef<Value *> Op) {
9978 if (allConstant(Op) ||
9979 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9980 getSameOpcode(Op, *TLI)))
9981 return false;
9982 DenseMap<Value *, unsigned> Uniques;
9983 for (Value *V : Op) {
9985 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9986 if (isa<UndefValue>(V))
9987 ++UndefCnt;
9988 continue;
9989 }
9990 auto Res = Uniques.try_emplace(V, 0);
9991 // Found first duplicate - need to add shuffle.
9992 if (!Res.second && Res.first->second == 1)
9993 ++ExtraShuffleInsts;
9994 ++Res.first->getSecond();
9995 if (auto *I = dyn_cast<Instruction>(V))
9996 UniqueOpcodes.insert(I->getOpcode());
9997 else if (Res.second)
9998 ++NonInstCnt;
9999 }
10000 return none_of(Uniques, [&](const auto &P) {
10001 return P.first->hasNUsesOrMore(P.second + 1) &&
10002 none_of(P.first->users(), [&](User *U) {
10003 return isVectorized(U) || Uniques.contains(U);
10004 });
10005 });
10006 }) ||
10007 // Do not vectorize node, if estimated number of vector instructions is
10008 // more than estimated number of buildvector instructions. Number of
10009 // vector operands is number of vector instructions + number of vector
10010 // instructions for operands (buildvectors). Number of buildvector
10011 // instructions is just number_of_operands * number_of_scalars.
10012 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10013 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10014 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10015}
10016
10017 /// Builds the argument types vector for the given call instruction with the
10018 /// given \p ID for the specified vector factor.
10019 static SmallVector<Type *>
10020 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10021 const unsigned VF, unsigned MinBW,
10022 const TargetTransformInfo *TTI) {
10023 SmallVector<Type *> ArgTys;
10024 for (auto [Idx, Arg] : enumerate(CI->args())) {
10025 if (ID != Intrinsic::not_intrinsic) {
10026 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
10027 ArgTys.push_back(Arg->getType());
10028 continue;
10029 }
10030 if (MinBW > 0) {
10031 ArgTys.push_back(
10032 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10033 continue;
10034 }
10035 }
10036 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10037 }
10038 return ArgTys;
10039}
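// Example (illustrative): for a 4-wide bundle of llvm.umin.i32 calls with
// MinBW == 0, both arguments are widened to <4 x i32>; if the bundle was
// demoted so that MinBW == 16, the widened arguments become <4 x i16> instead.
// An argument reported as scalar by isVectorIntrinsicWithScalarOpAtArg (e.g.
// the i32 exponent of llvm.powi) keeps its scalar type.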
10040
10041/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10042/// function (if possible) calls. Returns invalid cost for the corresponding
10043/// calls, if they cannot be vectorized/will be scalarized.
10044static std::pair<InstructionCost, InstructionCost>
10045 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10046 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10047 ArrayRef<Type *> ArgTys) {
10048 auto Shape = VFShape::get(CI->getFunctionType(),
10049 ElementCount::getFixed(VecTy->getNumElements()),
10050 false /*HasGlobalPred*/);
10051 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10052 auto LibCost = InstructionCost::getInvalid();
10053 if (!CI->isNoBuiltin() && VecFunc) {
10054 // Calculate the cost of the vector library call.
10055 // If the corresponding vector call is cheaper, return its cost.
10056 LibCost =
10057 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10058 }
10059 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10060
10061 // Calculate the cost of the vector intrinsic call.
10062 FastMathFlags FMF;
10063 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10064 FMF = FPCI->getFastMathFlags();
10065 const InstructionCost ScalarLimit = 10000;
10066 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10067 LibCost.isValid() ? LibCost : ScalarLimit);
10068 auto IntrinsicCost =
10069 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10070 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10071 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10072 IntrinsicCost = InstructionCost::getInvalid();
10073
10074 return {IntrinsicCost, LibCost};
10075}
10076
10077BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10078 const InstructionsState &S, ArrayRef<Value *> VL,
10079 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10080 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10081 assert(S.getMainOp() &&
10082 "Expected instructions with same/alternate opcodes only.");
10083
10084 unsigned ShuffleOrOp =
10085 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10086 Instruction *VL0 = S.getMainOp();
10087 switch (ShuffleOrOp) {
10088 case Instruction::PHI: {
10089 // Too many operands - gather, most probably won't be vectorized.
10090 if (VL0->getNumOperands() > MaxPHINumOperands)
10091 return TreeEntry::NeedToGather;
10092 // Check for terminator values (e.g. invoke).
10093 for (Value *V : VL) {
10094 auto *PHI = dyn_cast<PHINode>(V);
10095 if (!PHI)
10096 continue;
10097 for (Value *Incoming : PHI->incoming_values()) {
10098 Instruction *Term = dyn_cast<Instruction>(Incoming);
10099 if (Term && Term->isTerminator()) {
10101 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10102 return TreeEntry::NeedToGather;
10103 }
10104 }
10105 }
10106
10107 return TreeEntry::Vectorize;
10108 }
10109 case Instruction::ExtractElement:
10110 if (any_of(VL, [&](Value *V) {
10111 auto *EI = dyn_cast<ExtractElementInst>(V);
10112 if (!EI)
10113 return true;
10114 return isVectorized(EI->getOperand(0));
10115 }))
10116 return TreeEntry::NeedToGather;
10117 [[fallthrough]];
10118 case Instruction::ExtractValue: {
10119 bool Reuse = canReuseExtract(VL, CurrentOrder);
10120 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10121 // non-full registers).
10122 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10123 return TreeEntry::NeedToGather;
10124 if (Reuse || !CurrentOrder.empty())
10125 return TreeEntry::Vectorize;
10126 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10127 return TreeEntry::NeedToGather;
10128 }
10129 case Instruction::InsertElement: {
10130 // Check that we have a buildvector and not a shuffle of 2 or more
10131 // different vectors.
10132 ValueSet SourceVectors;
10133 for (Value *V : VL) {
10134 if (isa<PoisonValue>(V)) {
10135 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10136 return TreeEntry::NeedToGather;
10137 }
10138 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10139 assert(getElementIndex(V) != std::nullopt &&
10140 "Non-constant or undef index?");
10141 }
10142
10143 if (count_if(VL, [&SourceVectors](Value *V) {
10144 return !SourceVectors.contains(V);
10145 }) >= 2) {
10146 // Found 2nd source vector - cancel.
10147 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10148 "different source vectors.\n");
10149 return TreeEntry::NeedToGather;
10150 }
10151
10152 if (any_of(VL, [&SourceVectors](Value *V) {
10153 // The last InsertElement can have multiple uses.
10154 return SourceVectors.contains(V) && !V->hasOneUse();
10155 })) {
10156 assert(SLPReVec && "Only supported by REVEC.");
10157 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10158 "multiple uses.\n");
10159 return TreeEntry::NeedToGather;
10160 }
10161
10162 return TreeEntry::Vectorize;
10163 }
10164 case Instruction::Load: {
10165 // Check that a vectorized load would load the same memory as a scalar
10166 // load. For example, we don't want to vectorize loads that are smaller
10167 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10168 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10169 // from such a struct, we read/write packed bits disagreeing with the
10170 // unvectorized version.
10171 auto IsGatheredNode = [&]() {
10172 if (!GatheredLoadsEntriesFirst)
10173 return false;
10174 return all_of(VL, [&](Value *V) {
10175 if (isa<PoisonValue>(V))
10176 return true;
10177 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10178 return TE->Idx >= *GatheredLoadsEntriesFirst;
10179 });
10180 });
10181 };
10182 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10183 case LoadsState::Vectorize:
10184 return TreeEntry::Vectorize;
10185 case LoadsState::CompressVectorize:
10186 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10187 // Delay slow vectorized nodes for better vectorization attempts.
10188 LoadEntriesToVectorize.insert(VectorizableTree.size());
10189 return TreeEntry::NeedToGather;
10190 }
10191 return IsGatheredNode() ? TreeEntry::NeedToGather
10192 : TreeEntry::CompressVectorize;
10193 case LoadsState::ScatterVectorize:
10194 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10195 // Delay slow vectorized nodes for better vectorization attempts.
10196 LoadEntriesToVectorize.insert(VectorizableTree.size());
10197 return TreeEntry::NeedToGather;
10198 }
10199 return IsGatheredNode() ? TreeEntry::NeedToGather
10200 : TreeEntry::ScatterVectorize;
10201 case LoadsState::StridedVectorize:
10202 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10203 // Delay slow vectorized nodes for better vectorization attempts.
10204 LoadEntriesToVectorize.insert(VectorizableTree.size());
10205 return TreeEntry::NeedToGather;
10206 }
10207 return IsGatheredNode() ? TreeEntry::NeedToGather
10208 : TreeEntry::StridedVectorize;
10209 case LoadsState::Gather:
10210#ifndef NDEBUG
10211 Type *ScalarTy = VL0->getType();
10212 if (DL->getTypeSizeInBits(ScalarTy) !=
10213 DL->getTypeAllocSizeInBits(ScalarTy))
10214 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10215 else if (any_of(VL, [](Value *V) {
10216 auto *LI = dyn_cast<LoadInst>(V);
10217 return !LI || !LI->isSimple();
10218 }))
10219 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10220 else
10221 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10222#endif // NDEBUG
10224 return TreeEntry::NeedToGather;
10225 }
10226 llvm_unreachable("Unexpected state of loads");
10227 }
10228 case Instruction::ZExt:
10229 case Instruction::SExt:
10230 case Instruction::FPToUI:
10231 case Instruction::FPToSI:
10232 case Instruction::FPExt:
10233 case Instruction::PtrToInt:
10234 case Instruction::IntToPtr:
10235 case Instruction::SIToFP:
10236 case Instruction::UIToFP:
10237 case Instruction::Trunc:
10238 case Instruction::FPTrunc:
10239 case Instruction::BitCast: {
10240 Type *SrcTy = VL0->getOperand(0)->getType();
10241 for (Value *V : VL) {
10242 if (isa<PoisonValue>(V))
10243 continue;
10244 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10245 if (Ty != SrcTy || !isValidElementType(Ty)) {
10246 LLVM_DEBUG(
10247 dbgs() << "SLP: Gathering casts with different src types.\n");
10248 return TreeEntry::NeedToGather;
10249 }
10250 }
10251 return TreeEntry::Vectorize;
10252 }
10253 case Instruction::ICmp:
10254 case Instruction::FCmp: {
10255 // Check that all of the compares have the same predicate.
10256 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10257 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10258 Type *ComparedTy = VL0->getOperand(0)->getType();
10259 for (Value *V : VL) {
10260 if (isa<PoisonValue>(V))
10261 continue;
10262 auto *Cmp = cast<CmpInst>(V);
10263 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10264 Cmp->getOperand(0)->getType() != ComparedTy) {
10265 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10266 return TreeEntry::NeedToGather;
10267 }
10268 }
10269 return TreeEntry::Vectorize;
10270 }
10271 case Instruction::Select:
10272 case Instruction::FNeg:
10273 case Instruction::Add:
10274 case Instruction::FAdd:
10275 case Instruction::Sub:
10276 case Instruction::FSub:
10277 case Instruction::Mul:
10278 case Instruction::FMul:
10279 case Instruction::UDiv:
10280 case Instruction::SDiv:
10281 case Instruction::FDiv:
10282 case Instruction::URem:
10283 case Instruction::SRem:
10284 case Instruction::FRem:
10285 case Instruction::Shl:
10286 case Instruction::LShr:
10287 case Instruction::AShr:
10288 case Instruction::And:
10289 case Instruction::Or:
10290 case Instruction::Xor:
10291 case Instruction::Freeze:
10292 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10293 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10294 auto *I = dyn_cast<Instruction>(V);
10295 return I && I->isBinaryOp() && !I->isFast();
10296 }))
10297 return TreeEntry::NeedToGather;
10298 return TreeEntry::Vectorize;
10299 case Instruction::GetElementPtr: {
10300 // We don't combine GEPs with complicated (nested) indexing.
10301 for (Value *V : VL) {
10302 auto *I = dyn_cast<GetElementPtrInst>(V);
10303 if (!I)
10304 continue;
10305 if (I->getNumOperands() != 2) {
10306 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10307 return TreeEntry::NeedToGather;
10308 }
10309 }
10310
10311 // We can't combine several GEPs into one vector if they operate on
10312 // different types.
10313 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10314 for (Value *V : VL) {
10315 auto *GEP = dyn_cast<GEPOperator>(V);
10316 if (!GEP)
10317 continue;
10318 Type *CurTy = GEP->getSourceElementType();
10319 if (Ty0 != CurTy) {
10320 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10321 return TreeEntry::NeedToGather;
10322 }
10323 }
10324
10325 // We don't combine GEPs with non-constant indexes.
10326 Type *Ty1 = VL0->getOperand(1)->getType();
10327 for (Value *V : VL) {
10328 auto *I = dyn_cast<GetElementPtrInst>(V);
10329 if (!I)
10330 continue;
10331 auto *Op = I->getOperand(1);
10332 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10333 (Op->getType() != Ty1 &&
10334 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10335 Op->getType()->getScalarSizeInBits() >
10336 DL->getIndexSizeInBits(
10337 V->getType()->getPointerAddressSpace())))) {
10338 LLVM_DEBUG(
10339 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10340 return TreeEntry::NeedToGather;
10341 }
10342 }
10343
10344 return TreeEntry::Vectorize;
10345 }
10346 case Instruction::Store: {
10347 // Check if the stores are consecutive or if we need to swizzle them.
10348 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10349 // Avoid types that are padded when being allocated as scalars, while
10350 // being packed together in a vector (such as i1).
10351 if (DL->getTypeSizeInBits(ScalarTy) !=
10352 DL->getTypeAllocSizeInBits(ScalarTy)) {
10353 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10354 return TreeEntry::NeedToGather;
10355 }
10356 // Make sure all stores in the bundle are simple - we can't vectorize
10357 // atomic or volatile stores.
10358 for (Value *V : VL) {
10359 auto *SI = cast<StoreInst>(V);
10360 if (!SI->isSimple()) {
10361 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10362 return TreeEntry::NeedToGather;
10363 }
10364 PointerOps.push_back(SI->getPointerOperand());
10365 }
10366
10367 // Check the order of pointer operands.
10368 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10369 Value *Ptr0;
10370 Value *PtrN;
10371 if (CurrentOrder.empty()) {
10372 Ptr0 = PointerOps.front();
10373 PtrN = PointerOps.back();
10374 } else {
10375 Ptr0 = PointerOps[CurrentOrder.front()];
10376 PtrN = PointerOps[CurrentOrder.back()];
10377 }
10378 std::optional<int64_t> Dist =
10379 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10380 // Check that the sorted pointer operands are consecutive.
10381 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10382 return TreeEntry::Vectorize;
10383 }
10384
10385 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10386 return TreeEntry::NeedToGather;
10387 }
10388 case Instruction::Call: {
10389 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10390 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10391 auto *I = dyn_cast<Instruction>(V);
10392 return I && !I->isFast();
10393 }))
10394 return TreeEntry::NeedToGather;
10395 // Check if the calls are all to the same vectorizable intrinsic or
10396 // library function.
10397 CallInst *CI = cast<CallInst>(VL0);
10398 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10399
10400 VFShape Shape = VFShape::get(
10401 CI->getFunctionType(),
10402 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10403 false /*HasGlobalPred*/);
10404 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10405
10406 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10407 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10408 return TreeEntry::NeedToGather;
10409 }
10410 Function *F = CI->getCalledFunction();
10411 unsigned NumArgs = CI->arg_size();
10412 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10413 for (unsigned J = 0; J != NumArgs; ++J)
10414 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10415 ScalarArgs[J] = CI->getArgOperand(J);
10416 for (Value *V : VL) {
10417 CallInst *CI2 = dyn_cast<CallInst>(V);
10418 if (!CI2 || CI2->getCalledFunction() != F ||
10419 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10420 (VecFunc &&
10421 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10422 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10423 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10424 << "\n");
10425 return TreeEntry::NeedToGather;
10426 }
10427 // Some intrinsics have scalar arguments and should be same in order for
10428 // them to be vectorized.
10429 for (unsigned J = 0; J != NumArgs; ++J) {
10430 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10431 Value *A1J = CI2->getArgOperand(J);
10432 if (ScalarArgs[J] != A1J) {
10434 << "SLP: mismatched arguments in call:" << *CI
10435 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10436 return TreeEntry::NeedToGather;
10437 }
10438 }
10439 }
10440 // Verify that the bundle operands are identical between the two calls.
10441 if (CI->hasOperandBundles() &&
10442 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10443 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10444 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10445 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10446 << "!=" << *V << '\n');
10447 return TreeEntry::NeedToGather;
10448 }
10449 }
10450 SmallVector<Type *> ArgTys =
10451 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10452 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10453 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10454 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10455 return TreeEntry::NeedToGather;
10456
10457 return TreeEntry::Vectorize;
10458 }
10459 case Instruction::ShuffleVector: {
10460 if (!S.isAltShuffle()) {
10461 // REVEC can support non alternate shuffle.
10463 return TreeEntry::Vectorize;
10464 // If this is not an alternate sequence of opcode like add-sub
10465 // then do not vectorize this instruction.
10466 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10467 return TreeEntry::NeedToGather;
10468 }
10469 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10470 LLVM_DEBUG(
10471 dbgs()
10472 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10473 "the whole alt sequence is not profitable.\n");
10474 return TreeEntry::NeedToGather;
10475 }
10476
10477 return TreeEntry::Vectorize;
10478 }
10479 default:
10480 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10481 return TreeEntry::NeedToGather;
10482 }
10483}
10484
10485namespace {
10486 /// Allows correct handling of the operands of the phi nodes, based on the \p Main
10487 /// PHINode's order of incoming basic blocks/values.
10488class PHIHandler {
10489 DominatorTree &DT;
10490 PHINode *Main = nullptr;
10491 ArrayRef<Value *> Phis;
10492 SmallVector<SmallVector<Value *>> Operands;
10493
10494public:
10495 PHIHandler() = delete;
10496 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10497 : DT(DT), Main(Main), Phis(Phis),
10498 Operands(Main->getNumIncomingValues(),
10499 SmallVector<Value *>(Phis.size(), nullptr)) {}
10500 void buildOperands() {
10501 constexpr unsigned FastLimit = 4;
10502 if (Main->getNumIncomingValues() <= FastLimit) {
10503 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10504 BasicBlock *InBB = Main->getIncomingBlock(I);
10505 if (!DT.isReachableFromEntry(InBB)) {
10506 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10507 continue;
10508 }
10509 // Prepare the operand vector.
10510 for (auto [Idx, V] : enumerate(Phis)) {
10511 auto *P = dyn_cast<PHINode>(V);
10512 if (!P) {
10514 "Expected isa instruction or poison value.");
10515 Operands[I][Idx] = V;
10516 continue;
10517 }
10518 if (P->getIncomingBlock(I) == InBB)
10519 Operands[I][Idx] = P->getIncomingValue(I);
10520 else
10521 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10522 }
10523 }
10524 return;
10525 }
10526 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10527 Blocks;
10528 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10529 BasicBlock *InBB = Main->getIncomingBlock(I);
10530 if (!DT.isReachableFromEntry(InBB)) {
10531 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10532 continue;
10533 }
10534 Blocks.try_emplace(InBB).first->second.push_back(I);
10535 }
10536 for (auto [Idx, V] : enumerate(Phis)) {
10537 if (isa<PoisonValue>(V)) {
10538 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10539 Operands[I][Idx] = V;
10540 continue;
10541 }
10542 auto *P = cast<PHINode>(V);
10543 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10544 BasicBlock *InBB = P->getIncomingBlock(I);
10545 if (InBB == Main->getIncomingBlock(I)) {
10546 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10547 continue;
10548 Operands[I][Idx] = P->getIncomingValue(I);
10549 continue;
10550 }
10551 auto *It = Blocks.find(InBB);
10552 if (It == Blocks.end())
10553 continue;
10554 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10555 }
10556 }
10557 for (const auto &P : Blocks) {
10558 ArrayRef<unsigned> IncomingValues = P.second;
10559 if (IncomingValues.size() <= 1)
10560 continue;
10561 unsigned BasicI = IncomingValues.consume_front();
10562 for (unsigned I : IncomingValues) {
10563 assert(all_of(enumerate(Operands[I]),
10564 [&](const auto &Data) {
10565 return !Data.value() ||
10566 Data.value() == Operands[BasicI][Data.index()];
10567 }) &&
10568 "Expected empty operands list.");
10569 Operands[I] = Operands[BasicI];
10570 }
10571 }
10572 }
10573 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10574};
10575} // namespace
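// Illustrative example (editor's sketch, not from the original source): given
// two phis whose incoming blocks are listed in a different order,
//   %p0 = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
//   %p1 = phi i32 [ %d, %bb2 ], [ %c, %bb1 ]
// PHIHandler keyed on %p0 produces Operands[0] = {%a, %c} (values incoming
// from %bb1) and Operands[1] = {%b, %d} (values incoming from %bb2), i.e. the
// operands are regrouped by incoming block rather than by operand position.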
10576
10577/// Returns main/alternate instructions for the given \p VL. Unlike
10578/// getSameOpcode, it supports non-compatible instructions, for better
10579/// SplitVectorize node support.
10580/// \returns the first main/alt instructions if the list contains only poisons
10581/// and instructions with exactly 2 opcodes; a pair of nullptrs otherwise.
10582static std::pair<Instruction *, Instruction *>
10583getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10584 Instruction *MainOp = nullptr;
10585 Instruction *AltOp = nullptr;
10586 for (Value *V : VL) {
10587 if (isa<PoisonValue>(V))
10588 continue;
10589 auto *I = dyn_cast<Instruction>(V);
10590 if (!I)
10591 return {};
10592 if (!MainOp) {
10593 MainOp = I;
10594 continue;
10595 }
10596 if (MainOp->getOpcode() == I->getOpcode()) {
10597 if (I->getParent() != MainOp->getParent())
10598 return {};
10599 continue;
10600 }
10601 if (!AltOp) {
10602 AltOp = I;
10603 continue;
10604 }
10605 if (AltOp->getOpcode() == I->getOpcode()) {
10606 if (I->getParent() != AltOp->getParent())
10607 return {};
10608 continue;
10609 }
10610 return {};
10611 }
10612 if (!AltOp)
10613 return {};
10614 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10615 "Expected different main and alt instructions.");
10616 return std::make_pair(MainOp, AltOp);
10617}
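// Illustrative example (editor's sketch): for VL = {add %a, %b; sub %c, %d;
// poison; add %e, %f} with all instructions in one block this returns the
// pair {add, sub}. A third distinct opcode, a non-instruction value other
// than poison, or a list with fewer than two opcodes yields {nullptr, nullptr}.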
10618
10619/// Checks that every instruction appears once in the list and if not, packs
10620/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10621/// unique scalars is extended by poison values to the whole register size.
10622///
10623/// \returns false if \p VL could not be uniquified, in which case \p VL is
10624/// unchanged and \p ReuseShuffleIndices is empty.
10625static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10626 SmallVectorImpl<int> &ReuseShuffleIndices,
10627 const TargetTransformInfo &TTI,
10628 const TargetLibraryInfo &TLI,
10629 const InstructionsState &S,
10630 const BoUpSLP::EdgeInfo &UserTreeIdx,
10631 bool TryPad = false) {
10632 // Check that every instruction appears once in this bundle.
10633 SmallVector<Value *> UniqueValues;
10634 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10635 for (Value *V : VL) {
10636 if (isConstant(V)) {
10637 // Constants are always considered distinct, even if the same constant
10638 // appears multiple times in VL.
10639 ReuseShuffleIndices.emplace_back(
10640 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10641 UniqueValues.emplace_back(V);
10642 continue;
10643 }
10644 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10645 ReuseShuffleIndices.emplace_back(Res.first->second);
10646 if (Res.second)
10647 UniqueValues.emplace_back(V);
10648 }
10649
10650 // Easy case: VL has unique values and a "natural" size
10651 size_t NumUniqueScalarValues = UniqueValues.size();
10652 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10653 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10654 if (NumUniqueScalarValues == VL.size() &&
10655 (VectorizeNonPowerOf2 || IsFullVectors)) {
10656 ReuseShuffleIndices.clear();
10657 return true;
10658 }
10659
10660 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10661 if ((UserTreeIdx.UserTE &&
10662 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10663 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10664 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10665 "for nodes with padding.\n");
10666 ReuseShuffleIndices.clear();
10667 return false;
10668 }
10669
10670 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10671 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10672 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10673 return isa<UndefValue>(V) || !isConstant(V);
10674 }))) {
10675 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10676 S.getMainOp()->isSafeToRemove() &&
10677 (S.areInstructionsWithCopyableElements() ||
10678 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10679 // Find the number of elements, which forms full vectors.
10680 unsigned PWSz = getFullVectorNumberOfElements(
10681 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10682 PWSz = std::min<unsigned>(PWSz, VL.size());
10683 if (PWSz == VL.size()) {
10684 // We ended up with the same size after removing duplicates and
10685 // upgrading the resulting vector size to a "nice size". Just keep
10686 // the initial VL then.
10687 ReuseShuffleIndices.clear();
10688 } else {
10689 // Pad unique values with poison to grow the vector to a "nice" size
10690 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10691 UniqueValues.end());
10692 PaddedUniqueValues.append(
10693 PWSz - UniqueValues.size(),
10694 PoisonValue::get(UniqueValues.front()->getType()));
10695 // Check that the operations, extended with poisons/copyables, are still
10696 // valid for vectorization (div/rem are not allowed).
10697 if ((!S.areInstructionsWithCopyableElements() &&
10698 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10699 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10700 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10701 isa<CallInst>(S.getMainOp())))) {
10702 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10703 ReuseShuffleIndices.clear();
10704 return false;
10705 }
10706 VL = std::move(PaddedUniqueValues);
10707 }
10708 return true;
10709 }
10710 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10711 ReuseShuffleIndices.clear();
10712 return false;
10713 }
10714 VL = std::move(UniqueValues);
10715 return true;
10716}
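// Illustrative example (editor's sketch): for VL = {%a, %b, %a, %b} the unique
// scalars are {%a, %b} and ReuseShuffleIndices becomes {0, 1, 0, 1}; the tree
// node is built for the unique scalars and the duplicate lanes are recreated
// later with a reuse shuffle. With TryPad, a bundle whose number of unique
// scalars is not a full-register/power-of-2 count (e.g. 3 unique values) may
// instead be padded with poison up to the target's "full vector" size.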
10717
10718bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10719 const InstructionsState &LocalState,
10720 SmallVectorImpl<Value *> &Op1,
10721 SmallVectorImpl<Value *> &Op2,
10722 OrdersType &ReorderIndices) const {
10723 constexpr unsigned SmallNodeSize = 4;
10724 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10725 !SplitAlternateInstructions)
10726 return false;
10727
10728 // Check if this is a duplicate of another split entry.
10729 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10730 << ".\n");
10731 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10732 if (E->isSame(VL)) {
10733 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10734 << *LocalState.getMainOp() << ".\n");
10735 return false;
10736 }
10737 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10738 if (all_of(VL, [&](Value *V) {
10739 return isa<PoisonValue>(V) || Values.contains(V);
10740 })) {
10741 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10742 return false;
10743 }
10744 }
10745
10746 ReorderIndices.assign(VL.size(), VL.size());
10747 SmallBitVector Op1Indices(VL.size());
10748 for (auto [Idx, V] : enumerate(VL)) {
10749 auto *I = dyn_cast<Instruction>(V);
10750 if (!I) {
10751 Op1.push_back(V);
10752 Op1Indices.set(Idx);
10753 continue;
10754 }
10755 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10756 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10757 *TLI)) ||
10758 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10759 !isAlternateInstruction(I, LocalState.getMainOp(),
10760 LocalState.getAltOp(), *TLI))) {
10761 Op1.push_back(V);
10762 Op1Indices.set(Idx);
10763 continue;
10764 }
10765 Op2.push_back(V);
10766 }
10767 Type *ScalarTy = getValueType(VL.front());
10768 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10769 unsigned Opcode0 = LocalState.getOpcode();
10770 unsigned Opcode1 = LocalState.getAltOpcode();
10771 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10772 // Enable the split node only if the scalars do not form a legal alternate
10773 // instruction (like X86 addsub).
10774 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10775 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10776 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10777 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10778 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10779 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10780 return false;
10781 // Enable split node, only if all nodes are power-of-2/full registers.
10782 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10783 for (unsigned Idx : seq<unsigned>(VL.size())) {
10784 if (Op1Indices.test(Idx)) {
10785 ReorderIndices[Op1Cnt] = Idx;
10786 ++Op1Cnt;
10787 } else {
10788 ReorderIndices[Op2Cnt] = Idx;
10789 ++Op2Cnt;
10790 }
10791 }
10792 if (isIdentityOrder(ReorderIndices))
10793 ReorderIndices.clear();
10794 SmallVector<int> Mask;
10795 if (!ReorderIndices.empty())
10796 inversePermutation(ReorderIndices, Mask);
10797 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10798 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10799 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10800 // Check non-profitable single-register ops, which are better represented
10801 // as alternate ops.
10802 if (NumParts >= VL.size())
10803 return false;
10804 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10805 InstructionCost InsertCost = ::getShuffleCost(
10806 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10807 FixedVectorType *SubVecTy =
10808 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10809 InstructionCost NewShuffleCost =
10810 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10811 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10812 (Mask.empty() || InsertCost >= NewShuffleCost))
10813 return false;
10814 if ((LocalState.getMainOp()->isBinaryOp() &&
10815 LocalState.getAltOp()->isBinaryOp() &&
10816 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10817 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10818 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10819 (LocalState.getMainOp()->isUnaryOp() &&
10820 LocalState.getAltOp()->isUnaryOp())) {
10821 InstructionCost OriginalVecOpsCost =
10822 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10823 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10824 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10825 for (unsigned Idx : seq<unsigned>(VL.size())) {
10826 if (isa<PoisonValue>(VL[Idx]))
10827 continue;
10828 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10829 }
10830 InstructionCost OriginalCost =
10831 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10832 VecTy, OriginalMask, Kind);
10833 InstructionCost NewVecOpsCost =
10834 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10835 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10836 InstructionCost NewCost =
10837 NewVecOpsCost + InsertCost +
10838 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10839 VectorizableTree.front()->getOpcode() == Instruction::Store
10840 ? NewShuffleCost
10841 : 0);
10842 // If not profitable to split - exit.
10843 if (NewCost >= OriginalCost)
10844 return false;
10845 }
10846 return true;
10847}
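// Illustrative example (editor's sketch): an 8-lane bundle mixing adds and
// subs that does not (or not profitably) map onto a single alternate add/sub
// node can be split: Op1 collects the main-opcode lanes (the adds), Op2 the
// alternate lanes (the subs), and ReorderIndices records the permutation
// needed to restore the original lane order after the two halves are
// vectorized separately.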
10848
10849namespace {
10850/// Class accepts an incoming list of values, checks if it is able to model
10851/// "copyable" values as compatible operations, and generates the list of
10852/// values for scheduling and the list of operands for the new nodes.
10853class InstructionsCompatibilityAnalysis {
10854 DominatorTree &DT;
10855 const DataLayout &DL;
10856 const TargetTransformInfo &TTI;
10857 const TargetLibraryInfo &TLI;
10858 unsigned MainOpcode = 0;
10859 Instruction *MainOp = nullptr;
10860
10861 /// Checks if the opcode is supported as the main opcode for copyable
10862 /// elements.
10863 static bool isSupportedOpcode(const unsigned Opcode) {
10864 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
10865 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
10866 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
10867 Opcode == Instruction::And || Opcode == Instruction::Or ||
10868 Opcode == Instruction::Xor;
10869 }
10870
10871 /// Identifies the best candidate value, which represents main opcode
10872 /// operation.
10873 /// Currently the best candidate is the Add instruction with the parent
10874 /// block with the highest DFS incoming number (block, that dominates other).
10875 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10876 BasicBlock *Parent = nullptr;
10877 // Checks if the instruction has supported opcode.
10878 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10879 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10880 return false;
10881 return I && isSupportedOpcode(I->getOpcode()) &&
10882 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10883 };
10884 // Exclude operand instructions immediately to improve compile time; they
10885 // will be impossible to schedule anyway.
10886 SmallDenseSet<Value *, 8> Operands;
10887 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10888 bool AnyUndef = false;
10889 for (Value *V : VL) {
10890 auto *I = dyn_cast<Instruction>(V);
10891 if (!I) {
10892 AnyUndef |= isa<UndefValue>(V);
10893 continue;
10894 }
10895 if (!DT.isReachableFromEntry(I->getParent()))
10896 continue;
10897 if (Candidates.empty()) {
10898 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10899 Parent = I->getParent();
10900 Operands.insert(I->op_begin(), I->op_end());
10901 continue;
10902 }
10903 if (Parent == I->getParent()) {
10904 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10905 Operands.insert(I->op_begin(), I->op_end());
10906 continue;
10907 }
10908 auto *NodeA = DT.getNode(Parent);
10909 auto *NodeB = DT.getNode(I->getParent());
10910 assert(NodeA && "Should only process reachable instructions");
10911 assert(NodeB && "Should only process reachable instructions");
10912 assert((NodeA == NodeB) ==
10913 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10914 "Different nodes should have different DFS numbers");
10915 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10916 Candidates.clear();
10917 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10918 Parent = I->getParent();
10919 Operands.clear();
10920 Operands.insert(I->op_begin(), I->op_end());
10921 }
10922 }
10923 unsigned BestOpcodeNum = 0;
10924 MainOp = nullptr;
10925 for (const auto &P : Candidates) {
10926 if (P.second.size() < BestOpcodeNum)
10927 continue;
10928 // If there are inner dependencies - skip.
10929 if (any_of(P.second,
10930 [&](Instruction *I) { return Operands.contains(I); }))
10931 continue;
10932 for (Instruction *I : P.second) {
10933 if (IsSupportedInstruction(I, AnyUndef)) {
10934 MainOp = I;
10935 BestOpcodeNum = P.second.size();
10936 break;
10937 }
10938 }
10939 }
10940 if (MainOp) {
10941 // Do not match, if any copyable is a terminator from the same block as
10942 // the main operation.
10943 if (any_of(VL, [&](Value *V) {
10944 auto *I = dyn_cast<Instruction>(V);
10945 return I && I->getParent() == MainOp->getParent() &&
10946 I->isTerminator();
10947 })) {
10948 MainOp = nullptr;
10949 return;
10950 }
10951 MainOpcode = MainOp->getOpcode();
10952 }
10953 }
10954
10955 /// Returns the idempotent value for the \p MainOp with the detected \p
10956 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10957 /// the operand itself, since V or V == V.
10958 Value *selectBestIdempotentValue() const {
10959 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10960 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10961 !MainOp->isCommutative());
10962 }
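// Editor's note (illustrative, not from the original source): the idempotent
// value is the identity of the operation, e.g. 0 for add/or/xor and for the
// right-hand side of sub and shifts, 1 for the right-hand side of udiv/sdiv,
// and -1 (all bits set) for and; a copyable lane %x is then modeled as
// "%x <op> identity" without changing its value.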
10963
10964 /// Returns the value and operands for the \p V, considering whether it is an
10965 /// original instruction whose actual operands should be returned, or a
10966 /// copyable element that should be represented as an idempotent instruction.
10967 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10968 if (isa<PoisonValue>(V))
10969 return {V, V};
10970 if (!S.isCopyableElement(V))
10971 return convertTo(cast<Instruction>(V), S).second;
10972 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10973 return {V, selectBestIdempotentValue()};
10974 }
10975
10976 /// Builds operands for the original instructions.
10977 void
10978 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10979 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10980
10981 unsigned ShuffleOrOp =
10982 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10983 Instruction *VL0 = S.getMainOp();
10984
10985 switch (ShuffleOrOp) {
10986 case Instruction::PHI: {
10987 auto *PH = cast<PHINode>(VL0);
10988
10989 // Keeps the reordered operands to avoid code duplication.
10990 PHIHandler Handler(DT, PH, VL);
10991 Handler.buildOperands();
10992 Operands.assign(PH->getNumOperands(), {});
10993 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10994 Operands[I].assign(Handler.getOperands(I).begin(),
10995 Handler.getOperands(I).end());
10996 return;
10997 }
10998 case Instruction::ExtractValue:
10999 case Instruction::ExtractElement:
11000 // This is a special case, as it does not gather, but at the same time
11001 // we are not extending buildTree_rec() towards the operands.
11002 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11003 return;
11004 case Instruction::InsertElement:
11005 Operands.assign(2, {VL.size(), nullptr});
11006 for (auto [Idx, V] : enumerate(VL)) {
11007 auto *IE = cast<InsertElementInst>(V);
11008 for (auto [OpIdx, Ops] : enumerate(Operands))
11009 Ops[Idx] = IE->getOperand(OpIdx);
11010 }
11011 return;
11012 case Instruction::Load:
11013 Operands.assign(
11014 1, {VL.size(),
11015 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11016 for (auto [V, Op] : zip(VL, Operands.back())) {
11017 auto *LI = dyn_cast<LoadInst>(V);
11018 if (!LI)
11019 continue;
11020 Op = LI->getPointerOperand();
11021 }
11022 return;
11023 case Instruction::ZExt:
11024 case Instruction::SExt:
11025 case Instruction::FPToUI:
11026 case Instruction::FPToSI:
11027 case Instruction::FPExt:
11028 case Instruction::PtrToInt:
11029 case Instruction::IntToPtr:
11030 case Instruction::SIToFP:
11031 case Instruction::UIToFP:
11032 case Instruction::Trunc:
11033 case Instruction::FPTrunc:
11034 case Instruction::BitCast:
11035 case Instruction::ICmp:
11036 case Instruction::FCmp:
11037 case Instruction::Select:
11038 case Instruction::FNeg:
11039 case Instruction::Add:
11040 case Instruction::FAdd:
11041 case Instruction::Sub:
11042 case Instruction::FSub:
11043 case Instruction::Mul:
11044 case Instruction::FMul:
11045 case Instruction::UDiv:
11046 case Instruction::SDiv:
11047 case Instruction::FDiv:
11048 case Instruction::URem:
11049 case Instruction::SRem:
11050 case Instruction::FRem:
11051 case Instruction::Shl:
11052 case Instruction::LShr:
11053 case Instruction::AShr:
11054 case Instruction::And:
11055 case Instruction::Or:
11056 case Instruction::Xor:
11057 case Instruction::Freeze:
11058 case Instruction::Store:
11059 case Instruction::ShuffleVector:
11060 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11061 for (auto [Idx, V] : enumerate(VL)) {
11062 auto *I = dyn_cast<Instruction>(V);
11063 if (!I) {
11064 for (auto [OpIdx, Ops] : enumerate(Operands))
11065 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11066 continue;
11067 }
11068 auto [Op, ConvertedOps] = convertTo(I, S);
11069 for (auto [OpIdx, Ops] : enumerate(Operands))
11070 Ops[Idx] = ConvertedOps[OpIdx];
11071 }
11072 return;
11073 case Instruction::GetElementPtr: {
11074 Operands.assign(2, {VL.size(), nullptr});
11075 // Need to cast all indices to the same type before vectorization to
11076 // avoid crash.
11077 // Required to be able to find correct matches between different gather
11078 // nodes and reuse the vectorized values rather than trying to gather them
11079 // again.
11080 const unsigned IndexIdx = 1;
11081 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11082 Type *Ty =
11083 all_of(VL,
11084 [&](Value *V) {
11085 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11086 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11087 })
11088 ? VL0Ty
11089 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11090 ->getPointerOperandType()
11091 ->getScalarType());
11092 for (auto [Idx, V] : enumerate(VL)) {
11093 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11094 if (!GEP) {
11095 Operands[0][Idx] = V;
11096 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11097 continue;
11098 }
11099 Operands[0][Idx] = GEP->getPointerOperand();
11100 auto *Op = GEP->getOperand(IndexIdx);
11101 auto *CI = dyn_cast<ConstantInt>(Op);
11102 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11103 CI, Ty, CI->getValue().isSignBitSet(), DL)
11104 : Op;
11105 }
11106 return;
11107 }
11108 case Instruction::Call: {
11109 auto *CI = cast<CallInst>(VL0);
11110 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
11111 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11112 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
11113 continue;
11114 auto &Ops = Operands.emplace_back();
11115 for (Value *V : VL) {
11116 auto *I = dyn_cast<Instruction>(V);
11117 Ops.push_back(I ? I->getOperand(Idx)
11118 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11119 }
11120 }
11121 return;
11122 }
11123 default:
11124 break;
11125 }
11126 llvm_unreachable("Unexpected vectorization of the instructions.");
11127 }
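// Illustrative example (editor's sketch): for a binary-op bundle
//   VL = {add %a0, %b0; add %a1, %b1}
// the operands are transposed into per-operand lane lists,
// Operands[0] = {%a0, %a1} and Operands[1] = {%b0, %b1}, which is the shape
// buildTreeRec() expects when recursing into operands; non-instruction lanes
// are filled with poison of the corresponding operand type.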
11128
11129public:
11130 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11131 const TargetTransformInfo &TTI,
11132 const TargetLibraryInfo &TLI)
11133 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11134
11135 InstructionsState
11136 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11137 bool TryCopyableElementsVectorization,
11138 bool WithProfitabilityCheck = false,
11139 bool SkipSameCodeCheck = false) {
11140 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11141 ? InstructionsState::invalid()
11142 : getSameOpcode(VL, TLI);
11143 if (S)
11144 return S;
11145 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11146 return S;
11147 findAndSetMainInstruction(VL, R);
11148 if (!MainOp)
11149 return InstructionsState::invalid();
11150 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11151 if (!WithProfitabilityCheck)
11152 return S;
11153 // Check if it is profitable to vectorize the instruction.
11154 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11155 auto BuildCandidates =
11156 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11157 Value *V2) {
11158 if (V1 != V2 && isa<PHINode>(V1))
11159 return;
11160 auto *I1 = dyn_cast<Instruction>(V1);
11161 auto *I2 = dyn_cast<Instruction>(V2);
11162 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11163 I1->getParent() != I2->getParent())
11164 return;
11165 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11166 };
11167 if (VL.size() == 2) {
11168 // Check if the operands allow better vectorization.
11169 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11170 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11171 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11172 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11173 R.findBestRootPair(Candidates1) &&
11174 R.findBestRootPair(Candidates2);
11175 if (!Res && isCommutative(MainOp)) {
11176 Candidates1.clear();
11177 Candidates2.clear();
11178 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11179 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11180 Res = !Candidates1.empty() && !Candidates2.empty() &&
11181 R.findBestRootPair(Candidates1) &&
11182 R.findBestRootPair(Candidates2);
11183 }
11184 if (!Res)
11185 return InstructionsState::invalid();
11186 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11187 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11188 InstructionCost VectorCost;
11189 FixedVectorType *VecTy =
11190 getWidenedType(S.getMainOp()->getType(), VL.size());
11191 switch (MainOpcode) {
11192 case Instruction::Add:
11193 case Instruction::Sub:
11194 case Instruction::LShr:
11195 case Instruction::Shl:
11196 case Instruction::SDiv:
11197 case Instruction::UDiv:
11198 case Instruction::And:
11199 case Instruction::Or:
11200 case Instruction::Xor:
11201 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11202 break;
11203 default:
11204 llvm_unreachable("Unexpected instruction.");
11205 }
11206 if (VectorCost > ScalarCost)
11207 return InstructionsState::invalid();
11208 return S;
11209 }
11210 assert(Operands.size() == 2 && "Unexpected number of operands!");
11211 unsigned CopyableNum =
11212 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11213 if (CopyableNum < VL.size() / 2)
11214 return S;
11215 // Too many phi copyables - exit.
11216 const unsigned Limit = VL.size() / 24;
11217 if ((CopyableNum >= VL.size() - Limit ||
11218 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11219 CopyableNum >= MaxPHINumOperands) &&
11220 all_of(VL, [&](Value *V) {
11221 return isa<PHINode>(V) || !S.isCopyableElement(V);
11222 }))
11223 return InstructionsState::invalid();
11224 // Check profitability if number of copyables > VL.size() / 2.
11225 // 1. Reorder operands for better matching.
11226 if (isCommutative(MainOp)) {
11227 for (auto &Ops : Operands) {
11228 // Make instructions the first operands.
11229 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11230 std::swap(Ops.front(), Ops.back());
11231 continue;
11232 }
11233 // Make constants the second operands.
11234 if (isa<Constant>(Ops.front())) {
11235 std::swap(Ops.front(), Ops.back());
11236 continue;
11237 }
11238 }
11239 }
11240 // 2. Check, if operands can be vectorized.
11241 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11242 return InstructionsState::invalid();
11243 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11244 if (allConstant(Ops) || isSplat(Ops))
11245 return true;
11246 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11247 // single one is different.
11248 constexpr unsigned Limit = 4;
11249 if (Operands.front().size() >= Limit) {
11250 SmallDenseMap<const Value *, unsigned> Counters;
11251 for (Value *V : Ops) {
11252 if (isa<UndefValue>(V))
11253 continue;
11254 ++Counters[V];
11255 }
11256 if (Counters.size() == 2 &&
11257 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11258 return C.second == 1;
11259 }))
11260 return true;
11261 }
11262 // First operand not a constant or splat? Last attempt - check for
11263 // potential vectorization.
11264 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11265 InstructionsState OpS = Analysis.buildInstructionsState(
11266 Ops, R, /*TryCopyableElementsVectorization=*/true);
11267 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11268 return false;
11269 unsigned CopyableNum =
11270 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11271 return CopyableNum <= VL.size() / 2;
11272 };
11273 if (!CheckOperand(Operands.front()))
11274 return InstructionsState::invalid();
11275
11276 return S;
11277 }
11278
11279 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11280 ArrayRef<Value *> VL) {
11281 assert(S && "Invalid state!");
11282 SmallVector<BoUpSLP::ValueList> Operands;
11283 if (S.areInstructionsWithCopyableElements()) {
11284 MainOp = S.getMainOp();
11285 MainOpcode = S.getOpcode();
11286 Operands.assign(MainOp->getNumOperands(),
11287 BoUpSLP::ValueList(VL.size(), nullptr));
11288 for (auto [Idx, V] : enumerate(VL)) {
11289 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11290 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11291 Operands[OperandIdx][Idx] = Operand;
11292 }
11293 } else {
11294 buildOriginalOperands(S, VL, Operands);
11295 }
11296 return Operands;
11297 }
11298};
11299} // namespace
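// Illustrative example (editor's sketch): with copyable-element modeling, a
// bundle like VL = {add %a, %b; %c} (one add plus a plain value from the same
// block) may get the state "add with copyables": %c is treated as the
// idempotent "add %c, 0", so the operands become Operands[0] = {%a, %c} and
// Operands[1] = {%b, 0}, subject to the profitability checks above.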
11300
11301BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11302 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11303 bool TryCopyableElementsVectorization) const {
11304 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11305
11306 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11307 InstructionsState S = Analysis.buildInstructionsState(
11308 VL, *this, TryCopyableElementsVectorization,
11309 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11310
11311 // Don't go into catchswitch blocks, which can happen with PHIs.
11312 // Such blocks can only have PHIs and the catchswitch. There is no
11313 // place to insert a shuffle if we need to, so just avoid that issue.
11314 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11315 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11316 // Do not try to pack to avoid extra instructions here.
11317 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11318 /*TryToFindDuplicates=*/false);
11319 }
11320
11321 // Check if this is a duplicate of another entry.
11322 if (S) {
11323 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11324 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11325 if (E->isSame(VL)) {
11326 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11327 << ".\n");
11328 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11329 }
11330 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11331 if (all_of(VL, [&](Value *V) {
11332 return isa<PoisonValue>(V) || Values.contains(V) ||
11333 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11334 LI->getLoopFor(S.getMainOp()->getParent()) &&
11335 isVectorized(V));
11336 })) {
11337 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11338 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11339 }
11340 }
11341 }
11342
11343 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11344 // a load), in which case peek through to include it in the tree, without
11345 // ballooning over-budget.
11346 if (Depth >= RecursionMaxDepth &&
11347 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11348 (match(S.getMainOp(), m_Load(m_Value())) ||
11349 all_of(VL, [&S](const Value *I) {
11350 return match(I,
11351 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11352 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11353 })))) {
11354 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11355 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11356 }
11357
11358 // Don't handle scalable vectors
11359 if (S && S.getOpcode() == Instruction::ExtractElement &&
11360 isa<ScalableVectorType>(
11361 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11362 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11363 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11364 }
11365
11366 // Don't handle vectors.
11367 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11368 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11369 // Do not try to pack to avoid extra instructions here.
11370 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11371 /*TryToFindDuplicates=*/false);
11372 }
11373
11374 // If all of the operands are identical or constant we have a simple solution.
11375 // If we deal with insert/extract instructions, they all must have constant
11376 // indices, otherwise we should gather them, not try to vectorize.
11377 // If alternate op node with 2 elements with gathered operands - do not
11378 // vectorize.
11379 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11380 if (!S || !S.isAltShuffle() || VL.size() > 2)
11381 return false;
11382 if (VectorizableTree.size() < MinTreeSize)
11383 return false;
11384 if (Depth >= RecursionMaxDepth - 1)
11385 return true;
11386 // Check if all operands are extracts, part of vector node or can build a
11387 // regular vectorize node.
11388 SmallVector<unsigned, 8> InstsCount;
11389 for (Value *V : VL) {
11390 auto *I = cast<Instruction>(V);
11391 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11392 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11393 }));
11394 }
11395 bool IsCommutative =
11396 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11397 if ((IsCommutative &&
11398 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11399 (!IsCommutative &&
11400 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11401 return true;
11402 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11403 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11404 auto *I1 = cast<Instruction>(VL.front());
11405 auto *I2 = cast<Instruction>(VL.back());
11406 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11407 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11408 I2->getOperand(Op));
11409 if (static_cast<unsigned>(count_if(
11410 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11411 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
11412 })) >= S.getMainOp()->getNumOperands() / 2)
11413 return false;
11414 if (S.getMainOp()->getNumOperands() > 2)
11415 return true;
11416 if (IsCommutative) {
11417 // Check permuted operands.
11418 Candidates.clear();
11419 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11420 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11421 I2->getOperand((Op + 1) % E));
11422 if (any_of(
11423 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11424 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
11425 }))
11426 return false;
11427 }
11428 return true;
11429 };
11430 SmallVector<unsigned> SortedIndices;
11431 BasicBlock *BB = nullptr;
11432 bool IsScatterVectorizeUserTE =
11433 UserTreeIdx.UserTE &&
11434 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11435 bool AreAllSameBlock = S.valid();
11436 bool AreScatterAllGEPSameBlock =
11437 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11438 VL.size() > 2 &&
11439 all_of(VL,
11440 [&BB](Value *V) {
11441 auto *I = dyn_cast<GetElementPtrInst>(V);
11442 if (!I)
11443 return doesNotNeedToBeScheduled(V);
11444 if (!BB)
11445 BB = I->getParent();
11446 return BB == I->getParent() && I->getNumOperands() == 2;
11447 }) &&
11448 BB &&
11449 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11450 SortedIndices));
11451 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11452 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11453 (S &&
11454 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11455 S.getMainOp()) &&
11456 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11457 NotProfitableForVectorization(VL)) {
11458 if (!S) {
11459 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11460 "C,S,B,O, small shuffle. \n";
11461 dbgs() << "[";
11462 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11463 dbgs() << "]\n");
11464 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11465 /*TryToFindDuplicates=*/true,
11466 /*TrySplitVectorize=*/true);
11467 }
11468 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11469 dbgs() << "[";
11470 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11471 dbgs() << "]\n");
11472 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11473 }
11474
11475 // Don't vectorize ephemeral values.
11476 if (S && !EphValues.empty()) {
11477 for (Value *V : VL) {
11478 if (EphValues.count(V)) {
11479 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11480 << ") is ephemeral.\n");
11481 // Do not try to pack to avoid extra instructions here.
11482 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11483 /*TryToFindDuplicates=*/false);
11484 }
11485 }
11486 }
11487
11488 // We now know that this is a vector of instructions of the same type from
11489 // the same block.
11490
11491 // Check that none of the instructions in the bundle are already in the tree
11492 // and the node may be not profitable for the vectorization as the small
11493 // alternate node.
11494 if (S && S.isAltShuffle()) {
11495 auto GetNumVectorizedExtracted = [&]() {
11496 APInt Extracted = APInt::getZero(VL.size());
11497 APInt Vectorized = APInt::getAllOnes(VL.size());
11498 for (auto [Idx, V] : enumerate(VL)) {
11499 auto *I = dyn_cast<Instruction>(V);
11500 if (!I || doesNotNeedToBeScheduled(I) ||
11501 all_of(I->operands(), [&](const Use &U) {
11502 return isa<ExtractElementInst>(U.get());
11503 }))
11504 continue;
11505 if (isVectorized(I))
11506 Vectorized.clearBit(Idx);
11507 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11508 Extracted.setBit(Idx);
11509 }
11510 return std::make_pair(Vectorized, Extracted);
11511 };
11512 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11513 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11514 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11515 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11516 // Rough cost estimation, if the vector code (+ potential extracts) is
11517 // more profitable than the scalar + buildvector.
11518 Type *ScalarTy = VL.front()->getType();
11519 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11520 InstructionCost VectorizeCostEstimate =
11521 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11522 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11523 /*Insert=*/false, /*Extract=*/true, Kind);
11524 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11525 *TTI, ScalarTy, VecTy, Vectorized,
11526 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11527 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11528 }
11529 if (PreferScalarize) {
11530 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11531 "node is not profitable.\n");
11532 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11533 }
11534 }
11535
11536 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11537 if (UserIgnoreList && !UserIgnoreList->empty()) {
11538 for (Value *V : VL) {
11539 if (UserIgnoreList->contains(V)) {
11540 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11541 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11542 }
11543 }
11544 }
11545
11546 // Special processing for sorted pointers for ScatterVectorize node with
11547 // constant indices only.
11548 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11549 assert(VL.front()->getType()->isPointerTy() &&
11550 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
11551 "Expected pointers only.");
11552 // Reset S to make it GetElementPtr kind of node.
11553 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11554 assert(It != VL.end() && "Expected at least one GEP.");
11555 S = getSameOpcode(*It, *TLI);
11556 }
11557
11558 // Check that all of the users of the scalars that we want to vectorize are
11559 // schedulable.
11560 Instruction *VL0 = S.getMainOp();
11561 BB = VL0->getParent();
11562
11563 if (S &&
11564 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11565 !DT->isReachableFromEntry(BB))) {
11566 // Don't go into unreachable blocks. They may contain instructions with
11567 // dependency cycles which confuse the final scheduling.
11568 // Do not vectorize EH and non-returning blocks, not profitable in most
11569 // cases.
11570 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11571 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11572 }
11573 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11574}
11575
11576void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11577 const EdgeInfo &UserTreeIdx,
11578 unsigned InterleaveFactor) {
11579 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11580
11581 SmallVector<int> ReuseShuffleIndices;
11582 SmallVector<Value *> VL(VLRef);
11583
11584 // Tries to build split node.
11585 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11586 SmallVector<Value *> Op1, Op2;
11587 OrdersType ReorderIndices;
11588 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11589 return false;
11590
11591 auto Invalid = ScheduleBundle::invalid();
11592 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11593 UserTreeIdx, {}, ReorderIndices);
11594 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11595 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11596 InstructionsState S = getSameOpcode(Op, *TLI);
11597 if (S && (isa<LoadInst>(S.getMainOp()) ||
11598 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11599 // Build gather node for loads, they will be gathered later.
11600 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11601 Idx == 0 ? 0 : Op1.size());
11602 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11603 } else {
11604 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11605 Idx == 0 ? 0 : Op1.size());
11606 buildTreeRec(Op, Depth, {TE, Idx});
11607 }
11608 };
11609 AddNode(Op1, 0);
11610 AddNode(Op2, 1);
11611 return true;
11612 };
11613
11614 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11615 bool AreConsts = false;
11616 for (Value *V : VL) {
11617 if (isa<PoisonValue>(V))
11618 continue;
11619 if (isa<Constant>(V)) {
11620 AreConsts = true;
11621 continue;
11622 }
11623 if (!isa<PHINode>(V))
11624 return false;
11625 }
11626 return AreConsts;
11627 };
11628 if (AreOnlyConstsWithPHIs(VL)) {
11629 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11630 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11631 return;
11632 }
11633
11634 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11635 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11636 InstructionsState S = Legality.getInstructionsState();
11637 if (!Legality.isLegal()) {
11638 if (Legality.trySplitVectorize()) {
11639 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11640 // Last chance to try to vectorize alternate node.
11641 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11642 return;
11643 }
11644 if (!S)
11645 Legality = getScalarsVectorizationLegality(
11646 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11647 if (!Legality.isLegal()) {
11648 if (Legality.tryToFindDuplicates())
11649 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11650 UserTreeIdx);
11651
11652 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11653 return;
11654 }
11655 S = Legality.getInstructionsState();
11656 }
11657
11658 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11659 if (S.isAltShuffle() && TrySplitNode(S))
11660 return;
11661
11662 // Check that every instruction appears once in this bundle.
11663 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11664 /*TryPad=*/true)) {
11665 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11666 return;
11667 }
11668
11669 // Perform specific checks for each particular instruction kind.
11670 bool IsScatterVectorizeUserTE =
11671 UserTreeIdx.UserTE &&
11672 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11673 OrdersType CurrentOrder;
11674 SmallVector<Value *> PointerOps;
11675 StridedPtrInfo SPtrInfo;
11676 TreeEntry::EntryState State = getScalarsVectorizationState(
11677 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11678 if (State == TreeEntry::NeedToGather) {
11679 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11680 return;
11681 }
11682
11683 Instruction *VL0 = S.getMainOp();
11684 BasicBlock *BB = VL0->getParent();
11685 auto &BSRef = BlocksSchedules[BB];
11686 if (!BSRef)
11687 BSRef = std::make_unique<BlockScheduling>(BB);
11688
11689 BlockScheduling &BS = *BSRef;
11690
11691 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11692 std::optional<ScheduleBundle *> BundlePtr =
11693 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11694#ifdef EXPENSIVE_CHECKS
11695 // Make sure we didn't break any internal invariants
11696 BS.verify();
11697#endif
11698 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11699 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11700 // Last chance to try to vectorize alternate node.
11701 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11702 return;
11703 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11704 NonScheduledFirst.insert(VL.front());
11705 if (S.getOpcode() == Instruction::Load &&
11706 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11707 registerNonVectorizableLoads(VL);
11708 return;
11709 }
11710 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11711 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11712 ScheduleBundle Empty;
11713 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11714 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11715
11716 unsigned ShuffleOrOp =
11717 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11718 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11719 // Postpone PHI nodes creation
11720 SmallVector<unsigned> PHIOps;
11721 for (unsigned I : seq<unsigned>(Operands.size())) {
11722 ArrayRef<Value *> Op = Operands[I];
11723 if (Op.empty())
11724 continue;
11725 InstructionsState S = getSameOpcode(Op, *TLI);
11726 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11727 buildTreeRec(Op, Depth + 1, {TE, I});
11728 else
11729 PHIOps.push_back(I);
11730 }
11731 for (unsigned I : PHIOps)
11732 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11733 };
11734 switch (ShuffleOrOp) {
11735 case Instruction::PHI: {
11736 TreeEntry *TE =
11737 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11738 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11739 TE->dump());
11740
11741 TE->setOperands(Operands);
11742 CreateOperandNodes(TE, Operands);
11743 return;
11744 }
11745 case Instruction::ExtractValue:
11746 case Instruction::ExtractElement: {
11747 if (CurrentOrder.empty()) {
11748 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11749 } else {
11750 LLVM_DEBUG({
11751 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11752 "with order";
11753 for (unsigned Idx : CurrentOrder)
11754 dbgs() << " " << Idx;
11755 dbgs() << "\n";
11756 });
11757 fixupOrderingIndices(CurrentOrder);
11758 }
11759 // Insert new order with initial value 0, if it does not exist,
11760 // otherwise return the iterator to the existing one.
11761 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11762 ReuseShuffleIndices, CurrentOrder);
11763 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11764 "(ExtractValueInst/ExtractElementInst).\n";
11765 TE->dump());
11766 // This is a special case, as it does not gather, but at the same time
11767 // we are not extending buildTreeRec() towards the operands.
11768 TE->setOperands(Operands);
11769 return;
11770 }
11771 case Instruction::InsertElement: {
11772 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11773
11774 auto OrdCompare = [](const std::pair<int, int> &P1,
11775 const std::pair<int, int> &P2) {
11776 return P1.first > P2.first;
11777 };
11778 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11779 decltype(OrdCompare)>
11780 Indices(OrdCompare);
11781 for (int I = 0, E = VL.size(); I < E; ++I) {
11782 unsigned Idx = *getElementIndex(VL[I]);
11783 Indices.emplace(Idx, I);
11784 }
11785 OrdersType CurrentOrder(VL.size(), VL.size());
11786 bool IsIdentity = true;
11787 for (int I = 0, E = VL.size(); I < E; ++I) {
11788 CurrentOrder[Indices.top().second] = I;
11789 IsIdentity &= Indices.top().second == I;
11790 Indices.pop();
11791 }
11792 if (IsIdentity)
11793 CurrentOrder.clear();
11794 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11795 {}, CurrentOrder);
11796 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11797 TE->dump());
11798
11799 TE->setOperands(Operands);
11800 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11801 return;
11802 }
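// Illustrative example (editor's sketch): CurrentOrder[L] above records the
// sorted rank of lane L's insertion index; for insertelement lanes written at
// positions {2, 0, 3, 1} this gives CurrentOrder = {2, 0, 3, 1}, while an
// already in-order bundle leaves CurrentOrder empty.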
11803 case Instruction::Load: {
11804 // Check that a vectorized load would load the same memory as a scalar
11805 // load. For example, we don't want to vectorize loads that are smaller
11806 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11807 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11808 // from such a struct, we read/write packed bits disagreeing with the
11809 // unvectorized version.
11810 TreeEntry *TE = nullptr;
11811 fixupOrderingIndices(CurrentOrder);
11812 switch (State) {
11813 case TreeEntry::Vectorize:
11814 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11815 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11816 if (CurrentOrder.empty())
11817 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11818 TE->dump());
11819 else
11821 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11822 TE->dump());
11823 break;
11824 case TreeEntry::CompressVectorize:
11825 // Vectorizing non-consecutive loads with (masked)load + compress.
11826 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11827 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11828 LLVM_DEBUG(
11829 dbgs()
11830 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11831 TE->dump());
11832 break;
11833 case TreeEntry::StridedVectorize:
11834 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11835 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11836 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11837 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11838 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11839 TE->dump());
11840 break;
11841 case TreeEntry::ScatterVectorize:
11842 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11843 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11844 UserTreeIdx, ReuseShuffleIndices);
11845 LLVM_DEBUG(
11846 dbgs()
11847 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11848 TE->dump());
11849 break;
11850 case TreeEntry::CombinedVectorize:
11851 case TreeEntry::SplitVectorize:
11852 case TreeEntry::NeedToGather:
11853 llvm_unreachable("Unexpected loads state.");
11854 }
11855 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11856 assert(Operands.size() == 1 && "Expected a single operand only");
11857 SmallVector<int> Mask;
11858 inversePermutation(CurrentOrder, Mask);
11859 reorderScalars(Operands.front(), Mask);
11860 }
11861 TE->setOperands(Operands);
11862 if (State == TreeEntry::ScatterVectorize)
11863 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11864 return;
11865 }
11866 case Instruction::ZExt:
11867 case Instruction::SExt:
11868 case Instruction::FPToUI:
11869 case Instruction::FPToSI:
11870 case Instruction::FPExt:
11871 case Instruction::PtrToInt:
11872 case Instruction::IntToPtr:
11873 case Instruction::SIToFP:
11874 case Instruction::UIToFP:
11875 case Instruction::Trunc:
11876 case Instruction::FPTrunc:
11877 case Instruction::BitCast: {
11878 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11879 std::make_pair(std::numeric_limits<unsigned>::min(),
11880 std::numeric_limits<unsigned>::max()));
11881 if (ShuffleOrOp == Instruction::ZExt ||
11882 ShuffleOrOp == Instruction::SExt) {
11883 CastMaxMinBWSizes = std::make_pair(
11884 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11885 PrevMaxBW),
11886 std::min<unsigned>(
11887 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11888 PrevMinBW));
11889 } else if (ShuffleOrOp == Instruction::Trunc) {
11890 CastMaxMinBWSizes = std::make_pair(
11891 std::max<unsigned>(
11892 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11893 PrevMaxBW),
11894 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11895 PrevMinBW));
11896 }
11897 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11898 ReuseShuffleIndices);
11899 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11900 TE->dump());
11901
11902 TE->setOperands(Operands);
11903 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11904 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11905 if (ShuffleOrOp == Instruction::Trunc) {
11906 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11907 } else if (ShuffleOrOp == Instruction::SIToFP ||
11908 ShuffleOrOp == Instruction::UIToFP) {
11909 unsigned NumSignBits =
11910 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11911 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11912 APInt Mask = DB->getDemandedBits(OpI);
11913 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11914 }
11915 if (NumSignBits * 2 >=
11916 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11917 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11918 }
11919 return;
11920 }
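// Illustrative example (editor's sketch): for a bundle of "zext i8 ... to i32"
// the code above records CastMaxMinBWSizes = {32, 8} (widest destination,
// narrowest source seen so far), which later feeds the minimal-bitwidth
// analysis; a trunc bundle updates the pair from its source/destination types
// instead.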
11921 case Instruction::ICmp:
11922 case Instruction::FCmp: {
11923 // Check that all of the compares have the same predicate.
11924 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11925 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11926 ReuseShuffleIndices);
11927 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11928 TE->dump());
11929
11930 VLOperands Ops(VL, Operands, S, *this);
11931 if (cast<CmpInst>(VL0)->isCommutative()) {
11932 // Commutative predicate - collect + sort operands of the instructions
11933 // so that each side is more likely to have the same opcode.
11935 "Commutative Predicate mismatch");
11936 Ops.reorder();
11937 Operands.front() = Ops.getVL(0);
11938 Operands.back() = Ops.getVL(1);
11939 } else {
11940 // Collect operands - commute if it uses the swapped predicate.
11941 for (auto [Idx, V] : enumerate(VL)) {
11942 if (isa<PoisonValue>(V))
11943 continue;
11944 auto *Cmp = cast<CmpInst>(V);
11945 if (Cmp->getPredicate() != P0)
11946 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11947 }
11948 }
11949 TE->setOperands(Operands);
11950 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11951 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11952 if (ShuffleOrOp == Instruction::ICmp) {
11953 unsigned NumSignBits0 =
11954 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11955 if (NumSignBits0 * 2 >=
11956 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11957 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11958 unsigned NumSignBits1 =
11959 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11960 if (NumSignBits1 * 2 >=
11961 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11962 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11963 }
11964 return;
11965 }
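// Illustrative example (editor's sketch): for a non-commutative compare bundle
// {icmp slt %a, %b; icmp sgt %d, %c} the second lane uses the swapped
// predicate, so its operands are swapped above and the bundle is vectorized as
// a single "icmp slt" of {%a, %c} against {%b, %d}.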
11966 case Instruction::Select:
11967 case Instruction::FNeg:
11968 case Instruction::Add:
11969 case Instruction::FAdd:
11970 case Instruction::Sub:
11971 case Instruction::FSub:
11972 case Instruction::Mul:
11973 case Instruction::FMul:
11974 case Instruction::UDiv:
11975 case Instruction::SDiv:
11976 case Instruction::FDiv:
11977 case Instruction::URem:
11978 case Instruction::SRem:
11979 case Instruction::FRem:
11980 case Instruction::Shl:
11981 case Instruction::LShr:
11982 case Instruction::AShr:
11983 case Instruction::And:
11984 case Instruction::Or:
11985 case Instruction::Xor:
11986 case Instruction::Freeze: {
11987 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11988 ReuseShuffleIndices);
11989 LLVM_DEBUG(
11990 dbgs() << "SLP: added a new TreeEntry "
11991 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11992 TE->dump());
11993
11994 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11995 VLOperands Ops(VL, Operands, S, *this);
11996 Ops.reorder();
11997 Operands[0] = Ops.getVL(0);
11998 Operands[1] = Ops.getVL(1);
11999 }
12000 TE->setOperands(Operands);
12001 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12002 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12003 return;
12004 }
12005 case Instruction::GetElementPtr: {
12006 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12007 ReuseShuffleIndices);
12008 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12009 TE->dump());
12010 TE->setOperands(Operands);
12011
12012 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12013 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12014 return;
12015 }
12016 case Instruction::Store: {
12017 bool Consecutive = CurrentOrder.empty();
12018 if (!Consecutive)
12019 fixupOrderingIndices(CurrentOrder);
12020 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12021 ReuseShuffleIndices, CurrentOrder);
12022 if (Consecutive)
12023 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12024 TE->dump());
12025 else
12026 LLVM_DEBUG(
12027 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12028 TE->dump());
12029 TE->setOperands(Operands);
12030 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
12031 return;
12032 }
12033 case Instruction::Call: {
12034 // Check if the calls are all to the same vectorizable intrinsic or
12035 // library function.
12036 CallInst *CI = cast<CallInst>(VL0);
12038
12039 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12040 ReuseShuffleIndices);
12041 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12042 TE->dump());
12043 if (isCommutative(VL0)) {
12044 VLOperands Ops(VL, Operands, S, *this);
12045 Ops.reorder();
12046 Operands[0] = Ops.getVL(0);
12047 Operands[1] = Ops.getVL(1);
12048 }
12049 TE->setOperands(Operands);
12050 for (unsigned I : seq<unsigned>(CI->arg_size())) {
12051 // For scalar operands there is no need to create an entry since there is
12052 // no need to vectorize them.
12054 continue;
12055 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12056 }
12057 return;
12058 }
12059 case Instruction::ShuffleVector: {
12060 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12061 ReuseShuffleIndices);
12062 if (S.isAltShuffle()) {
12063 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12064 TE->dump());
12065 } else {
12066 assert(SLPReVec && "Only supported by REVEC.");
12067 LLVM_DEBUG(
12068 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12069 TE->dump());
12070 }
12071
12072 // Reorder operands if reordering would enable vectorization.
12073 auto *CI = dyn_cast<CmpInst>(VL0);
12074 if (CI && any_of(VL, [](Value *V) {
12075 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
12076 })) {
12077 auto *MainCI = cast<CmpInst>(S.getMainOp());
12078 auto *AltCI = cast<CmpInst>(S.getAltOp());
12079 CmpInst::Predicate MainP = MainCI->getPredicate();
12080 CmpInst::Predicate AltP = AltCI->getPredicate();
12081 assert(MainP != AltP &&
12082 "Expected different main/alternate predicates.");
12083 // Collect operands - commute if it uses the swapped predicate or
12084 // alternate operation.
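// Illustrative example: with MainP == "icmp sgt" and AltP == "icmp ult", a
// lane written as "icmp slt %b, %a" matches the main opcode via the swapped
// predicate, so its operands are swapped below to read as (%a, %b).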
12085 for (auto [Idx, V] : enumerate(VL)) {
12086 if (isa<PoisonValue>(V))
12087 continue;
12088 auto *Cmp = cast<CmpInst>(V);
12089
12090 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
12091 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12092 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12093 } else {
12094 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12095 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12096 }
12097 }
12098 TE->setOperands(Operands);
12099 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12100 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12101 return;
12102 }
12103
12104 if (isa<BinaryOperator>(VL0) || CI) {
12105 VLOperands Ops(VL, Operands, S, *this);
12106 Ops.reorder();
12107 Operands[0] = Ops.getVL(0);
12108 Operands[1] = Ops.getVL(1);
12109 }
12110 TE->setOperands(Operands);
12111 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12112 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12113 return;
12114 }
12115 default:
12116 break;
12117 }
12118 llvm_unreachable("Unexpected vectorization of the instructions.");
12119}
12120
12121unsigned BoUpSLP::canMapToVector(Type *T) const {
12122 unsigned N = 1;
12123 Type *EltTy = T;
12124
12125 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12126 if (EltTy->isEmptyTy())
12127 return 0;
12128 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12129 // Check that struct is homogeneous.
12130 for (const auto *Ty : ST->elements())
12131 if (Ty != *ST->element_begin())
12132 return 0;
12133 N *= ST->getNumElements();
12134 EltTy = *ST->element_begin();
12135 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12136 N *= AT->getNumElements();
12137 EltTy = AT->getElementType();
12138 } else {
12139 auto *VT = cast<FixedVectorType>(EltTy);
12140 N *= VT->getNumElements();
12141 EltTy = VT->getElementType();
12142 }
12143 }
12144
12145 if (!isValidElementType(EltTy))
12146 return 0;
12147 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12148 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12149 VTSize != DL->getTypeStoreSizeInBits(T))
12150 return 0;
12151 return N;
12152}
12153
12154bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12155 SmallVectorImpl<unsigned> &CurrentOrder,
12156 bool ResizeAllowed) const {
12157 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12158 assert(It != VL.end() && "Expected at least one extract instruction.");
12159 auto *E0 = cast<Instruction>(*It);
12160 assert(
12162 "Invalid opcode");
12163 // Check if all of the extracts come from the same vector and from the
12164 // correct offset.
12165 Value *Vec = E0->getOperand(0);
12166
12167 CurrentOrder.clear();
12168
12169 // We have to extract from a vector/aggregate with the same number of elements.
12170 unsigned NElts;
12171 if (E0->getOpcode() == Instruction::ExtractValue) {
12172 NElts = canMapToVector(Vec->getType());
12173 if (!NElts)
12174 return false;
12175 // Check if load can be rewritten as load of vector.
12176 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12177 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12178 return false;
12179 } else {
12180 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12181 }
12182
12183 unsigned E = VL.size();
12184 if (!ResizeAllowed && NElts != E)
12185 return false;
12186 SmallVector<int> Indices(E, PoisonMaskElem);
12187 unsigned MinIdx = NElts, MaxIdx = 0;
12188 for (auto [I, V] : enumerate(VL)) {
12189 auto *Inst = dyn_cast<Instruction>(V);
12190 if (!Inst)
12191 continue;
12192 if (Inst->getOperand(0) != Vec)
12193 return false;
12194 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12195 if (isa<UndefValue>(EE->getIndexOperand()))
12196 continue;
12197 std::optional<unsigned> Idx = getExtractIndex(Inst);
12198 if (!Idx)
12199 return false;
12200 const unsigned ExtIdx = *Idx;
12201 if (ExtIdx >= NElts)
12202 continue;
12203 Indices[I] = ExtIdx;
12204 if (MinIdx > ExtIdx)
12205 MinIdx = ExtIdx;
12206 if (MaxIdx < ExtIdx)
12207 MaxIdx = ExtIdx;
12208 }
12209 if (MaxIdx - MinIdx + 1 > E)
12210 return false;
12211 if (MaxIdx + 1 <= E)
12212 MinIdx = 0;
12213
12214 // Check that all of the indices extract from the correct offset.
12215 bool ShouldKeepOrder = true;
12216 // Assign to all items the initial value E so we can check if the extract
12217 // instruction index was used already.
12218 // Also, later we can check that all the indices are used and we have a
12219 // consecutive access in the extract instructions, by checking that no
12220 // element of CurrentOrder still has the value E.
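// Illustrative example: for extract indices <2, 3, 0, 1> (MinIdx == 0),
// CurrentOrder becomes <2, 3, 0, 1> and ShouldKeepOrder is false; for the
// identity indices <0, 1, 2, 3>, ShouldKeepOrder stays true and CurrentOrder
// is cleared.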
12221 CurrentOrder.assign(E, E);
12222 for (unsigned I = 0; I < E; ++I) {
12223 if (Indices[I] == PoisonMaskElem)
12224 continue;
12225 const unsigned ExtIdx = Indices[I] - MinIdx;
12226 if (CurrentOrder[ExtIdx] != E) {
12227 CurrentOrder.clear();
12228 return false;
12229 }
12230 ShouldKeepOrder &= ExtIdx == I;
12231 CurrentOrder[ExtIdx] = I;
12232 }
12233 if (ShouldKeepOrder)
12234 CurrentOrder.clear();
12235
12236 return ShouldKeepOrder;
12237}
12238
12239bool BoUpSLP::areAllUsersVectorized(
12240 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12241 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12242 all_of(I->users(), [this](User *U) {
12243 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12244 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12245 });
12246}
12247
12248void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12249 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12250 SmallVectorImpl<Value *> *OpScalars,
12251 SmallVectorImpl<Value *> *AltScalars) const {
12252 unsigned Sz = Scalars.size();
12253 Mask.assign(Sz, PoisonMaskElem);
12254 SmallVector<int> OrderMask;
12255 if (!ReorderIndices.empty())
12256 inversePermutation(ReorderIndices, OrderMask);
12257 for (unsigned I = 0; I < Sz; ++I) {
12258 unsigned Idx = I;
12259 if (!ReorderIndices.empty())
12260 Idx = OrderMask[I];
12261 if (isa<PoisonValue>(Scalars[Idx]))
12262 continue;
12263 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12264 if (IsAltOp(OpInst)) {
12265 Mask[I] = Sz + Idx;
12266 if (AltScalars)
12267 AltScalars->push_back(OpInst);
12268 } else {
12269 Mask[I] = Idx;
12270 if (OpScalars)
12271 OpScalars->push_back(OpInst);
12272 }
12273 }
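// Illustrative example: for Scalars {add, sub, add, sub} (Sz == 4) with
// IsAltOp matching the subs, the resulting Mask is <0, 5, 2, 7>: main-op
// lanes select from the first vector, alternate-op lanes from the second.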
12274 if (!ReuseShuffleIndices.empty()) {
12275 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12276 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12277 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12278 });
12279 Mask.swap(NewMask);
12280 }
12281}
12282
12284 Instruction *AltOp,
12285 const TargetLibraryInfo &TLI) {
12286 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12287}
12288
12290 Instruction *AltOp,
12291 const TargetLibraryInfo &TLI) {
12292 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12293 auto *AltCI = cast<CmpInst>(AltOp);
12294 CmpInst::Predicate MainP = MainCI->getPredicate();
12295 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12296 assert(MainP != AltP && "Expected different main/alternate predicates.");
12297 auto *CI = cast<CmpInst>(I);
12298 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12299 return false;
12300 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12301 return true;
12302 CmpInst::Predicate P = CI->getPredicate();
12303 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12304
12305 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12306 "CmpInst expected to match either main or alternate predicate or "
12307 "their swap.");
12308 return MainP != P && MainP != SwappedP;
12309 }
12310 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12311}
12312
12313TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12314 assert(!Ops.empty());
12315 const auto *Op0 = Ops.front();
12316
12317 const bool IsConstant = all_of(Ops, [](Value *V) {
12318 // TODO: We should allow undef elements here
12319 return isConstant(V) && !isa<UndefValue>(V);
12320 });
12321 const bool IsUniform = all_of(Ops, [=](Value *V) {
12322 // TODO: We should allow undef elements here
12323 return V == Op0;
12324 });
12325 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12326 // TODO: We should allow undef elements here
12327 if (auto *CI = dyn_cast<ConstantInt>(V))
12328 return CI->getValue().isPowerOf2();
12329 return false;
12330 });
12331 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12332 // TODO: We should allow undef elements here
12333 if (auto *CI = dyn_cast<ConstantInt>(V))
12334 return CI->getValue().isNegatedPowerOf2();
12335 return false;
12336 });
12337
12338 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12339 if (IsConstant && IsUniform)
12340 VK = TTI::OK_UniformConstantValue;
12341 else if (IsConstant)
12342 VK = TTI::OK_NonUniformConstantValue;
12343 else if (IsUniform)
12344 VK = TTI::OK_UniformValue;
12345
12346 TTI::OperandValueProperties VP = TTI::OP_None;
12347 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12348 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12349
12350 return {VK, VP};
12351}
12352
12353namespace {
12354/// The base class for shuffle instruction emission and shuffle cost estimation.
12355class BaseShuffleAnalysis {
12356protected:
12357 Type *ScalarTy = nullptr;
12358
12359 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12360
12361 /// V is expected to be a vectorized value.
12362 /// When REVEC is disabled, there is no difference between VF and
12363 /// VNumElements.
12364 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12365 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12366 /// of 8.
12367 unsigned getVF(Value *V) const {
12368 assert(V && "V cannot be nullptr");
12369 assert(isa<FixedVectorType>(V->getType()) &&
12370 "V does not have FixedVectorType");
12371 assert(ScalarTy && "ScalarTy cannot be nullptr");
12372 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12373 unsigned VNumElements =
12374 cast<FixedVectorType>(V->getType())->getNumElements();
12375 assert(VNumElements > ScalarTyNumElements &&
12376 "the number of elements of V is not large enough");
12377 assert(VNumElements % ScalarTyNumElements == 0 &&
12378 "the number of elements of V is not a vectorized value");
12379 return VNumElements / ScalarTyNumElements;
12380 }
12381
12382 /// Checks if the mask is an identity mask.
12383 /// \param IsStrict if true, the function returns false if the mask size does
12384 /// not match the vector size.
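/// Illustrative example: for a <8 x Ty> vector, the mask <0, 1, 2, 3> is
/// rejected in strict mode (the sizes differ) but accepted in non-strict
/// mode as an extract of the leading subvector starting at index 0.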
12385 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12386 bool IsStrict) {
12387 int Limit = Mask.size();
12388 int VF = VecTy->getNumElements();
12389 int Index = -1;
12390 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12391 return true;
12392 if (!IsStrict) {
12393 // Consider extract subvector starting from index 0.
12394 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12395 Index == 0)
12396 return true;
12397 // All VF-size submasks are identity (e.g.
12398 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12399 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12400 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12401 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12403 }))
12404 return true;
12405 }
12406 return false;
12407 }
12408
12409 /// Tries to combine 2 different masks into single one.
12410 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12411 /// change the size of the vector; \p LocalVF is the original size of the
12412 /// shuffled vector.
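/// Illustrative example: with \p LocalVF 4, Mask <1, 0, 3, 2> and ExtMask
/// <2, poison, 0, 1>, the combined mask stored back into Mask is
/// <3, poison, 1, 0>.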
12413 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12414 ArrayRef<int> ExtMask) {
12415 unsigned VF = Mask.size();
12416 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12417 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12418 if (ExtMask[I] == PoisonMaskElem)
12419 continue;
12420 int MaskedIdx = Mask[ExtMask[I] % VF];
12421 NewMask[I] =
12422 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12423 }
12424 Mask.swap(NewMask);
12425 }
12426
12427 /// Looks through shuffles trying to reduce final number of shuffles in the
12428 /// code. The function looks through the previously emitted shuffle
12429 /// instructions and properly marks indices in the mask as undef.
12430 /// For example, given the code
12431 /// \code
12432 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12433 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12434 /// \endcode
12435 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12436 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12437 /// <0, 1, 2, 3> for the shuffle.
12438 /// If 2 operands are of different size, the smallest one will be resized and
12439 /// the mask recalculated properly.
12440 /// For example, given the code
12441 /// \code
12442 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12443 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12444 /// \endcode
12445 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12446 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12447 /// <0, 1, 2, 3> for the shuffle.
12448 /// So, it tries to transform permutations into a simple vector merge, if
12449 /// possible.
12450 /// \param V The input vector which must be shuffled using the given \p Mask.
12451 /// If the better candidate is found, \p V is set to this best candidate
12452 /// vector.
12453 /// \param Mask The input mask for the shuffle. If the best candidate is found
12454 /// during looking-through-shuffles attempt, it is updated accordingly.
12455 /// \param SinglePermute true if the shuffle operation is originally a
12456 /// single-value-permutation. In this case the look-through-shuffles procedure
12457 /// may look for resizing shuffles as the best candidates.
12458 /// \return true if the shuffle results in the non-resizing identity shuffle
12459 /// (and thus can be ignored), false - otherwise.
12460 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12461 bool SinglePermute) {
12462 Value *Op = V;
12463 ShuffleVectorInst *IdentityOp = nullptr;
12464 SmallVector<int> IdentityMask;
12465 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12466 // Exit if not a fixed vector type or changing size shuffle.
12467 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12468 if (!SVTy)
12469 break;
12470 // Remember the identity or broadcast mask, if it is not a resizing
12471 // shuffle. If no better candidates are found, this Op and Mask will be
12472 // used in the final shuffle.
12473 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12474 if (!IdentityOp || !SinglePermute ||
12475 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12477 IdentityMask.size()))) {
12478 IdentityOp = SV;
12479 // Store the current mask in IdentityMask so that we do not lose
12480 // this info if IdentityOp is selected as the best candidate for the
12481 // permutation.
12482 IdentityMask.assign(Mask);
12483 }
12484 }
12485 // Remember the broadcast mask. If no better candidates are found, this Op
12486 // and Mask will be used in the final shuffle.
12487 // Zero splat can be used as identity too, since it might be used with
12488 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12489 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12490 // expensive, and the analysis finds out that the source vector is just a
12491 // broadcast, this original mask can be transformed to the identity mask
12492 // <0, 1, 2, 3>.
12493 // \code
12494 // %0 = shuffle %v, poison, zeroinitializer
12495 // %res = shuffle %0, poison, <3, 1, 2, 0>
12496 // \endcode
12497 // may be transformed to
12498 // \code
12499 // %0 = shuffle %v, poison, zeroinitializer
12500 // %res = shuffle %0, poison, <0, 1, 2, 3>
12501 // \endcode
12502 if (SV->isZeroEltSplat()) {
12503 IdentityOp = SV;
12504 IdentityMask.assign(Mask);
12505 }
12506 int LocalVF = Mask.size();
12507 if (auto *SVOpTy =
12508 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12509 LocalVF = SVOpTy->getNumElements();
12510 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12511 for (auto [Idx, I] : enumerate(Mask)) {
12512 if (I == PoisonMaskElem ||
12513 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12514 continue;
12515 ExtMask[Idx] = SV->getMaskValue(I);
12516 }
12517 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12518 SV->getOperand(0),
12519 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12520 .all();
12521 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12522 SV->getOperand(1),
12523 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12524 .all();
12525 if (!IsOp1Undef && !IsOp2Undef) {
12526 // Update mask and mark undef elems.
12527 for (int &I : Mask) {
12528 if (I == PoisonMaskElem)
12529 continue;
12530 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12532 I = PoisonMaskElem;
12533 }
12534 break;
12535 }
12536 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12537 combineMasks(LocalVF, ShuffleMask, Mask);
12538 Mask.swap(ShuffleMask);
12539 if (IsOp2Undef)
12540 Op = SV->getOperand(0);
12541 else
12542 Op = SV->getOperand(1);
12543 }
12544 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12545 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12547 if (IdentityOp) {
12548 V = IdentityOp;
12549 assert(Mask.size() == IdentityMask.size() &&
12550 "Expected masks of same sizes.");
12551 // Clear known poison elements.
12552 for (auto [I, Idx] : enumerate(Mask))
12553 if (Idx == PoisonMaskElem)
12554 IdentityMask[I] = PoisonMaskElem;
12555 Mask.swap(IdentityMask);
12556 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12557 return SinglePermute &&
12558 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12559 /*IsStrict=*/true) ||
12560 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12561 Shuffle->isZeroEltSplat() &&
12563 all_of(enumerate(Mask), [&](const auto &P) {
12564 return P.value() == PoisonMaskElem ||
12565 Shuffle->getShuffleMask()[P.index()] == 0;
12566 })));
12567 }
12568 V = Op;
12569 return false;
12570 }
12571 V = Op;
12572 return true;
12573 }
12574
12575 /// Smart shuffle instruction emission, walks through shuffles trees and
12576 /// tries to find the best matching vector for the actual shuffle
12577 /// instruction.
12578 template <typename T, typename ShuffleBuilderTy>
12579 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12580 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12581 assert(V1 && "Expected at least one vector value.");
12582 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12583 SmallVector<int> NewMask(Mask);
12584 if (ScalarTyNumElements != 1) {
12585 assert(SLPReVec && "FixedVectorType is not expected.");
12586 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12587 Mask = NewMask;
12588 }
12589 if (V2)
12590 Builder.resizeToMatch(V1, V2);
12591 int VF = Mask.size();
12592 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12593 VF = FTy->getNumElements();
12595 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12596 .all()) {
12597 // Peek through shuffles.
12598 Value *Op1 = V1;
12599 Value *Op2 = V2;
12600 int VF =
12601 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12602 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12603 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12604 for (int I = 0, E = Mask.size(); I < E; ++I) {
12605 if (Mask[I] < VF)
12606 CombinedMask1[I] = Mask[I];
12607 else
12608 CombinedMask2[I] = Mask[I] - VF;
12609 }
12610 Value *PrevOp1;
12611 Value *PrevOp2;
12612 do {
12613 PrevOp1 = Op1;
12614 PrevOp2 = Op2;
12615 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12616 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12617 // Check if we have 2 resizing shuffles - need to peek through operands
12618 // again.
12619 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12620 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12621 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12622 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12623 if (I == PoisonMaskElem)
12624 continue;
12625 ExtMask1[Idx] = SV1->getMaskValue(I);
12626 }
12627 SmallBitVector UseMask1 = buildUseMask(
12628 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12629 ->getNumElements(),
12630 ExtMask1, UseMask::SecondArg);
12631 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12632 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12633 if (I == PoisonMaskElem)
12634 continue;
12635 ExtMask2[Idx] = SV2->getMaskValue(I);
12636 }
12637 SmallBitVector UseMask2 = buildUseMask(
12638 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12639 ->getNumElements(),
12640 ExtMask2, UseMask::SecondArg);
12641 if (SV1->getOperand(0)->getType() ==
12642 SV2->getOperand(0)->getType() &&
12643 SV1->getOperand(0)->getType() != SV1->getType() &&
12644 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12645 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12646 Op1 = SV1->getOperand(0);
12647 Op2 = SV2->getOperand(0);
12648 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12649 int LocalVF = ShuffleMask1.size();
12650 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12651 LocalVF = FTy->getNumElements();
12652 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12653 CombinedMask1.swap(ShuffleMask1);
12654 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12655 LocalVF = ShuffleMask2.size();
12656 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12657 LocalVF = FTy->getNumElements();
12658 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12659 CombinedMask2.swap(ShuffleMask2);
12660 }
12661 }
12662 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12663 Builder.resizeToMatch(Op1, Op2);
12664 VF = std::max(cast<VectorType>(Op1->getType())
12665 ->getElementCount()
12666 .getKnownMinValue(),
12667 cast<VectorType>(Op2->getType())
12668 ->getElementCount()
12669 .getKnownMinValue());
12670 for (int I = 0, E = Mask.size(); I < E; ++I) {
12671 if (CombinedMask2[I] != PoisonMaskElem) {
12672 assert(CombinedMask1[I] == PoisonMaskElem &&
12673 "Expected undefined mask element");
12674 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12675 }
12676 }
12677 if (Op1 == Op2 &&
12678 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12679 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12681 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12682 ArrayRef(CombinedMask1))))
12683 return Builder.createIdentity(Op1);
12684 return Builder.createShuffleVector(
12685 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12686 CombinedMask1);
12687 }
12688 if (isa<PoisonValue>(V1))
12689 return Builder.createPoison(
12690 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12691 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12692 assert(V1 && "Expected non-null value after looking through shuffles.");
12693
12694 if (!IsIdentity)
12695 return Builder.createShuffleVector(V1, NewMask);
12696 return Builder.createIdentity(V1);
12697 }
12698
12699 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12700 /// shuffle emission.
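/// Illustrative example: for a CommonMask of size 4 and Mask
/// <2, poison, 0, poison>, lanes 0 and 2 of CommonMask are rewritten to
/// their own indices, while lanes 1 and 3 keep their previous values.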
12701 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12702 ArrayRef<int> Mask) {
12703 for (unsigned I : seq<unsigned>(CommonMask.size()))
12704 if (Mask[I] != PoisonMaskElem)
12705 CommonMask[I] = I;
12706 }
12707};
12708} // namespace
12709
12710 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
12711static std::pair<InstructionCost, InstructionCost>
12713 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12714 Type *ScalarTy, VectorType *VecTy) {
12715 InstructionCost ScalarCost = 0;
12716 InstructionCost VecCost = 0;
12717 // Here we differentiate two cases: (1) when Ptrs represent a regular
12718 // vectorization tree node (as they are pointer arguments of scattered
12719 // loads) or (2) when Ptrs are the arguments of loads or stores being
12720 // vectorized as a plain wide unit-stride load/store since all the
12721 // loads/stores are known to be from/to adjacent locations.
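// Illustrative example: four consecutive stores addressed through GEPs off a
// single base pointer fall into case (2) - the vector code keeps only the
// base pointer, so the removed scalar GEPs show up as the difference between
// the scalar and vector pointer-chain costs.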
12722 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12723 // Case 2: estimate costs for pointer related costs when vectorizing to
12724 // a wide load/store.
12725 // Scalar cost is estimated as a set of pointers with known relationship
12726 // between them.
12727 // For vector code we will use BasePtr as argument for the wide load/store
12728 // but we also need to account for all the instructions which are going to
12729 // stay in vectorized code due to uses outside of these scalar
12730 // loads/stores.
12731 ScalarCost = TTI.getPointersChainCost(
12732 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12733 CostKind);
12734
12735 SmallVector<const Value *> PtrsRetainedInVecCode;
12736 for (Value *V : Ptrs) {
12737 if (V == BasePtr) {
12738 PtrsRetainedInVecCode.push_back(V);
12739 continue;
12740 }
12741 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12742 // For simplicity assume Ptr to stay in vectorized code if it's not a
12743 // GEP instruction. We don't care since its cost is considered free.
12744 // TODO: We should check for any uses outside of vectorizable tree
12745 // rather than just single use.
12746 if (!Ptr || !Ptr->hasOneUse())
12747 PtrsRetainedInVecCode.push_back(V);
12748 }
12749
12750 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12751 // If all pointers stay in vectorized code then we don't have
12752 // any savings on that.
12753 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12754 }
12755 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12756 TTI::PointersChainInfo::getKnownStride(),
12757 VecTy, CostKind);
12758 } else {
12759 // Case 1: Ptrs are the arguments of loads that we are going to transform
12760 // into masked gather load intrinsic.
12761 // All the scalar GEPs will be removed as a result of vectorization.
12762 // For any external uses of some lanes, extractelement instructions will
12763 // be generated (their cost is estimated separately).
12764 TTI::PointersChainInfo PtrsInfo =
12765 all_of(Ptrs,
12766 [](const Value *V) {
12767 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12768 return Ptr && !Ptr->hasAllConstantIndices();
12769 })
12770 ? TTI::PointersChainInfo::getUnknownStride()
12771 : TTI::PointersChainInfo::getKnownStride();
12772
12773 ScalarCost =
12774 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12775 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12776 if (!BaseGEP) {
12777 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12778 if (It != Ptrs.end())
12779 BaseGEP = cast<GEPOperator>(*It);
12780 }
12781 if (BaseGEP) {
12782 SmallVector<const Value *> Indices(BaseGEP->indices());
12783 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12784 BaseGEP->getPointerOperand(), Indices, VecTy,
12785 CostKind);
12786 }
12787 }
12788
12789 return std::make_pair(ScalarCost, VecCost);
12790}
12791
12792void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12793 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12794 "Expected gather node without reordering.");
12795 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12796 SmallSet<size_t, 2> LoadKeyUsed;
12797
12798 // Do not reorder nodes that are small (just 2 elements), all-constant, or
12799 // whose instructions all have the same opcode already.
12800 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12801 all_of(TE.Scalars, isConstant))
12802 return;
12803
12804 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12805 return VectorizableTree[Idx]->isSame(TE.Scalars);
12806 }))
12807 return;
12808
12809 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12810 Key = hash_combine(hash_value(LI->getParent()), Key);
12811 Value *Ptr =
12812 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12813 if (LoadKeyUsed.contains(Key)) {
12814 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12815 if (LIt != LoadsMap.end()) {
12816 for (LoadInst *RLI : LIt->second) {
12817 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12818 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12819 /*StrictCheck=*/true))
12820 return hash_value(RLI->getPointerOperand());
12821 }
12822 for (LoadInst *RLI : LIt->second) {
12824 LI->getPointerOperand(), *TLI)) {
12825 hash_code SubKey = hash_value(RLI->getPointerOperand());
12826 return SubKey;
12827 }
12828 }
12829 if (LIt->second.size() > 2) {
12830 hash_code SubKey =
12831 hash_value(LIt->second.back()->getPointerOperand());
12832 return SubKey;
12833 }
12834 }
12835 }
12836 LoadKeyUsed.insert(Key);
12837 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12838 return hash_value(LI->getPointerOperand());
12839 };
12840 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12841 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12842 bool IsOrdered = true;
12843 unsigned NumInstructions = 0;
12844 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12845 // nodes.
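// Illustrative example: a gather of {load A, add X, load B, add Y} may be
// regrouped as {load A, load B, add X, add Y} so that each cluster can later
// form its own vectorizable sub-node.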
12846 for (auto [I, V] : enumerate(TE.Scalars)) {
12847 size_t Key = 1, Idx = 1;
12848 if (auto *Inst = dyn_cast<Instruction>(V);
12850 !isDeleted(Inst) && !isVectorized(V)) {
12851 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12852 /*AllowAlternate=*/false);
12853 ++NumInstructions;
12854 }
12855 auto &Container = SortedValues[Key];
12856 if (IsOrdered && !KeyToIndex.contains(V) &&
12859 ((Container.contains(Idx) &&
12860 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12861 (!Container.empty() && !Container.contains(Idx) &&
12862 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12863 IsOrdered = false;
12864 auto &KTI = KeyToIndex[V];
12865 if (KTI.empty())
12866 Container[Idx].push_back(V);
12867 KTI.push_back(I);
12868 }
12870 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12871 if (!IsOrdered && NumInstructions > 1) {
12872 unsigned Cnt = 0;
12873 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12874 for (const auto &D : SortedValues) {
12875 for (const auto &P : D.second) {
12876 unsigned Sz = 0;
12877 for (Value *V : P.second) {
12878 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12879 for (auto [K, Idx] : enumerate(Indices)) {
12880 TE.ReorderIndices[Cnt + K] = Idx;
12881 TE.Scalars[Cnt + K] = V;
12882 }
12883 Sz += Indices.size();
12884 Cnt += Indices.size();
12885 }
12886 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12887 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12888 *TTI, TE.Scalars.front()->getType(), Sz);
12889 SubVectors.emplace_back(Cnt - Sz, SubVF);
12890 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12891 DemandedElts.clearBit(I);
12892 } else if (!P.second.empty() && isConstant(P.second.front())) {
12893 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12894 DemandedElts.clearBit(I);
12895 }
12896 }
12897 }
12898 }
12899 // Reuses always require shuffles, so consider it as profitable.
12900 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12901 return;
12902 // Do simple cost estimation.
12903 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12904 InstructionCost Cost = 0;
12905 auto *ScalarTy = TE.Scalars.front()->getType();
12906 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12907 for (auto [Idx, Sz] : SubVectors) {
12909 Idx, getWidenedType(ScalarTy, Sz));
12910 }
12911 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12912 /*Insert=*/true,
12913 /*Extract=*/false, CostKind);
12914 int Sz = TE.Scalars.size();
12915 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12916 TE.ReorderIndices.end());
12917 for (unsigned I : seq<unsigned>(Sz)) {
12918 Value *V = TE.getOrdered(I);
12919 if (isa<PoisonValue>(V)) {
12920 ReorderMask[I] = PoisonMaskElem;
12921 } else if (isConstant(V) || DemandedElts[I]) {
12922 ReorderMask[I] = I + TE.ReorderIndices.size();
12923 }
12924 }
12925 Cost += ::getShuffleCost(*TTI,
12926 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12927 ? TTI::SK_PermuteTwoSrc
12928 : TTI::SK_PermuteSingleSrc,
12929 VecTy, ReorderMask);
12930 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12931 ReorderMask.assign(Sz, PoisonMaskElem);
12932 for (unsigned I : seq<unsigned>(Sz)) {
12933 Value *V = TE.getOrdered(I);
12934 if (isConstant(V)) {
12935 DemandedElts.clearBit(I);
12936 if (!isa<PoisonValue>(V))
12937 ReorderMask[I] = I;
12938 } else {
12939 ReorderMask[I] = I + Sz;
12940 }
12941 }
12942 InstructionCost BVCost =
12943 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12944 /*Insert=*/true, /*Extract=*/false, CostKind);
12945 if (!DemandedElts.isAllOnes())
12946 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12947 if (Cost >= BVCost) {
12948 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12949 reorderScalars(TE.Scalars, Mask);
12950 TE.ReorderIndices.clear();
12951 }
12952}
12953
12954 /// Check if we can convert a fadd/fsub sequence to FMA.
12955 /// \returns Cost of the FMA, if conversion is possible, invalid cost otherwise.
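/// Illustrative example: a lane "%m = fmul contract float %a, %b" feeding
/// "%r = fadd contract float %m, %c" can be costed as a single
/// llvm.fmuladd(%a, %b, %c); the conversion is reported only when that
/// intrinsic is cheaper than the separate fmul + fadd.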
12957 const InstructionsState &S,
12958 DominatorTree &DT, const DataLayout &DL,
12960 const TargetLibraryInfo &TLI) {
12961 assert(all_of(VL,
12962 [](Value *V) {
12963 return V->getType()->getScalarType()->isFloatingPointTy();
12964 }) &&
12965 "Can only convert to FMA for floating point types");
12966 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12967
12968 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12969 FastMathFlags FMF;
12970 FMF.set();
12971 for (Value *V : VL) {
12972 auto *I = dyn_cast<Instruction>(V);
12973 if (!I)
12974 continue;
12975 if (S.isCopyableElement(I))
12976 continue;
12977 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12978 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12979 continue;
12980 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12981 FMF &= FPCI->getFastMathFlags();
12982 }
12983 return FMF.allowContract();
12984 };
12985 if (!CheckForContractable(VL))
12986 return InstructionCost::getInvalid();
12987 // The fmul operand should also be contractable.
12988 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12989 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12990
12991 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12992 if (!OpS.valid())
12993 return InstructionCost::getInvalid();
12994
12995 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12996 return InstructionCost::getInvalid();
12997 if (!CheckForContractable(Operands.front()))
12998 return InstructionCost::getInvalid();
12999 // Compare the costs.
13000 InstructionCost FMulPlusFAddCost = 0;
13001 InstructionCost FMACost = 0;
13003 FastMathFlags FMF;
13004 FMF.set();
13005 for (Value *V : VL) {
13006 auto *I = dyn_cast<Instruction>(V);
13007 if (!I)
13008 continue;
13009 if (!S.isCopyableElement(I))
13010 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13011 FMF &= FPCI->getFastMathFlags();
13012 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13013 }
13014 unsigned NumOps = 0;
13015 for (auto [V, Op] : zip(VL, Operands.front())) {
13016 if (S.isCopyableElement(V))
13017 continue;
13018 auto *I = dyn_cast<Instruction>(Op);
13019 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13020 if (auto *OpI = dyn_cast<Instruction>(V))
13021 FMACost += TTI.getInstructionCost(OpI, CostKind);
13022 if (I)
13023 FMACost += TTI.getInstructionCost(I, CostKind);
13024 continue;
13025 }
13026 ++NumOps;
13027 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13028 FMF &= FPCI->getFastMathFlags();
13029 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13030 }
13031 Type *Ty = VL.front()->getType();
13032 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13033 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13034 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13035}
13036
13037
13038 void BoUpSLP::transformNodes() {
13039 BaseGraphSize = VectorizableTree.size();
13040 // Turn graph transforming mode on and off, when done.
13041 class GraphTransformModeRAAI {
13042 bool &SavedIsGraphTransformMode;
13043
13044 public:
13045 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13046 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13047 IsGraphTransformMode = true;
13048 }
13049 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13050 } TransformContext(IsGraphTransformMode);
13051 // Operands are profitable if:
13052 // 1. At least one of them is a constant,
13053 // or
13054 // 2. They are splats,
13055 // or
13056 // 3. They result in a good vectorization opportunity, i.e. they may generate
13057 // vector nodes and reduce the cost of the graph.
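// Illustrative example: for "add %x, 5" and "add %x, 7" the operand pairs
// are (%x, %x) and (5, 7) - one pair is equal, the other all-constant - so
// the operands are considered profitable without the recursive check.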
13058 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13059 const InstructionsState &S) {
13061 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13062 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13063 I2->getOperand(Op));
13064 return all_of(
13065 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13066 return all_of(Cand,
13067 [](const std::pair<Value *, Value *> &P) {
13068 return isa<Constant>(P.first) ||
13069 isa<Constant>(P.second) || P.first == P.second;
13070 }) ||
13072 });
13073 };
13074
13075 // Try to reorder gather nodes for better vectorization opportunities.
13076 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13077 TreeEntry &E = *VectorizableTree[Idx];
13078 if (E.isGather())
13079 reorderGatherNode(E);
13080 }
13081
13082 // Better to use the full gathered-loads analysis, if there are only 2
13083 // gathered load nodes, each having fewer than 16 elements.
13084 constexpr unsigned VFLimit = 16;
13085 bool ForceLoadGather =
13086 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13087 return TE->isGather() && TE->hasState() &&
13088 TE->getOpcode() == Instruction::Load &&
13089 TE->getVectorFactor() < VFLimit;
13090 }) == 2;
13091
13092 // Checks if the scalars are used in another node.
13093 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13094 function_ref<bool(Value *)> CheckContainer) {
13095 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13096 if (isa<PoisonValue>(V))
13097 return true;
13098 auto *I = dyn_cast<Instruction>(V);
13099 if (!I)
13100 return false;
13101 return is_contained(TE->Scalars, I) || CheckContainer(I);
13102 });
13103 };
13104 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13105 if (E.hasState()) {
13106 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13107 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13108 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13109 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13110 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13111 return is_contained(TEs, TE);
13112 });
13113 });
13114 }))
13115 return true;
13116
13117 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13118 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13119 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13120 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13121 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13122 return is_contained(TEs, TE);
13123 });
13124 });
13125 }))
13126 return true;
13127 } else {
13128 // Check if the gather node is a full copy of a split node.
13129 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13130 if (It != E.Scalars.end()) {
13131 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13132 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13133 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13134 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13135 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13136 return is_contained(TEs, TE);
13137 });
13138 });
13139 }))
13140 return true;
13141 }
13142 }
13143 return false;
13144 };
13145 // The tree may grow here, so iterate over nodes, built before.
13146 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13147 TreeEntry &E = *VectorizableTree[Idx];
13148 if (E.isGather()) {
13149 ArrayRef<Value *> VL = E.Scalars;
13150 const unsigned Sz = getVectorElementSize(VL.front());
13151 unsigned MinVF = getMinVF(2 * Sz);
13152 // Do not try partial vectorization for small nodes (<= 2 elements), nodes
13153 // with the same opcode and same parent block, or all-constant nodes.
13154 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13155 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13156 // We use allSameOpcode instead of isAltShuffle because we don't
13157 // want to use interchangeable instructions here.
13158 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13159 allConstant(VL) || isSplat(VL))
13160 continue;
13161 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13162 continue;
13163 // Check if the node is a copy of other vector nodes.
13164 if (CheckForSameVectorNodes(E))
13165 continue;
13166 // Try to find vectorizable sequences and transform them into a series of
13167 // insertvector instructions.
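// Illustrative example: a 6-element gather may be split into a 4-element
// slice that vectorizes on its own plus a 2-element remainder; the
// vectorized slice is later reinserted into the wide vector of this gather
// node.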
13168 unsigned StartIdx = 0;
13169 unsigned End = VL.size();
13170 for (unsigned VF = getFloorFullVectorNumberOfElements(
13171 *TTI, VL.front()->getType(), VL.size() - 1);
13172 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13173 *TTI, VL.front()->getType(), VF - 1)) {
13174 if (StartIdx + VF > End)
13175 continue;
13176 SmallVector<std::pair<unsigned, unsigned>> Slices;
13177 bool AllStrided = true;
13178 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13179 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13180 // If any instruction is vectorized already - do not try again.
13181 // Reuse the existing node, if it fully matches the slice.
13182 if (isVectorized(Slice.front()) &&
13183 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13184 continue;
13185 // Constants are already handled effectively - skip.
13186 if (allConstant(Slice))
13187 continue;
13188 // Do not try to vectorize small splats (smaller than a vector register
13189 // and with only a single non-undef element).
13190 bool IsSplat = isSplat(Slice);
13191 bool IsTwoRegisterSplat = true;
13192 if (IsSplat && VF == 2) {
13193 unsigned NumRegs2VF = ::getNumberOfParts(
13194 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13195 IsTwoRegisterSplat = NumRegs2VF == 2;
13196 }
13197 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13198 count(Slice, Slice.front()) ==
13199 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13200 : 1)) {
13201 if (IsSplat)
13202 continue;
13203 InstructionsState S = getSameOpcode(Slice, *TLI);
13204 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13205 (S.getOpcode() == Instruction::Load &&
13207 (S.getOpcode() != Instruction::Load &&
13208 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13209 continue;
13210 if (VF == 2) {
13211 // Try to vectorize reduced values, or when all users are vectorized.
13212 // For expensive instructions, extra extracts might be profitable.
13213 if ((!UserIgnoreList || E.Idx != 0) &&
13214 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13216 !all_of(Slice, [&](Value *V) {
13217 if (isa<PoisonValue>(V))
13218 return true;
13219 return areAllUsersVectorized(cast<Instruction>(V),
13220 UserIgnoreList);
13221 }))
13222 continue;
13223 if (S.getOpcode() == Instruction::Load) {
13224 OrdersType Order;
13225 SmallVector<Value *> PointerOps;
13226 StridedPtrInfo SPtrInfo;
13227 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13228 PointerOps, SPtrInfo);
13229 AllStrided &= Res == LoadsState::StridedVectorize ||
13230 Res == LoadsState::ScatterVectorize ||
13231 Res == LoadsState::Gather;
13232 // Do not vectorize gathers.
13233 if (Res == LoadsState::ScatterVectorize ||
13234 Res == LoadsState::Gather) {
13235 if (Res == LoadsState::Gather) {
13237 // If we are vectorizing reductions and the scalars come from the
13238 // root node - mark them as a non-vectorizable reduction.
13239 if (UserIgnoreList && E.Idx == 0)
13240 analyzedReductionVals(Slice);
13241 }
13242 continue;
13243 }
13244 } else if (S.getOpcode() == Instruction::ExtractElement ||
13245 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13247 !CheckOperandsProfitability(
13248 S.getMainOp(),
13251 S))) {
13252 // Do not vectorize extractelements (handled effectively
13253 // already). Do not vectorize non-profitable instructions (with
13254 // low cost and non-vectorizable operands).
13255 continue;
13256 }
13257 }
13258 }
13259 Slices.emplace_back(Cnt, Slice.size());
13260 }
13261 // Do not try to vectorize if all slices are strided or gathered with
13262 // vector factor 2 and there are more than 2 slices. Better to handle them
13263 // in the gathered-loads analysis, which may result in better vectorization.
13264 if (VF == 2 && AllStrided && Slices.size() > 2)
13265 continue;
13266 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13267 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13268 if (StartIdx == Cnt)
13269 StartIdx = Cnt + Sz;
13270 if (End == Cnt + Sz)
13271 End = Cnt;
13272 };
13273 for (auto [Cnt, Sz] : Slices) {
13274 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13275 const TreeEntry *SameTE = nullptr;
13276 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13277 It != Slice.end()) {
13278 // If any instruction is vectorized already - do not try again.
13279 SameTE = getSameValuesTreeEntry(*It, Slice);
13280 }
13281 unsigned PrevSize = VectorizableTree.size();
13282 [[maybe_unused]] unsigned PrevEntriesSize =
13283 LoadEntriesToVectorize.size();
13284 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13285 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13286 VectorizableTree[PrevSize]->isGather() &&
13287 VectorizableTree[PrevSize]->hasState() &&
13288 VectorizableTree[PrevSize]->getOpcode() !=
13289 Instruction::ExtractElement &&
13290 !isSplat(Slice)) {
13291 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13292 analyzedReductionVals(Slice);
13293 VectorizableTree.pop_back();
13294 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13295 "LoadEntriesToVectorize expected to remain the same");
13296 continue;
13297 }
13298 AddCombinedNode(PrevSize, Cnt, Sz);
13299 }
13300 }
13301 // Restore ordering, if no extra vectorization happened.
13302 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13303 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13304 reorderScalars(E.Scalars, Mask);
13305 E.ReorderIndices.clear();
13306 }
13307 }
13308 if (!E.hasState())
13309 continue;
13310 switch (E.getOpcode()) {
13311 case Instruction::Load: {
13312 // No need to reorder masked gather loads, just reorder the scalar
13313 // operands.
13314 if (E.State != TreeEntry::Vectorize)
13315 break;
13316 Type *ScalarTy = E.getMainOp()->getType();
13317 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13318 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13319 // Check if profitable to represent consecutive load + reverse as strided
13320 // load with stride -1.
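// Illustrative example: loads of a[3], a[2], a[1], a[0] form a consecutive
// load plus a reverse shuffle; if the target's strided load is cheaper than
// that pair, the node is switched to TreeEntry::StridedVectorize below.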
13321 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13322 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13323 SmallVector<int> Mask;
13324 inversePermutation(E.ReorderIndices, Mask);
13325 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13326 InstructionCost OriginalVecCost =
13327 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13328 BaseLI->getPointerAddressSpace(), CostKind,
13330 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13331 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13332 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13333 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13334 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13335 // Strided load is more profitable than consecutive load + reverse -
13336 // transform the node to strided load.
13337 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13338 ->getPointerOperand()
13339 ->getType());
13340 StridedPtrInfo SPtrInfo;
13341 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13342 SPtrInfo.Ty = VecTy;
13343 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13344 E.State = TreeEntry::StridedVectorize;
13345 }
13346 }
13347 break;
13348 }
13349 case Instruction::Store: {
13350 Type *ScalarTy =
13351 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13352 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13353 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13354 // Check if profitable to represent consecutive store + reverse as strided
13355 // store with stride -1.
13356 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13357 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13358 SmallVector<int> Mask;
13359 inversePermutation(E.ReorderIndices, Mask);
13360 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13361 InstructionCost OriginalVecCost =
13362 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13363 BaseSI->getPointerAddressSpace(), CostKind,
13365 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13366 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13367 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13368 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13369 if (StridedCost < OriginalVecCost)
13370 // Strided store is more profitable than reverse + consecutive store -
13371 // transform the node to strided store.
13372 E.State = TreeEntry::StridedVectorize;
13373 } else if (!E.ReorderIndices.empty()) {
13374 // Check for interleaved stores.
13375 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13376 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13377 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13378 if (Mask.size() < 4)
13379 return 0u;
13380 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13381 if (ShuffleVectorInst::isInterleaveMask(
13382 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13383 TTI.isLegalInterleavedAccessType(
13384 VecTy, Factor, BaseSI->getAlign(),
13385 BaseSI->getPointerAddressSpace()))
13386 return Factor;
13387 }
13388
13389 return 0u;
13390 };
13391 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13392 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13393 if (InterleaveFactor != 0)
13394 E.setInterleave(InterleaveFactor);
13395 }
13396 break;
13397 }
13398 case Instruction::Select: {
13399 if (E.State != TreeEntry::Vectorize)
13400 break;
13401 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13402 if (MinMaxID == Intrinsic::not_intrinsic)
13403 break;
13404 // This node is a minmax node.
13405 E.CombinedOp = TreeEntry::MinMax;
13406 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13407 if (SelectOnly && CondEntry->UserTreeIndex &&
13408 CondEntry->State == TreeEntry::Vectorize) {
13409 // The condition node is part of the combined minmax node.
13410 CondEntry->State = TreeEntry::CombinedVectorize;
13411 }
13412 break;
13413 }
13414 case Instruction::FSub:
13415 case Instruction::FAdd: {
13416 // Check if possible to convert (a*b)+c to fma.
13417 if (E.State != TreeEntry::Vectorize ||
13418 !E.getOperations().isAddSubLikeOp())
13419 break;
13420 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13421 .isValid())
13422 break;
13423 // This node is a fmuladd node.
13424 E.CombinedOp = TreeEntry::FMulAdd;
13425 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13426 if (FMulEntry->UserTreeIndex &&
13427 FMulEntry->State == TreeEntry::Vectorize) {
13428 // The FMul node is part of the combined fmuladd node.
13429 FMulEntry->State = TreeEntry::CombinedVectorize;
13430 }
13431 break;
13432 }
13433 default:
13434 break;
13435 }
13436 }
13437
13438 if (LoadEntriesToVectorize.empty()) {
13439 // Single load node - exit.
13440 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13441 VectorizableTree.front()->getOpcode() == Instruction::Load)
13442 return;
13443 // Small graph with small VF - exit.
13444 constexpr unsigned SmallTree = 3;
13445 constexpr unsigned SmallVF = 2;
13446 if ((VectorizableTree.size() <= SmallTree &&
13447 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13448 (VectorizableTree.size() <= 2 && UserIgnoreList))
13449 return;
13450
13451 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13452 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13453 getCanonicalGraphSize() <= SmallTree &&
13454 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13455 [](const std::unique_ptr<TreeEntry> &TE) {
13456 return TE->isGather() && TE->hasState() &&
13457 TE->getOpcode() == Instruction::Load &&
13458 !allSameBlock(TE->Scalars);
13459 }) == 1)
13460 return;
13461 }
13462
13463 // A list of loads to be gathered during the vectorization process. We can
13464 // try to vectorize them at the end, if profitable.
13465 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13467 GatheredLoads;
13468
13469 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13470 TreeEntry &E = *TE;
13471 if (E.isGather() &&
13472 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13473 (!E.hasState() && any_of(E.Scalars,
13474 [&](Value *V) {
13475 return isa<LoadInst>(V) &&
13476 !isVectorized(V) &&
13477 !isDeleted(cast<Instruction>(V));
13478 }))) &&
13479 !isSplat(E.Scalars)) {
13480 for (Value *V : E.Scalars) {
13481 auto *LI = dyn_cast<LoadInst>(V);
13482 if (!LI)
13483 continue;
13484 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13485 continue;
13487 *this, V, *DL, *SE, *TTI,
13488 GatheredLoads[std::make_tuple(
13489 LI->getParent(),
13490 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13491 LI->getType())]);
13492 }
13493 }
13494 }
13495 // Try to vectorize gathered loads if this is not just a gather of loads.
13496 if (!GatheredLoads.empty())
13497 tryToVectorizeGatheredLoads(GatheredLoads);
13498}
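// Editorial note: the gathered loads collected above are keyed by the triple
// (parent basic block, underlying pointer object, loaded type), so, for
// example, simple loads of the same element type from the same array in one
// block land in the same bucket and can be retried later as a single
// vectorizable group by tryToVectorizeGatheredLoads.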
13499
13500 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13501 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
13502 /// emission: the actual shuffle instruction is generated only if it is really
13503 /// required. Otherwise, the shuffle instruction emission is delayed until the
13504 /// end of the process, to reduce the number of emitted instructions and the
13505 /// amount of further analysis/transformations.
13506class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13507 bool IsFinalized = false;
13508 SmallVector<int> CommonMask;
13509 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13510 const TargetTransformInfo &TTI;
13511 InstructionCost Cost = 0;
13512 SmallDenseSet<Value *> VectorizedVals;
13513 BoUpSLP &R;
13514 SmallPtrSetImpl<Value *> &CheckedExtracts;
13515 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13516 /// While set, still trying to estimate the cost for the same nodes and we
13517 /// can delay actual cost estimation (virtual shuffle instruction emission).
13518 /// May help better estimate the cost if same nodes must be permuted + allows
13519 /// to move most of the long shuffles cost estimation to TTI.
13520 bool SameNodesEstimated = true;
13521
13522 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13523 if (Ty->getScalarType()->isPointerTy()) {
13524 Constant *Res = ConstantExpr::getIntToPtr(
13525 Constant::getAllOnesValue(
13526 IntegerType::get(Ty->getContext(),
13527 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13528 Ty->getScalarType());
13529 if (auto *VTy = dyn_cast<VectorType>(Ty))
13530 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13531 return Res;
13532 }
13533 return Constant::getAllOnesValue(Ty);
13534 }
13535
13536 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13537 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13538 return TTI::TCC_Free;
13539 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13540 InstructionCost GatherCost = 0;
13541 SmallVector<Value *> Gathers(VL);
13542 if (!Root && isSplat(VL)) {
13543 // Found the broadcasting of the single scalar, calculate the cost as
13544 // the broadcast.
13545 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13546 assert(It != VL.end() && "Expected at least one non-undef value.");
13547 // Add broadcast for non-identity shuffle only.
13548 bool NeedShuffle =
13549 count(VL, *It) > 1 &&
13550 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13551 if (!NeedShuffle) {
13552 if (isa<FixedVectorType>(ScalarTy)) {
13553 assert(SLPReVec && "FixedVectorType is not expected.");
13554 return TTI.getShuffleCost(
13555 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13556 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13557 cast<FixedVectorType>(ScalarTy));
13558 }
13559 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13560 CostKind, std::distance(VL.begin(), It),
13561 PoisonValue::get(VecTy), *It);
13562 }
13563
13564 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13565 transform(VL, ShuffleMask.begin(), [](Value *V) {
13566 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13567 });
13568 InstructionCost InsertCost =
13569 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13570 PoisonValue::get(VecTy), *It);
13571 return InsertCost + ::getShuffleCost(TTI,
13572 TTI::SK_Broadcast,
13573 VecTy, ShuffleMask, CostKind,
13574 /*Index=*/0, /*SubTp=*/nullptr,
13575 /*Args=*/*It);
13576 }
13577 return GatherCost +
13578 (all_of(Gathers, IsaPred<UndefValue>)
13579 ? TTI::TCC_Free
13580 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13581 ScalarTy));
13582 };
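// Worked example (editorial): for a 4-wide build vector VL = {%x, %x, %x, %x}
// the splat above is costed as one insertelement into lane 0 plus a
// broadcast-style shuffle with mask <0, 0, 0, 0>; if only one lane is defined
// and the rest are undef/poison, the shuffle is skipped and only the single
// insertelement (or a subvector insert under REVEC) is charged.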
13583
13584 /// Compute the cost of creating a vector containing the extracted values from
13585 /// \p VL.
13586 InstructionCost
13587 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13588 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13589 unsigned NumParts) {
13590 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13591 unsigned NumElts =
13592 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13593 auto *EE = dyn_cast<ExtractElementInst>(V);
13594 if (!EE)
13595 return Sz;
13596 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13597 if (!VecTy)
13598 return Sz;
13599 return std::max(Sz, VecTy->getNumElements());
13600 });
13601 // FIXME: this must be moved to TTI for better estimation.
13602 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13603 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13604 SmallVectorImpl<unsigned> &Indices,
13605 SmallVectorImpl<unsigned> &SubVecSizes)
13606 -> std::optional<TTI::ShuffleKind> {
13607 if (NumElts <= EltsPerVector)
13608 return std::nullopt;
13609 int OffsetReg0 =
13610 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13611 [](int S, int I) {
13612 if (I == PoisonMaskElem)
13613 return S;
13614 return std::min(S, I);
13615 }),
13616 EltsPerVector);
13617 int OffsetReg1 = OffsetReg0;
13618 DenseSet<int> RegIndices;
13619 // Check if we are trying to permute the same single/two input vectors.
13620 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13621 int FirstRegId = -1;
13622 Indices.assign(1, OffsetReg0);
13623 for (auto [Pos, I] : enumerate(Mask)) {
13624 if (I == PoisonMaskElem)
13625 continue;
13626 int Idx = I - OffsetReg0;
13627 int RegId =
13628 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13629 if (FirstRegId < 0)
13630 FirstRegId = RegId;
13631 RegIndices.insert(RegId);
13632 if (RegIndices.size() > 2)
13633 return std::nullopt;
13634 if (RegIndices.size() == 2) {
13635 ShuffleKind = TTI::SK_PermuteTwoSrc;
13636 if (Indices.size() == 1) {
13637 OffsetReg1 = alignDown(
13638 std::accumulate(
13639 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13640 [&](int S, int I) {
13641 if (I == PoisonMaskElem)
13642 return S;
13643 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13644 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13645 if (RegId == FirstRegId)
13646 return S;
13647 return std::min(S, I);
13648 }),
13649 EltsPerVector);
13650 unsigned Index = OffsetReg1 % NumElts;
13651 Indices.push_back(Index);
13652 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13653 }
13654 Idx = I - OffsetReg1;
13655 }
13656 I = (Idx % NumElts) % EltsPerVector +
13657 (RegId == FirstRegId ? 0 : EltsPerVector);
13658 }
13659 return ShuffleKind;
13660 };
13661 InstructionCost Cost = 0;
13662
13663 // Process extracts in blocks of EltsPerVector to check if the source vector
13664 // operand can be re-used directly. If not, add the cost of creating a
13665 // shuffle to extract the values into a vector register.
13666 for (unsigned Part : seq<unsigned>(NumParts)) {
13667 if (!ShuffleKinds[Part])
13668 continue;
13669 ArrayRef<int> MaskSlice = Mask.slice(
13670 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13671 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13672 copy(MaskSlice, SubMask.begin());
13673 SmallVector<unsigned, 2> Indices;
13674 SmallVector<unsigned, 2> SubVecSizes;
13675 std::optional<TTI::ShuffleKind> RegShuffleKind =
13676 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13677 if (!RegShuffleKind) {
13678 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13679 !ShuffleVectorInst::isIdentityMask(
13680 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13681 Cost +=
13682 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13683 getWidenedType(ScalarTy, NumElts), MaskSlice);
13684 continue;
13685 }
13686 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13687 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13688 Cost +=
13689 ::getShuffleCost(TTI, *RegShuffleKind,
13690 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13691 }
13692 const unsigned BaseVF = getFullVectorNumberOfElements(
13693 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13694 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13695 assert((Idx + SubVecSize) <= BaseVF &&
13696 "SK_ExtractSubvector index out of range");
13697 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13698 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13699 Idx, getWidenedType(ScalarTy, SubVecSize));
13700 }
13701 // Second attempt: check if a single permute is estimated as cheaper than
13702 // the subvector extracts.
13703 SubMask.assign(NumElts, PoisonMaskElem);
13704 copy(MaskSlice, SubMask.begin());
13705 InstructionCost OriginalCost = ::getShuffleCost(
13706 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13707 if (OriginalCost < Cost)
13708 Cost = OriginalCost;
13709 }
13710 return Cost;
13711 }
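// Worked example (editorial): with 8 extracted scalars and a target register
// holding 4 elements (NumParts == 2, EltsPerVector == 4), the mask is costed
// in two 4-element slices. A slice that reads one source register in order is
// rebased to the identity <0, 1, 2, 3> and needs no extra permute, while a
// slice mixing lanes from two registers is charged as a two-source permute;
// the per-register total is then compared against a single whole-vector
// permute and the cheaper estimate is kept.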
13712 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13713 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13714 /// elements.
13715 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13716 ArrayRef<int> Mask, unsigned Part,
13717 unsigned SliceSize) {
13718 if (SameNodesEstimated) {
13719 // Delay the cost estimation if the same nodes are reshuffling.
13720 // If we already requested the cost of reshuffling of E1 and E2 before, no
13721 // need to estimate another cost with the sub-Mask, instead include this
13722 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13723 // estimation.
13724 if ((InVectors.size() == 2 &&
13725 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13726 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13727 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13728 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13729 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13730 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13731 "Expected all poisoned elements.");
13732 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13733 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13734 return;
13735 }
13736 // Found non-matching nodes - need to estimate the cost for the matched
13737 // nodes and transform the mask.
13738 Cost += createShuffle(InVectors.front(),
13739 InVectors.size() == 1 ? nullptr : InVectors.back(),
13740 CommonMask);
13741 transformMaskAfterShuffle(CommonMask, CommonMask);
13742 } else if (InVectors.size() == 2) {
13743 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13744 transformMaskAfterShuffle(CommonMask, CommonMask);
13745 }
13746 SameNodesEstimated = false;
13747 if (!E2 && InVectors.size() == 1) {
13748 unsigned VF = E1.getVectorFactor();
13749 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13750 VF = std::max(VF, getVF(V1));
13751 } else {
13752 const auto *E = cast<const TreeEntry *>(InVectors.front());
13753 VF = std::max(VF, E->getVectorFactor());
13754 }
13755 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13756 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13757 CommonMask[Idx] = Mask[Idx] + VF;
13758 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13759 transformMaskAfterShuffle(CommonMask, CommonMask);
13760 } else {
13761 auto P = InVectors.front();
13762 Cost += createShuffle(&E1, E2, Mask);
13763 unsigned VF = Mask.size();
13764 if (Value *V1 = dyn_cast<Value *>(P)) {
13765 VF = std::max(VF,
13766 getNumElements(V1->getType()));
13767 } else {
13768 const auto *E = cast<const TreeEntry *>(P);
13769 VF = std::max(VF, E->getVectorFactor());
13770 }
13771 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13772 if (Mask[Idx] != PoisonMaskElem)
13773 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13774 Cost += createShuffle(P, InVectors.front(), CommonMask);
13775 transformMaskAfterShuffle(CommonMask, CommonMask);
13776 }
13777 }
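// Editorial note: while SameNodesEstimated is set, sub-masks that keep
// referring to the same one or two entries are merged into CommonMask rather
// than being priced slice by slice, so a single shuffle-cost query can cover
// the whole permutation; the first mismatching pair of nodes flushes the
// accumulated mask through createShuffle and estimation continues per node.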
13778
13779 class ShuffleCostBuilder {
13780 const TargetTransformInfo &TTI;
13781
13782 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13783 int Index = -1;
13784 return Mask.empty() ||
13785 (VF == Mask.size() &&
13786 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13787 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13788 Index == 0);
13789 }
13790
13791 public:
13792 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13793 ~ShuffleCostBuilder() = default;
13794 InstructionCost createShuffleVector(Value *V1, Value *,
13795 ArrayRef<int> Mask) const {
13796 // Empty mask or identity mask are free.
13797 unsigned VF =
13798 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13799 if (isEmptyOrIdentity(Mask, VF))
13800 return TTI::TCC_Free;
13801 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13802 cast<VectorType>(V1->getType()), Mask);
13803 }
13804 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13805 // Empty mask or identity mask are free.
13806 unsigned VF =
13807 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13808 if (isEmptyOrIdentity(Mask, VF))
13809 return TTI::TCC_Free;
13810 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13811 cast<VectorType>(V1->getType()), Mask);
13812 }
13813 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13814 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13815 return TTI::TCC_Free;
13816 }
13817 void resizeToMatch(Value *&, Value *&) const {}
13818 };
13819
13820 /// Smart shuffle instruction emission, walks through shuffles trees and
13821 /// tries to find the best matching vector for the actual shuffle
13822 /// instruction.
13823 InstructionCost
13824 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13825 const PointerUnion<Value *, const TreeEntry *> &P2,
13826 ArrayRef<int> Mask) {
13827 ShuffleCostBuilder Builder(TTI);
13828 SmallVector<int> CommonMask(Mask);
13829 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13830 unsigned CommonVF = Mask.size();
13831 InstructionCost ExtraCost = 0;
13832 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13833 unsigned VF) -> InstructionCost {
13834 if (E.isGather() && allConstant(E.Scalars))
13835 return TTI::TCC_Free;
13836 Type *EScalarTy = E.Scalars.front()->getType();
13837 bool IsSigned = true;
13838 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13839 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13840 IsSigned = It->second.second;
13841 }
13842 if (EScalarTy != ScalarTy) {
13843 unsigned CastOpcode = Instruction::Trunc;
13844 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13845 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13846 if (DstSz > SrcSz)
13847 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13848 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13849 getWidenedType(EScalarTy, VF),
13850 TTI::CastContextHint::None, CostKind);
13851 }
13852 return TTI::TCC_Free;
13853 };
13854 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13855 if (isa<Constant>(V))
13856 return TTI::TCC_Free;
13857 auto *VecTy = cast<VectorType>(V->getType());
13858 Type *EScalarTy = VecTy->getElementType();
13859 if (EScalarTy != ScalarTy) {
13860 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13861 unsigned CastOpcode = Instruction::Trunc;
13862 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13863 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13864 if (DstSz > SrcSz)
13865 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13866 return TTI.getCastInstrCost(
13867 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13868 VecTy, TTI::CastContextHint::None, CostKind);
13869 }
13870 return TTI::TCC_Free;
13871 };
13872 if (!V1 && !V2 && !P2.isNull()) {
13873 // Shuffle 2 entry nodes.
13874 const TreeEntry *E = cast<const TreeEntry *>(P1);
13875 unsigned VF = E->getVectorFactor();
13876 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13877 CommonVF = std::max(VF, E2->getVectorFactor());
13878 assert(all_of(Mask,
13879 [=](int Idx) {
13880 return Idx < 2 * static_cast<int>(CommonVF);
13881 }) &&
13882 "All elements in mask must be less than 2 * CommonVF.");
13883 if (E->Scalars.size() == E2->Scalars.size()) {
13884 SmallVector<int> EMask = E->getCommonMask();
13885 SmallVector<int> E2Mask = E2->getCommonMask();
13886 if (!EMask.empty() || !E2Mask.empty()) {
13887 for (int &Idx : CommonMask) {
13888 if (Idx == PoisonMaskElem)
13889 continue;
13890 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13891 Idx = EMask[Idx];
13892 else if (Idx >= static_cast<int>(CommonVF))
13893 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13894 E->Scalars.size();
13895 }
13896 }
13897 CommonVF = E->Scalars.size();
13898 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13899 GetNodeMinBWAffectedCost(*E2, CommonVF);
13900 } else {
13901 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13902 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13903 }
13904 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13905 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13906 } else if (!V1 && P2.isNull()) {
13907 // Shuffle single entry node.
13908 const TreeEntry *E = cast<const TreeEntry *>(P1);
13909 unsigned VF = E->getVectorFactor();
13910 CommonVF = VF;
13911 assert(
13912 all_of(Mask,
13913 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13914 "All elements in mask must be less than CommonVF.");
13915 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13916 SmallVector<int> EMask = E->getCommonMask();
13917 assert(!EMask.empty() && "Expected non-empty common mask.");
13918 for (int &Idx : CommonMask) {
13919 if (Idx != PoisonMaskElem)
13920 Idx = EMask[Idx];
13921 }
13922 CommonVF = E->Scalars.size();
13923 } else if (unsigned Factor = E->getInterleaveFactor();
13924 Factor > 0 && E->Scalars.size() != Mask.size() &&
13925 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13926 Factor)) {
13927 // Deinterleaved nodes are free.
13928 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13929 }
13930 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13931 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13932 // Not identity/broadcast? Try to see if the original vector is better.
13933 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13934 CommonVF == CommonMask.size() &&
13935 any_of(enumerate(CommonMask),
13936 [](const auto &&P) {
13937 return P.value() != PoisonMaskElem &&
13938 static_cast<unsigned>(P.value()) != P.index();
13939 }) &&
13940 any_of(CommonMask,
13941 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13942 SmallVector<int> ReorderMask;
13943 inversePermutation(E->ReorderIndices, ReorderMask);
13944 ::addMask(CommonMask, ReorderMask);
13945 }
13946 } else if (V1 && P2.isNull()) {
13947 // Shuffle single vector.
13948 ExtraCost += GetValueMinBWAffectedCost(V1);
13949 CommonVF = getVF(V1);
13950 assert(
13951 all_of(Mask,
13952 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13953 "All elements in mask must be less than CommonVF.");
13954 } else if (V1 && !V2) {
13955 // Shuffle vector and tree node.
13956 unsigned VF = getVF(V1);
13957 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13958 CommonVF = std::max(VF, E2->getVectorFactor());
13959 assert(all_of(Mask,
13960 [=](int Idx) {
13961 return Idx < 2 * static_cast<int>(CommonVF);
13962 }) &&
13963 "All elements in mask must be less than 2 * CommonVF.");
13964 if (E2->Scalars.size() == VF && VF != CommonVF) {
13965 SmallVector<int> E2Mask = E2->getCommonMask();
13966 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13967 for (int &Idx : CommonMask) {
13968 if (Idx == PoisonMaskElem)
13969 continue;
13970 if (Idx >= static_cast<int>(CommonVF))
13971 Idx = E2Mask[Idx - CommonVF] + VF;
13972 }
13973 CommonVF = VF;
13974 }
13975 ExtraCost += GetValueMinBWAffectedCost(V1);
13976 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13977 ExtraCost += GetNodeMinBWAffectedCost(
13978 *E2, std::min(CommonVF, E2->getVectorFactor()));
13979 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13980 } else if (!V1 && V2) {
13981 // Shuffle vector and tree node.
13982 unsigned VF = getVF(V2);
13983 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13984 CommonVF = std::max(VF, E1->getVectorFactor());
13985 assert(all_of(Mask,
13986 [=](int Idx) {
13987 return Idx < 2 * static_cast<int>(CommonVF);
13988 }) &&
13989 "All elements in mask must be less than 2 * CommonVF.");
13990 if (E1->Scalars.size() == VF && VF != CommonVF) {
13991 SmallVector<int> E1Mask = E1->getCommonMask();
13992 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13993 for (int &Idx : CommonMask) {
13994 if (Idx == PoisonMaskElem)
13995 continue;
13996 if (Idx >= static_cast<int>(CommonVF))
13997 Idx = E1Mask[Idx - CommonVF] + VF;
13998 else
13999 Idx = E1Mask[Idx];
14000 }
14001 CommonVF = VF;
14002 }
14003 ExtraCost += GetNodeMinBWAffectedCost(
14004 *E1, std::min(CommonVF, E1->getVectorFactor()));
14005 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14006 ExtraCost += GetValueMinBWAffectedCost(V2);
14007 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14008 } else {
14009 assert(V1 && V2 && "Expected both vectors.");
14010 unsigned VF = getVF(V1);
14011 CommonVF = std::max(VF, getVF(V2));
14012 assert(all_of(Mask,
14013 [=](int Idx) {
14014 return Idx < 2 * static_cast<int>(CommonVF);
14015 }) &&
14016 "All elements in mask must be less than 2 * CommonVF.");
14017 ExtraCost +=
14018 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14019 if (V1->getType() != V2->getType()) {
14020 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14021 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14022 } else {
14023 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14024 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14025 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14026 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14027 }
14028 }
14029 InVectors.front() =
14030 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14031 if (InVectors.size() == 2)
14032 InVectors.pop_back();
14033 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14034 V1, V2, CommonMask, Builder, ScalarTy);
14035 }
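// Editorial note: GetNodeMinBWAffectedCost above accounts for entries demoted
// via MinBWs. For instance, if a tree entry was narrowed to i8 while ScalarTy
// is i32, combining its vector with a full-width operand also needs a
// sext/zext (or a trunc in the opposite direction), and that cast cost is
// folded into ExtraCost before the shuffle itself is priced.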
14036
14037public:
14038 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
14039 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14040 SmallPtrSetImpl<Value *> &CheckedExtracts)
14041 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14042 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14043 CheckedExtracts(CheckedExtracts) {}
14044 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14045 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14046 unsigned NumParts, bool &UseVecBaseAsInput) {
14047 UseVecBaseAsInput = false;
14048 if (Mask.empty())
14049 return nullptr;
14050 Value *VecBase = nullptr;
14051 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14052 if (!E->ReorderIndices.empty()) {
14053 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14054 E->ReorderIndices.end());
14055 reorderScalars(VL, ReorderMask);
14056 }
14057 // Check if the extracts can be considered reused if the same
14058 // extractelements were vectorized already.
14059 bool PrevNodeFound = any_of(
14060 ArrayRef(R.VectorizableTree).take_front(E->Idx),
14061 [&](const std::unique_ptr<TreeEntry> &TE) {
14062 return ((TE->hasState() && !TE->isAltShuffle() &&
14063 TE->getOpcode() == Instruction::ExtractElement) ||
14064 TE->isGather()) &&
14065 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
14066 return VL.size() > Data.index() &&
14067 (Mask[Data.index()] == PoisonMaskElem ||
14068 isa<UndefValue>(VL[Data.index()]) ||
14069 Data.value() == VL[Data.index()]);
14070 });
14071 });
14072 SmallPtrSet<Value *, 4> UniqueBases;
14073 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14074 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14075 for (unsigned Part : seq<unsigned>(NumParts)) {
14076 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14077 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14078 for (auto [I, V] :
14079 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
14080 // Ignore non-extractelement scalars.
14081 if (isa<UndefValue>(V) ||
14082 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14083 continue;
14084 // If all users of instruction are going to be vectorized and this
14085 // instruction itself is not going to be vectorized, consider this
14086 // instruction as dead and remove its cost from the final cost of the
14087 // vectorized tree.
14088 // Also, avoid adjusting the cost for extractelements with multiple uses
14089 // in different graph entries.
14090 auto *EE = cast<ExtractElementInst>(V);
14091 VecBase = EE->getVectorOperand();
14092 UniqueBases.insert(VecBase);
14093 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14094 if (!CheckedExtracts.insert(V).second ||
14095 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
14096 any_of(EE->users(),
14097 [&](User *U) {
14098 return isa<GetElementPtrInst>(U) &&
14099 !R.areAllUsersVectorized(cast<Instruction>(U),
14100 &VectorizedVals);
14101 }) ||
14102 (!VEs.empty() && !is_contained(VEs, E)))
14103 continue;
14104 std::optional<unsigned> EEIdx = getExtractIndex(EE);
14105 if (!EEIdx)
14106 continue;
14107 unsigned Idx = *EEIdx;
14108 // Take credit for instruction that will become dead.
14109 if (EE->hasOneUse() || !PrevNodeFound) {
14110 Instruction *Ext = EE->user_back();
14111 if (isa<SExtInst, ZExtInst>(Ext) &&
14112 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14113 // Use getExtractWithExtendCost() to calculate the cost of
14114 // extractelement/ext pair.
14115 Cost -= TTI.getExtractWithExtendCost(
14116 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
14117 Idx, CostKind);
14118 // Add back the cost of s|zext which is subtracted separately.
14119 Cost += TTI.getCastInstrCost(
14120 Ext->getOpcode(), Ext->getType(), EE->getType(),
14121 TTI::getCastContextHint(Ext), CostKind);
14122 continue;
14123 }
14124 }
14125 APInt &DemandedElts =
14126 VectorOpsToExtracts
14127 .try_emplace(VecBase,
14128 APInt::getZero(getNumElements(VecBase->getType())))
14129 .first->getSecond();
14130 DemandedElts.setBit(Idx);
14131 }
14132 }
14133 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14134 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
14135 DemandedElts, /*Insert=*/false,
14136 /*Extract=*/true, CostKind);
14137 // Check that the gather of extractelements can be represented as just a
14138 // shuffle of one or two vectors the scalars are extracted from.
14139 // We have found the bunch of extractelement instructions that must be
14140 // gathered into a vector and can be represented as a permutation of the
14141 // elements of a single input vector or of two input vectors.
14142 // The cost is not added again if the same extracts were vectorized already.
14143 if (!PrevNodeFound)
14144 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14145 InVectors.assign(1, E);
14146 CommonMask.assign(Mask.begin(), Mask.end());
14147 transformMaskAfterShuffle(CommonMask, CommonMask);
14148 SameNodesEstimated = false;
14149 if (NumParts != 1 && UniqueBases.size() != 1) {
14150 UseVecBaseAsInput = true;
14151 VecBase =
14152 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14153 }
14154 return VecBase;
14155 }
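// Editorial note: adjustExtracts gives the tree credit for extractelements
// that are expected to become dead after vectorization. For example, the
// scalar cost of
//   %e = extractelement <4 x i32> %v, i32 1
// is subtracted through the demanded-elements scalarization overhead above
// once all of its users are being vectorized, so the final tree cost reflects
// the removal of the scalar extract.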
14156 /// Checks if the specified entry \p E needs to be delayed because of its
14157 /// dependency nodes.
14158 std::optional<InstructionCost>
14159 needToDelay(const TreeEntry *,
14160 ArrayRef<SmallVector<const TreeEntry *>>) const {
14161 // No need to delay the cost estimation during analysis.
14162 return std::nullopt;
14163 }
14164 /// Reset the builder to handle a perfect diamond match.
14165 void resetForSameNode() {
14166 IsFinalized = false;
14167 CommonMask.clear();
14168 InVectors.clear();
14169 Cost = 0;
14170 VectorizedVals.clear();
14171 SameNodesEstimated = true;
14172 }
14173 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14174 if (&E1 == &E2) {
14175 assert(all_of(Mask,
14176 [&](int Idx) {
14177 return Idx < static_cast<int>(E1.getVectorFactor());
14178 }) &&
14179 "Expected single vector shuffle mask.");
14180 add(E1, Mask);
14181 return;
14182 }
14183 if (InVectors.empty()) {
14184 CommonMask.assign(Mask.begin(), Mask.end());
14185 InVectors.assign({&E1, &E2});
14186 return;
14187 }
14188 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14189 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14190 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14191 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14192 const auto *It =
14193 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14194 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14195 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14196 }
14197 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14198 if (InVectors.empty()) {
14199 CommonMask.assign(Mask.begin(), Mask.end());
14200 InVectors.assign(1, &E1);
14201 return;
14202 }
14203 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14204 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14205 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14206 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14207 const auto *It =
14208 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14209 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14210 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14211 if (!SameNodesEstimated && InVectors.size() == 1)
14212 InVectors.emplace_back(&E1);
14213 }
14214 /// Adds 2 input vectors and the mask for their shuffling.
14215 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14216 // May come only for shuffling of 2 vectors with extractelements, already
14217 // handled in adjustExtracts.
14218 assert(InVectors.size() == 1 &&
14219 all_of(enumerate(CommonMask),
14220 [&](auto P) {
14221 if (P.value() == PoisonMaskElem)
14222 return Mask[P.index()] == PoisonMaskElem;
14223 auto *EI = cast<ExtractElementInst>(
14224 cast<const TreeEntry *>(InVectors.front())
14225 ->getOrdered(P.index()));
14226 return EI->getVectorOperand() == V1 ||
14227 EI->getVectorOperand() == V2;
14228 }) &&
14229 "Expected extractelement vectors.");
14230 }
14231 /// Adds another one input vector and the mask for the shuffling.
14232 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14233 if (InVectors.empty()) {
14234 assert(CommonMask.empty() && !ForExtracts &&
14235 "Expected empty input mask/vectors.");
14236 CommonMask.assign(Mask.begin(), Mask.end());
14237 InVectors.assign(1, V1);
14238 return;
14239 }
14240 if (ForExtracts) {
14241 // No need to add vectors here, already handled them in adjustExtracts.
14242 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14243 !CommonMask.empty() &&
14244 all_of(enumerate(CommonMask),
14245 [&](auto P) {
14246 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14247 ->getOrdered(P.index());
14248 if (P.value() == PoisonMaskElem)
14249 return P.value() == Mask[P.index()] ||
14250 isa<UndefValue>(Scalar);
14251 if (isa<Constant>(V1))
14252 return true;
14253 auto *EI = cast<ExtractElementInst>(Scalar);
14254 return EI->getVectorOperand() == V1;
14255 }) &&
14256 "Expected only tree entry for extractelement vectors.");
14257 return;
14258 }
14259 assert(!InVectors.empty() && !CommonMask.empty() &&
14260 "Expected only tree entries from extracts/reused buildvectors.");
14261 unsigned VF = getVF(V1);
14262 if (InVectors.size() == 2) {
14263 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14264 transformMaskAfterShuffle(CommonMask, CommonMask);
14265 VF = std::max<unsigned>(VF, CommonMask.size());
14266 } else if (const auto *InTE =
14267 InVectors.front().dyn_cast<const TreeEntry *>()) {
14268 VF = std::max(VF, InTE->getVectorFactor());
14269 } else {
14270 VF = std::max(
14271 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14272 ->getNumElements());
14273 }
14274 InVectors.push_back(V1);
14275 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14276 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14277 CommonMask[Idx] = Mask[Idx] + VF;
14278 }
14279 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14280 Value *Root = nullptr) {
14281 Cost += getBuildVectorCost(VL, Root);
14282 if (!Root) {
14283 // FIXME: Need to find a way to avoid use of getNullValue here.
14284 SmallVector<Constant *> Vals;
14285 unsigned VF = VL.size();
14286 if (MaskVF != 0)
14287 VF = std::min(VF, MaskVF);
14288 Type *VLScalarTy = VL.front()->getType();
14289 for (Value *V : VL.take_front(VF)) {
14290 Type *ScalarTy = VLScalarTy->getScalarType();
14291 if (isa<PoisonValue>(V)) {
14292 Vals.push_back(PoisonValue::get(ScalarTy));
14293 continue;
14294 }
14295 if (isa<UndefValue>(V)) {
14296 Vals.push_back(UndefValue::get(ScalarTy));
14297 continue;
14298 }
14299 Vals.push_back(Constant::getNullValue(ScalarTy));
14300 }
14301 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14302 assert(SLPReVec && "FixedVectorType is not expected.");
14303 // When REVEC is enabled, we need to expand vector types into scalar
14304 // types.
14305 Vals = replicateMask(Vals, VecTy->getNumElements());
14306 }
14307 return ConstantVector::get(Vals);
14308 }
14309 return ConstantVector::getSplat(
14310 ElementCount::getFixed(
14311 cast<FixedVectorType>(Root->getType())->getNumElements()),
14312 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14313 }
14314
14315 /// Finalize emission of the shuffles.
14316 InstructionCost finalize(
14317 ArrayRef<int> ExtMask,
14318 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14319 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14320 function_ref<void(Value *&, SmallVectorImpl<int> &,
14321 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14322 Action = {}) {
14323 IsFinalized = true;
14324 if (Action) {
14325 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14326 if (InVectors.size() == 2)
14327 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14328 else
14329 Cost += createShuffle(Vec, nullptr, CommonMask);
14330 transformMaskAfterShuffle(CommonMask, CommonMask);
14331 assert(VF > 0 &&
14332 "Expected vector length for the final value before action.");
14333 Value *V = cast<Value *>(Vec);
14334 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14335 Cost += createShuffle(V1, V2, Mask);
14336 return V1;
14337 });
14338 InVectors.front() = V;
14339 }
14340 if (!SubVectors.empty()) {
14341 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14342 if (InVectors.size() == 2)
14343 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14344 else
14345 Cost += createShuffle(Vec, nullptr, CommonMask);
14346 transformMaskAfterShuffle(CommonMask, CommonMask);
14347 // Add subvectors permutation cost.
14348 if (!SubVectorsMask.empty()) {
14349 assert(SubVectorsMask.size() <= CommonMask.size() &&
14350 "Expected same size of masks for subvectors and common mask.");
14351 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14352 copy(SubVectorsMask, SVMask.begin());
14353 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14354 if (I2 != PoisonMaskElem) {
14355 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14356 I1 = I2 + CommonMask.size();
14357 }
14358 }
14359 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14360 getWidenedType(ScalarTy, CommonMask.size()),
14361 SVMask, CostKind);
14362 }
14363 for (auto [E, Idx] : SubVectors) {
14364 Type *EScalarTy = E->Scalars.front()->getType();
14365 bool IsSigned = true;
14366 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14367 EScalarTy =
14368 IntegerType::get(EScalarTy->getContext(), It->second.first);
14369 IsSigned = It->second.second;
14370 }
14371 if (ScalarTy != EScalarTy) {
14372 unsigned CastOpcode = Instruction::Trunc;
14373 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14374 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14375 if (DstSz > SrcSz)
14376 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14377 Cost += TTI.getCastInstrCost(
14378 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14379 getWidenedType(EScalarTy, E->getVectorFactor()),
14380 TTI::CastContextHint::None, CostKind);
14381 }
14382 Cost += ::getShuffleCost(
14383 TTI, TTI::SK_InsertSubvector,
14384 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14385 getWidenedType(ScalarTy, E->getVectorFactor()));
14386 if (!CommonMask.empty()) {
14387 std::iota(std::next(CommonMask.begin(), Idx),
14388 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14389 Idx);
14390 }
14391 }
14392 }
14393
14394 if (!ExtMask.empty()) {
14395 if (CommonMask.empty()) {
14396 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14397 } else {
14398 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14399 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14400 if (ExtMask[I] == PoisonMaskElem)
14401 continue;
14402 NewMask[I] = CommonMask[ExtMask[I]];
14403 }
14404 CommonMask.swap(NewMask);
14405 }
14406 }
14407 if (CommonMask.empty()) {
14408 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14409 return Cost;
14410 }
14411 return Cost +
14412 createShuffle(InVectors.front(),
14413 InVectors.size() == 2 ? InVectors.back() : nullptr,
14414 CommonMask);
14415 }
14416
14417 ~ShuffleCostEstimator() {
14418 assert((IsFinalized || CommonMask.empty()) &&
14419 "Shuffle construction must be finalized.");
14420 }
14421};
14422
14423const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14424 unsigned Idx) const {
14425 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14426 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14427 return Op;
14428}
14429
14430TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14431 if (TE.State == TreeEntry::ScatterVectorize ||
14432 TE.State == TreeEntry::StridedVectorize)
14433 return TTI::CastContextHint::GatherScatter;
14434 if (TE.State == TreeEntry::CompressVectorize)
14435 return TTI::CastContextHint::Masked;
14436 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14437 !TE.isAltShuffle()) {
14438 if (TE.ReorderIndices.empty())
14439 return TTI::CastContextHint::Normal;
14440 SmallVector<int> Mask;
14441 inversePermutation(TE.ReorderIndices, Mask);
14442 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14443 return TTI::CastContextHint::Reversed;
14444 }
14445 return TTI::CastContextHint::None;
14446}
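// Editorial note: for a vectorized load whose reorder indices invert to a
// reverse mask such as <3, 2, 1, 0> the hint is Reversed, while
// scatter/strided loads report GatherScatter; a cast fed by such a load can
// then be costed with the matching memory-access context.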
14447
14448 InstructionCost
14449 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14450 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14451 ArrayRef<Value *> VL = E->Scalars;
14452
14453 Type *ScalarTy = getValueType(VL[0]);
14454 if (!isValidElementType(ScalarTy))
14455 return InstructionCost::getInvalid();
14456 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14457
14458 // If we have computed a smaller type for the expression, update VecTy so
14459 // that the costs will be accurate.
14460 auto It = MinBWs.find(E);
14461 Type *OrigScalarTy = ScalarTy;
14462 if (It != MinBWs.end()) {
14463 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14464 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14465 if (VecTy)
14466 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14467 }
14468 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14469 unsigned EntryVF = E->getVectorFactor();
14470 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14471
14472 if (E->isGather()) {
14473 if (allConstant(VL))
14474 return 0;
14475 if (isa<InsertElementInst>(VL[0]))
14476 return InstructionCost::getInvalid();
14477 if (isa<CmpInst>(VL.front()))
14478 ScalarTy = VL.front()->getType();
14479 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14480 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14481 }
14482 if (E->State == TreeEntry::SplitVectorize) {
14483 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14484 "Expected exactly 2 combined entries.");
14485 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14486 InstructionCost VectorCost = 0;
14487 if (E->ReorderIndices.empty()) {
14488 VectorCost = ::getShuffleCost(
14489 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14490 E->CombinedEntriesWithIndices.back().second,
14491 getWidenedType(
14492 ScalarTy,
14493 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14494 ->getVectorFactor()));
14495 } else {
14496 unsigned CommonVF =
14497 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14498 ->getVectorFactor(),
14499 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14500 ->getVectorFactor());
14501 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14502 getWidenedType(ScalarTy, CommonVF),
14503 E->getSplitMask(), CostKind);
14504 }
14505 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14506 return VectorCost;
14507 }
14508 InstructionCost CommonCost = 0;
14509 SmallVector<int> Mask;
14510 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14511 (E->State != TreeEntry::StridedVectorize ||
14512 !isReverseOrder(E->ReorderIndices))) {
14513 SmallVector<int> NewMask;
14514 if (E->getOpcode() == Instruction::Store) {
14515 // For stores the order is actually a mask.
14516 NewMask.resize(E->ReorderIndices.size());
14517 copy(E->ReorderIndices, NewMask.begin());
14518 } else {
14519 inversePermutation(E->ReorderIndices, NewMask);
14520 }
14521 ::addMask(Mask, NewMask);
14522 }
14523 if (!E->ReuseShuffleIndices.empty())
14524 ::addMask(Mask, E->ReuseShuffleIndices);
14525 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14526 CommonCost =
14527 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14528 assert((E->State == TreeEntry::Vectorize ||
14529 E->State == TreeEntry::ScatterVectorize ||
14530 E->State == TreeEntry::StridedVectorize ||
14531 E->State == TreeEntry::CompressVectorize) &&
14532 "Unhandled state");
14533 assert(E->getOpcode() &&
14534 ((allSameType(VL) && allSameBlock(VL)) ||
14535 (E->getOpcode() == Instruction::GetElementPtr &&
14536 E->getMainOp()->getType()->isPointerTy()) ||
14537 E->hasCopyableElements()) &&
14538 "Invalid VL");
14539 Instruction *VL0 = E->getMainOp();
14540 unsigned ShuffleOrOp =
14541 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14542 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14543 ShuffleOrOp = E->CombinedOp;
14544 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14545 const unsigned Sz = UniqueValues.size();
14546 SmallBitVector UsedScalars(Sz, false);
14547 for (unsigned I = 0; I < Sz; ++I) {
14548 if (isa<Instruction>(UniqueValues[I]) &&
14549 !E->isCopyableElement(UniqueValues[I]) &&
14550 getTreeEntries(UniqueValues[I]).front() == E)
14551 continue;
14552 UsedScalars.set(I);
14553 }
14554 auto GetCastContextHint = [&](Value *V) {
14555 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14556 return getCastContextHint(*OpTEs.front());
14557 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14558 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14559 !SrcState.isAltShuffle())
14560 return TTI::CastContextHint::GatherScatter;
14561 return TTI::CastContextHint::None;
14562 };
14563 auto GetCostDiff =
14564 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14565 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14566 // Calculate the cost of this instruction.
14567 InstructionCost ScalarCost = 0;
14568 if (isa<CastInst, CallInst>(VL0)) {
14569 // For some of the instructions no need to calculate cost for each
14570 // particular instruction, we can use the cost of the single
14571 // instruction x total number of scalar instructions.
14572 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14573 } else {
14574 for (unsigned I = 0; I < Sz; ++I) {
14575 if (UsedScalars.test(I))
14576 continue;
14577 ScalarCost += ScalarEltCost(I);
14578 }
14579 }
14580
14581 InstructionCost VecCost = VectorCost(CommonCost);
14582 // Check if the current node must be resized, if the parent node is not
14583 // resized.
14584 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14585 E->Idx != 0 &&
14586 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14587 const EdgeInfo &EI = E->UserTreeIndex;
14588 if (!EI.UserTE->hasState() ||
14589 EI.UserTE->getOpcode() != Instruction::Select ||
14590 EI.EdgeIdx != 0) {
14591 auto UserBWIt = MinBWs.find(EI.UserTE);
14592 Type *UserScalarTy =
14593 (EI.UserTE->isGather() ||
14594 EI.UserTE->State == TreeEntry::SplitVectorize)
14595 ? EI.UserTE->Scalars.front()->getType()
14596 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14597 if (UserBWIt != MinBWs.end())
14598 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14599 UserBWIt->second.first);
14600 if (ScalarTy != UserScalarTy) {
14601 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14602 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14603 unsigned VecOpcode;
14604 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14605 if (BWSz > SrcBWSz)
14606 VecOpcode = Instruction::Trunc;
14607 else
14608 VecOpcode =
14609 It->second.second ? Instruction::SExt : Instruction::ZExt;
14610 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14611 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14612 CostKind);
14613 }
14614 }
14615 }
14616 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14617 ScalarCost, "Calculated costs for Tree"));
14618 return VecCost - ScalarCost;
14619 };
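// Editorial note: GetCostDiff returns VecCost - ScalarCost, so a negative
// result means the vector form is cheaper. For example, four scalar adds at
// cost 1 each against one 4-wide vector add at cost 1 (with zero CommonCost)
// gives 1 - 4 = -3 in favor of vectorization.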
14620 // Calculate cost difference from vectorizing set of GEPs.
14621 // Negative value means vectorizing is profitable.
14622 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14623 assert((E->State == TreeEntry::Vectorize ||
14624 E->State == TreeEntry::StridedVectorize ||
14625 E->State == TreeEntry::CompressVectorize) &&
14626 "Entry state expected to be Vectorize, StridedVectorize or "
14627 "MaskedLoadCompressVectorize here.");
14628 InstructionCost ScalarCost = 0;
14629 InstructionCost VecCost = 0;
14630 std::tie(ScalarCost, VecCost) = getGEPCosts(
14631 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14632 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14633 "Calculated GEPs cost for Tree"));
14634
14635 return VecCost - ScalarCost;
14636 };
14637
14638 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14639 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14640 if (MinMaxID == Intrinsic::not_intrinsic)
14641 return InstructionCost::getInvalid();
14642 Type *CanonicalType = Ty;
14643 if (CanonicalType->isPtrOrPtrVectorTy())
14644 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14645 CanonicalType->getContext(),
14646 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14647
14648 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14649 {CanonicalType, CanonicalType});
14650 InstructionCost IntrinsicCost =
14651 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14652 // If the selects are the only uses of the compares, they will be
14653 // dead and we can adjust the cost by removing their cost.
14654 if (VI && SelectOnly) {
14655 assert((!Ty->isVectorTy() || SLPReVec) &&
14656 "Expected only for scalar type.");
14657 auto *CI = cast<CmpInst>(VI->getOperand(0));
14658 IntrinsicCost -= TTI->getCmpSelInstrCost(
14659 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14660 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14661 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14662 }
14663 return IntrinsicCost;
14664 };
14665 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14666 Instruction *VI) {
14667 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14668 return Cost;
14669 };
14670 switch (ShuffleOrOp) {
14671 case Instruction::PHI: {
14672 // Count reused scalars.
14673 InstructionCost ScalarCost = 0;
14674 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14675 for (Value *V : UniqueValues) {
14676 auto *PHI = dyn_cast<PHINode>(V);
14677 if (!PHI)
14678 continue;
14679
14680 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14681 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14682 Value *Op = PHI->getIncomingValue(I);
14683 Operands[I] = Op;
14684 }
14685 if (const TreeEntry *OpTE =
14686 getSameValuesTreeEntry(Operands.front(), Operands))
14687 if (CountedOps.insert(OpTE).second &&
14688 !OpTE->ReuseShuffleIndices.empty())
14689 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14690 OpTE->Scalars.size());
14691 }
14692
14693 return CommonCost - ScalarCost;
14694 }
14695 case Instruction::ExtractValue:
14696 case Instruction::ExtractElement: {
14697 APInt DemandedElts;
14698 VectorType *SrcVecTy = nullptr;
14699 auto GetScalarCost = [&](unsigned Idx) {
14700 if (isa<PoisonValue>(UniqueValues[Idx]))
14701 return TTI::TCC_Free;
14702
14703 auto *I = cast<Instruction>(UniqueValues[Idx]);
14704 if (!SrcVecTy) {
14705 if (ShuffleOrOp == Instruction::ExtractElement) {
14706 auto *EE = cast<ExtractElementInst>(I);
14707 SrcVecTy = EE->getVectorOperandType();
14708 } else {
14709 auto *EV = cast<ExtractValueInst>(I);
14710 Type *AggregateTy = EV->getAggregateOperand()->getType();
14711 unsigned NumElts;
14712 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14713 NumElts = ATy->getNumElements();
14714 else
14715 NumElts = AggregateTy->getStructNumElements();
14716 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14717 }
14718 }
14719 if (I->hasOneUse()) {
14720 Instruction *Ext = I->user_back();
14721 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14722 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14723 // Use getExtractWithExtendCost() to calculate the cost of
14724 // extractelement/ext pair.
14725 InstructionCost Cost = TTI->getExtractWithExtendCost(
14726 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14727 CostKind);
14728 // Subtract the cost of s|zext which is subtracted separately.
14729 Cost -= TTI->getCastInstrCost(
14730 Ext->getOpcode(), Ext->getType(), I->getType(),
14731 TTI::getCastContextHint(Ext), CostKind);
14732 return Cost;
14733 }
14734 }
14735 if (DemandedElts.isZero())
14736 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14737 DemandedElts.setBit(*getExtractIndex(I));
14738 return TTI::TCC_Free;
14739 };
14740 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14741 return CommonCost - (DemandedElts.isZero()
14742 ? TTI::TCC_Free
14743 : TTI.getScalarizationOverhead(
14744 SrcVecTy, DemandedElts, /*Insert=*/false,
14745 /*Extract=*/true, CostKind));
14746 };
14747 return GetCostDiff(GetScalarCost, GetVectorCost);
14748 }
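// Editorial note: when the only user of an extracted scalar is a sign/zero
// extension, e.g.
//   %e = extractelement <4 x i16> %v, i32 2
//   %z = zext i16 %e to i32
// the scalar side is priced with getExtractWithExtendCost() and the
// separately counted extend cost is removed, so the extend is not charged
// twice in the scalar-versus-vector comparison.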
14749 case Instruction::InsertElement: {
14750 assert(E->ReuseShuffleIndices.empty() &&
14751 "Unique insertelements only are expected.");
14752 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14753 unsigned const NumElts = SrcVecTy->getNumElements();
14754 unsigned const NumScalars = VL.size();
14755
14756 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14757
14758 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14759 unsigned OffsetBeg = *getElementIndex(VL.front());
14760 unsigned OffsetEnd = OffsetBeg;
14761 InsertMask[OffsetBeg] = 0;
14762 for (auto [I, V] : enumerate(VL.drop_front())) {
14763 unsigned Idx = *getElementIndex(V);
14764 if (OffsetBeg > Idx)
14765 OffsetBeg = Idx;
14766 else if (OffsetEnd < Idx)
14767 OffsetEnd = Idx;
14768 InsertMask[Idx] = I + 1;
14769 }
14770 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14771 if (NumOfParts > 0 && NumOfParts < NumElts)
14772 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14773 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14774 VecScalarsSz;
14775 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14776 unsigned InsertVecSz = std::min<unsigned>(
14777 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14778 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14779 bool IsWholeSubvector =
14780 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14781 // Check if we can safely insert a subvector. If it is not possible, just
14782 // generate a whole-sized vector and shuffle the source vector and the new
14783 // subvector.
14784 if (OffsetBeg + InsertVecSz > VecSz) {
14785 // Align OffsetBeg to generate correct mask.
14786 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14787 InsertVecSz = VecSz;
14788 }
14789
14790 APInt DemandedElts = APInt::getZero(NumElts);
14791 // TODO: Add support for Instruction::InsertValue.
14792 SmallVector<int> Mask;
14793 if (!E->ReorderIndices.empty()) {
14794 inversePermutation(E->ReorderIndices, Mask);
14795 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14796 } else {
14797 Mask.assign(VecSz, PoisonMaskElem);
14798 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14799 }
14800 bool IsIdentity = true;
14801 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14802 Mask.swap(PrevMask);
14803 for (unsigned I = 0; I < NumScalars; ++I) {
14804 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14805 DemandedElts.setBit(InsertIdx);
14806 IsIdentity &= InsertIdx - OffsetBeg == I;
14807 Mask[InsertIdx - OffsetBeg] = I;
14808 }
14809 assert(Offset < NumElts && "Failed to find vector index offset");
14810
14811 InstructionCost Cost = 0;
14812 Cost -=
14813 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14814 /*Insert*/ true, /*Extract*/ false, CostKind);
14815
14816 // First cost - resize to actual vector size if not identity shuffle or
14817 // need to shift the vector.
14818 // Do not calculate the cost if the actual size is the register size and
14819 // we can merge this shuffle with the following SK_Select.
14820 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14821 if (!IsIdentity)
14822 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14823 InsertVecTy, Mask);
14824 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14825 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14826 }));
14827 // Second cost - permutation with subvector, if some elements are from the
14828 // initial vector or inserting a subvector.
14829 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14830 // subvector of ActualVecTy.
14831 SmallBitVector InMask =
14832 isUndefVector(FirstInsert->getOperand(0),
14833 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14834 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14835 if (InsertVecSz != VecSz) {
14836 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14837 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14838 CostKind, OffsetBeg - Offset, InsertVecTy);
14839 } else {
14840 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14841 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14842 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14843 I <= End; ++I)
14844 if (Mask[I] != PoisonMaskElem)
14845 Mask[I] = I + VecSz;
14846 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14847 Mask[I] =
14848 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14849 Cost +=
14850 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14851 }
14852 }
14853 return Cost;
14854 }
14855 case Instruction::ZExt:
14856 case Instruction::SExt:
14857 case Instruction::FPToUI:
14858 case Instruction::FPToSI:
14859 case Instruction::FPExt:
14860 case Instruction::PtrToInt:
14861 case Instruction::IntToPtr:
14862 case Instruction::SIToFP:
14863 case Instruction::UIToFP:
14864 case Instruction::Trunc:
14865 case Instruction::FPTrunc:
14866 case Instruction::BitCast: {
14867 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14868 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14869 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14870 unsigned Opcode = ShuffleOrOp;
14871 unsigned VecOpcode = Opcode;
14872 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14873 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14874 // Check if the values are candidates to demote.
14875 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14876 if (SrcIt != MinBWs.end()) {
14877 SrcBWSz = SrcIt->second.first;
14878 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14879 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14880 SrcVecTy =
14881 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14882 }
14883 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14884 if (BWSz == SrcBWSz) {
14885 VecOpcode = Instruction::BitCast;
14886 } else if (BWSz < SrcBWSz) {
14887 VecOpcode = Instruction::Trunc;
14888 } else if (It != MinBWs.end()) {
14889 assert(BWSz > SrcBWSz && "Invalid cast!");
14890 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14891 } else if (SrcIt != MinBWs.end()) {
14892 assert(BWSz > SrcBWSz && "Invalid cast!");
14893 VecOpcode =
14894 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14895 }
14896 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14897 !SrcIt->second.second) {
14898 VecOpcode = Instruction::UIToFP;
14899 }
14900 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14901 assert(Idx == 0 && "Expected 0 index only");
14902 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14903 VL0->getOperand(0)->getType(),
14904 TTI::getCastContextHint(VL0), CostKind);
14905 };
14906 auto GetVectorCost = [=](InstructionCost CommonCost) {
14907 // Do not count cost here if minimum bitwidth is in effect and it is just
14908 // a bitcast (here it is just a noop).
14909 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14910 return CommonCost;
14911 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14912 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14913
14914 bool IsArithmeticExtendedReduction =
14915 E->Idx == 0 && UserIgnoreList &&
14916 all_of(*UserIgnoreList, [](Value *V) {
14917 auto *I = cast<Instruction>(V);
14918 return is_contained({Instruction::Add, Instruction::FAdd,
14919 Instruction::Mul, Instruction::FMul,
14920 Instruction::And, Instruction::Or,
14921 Instruction::Xor},
14922 I->getOpcode());
14923 });
14924 if (IsArithmeticExtendedReduction &&
14925 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14926 return CommonCost;
14927 return CommonCost +
14928 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14929 VecOpcode == Opcode ? VI : nullptr);
14930 };
14931 return GetCostDiff(GetScalarCost, GetVectorCost);
14932 }
14933 case Instruction::FCmp:
14934 case Instruction::ICmp:
14935 case Instruction::Select: {
14936 CmpPredicate VecPred, SwappedVecPred;
14937 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14938 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14939 match(VL0, MatchCmp))
14940 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14941 else
14942 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14943 ? CmpInst::BAD_FCMP_PREDICATE
14944 : CmpInst::BAD_ICMP_PREDICATE;
14945 auto GetScalarCost = [&](unsigned Idx) {
14946 if (isa<PoisonValue>(UniqueValues[Idx]))
14947 return InstructionCost(TTI::TCC_Free);
14948
14949 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14950 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14951 ? CmpInst::BAD_FCMP_PREDICATE
14952 : CmpInst::BAD_ICMP_PREDICATE;
14953 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14954 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14955 !match(VI, MatchCmp)) ||
14956 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14957 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14958 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14959 ? CmpInst::BAD_FCMP_PREDICATE
14960 : CmpInst::BAD_ICMP_PREDICATE;
14961
14962 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14963 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14964 CostKind, getOperandInfo(VI->getOperand(0)),
14965 getOperandInfo(VI->getOperand(1)), VI);
14966 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14967 if (IntrinsicCost.isValid())
14968 ScalarCost = IntrinsicCost;
14969
14970 return ScalarCost;
14971 };
14972 auto GetVectorCost = [&](InstructionCost CommonCost) {
14973 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14974
14975 InstructionCost VecCost =
14976 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14977 CostKind, getOperandInfo(E->getOperand(0)),
14978 getOperandInfo(E->getOperand(1)), VL0);
14979 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14980 auto *CondType =
14981 getWidenedType(SI->getCondition()->getType(), VL.size());
14982 unsigned CondNumElements = CondType->getNumElements();
14983 unsigned VecTyNumElements = getNumElements(VecTy);
14984 assert(VecTyNumElements >= CondNumElements &&
14985 VecTyNumElements % CondNumElements == 0 &&
14986 "Cannot vectorize Instruction::Select");
14987 if (CondNumElements != VecTyNumElements) {
14988 // When the return type is i1 but the source is fixed vector type, we
14989 // need to duplicate the condition value.
14990 VecCost += ::getShuffleCost(
14991 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14992 createReplicatedMask(VecTyNumElements / CondNumElements,
14993 CondNumElements));
14994 }
14995 }
14996 return VecCost + CommonCost;
14997 };
14998 return GetCostDiff(GetScalarCost, GetVectorCost);
14999 }
15000 case TreeEntry::MinMax: {
15001 auto GetScalarCost = [&](unsigned Idx) {
15002 return GetMinMaxCost(OrigScalarTy);
15003 };
15004 auto GetVectorCost = [&](InstructionCost CommonCost) {
15005 InstructionCost VecCost = GetMinMaxCost(VecTy);
15006 return VecCost + CommonCost;
15007 };
15008 return GetCostDiff(GetScalarCost, GetVectorCost);
15009 }
15010 case TreeEntry::FMulAdd: {
15011 auto GetScalarCost = [&](unsigned Idx) {
15012 if (isa<PoisonValue>(UniqueValues[Idx]))
15013 return InstructionCost(TTI::TCC_Free);
15014 return GetFMulAddCost(E->getOperations(),
15015 cast<Instruction>(UniqueValues[Idx]));
15016 };
15017 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15018 FastMathFlags FMF;
15019 FMF.set();
15020 for (Value *V : E->Scalars) {
15021 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
15022 FMF &= FPCI->getFastMathFlags();
15023 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
15024 FMF &= FPCIOp->getFastMathFlags();
15025 }
15026 }
15027 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15028 {VecTy, VecTy, VecTy}, FMF);
15029 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15030 return VecCost + CommonCost;
15031 };
15032 return GetCostDiff(GetScalarCost, GetVectorCost);
15033 }
15034 case Instruction::FNeg:
15035 case Instruction::Add:
15036 case Instruction::FAdd:
15037 case Instruction::Sub:
15038 case Instruction::FSub:
15039 case Instruction::Mul:
15040 case Instruction::FMul:
15041 case Instruction::UDiv:
15042 case Instruction::SDiv:
15043 case Instruction::FDiv:
15044 case Instruction::URem:
15045 case Instruction::SRem:
15046 case Instruction::FRem:
15047 case Instruction::Shl:
15048 case Instruction::LShr:
15049 case Instruction::AShr:
15050 case Instruction::And:
15051 case Instruction::Or:
15052 case Instruction::Xor: {
15053 auto GetScalarCost = [&](unsigned Idx) {
15054 if (isa<PoisonValue>(UniqueValues[Idx]))
15055 return InstructionCost(TTI::TCC_Free);
15056
15057 // We cannot retrieve the operand from UniqueValues[Idx] because an
15058 // interchangeable instruction may be used. The order and the actual
15059 // operand might differ from what is retrieved from UniqueValues[Idx].
15060 Value *Op1 = E->getOperand(0)[Idx];
15061 Value *Op2;
15062 SmallVector<const Value *, 2> Operands(1, Op1);
15063 if (isa<UnaryOperator>(UniqueValues[Idx])) {
15064 Op2 = Op1;
15065 } else {
15066 Op2 = E->getOperand(1)[Idx];
15067 Operands.push_back(Op2);
15068 }
15069 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
15070 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
15071 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15072 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
15073 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
15074 I && (ShuffleOrOp == Instruction::FAdd ||
15075 ShuffleOrOp == Instruction::FSub)) {
15076 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15077 if (IntrinsicCost.isValid())
15078 ScalarCost = IntrinsicCost;
15079 }
15080 return ScalarCost;
15081 };
15082 auto GetVectorCost = [=](InstructionCost CommonCost) {
15083 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15084 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15085 ArrayRef<Value *> Ops = E->getOperand(I);
15086 if (all_of(Ops, [&](Value *Op) {
15087 auto *CI = dyn_cast<ConstantInt>(Op);
15088 return CI && CI->getValue().countr_one() >= It->second.first;
15089 }))
15090 return CommonCost;
15091 }
15092 }
15093 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
15094 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
15095 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
15096 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
15097 Op2Info, {}, nullptr, TLI) +
15098 CommonCost;
15099 };
15100 return GetCostDiff(GetScalarCost, GetVectorCost);
15101 }
15102 case Instruction::GetElementPtr: {
15103 return CommonCost + GetGEPCostDiff(VL, VL0);
15104 }
15105 case Instruction::Load: {
15106 auto GetScalarCost = [&](unsigned Idx) {
15107 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
15108 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15109 VI->getAlign(), VI->getPointerAddressSpace(),
15110 CostKind, TTI::OperandValueInfo(), VI);
15111 };
15112 auto *LI0 = cast<LoadInst>(VL0);
15113 auto GetVectorCost = [&](InstructionCost CommonCost) {
15114 InstructionCost VecLdCost;
15115 switch (E->State) {
15116 case TreeEntry::Vectorize:
15117 if (unsigned Factor = E->getInterleaveFactor()) {
15118 VecLdCost = TTI->getInterleavedMemoryOpCost(
15119 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15120 LI0->getPointerAddressSpace(), CostKind);
15121
15122 } else {
15123 VecLdCost = TTI->getMemoryOpCost(
15124 Instruction::Load, VecTy, LI0->getAlign(),
15125 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15126 }
15127 break;
15128 case TreeEntry::StridedVectorize: {
15129 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
15130 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15131 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15132 Align CommonAlignment =
15133 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15134 VecLdCost = TTI->getStridedMemoryOpCost(
15135 Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
15136 /*VariableMask=*/false, CommonAlignment, CostKind);
15137 if (StridedLoadTy != VecTy)
15138 VecLdCost +=
15139 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15140 getCastContextHint(*E), CostKind);
15141
15142 break;
15143 }
15144 case TreeEntry::CompressVectorize: {
15145 bool IsMasked;
15146 unsigned InterleaveFactor;
15147 SmallVector<int> CompressMask;
15148 VectorType *LoadVecTy;
15149 SmallVector<Value *> Scalars(VL);
15150 if (!E->ReorderIndices.empty()) {
15151 SmallVector<int> Mask(E->ReorderIndices.begin(),
15152 E->ReorderIndices.end());
15153 reorderScalars(Scalars, Mask);
15154 }
15155 SmallVector<Value *> PointerOps(Scalars.size());
15156 for (auto [I, V] : enumerate(Scalars))
15157 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15158 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15159 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15160 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15161 CompressMask, LoadVecTy);
15162 assert(IsVectorized && "Failed to vectorize load");
15163 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15164 InterleaveFactor, IsMasked);
15165 Align CommonAlignment = LI0->getAlign();
15166 if (InterleaveFactor) {
15167 VecLdCost = TTI->getInterleavedMemoryOpCost(
15168 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15169 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15170 } else if (IsMasked) {
15171 VecLdCost = TTI->getMaskedMemoryOpCost(
15172 {Intrinsic::masked_load, LoadVecTy, CommonAlignment,
15173 LI0->getPointerAddressSpace()},
15174 CostKind);
15175 // TODO: include this cost into CommonCost.
15176 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15177 LoadVecTy, CompressMask, CostKind);
15178 } else {
15179 VecLdCost = TTI->getMemoryOpCost(
15180 Instruction::Load, LoadVecTy, CommonAlignment,
15181 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15182 // TODO: include this cost into CommonCost.
15183 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15184 LoadVecTy, CompressMask, CostKind);
15185 }
15186 break;
15187 }
15188 case TreeEntry::ScatterVectorize: {
15189 Align CommonAlignment =
15190 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15191 VecLdCost = TTI->getGatherScatterOpCost(
15192 Instruction::Load, VecTy, LI0->getPointerOperand(),
15193 /*VariableMask=*/false, CommonAlignment, CostKind);
15194 break;
15195 }
15196 case TreeEntry::CombinedVectorize:
15197 case TreeEntry::SplitVectorize:
15198 case TreeEntry::NeedToGather:
15199 llvm_unreachable("Unexpected vectorization state.");
15200 }
15201 return VecLdCost + CommonCost;
15202 };
15203
15204 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15205 // If this node generates a masked gather load, then it is not a terminal node.
15206 // Hence the address operand cost is estimated separately.
15207 if (E->State == TreeEntry::ScatterVectorize)
15208 return Cost;
15209
15210 // Estimate cost of GEPs since this tree node is a terminator.
15211 SmallVector<Value *> PointerOps(VL.size());
15212 for (auto [I, V] : enumerate(VL))
15213 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15214 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15215 }
15216 case Instruction::Store: {
15217 bool IsReorder = !E->ReorderIndices.empty();
15218 auto GetScalarCost = [=](unsigned Idx) {
15219 auto *VI = cast<StoreInst>(VL[Idx]);
15220 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15221 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15222 VI->getAlign(), VI->getPointerAddressSpace(),
15223 CostKind, OpInfo, VI);
15224 };
15225 auto *BaseSI =
15226 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15227 auto GetVectorCost = [=](InstructionCost CommonCost) {
15228 // We know that we can merge the stores. Calculate the cost.
15229 InstructionCost VecStCost;
15230 if (E->State == TreeEntry::StridedVectorize) {
15231 Align CommonAlignment =
15232 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15233 VecStCost = TTI->getStridedMemoryOpCost(
15234 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
15235 /*VariableMask=*/false, CommonAlignment, CostKind);
15236 } else {
15237 assert(E->State == TreeEntry::Vectorize &&
15238 "Expected either strided or consecutive stores.");
15239 if (unsigned Factor = E->getInterleaveFactor()) {
15240 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15241 "No reused shuffles expected");
15242 CommonCost = 0;
15243 VecStCost = TTI->getInterleavedMemoryOpCost(
15244 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15245 BaseSI->getPointerAddressSpace(), CostKind);
15246 } else {
15247 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15248 VecStCost = TTI->getMemoryOpCost(
15249 Instruction::Store, VecTy, BaseSI->getAlign(),
15250 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15251 }
15252 }
15253 return VecStCost + CommonCost;
15254 };
15255 SmallVector<Value *> PointerOps(VL.size());
15256 for (auto [I, V] : enumerate(VL)) {
15257 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15258 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15259 }
15260
15261 return GetCostDiff(GetScalarCost, GetVectorCost) +
15262 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15263 }
15264 case Instruction::Call: {
15265 auto GetScalarCost = [&](unsigned Idx) {
15266 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15267 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15268 if (ID != Intrinsic::not_intrinsic) {
15269 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15270 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15271 }
15272 return TTI->getCallInstrCost(CI->getCalledFunction(),
15273 CI->getFunctionType()->getReturnType(),
15274 CI->getFunctionType()->params(), CostKind);
15275 };
15276 auto GetVectorCost = [=](InstructionCost CommonCost) {
15277 auto *CI = cast<CallInst>(VL0);
15278 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15279 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15280 CI, ID, VecTy->getNumElements(),
15281 It != MinBWs.end() ? It->second.first : 0, TTI);
15282 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15283 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15284 };
15285 return GetCostDiff(GetScalarCost, GetVectorCost);
15286 }
15287 case Instruction::ShuffleVector: {
15288 if (!SLPReVec || E->isAltShuffle())
15289 assert(E->isAltShuffle() &&
15290 ((Instruction::isBinaryOp(E->getOpcode()) &&
15291 Instruction::isBinaryOp(E->getAltOpcode())) ||
15292 (Instruction::isCast(E->getOpcode()) &&
15293 Instruction::isCast(E->getAltOpcode())) ||
15294 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15295 "Invalid Shuffle Vector Operand");
15296 // Try to find the previous shuffle node with the same operands and same
15297 // main/alternate ops.
15298 auto TryFindNodeWithEqualOperands = [=]() {
15299 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15300 if (TE.get() == E)
15301 break;
15302 if (TE->hasState() && TE->isAltShuffle() &&
15303 ((TE->getOpcode() == E->getOpcode() &&
15304 TE->getAltOpcode() == E->getAltOpcode()) ||
15305 (TE->getOpcode() == E->getAltOpcode() &&
15306 TE->getAltOpcode() == E->getOpcode())) &&
15307 TE->hasEqualOperands(*E))
15308 return true;
15309 }
15310 return false;
15311 };
15312 auto GetScalarCost = [&](unsigned Idx) {
15313 if (isa<PoisonValue>(UniqueValues[Idx]))
15314 return InstructionCost(TTI::TCC_Free);
15315
15315
15316 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15317 assert(E->getMatchingMainOpOrAltOp(VI) &&
15318 "Unexpected main/alternate opcode");
15319 (void)E;
15320 return TTI->getInstructionCost(VI, CostKind);
15321 };
15322 // Need to clear CommonCost since the final shuffle cost is included into
15323 // vector cost.
15324 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15325 // VecCost is equal to sum of the cost of creating 2 vectors
15326 // and the cost of creating shuffle.
15327 InstructionCost VecCost = 0;
15328 if (TryFindNodeWithEqualOperands()) {
15329 LLVM_DEBUG({
15330 dbgs() << "SLP: diamond match for alternate node found.\n";
15331 E->dump();
15332 });
15333 // No need to add new vector costs here since we're going to reuse
15334 // same main/alternate vector ops, just do different shuffling.
15335 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15336 VecCost =
15337 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15338 VecCost +=
15339 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15340 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15341 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15342 VecCost = TTIRef.getCmpSelInstrCost(
15343 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15344 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15345 VL0);
15346 VecCost += TTIRef.getCmpSelInstrCost(
15347 E->getOpcode(), VecTy, MaskTy,
15348 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15349 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15350 E->getAltOp());
15351 } else {
15352 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15353 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15354 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15355 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15356 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15357 unsigned SrcBWSz =
15358 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15359 if (SrcIt != MinBWs.end()) {
15360 SrcBWSz = SrcIt->second.first;
15361 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15362 SrcTy = getWidenedType(SrcSclTy, VL.size());
15363 }
15364 if (BWSz <= SrcBWSz) {
15365 if (BWSz < SrcBWSz)
15366 VecCost =
15367 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15368 TTI::CastContextHint::None, CostKind);
15369 LLVM_DEBUG({
15370 dbgs()
15371 << "SLP: alternate extension, which should be truncated.\n";
15372 E->dump();
15373 });
15374 return VecCost;
15375 }
15376 }
15377 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15378 TTI::CastContextHint::None, CostKind);
15379 VecCost +=
15380 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15381 TTI::CastContextHint::None, CostKind);
15382 }
15383 SmallVector<int> Mask;
15384 E->buildAltOpShuffleMask(
15385 [&](Instruction *I) {
15386 assert(E->getMatchingMainOpOrAltOp(I) &&
15387 "Unexpected main/alternate opcode");
15388 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15389 *TLI);
15390 },
15391 Mask);
15392 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15393 FinalVecTy, Mask, CostKind);
15394 // Patterns like [fadd,fsub] can be combined into a single instruction
15395 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15396 // need to take into account their order when looking for the most used
15397 // order.
15398 unsigned Opcode0 = E->getOpcode();
15399 unsigned Opcode1 = E->getAltOpcode();
15400 SmallBitVector OpcodeMask(
15401 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15402 // If this pattern is supported by the target then we consider the
15403 // order.
15404 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15405 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15406 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15407 return AltVecCost < VecCost ? AltVecCost : VecCost;
15408 }
15409 // TODO: Check the reverse order too.
15410 return VecCost;
15411 };
15412 if (SLPReVec && !E->isAltShuffle())
15413 return GetCostDiff(
15414 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15415 // If a group uses mask in order, the shufflevector can be
15416 // eliminated by instcombine. Then the cost is 0.
15417 assert(isa<ShuffleVectorInst>(VL.front()) &&
15418 "Not supported shufflevector usage.");
15419 auto *SV = cast<ShuffleVectorInst>(VL.front());
15420 unsigned SVNumElements =
15421 cast<FixedVectorType>(SV->getOperand(0)->getType())
15422 ->getNumElements();
15423 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15424 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15425 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15426 int NextIndex = 0;
15427 if (!all_of(Group, [&](Value *V) {
15428 assert(isa<ShuffleVectorInst>(V) &&
15429 "Not supported shufflevector usage.");
15430 auto *SV = cast<ShuffleVectorInst>(V);
15431 int Index;
15432 [[maybe_unused]] bool IsExtractSubvectorMask =
15433 SV->isExtractSubvectorMask(Index);
15434 assert(IsExtractSubvectorMask &&
15435 "Not supported shufflevector usage.");
15436 if (NextIndex != Index)
15437 return false;
15438 NextIndex += SV->getShuffleMask().size();
15439 return true;
15440 }))
15441 return ::getShuffleCost(
15442 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15443 calculateShufflevectorMask(E->Scalars));
15444 }
15445 return TTI::TCC_Free;
15446 });
15447 return GetCostDiff(GetScalarCost, GetVectorCost);
15448 }
15449 case Instruction::Freeze:
15450 return CommonCost;
15451 default:
15452 llvm_unreachable("Unknown instruction");
15453 }
15454}
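// For illustration (a hedged sketch; the exact numbers depend on the target's
// TTI, and the bundle below is hypothetical): each case above builds a
// GetScalarCost/GetVectorCost pair and returns their difference, so for a
// bundle of four i32 adds the returned value is roughly
//   (vector add cost + CommonCost) - (4 * scalar add cost),
// i.e. a negative result means the vectorized form of this entry is expected
// to be cheaper than keeping the four scalar instructions.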
15455
15456bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15457 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15458 << VectorizableTree.size() << " is fully vectorizable.\n");
15459
15460 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15461 SmallVector<int> Mask;
15462 return TE->isGather() &&
15463 !any_of(TE->Scalars,
15464 [this](Value *V) { return EphValues.contains(V); }) &&
15465 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15466 TE->Scalars.size() < Limit ||
15467 (((TE->hasState() &&
15468 TE->getOpcode() == Instruction::ExtractElement) ||
15469 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15470 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15471 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15472 !TE->isAltShuffle()) ||
15473 any_of(TE->Scalars, IsaPred<LoadInst>));
15474 };
15475
15476 // We only handle trees of heights 1 and 2.
15477 if (VectorizableTree.size() == 1 &&
15478 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15479 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15480 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15481 (ForReduction &&
15482 AreVectorizableGathers(VectorizableTree[0].get(),
15483 VectorizableTree[0]->Scalars.size()) &&
15484 VectorizableTree[0]->getVectorFactor() > 2)))
15485 return true;
15486
15487 if (VectorizableTree.size() != 2)
15488 return false;
15489
15490 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15491 // whose second node is a gather, if it has fewer scalar operands than the
15492 // initial tree element (it may be profitable to shuffle the second gather)
15493 // or its scalars are extractelements, which form a shuffle.
15494 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15495 AreVectorizableGathers(VectorizableTree[1].get(),
15496 VectorizableTree[0]->Scalars.size()))
15497 return true;
15498
15499 // Gathering cost would be too much for tiny trees.
15500 if (VectorizableTree[0]->isGather() ||
15501 (VectorizableTree[1]->isGather() &&
15502 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15503 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15504 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15505 return false;
15506
15507 return true;
15508}
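// For illustration (hypothetical, simplified tree): a two-entry graph such as
//   entry 0: four consecutive stores        (TreeEntry::Vectorize)
//   entry 1: gather of {1, 1, 1, 1}         (splat / all-constant operands)
// passes the checks above, while a tree whose root entry is itself a gather is
// rejected because the gathering cost would dominate such a tiny tree.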
15509
15510 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15511 TargetTransformInfo *TTI,
15512 bool MustMatchOrInst) {
15513 // Look past the root to find a source value. Arbitrarily follow the
15514 // path through operand 0 of any 'or'. Also, peek through optional
15515 // shift-left-by-multiple-of-8-bits.
15516 Value *ZextLoad = Root;
15517 const APInt *ShAmtC;
15518 bool FoundOr = false;
15519 while (!isa<ConstantExpr>(ZextLoad) &&
15520 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15521 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15522 ShAmtC->urem(8) == 0))) {
15523 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15524 ZextLoad = BinOp->getOperand(0);
15525 if (BinOp->getOpcode() == Instruction::Or)
15526 FoundOr = true;
15527 }
15528 // Check if the input is an extended load of the required or/shift expression.
15529 Value *Load;
15530 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15531 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15532 return false;
15533
15534 // Require that the total load bit width is a legal integer type.
15535 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15536 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15537 Type *SrcTy = Load->getType();
15538 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15539 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15540 return false;
15541
15542 // Everything matched - assume that we can fold the whole sequence using
15543 // load combining.
15544 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15545 << *(cast<Instruction>(Root)) << "\n");
15546
15547 return true;
15548}
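// For illustration (hypothetical IR): a 4-byte load-combine candidate looks
// roughly like
//   %z0 = zext i8 %l0 to i32
//   %z1 = zext i8 %l1 to i32
//   %s1 = shl i32 %z1, 8
//   %o1 = or i32 %s1, %z0
// Starting from the root 'or', the walk above follows operand 0 through the
// or/shl chain (shift amounts must be multiples of 8), expects to reach a
// zext of a load, and finally requires that the combined width (8 bits * 4
// elements = i32 here) is a legal integer type for the target.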
15549
15550 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15551 if (RdxKind != RecurKind::Or)
15552 return false;
15553
15554 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15555 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15556 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15557 /* MatchOr */ false);
15558}
15559
15560 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15561 // Peek through a final sequence of stores and check if all operations are
15562 // likely to be load-combined.
15563 unsigned NumElts = Stores.size();
15564 for (Value *Scalar : Stores) {
15565 Value *X;
15566 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15567 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15568 return false;
15569 }
15570 return true;
15571}
15572
15573bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15574 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15575 return true;
15576
15577 // Graph is empty - do nothing.
15578 if (VectorizableTree.empty()) {
15579 assert(ExternalUses.empty() && "We shouldn't have any external users");
15580
15581 return true;
15582 }
15583
15584 // No need to vectorize inserts of gathered values.
15585 if (VectorizableTree.size() == 2 &&
15586 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15587 VectorizableTree[1]->isGather() &&
15588 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15589 !(isSplat(VectorizableTree[1]->Scalars) ||
15590 allConstant(VectorizableTree[1]->Scalars))))
15591 return true;
15592
15593 // If the graph includes only PHI nodes and gathers, it is definitely not
15594 // profitable for the vectorization, we can skip it, if the cost threshold is
15595 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15596 // gathers/buildvectors.
15597 constexpr int Limit = 4;
15598 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15599 !VectorizableTree.empty() &&
15600 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15601 return (TE->isGather() &&
15602 (!TE->hasState() ||
15603 TE->getOpcode() != Instruction::ExtractElement) &&
15604 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15605 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15606 }))
15607 return true;
15608
15609 // Do not vectorize small tree of phis only, if all vector phis are also
15610 // gathered.
15611 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15612 VectorizableTree.size() <= Limit &&
15613 all_of(VectorizableTree,
15614 [&](const std::unique_ptr<TreeEntry> &TE) {
15615 return (TE->isGather() &&
15616 (!TE->hasState() ||
15617 TE->getOpcode() != Instruction::ExtractElement) &&
15618 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15619 Limit) ||
15620 (TE->hasState() &&
15621 (TE->getOpcode() == Instruction::InsertElement ||
15622 (TE->getOpcode() == Instruction::PHI &&
15623 all_of(TE->Scalars, [&](Value *V) {
15624 return isa<PoisonValue>(V) || MustGather.contains(V);
15625 }))));
15626 }) &&
15627 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15628 return TE->State == TreeEntry::Vectorize &&
15629 TE->getOpcode() == Instruction::PHI;
15630 }))
15631 return true;
15632
15633 // If the tree contains only phis, buildvectors, split nodes and
15634 // small nodes with reuses, we can skip it.
15635 SmallVector<const TreeEntry *> StoreLoadNodes;
15636 unsigned NumGathers = 0;
15637 constexpr int LimitTreeSize = 36;
15638 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15639 all_of(VectorizableTree,
15640 [&](const std::unique_ptr<TreeEntry> &TE) {
15641 if (!TE->isGather() && TE->hasState() &&
15642 (TE->getOpcode() == Instruction::Load ||
15643 TE->getOpcode() == Instruction::Store)) {
15644 StoreLoadNodes.push_back(TE.get());
15645 return true;
15646 }
15647 if (TE->isGather())
15648 ++NumGathers;
15649 return TE->State == TreeEntry::SplitVectorize ||
15650 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15651 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15652 VectorizableTree.size() > LimitTreeSize) ||
15653 (TE->isGather() &&
15654 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15655 (TE->hasState() &&
15656 (TE->getOpcode() == Instruction::PHI ||
15657 (TE->hasCopyableElements() &&
15658 static_cast<unsigned>(count_if(
15659 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15660 TE->Scalars.size() / 2) ||
15661 ((!TE->ReuseShuffleIndices.empty() ||
15662 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15663 TE->Scalars.size() == 2)));
15664 }) &&
15665 (StoreLoadNodes.empty() ||
15666 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15667 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15668 return TE->getOpcode() == Instruction::Store ||
15669 all_of(TE->Scalars, [&](Value *V) {
15670 return !isa<LoadInst>(V) ||
15671 areAllUsersVectorized(cast<Instruction>(V));
15672 });
15673 })))))
15674 return true;
15675
15676 // If the tree contains only buildvector, 2 non-buildvectors (with root user
15677 // tree node) and other buildvectors, we can skip it.
15678 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15679 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15680 VectorizableTree.size() >= Limit &&
15681 count_if(ArrayRef(VectorizableTree).drop_front(),
15682 [&](const std::unique_ptr<TreeEntry> &TE) {
15683 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15684 TE->UserTreeIndex.UserTE->Idx == 0;
15685 }) == 2)
15686 return true;
15687
15688 // If the tree contains only vectorization of the phi node from the
15689 // buildvector - skip it.
15690 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15691 VectorizableTree.size() > 2 &&
15692 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15693 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15694 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15695 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15696 all_of(
15697 ArrayRef(VectorizableTree).drop_front(2),
15698 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15699 return true;
15700
15701 // We can vectorize the tree if its size is greater than or equal to the
15702 // minimum size specified by the MinTreeSize command line option.
15703 if (VectorizableTree.size() >= MinTreeSize)
15704 return false;
15705
15706 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15707 // can vectorize it if we can prove it fully vectorizable.
15708 if (isFullyVectorizableTinyTree(ForReduction))
15709 return false;
15710
15711 // Check if any of the gather nodes forms an insertelement buildvector
15712 // somewhere.
15713 bool IsAllowedSingleBVNode =
15714 VectorizableTree.size() > 1 ||
15715 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15716 !VectorizableTree.front()->isAltShuffle() &&
15717 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15718 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15719 allSameBlock(VectorizableTree.front()->Scalars));
15720 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15721 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15722 return isa<ExtractElementInst, Constant>(V) ||
15723 (IsAllowedSingleBVNode &&
15724 !V->hasNUsesOrMore(UsesLimit) &&
15725 any_of(V->users(), IsaPred<InsertElementInst>));
15726 });
15727 }))
15728 return false;
15729
15730 if (VectorizableTree.back()->isGather() &&
15731 VectorizableTree.back()->hasState() &&
15732 VectorizableTree.back()->isAltShuffle() &&
15733 VectorizableTree.back()->getVectorFactor() > 2 &&
15734 allSameBlock(VectorizableTree.back()->Scalars) &&
15735 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15736 TTI->getScalarizationOverhead(
15737 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15738 VectorizableTree.back()->getVectorFactor()),
15739 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15740 /*Insert=*/true, /*Extract=*/false,
15742 return false;
15743
15744 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15745 // vectorizable.
15746 return true;
15747}
15748
15749 bool BoUpSLP::isTreeNotExtendable() const {
15750 if (getCanonicalGraphSize() != getTreeSize()) {
15751 constexpr unsigned SmallTree = 3;
15752 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15753 getCanonicalGraphSize() <= SmallTree &&
15754 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15755 [](const std::unique_ptr<TreeEntry> &TE) {
15756 return TE->isGather() && TE->hasState() &&
15757 TE->getOpcode() == Instruction::Load &&
15758 !allSameBlock(TE->Scalars);
15759 }) == 1)
15760 return true;
15761 return false;
15762 }
15763 bool Res = false;
15764 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15765 TreeEntry &E = *VectorizableTree[Idx];
15766 if (E.State == TreeEntry::SplitVectorize)
15767 return false;
15768 if (!E.isGather())
15769 continue;
15770 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15771 (!E.hasState() &&
15773 (isa<ExtractElementInst>(E.Scalars.front()) &&
15774 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15775 return false;
15776 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15777 continue;
15778 Res = true;
15779 }
15780 return Res;
15781}
15782
15783 InstructionCost BoUpSLP::getSpillCost() {
15784 // Walk from the bottom of the tree to the top, tracking which values are
15785 // live. When we see a call instruction that is not part of our tree,
15786 // query TTI to see if there is a cost to keeping values live over it
15787 // (for example, if spills and fills are required).
15788
15789 const TreeEntry *Root = VectorizableTree.front().get();
15790 if (Root->isGather())
15791 return 0;
15792
15793 InstructionCost Cost = 0;
15794 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15795 EntriesToOperands;
15796 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15797 SmallPtrSet<const Instruction *, 8> LastInstructions;
15798 for (const auto &TEPtr : VectorizableTree) {
15799 if (!TEPtr->isGather()) {
15800 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15801 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15802 LastInstructions.insert(LastInst);
15803 }
15804 if (TEPtr->UserTreeIndex)
15805 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15806 }
15807
15808 auto NoCallIntrinsic = [this](const Instruction *I) {
15809 const auto *II = dyn_cast<IntrinsicInst>(I);
15810 if (!II)
15811 return false;
15812 if (II->isAssumeLikeIntrinsic())
15813 return true;
15814 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15815 InstructionCost IntrCost =
15816 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15817 InstructionCost CallCost = TTI->getCallInstrCost(
15818 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15819 return IntrCost < CallCost;
15820 };
15821
15822 // Maps the last instruction of an entry to the last instruction of one of
15823 // its operand entries plus a flag. If the flag is true, there are no calls
15824 // in between these instructions.
15825 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15826 CheckedInstructions;
15827 unsigned Budget = 0;
15828 const unsigned BudgetLimit =
15829 ScheduleRegionSizeBudget / VectorizableTree.size();
15830 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15831 const Instruction *Last) {
15832 assert(First->getParent() == Last->getParent() &&
15833 "Expected instructions in same block.");
15834 if (auto It = CheckedInstructions.find(Last);
15835 It != CheckedInstructions.end()) {
15836 const Instruction *Checked = It->second.getPointer();
15837 if (Checked == First || Checked->comesBefore(First))
15838 return It->second.getInt() != 0;
15839 Last = Checked;
15840 } else if (Last == First || Last->comesBefore(First)) {
15841 return true;
15842 }
15843 BasicBlock::reverse_iterator InstIt =
15844 ++First->getIterator().getReverse(),
15845 PrevInstIt =
15846 Last->getIterator().getReverse();
15847 SmallVector<const Instruction *> LastInstsInRange;
15848 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15849 // Debug information does not impact spill cost.
15850 // Vectorized calls, represented as vector intrinsics, do not impact spill
15851 // cost.
15852 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15853 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15854 for (const Instruction *LastInst : LastInstsInRange)
15855 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15856 return false;
15857 }
15858 if (LastInstructions.contains(&*PrevInstIt))
15859 LastInstsInRange.push_back(&*PrevInstIt);
15860
15861 ++PrevInstIt;
15862 ++Budget;
15863 }
15864 for (const Instruction *LastInst : LastInstsInRange)
15865 CheckedInstructions.try_emplace(
15866 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15867 Budget <= BudgetLimit ? 1 : 0);
15868 return Budget <= BudgetLimit;
15869 };
15870 auto AddCosts = [&](const TreeEntry *Op) {
15871 Type *ScalarTy = Op->Scalars.front()->getType();
15872 auto It = MinBWs.find(Op);
15873 if (It != MinBWs.end())
15874 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15875 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15876 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15877 if (ScalarTy->isVectorTy()) {
15878 // Handle revec dead vector instructions.
15879 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15880 }
15881 };
15882 // Memoize the relationship between blocks, i.e. whether there is (at least
15883 // one) non-vectorized call between the blocks. This allows skipping the
15884 // analysis of the same block paths multiple times.
15885 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15886 ParentOpParentToPreds;
15887 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15888 BasicBlock *OpParent) {
15889 auto Key = std::make_pair(Root, OpParent);
15890 if (auto It = ParentOpParentToPreds.find(Key);
15891 It != ParentOpParentToPreds.end())
15892 return It->second;
15893 SmallVector<BasicBlock *> Worklist;
15894 if (Pred)
15895 Worklist.push_back(Pred);
15896 else
15897 Worklist.append(pred_begin(Root), pred_end(Root));
15898 SmallPtrSet<const BasicBlock *, 8> Visited;
15899 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 8>
15900 ParentsPairsToAdd;
15901 bool Res = false;
15902 auto Cleanup = make_scope_exit([&]() {
15903 for (const auto &KeyPair : ParentsPairsToAdd) {
15904 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15905 "Should not have been added before.");
15906 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15907 }
15908 });
15909 while (!Worklist.empty()) {
15910 BasicBlock *BB = Worklist.pop_back_val();
15911 if (BB == OpParent || !Visited.insert(BB).second)
15912 continue;
15913 auto Pair = std::make_pair(BB, OpParent);
15914 if (auto It = ParentOpParentToPreds.find(Pair);
15915 It != ParentOpParentToPreds.end()) {
15916 Res = It->second;
15917 return Res;
15918 }
15919 ParentsPairsToAdd.insert(Pair);
15920 unsigned BlockSize = BB->size();
15921 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15922 return Res;
15923 Budget += BlockSize;
15924 if (Budget > BudgetLimit)
15925 return Res;
15926 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15927 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15928 BB->getTerminator()))
15929 return Res;
15930 Worklist.append(pred_begin(BB), pred_end(BB));
15931 }
15932 Res = true;
15933 return Res;
15934 };
15935 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15936 while (!LiveEntries.empty()) {
15937 const TreeEntry *Entry = LiveEntries.pop_back_val();
15938 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15939 if (Operands.empty())
15940 continue;
15941 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15942 BasicBlock *Parent = LastInst->getParent();
15943 for (const TreeEntry *Op : Operands) {
15944 if (!Op->isGather())
15945 LiveEntries.push_back(Op);
15946 if (Entry->State == TreeEntry::SplitVectorize ||
15947 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15948 (Op->isGather() && allConstant(Op->Scalars)))
15949 continue;
15950 Budget = 0;
15951 BasicBlock *Pred = nullptr;
15952 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15953 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15954 BasicBlock *OpParent;
15955 Instruction *OpLastInst;
15956 if (Op->isGather()) {
15957 assert(Entry->getOpcode() == Instruction::PHI &&
15958 "Expected phi node only.");
15959 OpParent = cast<PHINode>(Entry->getMainOp())
15960 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15961 OpLastInst = OpParent->getTerminator();
15962 for (Value *V : Op->Scalars) {
15963 auto *Inst = dyn_cast<Instruction>(V);
15964 if (!Inst)
15965 continue;
15966 if (isVectorized(V)) {
15967 OpParent = Inst->getParent();
15968 OpLastInst = Inst;
15969 break;
15970 }
15971 }
15972 } else {
15973 OpLastInst = EntriesToLastInstruction.at(Op);
15974 OpParent = OpLastInst->getParent();
15975 }
15976 // Check the call instructions within the same basic blocks.
15977 if (OpParent == Parent) {
15978 if (Entry->getOpcode() == Instruction::PHI) {
15979 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15980 AddCosts(Op);
15981 continue;
15982 }
15983 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15984 AddCosts(Op);
15985 continue;
15986 }
15987 // Check for call instruction in between blocks.
15988 // 1. Check entry's block to the head.
15989 if (Entry->getOpcode() != Instruction::PHI &&
15990 !CheckForNonVecCallsInSameBlock(
15991 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15992 LastInst)) {
15993 AddCosts(Op);
15994 continue;
15995 }
15996 // 2. Check op's block from the end.
15997 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15998 OpParent->getTerminator())) {
15999 AddCosts(Op);
16000 continue;
16001 }
16002 // 3. Check the predecessors of entry's block till op's block.
16003 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16004 AddCosts(Op);
16005 continue;
16006 }
16007 }
16008 }
16009
16010 return Cost;
16011}
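// For illustration (hypothetical IR; @opaque is a made-up external function):
// if a vectorized operand entry is defined before a call that is neither an
// assume-like intrinsic nor a vectorized call, and its user entry only comes
// after it, e.g.
//   %v = load <4 x float>, ptr %p
//   call void @opaque()
//   ... user of %v ...
// the walk above fails the CheckForNonVecCallsInSameBlock test and charges
// TTI->getCostOfKeepingLiveOverCall(<4 x float>) for that operand.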
16012
16013 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
16014 /// the buildvector sequence.
16015 static bool isFirstInsertElement(const InsertElementInst *IE1,
16016 const InsertElementInst *IE2) {
16017 if (IE1 == IE2)
16018 return false;
16019 const auto *I1 = IE1;
16020 const auto *I2 = IE2;
16021 const InsertElementInst *PrevI1;
16022 const InsertElementInst *PrevI2;
16023 unsigned Idx1 = *getElementIndex(IE1);
16024 unsigned Idx2 = *getElementIndex(IE2);
16025 do {
16026 if (I2 == IE1)
16027 return true;
16028 if (I1 == IE2)
16029 return false;
16030 PrevI1 = I1;
16031 PrevI2 = I2;
16032 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16033 getElementIndex(I1).value_or(Idx2) != Idx2)
16034 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
16035 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16036 getElementIndex(I2).value_or(Idx1) != Idx1)
16037 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
16038 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16039 llvm_unreachable("Two different buildvectors not expected.");
16040}
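// For illustration (hypothetical IR): in the buildvector chain
//   %i0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %i1 = insertelement <4 x i32> %i0,    i32 %b, i32 1
// isFirstInsertElement(%i0, %i1) returns true, since walking %i1's operand-0
// chain reaches %i0 (i.e. %i0 is followed by %i1 in the sequence), while the
// swapped query isFirstInsertElement(%i1, %i0) returns false.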
16041
16042namespace {
16043/// Returns incoming Value *, if the requested type is Value * too, or a default
16044/// value, otherwise.
16045struct ValueSelect {
16046 template <typename U>
16047 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16048 return V;
16049 }
16050 template <typename U>
16051 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16052 return U();
16053 }
16054};
16055} // namespace
16056
16057/// Does the analysis of the provided shuffle masks and performs the requested
16058/// actions on the vectors with the given shuffle masks. It tries to do it in
16059/// several steps.
16060 /// 1. If the Base vector is not an undef vector, resize the very first mask to
16061 /// the common VF and perform the action for 2 input vectors (including the
16062 /// non-undef Base). Other shuffle masks are combined with the result of the
16063 /// first stage and processed as a shuffle of 2 vectors.
16064 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
16065 /// the action only for 1 vector with the given mask, if it is not the identity
16066 /// mask.
16067 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
16068 /// vectors, combining the masks properly between the steps.
16069 template <typename T>
16070 static T *performExtractsShuffleAction(
16071 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16072 function_ref<unsigned(T *)> GetVF,
16073 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16074 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16075 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16076 SmallVector<int> Mask(ShuffleMask.begin()->second);
16077 auto VMIt = std::next(ShuffleMask.begin());
16078 T *Prev = nullptr;
16079 SmallBitVector UseMask =
16080 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16081 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
16082 if (!IsBaseUndef.all()) {
16083 // Base is not undef, need to combine it with the next subvectors.
16084 std::pair<T *, bool> Res =
16085 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16086 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
16087 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16088 if (Mask[Idx] == PoisonMaskElem)
16089 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16090 else
16091 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16092 }
16093 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16094 assert((!V || GetVF(V) == Mask.size()) &&
16095 "Expected base vector of VF number of elements.");
16096 Prev = Action(Mask, {nullptr, Res.first});
16097 } else if (ShuffleMask.size() == 1) {
16098 // Base is undef and only 1 vector is shuffled - perform the action only for
16099 // single vector, if the mask is not the identity mask.
16100 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16101 /*ForSingleMask=*/true);
16102 if (Res.second)
16103 // Identity mask is found.
16104 Prev = Res.first;
16105 else
16106 Prev = Action(Mask, {ShuffleMask.begin()->first});
16107 } else {
16108 // Base is undef and at least 2 input vectors are shuffled - perform shuffles
16109 // of 2 vectors step by step, combining the shuffles between the steps.
16110 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16111 unsigned Vec2VF = GetVF(VMIt->first);
16112 if (Vec1VF == Vec2VF) {
16113 // No need to resize the input vectors since they are of the same size, we
16114 // can shuffle them directly.
16115 ArrayRef<int> SecMask = VMIt->second;
16116 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16117 if (SecMask[I] != PoisonMaskElem) {
16118 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16119 Mask[I] = SecMask[I] + Vec1VF;
16120 }
16121 }
16122 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16123 } else {
16124 // Vectors of different sizes - resize and reshuffle.
16125 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16126 /*ForSingleMask=*/false);
16127 std::pair<T *, bool> Res2 =
16128 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16129 ArrayRef<int> SecMask = VMIt->second;
16130 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16131 if (Mask[I] != PoisonMaskElem) {
16132 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16133 if (Res1.second)
16134 Mask[I] = I;
16135 } else if (SecMask[I] != PoisonMaskElem) {
16136 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16137 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16138 }
16139 }
16140 Prev = Action(Mask, {Res1.first, Res2.first});
16141 }
16142 VMIt = std::next(VMIt);
16143 }
16144 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16145 // Perform requested actions for the remaining masks/vectors.
16146 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16147 // Shuffle other input vectors, if any.
16148 std::pair<T *, bool> Res =
16149 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16150 ArrayRef<int> SecMask = VMIt->second;
16151 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16152 if (SecMask[I] != PoisonMaskElem) {
16153 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16154 "Multiple uses of scalars.");
16155 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16156 } else if (Mask[I] != PoisonMaskElem) {
16157 Mask[I] = I;
16158 }
16159 }
16160 Prev = Action(Mask, {Prev, Res.first});
16161 }
16162 return Prev;
16163}
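// For illustration (hypothetical masks): with an undef Base and two shuffled
// vectors of the same VF = 4, a first mask {0, 1, poison, poison} and a second
// mask {poison, poison, 2, 3} are merged into {0, 1, 6, 7}; the second
// vector's indices are biased by +VF before Action is invoked once for the
// combined two-source shuffle.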
16164
16165namespace {
16166/// Data type for handling buildvector sequences with the reused scalars from
16167/// other tree entries.
16168template <typename T> struct ShuffledInsertData {
16169 /// List of insertelements to be replaced by shuffles.
16170 SmallVector<InsertElementInst *> InsertElements;
16171 /// The parent vectors and shuffle mask for the given list of inserts.
16172 MapVector<T, SmallVector<int>> ValueMasks;
16173};
16174} // namespace
16175
16176 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
16177 InstructionCost ReductionCost) {
16178 InstructionCost Cost = ReductionCost;
16179 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16180 << VectorizableTree.size() << ".\n");
16181
16182 SmallPtrSet<Value *, 4> CheckedExtracts;
16183 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
16184 TreeEntry &TE = *VectorizableTree[I];
16185 // No need to count the cost for combined entries, they are combined and
16186 // just skip their cost.
16187 if (TE.State == TreeEntry::CombinedVectorize) {
16188 LLVM_DEBUG(
16189 dbgs() << "SLP: Skipping cost for combined node that starts with "
16190 << *TE.Scalars[0] << ".\n";
16191 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16192 continue;
16193 }
16194 if (TE.hasState() &&
16195 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16196 if (const TreeEntry *E =
16197 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16198 E && E->getVectorFactor() == TE.getVectorFactor()) {
16199 // Some gather nodes might be absolutely the same as some vectorizable
16200 // nodes after reordering, need to handle it.
16201 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16202 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16203 << "SLP: Current total cost = " << Cost << "\n");
16204 continue;
16205 }
16206 }
16207
16208 // Exclude cost of gather load nodes which are not used. These nodes were
16209 // built as part of the final attempt to vectorize gathered loads.
16210 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16211 "Expected gather nodes with users only.");
16212
16213 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16214 Cost += C;
16215 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16216 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16217 << "SLP: Current total cost = " << Cost << "\n");
16218 }
16219
16220 if (Cost >= -SLPCostThreshold &&
16221 none_of(ExternalUses, [](const ExternalUser &EU) {
16222 return isa_and_nonnull<InsertElementInst>(EU.User);
16223 }))
16224 return Cost;
16225
16226 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16227 InstructionCost ExtractCost = 0;
16228 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16229 SmallVector<APInt> DemandedElts;
16230 SmallDenseSet<Value *, 4> UsedInserts;
16231 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
16232 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16234 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16235 // Keep track {Scalar, Index, User} tuple.
16236 // On AArch64, this helps in fusing a mov instruction, associated with
16237 // extractelement, with fmul in the backend so that extractelement is free.
16238 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
16239 for (ExternalUser &EU : ExternalUses) {
16240 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16241 }
16242 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16243 for (ExternalUser &EU : ExternalUses) {
16244 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16245 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16246 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16247 else dbgs() << " User: nullptr\n");
16248 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16249
16250 // Uses by ephemeral values are free (because the ephemeral value will be
16251 // removed prior to code generation, and so the extraction will be
16252 // removed as well).
16253 if (EphValues.count(EU.User))
16254 continue;
16255
16256 // Check if the scalar for the given user, or for all users, is already accounted for.
16257 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16258 (EU.User &&
16259 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16260 continue;
16261
16262 // Used in unreachable blocks or in EH pads (rarely executed) or is
16263 // terminated with unreachable instruction.
16264 if (BasicBlock *UserParent =
16265 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16266 UserParent &&
16267 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16268 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16269 continue;
16270
16271 // We only add extract cost once for the same scalar.
16272 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16273 !ExtractCostCalculated.insert(EU.Scalar).second)
16274 continue;
16275
16276 // No extract cost for vector "scalar" if REVEC is disabled
16277 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16278 continue;
16279
16280 // If the found user is an insertelement, do not calculate extract cost but try
16281 // to detect it as a final shuffled/identity match.
16282 // TODO: what if a user is insertvalue when REVEC is enabled?
16283 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16284 VU && VU->getOperand(1) == EU.Scalar) {
16285 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16286 if (!UsedInserts.insert(VU).second)
16287 continue;
16288 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16289 if (InsertIdx) {
16290 const TreeEntry *ScalarTE = &EU.E;
16291 auto *It = find_if(
16292 ShuffledInserts,
16293 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16294 // Checks if 2 insertelements are from the same buildvector.
16295 InsertElementInst *VecInsert = Data.InsertElements.front();
16296 return areTwoInsertFromSameBuildVector(
16297 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16298 Value *Op0 = II->getOperand(0);
16299 if (isVectorized(II) && !isVectorized(Op0))
16300 return nullptr;
16301 return Op0;
16302 });
16303 });
16304 int VecId = -1;
16305 if (It == ShuffledInserts.end()) {
16306 auto &Data = ShuffledInserts.emplace_back();
16307 Data.InsertElements.emplace_back(VU);
16308 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16309 VecId = ShuffledInserts.size() - 1;
16310 auto It = MinBWs.find(ScalarTE);
16311 if (It != MinBWs.end() &&
16312 VectorCasts
16313 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16314 .second) {
16315 unsigned BWSz = It->second.first;
16316 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16317 unsigned VecOpcode;
16318 if (DstBWSz < BWSz)
16319 VecOpcode = Instruction::Trunc;
16320 else
16321 VecOpcode =
16322 It->second.second ? Instruction::SExt : Instruction::ZExt;
16324 InstructionCost C = TTI->getCastInstrCost(
16325 VecOpcode, FTy,
16326 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16327 FTy->getNumElements()),
16329 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16330 << " for extending externally used vector with "
16331 "non-equal minimum bitwidth.\n");
16332 Cost += C;
16333 }
16334 } else {
16335 if (isFirstInsertElement(VU, It->InsertElements.front()))
16336 It->InsertElements.front() = VU;
16337 VecId = std::distance(ShuffledInserts.begin(), It);
16338 }
16339 int InIdx = *InsertIdx;
16340 SmallVectorImpl<int> &Mask =
16341 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16342 if (Mask.empty())
16343 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16344 Mask[InIdx] = EU.Lane;
16345 DemandedElts[VecId].setBit(InIdx);
16346 continue;
16347 }
16348 }
16349 }
16350
16352 // If we plan to rewrite the tree in a smaller type, we will need to sign
16353 // extend the extracted value back to the original type. Here, we account
16354 // for the extract and the added cost of the sign extend if needed.
16355 InstructionCost ExtraCost = TTI::TCC_Free;
16356 auto *ScalarTy = EU.Scalar->getType();
16357 const unsigned BundleWidth = EU.E.getVectorFactor();
16358 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16359 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16360 const TreeEntry *Entry = &EU.E;
16361 auto It = MinBWs.find(Entry);
16362 if (It != MinBWs.end()) {
16363 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16364 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16365 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16366 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16367 ? Instruction::ZExt
16368 : Instruction::SExt;
16369 VecTy = getWidenedType(MinTy, BundleWidth);
16370 ExtraCost =
16371 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16372 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16373 << ExtraCost << "\n");
16374 } else {
16375 ExtraCost =
16376 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16377 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16378 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16379 << *VecTy << ": " << ExtraCost << "\n");
16380 }
16381 // Leave the scalar instructions as is if they are cheaper than extracts.
16382 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16383 Entry->getOpcode() == Instruction::Load) {
16384 // Checks if the user of the external scalar is a phi in a loop body.
16385 auto IsPhiInLoop = [&](const ExternalUser &U) {
16386 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16387 auto *I = cast<Instruction>(U.Scalar);
16388 const Loop *L = LI->getLoopFor(Phi->getParent());
16389 return L && (Phi->getParent() == I->getParent() ||
16390 L == LI->getLoopFor(I->getParent()));
16391 }
16392 return false;
16393 };
16394 if (!ValueToExtUses) {
16395 ValueToExtUses.emplace();
16396 for (const auto &P : enumerate(ExternalUses)) {
16397 // Ignore phis in loops.
16398 if (IsPhiInLoop(P.value()))
16399 continue;
16400
16401 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16402 }
16403 }
16404 // The original instruction can be used if none of its operands are
16405 // vectorized, or if they are already marked as externally used.
16406 auto *Inst = cast<Instruction>(EU.Scalar);
16407 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16408 auto OperandIsScalar = [&](Value *V) {
16409 if (!isVectorized(V)) {
16410 // Some extractelements might not be vectorized but instead be
16411 // transformed into a shuffle and removed from the function;
16412 // account for that here.
16413 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16414 return !EE->hasOneUse() || !MustGather.contains(EE);
16415 return true;
16416 }
16417 return ValueToExtUses->contains(V);
16418 };
16419 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16420 bool CanBeUsedAsScalarCast = false;
16421 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16422 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16423 Op && all_of(Op->operands(), OperandIsScalar)) {
16424 InstructionCost OpCost =
16425 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16426 ? TTI->getInstructionCost(Op, CostKind)
16427 : 0;
16428 if (ScalarCost + OpCost <= ExtraCost) {
16429 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16430 ScalarCost += OpCost;
16431 }
16432 }
16433 }
16434 if (CanBeUsedAsScalar) {
16435 bool KeepScalar = ScalarCost <= ExtraCost;
16436 // Try to keep the original scalar if the user is a phi node from the same
16437 // block as the root phis that are currently being vectorized. This preserves
16438 // better ordering information for the PHIs being vectorized.
16439 bool IsProfitablePHIUser =
16440 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16441 VectorizableTree.front()->Scalars.size() > 2)) &&
16442 VectorizableTree.front()->hasState() &&
16443 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16444 !Inst->hasNUsesOrMore(UsesLimit) &&
16445 none_of(Inst->users(),
16446 [&](User *U) {
16447 auto *PHIUser = dyn_cast<PHINode>(U);
16448 return (!PHIUser ||
16449 PHIUser->getParent() !=
16450 cast<Instruction>(
16451 VectorizableTree.front()->getMainOp())
16452 ->getParent()) &&
16453 !isVectorized(U);
16454 }) &&
16455 count_if(Entry->Scalars, [&](Value *V) {
16456 return ValueToExtUses->contains(V);
16457 }) <= 2;
16458 if (IsProfitablePHIUser) {
16459 KeepScalar = true;
16460 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16461 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16462 (!GatheredLoadsEntriesFirst.has_value() ||
16463 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16464 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16465 return ValueToExtUses->contains(V);
16466 });
16467 auto It = ExtractsCount.find(Entry);
16468 if (It != ExtractsCount.end()) {
16469 assert(ScalarUsesCount >= It->getSecond().size() &&
16470 "Expected total number of external uses not less than "
16471 "number of scalar uses.");
16472 ScalarUsesCount -= It->getSecond().size();
16473 }
16474 // Keep the original scalar if the number of externally used instructions in
16475 // the same entry is not a power of 2. This may enable some extra
16476 // vectorization for now.
16477 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16478 }
16479 if (KeepScalar) {
16480 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16481 for (Value *V : Inst->operands()) {
16482 auto It = ValueToExtUses->find(V);
16483 if (It != ValueToExtUses->end()) {
16484 // Replace all uses to avoid compiler crash.
16485 ExternalUses[It->second].User = nullptr;
16486 }
16487 }
16488 ExtraCost = ScalarCost;
16489 if (!IsPhiInLoop(EU))
16490 ExtractsCount[Entry].insert(Inst);
16491 if (CanBeUsedAsScalarCast) {
16492 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16493 // Update the users of the operands of the cast operand to avoid
16494 // compiler crash.
16495 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16496 for (Value *V : IOp->operands()) {
16497 auto It = ValueToExtUses->find(V);
16498 if (It != ValueToExtUses->end()) {
16499 // Replace all uses to avoid compiler crash.
16500 ExternalUses[It->second].User = nullptr;
16501 }
16502 }
16503 }
16504 }
16505 }
16506 }
16507 }
16508
16509 ExtractCost += ExtraCost;
16510 }
16511 // Insert external uses for the operands of casts that will be emitted as
16512 // scalars instead of extractelements.
16513 for (Value *V : ScalarOpsFromCasts) {
16514 ExternalUsesAsOriginalScalar.insert(V);
16515 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16516 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16517 TEs.front()->findLaneForValue(V));
16518 }
16519 }
16520 // Add reduced value cost, if resized.
16521 if (!VectorizedVals.empty()) {
16522 const TreeEntry &Root = *VectorizableTree.front();
16523 auto BWIt = MinBWs.find(&Root);
16524 if (BWIt != MinBWs.end()) {
16525 Type *DstTy = Root.Scalars.front()->getType();
16526 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16527 unsigned SrcSz =
16528 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16529 if (OriginalSz != SrcSz) {
16530 unsigned Opcode = Instruction::Trunc;
16531 if (OriginalSz > SrcSz)
16532 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16533 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16534 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16535 assert(SLPReVec && "Only supported by REVEC.");
16536 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16537 }
16538 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16539 TTI::CastContextHint::None,
16540 TTI::TCK_RecipThroughput);
16541 }
16542 }
16543 }
16544
16545 Cost += ExtractCost;
16546 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16547 bool ForSingleMask) {
16548 InstructionCost C = 0;
16549 unsigned VF = Mask.size();
16550 unsigned VecVF = TE->getVectorFactor();
16551 bool HasLargeIndex =
16552 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16553 if ((VF != VecVF && HasLargeIndex) ||
16554 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16555
16556 if (HasLargeIndex) {
16557 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16558 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16559 OrigMask.begin());
16560 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16561 getWidenedType(TE->getMainOp()->getType(), VecVF),
16562 OrigMask);
16563 LLVM_DEBUG(
16564 dbgs() << "SLP: Adding cost " << C
16565 << " for final shuffle of insertelement external users.\n";
16566 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16567 Cost += C;
16568 return std::make_pair(TE, true);
16569 }
16570
16571 if (!ForSingleMask) {
16572 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16573 for (unsigned I = 0; I < VF; ++I) {
16574 if (Mask[I] != PoisonMaskElem)
16575 ResizeMask[Mask[I]] = Mask[I];
16576 }
16577 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16578 C = ::getShuffleCost(
16579 *TTI, TTI::SK_PermuteSingleSrc,
16580 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16581 LLVM_DEBUG(
16582 dbgs() << "SLP: Adding cost " << C
16583 << " for final shuffle of insertelement external users.\n";
16584 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16585
16586 Cost += C;
16587 }
16588 }
16589 return std::make_pair(TE, false);
16590 };
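// For example: with VF = 4, a mask {0, 5, PoisonMaskElem, 2} against an entry
// with VecVF = 8 references lane 5 >= VF, so HasLargeIndex is true and a
// single-source permute of the 8-wide vector is costed; a mask such as
// {2, 0, 1, 3} with VF == VecVF only pays for the non-identity permute
// handled in the !ForSingleMask branch.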
16591 // Calculate the cost of the reshuffled vectors, if any.
16592 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16593 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16594 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16595 unsigned VF = 0;
16596 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16597 ArrayRef<const TreeEntry *> TEs) {
16598 assert((TEs.size() == 1 || TEs.size() == 2) &&
16599 "Expected exactly 1 or 2 tree entries.");
16600 if (TEs.size() == 1) {
16601 if (VF == 0)
16602 VF = TEs.front()->getVectorFactor();
16603 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16604 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16605 !all_of(enumerate(Mask), [=](const auto &Data) {
16606 return Data.value() == PoisonMaskElem ||
16607 (Data.index() < VF &&
16608 static_cast<int>(Data.index()) == Data.value());
16609 })) {
16610 InstructionCost C =
16611 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16612 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16613 << " for final shuffle of insertelement "
16614 "external users.\n";
16615 TEs.front()->dump();
16616 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16617 Cost += C;
16618 }
16619 } else {
16620 if (VF == 0) {
16621 if (TEs.front() &&
16622 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16623 VF = TEs.front()->getVectorFactor();
16624 else
16625 VF = Mask.size();
16626 }
16627 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16628 InstructionCost C =
16629 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16630 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16631 << " for final shuffle of vector node and external "
16632 "insertelement users.\n";
16633 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16634 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16635 Cost += C;
16636 }
16637 VF = Mask.size();
16638 return TEs.back();
16639 };
16640 performExtractsShuffleAction<const TreeEntry>(
16641 MutableArrayRef(Vector.data(), Vector.size()), Base,
16642 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16643 EstimateShufflesCost);
16644 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16645 cast<FixedVectorType>(
16646 ShuffledInserts[I].InsertElements.front()->getType()),
16647 DemandedElts[I],
16648 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16649 Cost -= InsertCost;
16650 }
16651
16652 // Add the cost for reduced value resize (if required).
16653 if (ReductionBitWidth != 0) {
16654 assert(UserIgnoreList && "Expected reduction tree.");
16655 const TreeEntry &E = *VectorizableTree.front();
16656 auto It = MinBWs.find(&E);
16657 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16658 unsigned SrcSize = It->second.first;
16659 unsigned DstSize = ReductionBitWidth;
16660 unsigned Opcode = Instruction::Trunc;
16661 if (SrcSize < DstSize) {
16662 bool IsArithmeticExtendedReduction =
16663 all_of(*UserIgnoreList, [](Value *V) {
16664 auto *I = cast<Instruction>(V);
16665 return is_contained({Instruction::Add, Instruction::FAdd,
16666 Instruction::Mul, Instruction::FMul,
16667 Instruction::And, Instruction::Or,
16668 Instruction::Xor},
16669 I->getOpcode());
16670 });
16671 if (IsArithmeticExtendedReduction)
16672 Opcode =
16673 Instruction::BitCast; // Handle it by getExtendedReductionCost
16674 else
16675 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16676 }
16677 if (Opcode != Instruction::BitCast) {
16678 auto *SrcVecTy =
16679 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16680 auto *DstVecTy =
16681 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16682 TTI::CastContextHint CCH = getCastContextHint(E);
16683 InstructionCost CastCost;
16684 switch (E.getOpcode()) {
16685 case Instruction::SExt:
16686 case Instruction::ZExt:
16687 case Instruction::Trunc: {
16688 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16689 CCH = getCastContextHint(*OpTE);
16690 break;
16691 }
16692 default:
16693 break;
16694 }
16695 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16696 TTI::TCK_RecipThroughput);
16697 Cost += CastCost;
16698 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16699 << " for final resize for reduction from " << SrcVecTy
16700 << " to " << DstVecTy << "\n";
16701 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16702 }
16703 }
16704 }
16705
16706 std::optional<InstructionCost> SpillCost;
16707 if (Cost < -SLPCostThreshold) {
16708 SpillCost = getSpillCost();
16709 Cost += *SpillCost;
16710 }
16711#ifndef NDEBUG
16712 SmallString<256> Str;
16713 {
16714 raw_svector_ostream OS(Str);
16715 OS << "SLP: Spill Cost = ";
16716 if (SpillCost)
16717 OS << *SpillCost;
16718 else
16719 OS << "<skipped>";
16720 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16721 << "SLP: Total Cost = " << Cost << ".\n";
16722 }
16723 LLVM_DEBUG(dbgs() << Str);
16724 if (ViewSLPTree)
16725 ViewGraph(this, "SLP" + F->getName(), false, Str);
16726#endif
16727
16728 return Cost;
16729}
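// A hypothetical, simplified helper (illustrative only, not used by the
// pass): the external-use accounting above ultimately reduces to comparing
// the cost of keeping the original scalar alive against the cost of
// extracting its lane from the vectorized bundle.
[[maybe_unused]] static bool
keepOriginalScalarSketch(InstructionCost ScalarCost, InstructionCost ExtraCost,
                         bool OperandsRemainScalar) {
  // The real decision also folds in PHI-user heuristics, the power-of-two
  // check on the number of external uses per entry, and the cast-operand
  // special case; this sketch keeps only the core comparison.
  return OperandsRemainScalar && ScalarCost <= ExtraCost;
}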
16730
16731/// Tries to find extractelement instructions with constant indices from fixed
16732 /// vector type and gather such instructions into a group, which can most
16733 /// likely be matched as a shuffle of 1 or 2 input vectors. If this attempt was
16734/// successful, the matched scalars are replaced by poison values in \p VL for
16735/// future analysis.
16736std::optional<TTI::ShuffleKind>
16737 BoUpSLP::tryToGatherSingleRegisterExtractElements(
16738 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16739 // Scan list of gathered scalars for extractelements that can be represented
16740 // as shuffles.
16741 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16742 SmallVector<int> UndefVectorExtracts;
16743 for (int I = 0, E = VL.size(); I < E; ++I) {
16744 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16745 if (!EI) {
16746 if (isa<UndefValue>(VL[I]))
16747 UndefVectorExtracts.push_back(I);
16748 continue;
16749 }
16750 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16751 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16752 continue;
16753 std::optional<unsigned> Idx = getExtractIndex(EI);
16754 // Undefined index.
16755 if (!Idx) {
16756 UndefVectorExtracts.push_back(I);
16757 continue;
16758 }
16759 if (Idx >= VecTy->getNumElements()) {
16760 UndefVectorExtracts.push_back(I);
16761 continue;
16762 }
16763 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16764 ExtractMask.reset(*Idx);
16765 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16766 UndefVectorExtracts.push_back(I);
16767 continue;
16768 }
16769 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16770 }
16771 // Sort the vector operands by the maximum number of uses in extractelements.
16772 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16773 VectorOpToIdx.takeVector();
16774 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16775 return P1.second.size() > P2.second.size();
16776 });
16777 // Find the best pair of the vectors or a single vector.
16778 const int UndefSz = UndefVectorExtracts.size();
16779 unsigned SingleMax = 0;
16780 unsigned PairMax = 0;
16781 if (!Vectors.empty()) {
16782 SingleMax = Vectors.front().second.size() + UndefSz;
16783 if (Vectors.size() > 1) {
16784 auto *ItNext = std::next(Vectors.begin());
16785 PairMax = SingleMax + ItNext->second.size();
16786 }
16787 }
16788 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16789 return std::nullopt;
16790 // Check whether it is better to perform a shuffle of 2 vectors or just of a
16791 // single vector.
16792 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16793 SmallVector<Value *> GatheredExtracts(
16794 VL.size(), PoisonValue::get(VL.front()->getType()));
16795 if (SingleMax >= PairMax && SingleMax) {
16796 for (int Idx : Vectors.front().second)
16797 std::swap(GatheredExtracts[Idx], VL[Idx]);
16798 } else if (!Vectors.empty()) {
16799 for (unsigned Idx : {0, 1})
16800 for (int Idx : Vectors[Idx].second)
16801 std::swap(GatheredExtracts[Idx], VL[Idx]);
16802 }
16803 // Add extracts from undefs too.
16804 for (int Idx : UndefVectorExtracts)
16805 std::swap(GatheredExtracts[Idx], VL[Idx]);
16806 // Check that the gather of extractelements can be represented as just a
16807 // shuffle of one or two vectors from which the scalars are extracted.
16808 std::optional<TTI::ShuffleKind> Res =
16809 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16810 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16811 // TODO: try to check other subsets if possible.
16812 // Restore the original VL if attempt was not successful.
16813 copy(SavedVL, VL.begin());
16814 return std::nullopt;
16815 }
16816 // Restore unused scalars from mask, if some of the extractelements were not
16817 // selected for shuffle.
16818 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16819 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16820 isa<UndefValue>(GatheredExtracts[I])) {
16821 std::swap(VL[I], GatheredExtracts[I]);
16822 continue;
16823 }
16824 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16825 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16826 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16827 is_contained(UndefVectorExtracts, I))
16828 continue;
16829 }
16830 return Res;
16831}
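// Illustration (hypothetical helper, not used by the pass): conceptually the
// routine above turns a gather such as
//   %g0 = extractelement <4 x i32> %v1, i32 0
//   %g1 = extractelement <4 x i32> %v1, i32 2
//   %g2 = extractelement <4 x i32> %v2, i32 1
//   %g3 = undef
// into the shuffle mask {0, 2, 5, poison}, where lanes of the second source
// vector are offset by its width.
[[maybe_unused]] static SmallVector<int>
twoSourceExtractMaskSketch(ArrayRef<std::pair<unsigned, int>> SrcAndIdx,
                           unsigned VecWidth) {
  // Each element is (source number 0/1, extract index), or (_, -1) for undef.
  SmallVector<int> Mask(SrcAndIdx.size(), PoisonMaskElem);
  for (auto [I, P] : enumerate(SrcAndIdx)) {
    auto [SrcId, Idx] = P;
    if (Idx < 0)
      continue; // Undef element keeps the poison mask value.
    Mask[I] = SrcId * VecWidth + Idx; // Second-source lanes start at VecWidth.
  }
  return Mask;
}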
16832
16833/// Tries to find extractelement instructions with constant indices from fixed
16834 /// vector type and gather such instructions into a group, which can most
16835 /// likely be matched as a shuffle of 1 or 2 input vectors. If this attempt was
16836/// successful, the matched scalars are replaced by poison values in \p VL for
16837/// future analysis.
16838 SmallVector<std::optional<TTI::ShuffleKind>>
16839 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16840 SmallVectorImpl<int> &Mask,
16841 unsigned NumParts) const {
16842 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16843 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16844 Mask.assign(VL.size(), PoisonMaskElem);
16845 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16846 for (unsigned Part : seq<unsigned>(NumParts)) {
16847 // Scan list of gathered scalars for extractelements that can be represented
16848 // as shuffles.
16849 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16850 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16851 SmallVector<int> SubMask;
16852 std::optional<TTI::ShuffleKind> Res =
16853 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16854 ShufflesRes[Part] = Res;
16855 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16856 }
16857 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16858 return Res.has_value();
16859 }))
16860 ShufflesRes.clear();
16861 return ShufflesRes;
16862}
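// Illustration (hypothetical helper, not used by the pass): the per-part loop
// above processes VL in register-sized slices, e.g. 8 scalars with
// NumParts == 2 become two independent 4-lane slices, each with its own
// shuffle-kind result and sub-mask.
[[maybe_unused]] static SmallVector<std::pair<unsigned, unsigned>>
registerSlicesSketch(unsigned NumScalars, unsigned NumParts) {
  // Simplified model of getPartNumElems/getNumElems: equally sized slices
  // with a possibly shorter tail; NumParts is assumed to be non-zero.
  SmallVector<std::pair<unsigned, unsigned>> Slices;
  unsigned SliceSize = (NumScalars + NumParts - 1) / NumParts;
  for (unsigned Begin = 0; Begin < NumScalars; Begin += SliceSize)
    Slices.emplace_back(Begin, std::min(SliceSize, NumScalars - Begin));
  return Slices;
}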
16863
16864std::optional<TargetTransformInfo::ShuffleKind>
16865BoUpSLP::isGatherShuffledSingleRegisterEntry(
16866 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16867 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16868 Entries.clear();
16869 // TODO: currently checking only for Scalars in the tree entry, need to count
16870 // reused elements too for better cost estimation.
16871 auto GetUserEntry = [&](const TreeEntry *TE) {
16872 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16873 TE = TE->UserTreeIndex.UserTE;
16874 if (TE == VectorizableTree.front().get())
16875 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16876 return TE->UserTreeIndex;
16877 };
16878 auto HasGatherUser = [&](const TreeEntry *TE) {
16879 while (TE->Idx != 0 && TE->UserTreeIndex) {
16880 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16881 return true;
16882 TE = TE->UserTreeIndex.UserTE;
16883 }
16884 return false;
16885 };
16886 const EdgeInfo TEUseEI = GetUserEntry(TE);
16887 if (!TEUseEI)
16888 return std::nullopt;
16889 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16890 const BasicBlock *TEInsertBlock = nullptr;
16891 // Main node of PHI entries keeps the correct order of operands/incoming
16892 // blocks.
16893 if (auto *PHI = dyn_cast_or_null<PHINode>(
16894 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16895 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16896 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16897 TEInsertPt = TEInsertBlock->getTerminator();
16898 } else {
16899 TEInsertBlock = TEInsertPt->getParent();
16900 }
16901 if (!DT->isReachableFromEntry(TEInsertBlock))
16902 return std::nullopt;
16903 auto *NodeUI = DT->getNode(TEInsertBlock);
16904 assert(NodeUI && "Should only process reachable instructions");
16905 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16906 auto CheckOrdering = [&](const Instruction *InsertPt) {
16907 // Argument InsertPt is an instruction where vector code for some other
16908 // tree entry (one that shares one or more scalars with TE) is going to be
16909 // generated. This lambda returns true if insertion point of vector code
16910 // for the TE dominates that point (otherwise dependency is the other way
16911 // around). The other node is not limited to be of a gather kind. Gather
16912 // nodes are not scheduled and their vector code is inserted before their
16913 // first user. If user is PHI, that is supposed to be at the end of a
16914 // predecessor block. Otherwise it is the last instruction among scalars of
16915 // the user node. So, instead of checking dependency between instructions
16916 // themselves, we check dependency between their insertion points for vector
16917 // code (since each scalar instruction ends up as a lane of a vector
16918 // instruction).
16919 const BasicBlock *InsertBlock = InsertPt->getParent();
16920 auto *NodeEUI = DT->getNode(InsertBlock);
16921 if (!NodeEUI)
16922 return false;
16923 assert((NodeUI == NodeEUI) ==
16924 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16925 "Different nodes should have different DFS numbers");
16926 // Check the order of the gather nodes users.
16927 if (TEInsertPt->getParent() != InsertBlock &&
16928 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16929 return false;
16930 if (TEInsertPt->getParent() == InsertBlock &&
16931 TEInsertPt->comesBefore(InsertPt))
16932 return false;
16933 return true;
16934 };
16935 // Find all tree entries used by the gathered values. If no common entries
16936 // are found, this is not a shuffle.
16937 // Here we build a set of tree nodes for each gathered value and try to
16938 // find the intersection between these sets. If we have at least one common
16939 // tree node for each gathered value, we have just a permutation of a
16940 // single vector. If we have 2 different sets, we are in a situation where we
16941 // have a permutation of 2 input vectors.
16942 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16943 SmallDenseMap<Value *, int> UsedValuesEntry;
16944 SmallPtrSet<const Value *, 16> VisitedValue;
16945 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16946 // The node is reused - exit.
16947 if ((TEPtr->getVectorFactor() != VL.size() &&
16948 TEPtr->Scalars.size() != VL.size()) ||
16949 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16950 return false;
16951 UsedTEs.clear();
16952 UsedTEs.emplace_back().insert(TEPtr);
16953 for (Value *V : VL) {
16954 if (isConstant(V))
16955 continue;
16956 UsedValuesEntry.try_emplace(V, 0);
16957 }
16958 return true;
16959 };
16960 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16961 unsigned EdgeIdx) {
16962 const TreeEntry *Ptr1 = User1;
16963 const TreeEntry *Ptr2 = User2;
16964 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16965 while (Ptr2) {
16966 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16967 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16968 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16969 }
16970 while (Ptr1) {
16971 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16972 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16973 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16974 return Idx < It->second;
16975 }
16976 return false;
16977 };
16978 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
16979 Instruction *InsertPt) {
16980 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
16981 !TEUseEI.UserTE->isCopyableElement(
16982 const_cast<Instruction *>(TEInsertPt)) &&
16983 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
16984 InsertPt->getNextNode() == TEInsertPt &&
16985 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
16986 !isUsedOutsideBlock(InsertPt));
16987 };
16988 for (Value *V : VL) {
16989 if (isConstant(V) || !VisitedValue.insert(V).second)
16990 continue;
16991 // Build a list of tree entries where V is used.
16992 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16993 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16994 if (TEPtr == TE || TEPtr->Idx == 0)
16995 continue;
16996 assert(any_of(TEPtr->Scalars,
16997 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16998 "Must contain at least single gathered value.");
16999 assert(TEPtr->UserTreeIndex &&
17000 "Expected only single user of a gather node.");
17001 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17002
17003 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17004 UseEI.UserTE->hasState())
17005 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
17006 : nullptr;
17007 Instruction *InsertPt =
17008 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
17009 : &getLastInstructionInBundle(UseEI.UserTE);
17010 if (TEInsertPt == InsertPt) {
17011 // Check nodes, which might be emitted first.
17012 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17013 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17014 TEUseEI.UserTE->isAltShuffle()) &&
17015 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
17016 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17017 (UseEI.UserTE->hasState() &&
17018 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17019 !UseEI.UserTE->isAltShuffle()) ||
17020 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
17021 continue;
17022 }
17023
17024 // If the schedulable insertion point is used in multiple entries, just
17025 // exit; there is no known ordering at this point, it only becomes available
17026 // after real scheduling.
17027 if (!doesNotNeedToBeScheduled(InsertPt) &&
17028 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17029 continue;
17030 // If the users are the PHI nodes with the same incoming blocks - skip.
17031 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17032 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17033 UseEI.UserTE->State == TreeEntry::Vectorize &&
17034 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17035 TEUseEI.UserTE != UseEI.UserTE)
17036 continue;
17037 // If 2 gathers are operands of the same entry (regardless of whether
17038 // user is PHI or else), compare operands indices, use the earlier one
17039 // as the base.
17040 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17041 continue;
17042 // If the user instruction is used in different vectorized nodes for some
17043 // reason, make the decision depend on the index.
17044 if (TEUseEI.UserTE != UseEI.UserTE &&
17045 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17046 HasGatherUser(TEUseEI.UserTE)))
17047 continue;
17048 // If the user node is the operand of the other user node - skip.
17049 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17050 continue;
17051 }
17052
17053 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17054 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17055 UseEI.UserTE->doesNotNeedToSchedule() &&
17056 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
17057 continue;
17058 // Check if the user node of the TE comes after user node of TEPtr,
17059 // otherwise TEPtr depends on TE.
17060 if ((TEInsertBlock != InsertPt->getParent() ||
17061 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17062 (!CheckOrdering(InsertPt) ||
17063 (UseEI.UserTE->hasCopyableElements() &&
17064 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17065 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
17066 continue;
17067 // The node is reused - exit.
17068 if (CheckAndUseSameNode(TEPtr))
17069 break;
17070 // If the parent node is copyable, its last instruction is used outside the
17071 // block, and that instruction immediately follows the last instruction of
17072 // TEPtr, exit to preserve the def-use chain.
17073 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17074 continue;
17075 VToTEs.insert(TEPtr);
17076 }
17077 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17078 const auto *It = find_if(
17079 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
17080 if (It != VTEs.end()) {
17081 const TreeEntry *VTE = *It;
17082 if (none_of(TE->CombinedEntriesWithIndices,
17083 [&](const auto &P) { return P.first == VTE->Idx; })) {
17084 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17085 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17086 continue;
17087 }
17088 // The node is reused - exit.
17089 if (CheckAndUseSameNode(VTE))
17090 break;
17091 VToTEs.insert(VTE);
17092 }
17093 }
17094 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17095 const TreeEntry *VTE = VTEs.front();
17096 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17097 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17098 VTEs = VTEs.drop_front();
17099 // Iterate through all vectorized nodes.
17100 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
17101 return MTE->State == TreeEntry::Vectorize;
17102 });
17103 if (MIt == VTEs.end())
17104 continue;
17105 VTE = *MIt;
17106 }
17107 if (none_of(TE->CombinedEntriesWithIndices,
17108 [&](const auto &P) { return P.first == VTE->Idx; })) {
17109 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17110 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
17111 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17112 continue;
17113 }
17114 // The node is reused - exit.
17115 if (CheckAndUseSameNode(VTE))
17116 break;
17117 VToTEs.insert(VTE);
17118 }
17119 if (VToTEs.empty())
17120 continue;
17121 if (UsedTEs.empty()) {
17122 // The first iteration, just insert the list of nodes to vector.
17123 UsedTEs.push_back(VToTEs);
17124 UsedValuesEntry.try_emplace(V, 0);
17125 } else {
17126 // Need to check if there are any previously used tree nodes which use V.
17127 // If there are no such nodes, consider that we have one more input
17128 // vector.
17129 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17130 unsigned Idx = 0;
17131 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17132 // Do we have a non-empty intersection of previously listed tree entries
17133 // and tree entries using current V?
17134 set_intersect(VToTEs, Set);
17135 if (!VToTEs.empty()) {
17136 // Yes, write the new subset and continue analysis for the next
17137 // scalar.
17138 Set.swap(VToTEs);
17139 break;
17140 }
17141 VToTEs = SavedVToTEs;
17142 ++Idx;
17143 }
17144 // No non-empty intersection found - need to add a second set of possible
17145 // source vectors.
17146 if (Idx == UsedTEs.size()) {
17147 // If the number of input vectors is greater than 2, this is not a
17148 // permutation; fall back to the regular gather.
17149 // TODO: support multiple reshuffled nodes.
17150 if (UsedTEs.size() == 2)
17151 continue;
17152 UsedTEs.push_back(SavedVToTEs);
17153 Idx = UsedTEs.size() - 1;
17154 }
17155 UsedValuesEntry.try_emplace(V, Idx);
17156 }
17157 }
17158
17159 if (UsedTEs.empty()) {
17160 Entries.clear();
17161 return std::nullopt;
17162 }
17163
17164 unsigned VF = 0;
17165 if (UsedTEs.size() == 1) {
17166 // Keep the order to avoid non-determinism.
17167 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17168 UsedTEs.front().end());
17169 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17170 return TE1->Idx < TE2->Idx;
17171 });
17172 // Try to find the perfect match in another gather node at first.
17173 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17174 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17175 });
17176 if (It != FirstEntries.end() &&
17177 ((*It)->getVectorFactor() == VL.size() ||
17178 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17179 TE->ReuseShuffleIndices.size() == VL.size() &&
17180 (*It)->isSame(TE->Scalars)))) {
17181 Entries.push_back(*It);
17182 if ((*It)->getVectorFactor() == VL.size()) {
17183 std::iota(std::next(Mask.begin(), Part * VL.size()),
17184 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17185 } else {
17186 SmallVector<int> CommonMask = TE->getCommonMask();
17187 copy(CommonMask, Mask.begin());
17188 }
17189 // Clear undef scalars.
17190 for (unsigned I : seq<unsigned>(VL.size()))
17191 if (isa<PoisonValue>(VL[I]))
17192 Mask[Part * VL.size() + I] = PoisonMaskElem;
17193 return TargetTransformInfo::SK_PermuteSingleSrc;
17194 }
17195 // No perfect match, just shuffle, so choose the first tree node from the
17196 // tree.
17197 Entries.push_back(FirstEntries.front());
17198 // Update mapping between values and corresponding tree entries.
17199 for (auto &P : UsedValuesEntry)
17200 P.second = 0;
17201 VF = FirstEntries.front()->getVectorFactor();
17202 } else {
17203 // Try to find nodes with the same vector factor.
17204 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17205 // Keep the order of tree nodes to avoid non-determinism.
17206 DenseMap<int, const TreeEntry *> VFToTE;
17207 for (const TreeEntry *TE : UsedTEs.front()) {
17208 unsigned VF = TE->getVectorFactor();
17209 auto It = VFToTE.find(VF);
17210 if (It != VFToTE.end()) {
17211 if (It->second->Idx > TE->Idx)
17212 It->getSecond() = TE;
17213 continue;
17214 }
17215 VFToTE.try_emplace(VF, TE);
17216 }
17217 // Same, keep the order to avoid non-determinism.
17218 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17219 UsedTEs.back().end());
17220 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17221 return TE1->Idx < TE2->Idx;
17222 });
17223 for (const TreeEntry *TE : SecondEntries) {
17224 auto It = VFToTE.find(TE->getVectorFactor());
17225 if (It != VFToTE.end()) {
17226 VF = It->first;
17227 Entries.push_back(It->second);
17228 Entries.push_back(TE);
17229 break;
17230 }
17231 }
17232 // No 2 source vectors with the same vector factor; just choose the 2 with
17233 // the maximum index.
17234 if (Entries.empty()) {
17235 Entries.push_back(*llvm::max_element(
17236 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17237 return TE1->Idx < TE2->Idx;
17238 }));
17239 Entries.push_back(SecondEntries.front());
17240 VF = std::max(Entries.front()->getVectorFactor(),
17241 Entries.back()->getVectorFactor());
17242 } else {
17243 VF = Entries.front()->getVectorFactor();
17244 }
17245 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17246 for (const TreeEntry *E : Entries)
17247 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17248 E->Scalars.end());
17249 // Update mapping between values and corresponding tree entries.
17250 for (auto &P : UsedValuesEntry) {
17251 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17252 if (ValuesToEntries[Idx].contains(P.first)) {
17253 P.second = Idx;
17254 break;
17255 }
17256 }
17257 }
17258
17259 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17260 // Checks if the 2 PHIs are compatible, i.e. highly likely to be vectorized
17261 // together.
17262 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17263 auto *PHI = cast<PHINode>(V);
17264 auto *PHI1 = cast<PHINode>(V1);
17265 // Check that all incoming values are compatible/from the same parent block
17266 // (if they are instructions).
17267 // The incoming values are compatible if they are all constants, or are
17268 // instructions with the same/alternate opcodes from the same basic block.
17269 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17270 Value *In = PHI->getIncomingValue(I);
17271 Value *In1 = PHI1->getIncomingValue(I);
17272 if (isConstant(In) && isConstant(In1))
17273 continue;
17274 if (!getSameOpcode({In, In1}, *TLI))
17275 return false;
17276 if (cast<Instruction>(In)->getParent() !=
17277 cast<Instruction>(In1)->getParent())
17278 return false;
17279 }
17280 return true;
17281 };
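// For example, %p = phi [ %a, %bb0 ], [ %c, %bb1 ] and
// %q = phi [ %b, %bb0 ], [ %d, %bb1 ] are treated as compatible when
// {%a, %b} and {%c, %d} are constants or share the same/alternate opcode and
// the same parent block, since such pairs are likely to be vectorized
// together later.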
17282 // Check if the value can be ignored during analysis for shuffled gathers.
17283 // We assume it is better to ignore instructions which do not form splats,
17284 // are not vectorized and are not extractelements (those are handled by the
17285 // extractelement processing), or which may form a vector node in the future.
17286 auto MightBeIgnored = [=](Value *V) {
17287 auto *I = dyn_cast<Instruction>(V);
17288 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17289 !isVectorLikeInstWithConstOps(I) &&
17290 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17291 };
17292 // Check that the neighbor instruction may form a full vector node with the
17293 // current instruction V. This is possible if they have the same/alternate
17294 // opcode and the same parent basic block.
17295 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17296 Value *V1 = VL[Idx];
17297 bool UsedInSameVTE = false;
17298 auto It = UsedValuesEntry.find(V1);
17299 if (It != UsedValuesEntry.end())
17300 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17301 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17302 getSameOpcode({V, V1}, *TLI) &&
17303 cast<Instruction>(V)->getParent() ==
17304 cast<Instruction>(V1)->getParent() &&
17305 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17306 };
17307 // Build a shuffle mask for better cost estimation and vector emission.
17308 SmallBitVector UsedIdxs(Entries.size());
17309 SmallVector<std::pair<unsigned, int>> EntryLanes;
17310 for (int I = 0, E = VL.size(); I < E; ++I) {
17311 Value *V = VL[I];
17312 auto It = UsedValuesEntry.find(V);
17313 if (It == UsedValuesEntry.end())
17314 continue;
17315 // Do not try to shuffle scalars if they are constants or instructions
17316 // that can be vectorized as a result of the subsequent buildvector
17317 // vectorization.
17318 if (isConstant(V) || (MightBeIgnored(V) &&
17319 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17320 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17321 continue;
17322 unsigned Idx = It->second;
17323 EntryLanes.emplace_back(Idx, I);
17324 UsedIdxs.set(Idx);
17325 }
17326 // Iterate through all shuffled scalars and select entries that can be used
17327 // for the final shuffle.
17328 SmallVector<const TreeEntry *> TempEntries;
17329 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17330 if (!UsedIdxs.test(I))
17331 continue;
17332 // Fix the entry number for the given scalar. If it is the first entry, set
17333 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17334 // These indices are used when calculating final shuffle mask as the vector
17335 // offset.
17336 for (std::pair<unsigned, int> &Pair : EntryLanes)
17337 if (Pair.first == I)
17338 Pair.first = TempEntries.size();
17339 TempEntries.push_back(Entries[I]);
17340 }
17341 Entries.swap(TempEntries);
17342 if (EntryLanes.size() == Entries.size() &&
17343 !VL.equals(ArrayRef(TE->Scalars)
17344 .slice(Part * VL.size(),
17345 std::min<int>(VL.size(), TE->Scalars.size())))) {
17346 // We may have only 1 or 2 entries here. If the number of scalars is equal
17347 // to the number of entries, there is no need to do the analysis, it is not
17348 // very profitable. Since VL is not the same as TE->Scalars, we already have
17349 // some shuffles before this point. Cut off the unprofitable case.
17350 Entries.clear();
17351 return std::nullopt;
17352 }
17353 // Build the final mask, check for the identity shuffle, if possible.
17354 bool IsIdentity = Entries.size() == 1;
17355 // Pair.first is the offset to the vector, while Pair.second is the index of
17356 // scalar in the list.
17357 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17358 unsigned Idx = Part * VL.size() + Pair.second;
17359 Mask[Idx] =
17360 Pair.first * VF +
17361 (ForOrder ? std::distance(
17362 Entries[Pair.first]->Scalars.begin(),
17363 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17364 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17365 IsIdentity &= Mask[Idx] == Pair.second;
17366 }
17367 if (ForOrder || IsIdentity || Entries.empty()) {
17368 switch (Entries.size()) {
17369 case 1:
17370 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17371 return TargetTransformInfo::SK_PermuteSingleSrc;
17372 break;
17373 case 2:
17374 if (EntryLanes.size() > 2 || VL.size() <= 2)
17375 return TargetTransformInfo::SK_PermuteTwoSrc;
17376 break;
17377 default:
17378 break;
17379 }
17380 } else if (!isa<VectorType>(VL.front()->getType()) &&
17381 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17382 // Estimate the cost to check whether a shuffle is more beneficial than a buildvector.
17383 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17384 std::next(Mask.begin(), (Part + 1) * VL.size()));
17385 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17386 for (int Idx : SubMask) {
17387 if (Idx == PoisonMaskElem)
17388 continue;
17389 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17390 MinElement = Idx;
17391 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17392 MaxElement = Idx;
17393 }
17394 assert(MaxElement >= 0 && MinElement >= 0 &&
17395 MaxElement % VF >= MinElement % VF &&
17396 "Expected at least single element.");
17397 unsigned NewVF = std::max<unsigned>(
17398 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17399 (MaxElement % VF) -
17400 (MinElement % VF) + 1));
17401 if (NewVF < VF) {
17402 for (int &Idx : SubMask) {
17403 if (Idx == PoisonMaskElem)
17404 continue;
17405 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17406 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17407 }
17408 } else {
17409 NewVF = VF;
17410 }
17411
17412 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17413 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17414 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17415 auto GetShuffleCost = [&,
17416 &TTI = *TTI](ArrayRef<int> Mask,
17417 ArrayRef<const TreeEntry *> Entries,
17418 VectorType *VecTy) -> InstructionCost {
17419 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17420 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17421 Mask, Entries.front()->getInterleaveFactor()))
17422 return TTI::TCC_Free;
17423 return ::getShuffleCost(TTI,
17424 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17425 : TTI::SK_PermuteSingleSrc,
17426 VecTy, Mask, CostKind);
17427 };
17428 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17429 InstructionCost FirstShuffleCost = 0;
17430 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17431 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17432 FirstShuffleCost = ShuffleCost;
17433 } else {
17434 // Transform the mask to include only the first entry.
17435 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17436 bool IsIdentity = true;
17437 for (auto [I, Idx] : enumerate(FirstMask)) {
17438 if (Idx >= static_cast<int>(NewVF)) {
17439 Idx = PoisonMaskElem;
17440 } else {
17441 DemandedElts.clearBit(I);
17442 if (Idx != PoisonMaskElem)
17443 IsIdentity &= static_cast<int>(I) == Idx;
17444 }
17445 }
17446 if (!IsIdentity)
17447 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17448 FirstShuffleCost += getScalarizationOverhead(
17449 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17450 /*Extract=*/false, CostKind);
17451 }
17452 InstructionCost SecondShuffleCost = 0;
17453 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17454 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17455 SecondShuffleCost = ShuffleCost;
17456 } else {
17457 // Transform the mask to include only the second entry.
17458 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17459 bool IsIdentity = true;
17460 for (auto [I, Idx] : enumerate(SecondMask)) {
17461 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17462 Idx = PoisonMaskElem;
17463 } else {
17464 DemandedElts.clearBit(I);
17465 if (Idx != PoisonMaskElem) {
17466 Idx -= NewVF;
17467 IsIdentity &= static_cast<int>(I) == Idx;
17468 }
17469 }
17470 }
17471 if (!IsIdentity)
17472 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17473 SecondShuffleCost += getScalarizationOverhead(
17474 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17475 /*Extract=*/false, CostKind);
17476 }
17477 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17478 for (auto [I, Idx] : enumerate(SubMask))
17479 if (Idx == PoisonMaskElem)
17480 DemandedElts.clearBit(I);
17481 InstructionCost BuildVectorCost = getScalarizationOverhead(
17482 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17483 /*Extract=*/false, CostKind);
17484 const TreeEntry *BestEntry = nullptr;
17485 if (FirstShuffleCost < ShuffleCost) {
17486 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17487 std::next(Mask.begin(), (Part + 1) * VL.size()),
17488 [&](int &Idx) {
17489 if (Idx >= static_cast<int>(VF))
17490 Idx = PoisonMaskElem;
17491 });
17492 BestEntry = Entries.front();
17493 ShuffleCost = FirstShuffleCost;
17494 }
17495 if (SecondShuffleCost < ShuffleCost) {
17496 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17497 std::next(Mask.begin(), (Part + 1) * VL.size()),
17498 [&](int &Idx) {
17499 if (Idx < static_cast<int>(VF))
17500 Idx = PoisonMaskElem;
17501 else
17502 Idx -= VF;
17503 });
17504 BestEntry = Entries[1];
17505 ShuffleCost = SecondShuffleCost;
17506 }
17507 if (BuildVectorCost >= ShuffleCost) {
17508 if (BestEntry) {
17509 Entries.clear();
17510 Entries.push_back(BestEntry);
17511 }
17512 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17514 }
17515 }
17516 Entries.clear();
17517 // Clear the corresponding mask elements.
17518 std::fill(std::next(Mask.begin(), Part * VL.size()),
17519 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17520 return std::nullopt;
17521}
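// Illustration (hypothetical helper, not used by the pass): once one or two
// source entries are chosen above, each gathered scalar's mask slot is its
// lane in the owning entry plus the entry position (0 or 1) times VF, and an
// all-identity mask over the first entry means no shuffle is needed at all.
[[maybe_unused]] static bool
gatherShuffleMaskSketch(ArrayRef<std::pair<unsigned, unsigned>> EntryAndLane,
                        unsigned VF, MutableArrayRef<int> Mask) {
  // Mask must have at least EntryAndLane.size() elements.
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(EntryAndLane)) {
    auto [EntryPos, Lane] = P;
    Mask[I] = EntryPos * VF + Lane; // Lanes of the second entry start at VF.
    IsIdentity &= EntryPos == 0 && Lane == I;
  }
  return IsIdentity;
}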
17522
17523 SmallVector<std::optional<TTI::ShuffleKind>>
17524 BoUpSLP::isGatherShuffledEntry(
17525 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17526 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17527 bool ForOrder) {
17528 assert(NumParts > 0 && NumParts < VL.size() &&
17529 "Expected positive number of registers.");
17530 Entries.clear();
17531 // No need to check for the topmost gather node.
17532 if (TE == VectorizableTree.front().get() &&
17533 (!GatheredLoadsEntriesFirst.has_value() ||
17534 none_of(ArrayRef(VectorizableTree).drop_front(),
17535 [](const std::unique_ptr<TreeEntry> &TE) {
17536 return !TE->isGather();
17537 })))
17538 return {};
17539 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17540 // implemented yet.
17541 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17542 return {};
17543 Mask.assign(VL.size(), PoisonMaskElem);
17544 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17545 "Expected only single user of the gather node.");
17546 assert(VL.size() % NumParts == 0 &&
17547 "Number of scalars must be divisible by NumParts.");
17548 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17549 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17550 (TE->Idx == 0 ||
17551 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17552 isSplat(TE->Scalars) ||
17553 (TE->hasState() &&
17554 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17555 return {};
17556 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17557 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17558 for (unsigned Part : seq<unsigned>(NumParts)) {
17559 ArrayRef<Value *> SubVL =
17560 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17561 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17562 std::optional<TTI::ShuffleKind> SubRes =
17563 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17564 ForOrder);
17565 if (!SubRes)
17566 SubEntries.clear();
17567 Res.push_back(SubRes);
17568 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17569 SubEntries.front()->getVectorFactor() == VL.size() &&
17570 (SubEntries.front()->isSame(TE->Scalars) ||
17571 SubEntries.front()->isSame(VL))) {
17572 SmallVector<const TreeEntry *> LocalSubEntries;
17573 LocalSubEntries.swap(SubEntries);
17574 Entries.clear();
17575 Res.clear();
17576 std::iota(Mask.begin(), Mask.end(), 0);
17577 // Clear undef scalars.
17578 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17579 if (isa<PoisonValue>(VL[I]))
17580 Mask[I] = PoisonMaskElem;
17581 Entries.emplace_back(1, LocalSubEntries.front());
17582 Res.push_back(TTI::SK_PermuteSingleSrc);
17583 return Res;
17584 }
17585 }
17586 if (all_of(Res,
17587 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17588 Entries.clear();
17589 return {};
17590 }
17591 return Res;
17592}
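// Example: for 8 gathered scalars and NumParts == 2, each 4-lane slice is
// matched independently above; if a slice's single matched entry already
// covers the whole VL element-for-element, the per-part results are dropped
// and a single SK_PermuteSingleSrc with an identity mask (poison for undef
// lanes) is returned for the full vector instead.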
17593
17594InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17595 Type *ScalarTy) const {
17596 const unsigned VF = VL.size();
17597 auto *VecTy = getWidenedType(ScalarTy, VF);
17598 // Find the cost of inserting/extracting values from the vector.
17599 // Check if the same elements are inserted several times and count them as
17600 // shuffle candidates.
17601 APInt DemandedElements = APInt::getZero(VF);
17602 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17603 InstructionCost Cost;
17604 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17605 DemandedElements.setBit(I);
17606 if (V->getType() != ScalarTy)
17607 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17608 TTI::CastContextHint::None, CostKind);
17609 };
17610 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17611 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17612 for (auto [I, V] : enumerate(VL)) {
17613 // No need to shuffle duplicates for constants.
17614 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17615 continue;
17616
17617 if (isConstant(V)) {
17618 ConstantShuffleMask[I] = I + VF;
17619 continue;
17620 }
17621 EstimateInsertCost(I, V);
17622 }
17623 // FIXME: add a cost for constant vector materialization.
17624 bool IsAnyNonUndefConst =
17625 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17626 // 1. Shuffle input source vector and constant vector.
17627 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17628 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
17629 ConstantShuffleMask);
17630 }
17631
17632 // 2. Insert unique non-constants.
17633 if (!DemandedElements.isZero())
17634 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17635 /*Insert=*/true,
17636 /*Extract=*/false, CostKind,
17637 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17638 return Cost;
17639}
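// Illustration (hypothetical helper, not used by the pass): for a gather such
// as {%a, 7, %b, 0} the cost above is modeled as one two-source shuffle that
// blends a materialized constant vector into the gathered source, plus
// insertelement costs for the unique non-constant lanes %a and %b. The blend
// mask picks constant lanes from the second source:
[[maybe_unused]] static SmallVector<int>
constantBlendMaskSketch(ArrayRef<bool> IsConstantLane) {
  unsigned VF = IsConstantLane.size();
  SmallVector<int> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = IsConstantLane[I] ? static_cast<int>(I + VF) // constant source
                                : static_cast<int>(I);     // gathered source
  return Mask;
}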
17640
17641Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17642 auto It = EntryToLastInstruction.find(E);
17643 if (It != EntryToLastInstruction.end())
17644 return *cast<Instruction>(It->second);
17645 Instruction *Res = nullptr;
17646 // Get the basic block this bundle is in. All instructions in the bundle
17647 // should be in this block (except for extractelement-like instructions with
17648 // constant indices or gathered loads or copyables).
17649 Instruction *Front;
17650 unsigned Opcode;
17651 if (E->hasState()) {
17652 Front = E->getMainOp();
17653 Opcode = E->getOpcode();
17654 } else {
17655 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17656 Opcode = Front->getOpcode();
17657 }
17658 auto *BB = Front->getParent();
17659 assert(
17660 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17661 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17662 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17663 all_of(E->Scalars,
17664 [=](Value *V) -> bool {
17665 if (Opcode == Instruction::GetElementPtr &&
17666 !isa<GetElementPtrInst>(V))
17667 return true;
17668 auto *I = dyn_cast<Instruction>(V);
17669 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17670 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17671 })) &&
17672 "Expected gathered loads or GEPs or instructions from same basic "
17673 "block.");
17674
17675 auto FindLastInst = [&]() {
17676 Instruction *LastInst = Front;
17677 for (Value *V : E->Scalars) {
17678 auto *I = dyn_cast<Instruction>(V);
17679 if (!I)
17680 continue;
17681 if (E->isCopyableElement(I))
17682 continue;
17683 if (LastInst->getParent() == I->getParent()) {
17684 if (LastInst->comesBefore(I))
17685 LastInst = I;
17686 continue;
17687 }
17688 assert(((Opcode == Instruction::GetElementPtr &&
17689 !isa<GetElementPtrInst>(I)) ||
17690 E->State == TreeEntry::SplitVectorize ||
17691 (isVectorLikeInstWithConstOps(LastInst) &&
17692 isVectorLikeInstWithConstOps(I)) ||
17693 (GatheredLoadsEntriesFirst.has_value() &&
17694 Opcode == Instruction::Load && E->isGather() &&
17695 E->Idx < *GatheredLoadsEntriesFirst)) &&
17696 "Expected vector-like or non-GEP in GEP node insts only.");
17697 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17698 LastInst = I;
17699 continue;
17700 }
17701 if (!DT->isReachableFromEntry(I->getParent()))
17702 continue;
17703 auto *NodeA = DT->getNode(LastInst->getParent());
17704 auto *NodeB = DT->getNode(I->getParent());
17705 assert(NodeA && "Should only process reachable instructions");
17706 assert(NodeB && "Should only process reachable instructions");
17707 assert((NodeA == NodeB) ==
17708 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17709 "Different nodes should have different DFS numbers");
17710 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17711 LastInst = I;
17712 }
17713 BB = LastInst->getParent();
17714 return LastInst;
17715 };
17716
17717 auto FindFirstInst = [&]() {
17718 Instruction *FirstInst = Front;
17719 for (Value *V : E->Scalars) {
17720 auto *I = dyn_cast<Instruction>(V);
17721 if (!I)
17722 continue;
17723 if (E->isCopyableElement(I))
17724 continue;
17725 if (FirstInst->getParent() == I->getParent()) {
17726 if (I->comesBefore(FirstInst))
17727 FirstInst = I;
17728 continue;
17729 }
17730 assert(((Opcode == Instruction::GetElementPtr &&
17731 !isa<GetElementPtrInst>(I)) ||
17732 (isVectorLikeInstWithConstOps(FirstInst) &&
17733 isVectorLikeInstWithConstOps(I))) &&
17734 "Expected vector-like or non-GEP in GEP node insts only.");
17735 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17736 FirstInst = I;
17737 continue;
17738 }
17739 if (!DT->isReachableFromEntry(I->getParent()))
17740 continue;
17741 auto *NodeA = DT->getNode(FirstInst->getParent());
17742 auto *NodeB = DT->getNode(I->getParent());
17743 assert(NodeA && "Should only process reachable instructions");
17744 assert(NodeB && "Should only process reachable instructions");
17745 assert((NodeA == NodeB) ==
17746 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17747 "Different nodes should have different DFS numbers");
17748 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17749 FirstInst = I;
17750 }
17751 return FirstInst;
17752 };
17753
17754 if (E->State == TreeEntry::SplitVectorize) {
17755 Res = FindLastInst();
17756 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17757 for (auto *E : Entries) {
17758 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17759 if (!I)
17760 I = &getLastInstructionInBundle(E);
17761 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17762 Res = I;
17763 }
17764 }
17765 EntryToLastInstruction.try_emplace(E, Res);
17766 return *Res;
17767 }
17768
17769 // Set the insert point for gathered loads to the very first load.
17770 if (GatheredLoadsEntriesFirst.has_value() &&
17771 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17772 Opcode == Instruction::Load) {
17773 Res = FindFirstInst();
17774 EntryToLastInstruction.try_emplace(E, Res);
17775 return *Res;
17776 }
17777
17778 // Set the insert point to the beginning of the basic block if the entry
17779 // should not be scheduled.
17780 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17781 if (E->isGather())
17782 return nullptr;
17783 // Found previously that the instructions do not need to be scheduled.
17784 const auto *It = BlocksSchedules.find(BB);
17785 if (It == BlocksSchedules.end())
17786 return nullptr;
17787 for (Value *V : E->Scalars) {
17788 auto *I = dyn_cast<Instruction>(V);
17789 if (!I || isa<PHINode>(I) ||
17790 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17791 continue;
17792 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17793 if (Bundles.empty())
17794 continue;
17795 const auto *It = find_if(
17796 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17797 if (It != Bundles.end())
17798 return *It;
17799 }
17800 return nullptr;
17801 };
17802 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17803 if (!E->isGather() && !Bundle) {
17804 if ((Opcode == Instruction::GetElementPtr &&
17805 any_of(E->Scalars,
17806 [](Value *V) {
17807 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17808 })) ||
17809 (all_of(E->Scalars,
17810 [&](Value *V) {
17811 return isa<PoisonValue>(V) ||
17812 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17813 E->isCopyableElement(V) ||
17814 (!isVectorLikeInstWithConstOps(V) &&
17815 isUsedOutsideBlock(V));
17816 }) &&
17817 (!E->doesNotNeedToSchedule() ||
17818 any_of(E->Scalars,
17819 [&](Value *V) {
17820 if (!isa<Instruction>(V) ||
17821 (E->hasCopyableElements() && E->isCopyableElement(V)))
17822 return false;
17823 return !areAllOperandsNonInsts(V);
17824 }) ||
17825 none_of(E->Scalars, [&](Value *V) {
17826 if (!isa<Instruction>(V) ||
17827 (E->hasCopyableElements() && E->isCopyableElement(V)))
17828 return false;
17829 return MustGather.contains(V);
17830 }))))
17831 Res = FindLastInst();
17832 else
17833 Res = FindFirstInst();
17834 EntryToLastInstruction.try_emplace(E, Res);
17835 return *Res;
17836 }
17837
17838 // Find the last instruction. The common case should be that BB has been
17839 // scheduled, and the last instruction is VL.back(). So we start with
17840 // VL.back() and iterate over schedule data until we reach the end of the
17841 // bundle. The end of the bundle is marked by null ScheduleData.
17842 if (Bundle) {
17843 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17844 Res = Bundle->getBundle().back()->getInst();
17845 EntryToLastInstruction.try_emplace(E, Res);
17846 return *Res;
17847 }
17848
17849 // LastInst can still be null at this point if there's either not an entry
17850 // for BB in BlocksSchedules or there's no ScheduleData available for
17851 // VL.back(). This can be the case if buildTreeRec aborts for various
17852 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17853 // size is reached, etc.). ScheduleData is initialized in the scheduling
17854 // "dry-run".
17855 //
17856 // If this happens, we can still find the last instruction by brute force. We
17857 // iterate forwards from Front (inclusive) until we either see all
17858 // instructions in the bundle or reach the end of the block. If Front is the
17859 // last instruction in program order, LastInst will be set to Front, and we
17860 // will visit all the remaining instructions in the block.
17861 //
17862 // One of the reasons we exit early from buildTreeRec is to place an upper
17863 // bound on compile-time. Thus, taking an additional compile-time hit here is
17864 // not ideal. However, this should be exceedingly rare since it requires that
17865 // we both exit early from buildTreeRec and that the bundle be out-of-order
17866 // (causing us to iterate all the way to the end of the block).
17867 if (!Res)
17868 Res = FindLastInst();
17869 assert(Res && "Failed to find last instruction in bundle");
17870 EntryToLastInstruction.try_emplace(E, Res);
17871 return *Res;
17872}
17873
17874void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17875 auto *Front = E->getMainOp();
17876 Instruction *LastInst = &getLastInstructionInBundle(E);
17877 assert(LastInst && "Failed to find last instruction in bundle");
17878 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17879 // If the instruction is a PHI, set the insert point after all the PHIs.
17880 bool IsPHI = isa<PHINode>(LastInst);
17881 if (IsPHI) {
17882 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17883 if (LastInstIt != LastInst->getParent()->end() &&
17884 LastInstIt->getParent()->isLandingPad())
17885 LastInstIt = std::next(LastInstIt);
17886 }
17887 if (IsPHI ||
17888 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17889 (E->doesNotNeedToSchedule() ||
17890 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
17891 isUsedOutsideBlock(LastInst)))) ||
17892 (GatheredLoadsEntriesFirst.has_value() &&
17893 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17894 E->getOpcode() == Instruction::Load)) {
17895 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17896 } else {
17897 // Set the insertion point after the last instruction in the bundle. Set the
17898 // debug location to Front.
17899 Builder.SetInsertPoint(
17900 LastInst->getParent(),
17901 LastInst->getNextNode()->getIterator());
17902 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
17903 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
17904 } else {
17905 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
17906 PoisonValue::get(Builder.getPtrTy()),
17907 MaybeAlign());
17908 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
17909 eraseInstruction(Res);
17910 LastInstructionToPos.try_emplace(LastInst, Res);
17911 }
17912 }
17913 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17914}
17915
17916Value *BoUpSLP::gather(
17917 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17918 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17919 // List of instructions/lanes from the current block and/or the blocks which are
17920 // part of the current loop. These instructions will be inserted at the end to
17921 // make it possible to optimize loops and hoist invariant instructions out of
17922 // the loop's body with better chances for success.
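// Illustrative example (names assumed): when gathering {%inv, %in_loop}, where
// %inv is defined outside the loop being processed and %in_loop inside it, the
// insertelement for %inv is emitted first and the one for %in_loop is appended
// last, so the %inv insert can later be hoisted out of the loop.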
17923 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17924 SmallSet<int, 4> PostponedIndices;
17925 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17926 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17927 SmallPtrSet<BasicBlock *, 4> Visited;
17928 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17929 InsertBB = InsertBB->getSinglePredecessor();
17930 return InsertBB && InsertBB == InstBB;
17931 };
17932 for (int I = 0, E = VL.size(); I < E; ++I) {
17933 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17934 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17935 isVectorized(Inst) ||
17936 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17937 PostponedIndices.insert(I).second)
17938 PostponedInsts.emplace_back(Inst, I);
17939 }
17940
17941 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17942 Type *Ty) {
17943 Value *Scalar = V;
17944 if (Scalar->getType() != Ty) {
17945 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17946 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17947 Value *V = Scalar;
17948 if (auto *CI = dyn_cast<CastInst>(Scalar);
17949 isa_and_present<SExtInst, ZExtInst>(CI)) {
17950 Value *Op = CI->getOperand(0);
17951 if (auto *IOp = dyn_cast<Instruction>(Op);
17952 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17953 V = Op;
17954 }
17955 Scalar = Builder.CreateIntCast(
17956 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17957 }
17958
17959 Instruction *InsElt;
17960 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17961 assert(SLPReVec && "FixedVectorType is not expected.");
17962 Vec =
17963 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17964 auto *II = dyn_cast<Instruction>(Vec);
17965 if (!II)
17966 return Vec;
17967 InsElt = II;
17968 } else {
17969 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17970 InsElt = dyn_cast<InsertElementInst>(Vec);
17971 if (!InsElt)
17972 return Vec;
17973 }
17974 GatherShuffleExtractSeq.insert(InsElt);
17975 CSEBlocks.insert(InsElt->getParent());
17976 // Add to our 'need-to-extract' list.
17977 if (isa<Instruction>(V)) {
17978 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17979 // Find which lane we need to extract.
17980 User *UserOp = nullptr;
17981 if (Scalar != V) {
17982 if (auto *SI = dyn_cast<Instruction>(Scalar))
17983 UserOp = SI;
17984 } else {
17985 if (V->getType()->isVectorTy()) {
17986 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17987 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17988 // Find the shufflevector caused by the resize.
17989 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17990 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17991 if (SV->getOperand(0) == V)
17992 return SV;
17993 if (SV->getOperand(1) == V)
17994 return SV;
17995 }
17996 return nullptr;
17997 };
17998 InsElt = nullptr;
17999 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18000 InsElt = User;
18001 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18002 InsElt = User;
18003 assert(InsElt &&
18004 "Failed to find shufflevector, caused by resize.");
18005 }
18006 }
18007 UserOp = InsElt;
18008 }
18009 if (UserOp) {
18010 unsigned FoundLane = Entries.front()->findLaneForValue(V);
18011 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
18012 }
18013 }
18014 }
18015 return Vec;
18016 };
18017 auto *VecTy = getWidenedType(ScalarTy, VL.size());
18018 Value *Vec = PoisonValue::get(VecTy);
18019 SmallVector<int> NonConsts;
18020 SmallVector<int> Mask(VL.size());
18021 std::iota(Mask.begin(), Mask.end(), 0);
18022 Value *OriginalRoot = Root;
18023 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
18024 SV && isa<PoisonValue>(SV->getOperand(1)) &&
18025 SV->getOperand(0)->getType() == VecTy) {
18026 Root = SV->getOperand(0);
18027 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18028 }
18029 // Insert constant values first.
18030 for (int I = 0, E = VL.size(); I < E; ++I) {
18031 if (PostponedIndices.contains(I))
18032 continue;
18033 if (!isConstant(VL[I])) {
18034 NonConsts.push_back(I);
18035 continue;
18036 }
18037 if (isa<PoisonValue>(VL[I]))
18038 continue;
18039 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18040 Mask[I] = I + E;
18041 }
18042 if (Root) {
18043 if (isa<PoisonValue>(Vec)) {
18044 Vec = OriginalRoot;
18045 } else {
18046 Vec = CreateShuffle(Root, Vec, Mask);
18047 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
18048 OI && OI->use_empty() &&
18049 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18050 return TE->VectorizedValue == OI;
18051 }))
18052 eraseInstruction(OI);
18053 }
18054 }
18055 // Insert non-constant values.
18056 for (int I : NonConsts)
18057 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18058 // Append instructions which are or may be part of the loop at the end, to make
18059 // it possible to hoist non-loop-based instructions.
18060 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18061 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18062
18063 return Vec;
18064}
18065
18066 /// Merges shuffle masks and emits the final shuffle instruction, if required.
18067 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
18068 /// emission: the actual shuffle instruction is generated only if it is really
18069 /// required. Otherwise, the shuffle instruction emission is delayed till the
18070 /// end of the process, to reduce the number of emitted instructions and further
18071 /// analysis/transformations.
18072 /// The class will also look through the previously emitted shuffle instructions
18073 /// and properly mark indices in the mask as undef.
18074/// For example, given the code
18075/// \code
18076/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
18077/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
18078/// \endcode
18079 /// and if we need to emit a shuffle of %s1 and %s2 with the mask <1, 0, 3, 2>, it will
18080/// look through %s1 and %s2 and emit
18081/// \code
18082/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18083/// \endcode
18084/// instead.
18085 /// If the 2 operands are of different sizes, the smaller one will be resized and
18086 /// the mask recalculated accordingly.
18087/// For example, given the code
18088/// \code
18089/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
18090/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
18091/// \endcode
18092 /// and if we need to emit a shuffle of %s1 and %s2 with the mask <1, 0, 5, 4>, it will
18093/// look through %s1 and %s2 and emit
18094/// \code
18095/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18096/// \endcode
18097/// instead.
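/// A minimal illustrative use of the builder (simplified from the call sites
/// further below): operands and masks are queued via add() and the combined
/// shuffle is only materialized by finalize().
/// \code
///   ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, R);
///   ShuffleBuilder.add(*FrontTE, Mask);
///   Value *Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
/// \endcode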
18098class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
18099 bool IsFinalized = false;
18100 /// Combined mask for all applied operands and masks. It is built during
18101 /// analysis and actual emission of shuffle vector instructions.
18102 SmallVector<int> CommonMask;
18103 /// List of operands for the shuffle vector instruction. It holds at most 2
18104 /// operands; if a 3rd one is going to be added, the first 2 are combined into
18105 /// a shuffle with the \p CommonMask mask, the first operand is set to the
18106 /// resulting shuffle and the second operand is set to the newly added
18107 /// operand. The \p CommonMask is transformed in the proper way after that.
18108 SmallVector<Value *, 2> InVectors;
18109 IRBuilderBase &Builder;
18110 BoUpSLP &R;
18111
18112 class ShuffleIRBuilder {
18113 IRBuilderBase &Builder;
18114 /// Holds all of the instructions that we gathered.
18115 SetVector<Instruction *> &GatherShuffleExtractSeq;
18116 /// A list of blocks that we are going to CSE.
18117 DenseSet<BasicBlock *> &CSEBlocks;
18118 /// Data layout.
18119 const DataLayout &DL;
18120
18121 public:
18122 ShuffleIRBuilder(IRBuilderBase &Builder,
18123 SetVector<Instruction *> &GatherShuffleExtractSeq,
18124 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
18125 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18126 CSEBlocks(CSEBlocks), DL(DL) {}
18127 ~ShuffleIRBuilder() = default;
18128 /// Creates shufflevector for the 2 operands with the given mask.
18129 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
18130 if (V1->getType() != V2->getType()) {
18131 assert(V2->getType()->isIntOrIntVectorTy() &&
18132 V1->getType()->isIntOrIntVectorTy() &&
18133 "Expected integer vector types only.");
18134 if (V1->getType() != V2->getType()) {
18135 if (cast<VectorType>(V2->getType())
18136 ->getElementType()
18137 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
18138 ->getElementType()
18139 ->getIntegerBitWidth())
18140 V2 = Builder.CreateIntCast(
18141 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
18142 else
18143 V1 = Builder.CreateIntCast(
18144 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
18145 }
18146 }
18147 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18148 if (auto *I = dyn_cast<Instruction>(Vec)) {
18149 GatherShuffleExtractSeq.insert(I);
18150 CSEBlocks.insert(I->getParent());
18151 }
18152 return Vec;
18153 }
18154 /// Creates a permutation of the single vector operand with the given mask, if
18155 /// it is not an identity mask.
18156 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18157 if (Mask.empty())
18158 return V1;
18159 unsigned VF = Mask.size();
18160 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18161 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18162 return V1;
18163 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18164 if (auto *I = dyn_cast<Instruction>(Vec)) {
18165 GatherShuffleExtractSeq.insert(I);
18166 CSEBlocks.insert(I->getParent());
18167 }
18168 return Vec;
18169 }
18170 Value *createIdentity(Value *V) { return V; }
18171 Value *createPoison(Type *Ty, unsigned VF) {
18172 return PoisonValue::get(getWidenedType(Ty, VF));
18173 }
18174 /// Resizes 2 input vectors to match their sizes, if they are not equal yet.
18175 /// The smaller vector is resized to the size of the larger vector.
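/// For illustration (assumed operand shapes): given V1 of type <2 x i32> and
/// V2 of type <4 x i32>, V1 is widened with the mask <0, 1, poison, poison>
/// so that both operands have type <4 x i32> and can feed a single
/// shufflevector.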
18176 void resizeToMatch(Value *&V1, Value *&V2) {
18177 if (V1->getType() == V2->getType())
18178 return;
18179 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18180 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18181 int VF = std::max(V1VF, V2VF);
18182 int MinVF = std::min(V1VF, V2VF);
18183 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18184 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18185 0);
18186 Value *&Op = MinVF == V1VF ? V1 : V2;
18187 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18188 if (auto *I = dyn_cast<Instruction>(Op)) {
18189 GatherShuffleExtractSeq.insert(I);
18190 CSEBlocks.insert(I->getParent());
18191 }
18192 if (MinVF == V1VF)
18193 V1 = Op;
18194 else
18195 V2 = Op;
18196 }
18197 };
18198
18199 /// Smart shuffle instruction emission, walks through shuffles trees and
18200 /// tries to find the best matching vector for the actual shuffle
18201 /// instruction.
18202 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18203 assert(V1 && "Expected at least one vector value.");
18204 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18205 R.CSEBlocks, *R.DL);
18206 return BaseShuffleAnalysis::createShuffle<Value *>(
18207 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18208 }
18209
18210 /// Cast value \p V to the vector type with the same number of elements, but
18211 /// the base type \p ScalarTy.
18212 Value *castToScalarTyElem(Value *V,
18213 std::optional<bool> IsSigned = std::nullopt) {
18214 auto *VecTy = cast<VectorType>(V->getType());
18215 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18216 if (VecTy->getElementType() == ScalarTy->getScalarType())
18217 return V;
18218 return Builder.CreateIntCast(
18219 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18220 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18221 }
18222
18223 Value *getVectorizedValue(const TreeEntry &E) {
18224 Value *Vec = E.VectorizedValue;
18225 if (!Vec->getType()->isIntOrIntVectorTy())
18226 return Vec;
18227 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18228 return !isa<PoisonValue>(V) &&
18229 !isKnownNonNegative(
18230 V, SimplifyQuery(*R.DL));
18231 }));
18232 }
18233
18234public:
18235 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18236 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18237
18238 /// Adjusts extractelements after reusing them.
18239 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18240 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18241 unsigned NumParts, bool &UseVecBaseAsInput) {
18242 UseVecBaseAsInput = false;
18243 SmallPtrSet<Value *, 4> UniqueBases;
18244 Value *VecBase = nullptr;
18245 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18246 if (!E->ReorderIndices.empty()) {
18247 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18248 E->ReorderIndices.end());
18249 reorderScalars(VL, ReorderMask);
18250 }
18251 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18252 int Idx = Mask[I];
18253 if (Idx == PoisonMaskElem)
18254 continue;
18255 auto *EI = cast<ExtractElementInst>(VL[I]);
18256 VecBase = EI->getVectorOperand();
18257 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18258 VecBase = TEs.front()->VectorizedValue;
18259 assert(VecBase && "Expected vectorized value.");
18260 UniqueBases.insert(VecBase);
18261 // If the only one use is vectorized - can delete the extractelement
18262 // itself.
18263 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18264 (NumParts != 1 && count(VL, EI) > 1) ||
18265 any_of(EI->users(), [&](User *U) {
18266 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18267 return UTEs.empty() || UTEs.size() > 1 ||
18268 (isa<GetElementPtrInst>(U) &&
18269 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18270 (!UTEs.empty() &&
18271 count_if(R.VectorizableTree,
18272 [&](const std::unique_ptr<TreeEntry> &TE) {
18273 return TE->UserTreeIndex.UserTE ==
18274 UTEs.front() &&
18275 is_contained(VL, EI);
18276 }) != 1);
18277 }))
18278 continue;
18279 R.eraseInstruction(EI);
18280 }
18281 if (NumParts == 1 || UniqueBases.size() == 1) {
18282 assert(VecBase && "Expected vectorized value.");
18283 return castToScalarTyElem(VecBase);
18284 }
18285 UseVecBaseAsInput = true;
18286 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18287 for (auto [I, Idx] : enumerate(Mask))
18288 if (Idx != PoisonMaskElem)
18289 Idx = I;
18290 };
18291 // Perform a multi-register vector shuffle, joining the parts into a single
18292 // virtual long vector.
18293 // Need to shuffle each part independently and then insert all these parts
18294 // into a long virtual vector register, forming the original vector.
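// For example (illustrative sizes): with NumParts == 2 and an 8-lane mask,
// each 4-lane slice is first shuffled from its own (at most 2) extract bases,
// and the resulting sub-vectors are then combined by a two-source shuffle
// into the final wide vector.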
18295 Value *Vec = nullptr;
18296 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18297 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18298 for (unsigned Part : seq<unsigned>(NumParts)) {
18299 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18300 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18301 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18302 constexpr int MaxBases = 2;
18303 SmallVector<Value *, MaxBases> Bases(MaxBases);
18304 auto VLMask = zip(SubVL, SubMask);
18305 const unsigned VF = std::accumulate(
18306 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18307 if (std::get<1>(D) == PoisonMaskElem)
18308 return S;
18309 Value *VecOp =
18310 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18311 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18312 !TEs.empty())
18313 VecOp = TEs.front()->VectorizedValue;
18314 assert(VecOp && "Expected vectorized value.");
18315 const unsigned Size =
18316 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18317 return std::max(S, Size);
18318 });
18319 for (const auto [V, I] : VLMask) {
18320 if (I == PoisonMaskElem)
18321 continue;
18322 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18323 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18324 VecOp = TEs.front()->VectorizedValue;
18325 assert(VecOp && "Expected vectorized value.");
18326 VecOp = castToScalarTyElem(VecOp);
18327 Bases[I / VF] = VecOp;
18328 }
18329 if (!Bases.front())
18330 continue;
18331 Value *SubVec;
18332 if (Bases.back()) {
18333 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18334 TransformToIdentity(SubMask);
18335 } else {
18336 SubVec = Bases.front();
18337 }
18338 if (!Vec) {
18339 Vec = SubVec;
18340 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18341 [&](unsigned P) {
18342 ArrayRef<int> SubMask =
18343 Mask.slice(P * SliceSize,
18344 getNumElems(Mask.size(),
18345 SliceSize, P));
18346 return all_of(SubMask, [](int Idx) {
18347 return Idx == PoisonMaskElem;
18348 });
18349 })) &&
18350 "Expected first part or all previous parts masked.");
18351 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18352 } else {
18353 unsigned NewVF =
18354 cast<FixedVectorType>(Vec->getType())->getNumElements();
18355 if (Vec->getType() != SubVec->getType()) {
18356 unsigned SubVecVF =
18357 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18358 NewVF = std::max(NewVF, SubVecVF);
18359 }
18360 // Adjust SubMask.
18361 for (int &Idx : SubMask)
18362 if (Idx != PoisonMaskElem)
18363 Idx += NewVF;
18364 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18365 Vec = createShuffle(Vec, SubVec, VecMask);
18366 TransformToIdentity(VecMask);
18367 }
18368 }
18369 copy(VecMask, Mask.begin());
18370 return Vec;
18371 }
18372 /// Checks if the specified entry \p E needs to be delayed because of its
18373 /// dependency nodes.
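/// If some dependency has not been vectorized yet, a temporary aligned load
/// from a poison pointer is returned as a placeholder; the real gather is
/// emitted later, once the dependent entries are available.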
18374 std::optional<Value *>
18375 needToDelay(const TreeEntry *E,
18376 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18377 // No need to delay emission if all deps are ready.
18378 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18379 return all_of(
18380 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18381 }))
18382 return std::nullopt;
18383 // Postpone gather emission, will be emitted after the end of the
18384 // process to keep correct order.
18385 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18386 return Builder.CreateAlignedLoad(
18387 ResVecTy,
18388 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18389 MaybeAlign());
18390 }
18391 /// Reset the builder to handle perfect diamond match.
18392 void resetForSameNode() {
18393 IsFinalized = false;
18394 CommonMask.clear();
18395 InVectors.clear();
18396 }
18397 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18398 /// shuffling.
18399 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18400 Value *V1 = getVectorizedValue(E1);
18401 Value *V2 = getVectorizedValue(E2);
18402 add(V1, V2, Mask);
18403 }
18404 /// Adds single input vector (in form of tree entry) and the mask for its
18405 /// shuffling.
18406 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18407 Value *V1 = getVectorizedValue(E1);
18408 add(V1, Mask);
18409 }
18410 /// Adds 2 input vectors and the mask for their shuffling.
18411 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18412 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18415 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18416 V1 = castToScalarTyElem(V1);
18417 V2 = castToScalarTyElem(V2);
18418 if (InVectors.empty()) {
18419 InVectors.push_back(V1);
18420 InVectors.push_back(V2);
18421 CommonMask.assign(Mask.begin(), Mask.end());
18422 return;
18423 }
18424 Value *Vec = InVectors.front();
18425 if (InVectors.size() == 2) {
18426 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18427 transformMaskAfterShuffle(CommonMask, CommonMask);
18428 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18429 Mask.size()) {
18430 Vec = createShuffle(Vec, nullptr, CommonMask);
18431 transformMaskAfterShuffle(CommonMask, CommonMask);
18432 }
18433 V1 = createShuffle(V1, V2, Mask);
18434 unsigned VF = std::max(getVF(V1), getVF(Vec));
18435 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18436 if (Mask[Idx] != PoisonMaskElem)
18437 CommonMask[Idx] = Idx + VF;
18438 InVectors.front() = Vec;
18439 if (InVectors.size() == 2)
18440 InVectors.back() = V1;
18441 else
18442 InVectors.push_back(V1);
18443 }
18444 /// Adds another one input vector and the mask for the shuffling.
18445 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18447 "castToScalarTyElem expects V1 to be FixedVectorType");
18448 V1 = castToScalarTyElem(V1);
18449 if (InVectors.empty()) {
18450 InVectors.push_back(V1);
18451 CommonMask.assign(Mask.begin(), Mask.end());
18452 return;
18453 }
18454 const auto *It = find(InVectors, V1);
18455 if (It == InVectors.end()) {
18456 if (InVectors.size() == 2 ||
18457 InVectors.front()->getType() != V1->getType()) {
18458 Value *V = InVectors.front();
18459 if (InVectors.size() == 2) {
18460 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18461 transformMaskAfterShuffle(CommonMask, CommonMask);
18462 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18463 CommonMask.size()) {
18464 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18465 transformMaskAfterShuffle(CommonMask, CommonMask);
18466 }
18467 unsigned VF = std::max(CommonMask.size(), Mask.size());
18468 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18469 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18470 CommonMask[Idx] = V->getType() != V1->getType()
18471 ? Idx + VF
18472 : Mask[Idx] + getVF(V1);
18473 if (V->getType() != V1->getType())
18474 V1 = createShuffle(V1, nullptr, Mask);
18475 InVectors.front() = V;
18476 if (InVectors.size() == 2)
18477 InVectors.back() = V1;
18478 else
18479 InVectors.push_back(V1);
18480 return;
18481 }
18482 // Check if second vector is required if the used elements are already
18483 // used from the first one.
18484 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18485 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18486 InVectors.push_back(V1);
18487 break;
18488 }
18489 }
18490 unsigned VF = 0;
18491 for (Value *V : InVectors)
18492 VF = std::max(VF, getVF(V));
18493 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18494 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18495 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18496 }
18497 /// Adds another one input vector and the mask for the shuffling.
18498 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18499 SmallVector<int> NewMask;
18500 inversePermutation(Order, NewMask);
18501 add(V1, NewMask);
18502 }
18503 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18504 Value *Root = nullptr) {
18505 return R.gather(VL, Root, ScalarTy,
18506 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18507 return createShuffle(V1, V2, Mask);
18508 });
18509 }
18510 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18511 /// Finalize emission of the shuffles.
18512 /// \param Action the action (if any) to be performed before final applying of
18513 /// the \p ExtMask mask.
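/// A minimal illustrative call, as used at the gather call sites below:
/// \code
///   Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
///                                 SubVectorsMask);
/// \endcode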
18514 Value *finalize(
18515 ArrayRef<int> ExtMask,
18516 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18517 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18518 function_ref<void(Value *&, SmallVectorImpl<int> &,
18519 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18520 Action = {}) {
18521 IsFinalized = true;
18522 if (Action) {
18523 Value *Vec = InVectors.front();
18524 if (InVectors.size() == 2) {
18525 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18526 InVectors.pop_back();
18527 } else {
18528 Vec = createShuffle(Vec, nullptr, CommonMask);
18529 }
18530 transformMaskAfterShuffle(CommonMask, CommonMask);
18531 assert(VF > 0 &&
18532 "Expected vector length for the final value before action.");
18533 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18534 if (VecVF < VF) {
18535 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18536 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18537 Vec = createShuffle(Vec, nullptr, ResizeMask);
18538 }
18539 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18540 return createShuffle(V1, V2, Mask);
18541 });
18542 InVectors.front() = Vec;
18543 }
18544 if (!SubVectors.empty()) {
18545 Value *Vec = InVectors.front();
18546 if (InVectors.size() == 2) {
18547 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18548 InVectors.pop_back();
18549 } else {
18550 Vec = createShuffle(Vec, nullptr, CommonMask);
18551 }
18552 transformMaskAfterShuffle(CommonMask, CommonMask);
18553 auto CreateSubVectors = [&](Value *Vec,
18554 SmallVectorImpl<int> &CommonMask) {
18555 for (auto [E, Idx] : SubVectors) {
18556 Value *V = getVectorizedValue(*E);
18557 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18558 // Use the scalar version of ScalarTy to correctly handle shuffles
18559 // for revectorization. The revectorization mode operates on the
18560 // vectors, but here we need to operate on the scalars, because the
18561 // masks were already transformed for the vector elements and we don't
18562 // need to do this transformation again.
18563 Type *OrigScalarTy = ScalarTy;
18564 ScalarTy = ScalarTy->getScalarType();
18565 Vec = createInsertVector(
18566 Builder, Vec, V, InsertionIndex,
18567 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18568 _3));
18569 ScalarTy = OrigScalarTy;
18570 if (!CommonMask.empty()) {
18571 std::iota(std::next(CommonMask.begin(), Idx),
18572 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18573 Idx);
18574 }
18575 }
18576 return Vec;
18577 };
18578 if (SubVectorsMask.empty()) {
18579 Vec = CreateSubVectors(Vec, CommonMask);
18580 } else {
18581 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18582 copy(SubVectorsMask, SVMask.begin());
18583 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18584 if (I2 != PoisonMaskElem) {
18585 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18586 I1 = I2 + CommonMask.size();
18587 }
18588 }
18589 Value *InsertVec =
18590 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18591 Vec = createShuffle(InsertVec, Vec, SVMask);
18592 transformMaskAfterShuffle(CommonMask, SVMask);
18593 }
18594 InVectors.front() = Vec;
18595 }
18596
18597 if (!ExtMask.empty()) {
18598 if (CommonMask.empty()) {
18599 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18600 } else {
18601 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18602 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18603 if (ExtMask[I] == PoisonMaskElem)
18604 continue;
18605 NewMask[I] = CommonMask[ExtMask[I]];
18606 }
18607 CommonMask.swap(NewMask);
18608 }
18609 }
18610 if (CommonMask.empty()) {
18611 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18612 return InVectors.front();
18613 }
18614 if (InVectors.size() == 2)
18615 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18616 return createShuffle(InVectors.front(), nullptr, CommonMask);
18617 }
18618
18619 ~ShuffleInstructionBuilder() {
18620 assert((IsFinalized || CommonMask.empty()) &&
18621 "Shuffle construction must be finalized.");
18622 }
18623};
18624
18625Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18626 return vectorizeTree(getOperandEntry(E, NodeIdx));
18627}
18628
18629template <typename BVTy, typename ResTy, typename... Args>
18630ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18631 Args &...Params) {
18632 assert(E->isGather() && "Expected gather node.");
18633 unsigned VF = E->getVectorFactor();
18634
18635 bool NeedFreeze = false;
18636 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18637 // Clear values, to be replaced by insertvector instructions.
18638 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18639 for_each(MutableArrayRef(GatheredScalars)
18640 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18641 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18642 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18643 E->CombinedEntriesWithIndices.size());
18644 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18645 [&](const auto &P) {
18646 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18647 });
18648 // Build a mask out of the reorder indices and reorder scalars per this
18649 // mask.
18650 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18651 E->ReorderIndices.end());
18652 if (!ReorderMask.empty())
18653 reorderScalars(GatheredScalars, ReorderMask);
18654 SmallVector<int> SubVectorsMask;
18655 inversePermutation(E->ReorderIndices, SubVectorsMask);
18656 // Transform non-clustered elements in the mask to poison (-1).
18657 // "Clustered" operations will be reordered using this mask later.
18658 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18659 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18660 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18661 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18662 } else {
18663 SubVectorsMask.clear();
18664 }
18665 SmallVector<Value *> StoredGS(GatheredScalars);
18666 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18667 unsigned I, unsigned SliceSize,
18668 bool IsNotPoisonous) {
18669 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18670 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18671 }))
18672 return false;
18673 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18674 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18675 if (UserTE->getNumOperands() != 2)
18676 return false;
18677 if (!IsNotPoisonous) {
18678 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18679 [=](const std::unique_ptr<TreeEntry> &TE) {
18680 return TE->UserTreeIndex.UserTE == UserTE &&
18681 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18682 });
18683 if (It == VectorizableTree.end())
18684 return false;
18685 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18686 if (!(*It)->ReorderIndices.empty()) {
18687 inversePermutation((*It)->ReorderIndices, ReorderMask);
18688 reorderScalars(GS, ReorderMask);
18689 }
18690 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18691 Value *V0 = std::get<0>(P);
18692 Value *V1 = std::get<1>(P);
18693 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18694 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18695 is_contained(E->Scalars, V1));
18696 }))
18697 return false;
18698 }
18699 int Idx;
18700 if ((Mask.size() < InputVF &&
18701 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18702 Idx == 0) ||
18703 (Mask.size() == InputVF &&
18704 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18705 std::iota(
18706 std::next(Mask.begin(), I * SliceSize),
18707 std::next(Mask.begin(),
18708 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18709 0);
18710 } else {
18711 unsigned IVal =
18712 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18713 std::fill(
18714 std::next(Mask.begin(), I * SliceSize),
18715 std::next(Mask.begin(),
18716 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18717 IVal);
18718 }
18719 return true;
18720 };
18721 BVTy ShuffleBuilder(ScalarTy, Params...);
18722 ResTy Res = ResTy();
18723 SmallVector<int> Mask;
18724 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18725 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18726 Value *ExtractVecBase = nullptr;
18727 bool UseVecBaseAsInput = false;
18728 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18729 SmallVector<SmallVector<const TreeEntry *>> Entries;
18730 Type *OrigScalarTy = GatheredScalars.front()->getType();
18731 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18732 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18733 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18734 // Check for gathered extracts.
18735 bool Resized = false;
18736 ExtractShuffles =
18737 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18738 if (!ExtractShuffles.empty()) {
18739 SmallVector<const TreeEntry *> ExtractEntries;
18740 for (auto [Idx, I] : enumerate(ExtractMask)) {
18741 if (I == PoisonMaskElem)
18742 continue;
18743 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18744 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18745 !TEs.empty())
18746 ExtractEntries.append(TEs.begin(), TEs.end());
18747 }
18748 if (std::optional<ResTy> Delayed =
18749 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18750 // Delay emission of gathers which are not ready yet.
18751 PostponedGathers.insert(E);
18752 // Postpone gather emission, will be emitted after the end of the
18753 // process to keep correct order.
18754 return *Delayed;
18755 }
18756 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18757 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18758 ExtractVecBase = VecBase;
18759 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18760 if (VF == VecBaseTy->getNumElements() &&
18761 GatheredScalars.size() != VF) {
18762 Resized = true;
18763 GatheredScalars.append(VF - GatheredScalars.size(),
18764 PoisonValue::get(OrigScalarTy));
18765 NumParts =
18766 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18767 }
18768 }
18769 }
18770 // Gather extracts after we check for fully matched gathers only.
18771 if (!ExtractShuffles.empty() || !E->hasState() ||
18772 E->getOpcode() != Instruction::Load ||
18773 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18774 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18775 any_of(E->Scalars,
18776 [this](Value *V) {
18777 return isa<LoadInst>(V) && isVectorized(V);
18778 })) ||
18779 (E->hasState() && E->isAltShuffle()) ||
18780 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18781 isSplat(E->Scalars) ||
18782 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18783 GatherShuffles =
18784 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18785 }
18786 if (!GatherShuffles.empty()) {
18787 if (std::optional<ResTy> Delayed =
18788 ShuffleBuilder.needToDelay(E, Entries)) {
18789 // Delay emission of gathers which are not ready yet.
18790 PostponedGathers.insert(E);
18791 // Postpone gather emission, will be emitted after the end of the
18792 // process to keep correct order.
18793 return *Delayed;
18794 }
18795 if (GatherShuffles.size() == 1 &&
18796 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18797 Entries.front().front()->isSame(E->Scalars)) {
18798 // Perfect match in the graph, will reuse the previously vectorized
18799 // node. Cost is 0.
18800 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18801 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18802 // Restore the mask for previous partially matched values.
18803 Mask.resize(E->Scalars.size());
18804 const TreeEntry *FrontTE = Entries.front().front();
18805 if (FrontTE->ReorderIndices.empty() &&
18806 ((FrontTE->ReuseShuffleIndices.empty() &&
18807 E->Scalars.size() == FrontTE->Scalars.size()) ||
18808 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18809 std::iota(Mask.begin(), Mask.end(), 0);
18810 } else {
18811 for (auto [I, V] : enumerate(E->Scalars)) {
18812 if (isa<PoisonValue>(V)) {
18813 Mask[I] = PoisonMaskElem;
18814 continue;
18815 }
18816 Mask[I] = FrontTE->findLaneForValue(V);
18817 }
18818 }
18819 // Reset the builder(s) to correctly handle perfect diamond matched
18820 // nodes.
18821 ShuffleBuilder.resetForSameNode();
18822 ShuffleBuilder.add(*FrontTE, Mask);
18823 // Fully matched entry found, no need to insert subvectors.
18824 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18825 return Res;
18826 }
18827 if (!Resized) {
18828 if (GatheredScalars.size() != VF &&
18829 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18830 return any_of(TEs, [&](const TreeEntry *TE) {
18831 return TE->getVectorFactor() == VF;
18832 });
18833 }))
18834 GatheredScalars.append(VF - GatheredScalars.size(),
18835 PoisonValue::get(OrigScalarTy));
18836 }
18837 // Remove shuffled elements from list of gathers.
18838 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18839 if (Mask[I] != PoisonMaskElem)
18840 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18841 }
18842 }
18843 }
18844 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18845 SmallVectorImpl<int> &ReuseMask,
18846 bool IsRootPoison) {
18847 // For splats we can emit broadcasts instead of gathers, so try to find
18848 // such sequences.
18849 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18850 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18851 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18852 SmallVector<int> UndefPos;
18853 DenseMap<Value *, unsigned> UniquePositions;
18854 // Gather unique non-const values and all constant values.
18855 // For repeated values, just shuffle them.
18856 int NumNonConsts = 0;
18857 int SinglePos = 0;
18858 for (auto [I, V] : enumerate(Scalars)) {
18859 if (isa<UndefValue>(V)) {
18860 if (!isa<PoisonValue>(V)) {
18861 ReuseMask[I] = I;
18862 UndefPos.push_back(I);
18863 }
18864 continue;
18865 }
18866 if (isConstant(V)) {
18867 ReuseMask[I] = I;
18868 continue;
18869 }
18870 ++NumNonConsts;
18871 SinglePos = I;
18872 Value *OrigV = V;
18873 Scalars[I] = PoisonValue::get(OrigScalarTy);
18874 if (IsSplat) {
18875 Scalars.front() = OrigV;
18876 ReuseMask[I] = 0;
18877 } else {
18878 const auto Res = UniquePositions.try_emplace(OrigV, I);
18879 Scalars[Res.first->second] = OrigV;
18880 ReuseMask[I] = Res.first->second;
18881 }
18882 }
18883 if (NumNonConsts == 1) {
18884 // Restore single insert element.
18885 if (IsSplat) {
18886 ReuseMask.assign(VF, PoisonMaskElem);
18887 std::swap(Scalars.front(), Scalars[SinglePos]);
18888 if (!UndefPos.empty() && UndefPos.front() == 0)
18889 Scalars.front() = UndefValue::get(OrigScalarTy);
18890 }
18891 ReuseMask[SinglePos] = SinglePos;
18892 } else if (!UndefPos.empty() && IsSplat) {
18893 // For undef values, try to replace them with a simple broadcast.
18894 // We can do it if the broadcasted value is guaranteed to be
18895 // non-poisonous, or by freezing the incoming scalar value first.
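// Illustrative IR sketch (assumed types/names): gathering the splat value %x
// into a 4-lane vector becomes
//   %bv = insertelement <4 x i32> poison, i32 %x, i32 0
//   %splat = shufflevector <4 x i32> %bv, <4 x i32> poison, <4 x i32> zeroinitializer
// and, if no guaranteed-non-poison scalar is found, the result is frozen
// afterwards (NeedFreeze).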
18896 auto *It = find_if(Scalars, [this, E](Value *V) {
18897 return !isa<UndefValue>(V) &&
18898 (isGuaranteedNotToBePoison(V, AC) ||
18899 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18900 // Check if the value already used in the same operation in
18901 // one of the nodes already.
18902 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18903 is_contained(E->UserTreeIndex.UserTE->Scalars,
18904 U.getUser());
18905 })));
18906 });
18907 if (It != Scalars.end()) {
18908 // Replace undefs by the non-poisoned scalars and emit broadcast.
18909 int Pos = std::distance(Scalars.begin(), It);
18910 for (int I : UndefPos) {
18911 // Set the undef position to the non-poisoned scalar.
18912 ReuseMask[I] = Pos;
18913 // Replace the undef by poison; in the mask it is already replaced by
18914 // the non-poisoned scalar.
18915 if (I != Pos)
18916 Scalars[I] = PoisonValue::get(OrigScalarTy);
18917 }
18918 } else {
18919 // Replace undefs by the poisons, emit broadcast and then emit
18920 // freeze.
18921 for (int I : UndefPos) {
18922 ReuseMask[I] = PoisonMaskElem;
18923 if (isa<UndefValue>(Scalars[I]))
18924 Scalars[I] = PoisonValue::get(OrigScalarTy);
18925 }
18926 NeedFreeze = true;
18927 }
18928 }
18929 };
18930 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18931 bool IsNonPoisoned = true;
18932 bool IsUsedInExpr = true;
18933 Value *Vec1 = nullptr;
18934 if (!ExtractShuffles.empty()) {
18935 // A gather of extractelements can be represented as just a shuffle of
18936 // one or two vectors the scalars are extracted from.
18937 // Find input vectors.
18938 Value *Vec2 = nullptr;
18939 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18940 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18941 ExtractMask[I] = PoisonMaskElem;
18942 }
18943 if (UseVecBaseAsInput) {
18944 Vec1 = ExtractVecBase;
18945 } else {
18946 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18947 if (ExtractMask[I] == PoisonMaskElem)
18948 continue;
18949 if (isa<UndefValue>(StoredGS[I]))
18950 continue;
18951 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18952 Value *VecOp = EI->getVectorOperand();
18953 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18954 !TEs.empty() && TEs.front()->VectorizedValue)
18955 VecOp = TEs.front()->VectorizedValue;
18956 if (!Vec1) {
18957 Vec1 = VecOp;
18958 } else if (Vec1 != VecOp) {
18959 assert((!Vec2 || Vec2 == VecOp) &&
18960 "Expected only 1 or 2 vectors shuffle.");
18961 Vec2 = VecOp;
18962 }
18963 }
18964 }
18965 if (Vec2) {
18966 IsUsedInExpr = false;
18967 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18968 isGuaranteedNotToBePoison(Vec2, AC);
18969 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18970 } else if (Vec1) {
18971 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18972 IsUsedInExpr &= FindReusedSplat(
18973 ExtractMask,
18974 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18975 ExtractMask.size(), IsNotPoisonedVec);
18976 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18977 IsNonPoisoned &= IsNotPoisonedVec;
18978 } else {
18979 IsUsedInExpr = false;
18980 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18981 /*ForExtracts=*/true);
18982 }
18983 }
18984 if (!GatherShuffles.empty()) {
18985 unsigned SliceSize =
18986 getPartNumElems(E->Scalars.size(),
18987 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18988 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18989 for (const auto [I, TEs] : enumerate(Entries)) {
18990 if (TEs.empty()) {
18991 assert(!GatherShuffles[I] &&
18992 "No shuffles with empty entries list expected.");
18993 continue;
18994 }
18995 assert((TEs.size() == 1 || TEs.size() == 2) &&
18996 "Expected shuffle of 1 or 2 entries.");
18997 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18998 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18999 VecMask.assign(VecMask.size(), PoisonMaskElem);
19000 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19001 if (TEs.size() == 1) {
19002 bool IsNotPoisonedVec =
19003 TEs.front()->VectorizedValue
19004 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
19005 : true;
19006 IsUsedInExpr &=
19007 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19008 SliceSize, IsNotPoisonedVec);
19009 ShuffleBuilder.add(*TEs.front(), VecMask);
19010 IsNonPoisoned &= IsNotPoisonedVec;
19011 } else {
19012 IsUsedInExpr = false;
19013 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19014 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19015 IsNonPoisoned &=
19016 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
19017 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
19018 }
19019 }
19020 }
19021 // Try to figure out the best way to combine values: build a shuffle and
19022 // insert elements, or just build several shuffles.
19023 // Insert non-constant scalars.
19024 SmallVector<Value *> NonConstants(GatheredScalars);
19025 int EMSz = ExtractMask.size();
19026 int MSz = Mask.size();
19027 // Try to build a constant vector and shuffle with it only if we currently
19028 // have a single permutation and more than 1 scalar constant.
19029 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19030 bool IsIdentityShuffle =
19031 ((UseVecBaseAsInput ||
19032 all_of(ExtractShuffles,
19033 [](const std::optional<TTI::ShuffleKind> &SK) {
19034 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19035 TTI::SK_PermuteSingleSrc;
19036 })) &&
19037 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19038 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
19039 (!GatherShuffles.empty() &&
19040 all_of(GatherShuffles,
19041 [](const std::optional<TTI::ShuffleKind> &SK) {
19042 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19043 TTI::SK_PermuteSingleSrc;
19044 }) &&
19045 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19046 ShuffleVectorInst::isIdentityMask(Mask, MSz));
19047 bool EnoughConstsForShuffle =
19048 IsSingleShuffle &&
19049 (none_of(GatheredScalars,
19050 [](Value *V) {
19051 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19052 }) ||
19053 any_of(GatheredScalars,
19054 [](Value *V) {
19055 return isa<Constant>(V) && !isa<UndefValue>(V);
19056 })) &&
19057 (!IsIdentityShuffle ||
19058 (GatheredScalars.size() == 2 &&
19059 any_of(GatheredScalars,
19060 [](Value *V) { return !isa<UndefValue>(V); })) ||
19061 count_if(GatheredScalars, [](Value *V) {
19062 return isa<Constant>(V) && !isa<PoisonValue>(V);
19063 }) > 1);
19064 // The NonConstants array contains just non-constant values, GatheredScalars
19065 // contains only constants to build the final vector and then shuffle.
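// E.g. (illustrative): for gathered scalars {%x, 3, poison, 5} with enough
// constants, GatheredScalars becomes {poison, 3, poison, 5} (constants only)
// and NonConstants becomes {%x, poison, poison, poison}; the two are then
// combined by a final shuffle.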
19066 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19067 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19068 NonConstants[I] = PoisonValue::get(OrigScalarTy);
19069 else
19070 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19071 }
19072 // Generate constants for final shuffle and build a mask for them.
19073 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
19074 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19075 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19076 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
19077 ShuffleBuilder.add(BV, BVMask);
19078 }
19079 if (all_of(NonConstants, [=](Value *V) {
19080 return isa<PoisonValue>(V) ||
19081 (IsSingleShuffle && ((IsIdentityShuffle &&
19082 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
19083 }))
19084 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19085 SubVectorsMask);
19086 else
19087 Res = ShuffleBuilder.finalize(
19088 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
19089 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
19090 bool IsSplat = isSplat(NonConstants);
19091 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19092 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
19093 auto CheckIfSplatIsProfitable = [&]() {
19094 // Estimate the cost of splatting + shuffle and compare with
19095 // insert + shuffle.
19096 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19097 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19098 if (isa<ExtractElementInst>(V) || isVectorized(V))
19099 return false;
19100 InstructionCost SplatCost = TTI->getVectorInstrCost(
19101 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
19102 PoisonValue::get(VecTy), V);
19103 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19104 for (auto [Idx, I] : enumerate(BVMask))
19105 if (I != PoisonMaskElem)
19106 NewMask[Idx] = Mask.size();
19107 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19108 NewMask, CostKind);
19109 InstructionCost BVCost = TTI->getVectorInstrCost(
19110 Instruction::InsertElement, VecTy, CostKind,
19111 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
19112 Vec, V);
19113 // Shuffle required?
19114 if (count(BVMask, PoisonMaskElem) <
19115 static_cast<int>(BVMask.size() - 1)) {
19116 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19117 for (auto [Idx, I] : enumerate(BVMask))
19118 if (I != PoisonMaskElem)
19119 NewMask[Idx] = I;
19120 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19121 VecTy, NewMask, CostKind);
19122 }
19123 return SplatCost <= BVCost;
19124 };
19125 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19126 for (auto [Idx, I] : enumerate(BVMask))
19127 if (I != PoisonMaskElem)
19128 Mask[Idx] = I;
19129 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19130 } else {
19131 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19132 SmallVector<Value *> Values(NonConstants.size(),
19133 PoisonValue::get(ScalarTy));
19134 Values[0] = V;
19135 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
19136 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
19137 transform(BVMask, SplatMask.begin(), [](int I) {
19138 return I == PoisonMaskElem ? PoisonMaskElem : 0;
19139 });
19140 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
19141 BV = CreateShuffle(BV, nullptr, SplatMask);
19142 for (auto [Idx, I] : enumerate(BVMask))
19143 if (I != PoisonMaskElem)
19144 Mask[Idx] = BVMask.size() + Idx;
19145 Vec = CreateShuffle(Vec, BV, Mask);
19146 for (auto [Idx, I] : enumerate(Mask))
19147 if (I != PoisonMaskElem)
19148 Mask[Idx] = Idx;
19149 }
19150 });
19151 } else if (!allConstant(GatheredScalars)) {
19152 // Gather unique scalars and all constants.
19153 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
19154 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
19155 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19156 ShuffleBuilder.add(BV, ReuseMask);
19157 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19158 SubVectorsMask);
19159 } else {
19160 // Gather all constants.
19161 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19162 for (auto [I, V] : enumerate(GatheredScalars)) {
19163 if (!isa<PoisonValue>(V))
19164 Mask[I] = I;
19165 }
19166 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19167 ShuffleBuilder.add(BV, Mask);
19168 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19169 SubVectorsMask);
19170 }
19171
19172 if (NeedFreeze)
19173 Res = ShuffleBuilder.createFreeze(Res);
19174 return Res;
19175}
19176
19177Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19178 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19179 (void)vectorizeTree(VectorizableTree[EIdx].get());
19180 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19181 Builder, *this);
19182}
19183
19184/// \returns \p I after propagating metadata from \p VL only for instructions in
19185/// \p VL.
19186 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19187 SmallVector<Value *> Insts;
19188 for (Value *V : VL)
19189 if (isa<Instruction>(V))
19190 Insts.push_back(V);
19191 return llvm::propagateMetadata(Inst, Insts);
19192}
19193
19195 if (DebugLoc DL = PN.getDebugLoc())
19196 return DL;
19197 return DebugLoc::getUnknown();
19198}
19199
19200Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19201 IRBuilderBase::InsertPointGuard Guard(Builder);
19202
19203 Value *V = E->Scalars.front();
19204 Type *ScalarTy = V->getType();
19205 if (!isa<CmpInst>(V))
19206 ScalarTy = getValueType(V);
19207 auto It = MinBWs.find(E);
19208 if (It != MinBWs.end()) {
19209 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19210 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19211 if (VecTy)
19212 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19213 }
19214 if (E->VectorizedValue)
19215 return E->VectorizedValue;
19216 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19217 if (E->isGather()) {
19218 // Set insert point for non-reduction initial nodes.
19219 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19220 setInsertPointAfterBundle(E);
19221 Value *Vec = createBuildVector(E, ScalarTy);
19222 E->VectorizedValue = Vec;
19223 return Vec;
19224 }
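// Split nodes: vectorize the two sub-entries separately, then either widen
// the first half and insert the second half at its offset (no reordering),
// or resize both halves to a common width and blend them with the split mask.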
19225 if (E->State == TreeEntry::SplitVectorize) {
19226 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19227 "Expected exactly 2 combined entries.");
19228 setInsertPointAfterBundle(E);
19229 TreeEntry &OpTE1 =
19230 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19231 assert(OpTE1.isSame(
19232 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19233 "Expected same first part of scalars.");
19234 Value *Op1 = vectorizeTree(&OpTE1);
19235 TreeEntry &OpTE2 =
19236 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19237 assert(
19238 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19239 "Expected same second part of scalars.");
19240 Value *Op2 = vectorizeTree(&OpTE2);
19241 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19242 bool IsSigned = false;
19243 auto It = MinBWs.find(OpE);
19244 if (It != MinBWs.end())
19245 IsSigned = It->second.second;
19246 else
19247 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19248 if (isa<PoisonValue>(R))
19249 return false;
19250 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19251 });
19252 return IsSigned;
19253 };
19254 if (cast<VectorType>(Op1->getType())->getElementType() !=
19255 ScalarTy->getScalarType()) {
19256 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19257 Op1 = Builder.CreateIntCast(
19258 Op1,
19259 getWidenedType(
19260 ScalarTy,
19261 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19262 GetOperandSignedness(&OpTE1));
19263 }
19264 if (cast<VectorType>(Op2->getType())->getElementType() !=
19265 ScalarTy->getScalarType()) {
19266 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19267 Op2 = Builder.CreateIntCast(
19268 Op2,
19269 getWidenedType(
19270 ScalarTy,
19271 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19272 GetOperandSignedness(&OpTE2));
19273 }
19274 if (E->ReorderIndices.empty()) {
19275 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19276 std::iota(
19277 Mask.begin(),
19278 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19279 0);
19280 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19281 if (ScalarTyNumElements != 1) {
19282 assert(SLPReVec && "Only supported by REVEC.");
19283 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19284 }
19285 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19286 Vec = createInsertVector(Builder, Vec, Op2,
19287 E->CombinedEntriesWithIndices.back().second *
19288 ScalarTyNumElements);
19289 E->VectorizedValue = Vec;
19290 return Vec;
19291 }
19292 unsigned CommonVF =
19293 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19294 if (getNumElements(Op1->getType()) != CommonVF) {
19295 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19296 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19297 0);
19298 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19299 }
19300 if (getNumElements(Op2->getType()) != CommonVF) {
19301 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19302 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19303 0);
19304 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19305 }
19306 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19307 E->VectorizedValue = Vec;
19308 return Vec;
19309 }
19310
19311 bool IsReverseOrder =
19312 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
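// Applies the node's reordering / reuse shuffles (and any combined
// sub-vectors) to the freshly created vector value.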
19313 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19314 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19315 if (E->getOpcode() == Instruction::Store &&
19316 E->State == TreeEntry::Vectorize) {
19317 ArrayRef<int> Mask =
19318 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19319 E->ReorderIndices.size());
19320 ShuffleBuilder.add(V, Mask);
19321 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19322 E->State == TreeEntry::CompressVectorize) {
19323 ShuffleBuilder.addOrdered(V, {});
19324 } else {
19325 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19326 }
19327 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19328 E->CombinedEntriesWithIndices.size());
19329 transform(
19330 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19331 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19332 });
19333 assert(
19334 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19335 "Expected either combined subnodes or reordering");
19336 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19337 };
19338
19339 assert(!E->isGather() && "Unhandled state");
19340 unsigned ShuffleOrOp =
19341 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19342 Instruction *VL0 = E->getMainOp();
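// Returns true if operand Idx has to be treated as signed when extending:
// taken from MinBWs when recorded, otherwise inferred from whether any of
// the operand's scalars may be negative.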
19343 auto GetOperandSignedness = [&](unsigned Idx) {
19344 const TreeEntry *OpE = getOperandEntry(E, Idx);
19345 bool IsSigned = false;
19346 auto It = MinBWs.find(OpE);
19347 if (It != MinBWs.end())
19348 IsSigned = It->second.second;
19349 else
19350 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19351 if (isa<PoisonValue>(R))
19352 return false;
19353 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19354 });
19355 return IsSigned;
19356 };
19357 switch (ShuffleOrOp) {
19358 case Instruction::PHI: {
19359 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19360 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19361 "PHI reordering is free.");
19362 auto *PH = cast<PHINode>(VL0);
19363 Builder.SetInsertPoint(PH->getParent(),
19364 PH->getParent()->getFirstNonPHIIt());
19365 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19366 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19367 Value *V = NewPhi;
19368
19369 // Adjust the insertion point once all PHIs have been generated.
19370 Builder.SetInsertPoint(PH->getParent(),
19371 PH->getParent()->getFirstInsertionPt());
19372 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19373
19374 V = FinalShuffle(V, E);
19375
19376 E->VectorizedValue = V;
19377 // If the phi node is fully emitted, exit.
19378 if (NewPhi->getNumIncomingValues() != 0)
19379 return NewPhi;
19380
19381 // PHINodes may have multiple entries from the same block. We want to
19382 // visit every block once.
19383 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19384
19385 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19386 BasicBlock *IBB = PH->getIncomingBlock(I);
19387
19388 // Stop emission if all incoming values are generated.
19389 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19390 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19391 return NewPhi;
19392 }
19393
19394 if (!VisitedBBs.insert(IBB).second) {
19395 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19396 NewPhi->addIncoming(VecOp, IBB);
19397 TreeEntry *OpTE = getOperandEntry(E, I);
19398 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19399 OpTE->VectorizedValue = VecOp;
19400 continue;
19401 }
19402
19403 Builder.SetInsertPoint(IBB->getTerminator());
19404 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19405 Value *Vec = vectorizeOperand(E, I);
19406 if (VecTy != Vec->getType()) {
19407 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19408 MinBWs.contains(getOperandEntry(E, I))) &&
19409 "Expected item in MinBWs.");
19410 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19411 }
19412 NewPhi->addIncoming(Vec, IBB);
19413 }
19414
19415 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19416 "Invalid number of incoming values");
19417 assert(E->VectorizedValue && "Expected vectorized value.");
19418 return E->VectorizedValue;
19419 }
19420
19421 case Instruction::ExtractElement: {
19422 Value *V = E->getSingleOperand(0);
19423 setInsertPointAfterBundle(E);
19424 V = FinalShuffle(V, E);
19425 E->VectorizedValue = V;
19426 return V;
19427 }
19428 case Instruction::ExtractValue: {
19429 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19430 Builder.SetInsertPoint(LI);
19431 Value *Ptr = LI->getPointerOperand();
19432 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19433 Value *NewV = ::propagateMetadata(V, E->Scalars);
19434 NewV = FinalShuffle(NewV, E);
19435 E->VectorizedValue = NewV;
19436 return NewV;
19437 }
19438 case Instruction::InsertElement: {
19439 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19440 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19441 OpE && !OpE->isGather() && OpE->hasState() &&
19442 !OpE->hasCopyableElements())
19443 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19444 else
19445 setInsertPointAfterBundle(E);
19446 Value *V = vectorizeOperand(E, 1);
19447 ArrayRef<Value *> Op = E->getOperand(1);
19448 Type *ScalarTy = Op.front()->getType();
19449 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19450 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19451 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19452 assert(Res.first > 0 && "Expected item in MinBWs.");
19453 V = Builder.CreateIntCast(
19454 V,
19455 getWidenedType(
19456 ScalarTy,
19457 cast<FixedVectorType>(V->getType())->getNumElements()),
19458 Res.second);
19459 }
19460
19461 // Create InsertVector shuffle if necessary
19462 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19463 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19464 }));
19465 const unsigned NumElts =
19466 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19467 const unsigned NumScalars = E->Scalars.size();
19468
19469 unsigned Offset = *getElementIndex(VL0);
19470 assert(Offset < NumElts && "Failed to find vector index offset");
19471
19472 // Create shuffle to resize vector
19473 SmallVector<int> Mask;
19474 if (!E->ReorderIndices.empty()) {
19475 inversePermutation(E->ReorderIndices, Mask);
19476 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19477 } else {
19478 Mask.assign(NumElts, PoisonMaskElem);
19479 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19480 }
19481 // Create InsertVector shuffle if necessary
19482 bool IsIdentity = true;
19483 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19484 Mask.swap(PrevMask);
19485 for (unsigned I = 0; I < NumScalars; ++I) {
19486 Value *Scalar = E->Scalars[PrevMask[I]];
19487 unsigned InsertIdx = *getElementIndex(Scalar);
19488 IsIdentity &= InsertIdx - Offset == I;
19489 Mask[InsertIdx - Offset] = I;
19490 }
19491 if (!IsIdentity || NumElts != NumScalars) {
19492 Value *V2 = nullptr;
19493 bool IsVNonPoisonous =
19495 SmallVector<int> InsertMask(Mask);
19496 if (NumElts != NumScalars && Offset == 0) {
19497 // Follow all insert element instructions from the current buildvector
19498 // sequence.
19499 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19500 do {
19501 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19502 if (!InsertIdx)
19503 break;
19504 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19505 InsertMask[*InsertIdx] = *InsertIdx;
19506 if (!Ins->hasOneUse())
19507 break;
19508 Ins = dyn_cast_or_null<InsertElementInst>(
19509 Ins->getUniqueUndroppableUser());
19510 } while (Ins);
19511 SmallBitVector UseMask =
19512 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19513 SmallBitVector IsFirstPoison =
19514 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19515 SmallBitVector IsFirstUndef =
19516 isUndefVector(FirstInsert->getOperand(0), UseMask);
19517 if (!IsFirstPoison.all()) {
19518 unsigned Idx = 0;
19519 for (unsigned I = 0; I < NumElts; I++) {
19520 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19521 IsFirstUndef.test(I)) {
19522 if (IsVNonPoisonous) {
19523 InsertMask[I] = I < NumScalars ? I : 0;
19524 continue;
19525 }
19526 if (!V2)
19527 V2 = UndefValue::get(V->getType());
19528 if (Idx >= NumScalars)
19529 Idx = NumScalars - 1;
19530 InsertMask[I] = NumScalars + Idx;
19531 ++Idx;
19532 } else if (InsertMask[I] != PoisonMaskElem &&
19533 Mask[I] == PoisonMaskElem) {
19534 InsertMask[I] = PoisonMaskElem;
19535 }
19536 }
19537 } else {
19538 InsertMask = Mask;
19539 }
19540 }
19541 if (!V2)
19542 V2 = PoisonValue::get(V->getType());
19543 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19544 if (auto *I = dyn_cast<Instruction>(V)) {
19545 GatherShuffleExtractSeq.insert(I);
19546 CSEBlocks.insert(I->getParent());
19547 }
19548 }
19549
19550 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19551 for (unsigned I = 0; I < NumElts; I++) {
19552 if (Mask[I] != PoisonMaskElem)
19553 InsertMask[Offset + I] = I;
19554 }
19555 SmallBitVector UseMask =
19556 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19557 SmallBitVector IsFirstUndef =
19558 isUndefVector(FirstInsert->getOperand(0), UseMask);
19559 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19560 NumElts != NumScalars) {
19561 if (IsFirstUndef.all()) {
19562 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19563 SmallBitVector IsFirstPoison =
19564 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19565 if (!IsFirstPoison.all()) {
19566 for (unsigned I = 0; I < NumElts; I++) {
19567 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19568 InsertMask[I] = I + NumElts;
19569 }
19570 }
19571 V = Builder.CreateShuffleVector(
19572 V,
19573 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19574 : FirstInsert->getOperand(0),
19575 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19576 if (auto *I = dyn_cast<Instruction>(V)) {
19577 GatherShuffleExtractSeq.insert(I);
19578 CSEBlocks.insert(I->getParent());
19579 }
19580 }
19581 } else {
19582 SmallBitVector IsFirstPoison =
19583 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19584 for (unsigned I = 0; I < NumElts; I++) {
19585 if (InsertMask[I] == PoisonMaskElem)
19586 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19587 else
19588 InsertMask[I] += NumElts;
19589 }
19590 V = Builder.CreateShuffleVector(
19591 FirstInsert->getOperand(0), V, InsertMask,
19592 cast<Instruction>(E->Scalars.back())->getName());
19593 if (auto *I = dyn_cast<Instruction>(V)) {
19594 GatherShuffleExtractSeq.insert(I);
19595 CSEBlocks.insert(I->getParent());
19596 }
19597 }
19598 }
19599
19600 ++NumVectorInstructions;
19601 E->VectorizedValue = V;
19602 return V;
19603 }
19604 case Instruction::ZExt:
19605 case Instruction::SExt:
19606 case Instruction::FPToUI:
19607 case Instruction::FPToSI:
19608 case Instruction::FPExt:
19609 case Instruction::PtrToInt:
19610 case Instruction::IntToPtr:
19611 case Instruction::SIToFP:
19612 case Instruction::UIToFP:
19613 case Instruction::Trunc:
19614 case Instruction::FPTrunc:
19615 case Instruction::BitCast: {
19616 setInsertPointAfterBundle(E);
19617
19618 Value *InVec = vectorizeOperand(E, 0);
19619
19620 auto *CI = cast<CastInst>(VL0);
19621 Instruction::CastOps VecOpcode = CI->getOpcode();
19622 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19623 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19624 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19625 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19626 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19627 // Check if the values are candidates to demote.
19628 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19629 if (SrcIt != MinBWs.end())
19630 SrcBWSz = SrcIt->second.first;
19631 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19632 if (BWSz == SrcBWSz) {
19633 VecOpcode = Instruction::BitCast;
19634 } else if (BWSz < SrcBWSz) {
19635 VecOpcode = Instruction::Trunc;
19636 } else if (It != MinBWs.end()) {
19637 assert(BWSz > SrcBWSz && "Invalid cast!");
19638 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19639 } else if (SrcIt != MinBWs.end()) {
19640 assert(BWSz > SrcBWSz && "Invalid cast!");
19641 VecOpcode =
19642 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19643 }
19644 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19645 !SrcIt->second.second) {
19646 VecOpcode = Instruction::UIToFP;
19647 }
19648 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19649 ? InVec
19650 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19651 V = FinalShuffle(V, E);
19652
19653 E->VectorizedValue = V;
19654 ++NumVectorInstructions;
19655 return V;
19656 }
19657 case Instruction::FCmp:
19658 case Instruction::ICmp: {
19659 setInsertPointAfterBundle(E);
19660
19661 Value *L = vectorizeOperand(E, 0);
19662 Value *R = vectorizeOperand(E, 1);
19663 if (L->getType() != R->getType()) {
19664 assert((getOperandEntry(E, 0)->isGather() ||
19665 getOperandEntry(E, 1)->isGather() ||
19666 MinBWs.contains(getOperandEntry(E, 0)) ||
19667 MinBWs.contains(getOperandEntry(E, 1))) &&
19668 "Expected item in MinBWs.");
19669 if (cast<VectorType>(L->getType())
19670 ->getElementType()
19671 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19672 ->getElementType()
19673 ->getIntegerBitWidth()) {
19674 Type *CastTy = R->getType();
19675 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19676 } else {
19677 Type *CastTy = L->getType();
19678 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19679 }
19680 }
19681
19682 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19683 Value *V = Builder.CreateCmp(P0, L, R);
19684 propagateIRFlags(V, E->Scalars, VL0);
19685 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19686 ICmp->setSameSign(/*B=*/false);
19687 // Do not cast for cmps.
19688 VecTy = cast<FixedVectorType>(V->getType());
19689 V = FinalShuffle(V, E);
19690
19691 E->VectorizedValue = V;
19692 ++NumVectorInstructions;
19693 return V;
19694 }
19695 case Instruction::Select: {
19696 setInsertPointAfterBundle(E);
19697
19698 Value *Cond = vectorizeOperand(E, 0);
19699 Value *True = vectorizeOperand(E, 1);
19700 Value *False = vectorizeOperand(E, 2);
19701 if (True->getType() != VecTy || False->getType() != VecTy) {
19702 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19703 getOperandEntry(E, 2)->isGather() ||
19704 MinBWs.contains(getOperandEntry(E, 1)) ||
19705 MinBWs.contains(getOperandEntry(E, 2))) &&
19706 "Expected item in MinBWs.");
19707 if (True->getType() != VecTy)
19708 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19709 if (False->getType() != VecTy)
19710 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19711 }
19712
19713 unsigned CondNumElements = getNumElements(Cond->getType());
19714 unsigned TrueNumElements = getNumElements(True->getType());
19715 assert(TrueNumElements >= CondNumElements &&
19716 TrueNumElements % CondNumElements == 0 &&
19717 "Cannot vectorize Instruction::Select");
19718 assert(TrueNumElements == getNumElements(False->getType()) &&
19719 "Cannot vectorize Instruction::Select");
19720 if (CondNumElements != TrueNumElements) {
19721 // When the return type is i1 but the source is fixed vector type, we
19722 // need to duplicate the condition value.
19723 Cond = Builder.CreateShuffleVector(
19724 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19725 CondNumElements));
19726 }
19727 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19728 "Cannot vectorize Instruction::Select");
19729 Value *V =
19730 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
19731 V = FinalShuffle(V, E);
19732
19733 E->VectorizedValue = V;
19734 ++NumVectorInstructions;
19735 return V;
19736 }
19737 case Instruction::FNeg: {
19738 setInsertPointAfterBundle(E);
19739
19740 Value *Op = vectorizeOperand(E, 0);
19741
19742 Value *V = Builder.CreateUnOp(
19743 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19744 propagateIRFlags(V, E->Scalars, VL0);
19745 if (auto *I = dyn_cast<Instruction>(V))
19746 V = ::propagateMetadata(I, E->Scalars);
19747
19748 V = FinalShuffle(V, E);
19749
19750 E->VectorizedValue = V;
19751 ++NumVectorInstructions;
19752
19753 return V;
19754 }
19755 case Instruction::Freeze: {
19756 setInsertPointAfterBundle(E);
19757
19758 Value *Op = vectorizeOperand(E, 0);
19759
19760 if (Op->getType() != VecTy) {
19761 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19762 MinBWs.contains(getOperandEntry(E, 0))) &&
19763 "Expected item in MinBWs.");
19764 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19765 }
19766 Value *V = Builder.CreateFreeze(Op);
19767 V = FinalShuffle(V, E);
19768
19769 E->VectorizedValue = V;
19770 ++NumVectorInstructions;
19771
19772 return V;
19773 }
19774 case Instruction::Add:
19775 case Instruction::FAdd:
19776 case Instruction::Sub:
19777 case Instruction::FSub:
19778 case Instruction::Mul:
19779 case Instruction::FMul:
19780 case Instruction::UDiv:
19781 case Instruction::SDiv:
19782 case Instruction::FDiv:
19783 case Instruction::URem:
19784 case Instruction::SRem:
19785 case Instruction::FRem:
19786 case Instruction::Shl:
19787 case Instruction::LShr:
19788 case Instruction::AShr:
19789 case Instruction::And:
19790 case Instruction::Or:
19791 case Instruction::Xor: {
19792 setInsertPointAfterBundle(E);
19793
19794 Value *LHS = vectorizeOperand(E, 0);
19795 Value *RHS = vectorizeOperand(E, 1);
19796 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19797 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19798 ArrayRef<Value *> Ops = E->getOperand(I);
19799 if (all_of(Ops, [&](Value *Op) {
19800 auto *CI = dyn_cast<ConstantInt>(Op);
19801 return CI && CI->getValue().countr_one() >= It->second.first;
19802 })) {
19803 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19804 E->VectorizedValue = V;
19805 ++NumVectorInstructions;
19806 return V;
19807 }
19808 }
19809 }
19810 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19811 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19812 getOperandEntry(E, 1)->isGather() ||
19813 MinBWs.contains(getOperandEntry(E, 0)) ||
19814 MinBWs.contains(getOperandEntry(E, 1))) &&
19815 "Expected item in MinBWs.");
19816 if (LHS->getType() != VecTy)
19817 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19818 if (RHS->getType() != VecTy)
19819 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19820 }
19821
19822 Value *V = Builder.CreateBinOp(
19823 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19824 RHS);
19825 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19826 if (auto *I = dyn_cast<Instruction>(V)) {
19827 V = ::propagateMetadata(I, E->Scalars);
19828 // Drop nuw flags for abs(sub(commutative), true).
19829 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19830 any_of(E->Scalars, [E](Value *V) {
19831 return isa<PoisonValue>(V) ||
19832 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
19833 isCommutative(cast<Instruction>(V));
19834 }))
19835 I->setHasNoUnsignedWrap(/*b=*/false);
19836 }
19837
19838 V = FinalShuffle(V, E);
19839
19840 E->VectorizedValue = V;
19841 ++NumVectorInstructions;
19842
19843 return V;
19844 }
19845 case Instruction::Load: {
19846 // Loads are inserted at the head of the tree because we don't want to
19847 // sink them all the way down past store instructions.
19848 setInsertPointAfterBundle(E);
19849
19850 LoadInst *LI = cast<LoadInst>(VL0);
19851 Instruction *NewLI;
19852 FixedVectorType *StridedLoadTy = nullptr;
19853 Value *PO = LI->getPointerOperand();
19854 if (E->State == TreeEntry::Vectorize) {
19855 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19856 } else if (E->State == TreeEntry::CompressVectorize) {
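// Compressed loads: emit one wide (possibly masked) contiguous load and
// pick out the required lanes with the precomputed compress mask.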
19857 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19858 CompressEntryToData.at(E);
19859 Align CommonAlignment = LI->getAlign();
19860 if (IsMasked) {
19861 unsigned VF = getNumElements(LoadVecTy);
19862 SmallVector<Constant *> MaskValues(
19863 VF / getNumElements(LI->getType()),
19864 ConstantInt::getFalse(VecTy->getContext()));
19865 for (int I : CompressMask)
19866 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19867 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19868 assert(SLPReVec && "Only supported by REVEC.");
19869 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19870 }
19871 Constant *MaskValue = ConstantVector::get(MaskValues);
19872 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19873 MaskValue);
19874 } else {
19875 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19876 }
19877 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19878 // TODO: include this cost into CommonCost.
19879 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19880 assert(SLPReVec && "FixedVectorType is not expected.");
19881 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19882 CompressMask);
19883 }
19884 NewLI =
19885 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19886 } else if (E->State == TreeEntry::StridedVectorize) {
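// Strided loads: emit llvm.experimental.vp.strided.load with a byte stride
// derived from the element size (negated for reversed order), using the
// first (or last) scalar's pointer as the base.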
19887 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19888 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19889 PO = IsReverseOrder ? PtrN : Ptr0;
19890 Type *StrideTy = DL->getIndexType(PO->getType());
19891 Value *StrideVal;
19892 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19893 StridedLoadTy = SPtrInfo.Ty;
19894 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19895 unsigned StridedLoadEC =
19896 StridedLoadTy->getElementCount().getKnownMinValue();
19897
19898 Value *Stride = SPtrInfo.StrideVal;
19899 if (!Stride) {
19900 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19901 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19902 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19903 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19904 &*Builder.GetInsertPoint());
19905 }
19906 Value *NewStride =
19907 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19908 StrideVal = Builder.CreateMul(
19909 NewStride, ConstantInt::get(
19910 StrideTy, (IsReverseOrder ? -1 : 1) *
19911 static_cast<int>(
19912 DL->getTypeAllocSize(ScalarTy))));
19913 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19914 auto *Inst = Builder.CreateIntrinsic(
19915 Intrinsic::experimental_vp_strided_load,
19916 {StridedLoadTy, PO->getType(), StrideTy},
19917 {PO, StrideVal,
19918 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19919 Builder.getInt32(StridedLoadEC)});
19920 Inst->addParamAttr(
19921 /*ArgNo=*/0,
19922 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19923 NewLI = Inst;
19924 } else {
19925 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19926 Value *VecPtr = vectorizeOperand(E, 0);
19927 if (isa<FixedVectorType>(ScalarTy)) {
19928 assert(SLPReVec && "FixedVectorType is not expected.");
19929 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We need
19930 // to expand VecPtr if ScalarTy is a vector type.
19931 unsigned ScalarTyNumElements =
19932 cast<FixedVectorType>(ScalarTy)->getNumElements();
19933 unsigned VecTyNumElements =
19934 cast<FixedVectorType>(VecTy)->getNumElements();
19935 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19936 "Cannot expand getelementptr.");
19937 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19938 SmallVector<Constant *> Indices(VecTyNumElements);
19939 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19940 return Builder.getInt64(I % ScalarTyNumElements);
19941 });
19942 VecPtr = Builder.CreateGEP(
19943 VecTy->getElementType(),
19944 Builder.CreateShuffleVector(
19945 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19946 ConstantVector::get(Indices));
19947 }
19948 // Use the minimum alignment of the gathered loads.
19949 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19950 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19951 }
19952 Value *V = E->State == TreeEntry::CompressVectorize
19953 ? NewLI
19954 : ::propagateMetadata(NewLI, E->Scalars);
19955
19956 if (StridedLoadTy != VecTy)
19957 V = Builder.CreateBitOrPointerCast(V, VecTy);
19958 V = FinalShuffle(V, E);
19959 E->VectorizedValue = V;
19960 ++NumVectorInstructions;
19961 return V;
19962 }
19963 case Instruction::Store: {
19964 auto *SI = cast<StoreInst>(VL0);
19965
19966 setInsertPointAfterBundle(E);
19967
19968 Value *VecValue = vectorizeOperand(E, 0);
19969 if (VecValue->getType() != VecTy)
19970 VecValue =
19971 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19972 VecValue = FinalShuffle(VecValue, E);
19973
19974 Value *Ptr = SI->getPointerOperand();
19975 Instruction *ST;
19976 if (E->State == TreeEntry::Vectorize) {
19977 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19978 } else {
19979 assert(E->State == TreeEntry::StridedVectorize &&
19980 "Expected either strided or consecutive stores.");
19981 if (!E->ReorderIndices.empty()) {
19982 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19983 Ptr = SI->getPointerOperand();
19984 }
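// Strided stores: emit llvm.experimental.vp.strided.store with a negative
// element-sized stride, starting from the pointer of the (reordered) first
// scalar.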
19985 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19986 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19987 auto *Inst = Builder.CreateIntrinsic(
19988 Intrinsic::experimental_vp_strided_store,
19989 {VecTy, Ptr->getType(), StrideTy},
19990 {VecValue, Ptr,
19991 ConstantInt::get(
19992 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19993 Builder.getAllOnesMask(VecTy->getElementCount()),
19994 Builder.getInt32(E->Scalars.size())});
19995 Inst->addParamAttr(
19996 /*ArgNo=*/1,
19997 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19998 ST = Inst;
19999 }
20000
20001 Value *V = ::propagateMetadata(ST, E->Scalars);
20002
20003 E->VectorizedValue = V;
20004 ++NumVectorInstructions;
20005 return V;
20006 }
20007 case Instruction::GetElementPtr: {
20008 auto *GEP0 = cast<GetElementPtrInst>(VL0);
20009 setInsertPointAfterBundle(E);
20010
20011 Value *Op0 = vectorizeOperand(E, 0);
20012
20013 SmallVector<Value *> OpVecs;
20014 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20015 Value *OpVec = vectorizeOperand(E, J);
20016 OpVecs.push_back(OpVec);
20017 }
20018
20019 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20020 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
20021 SmallVector<Value *> GEPs;
20022 for (Value *V : E->Scalars) {
20023 if (isa<GetElementPtrInst>(V))
20024 GEPs.push_back(V);
20025 }
20026 V = ::propagateMetadata(I, GEPs);
20027 }
20028
20029 V = FinalShuffle(V, E);
20030
20031 E->VectorizedValue = V;
20032 ++NumVectorInstructions;
20033
20034 return V;
20035 }
20036 case Instruction::Call: {
20037 CallInst *CI = cast<CallInst>(VL0);
20038 setInsertPointAfterBundle(E);
20039
20040 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20041
20042 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20043 CI, ID, VecTy->getNumElements(),
20044 It != MinBWs.end() ? It->second.first : 0, TTI);
20045 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
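// Prefer the intrinsic form when the call maps to a vector intrinsic and
// its cost does not exceed the cost of the vector library call.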
20046 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20047 VecCallCosts.first <= VecCallCosts.second;
20048
20049 Value *ScalarArg = nullptr;
20050 SmallVector<Value *> OpVecs;
20051 SmallVector<Type *, 2> TysForDecl;
20052 // Add return type if intrinsic is overloaded on it.
20053 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
20054 TysForDecl.push_back(VecTy);
20055 auto *CEI = cast<CallInst>(VL0);
20056 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
20057 // Some intrinsics have scalar arguments. This argument should not be
20058 // vectorized.
20059 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
20060 ScalarArg = CEI->getArgOperand(I);
20061 // If we decided to reduce the bitwidth of the abs intrinsic, its second
20062 // argument must be set to false (do not return poison if the value is signed min).
20063 if (ID == Intrinsic::abs && It != MinBWs.end() &&
20064 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
20065 ScalarArg = Builder.getFalse();
20066 OpVecs.push_back(ScalarArg);
20067 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20068 TysForDecl.push_back(ScalarArg->getType());
20069 continue;
20070 }
20071
20072 Value *OpVec = vectorizeOperand(E, I);
20073 ScalarArg = CEI->getArgOperand(I);
20074 if (cast<VectorType>(OpVec->getType())->getElementType() !=
20075 ScalarArg->getType()->getScalarType() &&
20076 It == MinBWs.end()) {
20077 auto *CastTy =
20078 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
20079 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
20080 } else if (It != MinBWs.end()) {
20081 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
20082 }
20083 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
20084 OpVecs.push_back(OpVec);
20085 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20086 TysForDecl.push_back(OpVec->getType());
20087 }
20088
20089 Function *CF;
20090 if (!UseIntrinsic) {
20091 VFShape Shape =
20092 VFShape::get(CI->getFunctionType(),
20093 ElementCount::getFixed(VecTy->getNumElements()),
20094 false /*HasGlobalPred*/);
20095 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
20096 } else {
20097 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
20098 }
20099
20100 SmallVector<OperandBundleDef, 1> OpBundles;
20101 CI->getOperandBundlesAsDefs(OpBundles);
20102 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
20103
20104 propagateIRFlags(V, E->Scalars, VL0);
20105 V = FinalShuffle(V, E);
20106
20107 E->VectorizedValue = V;
20108 ++NumVectorInstructions;
20109 return V;
20110 }
20111 case Instruction::ShuffleVector: {
20112 Value *V;
20113 if (SLPReVec && !E->isAltShuffle()) {
20114 setInsertPointAfterBundle(E);
20115 Value *Src = vectorizeOperand(E, 0);
20116 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
20117 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
20118 SmallVector<int> NewMask(ThisMask.size());
20119 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
20120 return SVSrc->getShuffleMask()[Mask];
20121 });
20122 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
20123 SVSrc->getOperand(1), NewMask);
20124 } else {
20125 V = Builder.CreateShuffleVector(Src, ThisMask);
20126 }
20127 propagateIRFlags(V, E->Scalars, VL0);
20128 if (auto *I = dyn_cast<Instruction>(V))
20129 V = ::propagateMetadata(I, E->Scalars);
20130 V = FinalShuffle(V, E);
20131 } else {
20132 assert(E->isAltShuffle() &&
20133 ((Instruction::isBinaryOp(E->getOpcode()) &&
20134 Instruction::isBinaryOp(E->getAltOpcode())) ||
20135 (Instruction::isCast(E->getOpcode()) &&
20136 Instruction::isCast(E->getAltOpcode())) ||
20137 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
20138 "Invalid Shuffle Vector Operand");
20139
20140 Value *LHS = nullptr, *RHS = nullptr;
20141 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
20142 setInsertPointAfterBundle(E);
20143 LHS = vectorizeOperand(E, 0);
20144 RHS = vectorizeOperand(E, 1);
20145 } else {
20146 setInsertPointAfterBundle(E);
20147 LHS = vectorizeOperand(E, 0);
20148 }
20149 if (LHS && RHS &&
20150 ((Instruction::isBinaryOp(E->getOpcode()) &&
20151 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
20152 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
20153 assert((It != MinBWs.end() ||
20154 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
20155 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
20156 MinBWs.contains(getOperandEntry(E, 0)) ||
20157 MinBWs.contains(getOperandEntry(E, 1))) &&
20158 "Expected item in MinBWs.");
20159 Type *CastTy = VecTy;
20160 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20161 if (cast<VectorType>(LHS->getType())
20162 ->getElementType()
20163 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20164 ->getElementType()
20165 ->getIntegerBitWidth())
20166 CastTy = RHS->getType();
20167 else
20168 CastTy = LHS->getType();
20169 }
20170 if (LHS->getType() != CastTy)
20171 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20172 if (RHS->getType() != CastTy)
20173 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20174 }
20175
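// Emit both the main and the alternate operation over the whole vector;
// the blend shuffle built below selects the proper lanes from V0 and V1.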
20176 Value *V0, *V1;
20177 if (Instruction::isBinaryOp(E->getOpcode())) {
20178 V0 = Builder.CreateBinOp(
20179 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20180 V1 = Builder.CreateBinOp(
20181 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20182 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20183 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20184 auto *AltCI = cast<CmpInst>(E->getAltOp());
20185 CmpInst::Predicate AltPred = AltCI->getPredicate();
20186 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20187 } else {
20188 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20189 unsigned SrcBWSz = DL->getTypeSizeInBits(
20190 cast<VectorType>(LHS->getType())->getElementType());
20191 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20192 if (BWSz <= SrcBWSz) {
20193 if (BWSz < SrcBWSz)
20194 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20195 assert(LHS->getType() == VecTy &&
20196 "Expected same type as operand.");
20197 if (auto *I = dyn_cast<Instruction>(LHS))
20198 LHS = ::propagateMetadata(I, E->Scalars);
20199 LHS = FinalShuffle(LHS, E);
20200 E->VectorizedValue = LHS;
20201 ++NumVectorInstructions;
20202 return LHS;
20203 }
20204 }
20205 V0 = Builder.CreateCast(
20206 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20207 V1 = Builder.CreateCast(
20208 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20209 }
20210 // Add V0 and V1 to later analysis to try to find and remove matching
20211 // instruction, if any.
20212 for (Value *V : {V0, V1}) {
20213 if (auto *I = dyn_cast<Instruction>(V)) {
20214 GatherShuffleExtractSeq.insert(I);
20215 CSEBlocks.insert(I->getParent());
20216 }
20217 }
20218
20219 // Create shuffle to take alternate operations from the vector.
20220 // Also, gather up main and alt scalar ops to propagate IR flags to
20221 // each vector operation.
20222 ValueList OpScalars, AltScalars;
20223 SmallVector<int> Mask;
20224 E->buildAltOpShuffleMask(
20225 [E, this](Instruction *I) {
20226 assert(E->getMatchingMainOpOrAltOp(I) &&
20227 "Unexpected main/alternate opcode");
20228 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20229 *TLI);
20230 },
20231 Mask, &OpScalars, &AltScalars);
20232
20233 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20234 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20235 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20236 // Drop nuw flags for abs(sub(commutative), true).
20237 if (auto *I = dyn_cast<Instruction>(Vec);
20238 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20239 any_of(E->Scalars, [E](Value *V) {
20240 if (isa<PoisonValue>(V))
20241 return false;
20242 if (E->hasCopyableElements() && E->isCopyableElement(V))
20243 return false;
20244 auto *IV = cast<Instruction>(V);
20245 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20246 }))
20247 I->setHasNoUnsignedWrap(/*b=*/false);
20248 };
20249 DropNuwFlag(V0, E->getOpcode());
20250 DropNuwFlag(V1, E->getAltOpcode());
20251
20252 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20253 assert(SLPReVec && "FixedVectorType is not expected.");
20254 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20255 }
20256 V = Builder.CreateShuffleVector(V0, V1, Mask);
20257 if (auto *I = dyn_cast<Instruction>(V)) {
20258 V = ::propagateMetadata(I, E->Scalars);
20259 GatherShuffleExtractSeq.insert(I);
20260 CSEBlocks.insert(I->getParent());
20261 }
20262 }
20263
20264 E->VectorizedValue = V;
20265 ++NumVectorInstructions;
20266
20267 return V;
20268 }
20269 default:
20270 llvm_unreachable("unknown inst");
20271 }
20272 return nullptr;
20273}
20274
20275 Value *BoUpSLP::vectorizeTree() {
20276 ExtraValueToDebugLocsMap ExternallyUsedValues;
20277 return vectorizeTree(ExternallyUsedValues);
20278}
20279
20280 Value *BoUpSLP::vectorizeTree(
20281 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20282 Instruction *ReductionRoot,
20283 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20284 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
20285 // need to rebuild it.
20286 EntryToLastInstruction.clear();
20287 // All blocks must be scheduled before any instructions are inserted.
20288 for (auto &BSIter : BlocksSchedules)
20289 scheduleBlock(*this, BSIter.second.get());
20290 // Cache last instructions for the nodes to avoid side effects, which may
20291 // appear during vectorization, like extra uses, etc.
20292 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20293 if (TE->isGather())
20294 continue;
20295 (void)getLastInstructionInBundle(TE.get());
20296 }
20297
20298 if (ReductionRoot)
20299 Builder.SetInsertPoint(ReductionRoot->getParent(),
20300 ReductionRoot->getIterator());
20301 else
20302 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20303
20304 // Vectorize gather operands of the nodes with the external uses only.
20305 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20306 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20307 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20308 TE->UserTreeIndex.UserTE->hasState() &&
20309 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20310 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20311 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20312 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20313 all_of(TE->UserTreeIndex.UserTE->Scalars,
20314 [](Value *V) { return isUsedOutsideBlock(V); })) {
20315 Instruction &LastInst =
20316 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20317 GatherEntries.emplace_back(TE.get(), &LastInst);
20318 }
20319 }
20320 for (auto &Entry : GatherEntries) {
20321 IRBuilderBase::InsertPointGuard Guard(Builder);
20322 Builder.SetInsertPoint(Entry.second);
20323 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20324 (void)vectorizeTree(Entry.first);
20325 }
20326 // Emit gathered loads first to emit better code for the users of those
20327 // gathered loads.
20328 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20329 if (GatheredLoadsEntriesFirst.has_value() &&
20330 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20331 (!TE->isGather() || TE->UserTreeIndex)) {
20332 assert((TE->UserTreeIndex ||
20333 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20334 "Expected gathered load node.");
20335 (void)vectorizeTree(TE.get());
20336 }
20337 }
20338 (void)vectorizeTree(VectorizableTree[0].get());
20339 // Run through the list of postponed gathers and emit them, replacing the temp
20340 // emitted allocas with actual vector instructions.
20341 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20342 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20343 for (const TreeEntry *E : PostponedNodes) {
20344 auto *TE = const_cast<TreeEntry *>(E);
20345 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20346 TE->VectorizedValue = nullptr;
20347 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20348 // If the user is a PHI node, its vector code has to be inserted right
20349 // before the block terminator. Since the node was delayed, there were some
20350 // unresolved dependencies at the moment the stub instruction was emitted.
20351 // If any of these dependencies turns out to be an operand of another PHI
20352 // coming from this same block, the position of the stub instruction becomes
20353 // invalid, because the source vector that is supposed to feed this gather
20354 // node was inserted at the end of the block [after the stub instruction].
20355 // So we need to adjust the insertion point to the end of the block again.
20356 if (isa<PHINode>(UserI) ||
20357 (TE->UserTreeIndex.UserTE->hasState() &&
20358 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20359 // Insert before all users.
20360 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20361 for (User *U : PrevVec->users()) {
20362 if (U == UserI)
20363 continue;
20364 auto *UI = dyn_cast<Instruction>(U);
20365 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20366 continue;
20367 if (UI->comesBefore(InsertPt))
20368 InsertPt = UI;
20369 }
20370 Builder.SetInsertPoint(InsertPt);
20371 } else {
20372 Builder.SetInsertPoint(PrevVec);
20373 }
20374 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20375 Value *Vec = vectorizeTree(TE);
20376 if (auto *VecI = dyn_cast<Instruction>(Vec);
20377 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20378 Builder.GetInsertPoint()->comesBefore(VecI))
20379 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20380 Builder.GetInsertPoint());
20381 if (Vec->getType() != PrevVec->getType()) {
20382 assert(Vec->getType()->isIntOrIntVectorTy() &&
20383 PrevVec->getType()->isIntOrIntVectorTy() &&
20384 "Expected integer vector types only.");
20385 std::optional<bool> IsSigned;
20386 for (Value *V : TE->Scalars) {
20387 if (isVectorized(V)) {
20388 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20389 auto It = MinBWs.find(MNTE);
20390 if (It != MinBWs.end()) {
20391 IsSigned = IsSigned.value_or(false) || It->second.second;
20392 if (*IsSigned)
20393 break;
20394 }
20395 }
20396 if (IsSigned.value_or(false))
20397 break;
20398 // Scan through gather nodes.
20399 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20400 auto It = MinBWs.find(BVE);
20401 if (It != MinBWs.end()) {
20402 IsSigned = IsSigned.value_or(false) || It->second.second;
20403 if (*IsSigned)
20404 break;
20405 }
20406 }
20407 if (IsSigned.value_or(false))
20408 break;
20409 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20410 IsSigned =
20411 IsSigned.value_or(false) ||
20412 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20413 continue;
20414 }
20415 if (IsSigned.value_or(false))
20416 break;
20417 }
20418 }
20419 if (IsSigned.value_or(false)) {
20420 // Final attempt - check user node.
20421 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20422 if (It != MinBWs.end())
20423 IsSigned = It->second.second;
20424 }
20425 assert(IsSigned &&
20426 "Expected user node or perfect diamond match in MinBWs.");
20427 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20428 }
20429 PrevVec->replaceAllUsesWith(Vec);
20430 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20431 // Replace the stub vector node if it was already used for one of the
20432 // buildvector nodes.
20433 auto It = PostponedValues.find(PrevVec);
20434 if (It != PostponedValues.end()) {
20435 for (TreeEntry *VTE : It->getSecond())
20436 VTE->VectorizedValue = Vec;
20437 }
20438 eraseInstruction(PrevVec);
20439 }
20440
20441 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20442 << " values .\n");
20443
20444 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20445 // Maps vector instruction to original insertelement instruction
20446 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20447 // Maps extract Scalar to the corresponding extractelement instruction in the
20448 // basic block. Only one extractelement per block should be emitted.
20449 DenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20450 ScalarToEEs;
20451 SmallDenseSet<Value *, 4> UsedInserts;
20452 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20453 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20454 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20455 // Extract all of the elements with the external uses.
20456 for (const auto &ExternalUse : ExternalUses) {
20457 Value *Scalar = ExternalUse.Scalar;
20458 llvm::User *User = ExternalUse.User;
20459
20460 // Skip users that we already RAUW. This happens when one instruction
20461 // has multiple uses of the same value.
20462 if (User && !is_contained(Scalar->users(), User))
20463 continue;
20464 const TreeEntry *E = &ExternalUse.E;
20465 assert(E && "Invalid scalar");
20466 assert(!E->isGather() && "Extracting from a gather list");
20467 // Non-instruction pointers are not deleted, just skip them.
20468 if (E->getOpcode() == Instruction::GetElementPtr &&
20469 !isa<GetElementPtrInst>(Scalar))
20470 continue;
20471
20472 Value *Vec = E->VectorizedValue;
20473 assert(Vec && "Can't find vectorizable value");
20474
20475 Value *Lane = Builder.getInt32(ExternalUse.Lane);
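// Extracts the lane used by this external user from Vec, reusing an already
// emitted extract in the same block where possible, and sign-/zero-extends
// the result back to the original scalar type if the value was demoted.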
20476 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20477 if (Scalar->getType() != Vec->getType()) {
20478 Value *Ex = nullptr;
20479 Value *ExV = nullptr;
20480 auto *Inst = dyn_cast<Instruction>(Scalar);
20481 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20482 auto It = ScalarToEEs.find(Scalar);
20483 if (It != ScalarToEEs.end()) {
20484 // No need to emit many extracts, just move the only one in the
20485 // current block.
20486 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20487 : Builder.GetInsertBlock());
20488 if (EEIt != It->second.end()) {
20489 Value *PrevV = EEIt->second.first;
20490 if (auto *I = dyn_cast<Instruction>(PrevV);
20491 I && !ReplaceInst &&
20492 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20493 Builder.GetInsertPoint()->comesBefore(I)) {
20494 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20495 Builder.GetInsertPoint());
20496 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20497 CI->moveAfter(I);
20498 }
20499 Ex = PrevV;
20500 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20501 }
20502 }
20503 if (!Ex) {
20504 // "Reuse" the existing extract to improve final codegen.
20505 if (ReplaceInst) {
20506 // Leave the instruction as is if keeping the existing extract is cheaper
20507 // and all of its operands are scalar.
20508 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20509 IgnoredExtracts.insert(EE);
20510 Ex = EE;
20511 } else {
20512 auto *CloneInst = Inst->clone();
20513 CloneInst->insertBefore(Inst->getIterator());
20514 if (Inst->hasName())
20515 CloneInst->takeName(Inst);
20516 Ex = CloneInst;
20517 }
20518 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20519 ES && isa<Instruction>(Vec)) {
20520 Value *V = ES->getVectorOperand();
20521 auto *IVec = cast<Instruction>(Vec);
20522 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20523 V = ETEs.front()->VectorizedValue;
20524 if (auto *IV = dyn_cast<Instruction>(V);
20525 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20526 IV->comesBefore(IVec))
20527 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20528 else
20529 Ex = Builder.CreateExtractElement(Vec, Lane);
20530 } else if (auto *VecTy =
20531 dyn_cast<FixedVectorType>(Scalar->getType())) {
20532 assert(SLPReVec && "FixedVectorType is not expected.");
20533 unsigned VecTyNumElements = VecTy->getNumElements();
20534 // When REVEC is enabled, we need to extract a vector.
20535 // Note: The element size of Scalar may be different from the
20536 // element size of Vec.
20537 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20538 ExternalUse.Lane * VecTyNumElements);
20539 } else {
20540 Ex = Builder.CreateExtractElement(Vec, Lane);
20541 }
20542 // If necessary, sign-extend or zero-extend the extracted value back
20543 // to the original (larger) scalar type.
20544 ExV = Ex;
20545 if (Scalar->getType() != Ex->getType())
20546 ExV = Builder.CreateIntCast(
20547 Ex, Scalar->getType(),
20548 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20549 auto *I = dyn_cast<Instruction>(Ex);
20550 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20551 : &F->getEntryBlock(),
20552 std::make_pair(Ex, ExV));
20553 }
20554 // The then-branch of the previous if may produce constants, since the
20555 // 0 operand might be a constant.
20556 if (auto *ExI = dyn_cast<Instruction>(Ex);
20557 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20558 GatherShuffleExtractSeq.insert(ExI);
20559 CSEBlocks.insert(ExI->getParent());
20560 }
20561 return ExV;
20562 }
20563 assert(isa<FixedVectorType>(Scalar->getType()) &&
20564 isa<InsertElementInst>(Scalar) &&
20565 "In-tree scalar of vector type is not insertelement?");
20566 auto *IE = cast<InsertElementInst>(Scalar);
20567 VectorToInsertElement.try_emplace(Vec, IE);
20568 return Vec;
20569 };
20570 // If User == nullptr, the Scalar remains as a scalar in the vectorized
20571 // instructions or is used as an extra argument. Generate an ExtractElement
20572 // instruction and update the record for this scalar in ExternallyUsedValues.
20573 if (!User) {
20574 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20575 continue;
20576 assert(
20577 (ExternallyUsedValues.count(Scalar) ||
20578 ExternalUsesWithNonUsers.count(Scalar) ||
20579 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20580 any_of(
20581 Scalar->users(),
20582 [&, TTI = TTI](llvm::User *U) {
20583 if (ExternalUsesAsOriginalScalar.contains(U))
20584 return true;
20585 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20586 return !UseEntries.empty() &&
20587 (E->State == TreeEntry::Vectorize ||
20588 E->State == TreeEntry::StridedVectorize ||
20589 E->State == TreeEntry::CompressVectorize) &&
20590 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20591 return (UseEntry->State == TreeEntry::Vectorize ||
20592 UseEntry->State ==
20593 TreeEntry::StridedVectorize ||
20594 UseEntry->State ==
20595 TreeEntry::CompressVectorize) &&
20596 doesInTreeUserNeedToExtract(
20597 Scalar, getRootEntryInstruction(*UseEntry),
20598 TLI, TTI);
20599 });
20600 })) &&
20601 "Scalar with nullptr User must be registered in "
20602 "ExternallyUsedValues map or remain as scalar in vectorized "
20603 "instructions");
20604 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20605 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20606 if (PHI->getParent()->isLandingPad())
20607 Builder.SetInsertPoint(
20608 PHI->getParent(),
20609 std::next(
20610 PHI->getParent()->getLandingPadInst()->getIterator()));
20611 else
20612 Builder.SetInsertPoint(PHI->getParent(),
20613 PHI->getParent()->getFirstNonPHIIt());
20614 } else {
20615 Builder.SetInsertPoint(VecI->getParent(),
20616 std::next(VecI->getIterator()));
20617 }
20618 } else {
20619 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20620 }
20621 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20622 // Required to update internally referenced instructions.
20623 if (Scalar != NewInst) {
20624 assert((!isa<ExtractElementInst>(Scalar) ||
20625 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20626 "Extractelements should not be replaced.");
20627 Scalar->replaceAllUsesWith(NewInst);
20628 }
20629 continue;
20630 }
20631
20632 if (auto *VU = dyn_cast<InsertElementInst>(User);
20633 VU && VU->getOperand(1) == Scalar) {
20634 // Skip if the scalar is another vector op or Vec is not an instruction.
20635 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20636 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20637 if (!UsedInserts.insert(VU).second)
20638 continue;
20639 // Need to use original vector, if the root is truncated.
20640 auto BWIt = MinBWs.find(E);
20641 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20642 auto *ScalarTy = FTy->getElementType();
20643 auto Key = std::make_pair(Vec, ScalarTy);
20644 auto VecIt = VectorCasts.find(Key);
20645 if (VecIt == VectorCasts.end()) {
20646 IRBuilderBase::InsertPointGuard Guard(Builder);
20647 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20648 if (IVec->getParent()->isLandingPad())
20649 Builder.SetInsertPoint(IVec->getParent(),
20650 std::next(IVec->getParent()
20651 ->getLandingPadInst()
20652 ->getIterator()));
20653 else
20654 Builder.SetInsertPoint(
20655 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20656 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20657 Builder.SetInsertPoint(IVec->getNextNode());
20658 }
20659 Vec = Builder.CreateIntCast(
20660 Vec,
20661 getWidenedType(
20662 ScalarTy,
20663 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20664 BWIt->second.second);
20665 VectorCasts.try_emplace(Key, Vec);
20666 } else {
20667 Vec = VecIt->second;
20668 }
20669 }
20670
20671 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20672 if (InsertIdx) {
20673 auto *It = find_if(
20674 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20675 // Checks if 2 insertelements are from the same buildvector.
20676 InsertElementInst *VecInsert = Data.InsertElements.front();
20677 return areTwoInsertFromSameBuildVector(
20678 VU, VecInsert,
20679 [](InsertElementInst *II) { return II->getOperand(0); });
20680 });
20681 unsigned Idx = *InsertIdx;
20682 if (It == ShuffledInserts.end()) {
20683 (void)ShuffledInserts.emplace_back();
20684 It = std::next(ShuffledInserts.begin(),
20685 ShuffledInserts.size() - 1);
20686 }
20687 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20688 if (Mask.empty())
20689 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20690 Mask[Idx] = ExternalUse.Lane;
20691 It->InsertElements.push_back(cast<InsertElementInst>(User));
20692 continue;
20693 }
20694 }
20695 }
20696 }
20697
20698 // Generate extracts for out-of-tree users.
20699 // Find the insertion point for the extractelement lane.
20700 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20701 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20702 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20703 if (PH->getIncomingValue(I) == Scalar) {
20704 Instruction *IncomingTerminator =
20705 PH->getIncomingBlock(I)->getTerminator();
20706 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20707 Builder.SetInsertPoint(VecI->getParent(),
20708 std::next(VecI->getIterator()));
20709 } else {
20710 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20711 }
20712 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20713 PH->setOperand(I, NewInst);
20714 }
20715 }
20716 } else {
20717 Builder.SetInsertPoint(cast<Instruction>(User));
20718 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20719 User->replaceUsesOfWith(Scalar, NewInst);
20720 }
20721 } else {
20722 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20723 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20724 User->replaceUsesOfWith(Scalar, NewInst);
20725 }
20726
20727 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20728 }
20729
20730 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20731 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20732 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20733 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20734 for (int I = 0, E = Mask.size(); I < E; ++I) {
20735 if (Mask[I] < VF)
20736 CombinedMask1[I] = Mask[I];
20737 else
20738 CombinedMask2[I] = Mask[I] - VF;
20739 }
20740 ShuffleInstructionBuilder ShuffleBuilder(
20741 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20742 ShuffleBuilder.add(V1, CombinedMask1);
20743 if (V2)
20744 ShuffleBuilder.add(V2, CombinedMask2);
20745 return ShuffleBuilder.finalize({}, {}, {});
20746 };
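// Illustrative sketch (hypothetical values, not from a specific test): for two
// 4-wide sources and Mask = <0, 5, 2, 7>, the loop above yields
//   CombinedMask1 = <0, poison, 2, poison>  (lanes taken from V1)
//   CombinedMask2 = <poison, 1, poison, 3>  (lanes taken from V2, rebased by VF)
// and the ShuffleInstructionBuilder folds the two into a single shuffle.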
20747
20748 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20749 bool ForSingleMask) {
20750 unsigned VF = Mask.size();
20751 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20752 if (VF != VecVF) {
20753 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20754 Vec = CreateShuffle(Vec, nullptr, Mask);
20755 return std::make_pair(Vec, true);
20756 }
20757 if (!ForSingleMask) {
20758 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20759 for (unsigned I = 0; I < VF; ++I) {
20760 if (Mask[I] != PoisonMaskElem)
20761 ResizeMask[Mask[I]] = Mask[I];
20762 }
20763 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20764 }
20765 }
20766
20767 return std::make_pair(Vec, false);
20768 };
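// Rough example (hypothetical sizes): for an 8-wide Vec and a 4-wide
// Mask = <0, 1, 2, 3>, no index reaches past VF, so ResizeMask is the identity
// <0, 1, 2, 3> and the shuffle above merely narrows Vec to 4 lanes; a mask that
// does reference lanes >= VF is instead applied directly and signalled through
// the 'true' flag of the returned pair.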
20769 // Perform shuffling of the vectorized tree entries for better handling of
20770 // external extracts.
20771 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20772 // Find the first and the last instruction in the list of insertelements.
20773 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20774 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20775 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20776 Builder.SetInsertPoint(LastInsert);
20777 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20778 Value *NewInst = performExtractsShuffleAction<Value>(
20779 MutableArrayRef(Vector.data(), Vector.size()),
20780 FirstInsert->getOperand(0),
20781 [](Value *Vec) {
20782 return cast<VectorType>(Vec->getType())
20783 ->getElementCount()
20784 .getKnownMinValue();
20785 },
20786 ResizeToVF,
20787 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20788 ArrayRef<Value *> Vals) {
20789 assert((Vals.size() == 1 || Vals.size() == 2) &&
20790 "Expected exactly 1 or 2 input values.");
20791 if (Vals.size() == 1) {
20792 // Do not create shuffle if the mask is a simple identity
20793 // non-resizing mask.
20794 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20795 ->getNumElements() ||
20796 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20797 return CreateShuffle(Vals.front(), nullptr, Mask);
20798 return Vals.front();
20799 }
20800 return CreateShuffle(Vals.front() ? Vals.front()
20801 : FirstInsert->getOperand(0),
20802 Vals.back(), Mask);
20803 });
20804 auto It = ShuffledInserts[I].InsertElements.rbegin();
20805 // Rebuild buildvector chain.
20806 InsertElementInst *II = nullptr;
20807 if (It != ShuffledInserts[I].InsertElements.rend())
20808 II = *It;
20809 SmallVector<Instruction *> Inserts;
20810 while (It != ShuffledInserts[I].InsertElements.rend()) {
20811 assert(II && "Must be an insertelement instruction.");
20812 if (*It == II)
20813 ++It;
20814 else
20815 Inserts.push_back(cast<Instruction>(II));
20816 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20817 }
20818 for (Instruction *II : reverse(Inserts)) {
20819 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20820 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20821 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20822 II->moveAfter(NewI);
20823 NewInst = II;
20824 }
20825 LastInsert->replaceAllUsesWith(NewInst);
20826 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20827 IE->replaceUsesOfWith(IE->getOperand(0),
20828 PoisonValue::get(IE->getOperand(0)->getType()));
20829 IE->replaceUsesOfWith(IE->getOperand(1),
20830 PoisonValue::get(IE->getOperand(1)->getType()));
20831 eraseInstruction(IE);
20832 }
20833 CSEBlocks.insert(LastInsert->getParent());
20834 }
20835
20836 SmallVector<Instruction *> RemovedInsts;
20837 // For each vectorized value:
20838 for (auto &TEPtr : VectorizableTree) {
20839 TreeEntry *Entry = TEPtr.get();
20840
20841 // No need to handle users of gathered values.
20842 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20843 continue;
20844
20845 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20846
20847 // For each lane:
20848 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20849 Value *Scalar = Entry->Scalars[Lane];
20850
20851 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20852 !isa<GetElementPtrInst>(Scalar))
20853 continue;
20854 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20855 EE && IgnoredExtracts.contains(EE))
20856 continue;
20857 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20858 continue;
20859#ifndef NDEBUG
20860 Type *Ty = Scalar->getType();
20861 if (!Ty->isVoidTy()) {
20862 for (User *U : Scalar->users()) {
20863 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20864
20865 // It is legal to delete users in the ignorelist.
20866 assert((isVectorized(U) ||
20867 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20870 "Deleting out-of-tree value");
20871 }
20872 }
20873#endif
20874 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20875 auto *I = cast<Instruction>(Scalar);
20876 RemovedInsts.push_back(I);
20877 }
20878 }
20879
20880 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20881 // new vector instruction.
20882 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20883 V->mergeDIAssignID(RemovedInsts);
20884
20885 // Clear up reduction references, if any.
20886 if (UserIgnoreList) {
20887 for (Instruction *I : RemovedInsts) {
20888 const TreeEntry *IE = getTreeEntries(I).front();
20889 if (IE->Idx != 0 &&
20890 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20891 (ValueToGatherNodes.lookup(I).contains(
20892 VectorizableTree.front().get()) ||
20893 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20894 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20895 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20896 IE->UserTreeIndex &&
20897 is_contained(VectorizableTree.front()->Scalars, I)) &&
20898 !(GatheredLoadsEntriesFirst.has_value() &&
20899 IE->Idx >= *GatheredLoadsEntriesFirst &&
20900 VectorizableTree.front()->isGather() &&
20901 is_contained(VectorizableTree.front()->Scalars, I)) &&
20902 !(!VectorizableTree.front()->isGather() &&
20903 VectorizableTree.front()->isCopyableElement(I)))
20904 continue;
20905 SmallVector<SelectInst *> LogicalOpSelects;
20906 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20907 // Do not replace condition of the logical op in form select <cond>.
20908 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20909 (match(U.getUser(), m_LogicalAnd()) ||
20910 match(U.getUser(), m_LogicalOr())) &&
20911 U.getOperandNo() == 0;
20912 if (IsPoisoningLogicalOp) {
20913 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20914 return false;
20915 }
20916 return UserIgnoreList->contains(U.getUser());
20917 });
20918 // Replace conditions of the poisoning logical ops with the non-poison
20919 // constant value.
20920 for (SelectInst *SI : LogicalOpSelects)
20921 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20922 }
20923 }
20924 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20925 // cache correctness.
20926 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
20927 // - instructions are not deleted until later.
20928 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20929
20930 Builder.ClearInsertionPoint();
20931 InstrElementSize.clear();
20932
20933 const TreeEntry &RootTE = *VectorizableTree.front();
20934 Value *Vec = RootTE.VectorizedValue;
20935 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20936 It != MinBWs.end() &&
20937 ReductionBitWidth != It->second.first) {
20938 IRBuilder<>::InsertPointGuard Guard(Builder);
20939 Builder.SetInsertPoint(ReductionRoot->getParent(),
20940 ReductionRoot->getIterator());
20941 Vec = Builder.CreateIntCast(
20942 Vec,
20943 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20944 cast<VectorType>(Vec->getType())->getElementCount()),
20945 It->second.second);
20946 }
20947 return Vec;
20948}
20949
20951 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20952 << " gather sequences instructions.\n");
20953 // LICM InsertElementInst sequences.
20954 for (Instruction *I : GatherShuffleExtractSeq) {
20955 if (isDeleted(I))
20956 continue;
20957
20958 // Check if this block is inside a loop.
20959 Loop *L = LI->getLoopFor(I->getParent());
20960 if (!L)
20961 continue;
20962
20963 // Check if it has a preheader.
20964 BasicBlock *PreHeader = L->getLoopPreheader();
20965 if (!PreHeader)
20966 continue;
20967
20968 // If the vector or the element that we insert into it are
20969 // instructions that are defined in this basic block then we can't
20970 // hoist this instruction.
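// For example (hypothetical IR), a gather sequence such as
//   %v0 = insertelement <2 x float> poison, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0, float %b, i32 1
// built inside a loop body can be moved to the preheader when %a and %b are
// defined outside the loop; if either operand is defined inside the loop, the
// sequence has to stay where it is.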
20971 if (any_of(I->operands(), [L](Value *V) {
20972 auto *OpI = dyn_cast<Instruction>(V);
20973 return OpI && L->contains(OpI);
20974 }))
20975 continue;
20976
20977 // We can hoist this instruction. Move it to the pre-header.
20978 I->moveBefore(PreHeader->getTerminator()->getIterator());
20979 CSEBlocks.insert(PreHeader);
20980 }
20981
20982 // Make a list of all reachable blocks in our CSE queue.
20983 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20984 CSEWorkList.reserve(CSEBlocks.size());
20985 for (BasicBlock *BB : CSEBlocks)
20986 if (DomTreeNode *N = DT->getNode(BB)) {
20987 assert(DT->isReachableFromEntry(N));
20988 CSEWorkList.push_back(N);
20989 }
20990
20991 // Sort blocks by domination. This ensures we visit a block after all blocks
20992 // dominating it are visited.
20993 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20994 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20995 "Different nodes should have different DFS numbers");
20996 return A->getDFSNumIn() < B->getDFSNumIn();
20997 });
20998
20999 // Less defined shuffles can be replaced by the more defined copies.
21000 // Between two shuffles one is less defined if it has the same vector operands
21001 // and its mask indices are the same as in the first one or undefs. E.g.
21002 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
21003 // poison, <0, 0, 0, 0>.
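// A small worked example (hypothetical masks): with SI1's mask <0, 1, poison, 3>
// and SI2's mask <0, poison, 2, 3>, no defined lane disagrees, so NewMask below
// becomes the union <0, 1, 2, 3> and the surviving shuffle can be updated to it.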
21004 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
21005 Instruction *I2,
21006 SmallVectorImpl<int> &NewMask) {
21007 if (I1->getType() != I2->getType())
21008 return false;
21009 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
21010 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
21011 if (!SI1 || !SI2)
21012 return I1->isIdenticalTo(I2);
21013 if (SI1->isIdenticalTo(SI2))
21014 return true;
21015 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
21016 if (SI1->getOperand(I) != SI2->getOperand(I))
21017 return false;
21018 // Check if the second instruction is more defined than the first one.
21019 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
21020 ArrayRef<int> SM1 = SI1->getShuffleMask();
21021 // Count trailing undefs in the mask to check the final number of used
21022 // registers.
21023 unsigned LastUndefsCnt = 0;
21024 for (int I = 0, E = NewMask.size(); I < E; ++I) {
21025 if (SM1[I] == PoisonMaskElem)
21026 ++LastUndefsCnt;
21027 else
21028 LastUndefsCnt = 0;
21029 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
21030 NewMask[I] != SM1[I])
21031 return false;
21032 if (NewMask[I] == PoisonMaskElem)
21033 NewMask[I] = SM1[I];
21034 }
21035 // Check if the last undefs actually change the final number of used vector
21036 // registers.
21037 return SM1.size() - LastUndefsCnt > 1 &&
21038 ::getNumberOfParts(*TTI, SI1->getType()) ==
21039 ::getNumberOfParts(
21040 *TTI, getWidenedType(SI1->getType()->getElementType(),
21041 SM1.size() - LastUndefsCnt));
21042 };
21043 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
21044 // instructions. TODO: We can further optimize this scan if we split the
21045 // instructions into different buckets based on the insert lane.
21046 SmallVector<Instruction *, 16> Visited;
21047 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21048 assert(*I &&
21049 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
21050 "Worklist not sorted properly!");
21051 BasicBlock *BB = (*I)->getBlock();
21052 // For all instructions in blocks containing gather sequences:
21053 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
21054 if (isDeleted(&In))
21055 continue;
21056 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
21057 !GatherShuffleExtractSeq.contains(&In))
21058 continue;
21059
21060 // Check if we can replace this instruction with any of the
21061 // visited instructions.
21062 bool Replaced = false;
21063 for (Instruction *&V : Visited) {
21064 SmallVector<int> NewMask;
21065 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
21066 DT->dominates(V->getParent(), In.getParent())) {
21067 In.replaceAllUsesWith(V);
21068 eraseInstruction(&In);
21069 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
21070 if (!NewMask.empty())
21071 SI->setShuffleMask(NewMask);
21072 Replaced = true;
21073 break;
21074 }
21075 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
21076 GatherShuffleExtractSeq.contains(V) &&
21077 IsIdenticalOrLessDefined(V, &In, NewMask) &&
21078 DT->dominates(In.getParent(), V->getParent())) {
21079 In.moveAfter(V);
21080 V->replaceAllUsesWith(&In);
21081 eraseInstruction(V);
21082 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
21083 if (!NewMask.empty())
21084 SI->setShuffleMask(NewMask);
21085 V = &In;
21086 Replaced = true;
21087 break;
21088 }
21089 }
21090 if (!Replaced) {
21091 assert(!is_contained(Visited, &In));
21092 Visited.push_back(&In);
21093 }
21094 }
21095 }
21096 CSEBlocks.clear();
21097 GatherShuffleExtractSeq.clear();
21098}
21099
21100BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
21101 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
21102 auto &BundlePtr =
21103 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
21104 for (Value *V : VL) {
21105 if (S.isNonSchedulable(V))
21106 continue;
21107 auto *I = cast<Instruction>(V);
21108 if (S.isCopyableElement(V)) {
21109 // Add a copyable element model.
21110 ScheduleCopyableData &SD =
21111 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
21112 // Group the instructions to a bundle.
21113 BundlePtr->add(&SD);
21114 continue;
21115 }
21116 ScheduleData *BundleMember = getScheduleData(V);
21117 assert(BundleMember && "no ScheduleData for bundle member "
21118 "(maybe not in same basic block)");
21119 // Group the instructions to a bundle.
21120 BundlePtr->add(BundleMember);
21121 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
21122 BundlePtr.get());
21123 }
21124 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
21125 return *BundlePtr;
21126}
21127
21128 // Groups the instructions into a bundle (which is then a single scheduling
21129 // entity) and schedules instructions until the bundle gets ready.
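// For instance (hypothetical IR), when vectorizing two adjacent stores
//   store i32 %a, ptr %p
//   store i32 %b, ptr %q
// the scheduler forms a single bundle from both stores and only reports success
// once all of their dependencies can be scheduled without creating a cycle.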
21130std::optional<BoUpSLP::ScheduleBundle *>
21131BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
21132 const InstructionsState &S,
21133 const EdgeInfo &EI) {
21134 // No need to schedule PHIs, insertelement, extractelement and extractvalue
21135 // instructions.
21136 if (isa<PHINode>(S.getMainOp()) ||
21137 isVectorLikeInstWithConstOps(S.getMainOp()))
21138 return nullptr;
21139 // If the parent node is non-schedulable and the current node is copyable, and
21140 // any of the parent's instructions are used outside several basic blocks or in
21141 // a bin-op node - cancel scheduling, as it may cause wrong def-use deps in the
21142 // analysis, leading to a crash.
21143 // Non-scheduled nodes may not have a related ScheduleData model, which may lead
21144 // to a skipped dep analysis.
21145 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21146 EI.UserTE->doesNotNeedToSchedule() &&
21147 EI.UserTE->getOpcode() != Instruction::PHI &&
21148 any_of(EI.UserTE->Scalars, [](Value *V) {
21149 auto *I = dyn_cast<Instruction>(V);
21150 if (!I || I->hasOneUser())
21151 return false;
21152 for (User *U : I->users()) {
21153 auto *UI = cast<Instruction>(U);
21154 if (isa<BinaryOperator>(UI))
21155 return true;
21156 }
21157 return false;
21158 }))
21159 return std::nullopt;
21160 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21161 EI.UserTE->hasCopyableElements() &&
21162 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21163 all_of(VL, [&](Value *V) {
21164 if (S.isCopyableElement(V))
21165 return true;
21166 return isUsedOutsideBlock(V);
21167 }))
21168 return std::nullopt;
21169 // If any instruction is used only outside the block and its operand is placed
21170 // immediately before it, do not schedule, as it may cause a wrong def-use chain.
21171 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
21172 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
21173 return false;
21174 if (isUsedOutsideBlock(V)) {
21175 for (Value *Op : cast<Instruction>(V)->operands()) {
21176 auto *I = dyn_cast<Instruction>(Op);
21177 if (!I)
21178 continue;
21179 return SLP->isVectorized(I) && I->getNextNode() == V;
21180 }
21181 }
21182 return false;
21183 }))
21184 return std::nullopt;
21185 if (S.areInstructionsWithCopyableElements() && EI) {
21186 bool IsNonSchedulableWithParentPhiNode =
21187 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
21188 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
21189 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21190 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21191 if (IsNonSchedulableWithParentPhiNode) {
21192 SmallSet<std::pair<Value *, Value *>, 4> Values;
21193 for (const auto [Idx, V] :
21194 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
21195 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
21196 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
21197 auto *I = dyn_cast<Instruction>(Op);
21198 if (!I || !isCommutative(I))
21199 continue;
21200 if (!Values.insert(std::make_pair(V, Op)).second)
21201 return std::nullopt;
21202 }
21203 }
21204 }
21205 bool HasCopyables = S.areInstructionsWithCopyableElements();
21206 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21207 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21208 // If all operands were replaced by copyables, the operands of this node
21209 // might not be, so we need to recalculate dependencies for the schedule data
21210 // replaced by copyable schedule data.
21211 SmallVector<ScheduleData *> ControlDependentMembers;
21212 for (Value *V : VL) {
21213 auto *I = dyn_cast<Instruction>(V);
21214 if (!I || (HasCopyables && S.isCopyableElement(V)))
21215 continue;
21216 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21217 for (const Use &U : I->operands()) {
21218 unsigned &NumOps =
21219 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21220 .first->getSecond();
21221 ++NumOps;
21222 if (auto *Op = dyn_cast<Instruction>(U.get());
21223 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21224 if (ScheduleData *OpSD = getScheduleData(Op);
21225 OpSD && OpSD->hasValidDependencies()) {
21226 OpSD->clearDirectDependencies();
21227 if (RegionHasStackSave ||
21228 !isGuaranteedToTransferExecutionToSuccessor(Op))
21229 ControlDependentMembers.push_back(OpSD);
21230 }
21231 }
21232 }
21233 }
21234 if (!ControlDependentMembers.empty()) {
21235 ScheduleBundle Invalid = ScheduleBundle::invalid();
21236 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
21237 ControlDependentMembers);
21238 }
21239 return nullptr;
21240 }
21241
21242 // Initialize the instruction bundle.
21243 Instruction *OldScheduleEnd = ScheduleEnd;
21244 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21245
21246 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21247 // Clear deps or recalculate the region, if the memory instruction is a
21248 // copyable. It may have memory deps, which must be recalculated.
21249 SmallVector<ScheduleData *> ControlDependentMembers;
21250 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21251 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21252 for (ScheduleEntity *SE : Bundle.getBundle()) {
21253 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21254 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21255 BundleMember && BundleMember->hasValidDependencies()) {
21256 BundleMember->clearDirectDependencies();
21257 if (RegionHasStackSave ||
21258 !isGuaranteedToTransferExecutionToSuccessor(
21259 BundleMember->getInst()))
21260 ControlDependentMembers.push_back(BundleMember);
21261 }
21262 continue;
21263 }
21264 auto *SD = cast<ScheduleData>(SE);
21265 if (SD->hasValidDependencies() &&
21266 (!S.areInstructionsWithCopyableElements() ||
21267 !S.isCopyableElement(SD->getInst())) &&
21268 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21269 EI.UserTE->hasState() &&
21270 (!EI.UserTE->hasCopyableElements() ||
21271 !EI.UserTE->isCopyableElement(SD->getInst())))
21272 SD->clearDirectDependencies();
21273 for (const Use &U : SD->getInst()->operands()) {
21274 unsigned &NumOps =
21275 UserOpToNumOps
21276 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21277 .first->getSecond();
21278 ++NumOps;
21279 if (auto *Op = dyn_cast<Instruction>(U.get());
21280 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21281 *SLP, NumOps)) {
21282 if (ScheduleData *OpSD = getScheduleData(Op);
21283 OpSD && OpSD->hasValidDependencies()) {
21284 OpSD->clearDirectDependencies();
21285 if (RegionHasStackSave ||
21286 !isGuaranteedToTransferExecutionToSuccessor(Op))
21287 ControlDependentMembers.push_back(OpSD);
21288 }
21289 }
21290 }
21291 }
21292 };
21293 // The scheduling region got new instructions at the lower end (or it is a
21294 // new region for the first bundle). This makes it necessary to
21295 // recalculate all dependencies.
21296 // It is seldom that this needs to be done a second time after adding the
21297 // initial bundle to the region.
21298 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21299 for_each(ScheduleDataMap, [&](auto &P) {
21300 if (BB != P.first->getParent())
21301 return;
21302 ScheduleData *SD = P.second;
21303 if (isInSchedulingRegion(*SD))
21304 SD->clearDependencies();
21305 });
21306 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21307 for_each(P.second, [&](ScheduleCopyableData *SD) {
21308 if (isInSchedulingRegion(*SD))
21309 SD->clearDependencies();
21310 });
21311 });
21312 ReSchedule = true;
21313 }
21314 // Check if the bundle data has deps for copyable elements already. In
21315 // this case we need to reset the deps and recalculate them.
21316 if (Bundle && !Bundle.getBundle().empty()) {
21317 if (S.areInstructionsWithCopyableElements() ||
21318 !ScheduleCopyableDataMap.empty())
21319 CheckIfNeedToClearDeps(Bundle);
21320 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21321 << BB->getName() << "\n");
21322 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21323 ControlDependentMembers);
21324 } else if (!ControlDependentMembers.empty()) {
21325 ScheduleBundle Invalid = ScheduleBundle::invalid();
21326 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21327 ControlDependentMembers);
21328 }
21329
21330 if (ReSchedule) {
21331 resetSchedule();
21332 initialFillReadyList(ReadyInsts);
21333 }
21334
21335 // Now try to schedule the new bundle or (if no bundle) just calculate
21336 // dependencies. As soon as the bundle is "ready" it means that there are no
21337 // cyclic dependencies and we can schedule it. Note that it's important that we
21338 // don't "schedule" the bundle yet.
21339 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21340 !ReadyInsts.empty()) {
21341 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21342 assert(Picked->isReady() && "must be ready to schedule");
21343 schedule(*SLP, S, EI, Picked, ReadyInsts);
21344 if (Picked == &Bundle)
21345 break;
21346 }
21347 };
21348
21349 // Make sure that the scheduling region contains all
21350 // instructions of the bundle.
21351 for (Value *V : VL) {
21352 if (S.isNonSchedulable(V))
21353 continue;
21354 if (!extendSchedulingRegion(V, S)) {
21355 // If the scheduling region got new instructions at the lower end (or it
21356 // is a new region for the first bundle), it is necessary to recalculate
21357 // all dependencies.
21358 // Otherwise the compiler may crash trying to incorrectly calculate
21359 // dependencies and emit instructions in the wrong order at the actual
21360 // scheduling.
21361 ScheduleBundle Invalid = ScheduleBundle::invalid();
21362 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21363 return std::nullopt;
21364 }
21365 }
21366
21367 bool ReSchedule = false;
21368 for (Value *V : VL) {
21369 if (S.isNonSchedulable(V))
21370 continue;
21371 SmallVector<ScheduleCopyableData *> CopyableData =
21372 getScheduleCopyableData(cast<Instruction>(V));
21373 if (!CopyableData.empty()) {
21374 for (ScheduleCopyableData *SD : CopyableData)
21375 ReadyInsts.remove(SD);
21376 }
21377 ScheduleData *BundleMember = getScheduleData(V);
21378 assert((BundleMember || S.isCopyableElement(V)) &&
21379 "no ScheduleData for bundle member (maybe not in same basic block)");
21380 if (!BundleMember)
21381 continue;
21382
21383 // Make sure we don't leave the pieces of the bundle in the ready list when
21384 // the whole bundle might not be ready.
21385 ReadyInsts.remove(BundleMember);
21386 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21387 !Bundles.empty()) {
21388 for (ScheduleBundle *B : Bundles)
21389 ReadyInsts.remove(B);
21390 }
21391
21392 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21393 continue;
21394 // A bundle member was scheduled as a single instruction before and now
21395 // needs to be scheduled as part of the bundle. We just get rid of the
21396 // existing schedule.
21397 // A bundle member may also have had its deps calculated before it became a
21398 // copyable element - in that case we need to reschedule.
21399 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21400 << " was already scheduled\n");
21401 ReSchedule = true;
21402 }
21403
21404 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21405 TryScheduleBundleImpl(ReSchedule, Bundle);
21406 if (!Bundle.isReady()) {
21407 for (ScheduleEntity *BD : Bundle.getBundle()) {
21408 // Copyable data scheduling is just removed.
21409 if (isa<ScheduleCopyableData>(BD))
21410 continue;
21411 if (BD->isReady()) {
21412 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21413 if (Bundles.empty()) {
21414 ReadyInsts.insert(BD);
21415 continue;
21416 }
21417 for (ScheduleBundle *B : Bundles)
21418 if (B->isReady())
21419 ReadyInsts.insert(B);
21420 }
21421 }
21422 ScheduledBundlesList.pop_back();
21423 SmallVector<ScheduleData *> ControlDependentMembers;
21424 for (Value *V : VL) {
21425 if (S.isNonSchedulable(V))
21426 continue;
21427 auto *I = cast<Instruction>(V);
21428 if (S.isCopyableElement(I)) {
21429 // Remove the copyable data from the scheduling region and restore
21430 // previous mappings.
21431 auto KV = std::make_pair(EI, I);
21432 assert(ScheduleCopyableDataMap.contains(KV) &&
21433 "no ScheduleCopyableData for copyable element");
21434 ScheduleCopyableData *SD =
21435 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21436 ScheduleCopyableDataMapByUsers[I].remove(SD);
21437 if (EI.UserTE) {
21438 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21439 const auto *It = find(Op, I);
21440 assert(It != Op.end() && "Lane not set");
21441 SmallPtrSet<Instruction *, 4> Visited;
21442 do {
21443 int Lane = std::distance(Op.begin(), It);
21444 assert(Lane >= 0 && "Lane not set");
21445 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21446 !EI.UserTE->ReorderIndices.empty())
21447 Lane = EI.UserTE->ReorderIndices[Lane];
21448 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21449 "Couldn't find extract lane");
21450 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21451 if (!Visited.insert(In).second) {
21452 It = find(make_range(std::next(It), Op.end()), I);
21453 break;
21454 }
21455 ScheduleCopyableDataMapByInstUser
21456 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21457 .pop_back();
21458 It = find(make_range(std::next(It), Op.end()), I);
21459 } while (It != Op.end());
21460 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21461 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21462 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21463 }
21464 if (ScheduleCopyableDataMapByUsers[I].empty())
21465 ScheduleCopyableDataMapByUsers.erase(I);
21466 ScheduleCopyableDataMap.erase(KV);
21467 // Need to recalculate dependencies for the actual schedule data.
21468 if (ScheduleData *OpSD = getScheduleData(I);
21469 OpSD && OpSD->hasValidDependencies()) {
21470 OpSD->clearDirectDependencies();
21471 if (RegionHasStackSave ||
21472 !isGuaranteedToTransferExecutionToSuccessor(I))
21473 ControlDependentMembers.push_back(OpSD);
21474 }
21475 continue;
21476 }
21477 ScheduledBundles.find(I)->getSecond().pop_back();
21478 }
21479 if (!ControlDependentMembers.empty()) {
21480 ScheduleBundle Invalid = ScheduleBundle::invalid();
21481 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21482 ControlDependentMembers);
21483 }
21484 return std::nullopt;
21485 }
21486 return &Bundle;
21487}
21488
21489BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21490 // Allocate a new ScheduleData for the instruction.
21491 if (ChunkPos >= ChunkSize) {
21492 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21493 ChunkPos = 0;
21494 }
21495 return &(ScheduleDataChunks.back()[ChunkPos++]);
21496}
21497
21498bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21499 Value *V, const InstructionsState &S) {
21501 assert(I && "bundle member must be an instruction");
21502 if (getScheduleData(I))
21503 return true;
21504 if (!ScheduleStart) {
21505 // It's the first instruction in the new region.
21506 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21507 ScheduleStart = I;
21508 ScheduleEnd = I->getNextNode();
21509 assert(ScheduleEnd && "tried to vectorize a terminator?");
21510 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21511 return true;
21512 }
21513 // Search up and down at the same time, because we don't know if the new
21514 // instruction is above or below the existing scheduling region.
21515 // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
21516 // against the budget. Otherwise debug info could affect codegen.
21517 BasicBlock::reverse_iterator UpIter =
21518 ++ScheduleStart->getIterator().getReverse();
21519 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21520 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21521 BasicBlock::iterator LowerEnd = BB->end();
21522 auto IsAssumeLikeIntr = [](const Instruction &I) {
21523 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21524 return II->isAssumeLikeIntrinsic();
21525 return false;
21526 };
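// Sketch of the search below (hypothetical layout): UpIter walks upwards from
// ScheduleStart while DownIter walks downwards from ScheduleEnd; whichever
// iterator reaches the new instruction I first decides whether the region is
// extended at the top or at the bottom, and the walk is capped by
// ScheduleRegionSizeLimit so very large blocks are not scanned quadratically.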
21527 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21528 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21529 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21530 &*DownIter != I) {
21531 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21532 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21533 return false;
21534 }
21535
21536 ++UpIter;
21537 ++DownIter;
21538
21539 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21540 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21541 }
21542 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21543 assert(I->getParent() == ScheduleStart->getParent() &&
21544 "Instruction is in wrong basic block.");
21545 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21546 ScheduleStart = I;
21547 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21548 << "\n");
21549 return true;
21550 }
21551 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21552 "Expected to reach top of the basic block or instruction down the "
21553 "lower end.");
21554 assert(I->getParent() == ScheduleEnd->getParent() &&
21555 "Instruction is in wrong basic block.");
21556 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21557 nullptr);
21558 ScheduleEnd = I->getNextNode();
21559 assert(ScheduleEnd && "tried to vectorize a terminator?");
21560 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21561 return true;
21562}
21563
21564void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21565 Instruction *ToI,
21566 ScheduleData *PrevLoadStore,
21567 ScheduleData *NextLoadStore) {
21568 ScheduleData *CurrentLoadStore = PrevLoadStore;
21569 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21570 // No need to allocate data for non-schedulable instructions.
21571 if (isa<PHINode>(I))
21572 continue;
21573 ScheduleData *SD = ScheduleDataMap.lookup(I);
21574 if (!SD) {
21575 SD = allocateScheduleDataChunks();
21576 ScheduleDataMap[I] = SD;
21577 }
21578 assert(!isInSchedulingRegion(*SD) &&
21579 "new ScheduleData already in scheduling region");
21580 SD->init(SchedulingRegionID, I);
21581
21582 auto CanIgnoreLoad = [](const Instruction *I) {
21583 const auto *LI = dyn_cast<LoadInst>(I);
21584 // If there is a simple load marked as invariant, we can ignore it.
21585 // But, in the (unlikely) case of non-simple invariant load,
21586 // we should not ignore it.
21587 return LI && LI->isSimple() &&
21588 LI->getMetadata(LLVMContext::MD_invariant_load);
21589 };
21590
21591 if (I->mayReadOrWriteMemory() &&
21592 // Simple InvariantLoad does not depend on other memory accesses.
21593 !CanIgnoreLoad(I) &&
21594 (!isa<IntrinsicInst>(I) ||
21595 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21596 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21597 Intrinsic::pseudoprobe))) {
21598 // Update the linked list of memory accessing instructions.
21599 if (CurrentLoadStore) {
21600 CurrentLoadStore->setNextLoadStore(SD);
21601 } else {
21602 FirstLoadStoreInRegion = SD;
21603 }
21604 CurrentLoadStore = SD;
21605 }
21606
21607 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21608 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21609 RegionHasStackSave = true;
21610 }
21611 if (NextLoadStore) {
21612 if (CurrentLoadStore)
21613 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21614 } else {
21615 LastLoadStoreInRegion = CurrentLoadStore;
21616 }
21617}
21618
21619void BoUpSLP::BlockScheduling::calculateDependencies(
21620 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21621 ArrayRef<ScheduleData *> ControlDeps) {
21622 SmallVector<ScheduleEntity *> WorkList;
21623 auto ProcessNode = [&](ScheduleEntity *SE) {
21624 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21625 if (CD->hasValidDependencies())
21626 return;
21627 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21628 CD->initDependencies();
21629 CD->resetUnscheduledDeps();
21630 const EdgeInfo &EI = CD->getEdgeInfo();
21631 if (EI.UserTE) {
21632 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21633 const auto *It = find(Op, CD->getInst());
21634 assert(It != Op.end() && "Lane not set");
21635 SmallPtrSet<Instruction *, 4> Visited;
21636 do {
21637 int Lane = std::distance(Op.begin(), It);
21638 assert(Lane >= 0 && "Lane not set");
21639 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21640 !EI.UserTE->ReorderIndices.empty())
21641 Lane = EI.UserTE->ReorderIndices[Lane];
21642 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21643 "Couldn't find extract lane");
21644 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21645 if (EI.UserTE->isCopyableElement(In)) {
21646 // We may not have related copyable scheduling data, if the
21647 // instruction is non-schedulable.
21648 if (ScheduleCopyableData *UseSD =
21649 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21650 CD->incDependencies();
21651 if (!UseSD->isScheduled())
21652 CD->incrementUnscheduledDeps(1);
21653 if (!UseSD->hasValidDependencies() ||
21654 (InsertInReadyList && UseSD->isReady()))
21655 WorkList.push_back(UseSD);
21656 }
21657 } else if (Visited.insert(In).second) {
21658 if (ScheduleData *UseSD = getScheduleData(In)) {
21659 CD->incDependencies();
21660 if (!UseSD->isScheduled())
21661 CD->incrementUnscheduledDeps(1);
21662 if (!UseSD->hasValidDependencies() ||
21663 (InsertInReadyList && UseSD->isReady()))
21664 WorkList.push_back(UseSD);
21665 }
21666 }
21667 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21668 } while (It != Op.end());
21669 if (CD->isReady() && CD->getDependencies() == 0 &&
21670 (EI.UserTE->hasState() &&
21671 (EI.UserTE->getMainOp()->getParent() !=
21672 CD->getInst()->getParent() ||
21673 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21674 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21675 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21676 auto *IU = dyn_cast<Instruction>(U);
21677 if (!IU)
21678 return true;
21679 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21680 })))))) {
21681 // If there are no uses in the block, mark it as having a pseudo-use, which
21682 // cannot be scheduled.
21683 // This prevents incorrect def-use tracking between an external user and the
21684 // actual instruction.
21685 CD->incDependencies();
21686 CD->incrementUnscheduledDeps(1);
21687 }
21688 }
21689 return;
21690 }
21691 auto *BundleMember = cast<ScheduleData>(SE);
21692 if (BundleMember->hasValidDependencies())
21693 return;
21694 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21695 BundleMember->initDependencies();
21696 BundleMember->resetUnscheduledDeps();
21697 // Handle def-use chain dependencies.
21698 SmallDenseMap<Value *, unsigned> UserToNumOps;
21699 for (User *U : BundleMember->getInst()->users()) {
21700 if (isa<PHINode>(U))
21701 continue;
21702 if (ScheduleData *UseSD = getScheduleData(U)) {
21703 // The operand is a copyable element - skip.
21704 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21705 ++NumOps;
21706 if (areAllOperandsReplacedByCopyableData(
21707 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21708 continue;
21709 BundleMember->incDependencies();
21710 if (!UseSD->isScheduled())
21711 BundleMember->incrementUnscheduledDeps(1);
21712 if (!UseSD->hasValidDependencies() ||
21713 (InsertInReadyList && UseSD->isReady()))
21714 WorkList.push_back(UseSD);
21715 }
21716 }
21717 for (ScheduleCopyableData *UseSD :
21718 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21719 BundleMember->incDependencies();
21720 if (!UseSD->isScheduled())
21721 BundleMember->incrementUnscheduledDeps(1);
21722 if (!UseSD->hasValidDependencies() ||
21723 (InsertInReadyList && UseSD->isReady()))
21724 WorkList.push_back(UseSD);
21725 }
21726
21727 SmallPtrSet<const Instruction *, 4> Visited;
21728 auto MakeControlDependent = [&](Instruction *I) {
21729 // Do not mark control dependent twice.
21730 if (!Visited.insert(I).second)
21731 return;
21732 auto *DepDest = getScheduleData(I);
21733 assert(DepDest && "must be in schedule window");
21734 DepDest->addControlDependency(BundleMember);
21735 BundleMember->incDependencies();
21736 if (!DepDest->isScheduled())
21737 BundleMember->incrementUnscheduledDeps(1);
21738 if (!DepDest->hasValidDependencies() ||
21739 (InsertInReadyList && DepDest->isReady()))
21740 WorkList.push_back(DepDest);
21741 };
21742
21743 // Any instruction which isn't safe to speculate at the beginning of the
21744 // block is control dependent on any early exit or non-willreturn call
21745 // which precedes it.
21746 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21747 for (Instruction *I = BundleMember->getInst()->getNextNode();
21748 I != ScheduleEnd; I = I->getNextNode()) {
21749 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21750 continue;
21751
21752 // Add the dependency
21753 MakeControlDependent(I);
21754
21755 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21756 // Everything past here must be control dependent on I.
21757 break;
21758 }
21759 }
21760
21761 if (RegionHasStackSave) {
21762 // If we have an inalloca alloca instruction, it needs to be scheduled
21763 // after any preceding stacksave. We also need to prevent any alloca
21764 // from reordering above a preceding stackrestore.
21765 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21766 match(BundleMember->getInst(),
21767 m_Intrinsic<Intrinsic::stackrestore>())) {
21768 for (Instruction *I = BundleMember->getInst()->getNextNode();
21769 I != ScheduleEnd; I = I->getNextNode()) {
21770 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21771 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21772 // Any allocas past here must be control dependent on I, and I
21773 // must be memory dependent on BundleMember->Inst.
21774 break;
21775
21776 if (!isa<AllocaInst>(I))
21777 continue;
21778
21779 // Add the dependency
21780 MakeControlDependent(I);
21781 }
21782 }
21783
21784 // In addition to the cases handled just above, we need to prevent
21785 // allocas and loads/stores from moving below a stacksave or a
21786 // stackrestore. Avoiding moving allocas below stackrestore is currently
21787 // thought to be conservative. Moving loads/stores below a stackrestore
21788 // can lead to incorrect code.
21789 if (isa<AllocaInst>(BundleMember->getInst()) ||
21790 BundleMember->getInst()->mayReadOrWriteMemory()) {
21791 for (Instruction *I = BundleMember->getInst()->getNextNode();
21792 I != ScheduleEnd; I = I->getNextNode()) {
21793 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21794 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21795 continue;
21796
21797 // Add the dependency
21798 MakeControlDependent(I);
21799 break;
21800 }
21801 }
21802 }
21803
21804 // Handle the memory dependencies (if any).
21805 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21806 if (!NextLoadStore)
21807 return;
21808 Instruction *SrcInst = BundleMember->getInst();
21809 assert(SrcInst->mayReadOrWriteMemory() &&
21810 "NextLoadStore list for non memory effecting bundle?");
21811 MemoryLocation SrcLoc = getLocation(SrcInst);
21812 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21813 unsigned NumAliased = 0;
21814 unsigned DistToSrc = 1;
21815 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21816
21817 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21818 DepDest = DepDest->getNextLoadStore()) {
21819 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21820
21821 // We have two limits to reduce the complexity:
21822 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21823 // SLP->isAliased (which is the expensive part in this loop).
21824 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21825 // the whole loop (even if the loop is fast, it's quadratic).
21826 // It's important for the loop break condition (see below) to
21827 // check this limit even between two read-only instructions.
21828 if (DistToSrc >= MaxMemDepDistance ||
21829 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21830 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21831 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21832
21833 // We increment the counter only if the locations are aliased
21834 // (instead of counting all alias checks). This gives a better
21835 // balance between reduced runtime and accurate dependencies.
21836 NumAliased++;
21837
21838 DepDest->addMemoryDependency(BundleMember);
21839 BundleMember->incDependencies();
21840 if (!DepDest->isScheduled())
21841 BundleMember->incrementUnscheduledDeps(1);
21842 if (!DepDest->hasValidDependencies() ||
21843 (InsertInReadyList && DepDest->isReady()))
21844 WorkList.push_back(DepDest);
21845 }
21846
21847 // Example, explaining the loop break condition: Let's assume our
21848 // starting instruction is i0 and MaxMemDepDistance = 3.
21849 //
21850 // +--------v--v--v
21851 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21852 // +--------^--^--^
21853 //
21854 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21855 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21856 // Previously we already added dependencies from i3 to i6,i7,i8
21857 // (because of MaxMemDepDistance). As we added a dependency from
21858 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21859 // and we can abort this loop at i6.
21860 if (DistToSrc >= 2 * MaxMemDepDistance)
21861 break;
21862 DistToSrc++;
21863 }
21864 };
21865
21866 assert((Bundle || !ControlDeps.empty()) &&
21867 "expected at least one instruction to schedule");
21868 if (Bundle)
21869 WorkList.push_back(Bundle.getBundle().front());
21870 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21871 SmallPtrSet<ScheduleBundle *, 16> Visited;
21872 while (!WorkList.empty()) {
21873 ScheduleEntity *SD = WorkList.pop_back_val();
21874 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21875 ArrayRef<ScheduleBundle *> Bundles;
21876 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21877 CopyableBundle.push_back(&CD->getBundle());
21878 Bundles = CopyableBundle;
21879 } else {
21880 Bundles = getScheduleBundles(SD->getInst());
21881 }
21882 if (Bundles.empty()) {
21883 if (!SD->hasValidDependencies())
21884 ProcessNode(SD);
21885 if (InsertInReadyList && SD->isReady()) {
21886 ReadyInsts.insert(SD);
21887 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21888 }
21889 continue;
21890 }
21891 for (ScheduleBundle *Bundle : Bundles) {
21892 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21893 continue;
21894 assert(isInSchedulingRegion(*Bundle) &&
21895 "ScheduleData not in scheduling region");
21896 for_each(Bundle->getBundle(), ProcessNode);
21897 }
21898 if (InsertInReadyList && SD->isReady()) {
21899 for (ScheduleBundle *Bundle : Bundles) {
21900 assert(isInSchedulingRegion(*Bundle) &&
21901 "ScheduleData not in scheduling region");
21902 if (!Bundle->isReady())
21903 continue;
21904 ReadyInsts.insert(Bundle);
21905 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21906 << "\n");
21907 }
21908 }
21909 }
21910}
21911
21912void BoUpSLP::BlockScheduling::resetSchedule() {
21913 assert(ScheduleStart &&
21914 "tried to reset schedule on block which has not been scheduled");
21915 for_each(ScheduleDataMap, [&](auto &P) {
21916 if (BB != P.first->getParent())
21917 return;
21918 ScheduleData *SD = P.second;
21919 if (isInSchedulingRegion(*SD)) {
21920 SD->setScheduled(/*Scheduled=*/false);
21921 SD->resetUnscheduledDeps();
21922 }
21923 });
21924 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21925 for_each(P.second, [&](ScheduleCopyableData *SD) {
21926 if (isInSchedulingRegion(*SD)) {
21927 SD->setScheduled(/*Scheduled=*/false);
21928 SD->resetUnscheduledDeps();
21929 }
21930 });
21931 });
21932 for_each(ScheduledBundles, [&](auto &P) {
21933 for_each(P.second, [&](ScheduleBundle *Bundle) {
21934 if (isInSchedulingRegion(*Bundle))
21935 Bundle->setScheduled(/*Scheduled=*/false);
21936 });
21937 });
21938 // Reset schedule data for copyable elements.
21939 for (auto &P : ScheduleCopyableDataMap) {
21940 if (isInSchedulingRegion(*P.second)) {
21941 P.second->setScheduled(/*Scheduled=*/false);
21942 P.second->resetUnscheduledDeps();
21943 }
21944 }
21945 ReadyInsts.clear();
21946}
21947
21948void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21949 if (!BS->ScheduleStart)
21950 return;
21951
21952 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21953
21954 // A key point - if we got here, pre-scheduling was able to find a valid
21955 // scheduling of the sub-graph of the scheduling window which consists
21956 // of all vector bundles and their transitive users. As such, we do not
21957 // need to reschedule anything *outside of* that subgraph.
21958
21959 BS->resetSchedule();
21960
21961 // For the real scheduling we use a more sophisticated ready-list: it is
21962 // sorted by the original instruction location. This lets the final schedule
21963 // be as close as possible to the original instruction order.
21964 // WARNING: If changing this order causes a correctness issue, that means
21965 // there is some missing dependence edge in the schedule data graph.
21966 struct ScheduleDataCompare {
21967 bool operator()(const ScheduleEntity *SD1,
21968 const ScheduleEntity *SD2) const {
21969 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21970 }
21971 };
21972 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
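// A minimal note on the ordering (based on how priorities are assigned below):
// entities are numbered top-down in original order, the comparator sorts by
// descending priority, and instructions are re-placed bottom-up just before
// LastScheduledInst, so picking ReadyInsts.begin() keeps the final schedule
// close to the original instruction order.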
21973
21974 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21975 // and fill the ready-list with initial instructions.
21976 int Idx = 0;
21977 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21978 I = I->getNextNode()) {
21979 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21980 if (!Bundles.empty()) {
21981 for (ScheduleBundle *Bundle : Bundles) {
21982 Bundle->setSchedulingPriority(Idx++);
21983 if (!Bundle->hasValidDependencies())
21984 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21985 }
21986 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21987 for (ScheduleCopyableData *SD : reverse(SDs)) {
21988 ScheduleBundle &Bundle = SD->getBundle();
21989 Bundle.setSchedulingPriority(Idx++);
21990 if (!Bundle.hasValidDependencies())
21991 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21992 }
21993 continue;
21994 }
21995 SmallVector<ScheduleCopyableData *> CopyableData =
21996 BS->getScheduleCopyableDataUsers(I);
21997 if (ScheduleData *SD = BS->getScheduleData(I)) {
21998 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21999 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
22000 SDTEs.front()->doesNotNeedToSchedule() ||
22002 "scheduler and vectorizer bundle mismatch");
22003 SD->setSchedulingPriority(Idx++);
22004 if (!SD->hasValidDependencies() &&
22005 (!CopyableData.empty() ||
22006 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
22007 assert(TE->isGather() && "expected gather node");
22008 return TE->hasState() && TE->hasCopyableElements() &&
22009 TE->isCopyableElement(I);
22010 }))) {
22011 // Need to calculate deps for these nodes to correctly handle copyable
22012 // dependencies, even if they were cancelled.
22013 // If the copyables bundle was cancelled, the deps were cleared and we need
22014 // to recalculate them.
22015 ScheduleBundle Bundle;
22016 Bundle.add(SD);
22017 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22018 }
22019 }
22020 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
22021 ScheduleBundle &Bundle = SD->getBundle();
22022 Bundle.setSchedulingPriority(Idx++);
22023 if (!Bundle.hasValidDependencies())
22024 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22025 }
22026 }
22027 BS->initialFillReadyList(ReadyInsts);
22028
22029 Instruction *LastScheduledInst = BS->ScheduleEnd;
22030
22031 // Do the "real" scheduling.
22032 SmallPtrSet<Instruction *, 16> Scheduled;
22033 while (!ReadyInsts.empty()) {
22034 auto *Picked = *ReadyInsts.begin();
22035 ReadyInsts.erase(ReadyInsts.begin());
22036
22037 // Move the scheduled instruction(s) to their dedicated places, if not
22038 // there yet.
22039 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
22040 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
22041 Instruction *PickedInst = BundleMember->getInst();
22042 // If a copyable must be scheduled as part of something else, skip it.
22043 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
22044 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
22045 (!IsCopyable && !Scheduled.insert(PickedInst).second))
22046 continue;
22047 if (PickedInst->getNextNode() != LastScheduledInst)
22048 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22049 LastScheduledInst = PickedInst;
22050 }
22051 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
22052 LastScheduledInst);
22053 } else {
22054 auto *SD = cast<ScheduleData>(Picked);
22055 Instruction *PickedInst = SD->getInst();
22056 if (PickedInst->getNextNode() != LastScheduledInst)
22057 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22058 LastScheduledInst = PickedInst;
22059 }
22060 auto Invalid = InstructionsState::invalid();
22061 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
22062 }
22063
22064 // Check that we didn't break any of our invariants.
22065#ifdef EXPENSIVE_CHECKS
22066 BS->verify();
22067#endif
22068
22069#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
22070 // Check that all schedulable entities got scheduled
22071 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22072 I = I->getNextNode()) {
22073 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22074 assert(all_of(Bundles,
22075 [](const ScheduleBundle *Bundle) {
22076 return Bundle->isScheduled();
22077 }) &&
22078 "must be scheduled at this point");
22079 }
22080#endif
22081
22082 // Avoid duplicate scheduling of the block.
22083 BS->ScheduleStart = nullptr;
22084}
22085
22086 unsigned BoUpSLP::getVectorElementSize(Value *V) {
22087 // If V is a store, just return the width of the stored value (or value
22088 // truncated just before storing) without traversing the expression tree.
22089 // This is the common case.
22090 if (auto *Store = dyn_cast<StoreInst>(V))
22091 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
22092
22093 if (auto *IEI = dyn_cast<InsertElementInst>(V))
22094 return getVectorElementSize(IEI->getOperand(1));
22095
22096 auto E = InstrElementSize.find(V);
22097 if (E != InstrElementSize.end())
22098 return E->second;
22099
22100 // If V is not a store, we can traverse the expression tree to find loads
22101 // that feed it. The type of the loaded value may indicate a more suitable
22102 // width than V's type. We want to base the vector element size on the width
22103 // of memory operations where possible.
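// For example (hypothetical IR), given
//   %l = load i16, ptr %p
//   %e = sext i16 %l to i64
// querying the size for %e walks back through the cast to the load and yields
// 16 bits rather than 64, while a store of %e would be sized directly from the
// stored value's type by the early exit above.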
22104 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
22105 SmallPtrSet<Instruction *, 16> Visited;
22106 if (auto *I = dyn_cast<Instruction>(V)) {
22107 Worklist.emplace_back(I, I->getParent(), 0);
22108 Visited.insert(I);
22109 }
22110
22111 // Traverse the expression tree in bottom-up order looking for loads. If we
22112 // encounter an instruction we don't yet handle, we give up.
22113 auto Width = 0u;
22114 Value *FirstNonBool = nullptr;
22115 while (!Worklist.empty()) {
22116 auto [I, Parent, Level] = Worklist.pop_back_val();
22117
22118 // We should only be looking at scalar instructions here. If the current
22119 // instruction has a vector type, skip.
22120 auto *Ty = I->getType();
22121 if (isa<VectorType>(Ty))
22122 continue;
22123 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
22124 FirstNonBool = I;
22125 if (Level > RecursionMaxDepth)
22126 continue;
22127
22128 // If the current instruction is a load, update Width to reflect the
22129 // width of the loaded value.
22130 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
22131 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
22132
22133 // Otherwise, we need to visit the operands of the instruction. We only
22134 // handle the interesting cases from buildTree here. If an operand is an
22135 // instruction we haven't yet visited and from the same basic block as the
22136 // user or the use is a PHI node, we add it to the worklist.
22137 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
22138 BinaryOperator, UnaryOperator>(I)) {
22139 for (Use &U : I->operands()) {
22140 if (auto *J = dyn_cast<Instruction>(U.get()))
22141 if (Visited.insert(J).second &&
22142 (isa<PHINode>(I) || J->getParent() == Parent)) {
22143 Worklist.emplace_back(J, J->getParent(), Level + 1);
22144 continue;
22145 }
22146 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
22147 FirstNonBool = U.get();
22148 }
22149 } else {
22150 break;
22151 }
22152 }
22153
22154 // If we didn't encounter a memory access in the expression tree, or if we
22155 // gave up for some reason, just return the width of V. Otherwise, return the
22156 // maximum width we found.
22157 if (!Width) {
22158 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
22159 V = FirstNonBool;
22160 Width = DL->getTypeSizeInBits(V->getType());
22161 }
22162
22163 for (Instruction *I : Visited)
22164 InstrElementSize[I] = Width;
22165
22166 return Width;
22167}
22168
22169bool BoUpSLP::collectValuesToDemote(
22170 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
22171 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
22172 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
22173 bool &IsProfitableToDemote, bool IsTruncRoot) const {
22174 // We can always demote constants.
22175 if (all_of(E.Scalars, IsaPred<Constant>))
22176 return true;
22177
22178 unsigned OrigBitWidth =
22179 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
22180 if (OrigBitWidth == BitWidth) {
22181 MaxDepthLevel = 1;
22182 return true;
22183 }
22184
22185 // Check if the node was analyzed already and must keep its original bitwidth.
22186 if (NodesToKeepBWs.contains(E.Idx))
22187 return false;
22188
22189 // If the value is not a vectorized instruction in the expression, is not used
22190 // by an insertelement instruction, and is not used in multiple vector nodes,
22191 // it cannot be demoted.
22192 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
22193 if (isa<PoisonValue>(R))
22194 return false;
22195 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22196 });
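 // Checks whether V can be represented in BitWidth bits; if a wider width is
 // required, BitWidth is raised to the minimum safe width. Returns false when
 // no worthwhile narrowing is possible.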
22197 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
22198 if (isa<PoisonValue>(V))
22199 return true;
22200 if (getTreeEntries(V).size() > 1)
22201 return false;
22202 // For the last shuffle of sext/zext with many uses, we need to check the
22203 // extra bit for unsigned values; otherwise we may generate incorrect casts
22204 // for reused scalars.
22205 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
22206 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22207 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22208 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22209 return true;
22210 }
22211 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22212 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22213 if (IsSignedNode)
22214 ++BitWidth1;
22215 if (auto *I = dyn_cast<Instruction>(V)) {
22216 APInt Mask = DB->getDemandedBits(I);
22217 unsigned BitWidth2 =
22218 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22219 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22220 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22221 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22222 break;
22223 BitWidth2 *= 2;
22224 }
22225 BitWidth1 = std::min(BitWidth1, BitWidth2);
22226 }
22227 BitWidth = std::max(BitWidth, BitWidth1);
22228 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22229 };
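 // Last-chance check when the regular analysis stops: all scalars of the entry
 // must still be potentially truncatable, with extra handling that also allows
 // demoting matching gather (buildvector) nodes.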
22230 auto FinalAnalysis = [&, TTI = TTI]() {
22231 if (!IsProfitableToDemote)
22232 return false;
22233 bool Res = all_of(
22234 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22235 // Demote gathers.
22236 if (Res && E.isGather()) {
22237 if (E.hasState()) {
22238 if (const TreeEntry *SameTE =
22239 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22240 SameTE)
22241 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22242 ToDemote, Visited, NodesToKeepBWs,
22243 MaxDepthLevel, IsProfitableToDemote,
22244 IsTruncRoot)) {
22245 ToDemote.push_back(E.Idx);
22246 return true;
22247 }
22248 }
22249 // Check possible extractelement instructions bases and final vector
22250 // length.
22251 SmallPtrSet<Value *, 4> UniqueBases;
22252 for (Value *V : E.Scalars) {
22253 auto *EE = dyn_cast<ExtractElementInst>(V);
22254 if (!EE)
22255 continue;
22256 UniqueBases.insert(EE->getVectorOperand());
22257 }
22258 const unsigned VF = E.Scalars.size();
22259 Type *OrigScalarTy = E.Scalars.front()->getType();
22260 if (UniqueBases.size() <= 2 ||
22261 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22262 ::getNumberOfParts(
22263 *TTI,
22264 getWidenedType(
22265 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22266 VF))) {
22267 ToDemote.push_back(E.Idx);
22268 return true;
22269 }
22270 }
22271 return Res;
22272 };
22273 if (E.isGather() || !Visited.insert(&E).second ||
22274 any_of(E.Scalars, [&](Value *V) {
22275 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22276 return isa<InsertElementInst>(U) && !isVectorized(U);
22277 });
22278 }))
22279 return FinalAnalysis();
22280
22281 if (any_of(E.Scalars, [&](Value *V) {
22282 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22283 return isVectorized(U) ||
22284 (E.Idx == 0 && UserIgnoreList &&
22285 UserIgnoreList->contains(U)) ||
22286 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22287 !U->getType()->isScalableTy() &&
22288 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22289 }) && !IsPotentiallyTruncated(V, BitWidth);
22290 }))
22291 return false;
22292
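 // Recurses into the given operand tree entries and accumulates the maximum
 // depth reached. Sets NeedToExit if a child failed but the current node may
 // still be kept after FinalAnalysis.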
22293 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22294 bool &NeedToExit) {
22295 NeedToExit = false;
22296 unsigned InitLevel = MaxDepthLevel;
22297 for (const TreeEntry *Op : Operands) {
22298 unsigned Level = InitLevel;
22299 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22300 ToDemote, Visited, NodesToKeepBWs, Level,
22301 IsProfitableToDemote, IsTruncRoot)) {
22302 if (!IsProfitableToDemote)
22303 return false;
22304 NeedToExit = true;
22305 if (!FinalAnalysis())
22306 return false;
22307 continue;
22308 }
22309 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22310 }
22311 return true;
22312 };
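 // Tries successively larger bit widths (doubling each step) until Checker
 // accepts one; if none is accepted, falls back to the best width for which
 // FinalAnalysis succeeded, if any.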
22313 auto AttemptCheckBitwidth =
22314 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22315 // Try all bitwidth < OrigBitWidth.
22316 NeedToExit = false;
22317 unsigned BestFailBitwidth = 0;
22318 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22319 if (Checker(BitWidth, OrigBitWidth))
22320 return true;
22321 if (BestFailBitwidth == 0 && FinalAnalysis())
22322 BestFailBitwidth = BitWidth;
22323 }
22324 if (BitWidth >= OrigBitWidth) {
22325 if (BestFailBitwidth == 0) {
22326 BitWidth = OrigBitWidth;
22327 return false;
22328 }
22329 MaxDepthLevel = 1;
22330 BitWidth = BestFailBitwidth;
22331 NeedToExit = true;
22332 return true;
22333 }
22334 return false;
22335 };
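 // Common handler for a single tree entry: makes sure multi-use scalars are
 // still truncatable, runs the optional opcode-specific Checker, recurses into
 // the given operand entries, and finally records the entry in ToDemote.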
22336 auto TryProcessInstruction =
22337 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22338 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22339 if (Operands.empty()) {
22340 if (!IsTruncRoot)
22341 MaxDepthLevel = 1;
22342 for (Value *V : E.Scalars)
22343 (void)IsPotentiallyTruncated(V, BitWidth);
22344 } else {
22345 // Several vectorized uses? Check if we can truncate it, otherwise -
22346 // exit.
22347 if (any_of(E.Scalars, [&](Value *V) {
22348 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22349 }))
22350 return false;
22351 bool NeedToExit = false;
22352 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22353 return false;
22354 if (NeedToExit)
22355 return true;
22356 if (!ProcessOperands(Operands, NeedToExit))
22357 return false;
22358 if (NeedToExit)
22359 return true;
22360 }
22361
22362 ++MaxDepthLevel;
22363 // Record the entry that we can demote.
22364 ToDemote.push_back(E.Idx);
22365 return IsProfitableToDemote;
22366 };
22367
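 // Split-vectorize nodes recurse into both of their combined entries; all
 // other nodes are dispatched on their (alternate) opcode below.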
22368 if (E.State == TreeEntry::SplitVectorize)
22369 return TryProcessInstruction(
22370 BitWidth,
22371 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22372 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22373
22374 if (E.isAltShuffle()) {
22375 // Combining these opcodes may lead to incorrect analysis, skip for now.
22376 auto IsDangerousOpcode = [](unsigned Opcode) {
22377 switch (Opcode) {
22378 case Instruction::Shl:
22379 case Instruction::AShr:
22380 case Instruction::LShr:
22381 case Instruction::UDiv:
22382 case Instruction::SDiv:
22383 case Instruction::URem:
22384 case Instruction::SRem:
22385 return true;
22386 default:
22387 break;
22388 }
22389 return false;
22390 };
22391 if (IsDangerousOpcode(E.getAltOpcode()))
22392 return FinalAnalysis();
22393 }
22394
22395 switch (E.getOpcode()) {
22396
22397 // We can always demote truncations and extensions. Since truncations can
22398 // seed additional demotion, we save the truncated value.
22399 case Instruction::Trunc:
22400 if (IsProfitableToDemoteRoot)
22401 IsProfitableToDemote = true;
22402 return TryProcessInstruction(BitWidth);
22403 case Instruction::ZExt:
22404 case Instruction::SExt:
22405 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22406 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22407 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22408 return false;
22409 IsProfitableToDemote = true;
22410 return TryProcessInstruction(BitWidth);
22411
22412 // We can demote certain binary operations if we can demote both of their
22413 // operands.
22414 case Instruction::Add:
22415 case Instruction::Sub:
22416 case Instruction::Mul:
22417 case Instruction::And:
22418 case Instruction::Or:
22419 case Instruction::Xor: {
22420 return TryProcessInstruction(
22421 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22422 }
22423 case Instruction::Freeze:
22424 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22425 case Instruction::Shl: {
22426 // If we are truncating the result of this SHL, and if it's a shift of an
22427 // inrange amount, we can always perform a SHL in a smaller type.
22428 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22429 return all_of(E.Scalars, [&](Value *V) {
22430 if (isa<PoisonValue>(V))
22431 return true;
22432 if (E.isCopyableElement(V))
22433 return true;
22434 auto *I = cast<Instruction>(V);
22435 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22436 return AmtKnownBits.getMaxValue().ult(BitWidth);
22437 });
22438 };
22439 return TryProcessInstruction(
22440 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22441 }
22442 case Instruction::LShr: {
22443 // If this is a truncate of a logical shr, we can truncate it to a smaller
22444 // lshr iff we know that the bits we would otherwise be shifting in are
22445 // already zeros.
22446 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22447 return all_of(E.Scalars, [&](Value *V) {
22448 if (isa<PoisonValue>(V))
22449 return true;
22450 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22451 if (E.isCopyableElement(V))
22452 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22453 auto *I = cast<Instruction>(V);
22454 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22455 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22456 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22457 SimplifyQuery(*DL));
22458 });
22459 };
22460 return TryProcessInstruction(
22461 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22462 LShrChecker);
22463 }
22464 case Instruction::AShr: {
22465 // If this is a truncate of an arithmetic shr, we can truncate it to a
22466 // smaller ashr iff we know that all the bits from the sign bit of the
22467 // original type and the sign bit of the truncate type are similar.
22468 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22469 return all_of(E.Scalars, [&](Value *V) {
22470 if (isa<PoisonValue>(V))
22471 return true;
22472 auto *I = cast<Instruction>(V);
22473 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22474 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22475 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22476 ShiftedBits <
22477 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22478 });
22479 };
22480 return TryProcessInstruction(
22481 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22482 AShrChecker);
22483 }
22484 case Instruction::UDiv:
22485 case Instruction::URem: {
22486 // UDiv and URem can be truncated if all the truncated bits are zero.
22487 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22488 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22489 return all_of(E.Scalars, [&](Value *V) {
22490 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22491 if (E.hasCopyableElements() && E.isCopyableElement(V))
22492 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22493 auto *I = cast<Instruction>(V);
22494 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22495 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22496 });
22497 };
22498 return TryProcessInstruction(
22499 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22500 }
22501
22502 // We can demote selects if we can demote their true and false values.
22503 case Instruction::Select: {
22504 return TryProcessInstruction(
22505 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22506 }
22507
22508 // We can demote phis if we can demote all their incoming operands.
22509 case Instruction::PHI: {
22510 const unsigned NumOps = E.getNumOperands();
22511 SmallVector<const TreeEntry *> Ops(NumOps);
22512 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22513 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22514
22515 return TryProcessInstruction(BitWidth, Ops);
22516 }
22517
22518 case Instruction::Call: {
22519 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22520 if (!IC)
22521 break;
22522 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22523 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22524 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22525 break;
22526 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22527 function_ref<bool(unsigned, unsigned)> CallChecker;
22528 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22529 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22530 return all_of(E.Scalars, [&](Value *V) {
22531 auto *I = cast<Instruction>(V);
22532 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22533 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22534 return MaskedValueIsZero(I->getOperand(0), Mask,
22535 SimplifyQuery(*DL)) &&
22536 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22537 }
22538 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22539 "Expected min/max intrinsics only.");
22540 unsigned SignBits = OrigBitWidth - BitWidth;
22541 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22542 unsigned Op0SignBits =
22543 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22544 unsigned Op1SignBits =
22545 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22546 return SignBits <= Op0SignBits &&
22547 ((SignBits != Op0SignBits &&
22548 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22549 MaskedValueIsZero(I->getOperand(0), Mask,
22550 SimplifyQuery(*DL))) &&
22551 SignBits <= Op1SignBits &&
22552 ((SignBits != Op1SignBits &&
22553 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22554 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22555 });
22556 };
22557 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22558 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22559 return all_of(E.Scalars, [&](Value *V) {
22560 auto *I = cast<Instruction>(V);
22561 unsigned SignBits = OrigBitWidth - BitWidth;
22562 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22563 unsigned Op0SignBits =
22564 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22565 return SignBits <= Op0SignBits &&
22566 ((SignBits != Op0SignBits &&
22567 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22568 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22569 });
22570 };
22571 if (ID != Intrinsic::abs) {
22572 Operands.push_back(getOperandEntry(&E, 1));
22573 CallChecker = CompChecker;
22574 } else {
22575 CallChecker = AbsChecker;
22576 }
22577 InstructionCost BestCost =
22578 std::numeric_limits<InstructionCost::CostType>::max();
22579 unsigned BestBitWidth = BitWidth;
22580 unsigned VF = E.Scalars.size();
22581 // Choose the best bitwidth based on cost estimations.
22582 auto Checker = [&](unsigned BitWidth, unsigned) {
22583 unsigned MinBW = PowerOf2Ceil(BitWidth);
22584 SmallVector<Type *> ArgTys =
22585 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22586 auto VecCallCosts = getVectorCallCosts(
22587 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22588 TTI, TLI, ArgTys);
22589 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22590 if (Cost < BestCost) {
22591 BestCost = Cost;
22592 BestBitWidth = BitWidth;
22593 }
22594 return false;
22595 };
22596 [[maybe_unused]] bool NeedToExit;
22597 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22598 BitWidth = BestBitWidth;
22599 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22600 }
22601
22602 // Otherwise, conservatively give up.
22603 default:
22604 break;
22605 }
22606 MaxDepthLevel = 1;
22607 return FinalAnalysis();
22608}
22609
22610static RecurKind getRdxKind(Value *V);
22611
22612void BoUpSLP::computeMinimumValueSizes() {
22613 // We only attempt to truncate integer expressions.
22614 bool IsStoreOrInsertElt =
22615 VectorizableTree.front()->hasState() &&
22616 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22617 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22618 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22619 ExtraBitWidthNodes.size() <= 1 &&
22620 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22621 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22622 return;
22623
22624 unsigned NodeIdx = 0;
22625 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22626 NodeIdx = 1;
22627
22628 // Ensure the roots of the vectorizable tree don't form a cycle.
22629 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22630 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22631 "Unexpected tree is graph.");
22632
22633 // If the first value node for a store/insertelement is sext/zext/trunc, skip
22634 // it and resize to the final type.
22635 bool IsTruncRoot = false;
22636 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22637 SmallVector<unsigned> RootDemotes;
22638 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22639 if (NodeIdx != 0 &&
22640 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22641 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22642 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22643 IsTruncRoot = true;
22644 RootDemotes.push_back(NodeIdx);
22645 IsProfitableToDemoteRoot = true;
22646 ++NodeIdx;
22647 }
22648
22649 // The reduction was analyzed already and found not profitable - exit.
22650 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22651 return;
22652
22653 SmallVector<unsigned> ToDemote;
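 // Computes the maximum bit width required to represent the values of the
 // given tree entry without loss of precision, or 0 if the entry cannot be
 // narrowed. Fills ToDemote with the indices of the demotable entries.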
22654 auto ComputeMaxBitWidth =
22655 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22656 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22657 ToDemote.clear();
22658 // If the root is a trunc and the next node is a gather/buildvector, keep the
22659 // trunc in scalars, which is free in most cases.
22660 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22661 !NodesToKeepBWs.contains(E.Idx) &&
22662 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22663 all_of(E.Scalars, [&](Value *V) {
22664 return V->hasOneUse() || isa<Constant>(V) ||
22665 (!V->hasNUsesOrMore(UsesLimit) &&
22666 none_of(V->users(), [&](User *U) {
22667 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22668 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22669 if (TEs.empty() || is_contained(TEs, UserTE))
22670 return false;
22671 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22672 SelectInst>(U) ||
22673 isa<SIToFPInst, UIToFPInst>(U) ||
22674 (UserTE->hasState() &&
22675 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22676 SelectInst>(UserTE->getMainOp()) ||
22677 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22678 return true;
22679 unsigned UserTESz = DL->getTypeSizeInBits(
22680 UserTE->Scalars.front()->getType());
22681 if (all_of(TEs, [&](const TreeEntry *TE) {
22682 auto It = MinBWs.find(TE);
22683 return It != MinBWs.end() &&
22684 It->second.first > UserTESz;
22685 }))
22686 return true;
22687 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22688 }));
22689 })) {
22690 ToDemote.push_back(E.Idx);
22691 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22692 auto It = MinBWs.find(UserTE);
22693 if (It != MinBWs.end())
22694 return It->second.first;
22695 unsigned MaxBitWidth =
22696 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22697 MaxBitWidth = bit_ceil(MaxBitWidth);
22698 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22699 MaxBitWidth = 8;
22700 return MaxBitWidth;
22701 }
22702
22703 if (!E.hasState())
22704 return 0u;
22705
22706 unsigned VF = E.getVectorFactor();
22707 Type *ScalarTy = E.Scalars.front()->getType();
22708 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22709 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22710 if (!TreeRootIT)
22711 return 0u;
22712
22713 if (any_of(E.Scalars,
22714 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22715 return 0u;
22716
22717 unsigned NumParts = ::getNumberOfParts(
22718 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22719
22720 // The maximum bit width required to represent all the values that can be
22721 // demoted without loss of precision. It would be safe to truncate the roots
22722 // of the expression to this width.
22723 unsigned MaxBitWidth = 1u;
22724
22725 // True if the roots can be zero-extended back to their original type,
22726 // rather than sign-extended. We know that if the leading bits are not
22727 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22728 // True.
22729 // Determine if the sign bit of all the roots is known to be zero. If not,
22730 // IsKnownPositive is set to False.
22731 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22732 if (isa<PoisonValue>(R))
22733 return true;
22734 KnownBits Known = computeKnownBits(R, *DL);
22735 return Known.isNonNegative();
22736 });
22737
22738 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22739 E.UserTreeIndex.UserTE->hasState() &&
22740 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22741 MaxBitWidth =
22742 std::min(DL->getTypeSizeInBits(
22743 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22744 DL->getTypeSizeInBits(ScalarTy));
22745
22746 // We first check if all the bits of the roots are demanded. If they're not,
22747 // we can truncate the roots to this narrower type.
22748 for (Value *Root : E.Scalars) {
22749 if (isa<PoisonValue>(Root))
22750 continue;
22751 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22752 TypeSize NumTypeBits =
22753 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22754 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22755 // If we can't prove that the sign bit is zero, we must add one to the
22756 // maximum bit width to account for the unknown sign bit. This preserves
22757 // the existing sign bit so we can safely sign-extend the root back to the
22758 // original type. Otherwise, if we know the sign bit is zero, we will
22759 // zero-extend the root instead.
22760 //
22761 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22762 // one to the maximum bit width will yield a larger-than-necessary
22763 // type. In general, we need to add an extra bit only if we can't
22764 // prove that the upper bit of the original type is equal to the
22765 // upper bit of the proposed smaller type. If these two bits are
22766 // the same (either zero or one) we know that sign-extending from
22767 // the smaller type will result in the same value. Here, since we
22768 // can't yet prove this, we are just making the proposed smaller
22769 // type larger to ensure correctness.
22770 if (!IsKnownPositive)
22771 ++BitWidth1;
22772
22773 auto *I = dyn_cast<Instruction>(Root);
22774 if (!I) {
22775 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22776 continue;
22777 }
22778 APInt Mask = DB->getDemandedBits(I);
22779 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22780 MaxBitWidth =
22781 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22782 }
22783
22784 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22785 MaxBitWidth = 8;
22786
22787 // If the original type is large but the reduced type does not improve
22788 // register usage, ignore it.
22789 if (NumParts > 1 &&
22790 NumParts ==
22791 ::getNumberOfParts(
22792 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22793 bit_ceil(MaxBitWidth)),
22794 VF)))
22795 return 0u;
22796
22797 unsigned Opcode = E.getOpcode();
22798 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22799 Opcode == Instruction::SExt ||
22800 Opcode == Instruction::ZExt || NumParts > 1;
22801 // Conservatively determine if we can actually truncate the roots of the
22802 // expression. Collect the values that can be demoted in ToDemote and
22803 // additional roots that require investigating in Roots.
22804 DenseSet<const TreeEntry *> Visited;
22805 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22806 bool NeedToDemote = IsProfitableToDemote;
22807
22808 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22809 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22810 NeedToDemote, IsTruncRoot) ||
22811 (MaxDepthLevel <= Limit &&
22812 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22813 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22814 DL->getTypeSizeInBits(TreeRootIT) /
22815 DL->getTypeSizeInBits(
22816 E.getMainOp()->getOperand(0)->getType()) >
22817 2)))))
22818 return 0u;
22819 // Round MaxBitWidth up to the next power-of-two.
22820 MaxBitWidth = bit_ceil(MaxBitWidth);
22821
22822 return MaxBitWidth;
22823 };
22824
22825 // If we can truncate the root, we must collect additional values that might
22826 // be demoted as a result. That is, those seeded by truncations we will
22827 // modify.
22828 // Add reduction ops sizes, if any.
22829 if (UserIgnoreList &&
22830 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22831 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22832 // x i1> to iN)).
22833 if (all_of(*UserIgnoreList,
22834 [](Value *V) {
22835 return isa<PoisonValue>(V) ||
22836 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22837 }) &&
22838 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22839 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22840 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22841 Builder.getInt1Ty()) {
22842 ReductionBitWidth = 1;
22843 } else {
22844 for (Value *V : *UserIgnoreList) {
22845 if (isa<PoisonValue>(V))
22846 continue;
22847 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22848 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22849 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22850 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22851 ++BitWidth1;
22852 unsigned BitWidth2 = BitWidth1;
22854 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22855 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22856 }
22857 ReductionBitWidth =
22858 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22859 }
22860 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22861 ReductionBitWidth = 8;
22862
22863 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22864 }
22865 }
22866 bool IsTopRoot = NodeIdx == 0;
22867 while (NodeIdx < VectorizableTree.size() &&
22868 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22869 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22870 RootDemotes.push_back(NodeIdx);
22871 ++NodeIdx;
22872 IsTruncRoot = true;
22873 }
22874 bool IsSignedCmp = false;
22875 if (UserIgnoreList &&
22876 all_of(*UserIgnoreList,
22878 m_SMax(m_Value(), m_Value())))))
22879 IsSignedCmp = true;
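 // Process the remaining candidate root nodes one by one: compute the maximal
 // safe bit width for each subtree and record the demotable entries in MinBWs.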
22880 while (NodeIdx < VectorizableTree.size()) {
22881 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22882 unsigned Limit = 2;
22883 if (IsTopRoot &&
22884 ReductionBitWidth ==
22885 DL->getTypeSizeInBits(
22886 VectorizableTree.front()->Scalars.front()->getType()))
22887 Limit = 3;
22888 unsigned MaxBitWidth = ComputeMaxBitWidth(
22889 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22890 IsTruncRoot, IsSignedCmp);
22891 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22892 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22893 ReductionBitWidth = bit_ceil(MaxBitWidth);
22894 else if (MaxBitWidth == 0)
22895 ReductionBitWidth = 0;
22896 }
22897
22898 for (unsigned Idx : RootDemotes) {
22899 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22900 uint32_t OrigBitWidth =
22901 DL->getTypeSizeInBits(V->getType()->getScalarType());
22902 if (OrigBitWidth > MaxBitWidth) {
22903 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22904 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22905 }
22906 return false;
22907 }))
22908 ToDemote.push_back(Idx);
22909 }
22910 RootDemotes.clear();
22911 IsTopRoot = false;
22912 IsProfitableToDemoteRoot = true;
22913
22914 if (ExtraBitWidthNodes.empty()) {
22915 NodeIdx = VectorizableTree.size();
22916 } else {
22917 unsigned NewIdx = 0;
22918 do {
22919 NewIdx = *ExtraBitWidthNodes.begin();
22920 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22921 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22922 NodeIdx = NewIdx;
22923 IsTruncRoot =
22924 NodeIdx < VectorizableTree.size() &&
22925 VectorizableTree[NodeIdx]->UserTreeIndex &&
22926 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22927 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22928 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22929 Instruction::Trunc &&
22930 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22931 IsSignedCmp =
22932 NodeIdx < VectorizableTree.size() &&
22933 VectorizableTree[NodeIdx]->UserTreeIndex &&
22934 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22935 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22936 Instruction::ICmp &&
22937 any_of(
22938 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22939 [&](Value *V) {
22940 auto *IC = dyn_cast<ICmpInst>(V);
22941 return IC && (IC->isSigned() ||
22942 !isKnownNonNegative(IC->getOperand(0),
22943 SimplifyQuery(*DL)) ||
22944 !isKnownNonNegative(IC->getOperand(1),
22945 SimplifyQuery(*DL)));
22946 });
22947 }
22948
22949 // If the maximum bit width we compute is less than the width of the roots'
22950 // type, we can proceed with the narrowing. Otherwise, do nothing.
22951 if (MaxBitWidth == 0 ||
22952 MaxBitWidth >=
22953 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22954 ->getBitWidth()) {
22955 if (UserIgnoreList)
22956 AnalyzedMinBWVals.insert_range(TreeRoot);
22957 NodesToKeepBWs.insert_range(ToDemote);
22958 continue;
22959 }
22960
22961 // Finally, map the values we can demote to the maximum bit width we
22962 // computed.
22963 for (unsigned Idx : ToDemote) {
22964 TreeEntry *TE = VectorizableTree[Idx].get();
22965 if (MinBWs.contains(TE))
22966 continue;
22967 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22968 if (isa<PoisonValue>(R))
22969 return false;
22970 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22971 });
22972 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22973 }
22974 }
22975}
22976
22977PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22978 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22979 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22980 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22981 auto *AA = &AM.getResult<AAManager>(F);
22982 auto *LI = &AM.getResult<LoopAnalysis>(F);
22983 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22984 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22985 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22986 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22987
22988 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22989 if (!Changed)
22990 return PreservedAnalyses::all();
22991
22992 PreservedAnalyses PA;
22993 PA.preserveSet<CFGAnalyses>();
22994 return PA;
22995}
22996
22997bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22998 TargetTransformInfo *TTI_,
22999 TargetLibraryInfo *TLI_, AAResults *AA_,
23000 LoopInfo *LI_, DominatorTree *DT_,
23001 AssumptionCache *AC_, DemandedBits *DB_,
23002 OptimizationRemarkEmitter *ORE_) {
23003 if (!RunSLPVectorization)
23004 return false;
23005 SE = SE_;
23006 TTI = TTI_;
23007 TLI = TLI_;
23008 AA = AA_;
23009 LI = LI_;
23010 DT = DT_;
23011 AC = AC_;
23012 DB = DB_;
23013 DL = &F.getDataLayout();
23014
23015 Stores.clear();
23016 GEPs.clear();
23017 bool Changed = false;
23018
23019 // If the target claims to have no vector registers don't attempt
23020 // vectorization.
23021 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
23022 LLVM_DEBUG(
23023 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
23024 return false;
23025 }
23026
23027 // Don't vectorize when the attribute NoImplicitFloat is used.
23028 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
23029 return false;
23030
23031 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
23032
23033 // Use the bottom up slp vectorizer to construct chains that start with
23034 // store instructions.
23035 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
23036
23037 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
23038 // delete instructions.
23039
23040 // Update DFS numbers now so that we can use them for ordering.
23041 DT->updateDFSNumbers();
23042
23043 // Scan the blocks in the function in post order.
23044 for (auto *BB : post_order(&F.getEntryBlock())) {
23045 if (!DT->isReachableFromEntry(BB))
23046 continue;
23047
23048 // Start new block - clear the list of reduction roots.
23049 R.clearReductionData();
23050 collectSeedInstructions(BB);
23051
23052 // Vectorize trees that end at stores.
23053 if (!Stores.empty()) {
23054 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
23055 << " underlying objects.\n");
23056 Changed |= vectorizeStoreChains(R);
23057 }
23058
23059 // Vectorize trees that end at reductions.
23060 Changed |= vectorizeChainsInBlock(BB, R);
23061
23062 // Vectorize the index computations of getelementptr instructions. This
23063 // is primarily intended to catch gather-like idioms ending at
23064 // non-consecutive loads.
23065 if (!GEPs.empty()) {
23066 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
23067 << " underlying objects.\n");
23068 Changed |= vectorizeGEPIndices(BB, R);
23069 }
23070 }
23071
23072 if (Changed) {
23073 R.optimizeGatherSequence();
23074 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
23075 }
23076 return Changed;
23077}
23078
23079std::optional<bool>
23080SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
23081 unsigned Idx, unsigned MinVF,
23082 unsigned &Size) {
23083 Size = 0;
23084 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
23085 << "\n");
23086 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23087 unsigned VF = Chain.size();
23088
23089 if (!has_single_bit(Sz) ||
23090 !hasFullVectorsOrPowerOf2(
23091 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
23092 VF) ||
23093 VF < 2 || VF < MinVF) {
23094 // Check if vectorizing with a non-power-of-2 VF should be considered. At
23095 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
23096 // all vector lanes are used.
23097 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
23098 return false;
23099 }
23100
23101 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
23102 << "\n");
23103
23104 SetVector<Value *> ValOps;
23105 for (Value *V : Chain)
23106 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
23107 // Exit if the operands are not same/alternate opcodes or form a non-power-of-2 number of unique values.
23108 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
23109 InstructionsState S = Analysis.buildInstructionsState(
23110 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
23111 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
23112 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
23113 bool IsAllowedSize =
23114 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
23115 ValOps.size()) ||
23116 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
23117 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23118 (!S.getMainOp()->isSafeToRemove() ||
23119 any_of(ValOps.getArrayRef(),
23120 [&](Value *V) {
23121 return !isa<ExtractElementInst>(V) &&
23122 (V->getNumUses() > Chain.size() ||
23123 any_of(V->users(), [&](User *U) {
23124 return !Stores.contains(U);
23125 }));
23126 }))) ||
23127 (ValOps.size() > Chain.size() / 2 && !S)) {
23128 Size = (!IsAllowedSize && S) ? 1 : 2;
23129 return false;
23130 }
23131 }
23132 if (R.isLoadCombineCandidate(Chain))
23133 return true;
23134 R.buildTree(Chain);
23135 // Check if the tree is tiny and the store itself or its value is not vectorized.
23136 if (R.isTreeTinyAndNotFullyVectorizable()) {
23137 if (R.isGathered(Chain.front()) ||
23138 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
23139 return std::nullopt;
23140 Size = R.getCanonicalGraphSize();
23141 return false;
23142 }
23143 if (R.isProfitableToReorder()) {
23144 R.reorderTopToBottom();
23145 R.reorderBottomToTop();
23146 }
23147 R.transformNodes();
23148 R.buildExternalUses();
23149
23150 R.computeMinimumValueSizes();
23151
23152 Size = R.getCanonicalGraphSize();
23153 if (S && S.getOpcode() == Instruction::Load)
23154 Size = 2; // cut off masked gather small trees
23155 InstructionCost Cost = R.getTreeCost();
23156
23157 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
23158 if (Cost < -SLPCostThreshold) {
23159 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
23160
23161 using namespace ore;
23162
23163 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
23164 cast<StoreInst>(Chain[0]))
23165 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
23166 << " and with tree size "
23167 << NV("TreeSize", R.getTreeSize()));
23168
23169 R.vectorizeTree();
23170 return true;
23171 }
23172
23173 return false;
23174}
23175
23176/// Checks if the quadratic mean deviation is less than 90% of the mean size.
23177static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
23178 bool First) {
23179 unsigned Num = 0;
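 // Mean of the recorded tree sizes over entries that were actually built
 // (entries equal to 1 are skipped).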
23180 uint64_t Sum = std::accumulate(
23181 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23182 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23183 unsigned Size = First ? Val.first : Val.second;
23184 if (Size == 1)
23185 return V;
23186 ++Num;
23187 return V + Size;
23188 });
23189 if (Num == 0)
23190 return true;
23191 uint64_t Mean = Sum / Num;
23192 if (Mean == 0)
23193 return true;
23194 uint64_t Dev = std::accumulate(
23195 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23196 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23197 unsigned P = First ? Val.first : Val.second;
23198 if (P == 1)
23199 return V;
23200 return V + (P - Mean) * (P - Mean);
23201 }) /
23202 Num;
23203 return Dev * 96 / (Mean * Mean) == 0;
23204}
23205
23206namespace {
23207
23208/// A group of stores that we'll try to bundle together using vector ops.
23209/// They are ordered using the signed distance of their address operand to the
23210/// address of this group's BaseInstr.
23211class RelatedStoreInsts {
23212public:
23213 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23214 : AllStores(AllStores) {
23215 reset(BaseInstrIdx);
23216 }
23217
23218 void reset(unsigned NewBaseInstr) {
23219 assert(NewBaseInstr < AllStores.size() &&
23220 "Instruction index out of bounds");
23221 BaseInstrIdx = NewBaseInstr;
23222 Instrs.clear();
23223 insertOrLookup(NewBaseInstr, 0);
23224 }
23225
23226 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23227 /// \p PtrDist.
23228 /// Does nothing if there is already a store with that \p PtrDist.
23229 /// \returns The previously associated Instruction index, or std::nullopt
23230 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23231 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23232 return Inserted ? std::nullopt : std::make_optional(It->second);
23233 }
23234
23235 using DistToInstMap = std::map<int64_t, unsigned>;
23236 const DistToInstMap &getStores() const { return Instrs; }
23237
23238 /// If \p SI is related to this group of stores, return the distance of its
23239 /// pointer operand to that of the group's BaseInstr.
23240 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23241 ScalarEvolution &SE) const {
23242 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23243 return getPointersDiff(
23244 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23245 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23246 /*StrictCheck=*/true);
23247 }
23248
23249 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23250 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23251 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23252 int64_t DistFromCurBase) {
23253 DistToInstMap PrevSet = std::move(Instrs);
23254 reset(NewBaseInstIdx);
23255
23256 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23257 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23258 // reference.
23259 for (auto [Dist, InstIdx] : PrevSet) {
23260 if (InstIdx >= MinSafeIdx)
23261 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23262 }
23263 }
23264
23265 /// Remove all stores that have been vectorized from this group.
23266 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23267 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23268 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23269 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23270 });
23271
23272 // Get a forward iterator pointing after the last vectorized store and erase
23273 // all stores before it so we don't try to vectorize them again.
23274 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23275 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23276 }
23277
23278private:
23279 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23280 unsigned BaseInstrIdx;
23281
23282 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23283 DistToInstMap Instrs;
23284
23285 /// Reference to all the stores in the BB being analyzed.
23286 ArrayRef<StoreInst *> AllStores;
23287};
23288
23289} // end anonymous namespace
23290
23291bool SLPVectorizerPass::vectorizeStores(
23292 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23293 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23294 &Visited) {
23295 // We may run into multiple chains that merge into a single chain. We mark the
23296 // stores that we vectorized so that we don't visit the same store twice.
23297 BoUpSLP::ValueSet VectorizedStores;
23298 bool Changed = false;
23299
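 // Given stores sorted by pointer distance, collects maximal runs of
 // consecutive stores (distance difference of 1) and tries to vectorize each
 // run with a set of candidate vector factors.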
23300 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23301 int64_t PrevDist = -1;
23302 BoUpSLP::ValueList Operands;
23303 // Collect the chain into a list.
23304 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23305 auto &[Dist, InstIdx] = Data;
23306 if (Operands.empty() || Dist - PrevDist == 1) {
23307 Operands.push_back(Stores[InstIdx]);
23308 PrevDist = Dist;
23309 if (Idx != StoreSeq.size() - 1)
23310 continue;
23311 }
23312 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
23313 Operands.clear();
23314 Operands.push_back(Stores[InstIdx]);
23315 PrevDist = Dist;
23316 });
23317
23318 if (Operands.size() <= 1 ||
23319 !Visited
23320 .insert({Operands.front(),
23321 cast<StoreInst>(Operands.front())->getValueOperand(),
23322 Operands.back(),
23323 cast<StoreInst>(Operands.back())->getValueOperand(),
23324 Operands.size()})
23325 .second)
23326 continue;
23327
23328 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23329 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23330 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23331
23332 unsigned MaxVF =
23333 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23334 auto *Store = cast<StoreInst>(Operands[0]);
23335 Type *StoreTy = Store->getValueOperand()->getType();
23336 Type *ValueTy = StoreTy;
23337 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23338 ValueTy = Trunc->getSrcTy();
23339 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23340 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23341 // we need to use the element types of StoreTy and ValueTy to retrieve the
23342 // VF and then transform it back.
23343 // Remember: VF is defined as the number of values we want to vectorize, not
23344 // the number of elements in the final vector.
23345 Type *StoreScalarTy = StoreTy->getScalarType();
23346 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23347 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23348 ValueTy->getScalarType()));
23349 MinVF /= getNumElements(StoreTy);
23350 MinVF = std::max<unsigned>(2, MinVF);
23351
23352 if (MaxVF < MinVF) {
23353 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23354 << ") < "
23355 << "MinVF (" << MinVF << ")\n");
23356 continue;
23357 }
23358
23359 unsigned NonPowerOf2VF = 0;
23360 if (VectorizeNonPowerOf2) {
23361 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23362 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23363 // lanes are used.
23364 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23365 if (has_single_bit(CandVF + 1)) {
23366 NonPowerOf2VF = CandVF;
23367 assert(NonPowerOf2VF != MaxVF &&
23368 "Non-power-of-2 VF should not be equal to MaxVF");
23369 }
23370 }
23371
23372 // MaxRegVF represents the number of instructions (scalar, or vector in
23373 // case of revec) that can be vectorized to naturally fit in a vector
23374 // register.
23375 unsigned MaxRegVF = MaxVF;
23376
23377 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23378 if (MaxVF < MinVF) {
23379 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23380 << ") < "
23381 << "MinVF (" << MinVF << ")\n");
23382 continue;
23383 }
23384
23385 SmallVector<unsigned> CandidateVFs;
23386 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23387 VF = divideCeil(VF, 2))
23388 CandidateVFs.push_back(VF);
23389
23390 unsigned End = Operands.size();
23391 unsigned Repeat = 0;
23392 constexpr unsigned MaxAttempts = 4;
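 // RangeSizes keeps one entry per store: 1 means the store has not been
 // vectorized yet, 0 means it was vectorized, and any other value is the size
 // of the tree previously built for it, used by the profitability checks below.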
23393 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23394 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23395 P.first = P.second = 1;
23396 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23397 auto IsNotVectorized = [](bool First,
23398 const std::pair<unsigned, unsigned> &P) {
23399 return First ? P.first > 0 : P.second > 0;
23400 };
23401 auto IsVectorized = [](bool First,
23402 const std::pair<unsigned, unsigned> &P) {
23403 return First ? P.first == 0 : P.second == 0;
23404 };
23405 auto VFIsProfitable = [](bool First, unsigned Size,
23406 const std::pair<unsigned, unsigned> &P) {
23407 return First ? Size >= P.first : Size >= P.second;
23408 };
23409 auto FirstSizeSame = [](unsigned Size,
23410 const std::pair<unsigned, unsigned> &P) {
23411 return Size == P.first;
23412 };
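 // Repeat the vectorization attempts over the not-yet-vectorized stores;
 // later attempts may switch to larger candidate VFs, bounded by MaxAttempts.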
23413 while (true) {
23414 ++Repeat;
23415 bool RepeatChanged = false;
23416 bool AnyProfitableGraph = false;
23417 for (unsigned VF : CandidateVFs) {
23418 AnyProfitableGraph = false;
23419 unsigned FirstUnvecStore =
23420 std::distance(RangeSizes.begin(),
23421 find_if(RangeSizes, std::bind(IsNotVectorized,
23422 VF >= MaxRegVF, _1)));
23423
23424 // Form slices of size VF starting from FirstUnvecStore and try to
23425 // vectorize them.
23426 while (FirstUnvecStore < End) {
23427 unsigned FirstVecStore = std::distance(
23428 RangeSizes.begin(),
23429 find_if(RangeSizes.drop_front(FirstUnvecStore),
23430 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23431 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23432 for (unsigned SliceStartIdx = FirstUnvecStore;
23433 SliceStartIdx + VF <= MaxSliceEnd;) {
23434 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23435 VF >= MaxRegVF)) {
23436 ++SliceStartIdx;
23437 continue;
23438 }
23439 ArrayRef<Value *> Slice =
23440 ArrayRef(Operands).slice(SliceStartIdx, VF);
23441 assert(all_of(Slice,
23442 [&](Value *V) {
23443 return cast<StoreInst>(V)
23444 ->getValueOperand()
23445 ->getType() ==
23446 cast<StoreInst>(Slice.front())
23447 ->getValueOperand()
23448 ->getType();
23449 }) &&
23450 "Expected all operands of same type.");
23451 if (!NonSchedulable.empty()) {
23452 auto [NonSchedSizeMax, NonSchedSizeMin] =
23453 NonSchedulable.lookup(Slice.front());
23454 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23455 // VF is too ambitious. Try to vectorize another slice before
23456 // trying a smaller VF.
23457 SliceStartIdx += NonSchedSizeMax;
23458 continue;
23459 }
23460 }
23461 unsigned TreeSize;
23462 std::optional<bool> Res =
23463 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23464 if (!Res) {
23465 // Update the range of non schedulable VFs for slices starting
23466 // at SliceStartIdx.
23467 NonSchedulable
23468 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23469 .first->getSecond()
23470 .second = VF;
23471 } else if (*Res) {
23472 // Mark the vectorized stores so that we don't vectorize them
23473 // again.
23474 VectorizedStores.insert_range(Slice);
23477 AnyProfitableGraph = RepeatChanged = Changed = true;
23478 // If we vectorized initial block, no need to try to vectorize
23479 // it again.
23480 for (std::pair<unsigned, unsigned> &P :
23481 RangeSizes.slice(SliceStartIdx, VF))
23482 P.first = P.second = 0;
23483 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23484 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23485 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23486 P.first = P.second = 0;
23487 FirstUnvecStore = SliceStartIdx + VF;
23488 }
23489 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23490 for (std::pair<unsigned, unsigned> &P :
23491 RangeSizes.slice(SliceStartIdx + VF,
23492 MaxSliceEnd - (SliceStartIdx + VF)))
23493 P.first = P.second = 0;
23494 if (MaxSliceEnd == End)
23495 End = SliceStartIdx;
23496 MaxSliceEnd = SliceStartIdx;
23497 }
23498 SliceStartIdx += VF;
23499 continue;
23500 }
23501 if (VF > 2 && Res &&
23502 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23503 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23504 _1))) {
23505 SliceStartIdx += VF;
23506 continue;
23507 }
23508 // Check for the very big VFs that we're not rebuilding same
23509 // trees, just with larger number of elements.
23510 if (VF > MaxRegVF && TreeSize > 1 &&
23511 all_of(RangeSizes.slice(SliceStartIdx, VF),
23512 std::bind(FirstSizeSame, TreeSize, _1))) {
23513 SliceStartIdx += VF;
23514 while (SliceStartIdx != MaxSliceEnd &&
23515 RangeSizes[SliceStartIdx].first == TreeSize)
23516 ++SliceStartIdx;
23517 continue;
23518 }
23519 if (TreeSize > 1) {
23520 for (std::pair<unsigned, unsigned> &P :
23521 RangeSizes.slice(SliceStartIdx, VF)) {
23522 if (VF >= MaxRegVF)
23523 P.second = std::max(P.second, TreeSize);
23524 else
23525 P.first = std::max(P.first, TreeSize);
23526 }
23527 }
23528 ++SliceStartIdx;
23529 AnyProfitableGraph = true;
23530 }
23531 if (FirstUnvecStore >= End)
23532 break;
23533 if (MaxSliceEnd - FirstUnvecStore < VF &&
23534 MaxSliceEnd - FirstUnvecStore >= MinVF)
23535 AnyProfitableGraph = true;
23536 FirstUnvecStore = std::distance(
23537 RangeSizes.begin(),
23538 find_if(RangeSizes.drop_front(MaxSliceEnd),
23539 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23540 }
23541 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23542 break;
23543 }
23544 // All values vectorized - exit.
23545 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23546 return P.first == 0 && P.second == 0;
23547 }))
23548 break;
23549 // Check if we have tried all attempts or there is no need for further attempts at all.
23550 if (Repeat >= MaxAttempts ||
23551 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23552 break;
23553 constexpr unsigned StoresLimit = 64;
23554 const unsigned MaxTotalNum = std::min<unsigned>(
23555 Operands.size(),
23556 static_cast<unsigned>(
23557 End -
23558 std::distance(
23559 RangeSizes.begin(),
23560 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23561 1));
23562 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23563 unsigned Limit =
23564 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23565 CandidateVFs.clear();
23566 if (bit_floor(Limit) == VF)
23567 CandidateVFs.push_back(Limit);
23568 if (VF > MaxTotalNum || VF >= StoresLimit)
23569 break;
23570 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23571 if (P.first != 0)
23572 P.first = std::max(P.second, P.first);
23573 }
23574 // Last attempt to vectorize the maximum number of elements, if all previous
23575 // attempts were unsuccessful because of cost issues.
23576 CandidateVFs.push_back(VF);
23577 }
23578 }
23579 };
23580
23581 /// Groups of stores to vectorize
23582 SmallVector<RelatedStoreInsts> SortedStores;
23583
23584 // Inserts the specified store SI with the given index Idx into the set of
23585 // stores. If a store with the same distance is found already, stop the
23586 // insertion and try to vectorize the stores found so far. If some stores from
23587 // this sequence were not vectorized, try to vectorize them with the new store
23588 // later. But this logic is applied only to the stores that come before the
23589 // previous store with the same distance.
23590 // Example:
23591 // 1. store x, %p
23592 // 2. store y, %p+1
23593 // 3. store z, %p+2
23594 // 4. store a, %p
23595 // 5. store b, %p+3
23596 // - Scan this from the last to first store. The very first bunch of stores is
23597 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23598 // vector).
23599 // - The next store in the list - #1 - has the same distance from store #5 as
23600 // the store #4.
23601 // - Try to vectorize sequence of stores 4,2,3,5.
23602 // - If all these stores are vectorized - just drop them.
23603 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23604 // - Start new stores sequence.
23605 // The new bunch of stores is {1, {1, 0}}.
23606 // - Add the stores from previous sequence, that were not vectorized.
23607 // Here we consider the stores in reversed order, rather than the order they
23608 // are used in the IR (Stores are reversed already, see vectorizeStoreChains()).
23609 // Store #3 can be added -> comes after store #4 with the same distance as
23610 // store #1.
23611 // Store #5 cannot be added - comes before store #4.
23612 // This logic improves compile time: we assume that the stores coming after a
23613 // previous store with the same distance most likely have memory dependencies,
23614 // so there is no need to waste compile time trying to vectorize them.
23615 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23616 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23617 std::optional<int64_t> PtrDist;
23618 auto *RelatedStores = find_if(
23619 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23620 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23621 return PtrDist.has_value();
23622 });
23623
23624 // We did not find a comparable store, start a new group.
23625 if (RelatedStores == SortedStores.end()) {
23626 SortedStores.emplace_back(Idx, Stores);
23627 return;
23628 }
23629
23630 // If there is already a store in the group with the same PtrDiff, try to
23631 // vectorize the existing instructions before adding the current store.
23632 // Otherwise, insert this store and keep collecting.
23633 if (std::optional<unsigned> PrevInst =
23634 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23635 TryToVectorize(RelatedStores->getStores());
23636 RelatedStores->clearVectorizedStores(VectorizedStores);
23637 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23638 /*NewBaseInstIdx=*/Idx,
23639 /*DistFromCurBase=*/*PtrDist);
23640 }
23641 };
23642 Type *PrevValTy = nullptr;
23643 for (auto [I, SI] : enumerate(Stores)) {
23644 if (R.isDeleted(SI))
23645 continue;
23646 if (!PrevValTy)
23647 PrevValTy = SI->getValueOperand()->getType();
23648 // Check that we do not try to vectorize stores of different types.
23649 if (PrevValTy != SI->getValueOperand()->getType()) {
23650 for (RelatedStoreInsts &StoreSeq : SortedStores)
23651 TryToVectorize(StoreSeq.getStores());
23652 SortedStores.clear();
23653 PrevValTy = SI->getValueOperand()->getType();
23654 }
23655 FillStoresSet(I, SI);
23656 }
23657
23658 // Final vectorization attempt.
23659 for (RelatedStoreInsts &StoreSeq : SortedStores)
23660 TryToVectorize(StoreSeq.getStores());
23661
23662 return Changed;
23663}
23664
23665void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23666 // Initialize the collections. We will make a single pass over the block.
23667 Stores.clear();
23668 GEPs.clear();
23669
23670 // Visit the store and getelementptr instructions in BB and organize them in
23671 // Stores and GEPs according to the underlying objects of their pointer
23672 // operands.
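// For example (illustrative only, not from a specific test), the stores
//   store i32 %x, ptr %p
//   store i32 %y, ptr %p1 ; %p1 = getelementptr inbounds i32, ptr %p, i64 1
// share the underlying object %p, so both end up in Stores[%p] and may later
// form one consecutive store chain.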
23673 for (Instruction &I : *BB) {
23674 // Ignore store instructions that are not simple (e.g. volatile or atomic) or
23675 // whose stored value type is not valid for vectorization.
23676 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23677 if (!SI->isSimple())
23678 continue;
23679 if (!isValidElementType(SI->getValueOperand()->getType()))
23680 continue;
23681 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23682 }
23683
23684 // Ignore getelementptr instructions that have more than one index, a
23685 // constant index, an index type that is not valid for vectorization, or a
23686 // vector result type.
23687 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23688 if (GEP->getNumIndices() != 1)
23689 continue;
23690 Value *Idx = GEP->idx_begin()->get();
23691 if (isa<Constant>(Idx))
23692 continue;
23693 if (!isValidElementType(Idx->getType()))
23694 continue;
23695 if (GEP->getType()->isVectorTy())
23696 continue;
23697 GEPs[GEP->getPointerOperand()].push_back(GEP);
23698 }
23699 }
23700}
23701
23702bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23703 bool MaxVFOnly) {
23704 if (VL.size() < 2)
23705 return false;
23706
23707 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23708 << VL.size() << ".\n");
23709
23710 // Check that all of the parts are instructions of the same type;
23711 // we permit an alternate opcode via InstructionsState.
23712 InstructionsState S = getSameOpcode(VL, *TLI);
23713 if (!S)
23714 return false;
23715
23716 Instruction *I0 = S.getMainOp();
23717 // Make sure invalid types (including vector type) are rejected before
23718 // determining vectorization factor for scalar instructions.
23719 for (Value *V : VL) {
23720 Type *Ty = V->getType();
23721 if (!isValidElementType(Ty)) {
23722 // NOTE: the following will print the internal LLVM type name, which may
23723 // not be useful to the user.
23724 R.getORE()->emit([&]() {
23725 std::string TypeStr;
23726 llvm::raw_string_ostream OS(TypeStr);
23727 Ty->print(OS);
23728 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23729 << "Cannot SLP vectorize list: type "
23730 << TypeStr + " is unsupported by vectorizer";
23731 });
23732 return false;
23733 }
23734 }
23735
23736 Type *ScalarTy = getValueType(VL[0]);
23737 unsigned Sz = R.getVectorElementSize(I0);
23738 unsigned MinVF = R.getMinVF(Sz);
23739 unsigned MaxVF = std::max<unsigned>(
23740 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23741 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23742 if (MaxVF < 2) {
23743 R.getORE()->emit([&]() {
23744 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23745 << "Cannot SLP vectorize list: vectorization factor "
23746 << "less than 2 is not supported";
23747 });
23748 return false;
23749 }
23750
23751 bool Changed = false;
23752 bool CandidateFound = false;
23753 InstructionCost MinCost = SLPCostThreshold.getValue();
23754
23755 unsigned NextInst = 0, MaxInst = VL.size();
23756 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23757 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23758 // No actual vectorization should happen if the number of parts is the same
23759 // as the provided vectorization factor (i.e. the scalar type is used for the
23760 // vector code during codegen).
23761 auto *VecTy = getWidenedType(ScalarTy, VF);
23762 if (TTI->getNumberOfParts(VecTy) == VF)
23763 continue;
23764 for (unsigned I = NextInst; I < MaxInst; ++I) {
23765 unsigned ActualVF = std::min(MaxInst - I, VF);
23766
23767 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23768 continue;
23769
23770 if (MaxVFOnly && ActualVF < MaxVF)
23771 break;
23772 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23773 break;
23774
23775 SmallVector<Value *> Ops(ActualVF, nullptr);
23776 unsigned Idx = 0;
23777 for (Value *V : VL.drop_front(I)) {
23778 // Check that a previous iteration of this loop did not delete the
23779 // Value.
23780 if (auto *Inst = dyn_cast<Instruction>(V);
23781 !Inst || !R.isDeleted(Inst)) {
23782 Ops[Idx] = V;
23783 ++Idx;
23784 if (Idx == ActualVF)
23785 break;
23786 }
23787 }
23788 // Not enough vectorizable instructions - exit.
23789 if (Idx != ActualVF)
23790 break;
23791
23792 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23793 << "\n");
23794
23795 R.buildTree(Ops);
23796 if (R.isTreeTinyAndNotFullyVectorizable())
23797 continue;
23798 if (R.isProfitableToReorder()) {
23799 R.reorderTopToBottom();
23800 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23801 }
23802 R.transformNodes();
23803 R.buildExternalUses();
23804
23805 R.computeMinimumValueSizes();
23806 InstructionCost Cost = R.getTreeCost();
23807 CandidateFound = true;
23808 MinCost = std::min(MinCost, Cost);
23809
23810 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23811 << " for VF=" << ActualVF << "\n");
23812 if (Cost < -SLPCostThreshold) {
23813 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23814 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23815 cast<Instruction>(Ops[0]))
23816 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23817 << " and with tree size "
23818 << ore::NV("TreeSize", R.getTreeSize()));
23819
23820 R.vectorizeTree();
23821 // Move to the next bundle.
23822 I += VF - 1;
23823 NextInst = I + 1;
23824 Changed = true;
23825 }
23826 }
23827 }
23828
23829 if (!Changed && CandidateFound) {
23830 R.getORE()->emit([&]() {
23831 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23832 << "List vectorization was possible but not beneficial with cost "
23833 << ore::NV("Cost", MinCost) << " >= "
23834 << ore::NV("Threshold", -SLPCostThreshold);
23835 });
23836 } else if (!Changed) {
23837 R.getORE()->emit([&]() {
23838 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23839 << "Cannot SLP vectorize list: vectorization was impossible"
23840 << " with available vectorization factors";
23841 });
23842 }
23843 return Changed;
23844}
23845
23846namespace {
23847
23848/// Model horizontal reductions.
23849///
23850/// A horizontal reduction is a tree of reduction instructions that has values
23851/// that can be put into a vector as its leaves. For example:
23852///
23853/// mul mul mul mul
23854/// \ / \ /
23855/// + +
23856/// \ /
23857/// +
23858/// This tree has "mul" as its leaf values and "+" as its reduction
23859/// instructions. A reduction can feed into a store or a binary operation
23860/// feeding a phi.
23861/// ...
23862/// \ /
23863/// +
23864/// |
23865/// phi +=
23866///
23867/// Or:
23868/// ...
23869/// \ /
23870/// +
23871/// |
23872/// *p =
23873///
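/// For illustration only (not taken from a particular test case), a
/// 4-element integer add reduction over loaded values looks like:
///   %s0 = add i32 %a, %b
///   %s1 = add i32 %s0, %c
///   %root = add i32 %s1, %d
/// Here the adds form the reduction tree, %a..%d are the leaves that can be
/// gathered into a vector, and the whole chain may be replaced by a single
/// vector reduction (e.g. llvm.vector.reduce.add on <4 x i32>).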
23874class HorizontalReduction {
23875 using ReductionOpsType = SmallVector<Value *, 16>;
23876 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23877 ReductionOpsListType ReductionOps;
23878 /// List of possibly reduced values.
23879 SmallVector<SmallVector<Value *>> ReducedVals;
23880 /// Maps reduced value to the corresponding reduction operation.
23881 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23882 WeakTrackingVH ReductionRoot;
23883 /// The type of reduction operation.
23884 RecurKind RdxKind;
23885 /// Checks if the optimization of original scalar identity operations on
23886 /// matched horizontal reductions is enabled and allowed.
23887 bool IsSupportedHorRdxIdentityOp = false;
23888 /// The minimum number of the reduced values.
23889 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23890 /// Contains vector values for reduction including their scale factor and
23891 /// signedness.
23892 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23893
23894 static bool isCmpSelMinMax(Instruction *I) {
23895 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23896 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
23897 }
23898
23899 // And/or are potentially poison-safe logical patterns like:
23900 // select x, y, false
23901 // select x, true, y
23902 static bool isBoolLogicOp(Instruction *I) {
23903 return isa<SelectInst>(I) &&
23904 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23905 }
23906
23907 /// Checks if instruction is associative and can be vectorized.
23908 static bool isVectorizable(RecurKind Kind, Instruction *I,
23909 bool TwoElementReduction = false) {
23910 if (Kind == RecurKind::None)
23911 return false;
23912
23913 // Integer ops that map to select instructions or intrinsics are fine.
23914 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23915 isBoolLogicOp(I))
23916 return true;
23917
23918 // No need to check for associativity if there are only 2 reduced values.
23919 if (TwoElementReduction)
23920 return true;
23921
23922 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23923 // FP min/max are associative except for NaN and -0.0. We do not
23924 // have to rule out -0.0 here because the intrinsic semantics do not
23925 // specify a fixed result for it.
23926 return I->getFastMathFlags().noNaNs();
23927 }
23928
23929 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23930 return true;
23931
23932 return I->isAssociative();
23933 }
23934
23935 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23936 // Poison-safe 'or' takes the form: select X, true, Y
23937 // To make that work with the normal operand processing, we skip the
23938 // true value operand.
23939 // TODO: Change the code and data structures to handle this without a hack.
23940 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23941 return I->getOperand(2);
23942 return I->getOperand(Index);
23943 }
23944
23945 /// Creates reduction operation with the current opcode.
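/// For boolean reductions with \p UseSelect set, the poison-safe select
/// forms are emitted instead of plain bitwise instructions, e.g.:
///   or: select i1 %lhs, i1 true, i1 %rhs
///   and: select i1 %lhs, i1 %rhs, i1 false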
23946 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23947 Value *RHS, const Twine &Name, bool UseSelect) {
23948 Type *OpTy = LHS->getType();
23949 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23950 switch (Kind) {
23951 case RecurKind::Or: {
23952 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23953 return Builder.CreateSelectWithUnknownProfile(
23954 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23955 RHS, DEBUG_TYPE, Name);
23956 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23957 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23958 Name);
23959 }
23960 case RecurKind::And: {
23961 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23962 return Builder.CreateSelectWithUnknownProfile(
23963 LHS, RHS,
23964 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
23965 DEBUG_TYPE, Name);
23966 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23967 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23968 Name);
23969 }
23970 case RecurKind::Add:
23971 case RecurKind::Mul:
23972 case RecurKind::Xor:
23973 case RecurKind::FAdd:
23974 case RecurKind::FMul: {
23975 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23976 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23977 Name);
23978 }
23979 case RecurKind::SMax:
23980 case RecurKind::SMin:
23981 case RecurKind::UMax:
23982 case RecurKind::UMin:
23983 if (UseSelect) {
23984 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23985 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23986 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
23987 Name);
23988 }
23989 [[fallthrough]];
23990 case RecurKind::FMax:
23991 case RecurKind::FMin:
23992 case RecurKind::FMaximum:
23993 case RecurKind::FMinimum:
23994 case RecurKind::FMaximumNum:
23995 case RecurKind::FMinimumNum: {
23996 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
23997 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23998 }
23999 default:
24000 llvm_unreachable("Unknown reduction operation.");
24001 }
24002 }
24003
24004 /// Creates reduction operation with the current opcode with the IR flags
24005 /// from \p ReductionOps, dropping nuw/nsw flags.
24006 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
24007 Value *RHS, const Twine &Name,
24008 const ReductionOpsListType &ReductionOps) {
24009 bool UseSelect = ReductionOps.size() == 2 ||
24010 // Logical or/and.
24011 (ReductionOps.size() == 1 &&
24012 any_of(ReductionOps.front(), IsaPred<SelectInst>));
24013 assert((!UseSelect || ReductionOps.size() != 2 ||
24014 isa<SelectInst>(ReductionOps[1][0])) &&
24015 "Expected cmp + select pairs for reduction");
24016 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
24017 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
24018 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
24019 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
24020 /*IncludeWrapFlags=*/false);
24021 propagateIRFlags(Op, ReductionOps[1], nullptr,
24022 /*IncludeWrapFlags=*/false);
24023 return Op;
24024 }
24025 }
24026 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
24027 return Op;
24028 }
24029
24030public:
24031 static RecurKind getRdxKind(Value *V) {
24032 auto *I = dyn_cast<Instruction>(V);
24033 if (!I)
24034 return RecurKind::None;
24035 if (match(I, m_Add(m_Value(), m_Value())))
24036 return RecurKind::Add;
24037 if (match(I, m_Mul(m_Value(), m_Value())))
24038 return RecurKind::Mul;
24039 if (match(I, m_And(m_Value(), m_Value())) ||
24040 match(I, m_LogicalAnd(m_Value(), m_Value())))
24041 return RecurKind::And;
24042 if (match(I, m_Or(m_Value(), m_Value())) ||
24043 match(I, m_LogicalOr(m_Value(), m_Value())))
24044 return RecurKind::Or;
24045 if (match(I, m_Xor(m_Value(), m_Value())))
24046 return RecurKind::Xor;
24047 if (match(I, m_FAdd(m_Value(), m_Value())))
24048 return RecurKind::FAdd;
24049 if (match(I, m_FMul(m_Value(), m_Value())))
24050 return RecurKind::FMul;
24051
24052 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
24053 return RecurKind::FMax;
24054 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
24055 return RecurKind::FMin;
24056
24057 if (match(I, m_FMaximum(m_Value(), m_Value())))
24058 return RecurKind::FMaximum;
24059 if (match(I, m_FMinimum(m_Value(), m_Value())))
24060 return RecurKind::FMinimum;
24061 // This matches either cmp+select or intrinsics. SLP is expected to handle
24062 // either form.
24063 // TODO: If we are canonicalizing to intrinsics, we can remove several
24064 // special-case paths that deal with selects.
24065 if (match(I, m_SMax(m_Value(), m_Value())))
24066 return RecurKind::SMax;
24067 if (match(I, m_SMin(m_Value(), m_Value())))
24068 return RecurKind::SMin;
24069 if (match(I, m_UMax(m_Value(), m_Value())))
24070 return RecurKind::UMax;
24071 if (match(I, m_UMin(m_Value(), m_Value())))
24072 return RecurKind::UMin;
24073
24074 if (auto *Select = dyn_cast<SelectInst>(I)) {
24075 // Try harder: look for min/max pattern based on instructions producing
24076 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
24077 // During the intermediate stages of SLP, it's very common to have
24078 // pattern like this (since optimizeGatherSequence is run only once
24079 // at the end):
24080 // %1 = extractelement <2 x i32> %a, i32 0
24081 // %2 = extractelement <2 x i32> %a, i32 1
24082 // %cond = icmp sgt i32 %1, %2
24083 // %3 = extractelement <2 x i32> %a, i32 0
24084 // %4 = extractelement <2 x i32> %a, i32 1
24085 // %select = select i1 %cond, i32 %3, i32 %4
24086 CmpPredicate Pred;
24087 Instruction *L1;
24088 Instruction *L2;
24089
24090 Value *LHS = Select->getTrueValue();
24091 Value *RHS = Select->getFalseValue();
24092 Value *Cond = Select->getCondition();
24093
24094 // TODO: Support inverse predicates.
24095 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
24096 if (!isa<ExtractElementInst>(RHS) ||
24097 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24098 return RecurKind::None;
24099 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
24100 if (!isa<ExtractElementInst>(LHS) ||
24101 !L1->isIdenticalTo(cast<Instruction>(LHS)))
24102 return RecurKind::None;
24103 } else {
24104 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
24105 return RecurKind::None;
24106 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
24107 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
24108 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24109 return RecurKind::None;
24110 }
24111
24112 switch (Pred) {
24113 default:
24114 return RecurKind::None;
24115 case CmpInst::ICMP_SGT:
24116 case CmpInst::ICMP_SGE:
24117 return RecurKind::SMax;
24118 case CmpInst::ICMP_SLT:
24119 case CmpInst::ICMP_SLE:
24120 return RecurKind::SMin;
24121 case CmpInst::ICMP_UGT:
24122 case CmpInst::ICMP_UGE:
24123 return RecurKind::UMax;
24124 case CmpInst::ICMP_ULT:
24125 case CmpInst::ICMP_ULE:
24126 return RecurKind::UMin;
24127 }
24128 }
24129 return RecurKind::None;
24130 }
24131
24132 /// Get the index of the first operand.
24133 static unsigned getFirstOperandIndex(Instruction *I) {
24134 return isCmpSelMinMax(I) ? 1 : 0;
24135 }
24136
24137private:
24138 /// Total number of operands in the reduction operation.
24139 static unsigned getNumberOfOperands(Instruction *I) {
24140 return isCmpSelMinMax(I) ? 3 : 2;
24141 }
24142
24143 /// Checks if the instruction is in basic block \p BB.
24144 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
24145 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24146 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24147 auto *Sel = cast<SelectInst>(I);
24148 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
24149 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24150 }
24151 return I->getParent() == BB;
24152 }
24153
24154 /// Expected number of uses for reduction operations/reduced values.
24155 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24156 if (IsCmpSelMinMax) {
24157 // SelectInst must be used twice while the condition op must have a single
24158 // use only.
24159 if (auto *Sel = dyn_cast<SelectInst>(I))
24160 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24161 return I->hasNUses(2);
24162 }
24163
24164 // Arithmetic reduction operation must be used once only.
24165 return I->hasOneUse();
24166 }
24167
24168 /// Initializes the list of reduction operations.
24169 void initReductionOps(Instruction *I) {
24170 if (isCmpSelMinMax(I))
24171 ReductionOps.assign(2, ReductionOpsType());
24172 else
24173 ReductionOps.assign(1, ReductionOpsType());
24174 }
24175
24176 /// Add all reduction operations for the reduction instruction \p I.
24177 void addReductionOps(Instruction *I) {
24178 if (isCmpSelMinMax(I)) {
24179 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
24180 ReductionOps[1].emplace_back(I);
24181 } else {
24182 ReductionOps[0].emplace_back(I);
24183 }
24184 }
24185
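/// Returns true if \p Data is worth keeping as its own group of reduced
/// values: more than one value, a constant, or a non-load instruction whose
/// opcode is valid for alternation.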
24186 static bool isGoodForReduction(ArrayRef<Value *> Data) {
24187 int Sz = Data.size();
24188 auto *I = dyn_cast<Instruction>(Data.front());
24189 return Sz > 1 || isConstant(Data.front()) ||
24190 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
24191 }
24192
24193public:
24194 HorizontalReduction() = default;
24195 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
24196 : ReductionRoot(I), ReductionLimit(2) {
24197 RdxKind = HorizontalReduction::getRdxKind(I);
24198 ReductionOps.emplace_back().push_back(I);
24199 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
24200 for (Value *V : Ops)
24201 ReducedValsToOps[V].push_back(I);
24202 }
24203
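/// Checks whether the reduction rooted at ReductionRoot is vectorizable for
/// the operands collected via the two-argument constructor; groups of
/// exactly two reduced values do not need the associativity check.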
24204 bool matchReductionForOperands() const {
24205 // Analyze "regular" integer/FP types for reductions - no target-specific
24206 // types or pointers.
24207 assert(ReductionRoot && "Reduction root is not set!");
24208 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
24209 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24210 return Ops.size() == 2;
24211 })))
24212 return false;
24213
24214 return true;
24215 }
24216
24217 /// Try to find a reduction tree.
24218 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24219 ScalarEvolution &SE, const DataLayout &DL,
24220 const TargetLibraryInfo &TLI) {
24221 RdxKind = HorizontalReduction::getRdxKind(Root);
24222 if (!isVectorizable(RdxKind, Root))
24223 return false;
24224
24225 // Analyze "regular" integer/FP types for reductions - no target-specific
24226 // types or pointers.
24227 Type *Ty = Root->getType();
24228 if (!isValidElementType(Ty) || Ty->isPointerTy())
24229 return false;
24230
24231 // Though the ultimate reduction may have multiple uses, its condition must
24232 // have only a single use.
24233 if (auto *Sel = dyn_cast<SelectInst>(Root))
24234 if (!Sel->getCondition()->hasOneUse())
24235 return false;
24236
24237 ReductionRoot = Root;
24238
24239 // Iterate through all the operands of the possible reduction tree and
24240 // gather all the reduced values, sorting them by their value id.
24241 BasicBlock *BB = Root->getParent();
24242 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24243 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24244 1, std::make_pair(Root, 0));
24245 // Checks if the operands of the \p TreeN instruction are also reduction
24246 // operations or should be treated as reduced values or an extra argument,
24247 // which is not part of the reduction.
24248 auto CheckOperands = [&](Instruction *TreeN,
24249 SmallVectorImpl<Value *> &PossibleReducedVals,
24250 SmallVectorImpl<Instruction *> &ReductionOps,
24251 unsigned Level) {
24252 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24253 getNumberOfOperands(TreeN)))) {
24254 Value *EdgeVal = getRdxOperand(TreeN, I);
24255 ReducedValsToOps[EdgeVal].push_back(TreeN);
24256 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24257 // If the edge is not an instruction, differs from the main reduction
24258 // opcode, or has too many uses, treat it as a possible reduced value.
24259 // Also, do not try to reduce constant values if the operation is not
24260 // foldable.
24261 if (!EdgeInst || Level > RecursionMaxDepth ||
24262 getRdxKind(EdgeInst) != RdxKind ||
24263 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24264 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24265 !isVectorizable(RdxKind, EdgeInst) ||
24266 (R.isAnalyzedReductionRoot(EdgeInst) &&
24267 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24268 PossibleReducedVals.push_back(EdgeVal);
24269 continue;
24270 }
24271 ReductionOps.push_back(EdgeInst);
24272 }
24273 };
24274 // Try to regroup the reduced values so that reducing them becomes more
24275 // profitable. Values are grouped by their value ids, instructions by their
24276 // opcode and/or alternate opcode; extra analysis is done for loads (grouped
24277 // by the distance between their pointers) and cmp instructions (grouped by
24278 // the predicate).
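// For instance (illustrative only), a mixed set {%ld0, %ld1, %cmp0, %cmp1},
// where %ld0/%ld1 load from adjacent addresses and %cmp0/%cmp1 share a
// predicate, is split into a load group and a cmp group, each of which is a
// better vectorization candidate than the original mixed sequence.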
24279 SmallMapVector<
24280 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24281 8>
24282 PossibleReducedVals;
24283 initReductionOps(Root);
24284 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24285 SmallSet<size_t, 2> LoadKeyUsed;
24286
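// Computes a hash subkey for a load: loads whose pointers are at a known
// distance from (or are otherwise compatible with) an already recorded load
// on the same underlying pointer get the subkey of that load, so they end up
// grouped together.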
24287 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24288 Key = hash_combine(hash_value(LI->getParent()), Key);
24289 Value *Ptr =
24290 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
24291 if (!LoadKeyUsed.insert(Key).second) {
24292 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24293 if (LIt != LoadsMap.end()) {
24294 for (LoadInst *RLI : LIt->second) {
24295 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24296 LI->getType(), LI->getPointerOperand(), DL, SE,
24297 /*StrictCheck=*/true))
24298 return hash_value(RLI->getPointerOperand());
24299 }
24300 for (LoadInst *RLI : LIt->second) {
24301 if (arePointersCompatible(RLI->getPointerOperand(),
24302 LI->getPointerOperand(), TLI)) {
24303 hash_code SubKey = hash_value(RLI->getPointerOperand());
24304 return SubKey;
24305 }
24306 }
24307 if (LIt->second.size() > 2) {
24308 hash_code SubKey =
24309 hash_value(LIt->second.back()->getPointerOperand());
24310 return SubKey;
24311 }
24312 }
24313 }
24314 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24315 .first->second.push_back(LI);
24316 return hash_value(LI->getPointerOperand());
24317 };
24318
24319 while (!Worklist.empty()) {
24320 auto [TreeN, Level] = Worklist.pop_back_val();
24321 SmallVector<Value *> PossibleRedVals;
24322 SmallVector<Instruction *> PossibleReductionOps;
24323 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24324 addReductionOps(TreeN);
24325 // Add reduction values. The values are sorted for better vectorization
24326 // results.
24327 for (Value *V : PossibleRedVals) {
24328 size_t Key, Idx;
24329 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24330 /*AllowAlternate=*/false);
24331 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24332 }
24333 for (Instruction *I : reverse(PossibleReductionOps))
24334 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24335 }
24336 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24337 // Sort values by the total number of value kinds so that the reduction
24338 // starts from the longest possible sequences of reduced values.
24339 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24340 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24341 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24342 for (auto &Slice : PossibleRedVals) {
24343 PossibleRedValsVect.emplace_back();
24344 auto RedValsVect = Slice.second.takeVector();
24345 stable_sort(RedValsVect, llvm::less_second());
24346 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24347 PossibleRedValsVect.back().append(Data.second, Data.first);
24348 }
24349 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24350 return P1.size() > P2.size();
24351 });
24352 bool First = true;
24353 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24354 if (First) {
24355 First = false;
24356 ReducedVals.emplace_back();
24357 } else if (!isGoodForReduction(Data)) {
24358 auto *LI = dyn_cast<LoadInst>(Data.front());
24359 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24360 if (!LI || !LastLI ||
24361 getUnderlyingObject(LI->getPointerOperand()) !=
24362 getUnderlyingObject(LastLI->getPointerOperand()))
24363 ReducedVals.emplace_back();
24364 }
24365 ReducedVals.back().append(Data.rbegin(), Data.rend());
24366 }
24367 }
24368 // Sort the reduced values by number of same/alternate opcode and/or pointer
24369 // operand.
24370 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24371 return P1.size() > P2.size();
24372 });
24373 return true;
24374 }
24375
24376 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24377 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24378 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24379 DominatorTree &DT) {
24380 constexpr unsigned RegMaxNumber = 4;
24381 constexpr unsigned RedValsMaxNumber = 128;
24382 // If there are a sufficient number of reduction values, reduce
24383 // to a nearby power-of-2. We can safely generate oversized
24384 // vectors and rely on the backend to split them to legal sizes.
24385 if (unsigned NumReducedVals = std::accumulate(
24386 ReducedVals.begin(), ReducedVals.end(), 0,
24387 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24388 if (!isGoodForReduction(Vals))
24389 return Num;
24390 return Num + Vals.size();
24391 });
24392 NumReducedVals < ReductionLimit &&
24393 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24394 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24395 })) {
24396 for (ReductionOpsType &RdxOps : ReductionOps)
24397 for (Value *RdxOp : RdxOps)
24398 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24399 return nullptr;
24400 }
24401
24402 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24403 TargetFolder(DL));
24404 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24405
24406 // Track the reduced values in case they are replaced by extractelement
24407 // instructions during vectorization.
24408 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24409 ReducedVals.front().size());
24410
24411 // The compare instruction of a min/max is the insertion point for new
24412 // instructions and may be replaced with a new compare instruction.
24413 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24414 assert(isa<SelectInst>(RdxRootInst) &&
24415 "Expected min/max reduction to have select root instruction");
24416 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24417 assert(isa<Instruction>(ScalarCond) &&
24418 "Expected min/max reduction to have compare condition");
24419 return cast<Instruction>(ScalarCond);
24420 };
24421
24422 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24423 return isBoolLogicOp(cast<Instruction>(V));
24424 });
24425 // Return new VectorizedTree, based on previous value.
24426 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24427 if (VectorizedTree) {
24428 // Update the final value in the reduction.
24429 Builder.SetCurrentDebugLocation(
24430 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24431 if (AnyBoolLogicOp) {
24432 auto It = ReducedValsToOps.find(VectorizedTree);
24433 auto It1 = ReducedValsToOps.find(Res);
24434 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24435 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24436 (It != ReducedValsToOps.end() &&
24437 any_of(It->getSecond(), [&](Instruction *I) {
24438 return isBoolLogicOp(I) &&
24439 getRdxOperand(I, 0) == VectorizedTree;
24440 }))) {
24441 ;
24442 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24443 (It1 != ReducedValsToOps.end() &&
24444 any_of(It1->getSecond(), [&](Instruction *I) {
24445 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24446 }))) {
24447 std::swap(VectorizedTree, Res);
24448 } else {
24449 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24450 }
24451 }
24452
24453 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24454 ReductionOps);
24455 }
24456 // Initialize the final value in the reduction.
24457 return Res;
24458 };
24459 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24460 ReductionOps.front().size());
24461 for (ReductionOpsType &RdxOps : ReductionOps)
24462 for (Value *RdxOp : RdxOps) {
24463 if (!RdxOp)
24464 continue;
24465 IgnoreList.insert(RdxOp);
24466 }
24467 // Intersect the fast-math-flags from all reduction operations.
24468 FastMathFlags RdxFMF;
24469 RdxFMF.set();
24470 for (Value *U : IgnoreList)
24471 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24472 RdxFMF &= FPMO->getFastMathFlags();
24473 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24474
24475 // Need to track the reduced values; they may be changed during
24476 // vectorization of subvectors.
24477 for (ArrayRef<Value *> Candidates : ReducedVals)
24478 for (Value *V : Candidates)
24479 TrackedVals.try_emplace(V, V);
24480
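// Returns a reference to the use counter of value \p V in the map \p MV and
// asserts that the entry exists.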
24481 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24482 Value *V) -> unsigned & {
24483 auto *It = MV.find(V);
24484 assert(It != MV.end() && "Unable to find given key.");
24485 return It->second;
24486 };
24487
24488 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24490 // List of the values that were reduced in other trees as part of gather
24491 // nodes and thus require an extract if they are fully vectorized there.
24491 SmallPtrSet<Value *, 4> RequiredExtract;
24492 WeakTrackingVH VectorizedTree = nullptr;
24493 bool CheckForReusedReductionOps = false;
24494 // Try to vectorize elements based on their type.
24495 SmallVector<InstructionsState> States;
24496 for (ArrayRef<Value *> RV : ReducedVals)
24497 States.push_back(getSameOpcode(RV, TLI));
24498 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24499 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24500 InstructionsState S = States[I];
24501 SmallVector<Value *> Candidates;
24502 Candidates.reserve(2 * OrigReducedVals.size());
24503 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24504 for (Value *ReducedVal : OrigReducedVals) {
24505 Value *RdxVal = TrackedVals.at(ReducedVal);
24506 // Check whether the reduction value was overridden by an extractelement
24507 // instruction during vectorization and exclude it if it is not compatible
24508 // with the other values.
24509 // Also check if the instruction was folded to a constant/other value.
24510 auto *Inst = dyn_cast<Instruction>(RdxVal);
24511 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24512 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24513 (S && !Inst))
24514 continue;
24515 Candidates.push_back(RdxVal);
24516 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24517 }
24518 bool ShuffledExtracts = false;
24519 // Try to handle shuffled extractelements.
24520 if (S && S.getOpcode() == Instruction::ExtractElement &&
24521 !S.isAltShuffle() && I + 1 < E) {
24522 SmallVector<Value *> CommonCandidates(Candidates);
24523 for (Value *RV : ReducedVals[I + 1]) {
24524 Value *RdxVal = TrackedVals.at(RV);
24525 // Check whether the reduction value was overridden by an
24526 // extractelement instruction during vectorization and exclude it if
24527 // it is not compatible with the other values.
24528 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24529 if (!Inst)
24530 continue;
24531 CommonCandidates.push_back(RdxVal);
24532 TrackedToOrig.try_emplace(RdxVal, RV);
24533 }
24534 SmallVector<int> Mask;
24535 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24536 ++I;
24537 Candidates.swap(CommonCandidates);
24538 ShuffledExtracts = true;
24539 }
24540 }
24541
24542 // Emit code for constant values.
24543 if (Candidates.size() > 1 && allConstant(Candidates)) {
24544 Value *Res = Candidates.front();
24545 Value *OrigV = TrackedToOrig.at(Candidates.front());
24546 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24547 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24548 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24549 Value *OrigV = TrackedToOrig.at(VC);
24550 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24551 if (auto *ResI = dyn_cast<Instruction>(Res))
24552 V.analyzedReductionRoot(ResI);
24553 }
24554 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24555 continue;
24556 }
24557
24558 unsigned NumReducedVals = Candidates.size();
24559 if (NumReducedVals < ReductionLimit &&
24560 (NumReducedVals < 2 || !isSplat(Candidates)))
24561 continue;
24562
24563 // Check if we support processing of repeated scalar values (optimization of
24564 // original scalar identity operations on matched horizontal reductions).
24565 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24566 RdxKind != RecurKind::FMul &&
24567 RdxKind != RecurKind::FMulAdd;
24568 // Gather same values.
24569 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24570 if (IsSupportedHorRdxIdentityOp)
24571 for (Value *V : Candidates) {
24572 Value *OrigV = TrackedToOrig.at(V);
24573 ++SameValuesCounter.try_emplace(OrigV).first->second;
24574 }
24575 // Used to check if the reduced values are used the same number of times.
24576 // In this case the compiler may produce better code. E.g. if the reduced
24577 // values are aabbccdd (8 x values), then the first node of the tree will
24578 // have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24579 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24580 // Instead, the compiler may build a <4 x abcd> tree immediately and then
24581 // multiply the (4 x abcd) reduction result by 2.
24582 // Currently this only handles add/fadd/xor; and/or/min/max do not require
24583 // this analysis, and other operations may require an extra estimation of
24584 // the profitability.
24585 bool SameScaleFactor = false;
24586 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24587 SameValuesCounter.size() != Candidates.size();
24588 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24589 if (OptReusedScalars) {
24590 SameScaleFactor =
24591 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24592 RdxKind == RecurKind::Xor) &&
24593 all_of(drop_begin(SameValuesCounter),
24594 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24595 return P.second == SameValuesCounter.front().second;
24596 });
24597 Candidates.resize(SameValuesCounter.size());
24598 transform(SameValuesCounter, Candidates.begin(),
24599 [&](const auto &P) { return TrackedVals.at(P.first); });
24600 NumReducedVals = Candidates.size();
24601 // Have a reduction of the same element.
24602 if (NumReducedVals == 1) {
24603 Value *OrigV = TrackedToOrig.at(Candidates.front());
24604 unsigned Cnt = At(SameValuesCounter, OrigV);
24605 Value *RedVal =
24606 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24607 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24608 VectorizedVals.try_emplace(OrigV, Cnt);
24609 ExternallyUsedValues.insert(OrigV);
24610 continue;
24611 }
24612 }
24613
24614 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24615 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24616 const unsigned MaxElts = std::clamp<unsigned>(
24617 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24618 RegMaxNumber * RedValsMaxNumber);
24619
24620 unsigned ReduxWidth = NumReducedVals;
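// Picks the widest profitable vector factor not exceeding the given
// ReduxWidth: shrink the factor while the widened type would be split into
// more parts than there are vector registers, and round ReduxWidth down to a
// power of two if the result still needs more than half of the registers.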
24621 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24622 unsigned NumParts, NumRegs;
24623 Type *ScalarTy = Candidates.front()->getType();
24624 ReduxWidth =
24625 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24626 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24627 NumParts = ::getNumberOfParts(TTI, Tp);
24628 NumRegs =
24629 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector=*/true, Tp));
24630 while (NumParts > NumRegs) {
24631 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24632 ReduxWidth = bit_floor(ReduxWidth - 1);
24633 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24634 NumParts = ::getNumberOfParts(TTI, Tp);
24635 NumRegs =
24636 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector=*/true, Tp));
24637 }
24638 if (NumParts > NumRegs / 2)
24639 ReduxWidth = bit_floor(ReduxWidth);
24640 return ReduxWidth;
24641 };
24642 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24643 ReduxWidth = GetVectorFactor(ReduxWidth);
24644 ReduxWidth = std::min(ReduxWidth, MaxElts);
24645
24646 unsigned Start = 0;
24647 unsigned Pos = Start;
24648 // Restarts vectorization attempt with lower vector factor.
24649 unsigned PrevReduxWidth = ReduxWidth;
24650 bool CheckForReusedReductionOpsLocal = false;
24651 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24652 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24653 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24654 // Check if any of the reduction ops are gathered. If so, it is worth
24655 // trying again with a smaller number of reduced values.
24656 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24657 }
24658 ++Pos;
24659 if (Pos < NumReducedVals - ReduxWidth + 1)
24660 return IsAnyRedOpGathered;
24661 Pos = Start;
24662 --ReduxWidth;
24663 if (ReduxWidth > 1)
24664 ReduxWidth = GetVectorFactor(ReduxWidth);
24665 return IsAnyRedOpGathered;
24666 };
24667 bool AnyVectorized = false;
24668 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24669 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24670 ReduxWidth >= ReductionLimit) {
24671 // Dependency in tree of the reduction ops - drop this attempt, try
24672 // later.
24673 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24674 Start == 0) {
24675 CheckForReusedReductionOps = true;
24676 break;
24677 }
24678 PrevReduxWidth = ReduxWidth;
24679 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24680 // Been analyzed already - skip.
24681 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24682 (!has_single_bit(ReduxWidth) &&
24683 (IgnoredCandidates.contains(
24684 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24685 IgnoredCandidates.contains(
24686 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24687 bit_floor(ReduxWidth))))) ||
24688 V.areAnalyzedReductionVals(VL)) {
24689 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24690 continue;
24691 }
24692 // Early exit if any of the reduction values were deleted during
24693 // previous vectorization attempts.
24694 if (any_of(VL, [&V](Value *RedVal) {
24695 auto *RedValI = dyn_cast<Instruction>(RedVal);
24696 return RedValI && V.isDeleted(RedValI);
24697 }))
24698 break;
24699 V.buildTree(VL, IgnoreList);
24700 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24701 if (!AdjustReducedVals())
24702 V.analyzedReductionVals(VL);
24703 continue;
24704 }
24705 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24706 if (!AdjustReducedVals())
24707 V.analyzedReductionVals(VL);
24708 continue;
24709 }
24710 V.reorderTopToBottom();
24711 // No need to reorder the root node at all for reassociative reduction.
24712 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24713 VL.front()->getType()->isIntOrIntVectorTy() ||
24714 ReductionLimit > 2);
24715 // Keep other extracted reduction values, if they are used in the
24716 // vectorization trees.
24717 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24718 ExternallyUsedValues);
24719 // The reduction root is used as the insertion point for new
24720 // instructions, so set it as externally used to prevent it from being
24721 // deleted.
24722 LocalExternallyUsedValues.insert(ReductionRoot);
24723 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24724 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24725 continue;
24726 for (Value *V : ReducedVals[Cnt])
24727 if (isa<Instruction>(V))
24728 LocalExternallyUsedValues.insert(TrackedVals[V]);
24729 }
24730 if (!IsSupportedHorRdxIdentityOp) {
24731 // Number of uses of the candidates in the vector of values.
24732 assert(SameValuesCounter.empty() &&
24733 "Reused values counter map is not empty");
24734 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24735 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24736 continue;
24737 Value *V = Candidates[Cnt];
24738 Value *OrigV = TrackedToOrig.at(V);
24739 ++SameValuesCounter.try_emplace(OrigV).first->second;
24740 }
24741 }
24742 V.transformNodes();
24743 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24744 // Gather externally used values.
24745 SmallPtrSet<Value *, 4> Visited;
24746 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24747 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24748 continue;
24749 Value *RdxVal = Candidates[Cnt];
24750 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24751 RdxVal = It->second;
24752 if (!Visited.insert(RdxVal).second)
24753 continue;
24754 // Check if the scalar was vectorized as part of the vectorization
24755 // tree but not the top node.
24756 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24757 LocalExternallyUsedValues.insert(RdxVal);
24758 continue;
24759 }
24760 Value *OrigV = TrackedToOrig.at(RdxVal);
24761 unsigned NumOps =
24762 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24763 if (NumOps != ReducedValsToOps.at(OrigV).size())
24764 LocalExternallyUsedValues.insert(RdxVal);
24765 }
24766 // Do not need the list of reused scalars in regular mode anymore.
24767 if (!IsSupportedHorRdxIdentityOp)
24768 SameValuesCounter.clear();
24769 for (Value *RdxVal : VL)
24770 if (RequiredExtract.contains(RdxVal))
24771 LocalExternallyUsedValues.insert(RdxVal);
24772 V.buildExternalUses(LocalExternallyUsedValues);
24773
24774 V.computeMinimumValueSizes();
24775
24776 // Estimate cost.
24777 InstructionCost ReductionCost =
24778 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24779 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24780 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24781 << " for reduction\n");
24782 if (!Cost.isValid())
24783 break;
24784 if (Cost >= -SLPCostThreshold) {
24785 V.getORE()->emit([&]() {
24786 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24787 ReducedValsToOps.at(VL[0]).front())
24788 << "Vectorizing horizontal reduction is possible "
24789 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24790 << " and threshold "
24791 << ore::NV("Threshold", -SLPCostThreshold);
24792 });
24793 if (!AdjustReducedVals()) {
24794 V.analyzedReductionVals(VL);
24795 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24796 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24797 // Add subvectors of VL to the list of the analyzed values.
24798 for (unsigned VF = getFloorFullVectorNumberOfElements(
24799 *TTI, VL.front()->getType(), ReduxWidth - 1);
24800 VF >= ReductionLimit;
24801 VF = getFloorFullVectorNumberOfElements(
24802 *TTI, VL.front()->getType(), VF - 1)) {
24803 if (has_single_bit(VF) &&
24804 V.getCanonicalGraphSize() != V.getTreeSize())
24805 continue;
24806 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24807 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24808 }
24809 }
24810 }
24811 continue;
24812 }
24813
24814 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24815 << Cost << ". (HorRdx)\n");
24816 V.getORE()->emit([&]() {
24817 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24818 ReducedValsToOps.at(VL[0]).front())
24819 << "Vectorized horizontal reduction with cost "
24820 << ore::NV("Cost", Cost) << " and with tree size "
24821 << ore::NV("TreeSize", V.getTreeSize());
24822 });
24823
24824 Builder.setFastMathFlags(RdxFMF);
24825
24826 // Emit a reduction. If the root is a select (min/max idiom), the insert
24827 // point is the compare condition of that select.
24828 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24829 Instruction *InsertPt = RdxRootInst;
24830 if (IsCmpSelMinMax)
24831 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24832
24833 // Vectorize a tree.
24834 Value *VectorizedRoot = V.vectorizeTree(
24835 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24836 // Update TrackedToOrig mapping, since the tracked values might be
24837 // updated.
24838 for (Value *RdxVal : Candidates) {
24839 Value *OrigVal = TrackedToOrig.at(RdxVal);
24840 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24841 if (TransformedRdxVal != RdxVal)
24842 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24843 }
24844
24845 Builder.SetInsertPoint(InsertPt);
24846
24847 // To prevent poison from leaking across what used to be sequential,
24848 // safe, scalar boolean logic operations, the reduction operand must be
24849 // frozen.
24850 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24851 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24852
24853 // Emit code to correctly handle reused reduced values, if required.
24854 if (OptReusedScalars && !SameScaleFactor) {
24855 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24856 SameValuesCounter, TrackedToOrig);
24857 }
24858
24859 Type *ScalarTy = VL.front()->getType();
24860 Type *VecTy = VectorizedRoot->getType();
24861 Type *RedScalarTy = VecTy->getScalarType();
24862 VectorValuesAndScales.emplace_back(
24863 VectorizedRoot,
24864 OptReusedScalars && SameScaleFactor
24865 ? SameValuesCounter.front().second
24866 : 1,
24867 RedScalarTy != ScalarTy->getScalarType()
24868 ? V.isSignedMinBitwidthRootNode()
24869 : true);
24870
24871 // Count vectorized reduced values to exclude them from final reduction.
24872 for (Value *RdxVal : VL) {
24873 Value *OrigV = TrackedToOrig.at(RdxVal);
24874 if (IsSupportedHorRdxIdentityOp) {
24875 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24876 continue;
24877 }
24878 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24879 if (!V.isVectorized(RdxVal))
24880 RequiredExtract.insert(RdxVal);
24881 }
24882 Pos += ReduxWidth;
24883 Start = Pos;
24884 ReduxWidth = NumReducedVals - Pos;
24885 if (ReduxWidth > 1)
24886 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24887 AnyVectorized = true;
24888 }
24889 if (OptReusedScalars && !AnyVectorized) {
24890 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24891 Value *RdxVal = TrackedVals.at(P.first);
24892 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24893 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24894 VectorizedVals.try_emplace(P.first, P.second);
24895 }
24896 continue;
24897 }
24898 }
24899 if (!VectorValuesAndScales.empty())
24900 VectorizedTree = GetNewVectorizedTree(
24901 VectorizedTree,
24902 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24903
24904 if (!VectorizedTree) {
24905 if (!CheckForReusedReductionOps) {
24906 for (ReductionOpsType &RdxOps : ReductionOps)
24907 for (Value *RdxOp : RdxOps)
24908 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24909 }
24910 return nullptr;
24911 }
24912
24913 // Reorder operands of bool logical op in the natural order to avoid
24914 // possible problem with poison propagation. If not possible to reorder
24915 // (both operands are originally RHS), emit an extra freeze instruction
24916 // for the LHS operand.
24917 // I.e., if we have original code like this:
24918 // RedOp1 = select i1 ?, i1 LHS, i1 false
24919 // RedOp2 = select i1 RHS, i1 ?, i1 false
24920
24921 // Then, we swap LHS/RHS to create a new op that matches the poison
24922 // semantics of the original code.
24923
24924 // If we have original code like this and both values could be poison:
24925 // RedOp1 = select i1 ?, i1 LHS, i1 false
24926 // RedOp2 = select i1 ?, i1 RHS, i1 false
24927
24928 // Then, we must freeze LHS in the new op.
24929 auto FixBoolLogicalOps =
24930 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24931 Instruction *RedOp2, bool InitStep) {
24932 if (!AnyBoolLogicOp)
24933 return;
24934 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24935 getRdxOperand(RedOp1, 0) == LHS ||
24936 getRdxOperand(RedOp1, 1) == LHS))
24937 return;
24938 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24939 getRdxOperand(RedOp2, 0) == RHS ||
24940 getRdxOperand(RedOp2, 1) == RHS)) {
24941 std::swap(LHS, RHS);
24942 return;
24943 }
24944 if (LHS != VectorizedTree)
24945 LHS = Builder.CreateFreeze(LHS);
24946 };
24947 // Finish the reduction.
24948 // Need to add extra arguments and possible reduction values that were not
24949 // vectorized, while avoiding dependencies between the scalar remainders.
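// E.g. (illustrative), scalar remainders [r0, r1, r2] are combined pairwise
// into [r0 op r1, r2] by one FinalGen step and into a single value by the
// next one.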
24950 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24951 bool InitStep) {
24952 unsigned Sz = InstVals.size();
24953 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24954 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24955 Instruction *RedOp = InstVals[I + 1].first;
24956 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24957 Value *RdxVal1 = InstVals[I].second;
24958 Value *StableRdxVal1 = RdxVal1;
24959 auto It1 = TrackedVals.find(RdxVal1);
24960 if (It1 != TrackedVals.end())
24961 StableRdxVal1 = It1->second;
24962 Value *RdxVal2 = InstVals[I + 1].second;
24963 Value *StableRdxVal2 = RdxVal2;
24964 auto It2 = TrackedVals.find(RdxVal2);
24965 if (It2 != TrackedVals.end())
24966 StableRdxVal2 = It2->second;
24967 // To prevent poison from leaking across what used to be sequential,
24968 // safe, scalar boolean logic operations, the reduction operand must be
24969 // frozen.
24970 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24971 RedOp, InitStep);
24972 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24973 StableRdxVal2, "op.rdx", ReductionOps);
24974 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24975 }
24976 if (Sz % 2 == 1)
24977 ExtraReds[Sz / 2] = InstVals.back();
24978 return ExtraReds;
24979 };
24980 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24981 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24982 VectorizedTree);
24983 SmallPtrSet<Value *, 8> Visited;
24984 for (ArrayRef<Value *> Candidates : ReducedVals) {
24985 for (Value *RdxVal : Candidates) {
24986 if (!Visited.insert(RdxVal).second)
24987 continue;
24988 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24989 for (Instruction *RedOp :
24990 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24991 ExtraReductions.emplace_back(RedOp, RdxVal);
24992 }
24993 }
24994 // Iterate through all non-vectorized reduction values/extra arguments.
24995 bool InitStep = true;
24996 while (ExtraReductions.size() > 1) {
24997 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24998 FinalGen(ExtraReductions, InitStep);
24999 ExtraReductions.swap(NewReds);
25000 InitStep = false;
25001 }
25002 VectorizedTree = ExtraReductions.front().second;
25003
25004 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25005
25006 // The original scalar reduction is expected to have no remaining
25007 // uses outside the reduction tree itself. Assert that we got this
25008 // correct, replace internal uses with poison, and mark for eventual
25009 // deletion.
25010#ifndef NDEBUG
25011 SmallPtrSet<Value *, 4> IgnoreSet;
25012 for (ArrayRef<Value *> RdxOps : ReductionOps)
25013 IgnoreSet.insert_range(RdxOps);
25014#endif
25015 for (ArrayRef<Value *> RdxOps : ReductionOps) {
25016 for (Value *Ignore : RdxOps) {
25017 if (!Ignore)
25018 continue;
25019#ifndef NDEBUG
25020 for (auto *U : Ignore->users()) {
25021 assert(IgnoreSet.count(U) &&
25022 "All users must be in the reduction ops list.");
25023 }
25024#endif
25025 if (!Ignore->use_empty()) {
25026 Value *P = PoisonValue::get(Ignore->getType());
25027 Ignore->replaceAllUsesWith(P);
25028 }
25029 }
25030 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25031 }
25032 return VectorizedTree;
25033 }
25034
25035private:
25036 /// Creates the reduction from the given \p Vec vector value with the given
25037 /// scale \p Scale and signedness \p IsSigned.
25038 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25039 Value *Vec, unsigned Scale, bool IsSigned,
25040 Type *DestTy) {
25041 Value *Rdx;
25042 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
25043 unsigned DestTyNumElements = getNumElements(VecTy);
25044 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
25045 Rdx = PoisonValue::get(
25046 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
25047 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
25048 // Do reduction for each lane.
25049 // e.g., do reduce add for
25050 // VL[0] = <4 x Ty> <a, b, c, d>
25051 // VL[1] = <4 x Ty> <e, f, g, h>
25052 // Lane[0] = <2 x Ty> <a, e>
25053 // Lane[1] = <2 x Ty> <b, f>
25054 // Lane[2] = <2 x Ty> <c, g>
25055 // Lane[3] = <2 x Ty> <d, h>
25056 // result[0] = reduce add Lane[0]
25057 // result[1] = reduce add Lane[1]
25058 // result[2] = reduce add Lane[2]
25059 // result[3] = reduce add Lane[3]
25060 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
25061 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
25062 Rdx = Builder.CreateInsertElement(
25063 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
25064 }
25065 } else {
25066 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
25067 }
25068 if (Rdx->getType() != DestTy)
25069 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
25070 // Improved analysis for add/fadd/xor reductions with same scale
25071 // factor for all operands of reductions. We can emit scalar ops for
25072 // them instead.
25073 if (Scale > 1)
25074 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25075 return Rdx;
25076 }
25077
25078 /// Calculate the cost of a reduction.
25079 InstructionCost getReductionCost(TargetTransformInfo *TTI,
25080 ArrayRef<Value *> ReducedVals,
25081 bool IsCmpSelMinMax, FastMathFlags FMF,
25082 const BoUpSLP &R, DominatorTree &DT,
25083 const DataLayout &DL,
25084 const TargetLibraryInfo &TLI) {
25085 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25086 Type *ScalarTy = ReducedVals.front()->getType();
25087 unsigned ReduxWidth = ReducedVals.size();
25088 FixedVectorType *VectorTy = R.getReductionType();
25089 InstructionCost VectorCost = 0, ScalarCost;
25090 // If all of the reduced values are constant, the vector cost is 0, since
25091 // the reduction value can be calculated at compile time.
25092 bool AllConsts = allConstant(ReducedVals);
25093 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
25094 InstructionCost Cost = 0;
25095 // Scalar cost is repeated for N-1 elements.
25096 int Cnt = ReducedVals.size();
25097 for (Value *RdxVal : ReducedVals) {
25098 if (Cnt == 1)
25099 break;
25100 --Cnt;
25101 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
25102 Cost += GenCostFn();
25103 continue;
25104 }
25105 InstructionCost ScalarCost = 0;
25106 for (User *U : RdxVal->users()) {
25107 auto *RdxOp = cast<Instruction>(U);
25108 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25109 if (RdxKind == RecurKind::FAdd) {
25110 InstructionCost FMACost = canConvertToFMA(
25111 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
25112 if (FMACost.isValid()) {
25113 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
25114 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
25115 // Also, exclude scalar fmul cost.
25116 InstructionCost FMulCost =
25117 TTI->getInstructionCost(I, CostKind);
25118 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
25119 FMACost -= FMulCost;
25120 }
25121 ScalarCost += FMACost;
25122 continue;
25123 }
25124 }
25125 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
25126 continue;
25127 }
25128 ScalarCost = InstructionCost::getInvalid();
25129 break;
25130 }
25131 if (ScalarCost.isValid())
25132 Cost += ScalarCost;
25133 else
25134 Cost += GenCostFn();
25135 }
25136 return Cost;
25137 };
25138 // Require reduction cost if:
25139 // 1. This type is not a full register type and there is no other vector with
25140 // the same type in the storage (first vector with a small type).
25141 // 2. The storage does not have any vector with full vector use (first
25142 // vector with full register use).
25143 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
25144 switch (RdxKind) {
25145 case RecurKind::Add:
25146 case RecurKind::Mul:
25147 case RecurKind::Or:
25148 case RecurKind::And:
25149 case RecurKind::Xor:
25150 case RecurKind::FAdd:
25151 case RecurKind::FMul: {
25152 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
25153 if (!AllConsts) {
25154 if (DoesRequireReductionOp) {
25155 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
25156 assert(SLPReVec && "FixedVectorType is not expected.");
25157 unsigned ScalarTyNumElements = VecTy->getNumElements();
25158 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
25159 VectorCost += TTI->getShuffleCost(
25160 TTI::SK_PermuteSingleSrc,
25161 getWidenedType(VecTy->getElementType(),
25162 ReducedVals.size()),
25163 VectorTy,
25164 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
25165 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
25166 FMF, CostKind);
25167 }
25168 VectorCost += TTI->getScalarizationOverhead(
25169 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
25170 /*Extract*/ false, TTI::TCK_RecipThroughput);
25171 } else {
25172 Type *RedTy = VectorTy->getElementType();
25173 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25174 std::make_pair(RedTy, true));
25175 if (RType == RedTy) {
25176 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
25177 FMF, CostKind);
25178 } else {
25179 VectorCost = TTI->getExtendedReductionCost(
25180 RdxOpcode, !IsSigned, RedTy,
25181 getWidenedType(RType, ReduxWidth), FMF, CostKind);
25182 }
25183 }
25184 } else {
25185 Type *RedTy = VectorTy->getElementType();
25186 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25187 std::make_pair(RedTy, true));
25188 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25189 InstructionCost FMACost = InstructionCost::getInvalid();
25190 if (RdxKind == RecurKind::FAdd) {
25191 // Check if the reduction operands can be converted to FMA.
25192 SmallVector<Value *> Ops;
25193 FastMathFlags FMF;
25194 FMF.set();
25195 for (Value *RdxVal : ReducedVals) {
25196 if (!RdxVal->hasOneUse()) {
25197 Ops.clear();
25198 break;
25199 }
25200 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
25201 FMF &= FPCI->getFastMathFlags();
25202 Ops.push_back(RdxVal->user_back());
25203 }
25204 if (!Ops.empty()) {
25205 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
25206 *TTI, TLI);
25207 if (FMACost.isValid()) {
25208 // Calculate actual FMAD cost.
25209 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25210 {RVecTy, RVecTy, RVecTy}, FMF);
25211 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
25212
25213 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
25214 // Also, exclude vector fmul cost.
25215 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
25216 Instruction::FMul, RVecTy, CostKind);
25217 LLVM_DEBUG(dbgs()
25218 << "Minus vector FMul cost: " << FMulCost << "\n");
25219 FMACost -= FMulCost;
25220 }
25221 }
25222 }
25223 if (FMACost.isValid())
25224 VectorCost += FMACost;
25225 else
25226 VectorCost +=
25227 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
25228 if (RType != RedTy) {
25229 unsigned Opcode = Instruction::Trunc;
25230 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25231 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25232 VectorCost += TTI->getCastInstrCost(
25233 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25234 }
25235 }
25236 }
25237 ScalarCost = EvaluateScalarCost([&]() {
25238 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
25239 });
25240 break;
25241 }
25242 case RecurKind::FMax:
25243 case RecurKind::FMin:
25244 case RecurKind::FMaximum:
25245 case RecurKind::FMinimum:
25246 case RecurKind::SMax:
25247 case RecurKind::SMin:
25248 case RecurKind::UMax:
25249 case RecurKind::UMin: {
25250 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
25251 if (!AllConsts) {
25252 if (DoesRequireReductionOp) {
25253 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
25254 } else {
25255 // Check if the previous reduction already exists and account it as
25256 // series of operations + single reduction.
25257 Type *RedTy = VectorTy->getElementType();
25258 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25259 std::make_pair(RedTy, true));
25260 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25261 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25262 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
25263 if (RType != RedTy) {
25264 unsigned Opcode = Instruction::Trunc;
25265 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25266 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25267 VectorCost += TTI->getCastInstrCost(
25268 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25269 }
25270 }
25271 }
25272 ScalarCost = EvaluateScalarCost([&]() {
25273 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25274 return TTI->getIntrinsicInstrCost(ICA, CostKind);
25275 });
25276 break;
25277 }
25278 default:
25279 llvm_unreachable("Expected arithmetic or min/max reduction operation");
25280 }
25281
25282 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25283 << " for reduction of " << shortBundleName(ReducedVals)
25284 << " (It is a splitting reduction)\n");
25285 return VectorCost - ScalarCost;
25286 }
25287
25288 /// Splits the values, stored in VectorValuesAndScales, into registers/free
25289 /// sub-registers, combines them with the given reduction operation as a
25290 /// vector operation and then performs a single (small enough) reduction.
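/// Illustrative example (an assumption for exposition, not from the original
/// comment): for an add reduction with two accumulated <4 x i32> values V1
/// (scale 1) and V2 (scale 2), this emits the vector op V1 + V2 * 2 and then a
/// single reduce.add over the combined vector.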
25291 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25292 Type *DestTy) {
25293 Value *ReducedSubTree = nullptr;
25294 // Creates reduction and combines with the previous reduction.
25295 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25296 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25297 if (ReducedSubTree)
25298 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25299 "op.rdx", ReductionOps);
25300 else
25301 ReducedSubTree = Rdx;
25302 };
25303 if (VectorValuesAndScales.size() == 1) {
25304 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25305 CreateSingleOp(Vec, Scale, IsSigned);
25306 return ReducedSubTree;
25307 }
25308 // Scales Vec using given Cnt scale factor and then performs vector combine
25309 // with previous value of VecOp.
25310 Value *VecRes = nullptr;
25311 bool VecResSignedness = false;
25312 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25313 Type *ScalarTy = Vec->getType()->getScalarType();
25314 // Scale Vec using given Cnt scale factor.
25315 if (Cnt > 1) {
25316 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25317 switch (RdxKind) {
25318 case RecurKind::Add: {
25319 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25320 unsigned VF = getNumElements(Vec->getType());
25321 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
25322 << ". (HorRdx)\n");
25323 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25324 for (unsigned I : seq<unsigned>(Cnt))
25325 std::iota(std::next(Mask.begin(), VF * I),
25326 std::next(Mask.begin(), VF * (I + 1)), 0);
25327 ++NumVectorInstructions;
25328 Vec = Builder.CreateShuffleVector(Vec, Mask);
25329 break;
25330 }
25331 // res = mul vv, n
25332 if (ScalarTy != DestTy->getScalarType())
25333 Vec = Builder.CreateIntCast(
25334 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25335 IsSigned);
25336 Value *Scale = ConstantVector::getSplat(
25337 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
25338 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25339 << ". (HorRdx)\n");
25340 ++NumVectorInstructions;
25341 Vec = Builder.CreateMul(Vec, Scale);
25342 break;
25343 }
25344 case RecurKind::Xor: {
25345 // res = n % 2 ? vv : 0
25346 LLVM_DEBUG(dbgs()
25347 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25348 if (Cnt % 2 == 0)
25349 Vec = Constant::getNullValue(Vec->getType());
25350 break;
25351 }
25352 case RecurKind::FAdd: {
25353 // res = fmul v, n
25354 Value *Scale =
25355 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
25356 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
25357 << ". (HorRdx)\n");
25358 ++NumVectorInstructions;
25359 Vec = Builder.CreateFMul(Vec, Scale);
25360 break;
25361 }
25362 case RecurKind::And:
25363 case RecurKind::Or:
25364 case RecurKind::SMax:
25365 case RecurKind::SMin:
25366 case RecurKind::UMax:
25367 case RecurKind::UMin:
25368 case RecurKind::FMax:
25369 case RecurKind::FMin:
25370 case RecurKind::FMaximum:
25371 case RecurKind::FMinimum:
25372 // res = vv
25373 break;
25374 case RecurKind::Sub:
25375 case RecurKind::AddChainWithSubs:
25376 case RecurKind::Mul:
25377 case RecurKind::FMul:
25378 case RecurKind::FMulAdd:
25379 case RecurKind::AnyOf:
25380 case RecurKind::FindFirstIVSMin:
25381 case RecurKind::FindFirstIVUMin:
25382 case RecurKind::FindLastIVSMax:
25383 case RecurKind::FindLastIVUMax:
25384 case RecurKind::FMaxNum:
25385 case RecurKind::FMinNum:
25386 case RecurKind::FMaximumNum:
25387 case RecurKind::FMinimumNum:
25388 case RecurKind::None:
25389 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25390 }
25391 }
25392 // Combine Vec with the previous VecOp.
25393 if (!VecRes) {
25394 VecRes = Vec;
25395 VecResSignedness = IsSigned;
25396 } else {
25397 ++NumVectorInstructions;
25398 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25399 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
25400 // Handle ctpop.
25401 unsigned VecResVF = getNumElements(VecRes->getType());
25402 unsigned VecVF = getNumElements(Vec->getType());
25403 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
25404 std::iota(Mask.begin(), Mask.end(), 0);
25405 // Ensure that VecRes is always larger than Vec
25406 if (VecResVF < VecVF) {
25407 std::swap(VecRes, Vec);
25408 std::swap(VecResVF, VecVF);
25409 }
25410 if (VecResVF != VecVF) {
25411 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
25412 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25413 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
25414 }
25415 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
25416 return;
25417 }
25418 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
25419 VecRes = Builder.CreateIntCast(
25420 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
25421 VecResSignedness);
25422 if (ScalarTy != DestTy->getScalarType())
25423 Vec = Builder.CreateIntCast(
25424 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25425 IsSigned);
25426 unsigned VecResVF = getNumElements(VecRes->getType());
25427 unsigned VecVF = getNumElements(Vec->getType());
25428 // Ensure that VecRes is always larger than Vec
25429 if (VecResVF < VecVF) {
25430 std::swap(VecRes, Vec);
25431 std::swap(VecResVF, VecVF);
25432 }
25433 // extract + op + insert
25434 Value *Op = VecRes;
25435 if (VecResVF != VecVF)
25436 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25437 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25438 if (VecResVF != VecVF)
25439 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25440 VecRes = Op;
25441 }
25442 };
25443 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25444 CreateVecOp(Vec, Scale, IsSigned);
25445 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25446
25447 return ReducedSubTree;
25448 }
25449
25450 /// Emit a horizontal reduction of the vectorized value.
25451 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25452 const TargetTransformInfo *TTI, Type *DestTy) {
25453 assert(VectorizedValue && "Need to have a vectorized tree node");
25454 assert(RdxKind != RecurKind::FMulAdd &&
25455 "A call to the llvm.fmuladd intrinsic is not handled yet");
25456
25457 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25458 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25459 RdxKind == RecurKind::Add &&
25460 DestTy->getScalarType() != FTy->getScalarType()) {
25461 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25462 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
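// For example (illustrative, n == 8 and a 32-bit destination):
//   %bc = bitcast <8 x i1> %v to i8
//   %ct = call i8 @llvm.ctpop.i8(i8 %bc)
//   %res = zext i8 %ct to i32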
25463 Value *V = Builder.CreateBitCast(
25464 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25465 ++NumVectorInstructions;
25466 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25467 }
25468 ++NumVectorInstructions;
25469 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25470 }
25471
25472 /// Emits optimized code for unique scalar value reused \p Cnt times.
25473 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25474 unsigned Cnt) {
25475 assert(IsSupportedHorRdxIdentityOp &&
25476 "The optimization of matched scalar identity horizontal reductions "
25477 "must be supported.");
25478 if (Cnt == 1)
25479 return VectorizedValue;
25480 switch (RdxKind) {
25481 case RecurKind::Add: {
25482 // res = mul vv, n
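// E.g. (illustrative): a scalar v that is reused 3 times contributes
// v + v + v, which is emitted as v * 3.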
25483 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25484 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25485 << VectorizedValue << ". (HorRdx)\n");
25486 return Builder.CreateMul(VectorizedValue, Scale);
25487 }
25488 case RecurKind::Xor: {
25489 // res = n % 2 ? vv : 0
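// E.g. (illustrative): v ^ v ^ v ^ v == 0, while v ^ v ^ v == v.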
25490 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25491 << ". (HorRdx)\n");
25492 if (Cnt % 2 == 0)
25493 return Constant::getNullValue(VectorizedValue->getType());
25494 return VectorizedValue;
25495 }
25496 case RecurKind::FAdd: {
25497 // res = fmul v, n
25498 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25499 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25500 << VectorizedValue << ". (HorRdx)\n");
25501 return Builder.CreateFMul(VectorizedValue, Scale);
25502 }
25503 case RecurKind::And:
25504 case RecurKind::Or:
25505 case RecurKind::SMax:
25506 case RecurKind::SMin:
25507 case RecurKind::UMax:
25508 case RecurKind::UMin:
25509 case RecurKind::FMax:
25510 case RecurKind::FMin:
25511 case RecurKind::FMaximum:
25512 case RecurKind::FMinimum:
25513 // res = vv
25514 return VectorizedValue;
25515 case RecurKind::Sub:
25516 case RecurKind::AddChainWithSubs:
25517 case RecurKind::Mul:
25518 case RecurKind::FMul:
25519 case RecurKind::FMulAdd:
25520 case RecurKind::AnyOf:
25521 case RecurKind::FindFirstIVSMin:
25522 case RecurKind::FindFirstIVUMin:
25523 case RecurKind::FindLastIVSMax:
25524 case RecurKind::FindLastIVUMax:
25525 case RecurKind::FMaxNum:
25526 case RecurKind::FMinNum:
25527 case RecurKind::FMaximumNum:
25528 case RecurKind::FMinimumNum:
25529 case RecurKind::None:
25530 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25531 }
25532 return nullptr;
25533 }
25534
25535 /// Emits actual operation for the scalar identity values, found during
25536 /// horizontal reduction analysis.
25537 Value *
25538 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25539 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25540 const DenseMap<Value *, Value *> &TrackedToOrig) {
25541 assert(IsSupportedHorRdxIdentityOp &&
25542 "The optimization of matched scalar identity horizontal reductions "
25543 "must be supported.");
25544 ArrayRef<Value *> VL = R.getRootNodeScalars();
25545 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25546 if (VTy->getElementType() != VL.front()->getType()) {
25547 VectorizedValue = Builder.CreateIntCast(
25548 VectorizedValue,
25549 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25550 R.isSignedMinBitwidthRootNode());
25551 }
25552 switch (RdxKind) {
25553 case RecurKind::Add: {
25554 // root = mul prev_root, <1, 1, n, 1>
25555 SmallVector<Constant *> Vals;
25556 for (Value *V : VL) {
25557 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25558 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25559 }
25560 auto *Scale = ConstantVector::get(Vals);
25561 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25562 << VectorizedValue << ". (HorRdx)\n");
25563 return Builder.CreateMul(VectorizedValue, Scale);
25564 }
25565 case RecurKind::And:
25566 case RecurKind::Or:
25567 // No need for multiple or/and(s).
25568 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25569 << ". (HorRdx)\n");
25570 return VectorizedValue;
25571 case RecurKind::SMax:
25572 case RecurKind::SMin:
25573 case RecurKind::UMax:
25574 case RecurKind::UMin:
25575 case RecurKind::FMax:
25576 case RecurKind::FMin:
25577 case RecurKind::FMaximum:
25578 case RecurKind::FMinimum:
25579 // No need for multiple min/max(s) of the same value.
25580 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25581 << ". (HorRdx)\n");
25582 return VectorizedValue;
25583 case RecurKind::Xor: {
25584 // Replace values with even number of repeats with 0, since
25585 // x xor x = 0.
25586 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25587 // 7>, if the 4th and 6th elements have an even number of repeats.
25588 SmallVector<int> Mask(
25589 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25590 PoisonMaskElem);
25591 std::iota(Mask.begin(), Mask.end(), 0);
25592 bool NeedShuffle = false;
25593 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25594 Value *V = VL[I];
25595 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25596 if (Cnt % 2 == 0) {
25597 Mask[I] = VF;
25598 NeedShuffle = true;
25599 }
25600 }
25601 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25602 : Mask) dbgs()
25603 << I << " ";
25604 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25605 if (NeedShuffle)
25606 VectorizedValue = Builder.CreateShuffleVector(
25607 VectorizedValue,
25608 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25609 return VectorizedValue;
25610 }
25611 case RecurKind::FAdd: {
25612 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25613 SmallVector<Constant *> Vals;
25614 for (Value *V : VL) {
25615 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25616 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25617 }
25618 auto *Scale = ConstantVector::get(Vals);
25619 return Builder.CreateFMul(VectorizedValue, Scale);
25620 }
25621 case RecurKind::Sub:
25622 case RecurKind::AddChainWithSubs:
25623 case RecurKind::Mul:
25624 case RecurKind::FMul:
25625 case RecurKind::FMulAdd:
25626 case RecurKind::AnyOf:
25627 case RecurKind::FindFirstIVSMin:
25628 case RecurKind::FindFirstIVUMin:
25629 case RecurKind::FindLastIVSMax:
25630 case RecurKind::FindLastIVUMax:
25631 case RecurKind::FMaxNum:
25632 case RecurKind::FMinNum:
25633 case RecurKind::FMaximumNum:
25634 case RecurKind::FMinimumNum:
25635 case RecurKind::None:
25636 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25637 }
25638 return nullptr;
25639 }
25640};
25641} // end anonymous namespace
25642
25643/// Gets recurrence kind from the specified value.
25644 static RecurKind getRdxKind(Value *V) {
25645 return HorizontalReduction::getRdxKind(V);
25646}
25647static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25648 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25649 return cast<FixedVectorType>(IE->getType())->getNumElements();
25650
25651 unsigned AggregateSize = 1;
25652 auto *IV = cast<InsertValueInst>(InsertInst);
25653 Type *CurrentType = IV->getType();
25654 do {
25655 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25656 for (auto *Elt : ST->elements())
25657 if (Elt != ST->getElementType(0)) // check homogeneity
25658 return std::nullopt;
25659 AggregateSize *= ST->getNumElements();
25660 CurrentType = ST->getElementType(0);
25661 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25662 AggregateSize *= AT->getNumElements();
25663 CurrentType = AT->getElementType();
25664 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25665 AggregateSize *= VT->getNumElements();
25666 return AggregateSize;
25667 } else if (CurrentType->isSingleValueType()) {
25668 return AggregateSize;
25669 } else {
25670 return std::nullopt;
25671 }
25672 } while (true);
25673}
25674
25675static void findBuildAggregateRec(Instruction *LastInsertInst,
25676 TargetTransformInfo *TTI,
25677 SmallVectorImpl<Value *> &BuildVectorOpds,
25678 SmallVectorImpl<Value *> &InsertElts,
25679 unsigned OperandOffset, const BoUpSLP &R) {
25680 do {
25681 Value *InsertedOperand = LastInsertInst->getOperand(1);
25682 std::optional<unsigned> OperandIndex =
25683 getElementIndex(LastInsertInst, OperandOffset);
25684 if (!OperandIndex || R.isDeleted(LastInsertInst))
25685 return;
25686 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25687 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25688 BuildVectorOpds, InsertElts, *OperandIndex, R);
25689
25690 } else {
25691 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25692 InsertElts[*OperandIndex] = LastInsertInst;
25693 }
25694 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25695 } while (LastInsertInst != nullptr &&
25696 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25697 LastInsertInst->hasOneUse());
25698}
25699
25700/// Recognize construction of vectors like
25701/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25702/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25703/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25704/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25705/// starting from the last insertelement or insertvalue instruction.
25706///
25707/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25708/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25709/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25710///
25711/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25712///
25713/// \return true if it matches.
25714static bool findBuildAggregate(Instruction *LastInsertInst,
25715 TargetTransformInfo *TTI,
25716 SmallVectorImpl<Value *> &BuildVectorOpds,
25717 SmallVectorImpl<Value *> &InsertElts,
25718 const BoUpSLP &R) {
25719
25720 assert((isa<InsertElementInst>(LastInsertInst) ||
25721 isa<InsertValueInst>(LastInsertInst)) &&
25722 "Expected insertelement or insertvalue instruction!");
25723
25724 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25725 "Expected empty result vectors!");
25726
25727 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25728 if (!AggregateSize)
25729 return false;
25730 BuildVectorOpds.resize(*AggregateSize);
25731 InsertElts.resize(*AggregateSize);
25732
25733 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25734 llvm::erase(BuildVectorOpds, nullptr);
25735 llvm::erase(InsertElts, nullptr);
25736 if (BuildVectorOpds.size() >= 2)
25737 return true;
25738
25739 return false;
25740}
25741
25742/// Try and get a reduction instruction from a phi node.
25743///
25744/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25745/// if they come from either \p ParentBB or a containing loop latch.
25746///
25747/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25748/// if not possible.
25749 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25750 BasicBlock *ParentBB, LoopInfo *LI) {
25751 // There are situations where the reduction value is not dominated by the
25752 // reduction phi. Vectorizing such cases has been reported to cause
25753 // miscompiles. See PR25787.
25754 auto DominatedReduxValue = [&](Value *R) {
25755 return isa<Instruction>(R) &&
25756 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25757 };
25758
25759 Instruction *Rdx = nullptr;
25760
25761 // Return the incoming value if it comes from the same BB as the phi node.
25762 if (P->getIncomingBlock(0) == ParentBB) {
25763 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25764 } else if (P->getIncomingBlock(1) == ParentBB) {
25765 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25766 }
25767
25768 if (Rdx && DominatedReduxValue(Rdx))
25769 return Rdx;
25770
25771 // Otherwise, check whether we have a loop latch to look at.
25772 Loop *BBL = LI->getLoopFor(ParentBB);
25773 if (!BBL)
25774 return nullptr;
25775 BasicBlock *BBLatch = BBL->getLoopLatch();
25776 if (!BBLatch)
25777 return nullptr;
25778
25779 // There is a loop latch, return the incoming value if it comes from
25780 // that. This reduction pattern occasionally turns up.
25781 if (P->getIncomingBlock(0) == BBLatch) {
25782 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25783 } else if (P->getIncomingBlock(1) == BBLatch) {
25784 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25785 }
25786
25787 if (Rdx && DominatedReduxValue(Rdx))
25788 return Rdx;
25789
25790 return nullptr;
25791}
25792
25793static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25794 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25795 return true;
25796 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25797 return true;
25798 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25799 return true;
25800 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25801 return true;
25802 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25803 return true;
25804 if (match(I, m_SMax(m_Value(V0), m_Value(V1))))
25805 return true;
25806 if (match(I, m_SMin(m_Value(V0), m_Value(V1))))
25807 return true;
25808 if (match(I, m_UMax(m_Value(V0), m_Value(V1))))
25809 return true;
25810 if (match(I, m_UMin(m_Value(V0), m_Value(V1))))
25811 return true;
25812 return false;
25813}
25814
25815/// We could have an initial reduction that is not an add.
25816/// r *= v1 + v2 + v3 + v4
25817/// In such a case start looking for a tree rooted in the first '+'.
25818/// \Returns the new root if found, which may be nullptr if not an instruction.
25819 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25820 Instruction *Root) {
25821 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25822 isa<IntrinsicInst>(Root)) &&
25823 "Expected binop, select, or intrinsic for reduction matching");
25824 Value *LHS =
25825 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25826 Value *RHS =
25827 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25828 if (LHS == Phi)
25829 return dyn_cast<Instruction>(RHS);
25830 if (RHS == Phi)
25831 return dyn_cast<Instruction>(LHS);
25832 return nullptr;
25833}
25834
25835/// \p Returns the first operand of \p I that does not match \p Phi. If
25836/// operand is not an instruction it returns nullptr.
25837 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25838 Value *Op0 = nullptr;
25839 Value *Op1 = nullptr;
25840 if (!matchRdxBop(I, Op0, Op1))
25841 return nullptr;
25842 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25843}
25844
25845/// \Returns true if \p I is a candidate instruction for reduction vectorization.
25846 static bool isReductionCandidate(Instruction *I) {
25847 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25848 Value *B0 = nullptr, *B1 = nullptr;
25849 bool IsBinop = matchRdxBop(I, B0, B1);
25850 return IsBinop || IsSelect;
25851}
25852
25853bool SLPVectorizerPass::vectorizeHorReduction(
25854 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25855 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25856 if (!ShouldVectorizeHor)
25857 return false;
25858 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25859
25860 if (Root->getParent() != BB || isa<PHINode>(Root))
25861 return false;
25862
25863 // If we can find a secondary reduction root, use that instead.
25864 auto SelectRoot = [&]() {
25865 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25866 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25867 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25868 return NewRoot;
25869 return Root;
25870 };
25871
25872 // Start analysis starting from Root instruction. If horizontal reduction is
25873 // found, try to vectorize it. If it is not a horizontal reduction or
25874 // vectorization is not possible or not effective, and currently analyzed
25875 // instruction is a binary operation, try to vectorize the operands, using
25876 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25877 // the same procedure considering each operand as a possible root of the
25878 // horizontal reduction.
25879 // Interrupt the process if the Root instruction itself was vectorized or all
25880 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
25881 // If a horizontal reduction was not matched or vectorized, we collect
25882 // instructions for possible later attempts for vectorization.
25883 std::queue<std::pair<Instruction *, unsigned>> Stack;
25884 Stack.emplace(SelectRoot(), 0);
25885 SmallPtrSet<Value *, 8> VisitedInstrs;
25886 bool Res = false;
25887 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25888 if (R.isAnalyzedReductionRoot(Inst))
25889 return nullptr;
25890 if (!isReductionCandidate(Inst))
25891 return nullptr;
25892 HorizontalReduction HorRdx;
25893 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25894 return nullptr;
25895 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25896 };
25897 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25898 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25899 FutureSeed = getNonPhiOperand(Root, P);
25900 if (!FutureSeed)
25901 return false;
25902 }
25903 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25904 // analysis is done separately.
25905 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25906 PostponedInsts.push_back(FutureSeed);
25907 return true;
25908 };
25909
25910 while (!Stack.empty()) {
25911 Instruction *Inst;
25912 unsigned Level;
25913 std::tie(Inst, Level) = Stack.front();
25914 Stack.pop();
25915 // Do not try to analyze an instruction that has already been vectorized.
25916 // This may happen when we vectorize instruction operands on a previous
25917 // iteration while the stack was populated before that happened.
25918 if (R.isDeleted(Inst))
25919 continue;
25920 if (Value *VectorizedV = TryToReduce(Inst)) {
25921 Res = true;
25922 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25923 // Try to find another reduction.
25924 Stack.emplace(I, Level);
25925 continue;
25926 }
25927 if (R.isDeleted(Inst))
25928 continue;
25929 } else {
25930 // We could not vectorize `Inst` so try to use it as a future seed.
25931 if (!TryAppendToPostponedInsts(Inst)) {
25932 assert(Stack.empty() && "Expected empty stack");
25933 break;
25934 }
25935 }
25936
25937 // Try to vectorize operands.
25938 // Continue analysis for the instruction from the same basic block only to
25939 // save compile time.
25940 if (++Level < RecursionMaxDepth)
25941 for (auto *Op : Inst->operand_values())
25942 if (VisitedInstrs.insert(Op).second)
25943 if (auto *I = dyn_cast<Instruction>(Op))
25944 // Do not try to vectorize CmpInst operands, this is done
25945 // separately.
25946 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
25947 !R.isDeleted(I) && I->getParent() == BB)
25948 Stack.emplace(I, Level);
25949 }
25950 return Res;
25951}
25952
25953bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25954 if (!I)
25955 return false;
25956
25957 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25958 return false;
25959 // Skip potential FMA candidates.
25960 if ((I->getOpcode() == Instruction::FAdd ||
25961 I->getOpcode() == Instruction::FSub) &&
25962 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25963 .isValid())
25964 return false;
25965
25966 Value *P = I->getParent();
25967
25968 // Vectorize in current basic block only.
25969 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25970 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25971 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25972 R.isDeleted(Op0) || R.isDeleted(Op1))
25973 return false;
25974
25975 // First collect all possible candidates
25976 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25977 Candidates.emplace_back(Op0, Op1);
25978
25979 auto *A = dyn_cast<BinaryOperator>(Op0);
25980 auto *B = dyn_cast<BinaryOperator>(Op1);
25981 // Try to skip B.
25982 if (A && B && B->hasOneUse()) {
25983 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25984 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25985 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25986 Candidates.emplace_back(A, B0);
25987 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25988 Candidates.emplace_back(A, B1);
25989 }
25990 // Try to skip A.
25991 if (B && A && A->hasOneUse()) {
25992 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25993 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25994 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25995 Candidates.emplace_back(A0, B);
25996 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25997 Candidates.emplace_back(A1, B);
25998 }
25999
26000 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
26001 ArrayRef<Value *> Ops) {
26002 if (!isReductionCandidate(Inst))
26003 return false;
26004 Type *Ty = Inst->getType();
26005 if (!isValidElementType(Ty) || Ty->isPointerTy())
26006 return false;
26007 HorizontalReduction HorRdx(Inst, Ops);
26008 if (!HorRdx.matchReductionForOperands())
26009 return false;
26010 // Check the cost of operations.
26011 VectorType *VecTy = getWidenedType(Ty, Ops.size());
26012 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
26013 InstructionCost ScalarCost =
26014 TTI.getScalarizationOverhead(
26015 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
26016 /*Extract=*/true, CostKind) +
26017 TTI.getInstructionCost(Inst, CostKind);
26018 InstructionCost RedCost;
26019 switch (::getRdxKind(Inst)) {
26020 case RecurKind::Add:
26021 case RecurKind::Mul:
26022 case RecurKind::Or:
26023 case RecurKind::And:
26024 case RecurKind::Xor:
26025 case RecurKind::FAdd:
26026 case RecurKind::FMul: {
26027 FastMathFlags FMF;
26028 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
26029 FMF = FPCI->getFastMathFlags();
26030 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
26031 CostKind);
26032 break;
26033 }
26034 default:
26035 return false;
26036 }
26037 if (RedCost >= ScalarCost)
26038 return false;
26039
26040 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
26041 };
26042 if (Candidates.size() == 1)
26043 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
26044
26045 // We have multiple options. Try to pick the single best.
26046 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
26047 if (!BestCandidate)
26048 return false;
26049 return (*BestCandidate == 0 &&
26050 TryToReduce(I, {Candidates[*BestCandidate].first,
26051 Candidates[*BestCandidate].second})) ||
26052 tryToVectorizeList({Candidates[*BestCandidate].first,
26053 Candidates[*BestCandidate].second},
26054 R);
26055}
26056
26057bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
26058 BasicBlock *BB, BoUpSLP &R) {
26059 SmallVector<WeakTrackingVH> PostponedInsts;
26060 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
26061 Res |= tryToVectorize(PostponedInsts, R);
26062 return Res;
26063}
26064
26065bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
26066 BoUpSLP &R) {
26067 bool Res = false;
26068 for (Value *V : Insts)
26069 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
26070 Res |= tryToVectorize(Inst, R);
26071 return Res;
26072}
26073
26074bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
26075 BasicBlock *BB, BoUpSLP &R,
26076 bool MaxVFOnly) {
26077 if (!R.canMapToVector(IVI->getType()))
26078 return false;
26079
26080 SmallVector<Value *, 16> BuildVectorOpds;
26081 SmallVector<Value *, 16> BuildVectorInsts;
26082 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
26083 return false;
26084
26085 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
26086 R.getORE()->emit([&]() {
26087 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
26088 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
26089 "trying reduction first.";
26090 });
26091 return false;
26092 }
26093 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
26094 // Aggregate value is unlikely to be processed in vector register.
26095 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
26096}
26097
26098bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
26099 BasicBlock *BB, BoUpSLP &R,
26100 bool MaxVFOnly) {
26101 SmallVector<Value *, 16> BuildVectorInsts;
26102 SmallVector<Value *, 16> BuildVectorOpds;
26103 SmallVector<int> Mask;
26104 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
26105 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
26106 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
26107 return false;
26108
26109 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
26110 R.getORE()->emit([&]() {
26111 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
26112 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
26113 "trying reduction first.";
26114 });
26115 return false;
26116 }
26117 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
26118 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
26119}
26120
26121template <typename T>
26122 static bool tryToVectorizeSequence(
26123 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
26124 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
26125 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
26126 bool MaxVFOnly, BoUpSLP &R) {
26127 bool Changed = false;
26128 // Sort by type, parent, operands.
26129 stable_sort(Incoming, Comparator);
26130
26131 // Try to vectorize elements based on their type.
26132 SmallVector<T *> Candidates;
26133 SmallVector<T *> VL;
26134 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
26135 VL.clear()) {
26136 // Look for the next elements with the same type, parent and operand
26137 // kinds.
26138 auto *I = dyn_cast<Instruction>(*IncIt);
26139 if (!I || R.isDeleted(I)) {
26140 ++IncIt;
26141 continue;
26142 }
26143 auto *SameTypeIt = IncIt;
26144 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
26145 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26146 AreCompatible(VL, *SameTypeIt))) {
26147 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26148 ++SameTypeIt;
26149 if (I && !R.isDeleted(I))
26150 VL.push_back(cast<T>(I));
26151 }
26152
26153 // Try to vectorize them.
26154 unsigned NumElts = VL.size();
26155 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
26156 << NumElts << ")\n");
26157 // The vectorization is a 3-state attempt:
26158 // 1. Try to vectorize instructions with the same/alternate opcodes with the
26159 // size of maximal register at first.
26160 // 2. Try to vectorize remaining instructions with the same type, if
26161 // possible. This may produce better vectorization results than trying to
26162 // vectorize only instructions with the same/alternate opcodes.
26163 // 3. Final attempt to try to vectorize all instructions with the
26164 // same/alternate ops only, this may result in some extra final
26165 // vectorization.
26166 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
26167 // Success, start over because instructions might have been changed.
26168 Changed = true;
26169 VL.swap(Candidates);
26170 Candidates.clear();
26171 for (T *V : VL) {
26172 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26173 Candidates.push_back(V);
26174 }
26175 } else {
26176 /// \Returns the minimum number of elements that we will attempt to
26177 /// vectorize.
26178 auto GetMinNumElements = [&R](Value *V) {
26179 unsigned EltSize = R.getVectorElementSize(V);
26180 return std::max(2U, R.getMaxVecRegSize() / EltSize);
26181 };
26182 if (NumElts < GetMinNumElements(*IncIt) &&
26183 (Candidates.empty() ||
26184 Candidates.front()->getType() == (*IncIt)->getType())) {
26185 for (T *V : VL) {
26186 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26187 Candidates.push_back(V);
26188 }
26189 }
26190 }
26191 // Final attempt to vectorize instructions with the same types.
26192 if (Candidates.size() > 1 &&
26193 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
26194 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
26195 // Success, start over because instructions might have been changed.
26196 Changed = true;
26197 } else if (MaxVFOnly) {
26198 // Try to vectorize using small vectors.
26199 SmallVector<T *> VL;
26200 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
26201 VL.clear()) {
26202 auto *I = dyn_cast<Instruction>(*It);
26203 if (!I || R.isDeleted(I)) {
26204 ++It;
26205 continue;
26206 }
26207 auto *SameTypeIt = It;
26208 while (SameTypeIt != End &&
26209 (!isa<Instruction>(*SameTypeIt) ||
26210 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26211 AreCompatible(*SameTypeIt, *It))) {
26212 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26213 ++SameTypeIt;
26214 if (I && !R.isDeleted(I))
26215 VL.push_back(cast<T>(I));
26216 }
26217 unsigned NumElts = VL.size();
26218 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26219 /*MaxVFOnly=*/false))
26220 Changed = true;
26221 It = SameTypeIt;
26222 }
26223 }
26224 Candidates.clear();
26225 }
26226
26227 // Start over at the next instruction of a different type (or the end).
26228 IncIt = SameTypeIt;
26229 }
26230 return Changed;
26231}
26232
26233/// Compare two cmp instructions. If IsCompatibility is true, function returns
26234 /// true if 2 cmps have same/swapped predicates and compatible corresponding
26235/// operands. If IsCompatibility is false, function implements strict weak
26236/// ordering relation between two cmp instructions, returning true if the first
26237/// instruction is "less" than the second, i.e. its predicate is less than the
26238/// predicate of the second or the operands IDs are less than the operands IDs
26239/// of the second cmp instruction.
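/// For instance (illustrative, not part of the original comment), 'icmp slt
/// %x, %y' and 'icmp sgt %y, %x' use swapped predicates and swapped operands,
/// so they are treated as compatible when IsCompatibility is true.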
26240template <bool IsCompatibility>
26241static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
26242 const DominatorTree &DT) {
26243 assert(isValidElementType(V->getType()) &&
26244 isValidElementType(V2->getType()) &&
26245 "Expected valid element types only.");
26246 if (V == V2)
26247 return IsCompatibility;
26248 auto *CI1 = cast<CmpInst>(V);
26249 auto *CI2 = cast<CmpInst>(V2);
26250 if (CI1->getOperand(0)->getType()->getTypeID() <
26251 CI2->getOperand(0)->getType()->getTypeID())
26252 return !IsCompatibility;
26253 if (CI1->getOperand(0)->getType()->getTypeID() >
26254 CI2->getOperand(0)->getType()->getTypeID())
26255 return false;
26256 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26257 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26258 return !IsCompatibility;
26259 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26260 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26261 return false;
26262 CmpInst::Predicate Pred1 = CI1->getPredicate();
26263 CmpInst::Predicate Pred2 = CI2->getPredicate();
26264 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
26265 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
26266 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
26267 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
26268 if (BasePred1 < BasePred2)
26269 return !IsCompatibility;
26270 if (BasePred1 > BasePred2)
26271 return false;
26272 // Compare operands.
26273 bool CI1Preds = Pred1 == BasePred1;
26274 bool CI2Preds = Pred2 == BasePred1;
26275 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26276 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26277 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
26278 if (Op1 == Op2)
26279 continue;
26280 if (Op1->getValueID() < Op2->getValueID())
26281 return !IsCompatibility;
26282 if (Op1->getValueID() > Op2->getValueID())
26283 return false;
26284 if (auto *I1 = dyn_cast<Instruction>(Op1))
26285 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
26286 if (IsCompatibility) {
26287 if (I1->getParent() != I2->getParent())
26288 return false;
26289 } else {
26290 // Try to compare nodes with same parent.
26291 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
26292 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
26293 if (!NodeI1)
26294 return NodeI2 != nullptr;
26295 if (!NodeI2)
26296 return false;
26297 assert((NodeI1 == NodeI2) ==
26298 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26299 "Different nodes should have different DFS numbers");
26300 if (NodeI1 != NodeI2)
26301 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26302 }
26303 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26304 if (S && (IsCompatibility || !S.isAltShuffle()))
26305 continue;
26306 if (IsCompatibility)
26307 return false;
26308 if (I1->getOpcode() != I2->getOpcode())
26309 return I1->getOpcode() < I2->getOpcode();
26310 }
26311 }
26312 return IsCompatibility;
26313}
26314
26315template <typename ItT>
26316bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26317 BasicBlock *BB, BoUpSLP &R) {
26318 bool Changed = false;
26319 // Try to find reductions first.
26320 for (CmpInst *I : CmpInsts) {
26321 if (R.isDeleted(I))
26322 continue;
26323 for (Value *Op : I->operands())
26324 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26325 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26326 if (R.isDeleted(I))
26327 break;
26328 }
26329 }
26330 // Try to vectorize operands as vector bundles.
26331 for (CmpInst *I : CmpInsts) {
26332 if (R.isDeleted(I))
26333 continue;
26334 Changed |= tryToVectorize(I, R);
26335 }
26336 // Try to vectorize list of compares.
26337 // Sort by type, compare predicate, etc.
26338 auto CompareSorter = [&](Value *V, Value *V2) {
26339 if (V == V2)
26340 return false;
26341 return compareCmp<false>(V, V2, *TLI, *DT);
26342 };
26343
26344 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
26345 if (VL.empty() || VL.back() == V1)
26346 return true;
26347 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
26348 };
26349
26350 SmallVector<Value *> Vals;
26351 for (Instruction *V : CmpInsts)
26352 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
26353 Vals.push_back(V);
26354 if (Vals.size() <= 1)
26355 return Changed;
26356 Changed |= tryToVectorizeSequence<Value>(
26357 Vals, CompareSorter, AreCompatibleCompares,
26358 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26359 // Exclude possible reductions from other blocks.
26360 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26361 return any_of(V->users(), [V](User *U) {
26362 auto *Select = dyn_cast<SelectInst>(U);
26363 return Select &&
26364 Select->getParent() != cast<Instruction>(V)->getParent();
26365 });
26366 });
26367 if (ArePossiblyReducedInOtherBlock)
26368 return false;
26369 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26370 },
26371 /*MaxVFOnly=*/true, R);
26372 return Changed;
26373}
26374
26375bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
26376 BasicBlock *BB, BoUpSLP &R) {
26378 "This function only accepts Insert instructions");
26379 bool OpsChanged = false;
26380 SmallVector<WeakTrackingVH> PostponedInsts;
26381 for (auto *I : reverse(Instructions)) {
26382 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
26383 if (R.isDeleted(I) || isa<CmpInst>(I))
26384 continue;
26385 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26386 OpsChanged |=
26387 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
26388 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26389 OpsChanged |=
26390 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
26391 }
26392 // pass2 - try to vectorize reductions only
26393 if (R.isDeleted(I))
26394 continue;
26395 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26396 if (R.isDeleted(I) || isa<CmpInst>(I))
26397 continue;
26398 // pass3 - try to match and vectorize a buildvector sequence.
26399 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26400 OpsChanged |=
26401 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
26402 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26403 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26404 /*MaxVFOnly=*/false);
26405 }
26406 }
26407 // Now try to vectorize postponed instructions.
26408 OpsChanged |= tryToVectorize(PostponedInsts, R);
26409
26410 Instructions.clear();
26411 return OpsChanged;
26412}
26413
26414bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26415 bool Changed = false;
26416 SmallVector<Value *, 4> Incoming;
26417 SmallPtrSet<Value *, 16> VisitedInstrs;
26418 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
26419 // node. Allows better to identify the chains that can be vectorized in the
26420 // better way.
26421 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26422 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
26423 assert(isValidElementType(V1->getType()) &&
26424 isValidElementType(V2->getType()) &&
26425 "Expected vectorizable types only.");
26426 if (V1 == V2)
26427 return false;
26428 // It is fine to compare type IDs here, since we expect only vectorizable
26429 // types, like ints, floats and pointers; we don't care about other types.
26430 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
26431 return true;
26432 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
26433 return false;
26434 if (V1->getType()->getScalarSizeInBits() <
26435 V2->getType()->getScalarSizeInBits())
26436 return true;
26437 if (V1->getType()->getScalarSizeInBits() >
26438 V2->getType()->getScalarSizeInBits())
26439 return false;
26440 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26441 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26442 if (Opcodes1.size() < Opcodes2.size())
26443 return true;
26444 if (Opcodes1.size() > Opcodes2.size())
26445 return false;
26446 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26447 {
26448 // Instructions come first.
26449 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26450 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26451 if (I1 && I2) {
26452 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26453 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26454 if (!NodeI1)
26455 return NodeI2 != nullptr;
26456 if (!NodeI2)
26457 return false;
26458 assert((NodeI1 == NodeI2) ==
26459 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26460 "Different nodes should have different DFS numbers");
26461 if (NodeI1 != NodeI2)
26462 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26463 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26464 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26465 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26466 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26467 if (!E1 || !E2)
26468 continue;
26469
26470 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26471 // program order of the vector operands.
26472 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26473 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26474 if (V1 != V2) {
26475 if (V1 && !V2)
26476 return true;
26477 if (!V1 && V2)
26478 return false;
26479 DomTreeNodeBase<BasicBlock> *NodeI1 =
26480 DT->getNode(V1->getParent());
26481 DomTreeNodeBase<BasicBlock> *NodeI2 =
26482 DT->getNode(V2->getParent());
26483 if (!NodeI1)
26484 return NodeI2 != nullptr;
26485 if (!NodeI2)
26486 return false;
26487 assert((NodeI1 == NodeI2) ==
26488 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26489 "Different nodes should have different DFS numbers");
26490 if (NodeI1 != NodeI2)
26491 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26492 return V1->comesBefore(V2);
26493 }
26494 // If we have the same vector operand, try to sort by constant
26495 // index.
26496 std::optional<unsigned> Id1 = getExtractIndex(E1);
26497 std::optional<unsigned> Id2 = getExtractIndex(E2);
26498 // Bring constants to the top
26499 if (Id1 && !Id2)
26500 return true;
26501 if (!Id1 && Id2)
26502 return false;
26503 // First elements come first.
26504 if (Id1 && Id2)
26505 return *Id1 < *Id2;
26506
26507 continue;
26508 }
26509 if (I1->getOpcode() == I2->getOpcode())
26510 continue;
26511 return I1->getOpcode() < I2->getOpcode();
26512 }
26513 if (I1)
26514 return true;
26515 if (I2)
26516 return false;
26517 }
26518 {
26519 // Non-undef constants come next.
26520 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26521 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26522 if (C1 && C2)
26523 continue;
26524 if (C1)
26525 return true;
26526 if (C2)
26527 return false;
26528 }
26529 bool U1 = isa<UndefValue>(Opcodes1[I]);
26530 bool U2 = isa<UndefValue>(Opcodes2[I]);
26531 {
26532 // Non-constant non-instructions come next.
26533 if (!U1 && !U2) {
26534 auto ValID1 = Opcodes1[I]->getValueID();
26535 auto ValID2 = Opcodes2[I]->getValueID();
26536 if (ValID1 == ValID2)
26537 continue;
26538 if (ValID1 < ValID2)
26539 return true;
26540 if (ValID1 > ValID2)
26541 return false;
26542 }
26543 if (!U1)
26544 return true;
26545 if (!U2)
26546 return false;
26547 }
26548 // Undefs come last.
26549 assert(U1 && U2 && "The only thing left should be undef & undef.");
26550 }
26551 return false;
26552 };
26553 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26554 Value *V1) {
26555 if (VL.empty() || V1 == VL.back())
26556 return true;
26557 Value *V2 = VL.back();
26558 if (V1->getType() != V2->getType())
26559 return false;
26560 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26561 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26562 if (Opcodes1.size() != Opcodes2.size())
26563 return false;
26564 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26565 // Undefs are compatible with any other value.
26566 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26567 continue;
26568 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26569 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26570 if (R.isDeleted(I1) || R.isDeleted(I2))
26571 return false;
26572 if (I1->getParent() != I2->getParent())
26573 return false;
26574 if (getSameOpcode({I1, I2}, *TLI))
26575 continue;
26576 return false;
26577 }
26578 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26579 continue;
26580 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26581 return false;
26582 }
26583 return true;
26584 };
26585
26586 bool HaveVectorizedPhiNodes = false;
26587 do {
26588 // Collect the incoming values from the PHIs.
26589 Incoming.clear();
26590 for (Instruction &I : *BB) {
26591 auto *P = dyn_cast<PHINode>(&I);
26592 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26593 break;
26594
26595 // No need to analyze deleted, vectorized and non-vectorizable
26596 // instructions.
26597 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26598 isValidElementType(P->getType()))
26599 Incoming.push_back(P);
26600 }
26601
26602 if (Incoming.size() <= 1)
26603 break;
26604
26605 // Find the corresponding non-phi nodes for better matching when trying to
26606 // build the tree.
26607 for (Value *V : Incoming) {
26608 SmallVectorImpl<Value *> &Opcodes =
26609 PHIToOpcodes.try_emplace(V).first->getSecond();
26610 if (!Opcodes.empty())
26611 continue;
26612 SmallVector<Value *, 4> Nodes(1, V);
26613 SmallPtrSet<Value *, 4> Visited;
26614 while (!Nodes.empty()) {
26615 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26616 if (!Visited.insert(PHI).second)
26617 continue;
26618 for (Value *V : PHI->incoming_values()) {
26619 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26620 Nodes.push_back(PHI1);
26621 continue;
26622 }
26623 Opcodes.emplace_back(V);
26624 }
26625 }
26626 }
26627
26628 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26629 Incoming, PHICompare, AreCompatiblePHIs,
26630 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26631 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26632 },
26633 /*MaxVFOnly=*/true, R);
26634 Changed |= HaveVectorizedPhiNodes;
26635 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26636 auto *PHI = dyn_cast<PHINode>(P.first);
26637 return !PHI || R.isDeleted(PHI);
26638 }))
26639 PHIToOpcodes.clear();
26640 VisitedInstrs.insert_range(Incoming);
26641 } while (HaveVectorizedPhiNodes);
26642
26643 VisitedInstrs.clear();
26644
26645 InstSetVector PostProcessInserts;
26646 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26647 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26648 // also vectorizes `PostProcessCmps`.
26649 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26650 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26651 if (VectorizeCmps) {
26652 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26653 PostProcessCmps.clear();
26654 }
26655 PostProcessInserts.clear();
26656 return Changed;
26657 };
26658 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26659 auto IsInPostProcessInstrs = [&](Instruction *I) {
26660 if (auto *Cmp = dyn_cast<CmpInst>(I))
26661 return PostProcessCmps.contains(Cmp);
26662 return isa<InsertElementInst, InsertValueInst>(I) &&
26663 PostProcessInserts.contains(I);
26664 };
26665 // Returns true if `I` is an instruction without users, such as a terminator,
26666 // a store, or a function call with an ignored return value. Non-void unused
26667 // instructions are only considered if they are CallInst or InvokeInst.
26668 auto HasNoUsers = [](Instruction *I) {
26669 return I->use_empty() &&
26670 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26671 };
26672 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26673 // Skip instructions with scalable types. The number of elements is unknown
26674 // at compile time for scalable types.
26675 if (isa<ScalableVectorType>(It->getType()))
26676 continue;
26677
26678 // Skip instructions marked for deletion.
26679 if (R.isDeleted(&*It))
26680 continue;
26681 // We may go through BB multiple times, so skip instructions already checked.
26682 if (!VisitedInstrs.insert(&*It).second) {
26683 if (HasNoUsers(&*It) &&
26684 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26686 // We would like to start over since some instructions are deleted
26687 // and the iterator may become invalid.
26687 Changed = true;
26688 It = BB->begin();
26689 E = BB->end();
26690 }
26691 continue;
26692 }
26693
26694 // Try to vectorize reductions that use PHINodes.
26695 if (PHINode *P = dyn_cast<PHINode>(It)) {
26696 // Check that the PHI is a reduction PHI.
26697 if (P->getNumIncomingValues() == 2) {
26698 // Try to match and vectorize a horizontal reduction.
26699 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26700 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26701 Changed = true;
26702 It = BB->begin();
26703 E = BB->end();
26704 continue;
26705 }
26706 }
26707 // Try to vectorize the incoming values of the PHI, to catch reductions
26708 // that feed into PHIs.
26709 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26710 // Skip if the incoming block is the current BB for now. Also, bypass
26711 // unreachable IR for efficiency and to avoid crashing.
26712 // TODO: Collect the skipped incoming values and try to vectorize them
26713 // after processing BB.
26714 if (BB == P->getIncomingBlock(I) ||
26715 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26716 continue;
26717
26718 // Postponed instructions should not be vectorized here, delay their
26719 // vectorization.
26720 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26721 PI && !IsInPostProcessInstrs(PI)) {
26722 bool Res =
26723 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26724 Changed |= Res;
26725 if (Res && R.isDeleted(P)) {
26726 It = BB->begin();
26727 E = BB->end();
26728 break;
26729 }
26730 }
26731 }
26732 continue;
26733 }
26734
26735 if (HasNoUsers(&*It)) {
26736 bool OpsChanged = false;
26737 auto *SI = dyn_cast<StoreInst>(It);
26738 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26739 if (SI) {
26740 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26741 // Try to vectorize chain in store, if this is the only store to the
26742 // address in the block.
26743 // TODO: This is just a temporary solution to save compile time. Need
26744 // to investigate if we can safely turn on slp-vectorize-hor-store
26745 // instead to allow lookup for reduction chains in all non-vectorized
26746 // stores (need to check side effects and compile time).
26747 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26748 SI->getValueOperand()->hasOneUse();
26749 }
26750 if (TryToVectorizeRoot) {
26751 for (auto *V : It->operand_values()) {
26752 // Postponed instructions should not be vectorized here, delay their
26753 // vectorization.
26754 if (auto *VI = dyn_cast<Instruction>(V);
26755 VI && !IsInPostProcessInstrs(VI))
26756 // Try to match and vectorize a horizontal reduction.
26757 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26758 }
26759 }
26760 // Start vectorization of post-process list of instructions from the
26761 // top-tree instructions to try to vectorize as many instructions as
26762 // possible.
26763 OpsChanged |=
26764 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26765 if (OpsChanged) {
26766 // We would like to start over since some instructions are deleted
26767 // and the iterator may become invalid.
26768 Changed = true;
26769 It = BB->begin();
26770 E = BB->end();
26771 continue;
26772 }
26773 }
26774
26775 if (isa<InsertElementInst, InsertValueInst>(It))
26776 PostProcessInserts.insert(&*It);
26777 else if (isa<CmpInst>(It))
26778 PostProcessCmps.insert(cast<CmpInst>(&*It));
26779 }
26780
26781 return Changed;
26782}
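Illustrative sketch (not part of SLPVectorizer.cpp; the function, parameter names and body are hypothetical) of the two scalar shapes the block scan above tries to root: a loop-carried reduction that shows up as a two-incoming-value PHI, and a horizontal add whose only user is a store.

// Hypothetical scalar input, for illustration only. The accumulator Sum is
// lowered to a reduction PHI that getReductionInstr/vectorizeRootInstruction
// can match, and the last assignment is a horizontal add feeding a single
// store, the case gated by ShouldStartVectorizeHorAtStore or the
// single-store-to-address check above.
int sumLanes(const int *A, int N, int *Out) {
  int Sum = 0;                          // becomes a PHI with two incoming values
  for (int I = 0; I < N; I += 4)        // assumes N is a multiple of 4
    Sum += A[I] + A[I + 1] + A[I + 2] + A[I + 3];
  Out[0] = A[0] + A[1] + A[2] + A[3];   // reduction whose only user is a store
  return Sum;
}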
26783
26784bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26785 auto Changed = false;
26786 for (auto &Entry : GEPs) {
26787 // If the getelementptr list has fewer than two elements, there's nothing
26788 // to do.
26789 if (Entry.second.size() < 2)
26790 continue;
26791
26792 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26793 << Entry.second.size() << ".\n");
26794
26795 // Process the GEP list in chunks suitable for the target's supported
26796 // vector size. If a vector register can't hold 1 element, we are done. We
26797 // are trying to vectorize the index computations, so the maximum number of
26798 // elements is based on the size of the index expression, rather than the
26799 // size of the GEP itself (the target's pointer size).
26800 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26801 return !R.isDeleted(GEP);
26802 });
26803 if (It == Entry.second.end())
26804 continue;
26805 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26806 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26807 if (MaxVecRegSize < EltSize)
26808 continue;
26809
26810 unsigned MaxElts = MaxVecRegSize / EltSize;
26811 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26812 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26813 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26814
26815 // Initialize a set of candidate getelementptrs. Note that we use a
26816 // SetVector here to preserve program order. If the index computations
26817 // are vectorizable and begin with loads, we want to minimize the chance
26818 // of having to reorder them later.
26819 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26820
26821 // Some of the candidates may have already been vectorized after we
26822 // initially collected them, or their index has been optimized to a constant
26823 // value. If so, they are marked as deleted, so remove them from the set of
26824 // candidates.
26825 Candidates.remove_if([&R](Value *I) {
26826 return R.isDeleted(cast<Instruction>(I)) ||
26827 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26828 });
26829
26830 // Remove from the set of candidates all pairs of getelementptrs with
26831 // constant differences. Such getelementptrs are likely not good
26832 // candidates for vectorization in a bottom-up phase since one can be
26833 // computed from the other. We also ensure all candidate getelementptr
26834 // indices are unique.
26835 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26836 auto *GEPI = GEPList[I];
26837 if (!Candidates.count(GEPI))
26838 continue;
26839 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26840 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26841 auto *GEPJ = GEPList[J];
26842 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26843 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26844 Candidates.remove(GEPI);
26845 Candidates.remove(GEPJ);
26846 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26847 Candidates.remove(GEPJ);
26848 }
26849 }
26850 }
26851
26852 // We break out of the above computation as soon as we know there are
26853 // fewer than two candidates remaining.
26854 if (Candidates.size() < 2)
26855 continue;
26856
26857 // Add the single, non-constant index of each candidate to the bundle. We
26858 // ensured the indices met these constraints when we originally collected
26859 // the getelementptrs.
26860 SmallVector<Value *, 16> Bundle(Candidates.size());
26861 auto BundleIndex = 0u;
26862 for (auto *V : Candidates) {
26863 auto *GEP = cast<GetElementPtrInst>(V);
26864 auto *GEPIdx = GEP->idx_begin()->get();
26865 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26866 Bundle[BundleIndex++] = GEPIdx;
26867 }
26868
26869 // Try and vectorize the indices. We are currently only interested in
26870 // gather-like cases of the form:
26871 //
26872 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26873 //
26874 // where the loads of "a", the loads of "b", and the subtractions can be
26875 // performed in parallel. It's likely that detecting this pattern in a
26876 // bottom-up phase will be simpler and less costly than building a
26877 // full-blown top-down phase beginning at the consecutive loads.
26878 Changed |= tryToVectorizeList(Bundle, R);
26879 }
26880 }
26881 return Changed;
26882}
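A hedged sketch, not taken from the LLVM sources (the function and array names are made up), of the gather-like pattern described in the comment above: the loads of a and b and the subtractions forming the indices into g are the part vectorizeGEPIndices hands to tryToVectorizeList.

// Illustration only: the g[] accesses stay as gathers, but the index
// computations a[i] - b[i] can be performed in parallel as a small vector tree.
int gatherSum(const int *g, const int *a, const int *b) {
  return g[a[0] - b[0]] + g[a[1] - b[1]] +
         g[a[2] - b[2]] + g[a[3] - b[3]];
}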
26883
26884bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26885 bool Changed = false;
26886 // Sort by type, base pointers and value operands. Value operands must be
26887 // compatible (have the same opcode, same parent); otherwise it is
26888 // definitely not profitable to try to vectorize them.
26889 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26890 if (V->getValueOperand()->getType()->getTypeID() <
26891 V2->getValueOperand()->getType()->getTypeID())
26892 return true;
26893 if (V->getValueOperand()->getType()->getTypeID() >
26894 V2->getValueOperand()->getType()->getTypeID())
26895 return false;
26896 if (V->getPointerOperandType()->getTypeID() <
26897 V2->getPointerOperandType()->getTypeID())
26898 return true;
26899 if (V->getPointerOperandType()->getTypeID() >
26900 V2->getPointerOperandType()->getTypeID())
26901 return false;
26902 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26903 V2->getValueOperand()->getType()->getScalarSizeInBits())
26904 return true;
26905 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26906 V2->getValueOperand()->getType()->getScalarSizeInBits())
26907 return false;
26908 // UndefValues are compatible with all other values.
26909 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26910 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26911 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26912 DT->getNode(I1->getParent());
26913 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26914 DT->getNode(I2->getParent());
26915 assert(NodeI1 && "Should only process reachable instructions");
26916 assert(NodeI2 && "Should only process reachable instructions");
26917 assert((NodeI1 == NodeI2) ==
26918 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26919 "Different nodes should have different DFS numbers");
26920 if (NodeI1 != NodeI2)
26921 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26922 return I1->getOpcode() < I2->getOpcode();
26923 }
26924 return V->getValueOperand()->getValueID() <
26925 V2->getValueOperand()->getValueID();
26926 };
26927
26928 bool SameParent = true;
26929 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26930 if (VL.empty()) {
26931 SameParent = true;
26932 return true;
26933 }
26934 StoreInst *V2 = VL.back();
26935 if (V1 == V2)
26936 return true;
26937 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26938 return false;
26939 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26940 return false;
26941 // Undefs are compatible with any other value.
26942 if (isa<UndefValue>(V1->getValueOperand()) ||
26943 isa<UndefValue>(V2->getValueOperand()))
26944 return true;
26945 if (isa<Constant>(V1->getValueOperand()) &&
26946 isa<Constant>(V2->getValueOperand()))
26947 return true;
26948 // Check if the operands of the stores can be vectorized. They can be
26949 // vectorized if they have compatible operands, or operands that can be
26950 // vectorized as copyables.
26951 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26952 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26953 if (I1 || I2) {
26954 // Accept only tail-following non-compatible values for now.
26955 // TODO: investigate if it is possible to vectorize incompatible values,
26956 // if the copyables are first in the list.
26957 if (I1 && !I2)
26958 return false;
26959 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26960 SmallVector<Value *> NewVL(VL.size() + 1);
26961 for (auto [SI, V] : zip(VL, NewVL))
26962 V = SI->getValueOperand();
26963 NewVL.back() = V1->getValueOperand();
26964 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26965 InstructionsState S = Analysis.buildInstructionsState(
26966 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26967 /*SkipSameCodeCheck=*/!SameParent);
26968 if (S)
26969 return true;
26970 if (!SameParent)
26971 return false;
26972 }
26973 return V1->getValueOperand()->getValueID() ==
26974 V2->getValueOperand()->getValueID();
26975 };
26976
26977 // Attempt to sort and vectorize each of the store-groups.
26978 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26979 for (auto &Pair : Stores) {
26980 if (Pair.second.size() < 2)
26981 continue;
26982
26983 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26984 << Pair.second.size() << ".\n");
26985
26986 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26987 continue;
26988
26989 // Reverse the stores to do bottom-to-top analysis. This is important if
26990 // the same address is stored to several times; in that case we need to
26991 // follow the store order (reversed to respect the memory dependencies).
26992 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26993 Pair.second.rend());
26994 Changed |= tryToVectorizeSequence<StoreInst>(
26995 ReversedStores, StoreSorter, AreCompatibleStores,
26996 [&](ArrayRef<StoreInst *> Candidates, bool) {
26997 return vectorizeStores(Candidates, R, Attempted);
26998 },
26999 /*MaxVFOnly=*/false, R);
27000 }
27001 return Changed;
27002}
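A minimal, hypothetical example (not part of this file) of the consecutive-store chains that vectorizeStoreChains collects per underlying object and hands to vectorizeStores: four adjacent stores with compatible value operands that can collapse into one vector store.

// Illustration only: StoreSorter/AreCompatibleStores group these stores (same
// value type, same pointer type, same opcode feeding the stored values), and
// vectorizeStores can then emit a single <4 x i32> store of the four sums.
void storeChain(int *Dst, const int *X, const int *Y) {
  Dst[0] = X[0] + Y[0];
  Dst[1] = X[1] + Y[1];
  Dst[2] = X[2] + Y[2];
  Dst[3] = X[3] + Y[3];
}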
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
Definition ExpandFp.cpp:993
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
static Type * getIndexType(Value *In)
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1407
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1397
void negate()
Negate this APInt in place.
Definition APInt.h:1469
unsigned logBase2() const
Definition APInt.h:1762
void setAllBits()
Set every bit to 1.
Definition APInt.h:1320
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1368
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:178
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:219
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:195
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:157
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool erase(const KeyT &Val)
Definition DenseMap.h:322
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:233
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:149
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:111
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
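A minimal sketch of the MapVector behaviour described above (illustrative helper): map-style lookup combined with deterministic, insertion-ordered iteration.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"

// Count occurrences of each value; entries iterate in first-insertion order.
static llvm::SmallVector<std::pair<int, unsigned>, 8>
countInOrder(llvm::ArrayRef<int> In) {
  llvm::MapVector<int, unsigned> Freq;
  for (int V : In)
    ++Freq[V]; // operator[] default-constructs missing entries to 0
  return llvm::SmallVector<std::pair<int, unsigned>, 8>(Freq.begin(),
                                                        Freq.end());
}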
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory referenced by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
T & front() const
front - Get the first element.
Definition ArrayRef.h:349
iterator end() const
Definition ArrayRef.h:343
iterator begin() const
Definition ArrayRef.h:342
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified SCEV is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
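As a rough illustration of these ScalarEvolution entry points (a hedged sketch, not code from this pass), the usual idiom for comparing two pointers is to subtract their SCEV expressions and inspect the result.
#include "llvm/Analysis/ScalarEvolution.h"

// Hypothetical helper: returns the SCEV expression for PtrB - PtrA; callers
// would typically dyn_cast the result to SCEVConstant to get a fixed offset.
static const llvm::SCEV *pointerDifference(llvm::ScalarEvolution &SE,
                                           llvm::Value *PtrA,
                                           llvm::Value *PtrB) {
  const llvm::SCEV *A = SE.getSCEV(PtrA);
  const llvm::SCEV *B = SE.getSCEV(PtrB);
  return SE.getMinusSCEV(B, A);
}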
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:58
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:89
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:130
void insert_range(Range &&R)
Definition SetVector.h:174
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:92
void clear()
Completely clear the SetVector.
Definition SetVector.h:265
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:250
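A minimal sketch (with a hypothetical helper name) of why SetVector suits worklists: insert() rejects duplicates while preserving insertion order, and takeVector() hands back the underlying vector.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"

// Deduplicate In while keeping the first-seen order of the elements.
static llvm::SmallVector<int, 8> uniqueInOrder(llvm::ArrayRef<int> In) {
  llvm::SmallSetVector<int, 8> Seen;
  for (int V : In)
    Seen.insert(V); // returns false (and does nothing) for duplicates
  return Seen.takeVector();
}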
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
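A small, self-contained sketch of the static mask classifiers above; the masks are made-up examples, not masks produced by this pass.
#include "llvm/IR/Instructions.h"

// Classify two hand-written masks with the static ShuffleVectorInst helpers.
static void classifyExampleMasks() {
  const int Reverse[] = {3, 2, 1, 0};
  bool IsReverse =
      llvm::ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4); // true
  const int Splat[] = {0, 0, 0, 0};
  bool IsSplat =
      llvm::ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4); // true
  (void)IsReverse;
  (void)IsSplat;
}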
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
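A minimal sketch of the usual iteration pattern over the set bits of a SmallBitVector using find_first()/find_next() (illustrative helper, not from this file).
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"

// Collect the indices of all set bits, in increasing order.
static llvm::SmallVector<int, 8>
setBitIndices(const llvm::SmallBitVector &Bits) {
  llvm::SmallVector<int, 8> Indices;
  for (int Idx = Bits.find_first(); Idx != -1; Idx = Bits.find_next(Idx))
    Indices.push_back(Idx);
  return Indices;
}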
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:337
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
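As a hedged illustration of the cost-model queries above (a sketch, not this pass's actual cost computation), one can compare VF scalar adds against a single vector add; the helper name and the comparison itself are invented for illustration.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

// Hypothetical helper: a negative result means the vector form looks cheaper
// under the reciprocal-throughput cost kind.
static llvm::InstructionCost
vectorMinusScalarAddCost(const llvm::TargetTransformInfo &TTI,
                         llvm::Type *ScalarTy, unsigned VF) {
  auto CostKind = llvm::TargetTransformInfo::TCK_RecipThroughput;
  auto *VecTy = llvm::FixedVectorType::get(ScalarTy, VF);
  llvm::InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, ScalarTy, CostKind);
  ScalarCost *= VF;
  llvm::InstructionCost VectorCost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy, CostKind);
  return VectorCost - ScalarCost;
}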
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:180
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:24
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
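A minimal sketch of the VectorType entries above (hypothetical helper): validate the element type before building a fixed-width vector type.
#include "llvm/IR/DerivedTypes.h"

// Returns a <VF x ScalarTy> type, or nullptr if ScalarTy cannot be an element.
static llvm::VectorType *makeFixedVectorTy(llvm::Type *ScalarTy, unsigned VF) {
  if (!llvm::VectorType::isValidElementType(ScalarTy))
    return nullptr;
  return llvm::VectorType::get(ScalarTy, llvm::ElementCount::getFixed(VF));
}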
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
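To tie the BoUpSLP entries above together, a heavily simplified sketch of how the pass drives this interface for one candidate bundle; reordering, node transformation, scheduling, and the real threshold logic are omitted, and the helper name is invented. BoUpSLP is local to this file, so the sketch is written as if it lived inside it.
// Simplified, illustrative driver for one bundle of scalars VL.
static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> VL,
                               InstructionCost Threshold) {
  const SmallDenseSet<Value *> NoIgnoredUsers;
  R.buildTree(VL, NoIgnoredUsers);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  const InstructionCost Cost = R.getTreeCost();
  if (Cost >= Threshold) // the real pass compares against -slp-threshold
    return false;
  R.vectorizeTree();
  return true;
}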
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
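A minimal sketch of the PatternMatch combinators listed above (invented helper): match a multiply whose left operand is an add and capture all three values.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Matches (X + Y) * Z with the add as the first multiply operand.
static bool matchAddTimesValue(llvm::Value *V, llvm::Value *&X,
                               llvm::Value *&Y, llvm::Value *&Z) {
  using namespace llvm::PatternMatch;
  return match(V, m_Mul(m_Add(m_Value(X), m_Value(Y)), m_Value(Z)));
}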
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1718
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
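A short sketch of the range helpers that appear throughout this index (enumerate, all_of, and friends); the helper name is illustrative.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"

// True when Order is the identity permutation 0, 1, 2, ...
static bool isIdentityPermutation(llvm::ArrayRef<unsigned> Order) {
  return llvm::all_of(llvm::enumerate(Order), [](const auto &P) {
    return P.value() == P.index();
  });
}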
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1724
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1981
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2128
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1968
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1763
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:339
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
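A small, hedged sketch of the consecutive-access check this function enables (helper name invented): two loads of the same element type are adjacent when their pointers differ by exactly one element.
#include <optional>
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"

// True when B loads the element immediately after A.
static bool areConsecutiveLoads(llvm::LoadInst *A, llvm::LoadInst *B,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      A->getType(), A->getPointerOperand(), B->getType(),
      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}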
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1920
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2030
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1835
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:257
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1425
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1434
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)