1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115
116DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
117 "Controls which SLP graphs should be vectorized.");
118
119static cl::opt<bool>
120 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
121 cl::desc("Run the SLP vectorization passes"));
122
123static cl::opt<bool>
124 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
125 cl::desc("Enable vectorization for wider vector utilization"));
126
127static cl::opt<int>
128 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
129 cl::desc("Only vectorize if you gain more than this "
130 "number "));
131
132static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
133 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
134 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
136
137static cl::opt<bool>
138ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
139 cl::desc("Attempt to vectorize horizontal reductions"));
140
141static cl::opt<bool> ShouldStartVectorizeHorAtStore(
142 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
143 cl::desc(
144 "Attempt to vectorize horizontal reductions feeding into a store"));
145
146static cl::opt<bool> SplitAlternateInstructions(
147 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
148 cl::desc("Improve the code quality by splitting alternate instructions"));
149
150static cl::opt<int>
151 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
152 cl::desc("Attempt to vectorize for this register size in bits"));
153
154static cl::opt<unsigned>
155 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
156 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
157
158/// Limits the size of scheduling regions in a block.
159/// It avoids long compile times for _very_ large blocks where vector
160/// instructions are spread over a wide range.
161/// This limit is way higher than needed by real-world functions.
162static cl::opt<int>
163ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
164 cl::desc("Limit the size of the SLP scheduling region per block"));
165
166static cl::opt<int> MinVectorRegSizeOption(
167 "slp-min-reg-size", cl::init(128), cl::Hidden,
168 cl::desc("Attempt to vectorize for this register size in bits"));
169
170static cl::opt<unsigned> RecursionMaxDepth(
171 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
172 cl::desc("Limit the recursion depth when building a vectorizable tree"));
173
174static cl::opt<unsigned> MinTreeSize(
175 "slp-min-tree-size", cl::init(3), cl::Hidden,
176 cl::desc("Only vectorize small trees if they are fully vectorizable"));
177
178// The maximum depth that the look-ahead score heuristic will explore.
179// The higher this value, the higher the compilation time overhead.
180static cl::opt<int> LookAheadMaxDepth(
181 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
182 cl::desc("The maximum look-ahead depth for operand reordering scores"));
183
184// The maximum depth that the look-ahead score heuristic will explore
185// when probing among candidates for vectorization tree roots.
186// The higher this value, the higher the compilation time overhead. Unlike the
187// similar limit for operand ordering, this one is used less frequently, so the
188// impact of a higher value is less noticeable.
189static cl::opt<int> RootLookAheadMaxDepth(
190 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
191 cl::desc("The maximum look-ahead depth for searching best rooting option"));
192
193static cl::opt<unsigned> MinProfitableStridedLoads(
194 "slp-min-strided-loads", cl::init(2), cl::Hidden,
195 cl::desc("The minimum number of loads, which should be considered strided, "
196 "if the stride is > 1 or is runtime value"));
197
198static cl::opt<unsigned> MaxProfitableLoadStride(
199 "slp-max-stride", cl::init(8), cl::Hidden,
200 cl::desc("The maximum stride, considered to be profitable."));
201
202static cl::opt<bool>
203 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
204 cl::desc("Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
206
207static cl::opt<bool>
208 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
209 cl::desc("Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
211
212static cl::opt<bool>
213 ViewSLPTree("view-slp-tree", cl::Hidden,
214 cl::desc("Display the SLP trees with Graphviz"));
215
216static cl::opt<bool> VectorizeNonPowerOf2(
217 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
218 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
219
220/// Enables vectorization of copyable elements.
221static cl::opt<bool> VectorizeCopyableElements(
222 "slp-copyable-elements", cl::init(true), cl::Hidden,
223 cl::desc("Try to replace values with the idempotent instructions for "
224 "better vectorization."));
225
226// Limit the number of alias checks. The limit is chosen so that
227// it has no negative effect on the llvm benchmarks.
228static const unsigned AliasedCheckLimit = 10;
229
230// Limit of the number of uses for potentially transformed instructions/values,
231// used in checks to avoid compile-time explosion.
232static constexpr int UsesLimit = 64;
233
234// Another limit for the alias checks: The maximum distance between load/store
235// instructions where alias checks are done.
236// This limit is useful for very large basic blocks.
237static const unsigned MaxMemDepDistance = 160;
238
239/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
240/// regions to be handled.
241static const int MinScheduleRegionSize = 16;
242
243/// Maximum allowed number of operands in the PHI nodes.
244static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important things to filter here are types which are invalid in LLVM
249/// vectors. We also filter target specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
255 if (SLPReVec && isa<FixedVectorType>(Ty))
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
260
261/// Returns the type of the given value/instruction \p V. If it is a store,
262/// returns the type of its value operand; for Cmp - the types of the compare
263/// operands; and for insertelement - the type of the inserted operand.
264/// Otherwise, just the type of the value is returned.
265static Type *getValueType(Value *V) {
266 if (auto *SI = dyn_cast<StoreInst>(V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(V))
269 return CI->getOperand(0)->getType();
270 if (auto *IE = dyn_cast<InsertElementInst>(V))
271 return IE->getOperand(1)->getType();
272 return V->getType();
273}
274
275/// \returns the number of elements for Ty.
276static unsigned getNumElements(Type *Ty) {
277 assert(!isa<ScalableVectorType>(Ty) &&
278 "ScalableVectorType is not supported.");
279 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
280 return VecTy->getNumElements();
281 return 1;
282}
283
284/// \returns the vector type of ScalarTy based on vectorization factor.
285static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
286 return FixedVectorType::get(ScalarTy->getScalarType(),
287 VF * getNumElements(ScalarTy));
288}
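// Illustrative note (added for exposition, not part of the upstream source):
// getWidenedType flattens a vector scalar type, which is what REVEC relies on.
// Assumed examples:
//   getWidenedType(i32, 4)       -> <4 x i32>
//   getWidenedType(<2 x i16>, 4) -> <8 x i16>   // 4 groups of 2 x i16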
289
290/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
291/// which forms a type that \p TTI splits into whole vector types during
292/// legalization.
293static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
294 Type *Ty, unsigned Sz) {
295 if (!isValidElementType(Ty))
296 return bit_ceil(Sz);
297 // Find the number of elements, which forms full vectors.
298 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
299 if (NumParts == 0 || NumParts >= Sz)
300 return bit_ceil(Sz);
301 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
302}
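// Illustrative sketch (not from the upstream source), assuming a target where
// TTI.getNumberOfParts(<6 x i32>) == 2 (e.g. 128-bit vector registers):
//   getFullVectorNumberOfElements(TTI, i32, 6)
//     == bit_ceil(divideCeil(6, 2)) * 2 == 4 * 2 == 8
// i.e. 6 x i32 is rounded up to 8 elements, which legalizes into two whole
// <4 x i32> parts.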
303
304/// Returns the number of elements of the given type \p Ty, not greater than \p
305/// Sz, which forms a type that \p TTI splits into whole vector types during
306/// legalization.
307static unsigned
308getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
309 unsigned Sz) {
310 if (!isValidElementType(Ty))
311 return bit_floor(Sz);
312 // Find the number of elements, which forms full vectors.
313 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
314 if (NumParts == 0 || NumParts >= Sz)
315 return bit_floor(Sz);
316 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
317 if (RegVF > Sz)
318 return bit_floor(Sz);
319 return (Sz / RegVF) * RegVF;
320}
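// Illustrative sketch (not from the upstream source), under the same assumed
// target as above (TTI.getNumberOfParts(<6 x i32>) == 2):
//   getFloorFullVectorNumberOfElements(TTI, i32, 6)
//     : RegVF = bit_ceil(divideCeil(6, 2)) == 4, result = (6 / 4) * 4 == 4
// i.e. the floor variant trims 6 elements down to one whole <4 x i32> register
// instead of rounding up.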
321
322static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
323 SmallVectorImpl<int> &Mask) {
324 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
325 // But the element has a different meaning for SLP (scalar) and REVEC
326 // (vector). We need to expand Mask into masks which shufflevector can use
327 // directly.
328 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
329 for (unsigned I : seq<unsigned>(Mask.size()))
330 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
331 I * VecTyNumElements, VecTyNumElements)))
332 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
333 : Mask[I] * VecTyNumElements + J;
334 Mask.swap(NewMask);
335}
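// Illustrative example (not from the upstream source): with VecTyNumElements == 2
// the scalar mask <1, 0> expands to the element-wise mask <2, 3, 0, 1>; a poison
// scalar lane stays PoisonMaskElem for each of its expanded positions.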
336
337/// \returns the number of groups of shufflevector
338/// A group has the following features:
339/// 1. All values in a group are shufflevectors.
340/// 2. The mask of each shufflevector satisfies isExtractSubvectorMask.
341/// 3. Together, the masks of all shufflevectors use all of the elements of the source.
342/// e.g., it is 1 group (%0)
343/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
344/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
345/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
346/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
347/// it is 2 groups (%3 and %4)
348/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
351/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
352/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
353/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
354/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
355/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
356/// it is 0 group
357/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
358/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
359/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
360/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
362 if (VL.empty())
363 return 0;
364 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
365 return 0;
366 auto *SV = cast<ShuffleVectorInst>(VL.front());
367 unsigned SVNumElements =
368 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
369 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
370 if (SVNumElements % ShuffleMaskSize != 0)
371 return 0;
372 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
373 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
374 return 0;
375 unsigned NumGroup = 0;
376 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
377 auto *SV = cast<ShuffleVectorInst>(VL[I]);
378 Value *Src = SV->getOperand(0);
379 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
380 SmallBitVector ExpectedIndex(GroupSize);
381 if (!all_of(Group, [&](Value *V) {
382 auto *SV = cast<ShuffleVectorInst>(V);
383 // From the same source.
384 if (SV->getOperand(0) != Src)
385 return false;
386 int Index;
387 if (!SV->isExtractSubvectorMask(Index))
388 return false;
389 ExpectedIndex.set(Index / ShuffleMaskSize);
390 return true;
391 }))
392 return 0;
393 if (!ExpectedIndex.all())
394 return 0;
395 ++NumGroup;
396 }
397 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
398 return NumGroup;
399}
400
401/// \returns a shufflevector mask which is used to vectorize shufflevectors
402/// e.g.,
403/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
408/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
409/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
410/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
411/// the result is
412/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
414 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
415 auto *SV = cast<ShuffleVectorInst>(VL.front());
416 unsigned SVNumElements =
417 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
418 SmallVector<int> Mask;
419 unsigned AccumulateLength = 0;
420 for (Value *V : VL) {
421 auto *SV = cast<ShuffleVectorInst>(V);
422 for (int M : SV->getShuffleMask())
423 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
424 : AccumulateLength + M);
425 AccumulateLength += SVNumElements;
426 }
427 return Mask;
428}
429
430/// \returns True if the value is a constant (but not globals/constant
431/// expressions).
432static bool isConstant(Value *V) {
433 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
434}
435
436/// Checks if \p V is one of vector-like instructions, i.e. undef,
437/// insertelement/extractelement with constant indices for fixed vector type or
438/// extractvalue instruction.
439static bool isVectorLikeInstWithConstOps(Value *V) {
440 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
441 !isa<ExtractValueInst, UndefValue>(V))
442 return false;
443 auto *I = dyn_cast<Instruction>(V);
444 if (!I || isa<ExtractValueInst>(I))
445 return true;
446 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
447 return false;
448 if (isa<ExtractElementInst>(I))
449 return isConstant(I->getOperand(1));
450 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
451 return isConstant(I->getOperand(2));
452}
453
454/// Returns power-of-2 number of elements in a single register (part), given the
455/// total number of elements \p Size and number of registers (parts) \p
456/// NumParts.
457static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
458 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
459}
460
461/// Returns correct remaining number of elements, considering total amount \p
462/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
463/// and current register (part) \p Part.
464static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
465 unsigned Part) {
466 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
467}
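// Illustrative example (not from the upstream source): splitting Size == 10
// elements into NumParts == 3 registers gives getPartNumElems(10, 3) == 4, and
// getNumElems(10, 4, Part) yields 4, 4 and 2 elements for Part = 0, 1, 2.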
468
469#if !defined(NDEBUG)
470/// Print a short descriptor of the instruction bundle suitable for debug output.
471static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
472 std::string Result;
473 raw_string_ostream OS(Result);
474 if (Idx >= 0)
475 OS << "Idx: " << Idx << ", ";
476 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
477 return Result;
478}
479#endif
480
481/// \returns true if all of the instructions in \p VL are in the same block or
482/// false otherwise.
483static bool allSameBlock(ArrayRef<Value *> VL) {
484 auto *It = find_if(VL, IsaPred<Instruction>);
485 if (It == VL.end())
486 return false;
487 Instruction *I0 = cast<Instruction>(*It);
488 if (all_of(VL, isVectorLikeInstWithConstOps))
489 return true;
490
491 BasicBlock *BB = I0->getParent();
492 for (Value *V : iterator_range(It, VL.end())) {
493 if (isa<PoisonValue>(V))
494 continue;
495 auto *II = dyn_cast<Instruction>(V);
496 if (!II)
497 return false;
498
499 if (BB != II->getParent())
500 return false;
501 }
502 return true;
503}
504
505/// \returns True if all of the values in \p VL are constants (but not
506/// globals/constant expressions).
507static bool allConstant(ArrayRef<Value *> VL) {
508 // Constant expressions and globals can't be vectorized like normal integer/FP
509 // constants.
510 return all_of(VL, isConstant);
511}
512
513/// \returns True if all of the values in \p VL are identical or some of them
514/// are UndefValue.
515static bool isSplat(ArrayRef<Value *> VL) {
516 Value *FirstNonUndef = nullptr;
517 for (Value *V : VL) {
518 if (isa<UndefValue>(V))
519 continue;
520 if (!FirstNonUndef) {
521 FirstNonUndef = V;
522 continue;
523 }
524 if (V != FirstNonUndef)
525 return false;
526 }
527 return FirstNonUndef != nullptr;
528}
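// Illustrative example (not from the upstream source): isSplat returns true for
// {%x, undef, %x, %x} because undefs are ignored, and false for {undef, undef}
// because no non-undef value is present.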
529
530/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
531/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
532/// patterns that make it effectively commutative (like equality comparisons
533/// with zero).
534/// In most cases, users should not call this function directly (since \p I and
535/// \p ValWithUses are the same). However, when analyzing interchangeable
536/// instructions, we need to use the converted opcode along with the original
537/// uses.
538/// \param I The instruction to check for commutativity
539/// \param ValWithUses The value whose uses are analyzed for special
540/// patterns
541static bool isCommutative(Instruction *I, Value *ValWithUses,
542 bool IsCopyable = false) {
543 if (auto *Cmp = dyn_cast<CmpInst>(I))
544 return Cmp->isCommutative();
545 if (auto *BO = dyn_cast<BinaryOperator>(I))
546 return BO->isCommutative() ||
547 (BO->getOpcode() == Instruction::Sub &&
548 ValWithUses->hasUseList() &&
549 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
550 all_of(
551 ValWithUses->uses(),
552 [&](const Use &U) {
553 // Commutative, if icmp eq/ne sub, 0
554 CmpPredicate Pred;
555 if (match(U.getUser(),
556 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
557 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
558 return true;
559 // Commutative, if abs(sub nsw, true) or abs(sub, false).
560 ConstantInt *Flag;
561 auto *I = dyn_cast<BinaryOperator>(U.get());
562 return match(U.getUser(),
563 m_Intrinsic<Intrinsic::abs>(
564 m_Specific(U.get()), m_ConstantInt(Flag))) &&
565 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
566 Flag->isOne());
567 })) ||
568 (BO->getOpcode() == Instruction::FSub &&
569 ValWithUses->hasUseList() &&
570 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
571 all_of(ValWithUses->uses(), [](const Use &U) {
572 return match(U.getUser(),
573 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
574 }));
575 return I->isCommutative();
576}
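// Illustrative IR (not from the upstream source) for the "effectively
// commutative" sub case above: if every use of %d looks like
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// then swapping %a and %b cannot change %c, so isCommutative(%d, %d) returns
// true even though sub itself is not commutative.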
577
578/// Checks if the operand is commutative. In commutative operations, not all
579/// operands might be commutable, e.g. for fmuladd only the first 2 operands are
580/// commutable.
581static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
582 bool IsCopyable = false) {
583 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
584 "The instruction is not commutative.");
585 if (isa<CmpInst>(I))
586 return true;
587 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
588 switch (BO->getOpcode()) {
589 case Instruction::Sub:
590 case Instruction::FSub:
591 return true;
592 default:
593 break;
594 }
595 }
596 return I->isCommutableOperand(Op);
597}
598
599/// This is a helper function to check whether \p I is commutative.
600/// This is a convenience wrapper that calls the two-parameter version of
601/// isCommutative with the same instruction for both parameters. This is
602/// the common case where the instruction being checked for commutativity
603/// is the same as the instruction whose uses are analyzed for special
604/// patterns (see the two-parameter version above for details).
605/// \param I The instruction to check for commutativity
606/// \returns true if the instruction is commutative, false otherwise
607static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
608
609/// \returns number of operands of \p I, considering commutativity. Returns 2
610/// for commutative intrinsics.
611/// \param I The instruction to check for commutativity
614 // IntrinsicInst::isCommutative returns true if swapping the first "two"
615 // arguments to the intrinsic produces the same result.
616 constexpr unsigned IntrinsicNumOperands = 2;
617 return IntrinsicNumOperands;
618 }
619 return I->getNumOperands();
620}
621
622template <typename T>
623static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
624 unsigned Offset) {
625 static_assert(std::is_same_v<T, InsertElementInst> ||
626 std::is_same_v<T, ExtractElementInst>,
627 "unsupported T");
628 int Index = Offset;
629 if (const auto *IE = dyn_cast<T>(Inst)) {
630 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
631 if (!VT)
632 return std::nullopt;
633 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
634 if (!CI)
635 return std::nullopt;
636 if (CI->getValue().uge(VT->getNumElements()))
637 return std::nullopt;
638 Index *= VT->getNumElements();
639 Index += CI->getZExtValue();
640 return Index;
641 }
642 return std::nullopt;
643}
644
645/// \returns inserting or extracting index of InsertElement, ExtractElement or
646/// InsertValue instruction, using Offset as base offset for index.
647/// \returns std::nullopt if the index is not an immediate.
648static std::optional<unsigned> getElementIndex(const Value *Inst,
649 unsigned Offset = 0) {
650 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
651 return Index;
652 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
653 return Index;
654
655 int Index = Offset;
656
657 const auto *IV = dyn_cast<InsertValueInst>(Inst);
658 if (!IV)
659 return std::nullopt;
660
661 Type *CurrentType = IV->getType();
662 for (unsigned I : IV->indices()) {
663 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
664 Index *= ST->getNumElements();
665 CurrentType = ST->getElementType(I);
666 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
667 Index *= AT->getNumElements();
668 CurrentType = AT->getElementType();
669 } else {
670 return std::nullopt;
671 }
672 Index += I;
673 }
674 return Index;
675}
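// Illustrative example (not from the upstream source): for
//   %r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
// the nested indices are linearized as (0 * 2 + 1) * 3 + 2 == 5, so
// getElementIndex returns 5.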
676
677/// \returns true if all of the values in \p VL use the same opcode.
678/// For comparison instructions, also checks if predicates match.
679/// PoisonValues are considered matching.
680/// Interchangeable instructions are not considered.
682 auto *It = find_if(VL, IsaPred<Instruction>);
683 if (It == VL.end())
684 return true;
685 Instruction *MainOp = cast<Instruction>(*It);
686 unsigned Opcode = MainOp->getOpcode();
687 bool IsCmpOp = isa<CmpInst>(MainOp);
688 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
689 : CmpInst::BAD_ICMP_PREDICATE;
690 return std::all_of(It, VL.end(), [&](Value *V) {
691 if (auto *CI = dyn_cast<CmpInst>(V))
692 return BasePred == CI->getPredicate();
693 if (auto *I = dyn_cast<Instruction>(V))
694 return I->getOpcode() == Opcode;
695 return isa<PoisonValue>(V);
696 });
697}
698
699namespace {
700/// Specifies the way the mask should be analyzed for undefs/poisonous elements
701/// in the shuffle mask.
702enum class UseMask {
703 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
704 ///< check for the mask elements for the first argument (mask
705 ///< indices are in range [0:VF)).
706 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
707 ///< for the mask elements for the second argument (mask indices
708 ///< are in range [VF:2*VF))
709 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
710 ///< future shuffle elements and mark them as ones as being used
711 ///< in future. Non-undef elements are considered as unused since
712 ///< they're already marked as used in the mask.
713};
714} // namespace
715
716/// Prepares a use bitset for the given mask either for the first argument or
717/// for the second.
718static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
719 UseMask MaskArg) {
720 SmallBitVector UseMask(VF, true);
721 for (auto [Idx, Value] : enumerate(Mask)) {
722 if (Value == PoisonMaskElem) {
723 if (MaskArg == UseMask::UndefsAsMask)
724 UseMask.reset(Idx);
725 continue;
726 }
727 if (MaskArg == UseMask::FirstArg && Value < VF)
728 UseMask.reset(Value);
729 else if (MaskArg == UseMask::SecondArg && Value >= VF)
730 UseMask.reset(Value - VF);
731 }
732 return UseMask;
733}
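// Illustrative example (not from the upstream source): for VF == 4 and the mask
// <0, 5, poison, 2>, buildUseMask(4, Mask, UseMask::FirstArg) resets bits 0 and
// 2 (lanes of the first vector consumed by the mask), while UseMask::SecondArg
// resets only bit 1, since mask value 5 maps to element 5 - VF == 1 of the
// second vector.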
734
735/// Checks if the given value is actually an undefined constant vector.
736/// Also, if the \p UseMask is not empty, tries to check if the non-masked
737/// elements actually mask the insertelement buildvector, if any.
738template <bool IsPoisonOnly = false>
739static SmallBitVector isUndefVector(const Value *V,
740 const SmallBitVector &UseMask = {}) {
741 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
742 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
743 if (isa<T>(V))
744 return Res;
745 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
746 if (!VecTy)
747 return Res.reset();
748 auto *C = dyn_cast<Constant>(V);
749 if (!C) {
750 if (!UseMask.empty()) {
751 const Value *Base = V;
752 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
753 Base = II->getOperand(0);
754 if (isa<T>(II->getOperand(1)))
755 continue;
756 std::optional<unsigned> Idx = getElementIndex(II);
757 if (!Idx) {
758 Res.reset();
759 return Res;
760 }
761 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
762 Res.reset(*Idx);
763 }
764 // TODO: Add analysis for shuffles here too.
765 if (V == Base) {
766 Res.reset();
767 } else {
768 SmallBitVector SubMask(UseMask.size(), false);
769 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
770 }
771 } else {
772 Res.reset();
773 }
774 return Res;
775 }
776 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
777 if (Constant *Elem = C->getAggregateElement(I))
778 if (!isa<T>(Elem) &&
779 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
780 Res.reset(I);
781 }
782 return Res;
783}
784
785/// Checks if the vector of instructions can be represented as a shuffle, like:
786/// %x0 = extractelement <4 x i8> %x, i32 0
787/// %x3 = extractelement <4 x i8> %x, i32 3
788/// %y1 = extractelement <4 x i8> %y, i32 1
789/// %y2 = extractelement <4 x i8> %y, i32 2
790/// %x0x0 = mul i8 %x0, %x0
791/// %x3x3 = mul i8 %x3, %x3
792/// %y1y1 = mul i8 %y1, %y1
793/// %y2y2 = mul i8 %y2, %y2
794/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
795/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
796/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
797/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
798/// ret <4 x i8> %ins4
799/// can be transformed into:
800/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
801/// i32 6>
802/// %2 = mul <4 x i8> %1, %1
803/// ret <4 x i8> %2
804/// Mask will return the Shuffle Mask equivalent to the extracted elements.
805/// TODO: Can we split off and reuse the shuffle mask detection from
806/// ShuffleVectorInst/getShuffleCost?
807static std::optional<TargetTransformInfo::ShuffleKind>
808isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
809 AssumptionCache *AC) {
810 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
811 if (It == VL.end())
812 return std::nullopt;
813 unsigned Size =
814 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
815 auto *EI = dyn_cast<ExtractElementInst>(V);
816 if (!EI)
817 return S;
818 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
819 if (!VTy)
820 return S;
821 return std::max(S, VTy->getNumElements());
822 });
823
824 Value *Vec1 = nullptr;
825 Value *Vec2 = nullptr;
826 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
827 auto *EE = dyn_cast<ExtractElementInst>(V);
828 if (!EE)
829 return false;
830 Value *Vec = EE->getVectorOperand();
831 if (isa<UndefValue>(Vec))
832 return false;
833 return isGuaranteedNotToBePoison(Vec, AC);
834 });
835 enum ShuffleMode { Unknown, Select, Permute };
836 ShuffleMode CommonShuffleMode = Unknown;
837 Mask.assign(VL.size(), PoisonMaskElem);
838 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
839 // Undef can be represented as an undef element in a vector.
840 if (isa<UndefValue>(VL[I]))
841 continue;
842 auto *EI = cast<ExtractElementInst>(VL[I]);
843 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
844 return std::nullopt;
845 auto *Vec = EI->getVectorOperand();
846 // We can extractelement from undef or poison vector.
847 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
848 continue;
849 // All vector operands must have the same number of vector elements.
850 if (isa<UndefValue>(Vec)) {
851 Mask[I] = I;
852 } else {
853 if (isa<UndefValue>(EI->getIndexOperand()))
854 continue;
855 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
856 if (!Idx)
857 return std::nullopt;
858 // Undefined behavior if Idx is negative or >= Size.
859 if (Idx->getValue().uge(Size))
860 continue;
861 unsigned IntIdx = Idx->getValue().getZExtValue();
862 Mask[I] = IntIdx;
863 }
864 if (isUndefVector(Vec).all() && HasNonUndefVec)
865 continue;
866 // For correct shuffling we have to have at most 2 different vector operands
867 // in all extractelement instructions.
868 if (!Vec1 || Vec1 == Vec) {
869 Vec1 = Vec;
870 } else if (!Vec2 || Vec2 == Vec) {
871 Vec2 = Vec;
872 Mask[I] += Size;
873 } else {
874 return std::nullopt;
875 }
876 if (CommonShuffleMode == Permute)
877 continue;
878 // If the extract index is not the same as the operation number, it is a
879 // permutation.
880 if (Mask[I] % Size != I) {
881 CommonShuffleMode = Permute;
882 continue;
883 }
884 CommonShuffleMode = Select;
885 }
886 // If we're not crossing lanes in different vectors, consider it as blending.
887 if (CommonShuffleMode == Select && Vec2)
888 return TargetTransformInfo::SK_Select;
889 // If Vec2 was never used, we have a permutation of a single vector, otherwise
890 // we have permutation of 2 vectors.
891 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
892 : TargetTransformInfo::SK_PermuteSingleSrc;
893}
894
895/// \returns True if Extract{Value,Element} instruction extracts element Idx.
896static std::optional<unsigned> getExtractIndex(const Instruction *E) {
897 unsigned Opcode = E->getOpcode();
898 assert((Opcode == Instruction::ExtractElement ||
899 Opcode == Instruction::ExtractValue) &&
900 "Expected extractelement or extractvalue instruction.");
901 if (Opcode == Instruction::ExtractElement) {
902 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
903 if (!CI)
904 return std::nullopt;
905 return CI->getZExtValue();
906 }
907 auto *EI = cast<ExtractValueInst>(E);
908 if (EI->getNumIndices() != 1)
909 return std::nullopt;
910 return *EI->idx_begin();
911}
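// Illustrative example (not from the upstream source):
//   %e = extractelement <4 x i32> %v, i32 2    ; getExtractIndex returns 2
//   %s = extractvalue { i32, i64 } %agg, 1     ; getExtractIndex returns 1
// A non-constant extractelement index yields std::nullopt.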
912
913/// Checks if the provided value does not require scheduling. It does not
914/// require scheduling if this is not an instruction or it is an instruction
915/// that does not read/write memory and all operands are either not instructions
916/// or phi nodes or instructions from different blocks.
917static bool areAllOperandsNonInsts(Value *V);
918/// Checks if the provided value does not require scheduling. It does not
919/// require scheduling if this is not an instruction or it is an instruction
920/// that does not read/write memory and all users are phi nodes or instructions
921/// from the different blocks.
922static bool isUsedOutsideBlock(Value *V);
923/// Checks if the specified value does not require scheduling. It does not
924/// require scheduling if all operands and all users do not need to be scheduled
925/// in the current basic block.
926static bool doesNotNeedToBeScheduled(Value *V);
927
928/// \returns true if \p Opcode is allowed as part of the main/alternate
929/// instruction for SLP vectorization.
930///
931/// Example of unsupported opcode is SDIV that can potentially cause UB if the
932/// "shuffled out" lane would result in division by zero.
933static bool isValidForAlternation(unsigned Opcode) {
934 return !Instruction::isIntDivRem(Opcode);
935}
936
937namespace {
938
939/// Helper class that determines whether VL can use the same opcode.
940/// Alternate instructions are supported. In addition, it supports
941/// interchangeable instructions. An interchangeable instruction is one that can
942/// be converted to another instruction with the same semantics. For example,
943/// x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
944class BinOpSameOpcodeHelper {
945 using MaskType = std::uint_fast16_t;
946 /// Sort SupportedOp because it is used by binary_search.
947 constexpr static std::initializer_list<unsigned> SupportedOp = {
948 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
949 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
950 enum : MaskType {
951 ShlBIT = 0b1,
952 AShrBIT = 0b10,
953 MulBIT = 0b100,
954 AddBIT = 0b1000,
955 SubBIT = 0b10000,
956 AndBIT = 0b100000,
957 OrBIT = 0b1000000,
958 XorBIT = 0b10000000,
959 MainOpBIT = 0b100000000,
961 };
962 /// Return a non-nullptr if either operand of I is a ConstantInt.
963 /// The second return value represents the operand position. We check the
964 /// right-hand side first (1). If the right hand side is not a ConstantInt and
965 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
966 /// side (0).
967 static std::pair<ConstantInt *, unsigned>
968 isBinOpWithConstantInt(const Instruction *I) {
969 unsigned Opcode = I->getOpcode();
970 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
971 (void)SupportedOp;
972 auto *BinOp = cast<BinaryOperator>(I);
973 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
974 return {CI, 1};
975 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
976 Opcode == Instruction::AShr)
977 return {nullptr, 0};
978 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
979 return {CI, 0};
980 return {nullptr, 0};
981 }
982 struct InterchangeableInfo {
983 const Instruction *I = nullptr;
984 /// Each set bit represents an opcode that MainOp can be converted to.
985 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
986 MulBIT | AShrBIT | ShlBIT;
987 /// We cannot create an interchangeable instruction that does not exist in
988 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
989 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
990 /// 1]. SeenBefore is used to know what operations have been seen before.
991 MaskType SeenBefore = 0;
992 InterchangeableInfo(const Instruction *I) : I(I) {}
993 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
994 /// instruction. Directly setting the mask will destroy the mask state,
995 /// preventing us from determining which instruction it should convert to.
996 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
997 if (Mask & InterchangeableMask) {
998 SeenBefore |= OpcodeInMaskForm;
999 Mask &= InterchangeableMask;
1000 return true;
1001 }
1002 return false;
1003 }
1004 bool equal(unsigned Opcode) {
1005 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1006 }
1007 unsigned getOpcode() const {
1008 MaskType Candidate = Mask & SeenBefore;
1009 if (Candidate & MainOpBIT)
1010 return I->getOpcode();
1011 if (Candidate & ShlBIT)
1012 return Instruction::Shl;
1013 if (Candidate & AShrBIT)
1014 return Instruction::AShr;
1015 if (Candidate & MulBIT)
1016 return Instruction::Mul;
1017 if (Candidate & AddBIT)
1018 return Instruction::Add;
1019 if (Candidate & SubBIT)
1020 return Instruction::Sub;
1021 if (Candidate & AndBIT)
1022 return Instruction::And;
1023 if (Candidate & OrBIT)
1024 return Instruction::Or;
1025 if (Candidate & XorBIT)
1026 return Instruction::Xor;
1027 llvm_unreachable("Cannot find interchangeable instruction.");
1028 }
1029
1030 /// Return true if the instruction can be converted to \p Opcode.
1031 bool hasCandidateOpcode(unsigned Opcode) const {
1032 MaskType Candidate = Mask & SeenBefore;
1033 switch (Opcode) {
1034 case Instruction::Shl:
1035 return Candidate & ShlBIT;
1036 case Instruction::AShr:
1037 return Candidate & AShrBIT;
1038 case Instruction::Mul:
1039 return Candidate & MulBIT;
1040 case Instruction::Add:
1041 return Candidate & AddBIT;
1042 case Instruction::Sub:
1043 return Candidate & SubBIT;
1044 case Instruction::And:
1045 return Candidate & AndBIT;
1046 case Instruction::Or:
1047 return Candidate & OrBIT;
1048 case Instruction::Xor:
1049 return Candidate & XorBIT;
1050 case Instruction::LShr:
1051 case Instruction::FAdd:
1052 case Instruction::FSub:
1053 case Instruction::FMul:
1054 case Instruction::SDiv:
1055 case Instruction::UDiv:
1056 case Instruction::FDiv:
1057 case Instruction::SRem:
1058 case Instruction::URem:
1059 case Instruction::FRem:
1060 return false;
1061 default:
1062 break;
1063 }
1064 llvm_unreachable("Cannot find interchangeable instruction.");
1065 }
1066
1067 SmallVector<Value *> getOperand(const Instruction *To) const {
1068 unsigned ToOpcode = To->getOpcode();
1069 unsigned FromOpcode = I->getOpcode();
1070 if (FromOpcode == ToOpcode)
1071 return SmallVector<Value *>(I->operands());
1072 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1073 auto [CI, Pos] = isBinOpWithConstantInt(I);
1074 const APInt &FromCIValue = CI->getValue();
1075 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1076 APInt ToCIValue;
1077 switch (FromOpcode) {
1078 case Instruction::Shl:
1079 if (ToOpcode == Instruction::Mul) {
1080 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1081 FromCIValue.getZExtValue());
1082 } else {
1083 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1084 ToCIValue = ToOpcode == Instruction::And
1085 ? APInt::getAllOnes(FromCIValueBitWidth)
1086 : APInt::getZero(FromCIValueBitWidth);
1087 }
1088 break;
1089 case Instruction::Mul:
1090 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1091 if (ToOpcode == Instruction::Shl) {
1092 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1093 } else {
1094 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1095 ToCIValue = ToOpcode == Instruction::And
1096 ? APInt::getAllOnes(FromCIValueBitWidth)
1097 : APInt::getZero(FromCIValueBitWidth);
1098 }
1099 break;
1100 case Instruction::Add:
1101 case Instruction::Sub:
1102 if (FromCIValue.isZero()) {
1103 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1104 } else {
1105 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1106 "Cannot convert the instruction.");
1107 ToCIValue = FromCIValue;
1108 ToCIValue.negate();
1109 }
1110 break;
1111 case Instruction::And:
1112 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1113 ToCIValue = ToOpcode == Instruction::Mul
1114 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1115 : APInt::getZero(FromCIValueBitWidth);
1116 break;
1117 default:
1118 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1119 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1120 break;
1121 }
1122 Value *LHS = I->getOperand(1 - Pos);
1123 Constant *RHS =
1124 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1125 // constant + x cannot be -constant - x
1126 // instead, it should be x - -constant
1127 if (Pos == 1 ||
1128 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1129 FromOpcode == Instruction::Xor) &&
1130 ToOpcode == Instruction::Sub))
1131 return SmallVector<Value *>({LHS, RHS});
1132 return SmallVector<Value *>({RHS, LHS});
1133 }
1134 };
1135 InterchangeableInfo MainOp;
1136 InterchangeableInfo AltOp;
1137 bool isValidForAlternation(const Instruction *I) const {
1138 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1139 ::isValidForAlternation(I->getOpcode());
1140 }
1141 bool initializeAltOp(const Instruction *I) {
1142 if (AltOp.I)
1143 return true;
1144 if (!isValidForAlternation(I))
1145 return false;
1146 AltOp.I = I;
1147 return true;
1148 }
1149
1150public:
1151 BinOpSameOpcodeHelper(const Instruction *MainOp,
1152 const Instruction *AltOp = nullptr)
1153 : MainOp(MainOp), AltOp(AltOp) {
1154 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1155 }
1156 bool add(const Instruction *I) {
1157 assert(isa<BinaryOperator>(I) &&
1158 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1159 unsigned Opcode = I->getOpcode();
1160 MaskType OpcodeInMaskForm;
1161 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1162 switch (Opcode) {
1163 case Instruction::Shl:
1164 OpcodeInMaskForm = ShlBIT;
1165 break;
1166 case Instruction::AShr:
1167 OpcodeInMaskForm = AShrBIT;
1168 break;
1169 case Instruction::Mul:
1170 OpcodeInMaskForm = MulBIT;
1171 break;
1172 case Instruction::Add:
1173 OpcodeInMaskForm = AddBIT;
1174 break;
1175 case Instruction::Sub:
1176 OpcodeInMaskForm = SubBIT;
1177 break;
1178 case Instruction::And:
1179 OpcodeInMaskForm = AndBIT;
1180 break;
1181 case Instruction::Or:
1182 OpcodeInMaskForm = OrBIT;
1183 break;
1184 case Instruction::Xor:
1185 OpcodeInMaskForm = XorBIT;
1186 break;
1187 default:
1188 return MainOp.equal(Opcode) ||
1189 (initializeAltOp(I) && AltOp.equal(Opcode));
1190 }
1191 MaskType InterchangeableMask = OpcodeInMaskForm;
1192 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1193 if (CI) {
1194 constexpr MaskType CanBeAll =
1195 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1196 const APInt &CIValue = CI->getValue();
1197 switch (Opcode) {
1198 case Instruction::Shl:
1199 if (CIValue.ult(CIValue.getBitWidth()))
1200 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1201 break;
1202 case Instruction::Mul:
1203 if (CIValue.isOne()) {
1204 InterchangeableMask = CanBeAll;
1205 break;
1206 }
1207 if (CIValue.isPowerOf2())
1208 InterchangeableMask = MulBIT | ShlBIT;
1209 break;
1210 case Instruction::Add:
1211 case Instruction::Sub:
1212 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1213 break;
1214 case Instruction::And:
1215 if (CIValue.isAllOnes())
1216 InterchangeableMask = CanBeAll;
1217 break;
1218 case Instruction::Xor:
1219 if (CIValue.isZero())
1220 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1221 break;
1222 default:
1223 if (CIValue.isZero())
1224 InterchangeableMask = CanBeAll;
1225 break;
1226 }
1227 }
1228 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1229 (initializeAltOp(I) &&
1230 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1231 }
1232 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1233 /// Checks if the list of potential opcodes includes \p Opcode.
1234 bool hasCandidateOpcode(unsigned Opcode) const {
1235 return MainOp.hasCandidateOpcode(Opcode);
1236 }
1237 bool hasAltOp() const { return AltOp.I; }
1238 unsigned getAltOpcode() const {
1239 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1240 }
1241 SmallVector<Value *> getOperand(const Instruction *I) const {
1242 return MainOp.getOperand(I);
1243 }
1244};
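// Illustrative walk-through (not from the upstream source) of the helper above:
// for VL = { x << 1, y * 4 } both entries are representable as shifts
// (y * 4 == y << 2), so getMainOpcode() reports Shl and no alternate opcode is
// recorded; for VL = { x + 1, y * 4 } the addition cannot be rewritten as a
// multiply, so Add stays the main opcode and y * 4 becomes the AltOp (Mul).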
1245
1246/// Main data required for vectorization of instructions.
1247class InstructionsState {
1248 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1249 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1250 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1251 /// isAltShuffle).
1252 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1253 /// from getMainAltOpsNoStateVL.
1254 /// For those InstructionsState that use alternate instructions, the resulting
1255 /// vectorized output ultimately comes from a shufflevector. For example,
1256 /// given a vector list (VL):
1257 /// VL[0] = add i32 a, e
1258 /// VL[1] = sub i32 b, f
1259 /// VL[2] = add i32 c, g
1260 /// VL[3] = sub i32 d, h
1261 /// The vectorized result would be:
1262 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1263 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1264 /// result = shufflevector <4 x i32> intermediated_0,
1265 /// <4 x i32> intermediated_1,
1266 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1267 /// Since shufflevector is used in the final result, when calculating the cost
1268 /// (getEntryCost), we must account for the usage of shufflevector in
1269 /// GetVectorCost.
1270 Instruction *MainOp = nullptr;
1271 Instruction *AltOp = nullptr;
1272 /// Whether the instruction state represents copyable instructions.
1273 bool HasCopyables = false;
1274
1275public:
1276 Instruction *getMainOp() const {
1277 assert(valid() && "InstructionsState is invalid.");
1278 return MainOp;
1279 }
1280
1281 Instruction *getAltOp() const {
1282 assert(valid() && "InstructionsState is invalid.");
1283 return AltOp;
1284 }
1285
1286 /// The main/alternate opcodes for the list of instructions.
1287 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1288
1289 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1290
1291 /// Some of the instructions in the list have alternate opcodes.
1292 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1293
1294 /// Checks if the instruction matches either the main or alternate opcode.
1295 /// \returns
1296 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1297 /// to it
1298 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1299 /// it
1300 /// - nullptr if \param I cannot be matched or converted to either opcode
1301 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1302 assert(MainOp && "MainOp cannot be nullptr.");
1303 if (I->getOpcode() == MainOp->getOpcode())
1304 return MainOp;
1305 // Prefer AltOp instead of interchangeable instruction of MainOp.
1306 assert(AltOp && "AltOp cannot be nullptr.");
1307 if (I->getOpcode() == AltOp->getOpcode())
1308 return AltOp;
1309 if (!I->isBinaryOp())
1310 return nullptr;
1311 BinOpSameOpcodeHelper Converter(MainOp);
1312 if (!Converter.add(I) || !Converter.add(MainOp))
1313 return nullptr;
1314 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1315 BinOpSameOpcodeHelper AltConverter(AltOp);
1316 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1317 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1318 return AltOp;
1319 }
1320 if (Converter.hasAltOp() && !isAltShuffle())
1321 return nullptr;
1322 return Converter.hasAltOp() ? AltOp : MainOp;
1323 }
1324
1325 /// Checks if main/alt instructions are shift operations.
1326 bool isShiftOp() const {
1327 return getMainOp()->isShift() && getAltOp()->isShift();
1328 }
1329
1330 /// Checks if main/alt instructions are bitwise logic operations.
1331 bool isBitwiseLogicOp() const {
1332 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1333 }
1334
1335 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1336 bool isMulDivLikeOp() const {
1337 constexpr std::array<unsigned, 8> MulDiv = {
1338 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1339 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1340 Instruction::URem, Instruction::FRem};
1341 return is_contained(MulDiv, getOpcode()) &&
1342 is_contained(MulDiv, getAltOpcode());
1343 }
1344
1345 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1346 bool isAddSubLikeOp() const {
1347 constexpr std::array<unsigned, 4> AddSub = {
1348 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1349 Instruction::FSub};
1350 return is_contained(AddSub, getOpcode()) &&
1351 is_contained(AddSub, getAltOpcode());
1352 }
1353
1354 /// Checks if main/alt instructions are cmp operations.
1355 bool isCmpOp() const {
1356 return (getOpcode() == Instruction::ICmp ||
1357 getOpcode() == Instruction::FCmp) &&
1358 getAltOpcode() == getOpcode();
1359 }
1360
1361 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1362 bool valid() const { return MainOp && AltOp; }
1363
1364 explicit operator bool() const { return valid(); }
1365
1366 InstructionsState() = delete;
1367 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1368 bool HasCopyables = false)
1369 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1370 static InstructionsState invalid() { return {nullptr, nullptr}; }
1371
1372 /// Checks if the value is a copyable element.
1373 bool isCopyableElement(Value *V) const {
1374 assert(valid() && "InstructionsState is invalid.");
1375 if (!HasCopyables)
1376 return false;
1377 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1378 return false;
1379 auto *I = dyn_cast<Instruction>(V);
1380 if (!I)
1381 return !isa<PoisonValue>(V);
1382 if (I->getParent() != MainOp->getParent() &&
1385 return true;
1386 if (I->getOpcode() == MainOp->getOpcode())
1387 return false;
1388 if (!I->isBinaryOp())
1389 return true;
1390 BinOpSameOpcodeHelper Converter(MainOp);
1391 return !Converter.add(I) || !Converter.add(MainOp) ||
1392 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1393 }
1394
1395 /// Checks if the value is non-schedulable.
1396 bool isNonSchedulable(Value *V) const {
1397 assert(valid() && "InstructionsState is invalid.");
1398 auto *I = dyn_cast<Instruction>(V);
1399 if (!HasCopyables)
1400 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1401 doesNotNeedToBeScheduled(V);
1402 // MainOp for copyables is always schedulable, to correctly identify
1403 // non-schedulable copyables.
1404 if (getMainOp() == V)
1405 return false;
1406 if (isCopyableElement(V)) {
1407 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1408 auto *I = dyn_cast<Instruction>(V);
1409 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1411 // If the copyable instruction comes after MainOp
1412 // (non-schedulable, but used in the block) - cannot vectorize
1413 // it, will possibly generate use before def.
1414 !MainOp->comesBefore(I));
1415 };
1416
1417 return IsNonSchedulableCopyableElement(V);
1418 }
1419 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1420 doesNotNeedToBeScheduled(V);
1421 }
1422
1423 /// Checks if the state represents copyable instructions.
1424 bool areInstructionsWithCopyableElements() const {
1425 assert(valid() && "InstructionsState is invalid.");
1426 return HasCopyables;
1427 }
1428};
1429
1430std::pair<Instruction *, SmallVector<Value *>>
1431convertTo(Instruction *I, const InstructionsState &S) {
1432 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1433 assert(SelectedOp && "Cannot convert the instruction.");
1434 if (I->isBinaryOp()) {
1435 BinOpSameOpcodeHelper Converter(I);
1436 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1437 }
1438 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1439}
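// Illustrative example (not from the upstream source): if the InstructionsState
// S has a Mul main opcode, convertTo(I, S) for I = "x << 1" yields the matched
// main op together with the rewritten operands {x, 2}, so the lane can be
// emitted as x * 2.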
1440
1441} // end anonymous namespace
1442
1443static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1444 const TargetLibraryInfo &TLI);
1445
1446/// Find an instruction with a specific opcode in VL.
1447/// \param VL Array of values to search through. Must contain only Instructions
1448/// and PoisonValues.
1449/// \param Opcode The instruction opcode to search for
1450/// \returns
1451/// - The first instruction found with matching opcode
1452/// - nullptr if no matching instruction is found
1453static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1454 unsigned Opcode) {
1455 for (Value *V : VL) {
1456 if (isa<PoisonValue>(V))
1457 continue;
1458 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1459 auto *Inst = cast<Instruction>(V);
1460 if (Inst->getOpcode() == Opcode)
1461 return Inst;
1462 }
1463 return nullptr;
1464}
1465
1466/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1467/// compatible instructions or constants, or just some other regular values.
1468static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1469 Value *Op1, const TargetLibraryInfo &TLI) {
1470 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1471 (isConstant(BaseOp1) && isConstant(Op1)) ||
1472 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1473 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1474 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1475 getSameOpcode({BaseOp0, Op0}, TLI) ||
1476 getSameOpcode({BaseOp1, Op1}, TLI);
1477}
1478
1479/// \returns true if a compare instruction \p CI has similar "look" and
1480/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1481/// swapped, false otherwise.
1482static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1483 const TargetLibraryInfo &TLI) {
1484 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1485 "Assessing comparisons of different types?");
1486 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1487 CmpInst::Predicate Pred = CI->getPredicate();
1488 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1489
1490 Value *BaseOp0 = BaseCI->getOperand(0);
1491 Value *BaseOp1 = BaseCI->getOperand(1);
1492 Value *Op0 = CI->getOperand(0);
1493 Value *Op1 = CI->getOperand(1);
1494
1495 return (BasePred == Pred &&
1496 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1497 (BasePred == SwappedPred &&
1498 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1499}
1500
1501/// \returns analysis of the Instructions in \p VL described in
1502/// InstructionsState, i.e. the opcode with which we suppose the whole list
1503/// could be vectorized even if its structure is diverse.
1504static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1505 const TargetLibraryInfo &TLI) {
1506 // Make sure these are all Instructions.
1507 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1508 return InstructionsState::invalid();
1509
1510 auto *It = find_if(VL, IsaPred<Instruction>);
1511 if (It == VL.end())
1512 return InstructionsState::invalid();
1513
1514 Instruction *MainOp = cast<Instruction>(*It);
1515 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1516 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1517 (VL.size() == 2 && InstCnt < 2))
1518 return InstructionsState::invalid();
1519
1520 bool IsCastOp = isa<CastInst>(MainOp);
1521 bool IsBinOp = isa<BinaryOperator>(MainOp);
1522 bool IsCmpOp = isa<CmpInst>(MainOp);
1523 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1524 : CmpInst::BAD_ICMP_PREDICATE;
1525 Instruction *AltOp = MainOp;
1526 unsigned Opcode = MainOp->getOpcode();
1527 unsigned AltOpcode = Opcode;
1528
1529 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1530 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1531 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1532 UniquePreds.insert(BasePred);
1533 UniqueNonSwappedPreds.insert(BasePred);
1534 for (Value *V : VL) {
1535 auto *I = dyn_cast<CmpInst>(V);
1536 if (!I)
1537 return false;
1538 CmpInst::Predicate CurrentPred = I->getPredicate();
1539 CmpInst::Predicate SwappedCurrentPred =
1540 CmpInst::getSwappedPredicate(CurrentPred);
1541 UniqueNonSwappedPreds.insert(CurrentPred);
1542 if (!UniquePreds.contains(CurrentPred) &&
1543 !UniquePreds.contains(SwappedCurrentPred))
1544 UniquePreds.insert(CurrentPred);
1545 }
1546 // If the total number of predicates is > 2, but only 2 remain when swapped
1547 // predicates are treated as compatible, consider the swappable predicates
1548 // as compatible opcodes, not as alternates.
1549 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1550 }();
1551 // Check for one alternate opcode from another BinaryOperator.
1552 // TODO - generalize to support all operators (types, calls etc.).
1553 Intrinsic::ID BaseID = 0;
1554 SmallVector<VFInfo> BaseMappings;
1555 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1556 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1557 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1558 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1559 return InstructionsState::invalid();
1560 }
1561 bool AnyPoison = InstCnt != VL.size();
1562 // Check MainOp too to be sure that it matches the requirements for the
1563 // instructions.
1564 for (Value *V : iterator_range(It, VL.end())) {
1565 auto *I = dyn_cast<Instruction>(V);
1566 if (!I)
1567 continue;
1568
1569 // Cannot combine poison and divisions.
1570 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1571 // intrinsics/functions only.
1572 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1573 return InstructionsState::invalid();
1574 unsigned InstOpcode = I->getOpcode();
1575 if (IsBinOp && isa<BinaryOperator>(I)) {
1576 if (BinOpHelper.add(I))
1577 continue;
1578 } else if (IsCastOp && isa<CastInst>(I)) {
1579 Value *Op0 = MainOp->getOperand(0);
1580 Type *Ty0 = Op0->getType();
1581 Value *Op1 = I->getOperand(0);
1582 Type *Ty1 = Op1->getType();
1583 if (Ty0 == Ty1) {
1584 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1585 continue;
1586 if (Opcode == AltOpcode) {
1587 assert(isValidForAlternation(Opcode) &&
1588 isValidForAlternation(InstOpcode) &&
1589 "Cast isn't safe for alternation, logic needs to be updated!");
1590 AltOpcode = InstOpcode;
1591 AltOp = I;
1592 continue;
1593 }
1594 }
1595 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1596 auto *BaseInst = cast<CmpInst>(MainOp);
1597 Type *Ty0 = BaseInst->getOperand(0)->getType();
1598 Type *Ty1 = Inst->getOperand(0)->getType();
1599 if (Ty0 == Ty1) {
1600 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1601 assert(InstOpcode == AltOpcode &&
1602 "Alternate instructions are only supported by BinaryOperator "
1603 "and CastInst.");
1604 // Check for compatible operands. If the corresponding operands are not
1605 // compatible - need to perform alternate vectorization.
1606 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1607 CmpInst::Predicate SwappedCurrentPred =
1608 CmpInst::getSwappedPredicate(CurrentPred);
1609
1610 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1611 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1612 continue;
1613
1614 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1615 continue;
1616 auto *AltInst = cast<CmpInst>(AltOp);
1617 if (MainOp != AltOp) {
1618 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1619 continue;
1620 } else if (BasePred != CurrentPred) {
1621 assert(
1622 isValidForAlternation(InstOpcode) &&
1623 "CmpInst isn't safe for alternation, logic needs to be updated!");
1624 AltOp = I;
1625 continue;
1626 }
1627 CmpInst::Predicate AltPred = AltInst->getPredicate();
1628 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1629 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1630 continue;
1631 }
1632 } else if (InstOpcode == Opcode) {
1633 assert(InstOpcode == AltOpcode &&
1634 "Alternate instructions are only supported by BinaryOperator and "
1635 "CastInst.");
1636 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1637 if (Gep->getNumOperands() != 2 ||
1638 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1639 return InstructionsState::invalid();
1640 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1642 return InstructionsState::invalid();
1643 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1644 auto *BaseLI = cast<LoadInst>(MainOp);
1645 if (!LI->isSimple() || !BaseLI->isSimple())
1646 return InstructionsState::invalid();
1647 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1648 auto *CallBase = cast<CallInst>(MainOp);
1649 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1650 return InstructionsState::invalid();
1651 if (Call->hasOperandBundles() &&
1652 (!CallBase->hasOperandBundles() ||
1653 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1654 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1655 CallBase->op_begin() +
1656 CallBase->getBundleOperandsStartIndex())))
1657 return InstructionsState::invalid();
1658 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1659 if (ID != BaseID)
1660 return InstructionsState::invalid();
1661 if (!ID) {
1662 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1663 if (Mappings.size() != BaseMappings.size() ||
1664 Mappings.front().ISA != BaseMappings.front().ISA ||
1665 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1666 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1667 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1668 Mappings.front().Shape.Parameters !=
1669 BaseMappings.front().Shape.Parameters)
1670 return InstructionsState::invalid();
1671 }
1672 }
1673 continue;
1674 }
1675 return InstructionsState::invalid();
1676 }
1677
1678 if (IsBinOp) {
1679 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1680 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1681 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1682 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1683 }
1684 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1685 "Incorrect implementation of allSameOpcode.");
1686 InstructionsState S(MainOp, AltOp);
1687 assert(all_of(VL,
1688 [&](Value *V) {
1689 return isa<PoisonValue>(V) ||
1690 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1691 }) &&
1692 "Invalid InstructionsState.");
1693 return S;
1694}
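// Worked example (hypothetical lanes): for VL = {add, sub, add, sub} the
// returned InstructionsState has a MainOp with opcode Add and an AltOp with
// opcode Sub, i.e. an alternate-opcode node; for VL = {add, add, add, add}
// MainOp == AltOp and the node uses a single opcode.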
1695
1696/// \returns true if all of the values in \p VL have the same type or false
1697/// otherwise.
1698static bool allSameType(ArrayRef<Value *> VL) {
1699 Type *Ty = VL.consume_front()->getType();
1700 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1701}
1702
1703/// \returns True if in-tree use also needs extract. This refers to
1704/// a possible scalar operand in a vectorized instruction.
1705static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1706 TargetLibraryInfo *TLI,
1707 const TargetTransformInfo *TTI) {
1708 if (!UserInst)
1709 return false;
1710 unsigned Opcode = UserInst->getOpcode();
1711 switch (Opcode) {
1712 case Instruction::Load: {
1713 LoadInst *LI = cast<LoadInst>(UserInst);
1714 return (LI->getPointerOperand() == Scalar);
1715 }
1716 case Instruction::Store: {
1717 StoreInst *SI = cast<StoreInst>(UserInst);
1718 return (SI->getPointerOperand() == Scalar);
1719 }
1720 case Instruction::Call: {
1721 CallInst *CI = cast<CallInst>(UserInst);
1722 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1723 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1724 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1725 Arg.value().get() == Scalar;
1726 });
1727 }
1728 default:
1729 return false;
1730 }
1731}
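// For illustration (hypothetical IR): if a vectorized scalar %p is used as the
// pointer operand of a scalar user `store i32 %v, ptr %p`, the user still
// needs %p as a scalar, so an extractelement has to be emitted for it.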
1732
1733/// \returns the AA location that is being accessed by the instruction.
1734static MemoryLocation getLocation(Instruction *I) {
1735 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1736 return MemoryLocation::get(SI);
1737 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1738 return MemoryLocation::get(LI);
1739 return MemoryLocation();
1740}
1741
1742/// \returns True if the instruction is not a volatile or atomic load/store.
1743static bool isSimple(Instruction *I) {
1744 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1745 return LI->isSimple();
1746 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1747 return SI->isSimple();
1748 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1749 return !MI->isVolatile();
1750 return true;
1751}
1752
1753/// Shuffles \p Mask in accordance with the given \p SubMask.
1754/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1755/// one but two input vectors.
1756static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1757 bool ExtendingManyInputs = false) {
1758 if (SubMask.empty())
1759 return;
1760 assert(
1761 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1762 // Check if input scalars were extended to match the size of other node.
1763 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1764 "SubMask with many inputs support must be larger than the mask.");
1765 if (Mask.empty()) {
1766 Mask.append(SubMask.begin(), SubMask.end());
1767 return;
1768 }
1769 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1770 int TermValue = std::min(Mask.size(), SubMask.size());
1771 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1772 if (SubMask[I] == PoisonMaskElem ||
1773 (!ExtendingManyInputs &&
1774 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1775 continue;
1776 NewMask[I] = Mask[SubMask[I]];
1777 }
1778 Mask.swap(NewMask);
1779}
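// Worked example (hypothetical masks): with Mask = {3, 2, 1, 0} and
// SubMask = {1, 0, 3, 2} the composed mask is NewMask[I] = Mask[SubMask[I]],
// i.e. {2, 3, 0, 1}; poison elements of SubMask stay poison in the result.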
1780
1781/// Order may have elements assigned special value (size) which is out of
1782/// bounds. Such indices only appear on places which correspond to undef values
1783/// (see canReuseExtract for details) and are used in order to avoid undef
1784/// values having an effect on operand ordering.
1785/// The first loop below simply finds all unused indices and then the next loop
1786/// nest assigns these indices to the undef value positions.
1787/// As an example below Order has two undef positions and they have assigned
1788/// values 3 and 7 respectively:
1789/// before: 6 9 5 4 9 2 1 0
1790/// after: 6 3 5 4 7 2 1 0
1791static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1792 const size_t Sz = Order.size();
1793 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1794 SmallBitVector MaskedIndices(Sz);
1795 for (unsigned I = 0; I < Sz; ++I) {
1796 if (Order[I] < Sz)
1797 UnusedIndices.reset(Order[I]);
1798 else
1799 MaskedIndices.set(I);
1800 }
1801 if (MaskedIndices.none())
1802 return;
1803 assert(UnusedIndices.count() == MaskedIndices.count() &&
1804 "Non-synced masked/available indices.");
1805 int Idx = UnusedIndices.find_first();
1806 int MIdx = MaskedIndices.find_first();
1807 while (MIdx >= 0) {
1808 assert(Idx >= 0 && "Indices must be synced.");
1809 Order[MIdx] = Idx;
1810 Idx = UnusedIndices.find_next(Idx);
1811 MIdx = MaskedIndices.find_next(MIdx);
1812 }
1813}
1814
1815/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1816/// Opcode1.
1817static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1818 unsigned Opcode0, unsigned Opcode1) {
1819 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1820 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1821 for (unsigned Lane : seq<unsigned>(VL.size())) {
1822 if (isa<PoisonValue>(VL[Lane]))
1823 continue;
1824 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1825 OpcodeMask.set(Lane * ScalarTyNumElements,
1826 Lane * ScalarTyNumElements + ScalarTyNumElements);
1827 }
1828 return OpcodeMask;
1829}
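// Worked example (hypothetical lanes): for VL = {add, sub, add, sub},
// Opcode0 = Add, Opcode1 = Sub and a scalar ScalarTy, the returned bitset is
// {0, 1, 0, 1} - only the lanes whose opcode matches Opcode1 are set.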
1830
1831/// Replicates the given \p Val \p VF times.
1833 unsigned VF) {
1834 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1835 "Expected scalar constants.");
1836 SmallVector<Constant *> NewVal(Val.size() * VF);
1837 for (auto [I, V] : enumerate(Val))
1838 std::fill_n(NewVal.begin() + I * VF, VF, V);
1839 return NewVal;
1840}
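// Worked example (hypothetical constants): replicating Val = {C0, C1} with
// VF = 2 yields {C0, C0, C1, C1} - every scalar constant is repeated VF times
// in place.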
1841
1842static void inversePermutation(ArrayRef<unsigned> Indices,
1843 SmallVectorImpl<int> &Mask) {
1844 Mask.clear();
1845 const unsigned E = Indices.size();
1846 Mask.resize(E, PoisonMaskElem);
1847 for (unsigned I = 0; I < E; ++I)
1848 Mask[Indices[I]] = I;
1849}
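// Worked example (hypothetical order): Indices = {2, 0, 1} produces the
// shuffle mask {1, 2, 0}, since Mask[Indices[I]] = I places element I at
// position Indices[I].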
1850
1851/// Reorders the list of scalars in accordance with the given \p Mask.
1852static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1853 ArrayRef<int> Mask) {
1854 assert(!Mask.empty() && "Expected non-empty mask.");
1855 SmallVector<Value *> Prev(Scalars.size(),
1856 PoisonValue::get(Scalars.front()->getType()));
1857 Prev.swap(Scalars);
1858 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1859 if (Mask[I] != PoisonMaskElem)
1860 Scalars[Mask[I]] = Prev[I];
1861}
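// Worked example (hypothetical scalars): Scalars = {a, b, c} with
// Mask = {2, 0, 1} becomes {b, c, a}, because each old element Prev[I] is
// moved to position Mask[I].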
1862
1863/// Checks if the provided value does not require scheduling. It does not
1864/// require scheduling if this is not an instruction or it is an instruction
1865/// that does not read/write memory and all of its operands are either not
1866/// instructions, or phi nodes, or instructions from different blocks.
1867static bool areAllOperandsNonInsts(Value *V) {
1868 auto *I = dyn_cast<Instruction>(V);
1869 if (!I)
1870 return true;
1871 return !mayHaveNonDefUseDependency(*I) &&
1872 all_of(I->operands(), [I](Value *V) {
1873 auto *IO = dyn_cast<Instruction>(V);
1874 if (!IO)
1875 return true;
1876 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1877 });
1878}
1879
1880/// Checks if the provided value does not require scheduling. It does not
1881/// require scheduling if this is not an instruction or it is an instruction
1882/// that does not read/write memory and all of its users are phi nodes or
1883/// instructions from different blocks.
1884static bool isUsedOutsideBlock(Value *V) {
1885 auto *I = dyn_cast<Instruction>(V);
1886 if (!I)
1887 return true;
1888 // Limits the number of uses to save compile time.
1889 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1890 all_of(I->users(), [I](User *U) {
1891 auto *IU = dyn_cast<Instruction>(U);
1892 if (!IU)
1893 return true;
1894 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1895 });
1896}
1897
1898/// Checks if the specified value does not require scheduling. It does not
1899/// require scheduling if all operands and all users do not need to be scheduled
1900/// in the current basic block.
1901static bool doesNotNeedToBeScheduled(Value *V) {
1902 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1903}
1904
1905/// Checks if the specified array of instructions does not require scheduling.
1906/// It is so if, for all instructions, either their operands do not require
1907/// scheduling or their users do not require scheduling since they are phis or
1908/// live in other basic blocks.
1909static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1910 return !VL.empty() &&
1911 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1912}
1913
1914/// Returns true if widened type of \p Ty elements with size \p Sz represents
1915/// full vector type, i.e. adding extra element results in extra parts upon type
1916/// legalization.
1917static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1918 unsigned Sz) {
1919 if (Sz <= 1)
1920 return false;
1921 if (!isValidElementType(Ty))
1922 return false;
1923 if (has_single_bit(Sz))
1924 return true;
1925 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1926 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1927 Sz % NumParts == 0;
1928}
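// For illustration (the per-target part counts are hypothetical): Sz = 8 is a
// power of two and passes immediately; Sz = 12 split by the target into 3
// parts still qualifies, because 12 % 3 == 0 and 12 / 3 == 4 is a power of
// two; Sz = 6 split into 2 parts fails, since 6 / 2 == 3 is not a power of
// two.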
1929
1930/// Returns the number of parts the type \p VecTy will be split into at the
1931/// codegen phase. If the type is going to be scalarized or does not use whole
1932/// registers, returns 1.
1933static unsigned
1934getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1935 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1936 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1937 if (NumParts == 0 || NumParts >= Limit)
1938 return 1;
1939 unsigned Sz = getNumElements(VecTy);
1940 if (NumParts >= Sz || Sz % NumParts != 0 ||
1941 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1942 return 1;
1943 return NumParts;
1944}
1945
1946/// Bottom Up SLP Vectorizer.
1947class BoUpSLP {
1948 class TreeEntry;
1949 class ScheduleEntity;
1950 class ScheduleData;
1951 class ScheduleCopyableData;
1952 class ScheduleBundle;
1955
1956 /// If we decide to generate strided load / store, this struct contains all
1957 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1958 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1959 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1960 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1961 /// the element size of the FixedVectorType.
1962 struct StridedPtrInfo {
1963 Value *StrideVal = nullptr;
1964 const SCEV *StrideSCEV = nullptr;
1965 FixedVectorType *Ty = nullptr;
1966 };
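  // For illustration (hypothetical values): with i32 elements and
  // StrideVal == 4, the stride in bytes that the strided load eventually needs
  // is 4 * 4 == 16, i.e. StrideVal multiplied by the element size as described
  // above.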
1967 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1968
1969public:
1970 /// Tracks the state we can represent the loads in the given sequence.
1978
1985
1986 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1987 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1988 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1989 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1990 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1991 AC(AC), DB(DB), DL(DL), ORE(ORE),
1992 Builder(Se->getContext(), TargetFolder(*DL)) {
1993 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1994 // Use the vector register size specified by the target unless overridden
1995 // by a command-line option.
1996 // TODO: It would be better to limit the vectorization factor based on
1997 // data type rather than just register size. For example, x86 AVX has
1998 // 256-bit registers, but it does not support integer operations
1999 // at that width (that requires AVX2).
2000 if (MaxVectorRegSizeOption.getNumOccurrences())
2001 MaxVecRegSize = MaxVectorRegSizeOption;
2002 else
2003 MaxVecRegSize =
2004 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
2005 .getFixedValue();
2006
2007 if (MinVectorRegSizeOption.getNumOccurrences())
2008 MinVecRegSize = MinVectorRegSizeOption;
2009 else
2010 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2011 }
2012
2013 /// Vectorize the tree that starts with the elements in \p VL.
2014 /// Returns the vectorized root.
2015 Value *vectorizeTree();
2016
2017 /// Vectorize the tree but with the list of externally used values \p
2018 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
2019 /// generated extractvalue instructions.
2020 Value *vectorizeTree(
2021 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2022 Instruction *ReductionRoot = nullptr,
2023 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2024
2025 /// \returns the cost incurred by unwanted spills and fills, caused by
2026 /// holding live values over call sites.
2027 InstructionCost getSpillCost();
2028
2029 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2030 /// final cost.
2033
2034 /// \returns the vectorization cost of the subtree that starts at \p VL.
2035 /// A negative number means that this is profitable.
2036 InstructionCost getTreeCost(
2037 ArrayRef<Value *> VectorizedVals = {},
2038 InstructionCost ReductionCost = TTI::TCC_Free);
2039
2040 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2041 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2042 void buildTree(ArrayRef<Value *> Roots,
2043 const SmallDenseSet<Value *> &UserIgnoreLst);
2044
2045 /// Construct a vectorizable tree that starts at \p Roots.
2046 void buildTree(ArrayRef<Value *> Roots);
2047
2048 /// Return the scalars of the root node.
2050 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2051 return VectorizableTree.front()->Scalars;
2052 }
2053
2054 /// Returns the type/is-signed info for the root node in the graph without
2055 /// casting.
2056 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2057 const TreeEntry &Root = *VectorizableTree.front();
2058 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2059 !Root.Scalars.front()->getType()->isIntegerTy())
2060 return std::nullopt;
2061 auto It = MinBWs.find(&Root);
2062 if (It != MinBWs.end())
2063 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2064 It->second.first),
2065 It->second.second);
2066 if (Root.getOpcode() == Instruction::ZExt ||
2067 Root.getOpcode() == Instruction::SExt)
2068 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2069 Root.getOpcode() == Instruction::SExt);
2070 return std::nullopt;
2071 }
2072
2073 /// Checks if the root graph node can be emitted with narrower bitwidth at
2074 /// codegen and returns its signedness, if so.
2075 bool isSignedMinBitwidthRootNode() const {
2076 return MinBWs.at(VectorizableTree.front().get()).second;
2077 }
2078
2079 /// Returns the reduction type after minbitwidth analysis.
2080 FixedVectorType *getReductionType() const {
2081 if (ReductionBitWidth == 0 ||
2082 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2083 ReductionBitWidth >=
2084 DL->getTypeSizeInBits(
2085 VectorizableTree.front()->Scalars.front()->getType()))
2086 return getWidenedType(
2087 VectorizableTree.front()->Scalars.front()->getType(),
2088 VectorizableTree.front()->getVectorFactor());
2089 return getWidenedType(
2090 IntegerType::get(
2091 VectorizableTree.front()->Scalars.front()->getContext(),
2092 ReductionBitWidth),
2093 VectorizableTree.front()->getVectorFactor());
2094 }
2095
2096 /// Builds external uses of the vectorized scalars, i.e. the list of
2097 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2098 /// ExternallyUsedValues contains additional list of external uses to handle
2099 /// vectorization of reductions.
2100 void
2101 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2102
2103 /// Transforms graph nodes to target specific representations, if profitable.
2104 void transformNodes();
2105
2106 /// Clear the internal data structures that are created by 'buildTree'.
2107 void deleteTree() {
2108 VectorizableTree.clear();
2109 ScalarToTreeEntries.clear();
2110 DeletedNodes.clear();
2111 TransformedToGatherNodes.clear();
2112 OperandsToTreeEntry.clear();
2113 ScalarsInSplitNodes.clear();
2114 MustGather.clear();
2115 NonScheduledFirst.clear();
2116 EntryToLastInstruction.clear();
2117 LastInstructionToPos.clear();
2118 LoadEntriesToVectorize.clear();
2119 IsGraphTransformMode = false;
2120 GatheredLoadsEntriesFirst.reset();
2121 CompressEntryToData.clear();
2122 ExternalUses.clear();
2123 ExternalUsesAsOriginalScalar.clear();
2124 ExternalUsesWithNonUsers.clear();
2125 for (auto &Iter : BlocksSchedules) {
2126 BlockScheduling *BS = Iter.second.get();
2127 BS->clear();
2128 }
2129 MinBWs.clear();
2130 ReductionBitWidth = 0;
2131 BaseGraphSize = 1;
2132 CastMaxMinBWSizes.reset();
2133 ExtraBitWidthNodes.clear();
2134 InstrElementSize.clear();
2135 UserIgnoreList = nullptr;
2136 PostponedGathers.clear();
2137 ValueToGatherNodes.clear();
2138 TreeEntryToStridedPtrInfoMap.clear();
2139 }
2140
2141 unsigned getTreeSize() const { return VectorizableTree.size(); }
2142
2143 /// Returns the base graph size, before any transformations.
2144 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2145
2146 /// Perform LICM and CSE on the newly generated gather sequences.
2147 void optimizeGatherSequence();
2148
2149 /// Does this non-empty order represent an identity order? Identity
2150 /// should be represented as an empty order, so this is used to
2151 /// decide if we can canonicalize a computed order. Undef elements
2152 /// (represented as size) are ignored.
2153 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2154 assert(!Order.empty() && "expected non-empty order");
2155 const unsigned Sz = Order.size();
2156 return all_of(enumerate(Order), [&](const auto &P) {
2157 return P.value() == P.index() || P.value() == Sz;
2158 });
2159 }
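  // Worked example (hypothetical order): with Sz == 4, Order = {0, 1, 4, 3} is
  // an identity order because the out-of-bounds value 4 marks an undef position
  // and is ignored, whereas Order = {1, 0, 2, 3} is not.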
2160
2161 /// Checks if the specified gather tree entry \p TE can be represented as a
2162 /// shuffled vector entry + (possibly) permutation with other gathers. It
2163 /// implements the checks only for possibly ordered scalars (Loads,
2164 /// ExtractElement, ExtractValue), which can be part of the graph.
2165 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2166 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2167 /// node might be ignored.
2168 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2169 bool TopToBottom,
2170 bool IgnoreReorder);
2171
2172 /// Sort loads into increasing pointers offsets to allow greater clustering.
2173 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2174
2175 /// Gets reordering data for the given tree entry. If the entry is vectorized
2176 /// - just return ReorderIndices, otherwise check if the scalars can be
2177 /// reordered and return the most optimal order.
2178 /// \return std::nullopt if ordering is not important, empty order, if
2179 /// identity order is important, or the actual order.
2180 /// \param TopToBottom If true, include the order of vectorized stores and
2181 /// insertelement nodes, otherwise skip them.
2182 /// \param IgnoreReorder true, if the root node order can be ignored.
2183 std::optional<OrdersType>
2184 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2185
2186 /// Checks if it is profitable to reorder the current tree.
2187 /// If the tree does not contain many profitable reorderable nodes, it is better to
2188 /// skip it to save compile time.
2189 bool isProfitableToReorder() const;
2190
2191 /// Reorders the current graph to the most profitable order starting from the
2192 /// root node to the leaf nodes. The best order is chosen only from the nodes
2193 /// of the same size (vectorization factor). Smaller nodes are considered
2194 /// parts of subgraph with smaller VF and they are reordered independently. We
2195 /// can make it because we still need to extend smaller nodes to the wider VF
2196 /// and we can merge reordering shuffles with the widening shuffles.
2197 void reorderTopToBottom();
2198
2199 /// Reorders the current graph to the most profitable order starting from
2200 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2201 /// number of reshuffles if the leaf nodes use the same order. In this case we
2202 /// can merge the orders and just shuffle the user node instead of shuffling
2203 /// its operands. Plus, even if the leaf nodes have different orders, it allows
2204 /// sinking reordering in the graph closer to the root node, to be merged later
2205 /// during analysis.
2206 void reorderBottomToTop(bool IgnoreReorder = false);
2207
2208 /// \return The vector element size in bits to use when vectorizing the
2209 /// expression tree ending at \p V. If V is a store, the size is the width of
2210 /// the stored value. Otherwise, the size is the width of the largest loaded
2211 /// value reaching V. This method is used by the vectorizer to calculate
2212 /// vectorization factors.
2213 unsigned getVectorElementSize(Value *V);
2214
2215 /// Compute the minimum type sizes required to represent the entries in a
2216 /// vectorizable tree.
2217 void computeMinimumValueSizes();
2218
2219 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2220 unsigned getMaxVecRegSize() const {
2221 return MaxVecRegSize;
2222 }
2223
2224 // \returns minimum vector register size as set by cl::opt.
2225 unsigned getMinVecRegSize() const {
2226 return MinVecRegSize;
2227 }
2228
2229 unsigned getMinVF(unsigned Sz) const {
2230 return std::max(2U, getMinVecRegSize() / Sz);
2231 }
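  // For illustration (hypothetical register size): with a 128-bit minimum
  // vector register size and 32-bit elements (Sz == 32), getMinVF returns
  // max(2, 128 / 32) == 4; for 128-bit elements it falls back to the lower
  // bound of 2.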
2232
2233 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2234 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2235 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2236 return MaxVF ? MaxVF : UINT_MAX;
2237 }
2238
2239 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2240 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2241 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2242 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2243 ///
2244 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2245 unsigned canMapToVector(Type *T) const;
2246
2247 /// \returns True if the VectorizableTree is both tiny and not fully
2248 /// vectorizable. We do not vectorize such trees.
2249 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2250
2251 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2252 /// It may happen, if all gather nodes are loads and they cannot be
2253 /// "clusterized". In this case even subgraphs cannot be vectorized more
2254 /// effectively than the base graph.
2255 bool isTreeNotExtendable() const;
2256
2257 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2258 /// can be load combined in the backend. Load combining may not be allowed in
2259 /// the IR optimizer, so we do not want to alter the pattern. For example,
2260 /// partially transforming a scalar bswap() pattern into vector code is
2261 /// effectively impossible for the backend to undo.
2262 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2263 /// may not be necessary.
2264 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2265
2266 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2267 /// can be load combined in the backend. Load combining may not be allowed in
2268 /// the IR optimizer, so we do not want to alter the pattern. For example,
2269 /// partially transforming a scalar bswap() pattern into vector code is
2270 /// effectively impossible for the backend to undo.
2271 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2272 /// may not be necessary.
2273 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2274 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2275 Align Alignment, const int64_t Diff,
2276 const size_t Sz) const;
2277
2278 /// Return true if an array of scalar loads can be replaced with a strided
2279 /// load (with constant stride).
2280 ///
2281 /// It is possible that the load gets "widened". Suppose that originally each
2282 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2283 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2284 /// ...
2285 /// %b + 0 * %s + (w - 1)
2286 ///
2287 /// %b + 1 * %s + 0
2288 /// %b + 1 * %s + 1
2289 /// %b + 1 * %s + 2
2290 /// ...
2291 /// %b + 1 * %s + (w - 1)
2292 /// ...
2293 ///
2294 /// %b + (n - 1) * %s + 0
2295 /// %b + (n - 1) * %s + 1
2296 /// %b + (n - 1) * %s + 2
2297 /// ...
2298 /// %b + (n - 1) * %s + (w - 1)
2299 ///
2300 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2301 ///
2302 /// \param PointerOps list of pointer arguments of loads.
2303 /// \param ElemTy original scalar type of loads.
2304 /// \param Alignment alignment of the first load.
2305 /// \param SortedIndices is the order of PointerOps as returned by
2306 /// `sortPtrAccesses`
2307 /// \param Diff Pointer difference between the lowest and the highest pointer
2308 /// in `PointerOps` as returned by `getPointersDiff`.
2309 /// \param Ptr0 first pointer in `PointerOps`.
2310 /// \param PtrN last pointer in `PointerOps`.
2311 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2312 /// of `SPtrInfo` necessary to generate the strided load later.
2313 bool analyzeConstantStrideCandidate(
2314 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2315 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2316 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
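  // For illustration of the widening case above (hypothetical numbers): with
  // n == 4 groups, w == 2 pointers per group and k == 4-byte loads, the 8
  // scalar loads become a single strided load of 4 elements of 8 bytes each,
  // whose stride corresponds to %s.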
2317
2318 /// Return true if an array of scalar loads can be replaced with a strided
2319 /// load (with run-time stride).
2320 /// \param PointerOps list of pointer arguments of loads.
2321 /// \param ScalarTy type of loads.
2322 /// \param CommonAlignment common alignment of loads as computed by
2323 /// `computeCommonAlignment<LoadInst>`.
2324 /// \param SortedIndices is a list of indices computed by this function such
2325 /// that the sequence `PointerOps[SortedIndices[0]],
2326 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2327 /// ordered by the coefficient of the stride. For example, if PointerOps is
2328 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2329 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` would be
2330 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2331 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2332 /// of `SPtrInfo` necessary to generate the strided load later.
2333 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2334 Align CommonAlignment,
2335 SmallVectorImpl<unsigned> &SortedIndices,
2336 StridedPtrInfo &SPtrInfo) const;
2337
2338 /// Checks if the given array of loads can be represented as a vectorized,
2339 /// scatter or just simple gather.
2340 /// \param VL list of loads.
2341 /// \param VL0 main load value.
2342 /// \param Order returned order of load instructions.
2343 /// \param PointerOps returned list of pointer operands.
2344 /// \param BestVF return best vector factor, if recursive check found better
2345 /// vectorization sequences rather than masked gather.
2346 /// \param TryRecursiveCheck used to check if a long masked gather can be
2347 /// represented as a series of loads/insert subvectors, if profitable.
2348 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2349 SmallVectorImpl<unsigned> &Order,
2350 SmallVectorImpl<Value *> &PointerOps,
2351 StridedPtrInfo &SPtrInfo,
2352 unsigned *BestVF = nullptr,
2353 bool TryRecursiveCheck = true) const;
2354
2355 /// Registers non-vectorizable sequence of loads
2356 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2357 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2358 }
2359
2360 /// Checks if the given loads sequence is known as not vectorizable
2361 template <typename T>
2363 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2364 }
2365
2367
2368 /// This structure holds any data we need about the edges being traversed
2369 /// during buildTreeRec(). We keep track of:
2370 /// (i) the user TreeEntry index, and
2371 /// (ii) the index of the edge.
2372 struct EdgeInfo {
2373 EdgeInfo() = default;
2374 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2375 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2376 /// The user TreeEntry.
2377 TreeEntry *UserTE = nullptr;
2378 /// The operand index of the use.
2379 unsigned EdgeIdx = UINT_MAX;
2380#ifndef NDEBUG
2381 friend inline raw_ostream &operator<<(raw_ostream &OS,
2382 const BoUpSLP::EdgeInfo &EI) {
2383 EI.dump(OS);
2384 return OS;
2385 }
2386 /// Debug print.
2387 void dump(raw_ostream &OS) const {
2388 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2389 << " EdgeIdx:" << EdgeIdx << "}";
2390 }
2391 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2392#endif
2393 bool operator == (const EdgeInfo &Other) const {
2394 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2395 }
2396
2397 operator bool() const { return UserTE != nullptr; }
2398 };
2399 friend struct DenseMapInfo<EdgeInfo>;
2400
2401 /// A helper class used for scoring candidates for two consecutive lanes.
2403 const TargetLibraryInfo &TLI;
2404 const DataLayout &DL;
2405 ScalarEvolution &SE;
2406 const BoUpSLP &R;
2407 int NumLanes; // Total number of lanes (aka vectorization factor).
2408 int MaxLevel; // The maximum recursion depth for accumulating score.
2409
2410 public:
2412 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2413 int MaxLevel)
2414 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2415 MaxLevel(MaxLevel) {}
2416
2417 // The hard-coded scores listed here are not very important, though it shall
2418 // be higher for better matches to improve the resulting cost. When
2419 // computing the scores of matching one sub-tree with another, we are
2420 // basically counting the number of values that are matching. So even if all
2421 // scores are set to 1, we would still get a decent matching result.
2422 // However, sometimes we have to break ties. For example we may have to
2423 // choose between matching loads vs matching opcodes. This is what these
2424 // scores are helping us with: they provide the order of preference. Also,
2425 // this is important if the scalar is externally used or used in another
2426 // tree entry node in the different lane.
2427
2428 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2429 static const int ScoreConsecutiveLoads = 4;
2430 /// The same load multiple times. This should have a better score than
2431 /// `ScoreSplat` because, on x86, for a 2-lane vector we can represent it
2432 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2433 /// for a vector load plus 1.0 for a broadcast.
2434 static const int ScoreSplatLoads = 3;
2435 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2436 static const int ScoreReversedLoads = 3;
2437 /// A load candidate for masked gather.
2438 static const int ScoreMaskedGatherCandidate = 1;
2439 /// ExtractElementInst from same vector and consecutive indexes.
2440 static const int ScoreConsecutiveExtracts = 4;
2441 /// ExtractElementInst from same vector and reversed indices.
2442 static const int ScoreReversedExtracts = 3;
2443 /// Constants.
2444 static const int ScoreConstants = 2;
2445 /// Instructions with the same opcode.
2446 static const int ScoreSameOpcode = 2;
2447 /// Instructions with alt opcodes (e.g, add + sub).
2448 static const int ScoreAltOpcodes = 1;
2449 /// Identical instructions (a.k.a. splat or broadcast).
2450 static const int ScoreSplat = 1;
2451 /// Matching with an undef is preferable to failing.
2452 static const int ScoreUndef = 1;
2453 /// Score for failing to find a decent match.
2454 static const int ScoreFail = 0;
2455 /// Score if all users are vectorized.
2456 static const int ScoreAllUserVectorized = 1;
2457
2458 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2459 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2460 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2461 /// MainAltOps.
2462 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2463 ArrayRef<Value *> MainAltOps) const {
2464 if (!isValidElementType(V1->getType()) ||
2465 !isValidElementType(V2->getType()))
2466 return LookAheadHeuristics::ScoreFail;
2467
2468 if (V1 == V2) {
2469 if (isa<LoadInst>(V1)) {
2470 // Returns true if the users of V1 and V2 won't need to be extracted.
2471 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2472 // Bail out if we have too many uses to save compilation time.
2473 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2474 return false;
2475
2476 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2477 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2478 return U == U1 || U == U2 || R.isVectorized(U);
2479 });
2480 };
2481 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2482 };
2483 // A broadcast of a load can be cheaper on some targets.
2484 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2485 ElementCount::getFixed(NumLanes)) &&
2486 ((int)V1->getNumUses() == NumLanes ||
2487 AllUsersAreInternal(V1, V2)))
2488 return LookAheadHeuristics::ScoreSplatLoads;
2489 }
2490 return LookAheadHeuristics::ScoreSplat;
2491 }
2492
2493 auto CheckSameEntryOrFail = [&]() {
2494 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2495 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2496 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2497 !TEs2.empty() &&
2498 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2499 return LookAheadHeuristics::ScoreSplatLoads;
2500 }
2501 return LookAheadHeuristics::ScoreFail;
2502 };
2503
2504 auto *LI1 = dyn_cast<LoadInst>(V1);
2505 auto *LI2 = dyn_cast<LoadInst>(V2);
2506 if (LI1 && LI2) {
2507 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2508 !LI2->isSimple())
2509 return CheckSameEntryOrFail();
2510
2511 std::optional<int64_t> Dist = getPointersDiff(
2512 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2513 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2514 if (!Dist || *Dist == 0) {
2515 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2516 getUnderlyingObject(LI2->getPointerOperand()) &&
2517 R.TTI->isLegalMaskedGather(
2518 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2519 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2520 return CheckSameEntryOrFail();
2521 }
2522 // The distance is too large - still may be profitable to use masked
2523 // loads/gathers.
2524 if (std::abs(*Dist) > NumLanes / 2)
2525 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2526 // This still will detect consecutive loads, but we might have "holes"
2527 // in some cases. It is ok for non-power-2 vectorization and may produce
2528 // better results. It should not affect current vectorization.
2529 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2530 : LookAheadHeuristics::ScoreReversedLoads;
2531 }
2532
2533 auto *C1 = dyn_cast<Constant>(V1);
2534 auto *C2 = dyn_cast<Constant>(V2);
2535 if (C1 && C2)
2536 return LookAheadHeuristics::ScoreConstants;
2537
2538 // Consider constants and buildvector compatible.
2539 if ((C1 && isa<InsertElementInst>(V2)) ||
2540 (C2 && isa<InsertElementInst>(V1)))
2541 return LookAheadHeuristics::ScoreConstants;
2542
2543 // Extracts from consecutive indexes of the same vector score better, as
2544 // the extracts could be optimized away.
2545 Value *EV1;
2546 ConstantInt *Ex1Idx;
2547 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2548 // Undefs are always profitable for extractelements.
2549 // Compiler can easily combine poison and extractelement <non-poison> or
2550 // undef and extractelement <poison>. But combining undef +
2551 // extractelement <non-poison-but-may-produce-poison> requires some
2552 // extra operations.
2553 if (isa<UndefValue>(V2))
2554 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2555 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2556 : LookAheadHeuristics::ScoreSameOpcode;
2557 Value *EV2 = nullptr;
2558 ConstantInt *Ex2Idx = nullptr;
2559 if (match(V2,
2560 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2561 m_Undef())))) {
2562 // Undefs are always profitable for extractelements.
2563 if (!Ex2Idx)
2564 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2565 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2566 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2567 if (EV2 == EV1) {
2568 int Idx1 = Ex1Idx->getZExtValue();
2569 int Idx2 = Ex2Idx->getZExtValue();
2570 int Dist = Idx2 - Idx1;
2571 // The distance is too large - still may be profitable to use
2572 // shuffles.
2573 if (std::abs(Dist) == 0)
2574 return LookAheadHeuristics::ScoreSplat;
2575 if (std::abs(Dist) > NumLanes / 2)
2576 return LookAheadHeuristics::ScoreSameOpcode;
2577 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2578 : LookAheadHeuristics::ScoreReversedExtracts;
2579 }
2580 return LookAheadHeuristics::ScoreAltOpcodes;
2581 }
2582 return CheckSameEntryOrFail();
2583 }
2584
2585 auto *I1 = dyn_cast<Instruction>(V1);
2586 auto *I2 = dyn_cast<Instruction>(V2);
2587 if (I1 && I2) {
2588 if (I1->getParent() != I2->getParent())
2589 return CheckSameEntryOrFail();
2590 SmallVector<Value *, 4> Ops(MainAltOps);
2591 Ops.push_back(I1);
2592 Ops.push_back(I2);
2593 InstructionsState S = getSameOpcode(Ops, TLI);
2594 // Note: Only consider instructions with <= 2 operands to avoid
2595 // complexity explosion.
2596 if (S &&
2597 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2598 !S.isAltShuffle()) &&
2599 all_of(Ops, [&S](Value *V) {
2600 return isa<PoisonValue>(V) ||
2601 cast<Instruction>(V)->getNumOperands() ==
2602 S.getMainOp()->getNumOperands();
2603 }))
2604 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2605 : LookAheadHeuristics::ScoreSameOpcode;
2606 }
2607
2608 if (I1 && isa<PoisonValue>(V2))
2609 return LookAheadHeuristics::ScoreSameOpcode;
2610
2611 if (isa<UndefValue>(V2))
2612 return LookAheadHeuristics::ScoreUndef;
2613
2614 return CheckSameEntryOrFail();
2615 }
2616
2617 /// Go through the operands of \p LHS and \p RHS recursively until
2618 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2619 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2620 /// of \p U1 and \p U2), except at the beginning of the recursion where
2621 /// these are set to nullptr.
2622 ///
2623 /// For example:
2624 /// \verbatim
2625 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2626 /// \ / \ / \ / \ /
2627 /// + + + +
2628 /// G1 G2 G3 G4
2629 /// \endverbatim
2630 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2631 /// each level recursively, accumulating the score. It starts from matching
2632 /// the additions at level 0, then moves on to the loads (level 1). The
2633 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2634 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2635 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2636 /// Please note that the order of the operands does not matter, as we
2637 /// evaluate the score of all profitable combinations of operands. In
2638 /// other words the score of G1 and G4 is the same as G1 and G2. This
2639 /// heuristic is based on ideas described in:
2640 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2641 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2642 /// Luís F. W. Góes
2644 Instruction *U2, int CurrLevel,
2645 ArrayRef<Value *> MainAltOps) const {
2646
2647 // Get the shallow score of V1 and V2.
2648 int ShallowScoreAtThisLevel =
2649 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2650
2651 // If reached MaxLevel,
2652 // or if V1 and V2 are not instructions,
2653 // or if they are SPLAT,
2654 // or if they are not consecutive,
2655 // or if profitable to vectorize loads or extractelements, early return
2656 // the current cost.
2657 auto *I1 = dyn_cast<Instruction>(LHS);
2658 auto *I2 = dyn_cast<Instruction>(RHS);
2659 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2660 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2661 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2662 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2663 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2664 ShallowScoreAtThisLevel))
2665 return ShallowScoreAtThisLevel;
2666 assert(I1 && I2 && "Should have early exited.");
2667
2668 // Contains the I2 operand indexes that got matched with I1 operands.
2669 SmallSet<unsigned, 4> Op2Used;
2670
2671 // Recursion towards the operands of I1 and I2. We are trying all possible
2672 // operand pairs, and keeping track of the best score.
2673 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2674 OpIdx1 != NumOperands1; ++OpIdx1) {
2675 // Try to pair op1I with the best operand of I2.
2676 int MaxTmpScore = 0;
2677 unsigned MaxOpIdx2 = 0;
2678 bool FoundBest = false;
2679 // If I2 is commutative try all combinations.
2680 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2681 unsigned ToIdx = isCommutative(I2)
2682 ? I2->getNumOperands()
2683 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2684 assert(FromIdx <= ToIdx && "Bad index");
2685 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2686 // Skip operands already paired with OpIdx1.
2687 if (Op2Used.count(OpIdx2))
2688 continue;
2689 // Recursively calculate the cost at each level
2690 int TmpScore =
2691 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2692 I1, I2, CurrLevel + 1, {});
2693 // Look for the best score.
2694 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2695 TmpScore > MaxTmpScore) {
2696 MaxTmpScore = TmpScore;
2697 MaxOpIdx2 = OpIdx2;
2698 FoundBest = true;
2699 }
2700 }
2701 if (FoundBest) {
2702 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2703 Op2Used.insert(MaxOpIdx2);
2704 ShallowScoreAtThisLevel += MaxTmpScore;
2705 }
2706 }
2707 return ShallowScoreAtThisLevel;
2708 }
2709 };
2710 /// A helper data structure to hold the operands of a vector of instructions.
2711 /// This supports a fixed vector length for all operand vectors.
2713 /// For each operand we need (i) the value, and (ii) the opcode that it
2714 /// would be attached to if the expression was in a left-linearized form.
2715 /// This is required to avoid illegal operand reordering.
2716 /// For example:
2717 /// \verbatim
2718 /// 0 Op1
2719 /// |/
2720 /// Op1 Op2 Linearized + Op2
2721 /// \ / ----------> |/
2722 /// - -
2723 ///
2724 /// Op1 - Op2 (0 + Op1) - Op2
2725 /// \endverbatim
2726 ///
2727 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2728 ///
2729 /// Another way to think of this is to track all the operations across the
2730 /// path from the operand all the way to the root of the tree and to
2731 /// calculate the operation that corresponds to this path. For example, the
2732 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2733 /// corresponding operation is a '-' (which matches the one in the
2734 /// linearized tree, as shown above).
2735 ///
2736 /// For lack of a better term, we refer to this operation as Accumulated
2737 /// Path Operation (APO).
2738 struct OperandData {
2739 OperandData() = default;
2740 OperandData(Value *V, bool APO, bool IsUsed)
2741 : V(V), APO(APO), IsUsed(IsUsed) {}
2742 /// The operand value.
2743 Value *V = nullptr;
2744 /// TreeEntries only allow a single opcode, or an alternate sequence of
2745 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2746 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2747 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2748 /// (e.g., Add/Mul)
2749 bool APO = false;
2750 /// Helper data for the reordering function.
2751 bool IsUsed = false;
2752 };
2753
2754 /// During operand reordering, we are trying to select the operand at lane
2755 /// that matches best with the operand at the neighboring lane. Our
2756 /// selection is based on the type of value we are looking for. For example,
2757 /// if the neighboring lane has a load, we need to look for a load that is
2758 /// accessing a consecutive address. These strategies are summarized in the
2759 /// 'ReorderingMode' enumerator.
2760 enum class ReorderingMode {
2761 Load, ///< Matching loads to consecutive memory addresses
2762 Opcode, ///< Matching instructions based on opcode (same or alternate)
2763 Constant, ///< Matching constants
2764 Splat, ///< Matching the same instruction multiple times (broadcast)
2765 Failed, ///< We failed to create a vectorizable group
2766 };
2767
2768 using OperandDataVec = SmallVector<OperandData, 2>;
2769
2770 /// A vector of operand vectors.
2772 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2773 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2774 unsigned ArgSize = 0;
2775
2776 const TargetLibraryInfo &TLI;
2777 const DataLayout &DL;
2778 ScalarEvolution &SE;
2779 const BoUpSLP &R;
2780 const Loop *L = nullptr;
2781
2782 /// \returns the operand data at \p OpIdx and \p Lane.
2783 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2784 return OpsVec[OpIdx][Lane];
2785 }
2786
2787 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2788 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2789 return OpsVec[OpIdx][Lane];
2790 }
2791
2792 /// Clears the used flag for all entries.
2793 void clearUsed() {
2794 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2795 OpIdx != NumOperands; ++OpIdx)
2796 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2797 ++Lane)
2798 OpsVec[OpIdx][Lane].IsUsed = false;
2799 }
2800
2801 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2802 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2803 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2804 }
2805
2806 /// \param Lane lane of the operands under analysis.
2807 /// \param OpIdx operand index in \p Lane lane we're looking for the best
2808 /// candidate for.
2809 /// \param Idx operand index of the current candidate value.
2810 /// \returns The additional score due to possible broadcasting of the
2811 /// elements in the lane. It is more profitable to have a power-of-2 number
2812 /// of unique elements in the lane, as it will be vectorized with higher
2813 /// probability after removing duplicates. Currently the SLP vectorizer
2814 /// supports only vectorization of a power-of-2 number of unique scalars.
2815 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2816 const SmallBitVector &UsedLanes) const {
2817 Value *IdxLaneV = getData(Idx, Lane).V;
2818 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2819 isa<ExtractElementInst>(IdxLaneV))
2820 return 0;
2821 SmallDenseMap<Value *, unsigned, 4> Uniques;
2822 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2823 if (Ln == Lane)
2824 continue;
2825 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2826 if (!isa<Instruction>(OpIdxLnV))
2827 return 0;
2828 Uniques.try_emplace(OpIdxLnV, Ln);
2829 }
2830 unsigned UniquesCount = Uniques.size();
2831 auto IdxIt = Uniques.find(IdxLaneV);
2832 unsigned UniquesCntWithIdxLaneV =
2833 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2834 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2835 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2836 unsigned UniquesCntWithOpIdxLaneV =
2837 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2838 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2839 return 0;
2840 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2841 UniquesCntWithOpIdxLaneV,
2842 UniquesCntWithOpIdxLaneV -
2843 bit_floor(UniquesCntWithOpIdxLaneV)) -
2844 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2845 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2846 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2847 }
2848
2849 /// \param Lane lane of the operands under analysis.
2850 /// \param OpIdx operand index in \p Lane lane we're looking for the best
2851 /// candidate for.
2852 /// \param Idx operand index of the current candidate value.
2853 /// \returns The additional score for the scalar which users are all
2854 /// vectorized.
2855 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2856 Value *IdxLaneV = getData(Idx, Lane).V;
2857 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2858 // Do not care about number of uses for vector-like instructions
2859 // (extractelement/extractvalue with constant indices), they are extracts
2860 // themselves and already externally used. Vectorization of such
2861 // instructions does not add extra extractelement instruction, just may
2862 // remove it.
2863 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2864 isVectorLikeInstWithConstOps(OpIdxLaneV))
2865 return LookAheadHeuristics::ScoreAllUserVectorized;
2866 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2867 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2868 return 0;
2869 return R.areAllUsersVectorized(IdxLaneI)
2870 ? LookAheadHeuristics::ScoreAllUserVectorized
2871 : 0;
2872 }
2873
2874 /// Score scaling factor for fully compatible instructions but with
2875 /// different number of external uses. Allows better selection of the
2876 /// instructions with less external uses.
2877 static const int ScoreScaleFactor = 10;
2878
2879 /// \Returns the look-ahead score, which tells us how much the sub-trees
2880 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2881 /// score. This helps break ties in an informed way when we cannot decide on
2882 /// the order of the operands by just considering the immediate
2883 /// predecessors.
2884 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2885 int Lane, unsigned OpIdx, unsigned Idx,
2886 bool &IsUsed, const SmallBitVector &UsedLanes) {
2887 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2888 LookAheadMaxDepth);
2889 // Keep track of the instruction stack as we recurse into the operands
2890 // during the look-ahead score exploration.
2891 int Score =
2892 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2893 /*CurrLevel=*/1, MainAltOps);
2894 if (Score) {
2895 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2896 if (Score <= -SplatScore) {
2897 // Failed score.
2898 Score = 0;
2899 } else {
2900 Score += SplatScore;
2901 // Scale score to see the difference between different operands
2902 // and similar operands but all vectorized/not all vectorized
2903 // uses. It does not affect actual selection of the best
2904 // compatible operand in general, just allows to select the
2905 // operand with all vectorized uses.
2906 Score *= ScoreScaleFactor;
2907 Score += getExternalUseScore(Lane, OpIdx, Idx);
2908 IsUsed = true;
2909 }
2910 }
2911 return Score;
2912 }
2913
2914 /// Best defined scores per lanes between the passes. Used to choose the
2915 /// best operand (with the highest score) between the passes.
2916 /// The key - {Operand Index, Lane}.
2917 /// The value - the best score between the passes for the lane and the
2918 /// operand.
2920 BestScoresPerLanes;
2921
2922 // Search all operands in Ops[*][Lane] for the one that matches best
2923 // Ops[OpIdx][LastLane] and return its operand index.
2924 // If no good match can be found, return std::nullopt.
2925 std::optional<unsigned>
2926 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2927 ArrayRef<ReorderingMode> ReorderingModes,
2928 ArrayRef<Value *> MainAltOps,
2929 const SmallBitVector &UsedLanes) {
2930 unsigned NumOperands = getNumOperands();
2931
2932 // The operand of the previous lane at OpIdx.
2933 Value *OpLastLane = getData(OpIdx, LastLane).V;
2934
2935 // Our strategy mode for OpIdx.
2936 ReorderingMode RMode = ReorderingModes[OpIdx];
2937 if (RMode == ReorderingMode::Failed)
2938 return std::nullopt;
2939
2940 // The linearized opcode of the operand at OpIdx, Lane.
2941 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2942
2943 // The best operand index and its score.
2944 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2945 // are using the score to differentiate between the two.
2946 struct BestOpData {
2947 std::optional<unsigned> Idx;
2948 unsigned Score = 0;
2949 } BestOp;
2950 BestOp.Score =
2951 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2952 .first->second;
2953
2954 // Track if the operand must be marked as used. If the operand is set to
2955 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars,
2956 // we may want to reestimate the operands again on the following iterations).
2957 bool IsUsed = RMode == ReorderingMode::Splat ||
2958 RMode == ReorderingMode::Constant ||
2959 RMode == ReorderingMode::Load;
2960 // Iterate through all unused operands and look for the best.
2961 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2962 // Get the operand at Idx and Lane.
2963 OperandData &OpData = getData(Idx, Lane);
2964 Value *Op = OpData.V;
2965 bool OpAPO = OpData.APO;
2966
2967 // Skip already selected operands.
2968 if (OpData.IsUsed)
2969 continue;
2970
2971 // Skip if we are trying to move the operand to a position with a
2972 // different opcode in the linearized tree form. This would break the
2973 // semantics.
2974 if (OpAPO != OpIdxAPO)
2975 continue;
2976
2977 // Look for an operand that matches the current mode.
2978 switch (RMode) {
2979 case ReorderingMode::Load:
2980 case ReorderingMode::Opcode: {
2981 bool LeftToRight = Lane > LastLane;
2982 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2983 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2984 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2985 OpIdx, Idx, IsUsed, UsedLanes);
2986 if (Score > static_cast<int>(BestOp.Score) ||
2987 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2988 Idx == OpIdx)) {
2989 BestOp.Idx = Idx;
2990 BestOp.Score = Score;
2991 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2992 }
2993 break;
2994 }
2995 case ReorderingMode::Constant:
2996 if (isa<Constant>(Op) ||
2997 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2998 BestOp.Idx = Idx;
2999 if (isa<Constant>(Op)) {
3000 BestOp.Score = LookAheadHeuristics::ScoreConstants;
3001 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3002 LookAheadHeuristics::ScoreConstants;
3003 }
3004 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
3005 IsUsed = false;
3006 }
3007 break;
3008 case ReorderingMode::Splat:
3009 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
3010 IsUsed = Op == OpLastLane;
3011 if (Op == OpLastLane) {
3012 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3013 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3014 LookAheadHeuristics::ScoreSplat;
3015 }
3016 BestOp.Idx = Idx;
3017 }
3018 break;
3019 case ReorderingMode::Failed:
3020 llvm_unreachable("Not expected Failed reordering mode.");
3021 }
3022 }
3023
3024 if (BestOp.Idx) {
3025 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3026 return BestOp.Idx;
3027 }
3028 // If we could not find a good match return std::nullopt.
3029 return std::nullopt;
3030 }
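// Illustrative sketch (not part of SLPVectorizer.cpp; all names below are
// hypothetical) of the greedy per-lane selection performed by getBestOperand():
// among the not-yet-used operands of the current lane, pick the index whose
// pairing with the previous lane's choice maximizes a look-ahead style score,
// or report "no good match" so a later pass can retry.
#include <optional>
#include <vector>

static std::optional<unsigned>
pickBestOperand(const std::vector<int> &LaneOps, int PrevLaneChoice,
                const std::vector<bool> &Used, int (*Score)(int, int)) {
  std::optional<unsigned> Best;
  int BestScore = 0;
  for (unsigned Idx = 0; Idx != LaneOps.size(); ++Idx) {
    if (Used[Idx])
      continue; // Skip operands already matched to another position.
    int S = Score(PrevLaneChoice, LaneOps[Idx]);
    if (S > BestScore) {
      BestScore = S;
      Best = Idx;
    }
  }
  return Best; // std::nullopt means "defer the decision to a later pass".
}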
3031
3032 /// Helper for reorder().
3033 /// \returns the lane that we should start reordering from. This is the one
3034 /// which has the least number of operands that can freely move about, or is
3035 /// less profitable because it already has the most optimal set of operands.
3036 unsigned getBestLaneToStartReordering() const {
3037 unsigned Min = UINT_MAX;
3038 unsigned SameOpNumber = 0;
3039 // std::pair<unsigned, unsigned> is used to implement a simple voting
3040 // algorithm and choose the lane with the least number of operands that
3041 // can freely move about, or is less profitable because it already has the
3042 // most optimal set of operands. The first unsigned is a counter for
3043 // voting, the second unsigned is the counter of lanes with instructions
3044 // with same/alternate opcodes and same parent basic block.
3045 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
3046 // Try to be closer to the original results, if we have multiple lanes
3047 // with same cost. If 2 lanes have the same cost, use the one with the
3048 // highest index.
3049 for (int I = getNumLanes(); I > 0; --I) {
3050 unsigned Lane = I - 1;
3051 OperandsOrderData NumFreeOpsHash =
3052 getMaxNumOperandsThatCanBeReordered(Lane);
3053 // Compare the number of operands that can move and choose the one with
3054 // the least number.
3055 if (NumFreeOpsHash.NumOfAPOs < Min) {
3056 Min = NumFreeOpsHash.NumOfAPOs;
3057 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3058 HashMap.clear();
3059 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3060 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3061 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3062 // Select the most optimal lane in terms of number of operands that
3063 // should be moved around.
3064 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3065 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3066 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3067 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3068 auto [It, Inserted] =
3069 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3070 if (!Inserted)
3071 ++It->second.first;
3072 }
3073 }
3074 // Select the lane with the minimum counter.
3075 unsigned BestLane = 0;
3076 unsigned CntMin = UINT_MAX;
3077 for (const auto &Data : reverse(HashMap)) {
3078 if (Data.second.first < CntMin) {
3079 CntMin = Data.second.first;
3080 BestLane = Data.second.second;
3081 }
3082 }
3083 return BestLane;
3084 }
3085
3086 /// Data structure that helps to reorder operands.
3087 struct OperandsOrderData {
3088 /// The best number of operands with the same APOs, which can be
3089 /// reordered.
3090 unsigned NumOfAPOs = UINT_MAX;
3091 /// Number of operands with the same/alternate instruction opcode and
3092 /// parent.
3093 unsigned NumOpsWithSameOpcodeParent = 0;
3094 /// Hash for the actual operands ordering.
3095 /// Used to count operands, actually their position id and opcode
3096 /// value. It is used in the voting mechanism to find the lane with the
3097 /// least number of operands that can freely move about, or is less profitable
3098 /// because it already has the most optimal set of operands. Could be
3099 /// replaced with a SmallVector<unsigned>, but the hash code is faster
3100 /// and requires less memory.
3101 unsigned Hash = 0;
3102 };
3103 /// \returns the maximum number of operands that are allowed to be reordered
3104 /// for \p Lane and the number of compatible instructions (with the same
3105 /// parent/opcode). This is used as a heuristic for selecting the first lane
3106 /// to start operand reordering.
3107 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3108 unsigned CntTrue = 0;
3109 unsigned NumOperands = getNumOperands();
3110 // Operands with the same APO can be reordered. We therefore need to count
3111 // how many of them we have for each APO, like this: Cnt[APO] = x.
3112 // Since we only have two APOs, namely true and false, we can avoid using
3113 // a map. Instead we can simply count the number of operands that
3114 // correspond to one of them (in this case the 'true' APO), and calculate
3115 // the other by subtracting it from the total number of operands.
3116 // Operands with the same instruction opcode and parent are more
3117 // profitable since we don't need to move them in many cases; with high
3118 // probability such a lane can already be vectorized effectively.
3119 bool AllUndefs = true;
3120 unsigned NumOpsWithSameOpcodeParent = 0;
3121 Instruction *OpcodeI = nullptr;
3122 BasicBlock *Parent = nullptr;
3123 unsigned Hash = 0;
3124 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3125 const OperandData &OpData = getData(OpIdx, Lane);
3126 if (OpData.APO)
3127 ++CntTrue;
3128 // Use Boyer-Moore majority voting for finding the majority opcode and
3129 // the number of times it occurs.
3130 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3131 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3132 I->getParent() != Parent) {
3133 if (NumOpsWithSameOpcodeParent == 0) {
3134 NumOpsWithSameOpcodeParent = 1;
3135 OpcodeI = I;
3136 Parent = I->getParent();
3137 } else {
3138 --NumOpsWithSameOpcodeParent;
3139 }
3140 } else {
3141 ++NumOpsWithSameOpcodeParent;
3142 }
3143 }
3144 Hash = hash_combine(
3145 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3146 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3147 }
3148 if (AllUndefs)
3149 return {};
3150 OperandsOrderData Data;
3151 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3152 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3153 Data.Hash = Hash;
3154 return Data;
3155 }
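// Illustrative sketch (not part of SLPVectorizer.cpp) of the Boyer-Moore
// majority vote mentioned in the comment above: it finds the dominant
// opcode/parent key among a lane's operands with O(1) extra state, by keeping
// one candidate and a counter that mismatches decrement.
#include <vector>

static int majorityCandidate(const std::vector<int> &Keys) {
  int Candidate = -1;
  unsigned Count = 0;
  for (int K : Keys) {
    if (Count == 0) {
      Candidate = K; // Adopt a new candidate.
      Count = 1;
    } else if (K == Candidate) {
      ++Count; // A vote for the current candidate.
    } else {
      --Count; // A vote against it.
    }
  }
  return Candidate; // The majority key, if one exists.
}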
3156
3157 /// Go through the instructions in VL and append their operands.
3158 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3159 const InstructionsState &S) {
3160 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3161 assert((empty() || all_of(Operands,
3162 [this](const ValueList &VL) {
3163 return VL.size() == getNumLanes();
3164 })) &&
3165 "Expected same number of lanes");
3166 assert(S.valid() && "InstructionsState is invalid.");
3167 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3168 // arguments to the intrinsic produces the same result.
3169 Instruction *MainOp = S.getMainOp();
3170 unsigned NumOperands = MainOp->getNumOperands();
3171 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3172 OpsVec.resize(ArgSize);
3173 unsigned NumLanes = VL.size();
3174 for (OperandDataVec &Ops : OpsVec)
3175 Ops.resize(NumLanes);
3176 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3177 // Our tree has just 3 nodes: the root and two operands.
3178 // It is therefore trivial to get the APO. We only need to check the
3179 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3180 // operand. The LHS operand of both add and sub is never attached to an
3181 // inverse operation in the linearized form, therefore its APO is
3182 // false. The RHS is true only if V is an inverse operation.
3183
3184 // Since operand reordering is performed on groups of commutative
3185 // operations or alternating sequences (e.g., +, -), we can safely tell
3186 // the inverse operations by checking commutativity.
3187 auto *I = dyn_cast<Instruction>(VL[Lane]);
3188 if (!I && isa<PoisonValue>(VL[Lane])) {
3189 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3190 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3191 continue;
3192 }
3193 bool IsInverseOperation = false;
3194 if (S.isCopyableElement(VL[Lane])) {
3195 // The value is a copyable element.
3196 IsInverseOperation =
3197 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3198 } else {
3199 assert(I && "Expected instruction");
3200 auto [SelectedOp, Ops] = convertTo(I, S);
3201 // We cannot check commutativity by the converted instruction
3202 // (SelectedOp) because isCommutative also examines def-use
3203 // relationships.
3204 IsInverseOperation = !isCommutative(SelectedOp, I);
3205 }
3206 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3207 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3208 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3209 }
3210 }
3211 }
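// Illustrative sketch (not part of SLPVectorizer.cpp; the struct and function
// are hypothetical) of how the APO flag computed above is assigned for a
// simple add/sub lane: the LHS is never attached to an inverse operation in
// the linearized form (APO = false), while the RHS is flagged exactly when the
// lane's instruction is an inverse (non-commutative, e.g. sub) operation, so
// that reordering only ever swaps operands with matching APO.
struct LaneOperand {
  unsigned OpIdx; // 0 for the LHS, 1 for the RHS.
  bool APO;       // True only for the RHS of an inverse (sub-like) operation.
};

static void assignAPO(bool IsInverseOperation, LaneOperand Ops[2]) {
  Ops[0] = {0, false};              // LHS: never inverted.
  Ops[1] = {1, IsInverseOperation}; // RHS: inverted only for sub-like ops.
}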
3212
3213 /// \returns the number of operands.
3214 unsigned getNumOperands() const { return ArgSize; }
3215
3216 /// \returns the number of lanes.
3217 unsigned getNumLanes() const { return OpsVec[0].size(); }
3218
3219 /// \returns the operand value at \p OpIdx and \p Lane.
3220 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3221 return getData(OpIdx, Lane).V;
3222 }
3223
3224 /// \returns true if the data structure is empty.
3225 bool empty() const { return OpsVec.empty(); }
3226
3227 /// Clears the data.
3228 void clear() { OpsVec.clear(); }
3229
3230 /// \returns true if there are enough operands identical to \p Op to fill
3231 /// the whole vector (possibly mixed with constants or loop-invariant values).
3232 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3233 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3234 assert(Op == getValue(OpIdx, Lane) &&
3235 "Op is expected to be getValue(OpIdx, Lane).");
3236 // Small number of loads - try load matching.
3237 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3238 return false;
3239 bool OpAPO = getData(OpIdx, Lane).APO;
3240 bool IsInvariant = L && L->isLoopInvariant(Op);
3241 unsigned Cnt = 0;
3242 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3243 if (Ln == Lane)
3244 continue;
3245 // This is set to true if we found a candidate for broadcast at Lane.
3246 bool FoundCandidate = false;
3247 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3248 OperandData &Data = getData(OpI, Ln);
3249 if (Data.APO != OpAPO || Data.IsUsed)
3250 continue;
3251 Value *OpILane = getValue(OpI, Lane);
3252 bool IsConstantOp = isa<Constant>(OpILane);
3253 // Consider the broadcast candidate if:
3254 // 1. Same value is found in one of the operands.
3255 if (Data.V == Op ||
3256 // 2. The operand in the given lane is not constant but there is a
3257 // constant operand in another lane (which can be moved to the
3258 // given lane). In this case we can represent it as a simple
3259 // permutation of constant and broadcast.
3260 (!IsConstantOp &&
3261 ((Lns > 2 && isa<Constant>(Data.V)) ||
3262 // 2.1. If we have only 2 lanes, need to check that value in the
3263 // next lane does not build same opcode sequence.
3264 (Lns == 2 &&
3265 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3266 isa<Constant>(Data.V)))) ||
3267 // 3. The operand in the current lane is loop invariant (can be
3268 // hoisted out) and another operand is also a loop invariant
3269 // (though not a constant). In this case the whole vector can be
3270 // hoisted out.
3271 // FIXME: need to teach the cost model about this case for better
3272 // estimation.
3273 (IsInvariant && !isa<Constant>(Data.V) &&
3274 !getSameOpcode({Op, Data.V}, TLI) &&
3275 L->isLoopInvariant(Data.V))) {
3276 FoundCandidate = true;
3277 Data.IsUsed = Data.V == Op;
3278 if (Data.V == Op)
3279 ++Cnt;
3280 break;
3281 }
3282 }
3283 if (!FoundCandidate)
3284 return false;
3285 }
3286 return getNumLanes() == 2 || Cnt > 1;
3287 }
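// Illustrative sketch (not part of SLPVectorizer.cpp; names and the constant
// predicate are hypothetical) of the broadcast test above, reduced to its
// core: every other lane must offer either the same value again or a constant
// that could be permuted into place, and with more than two lanes the value
// itself must repeat at least twice to justify a splat.
#include <vector>

static bool worthBroadcasting(int Op,
                              const std::vector<std::vector<int>> &Lanes,
                              unsigned HomeLane, bool (*IsConstant)(int)) {
  unsigned Repeats = 0;
  for (unsigned Ln = 0; Ln != Lanes.size(); ++Ln) {
    if (Ln == HomeLane)
      continue;
    bool Found = false;
    for (int Candidate : Lanes[Ln]) {
      if (Candidate == Op || IsConstant(Candidate)) {
        Found = true;
        Repeats += (Candidate == Op);
        break;
      }
    }
    if (!Found)
      return false; // Some lane provides neither the value nor a constant.
  }
  return Lanes.size() == 2 || Repeats > 1;
}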
3288
3289 /// Checks if there is at least a single operand in lanes other than
3290 /// \p Lane that is compatible with the operand \p Op.
3291 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3292 assert(Op == getValue(OpIdx, Lane) &&
3293 "Op is expected to be getValue(OpIdx, Lane).");
3294 bool OpAPO = getData(OpIdx, Lane).APO;
3295 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3296 if (Ln == Lane)
3297 continue;
3298 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3299 const OperandData &Data = getData(OpI, Ln);
3300 if (Data.APO != OpAPO || Data.IsUsed)
3301 return true;
3302 Value *OpILn = getValue(OpI, Ln);
3303 return (L && L->isLoopInvariant(OpILn)) ||
3304 (getSameOpcode({Op, OpILn}, TLI) &&
3305 allSameBlock({Op, OpILn}));
3306 }))
3307 return true;
3308 }
3309 return false;
3310 }
3311
3312 public:
3313 /// Initialize with all the operands of the instruction vector \p RootVL.
3314 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3315 const InstructionsState &S, const BoUpSLP &R)
3316 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3317 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3318 // Append all the operands of RootVL.
3319 appendOperands(RootVL, Operands, S);
3320 }
3321
3322 /// \returns a value vector with the operands across all lanes for the
3323 /// operand at \p OpIdx.
3324 ValueList getVL(unsigned OpIdx) const {
3325 ValueList OpVL(OpsVec[OpIdx].size());
3326 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3327 "Expected same num of lanes across all operands");
3328 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3329 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3330 return OpVL;
3331 }
3332
3333 // Performs operand reordering for 2 or more operands.
3334 // The original operands are in OrigOps[OpIdx][Lane].
3335 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3336 void reorder() {
3337 unsigned NumOperands = getNumOperands();
3338 unsigned NumLanes = getNumLanes();
3339 // Each operand has its own mode. We are using this mode to help us select
3340 // the instructions for each lane, so that they match best with the ones
3341 // we have selected so far.
3342 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3343
3344 // This is a greedy single-pass algorithm. We are going over each lane
3345 // once and deciding on the best order right away with no back-tracking.
3346 // However, in order to increase its effectiveness, we start with the lane
3347 // that has operands that can move the least. For example, given the
3348 // following lanes:
3349 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3350 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3351 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3352 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3353 // we will start at Lane 1, since the operands of the subtraction cannot
3354 // be reordered. Then we will visit the rest of the lanes in a circular
3355 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3356
3357 // Find the first lane that we will start our search from.
3358 unsigned FirstLane = getBestLaneToStartReordering();
3359
3360 // Initialize the modes.
3361 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3362 Value *OpLane0 = getValue(OpIdx, FirstLane);
3363 // Keep track if we have instructions with all the same opcode on one
3364 // side.
3365 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3366 // Check if OpLane0 should be broadcast.
3367 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3368 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3369 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3370 else if (isa<LoadInst>(OpILane0))
3371 ReorderingModes[OpIdx] = ReorderingMode::Load;
3372 else
3373 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3374 } else if (isa<Constant>(OpLane0)) {
3375 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3376 } else if (isa<Argument>(OpLane0)) {
3377 // Our best hope is a Splat. It may save some cost in some cases.
3378 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3379 } else {
3380 llvm_unreachable("Unexpected value kind.");
3381 }
3382 }
3383
3384 // Check that we don't have the same operands. There is no need to reorder
3385 // if the operands are just a perfect or shuffled diamond match. Do not skip
3386 // reordering for possible broadcasts or a non-power-of-2 number of scalars
3387 // (just for now).
3388 auto &&SkipReordering = [this]() {
3389 SmallPtrSet<Value *, 4> UniqueValues;
3390 ArrayRef<OperandData> Op0 = OpsVec.front();
3391 for (const OperandData &Data : Op0)
3392 UniqueValues.insert(Data.V);
3393 for (ArrayRef<OperandData> Op :
3394 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3395 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3396 return !UniqueValues.contains(Data.V);
3397 }))
3398 return false;
3399 }
3400 // TODO: Check if we can remove a check for non-power-2 number of
3401 // scalars after full support of non-power-2 vectorization.
3402 return UniqueValues.size() != 2 &&
3403 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3404 UniqueValues.size());
3405 };
3406
3407 // If the initial strategy fails for any of the operand indexes, then we
3408 // perform reordering again in a second pass. This helps avoid assigning
3409 // high priority to the failed strategy, and should improve reordering for
3410 // the non-failed operand indexes.
3411 for (int Pass = 0; Pass != 2; ++Pass) {
3412 // Check if there is no need to reorder the operands since they form a
3413 // perfect or shuffled diamond match.
3414 // Need to do it to avoid extra external use cost counting for
3415 // shuffled matches, which may cause regressions.
3416 if (SkipReordering())
3417 break;
3418 // Skip the second pass if the first pass did not fail.
3419 bool StrategyFailed = false;
3420 // Mark all operand data as free to use.
3421 clearUsed();
3422 // We keep the original operand order for the FirstLane, so reorder the
3423 // rest of the lanes. We are visiting the nodes in a circular fashion,
3424 // using FirstLane as the center point and increasing the radius
3425 // distance.
3426 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3427 for (unsigned I = 0; I < NumOperands; ++I)
3428 MainAltOps[I].push_back(getData(I, FirstLane).V);
3429
3430 SmallBitVector UsedLanes(NumLanes);
3431 UsedLanes.set(FirstLane);
3432 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3433 // Visit the lane on the right and then the lane on the left.
3434 for (int Direction : {+1, -1}) {
3435 int Lane = FirstLane + Direction * Distance;
3436 if (Lane < 0 || Lane >= (int)NumLanes)
3437 continue;
3438 UsedLanes.set(Lane);
3439 int LastLane = Lane - Direction;
3440 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3441 "Out of bounds");
3442 // Look for a good match for each operand.
3443 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3444 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3445 std::optional<unsigned> BestIdx =
3446 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3447 MainAltOps[OpIdx], UsedLanes);
3448 // By not selecting a value, we allow the operands that follow to
3449 // select a better matching value. We will get a non-null value in
3450 // the next run of getBestOperand().
3451 if (BestIdx) {
3452 // Swap the current operand with the one returned by
3453 // getBestOperand().
3454 swap(OpIdx, *BestIdx, Lane);
3455 } else {
3456 // Enable the second pass.
3457 StrategyFailed = true;
3458 }
3459 // Try to get the alternate opcode and follow it during analysis.
3460 if (MainAltOps[OpIdx].size() != 2) {
3461 OperandData &AltOp = getData(OpIdx, Lane);
3462 InstructionsState OpS =
3463 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3464 if (OpS && OpS.isAltShuffle())
3465 MainAltOps[OpIdx].push_back(AltOp.V);
3466 }
3467 }
3468 }
3469 }
3470 // Skip second pass if the strategy did not fail.
3471 if (!StrategyFailed)
3472 break;
3473 }
3474 }
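// Illustrative sketch (not part of SLPVectorizer.cpp) of the circular lane
// visitation order used by reorder(): starting from the chosen FirstLane,
// lanes are visited at increasing distance, right side before left. For
// FirstLane == 1 and 4 lanes this reproduces the order from the comment
// above: Lane 1, Lane 2, Lane 0, Lane 3.
#include <vector>

static std::vector<int> visitOrder(int FirstLane, int NumLanes) {
  std::vector<int> Order{FirstLane};
  for (int Distance = 1; Distance != NumLanes; ++Distance)
    for (int Direction : {+1, -1}) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane >= 0 && Lane < NumLanes)
        Order.push_back(Lane);
    }
  return Order; // visitOrder(1, 4) == {1, 2, 0, 3}.
}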
3475
3476#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3477 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3478 switch (RMode) {
3479 case ReorderingMode::Load:
3480 return "Load";
3481 case ReorderingMode::Opcode:
3482 return "Opcode";
3483 case ReorderingMode::Constant:
3484 return "Constant";
3485 case ReorderingMode::Splat:
3486 return "Splat";
3487 case ReorderingMode::Failed:
3488 return "Failed";
3489 }
3490 llvm_unreachable("Unimplemented Reordering Type");
3491 }
3492
3493 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3494 raw_ostream &OS) {
3495 return OS << getModeStr(RMode);
3496 }
3497
3498 /// Debug print.
3499 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3500 printMode(RMode, dbgs());
3501 }
3502
3503 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3504 return printMode(RMode, OS);
3505 }
3506
3507 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3508 const unsigned Indent = 2;
3509 unsigned Cnt = 0;
3510 for (const OperandDataVec &OpDataVec : OpsVec) {
3511 OS << "Operand " << Cnt++ << "\n";
3512 for (const OperandData &OpData : OpDataVec) {
3513 OS.indent(Indent) << "{";
3514 if (Value *V = OpData.V)
3515 OS << *V;
3516 else
3517 OS << "null";
3518 OS << ", APO:" << OpData.APO << "}\n";
3519 }
3520 OS << "\n";
3521 }
3522 return OS;
3523 }
3524
3525 /// Debug print.
3526 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3527#endif
3528 };
3529
3530 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3531 /// of the pair with the highest score, deemed to have the best chance to form
3532 /// the root of a profitable tree to vectorize. Return std::nullopt if no
3533 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit The
3534 /// lower limit of the score considered good enough.
3535 std::optional<int>
3536 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3537 int Limit = LookAheadHeuristics::ScoreFail) const {
3538 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3539 RootLookAheadMaxDepth);
3540 int BestScore = Limit;
3541 std::optional<int> Index;
3542 for (int I : seq<int>(0, Candidates.size())) {
3543 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3544 Candidates[I].second,
3545 /*U1=*/nullptr, /*U2=*/nullptr,
3546 /*CurrLevel=*/1, {});
3547 if (Score > BestScore) {
3548 BestScore = Score;
3549 Index = I;
3550 }
3551 }
3552 return Index;
3553 }
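// Illustrative sketch (not part of SLPVectorizer.cpp; the scoring callback is
// a hypothetical stand-in for the look-ahead heuristics) of the selection in
// findBestRootPair(): keep the index of the candidate pair whose score strictly
// exceeds the running best, seeded with the caller-provided lower limit so that
// pairs scoring at or below the limit are rejected.
#include <optional>
#include <utility>
#include <vector>

template <typename ScoreFn>
static std::optional<int>
bestPair(const std::vector<std::pair<int, int>> &Candidates, int Limit,
         ScoreFn Score) {
  int BestScore = Limit;
  std::optional<int> Index;
  for (int I = 0, E = (int)Candidates.size(); I != E; ++I) {
    int S = Score(Candidates[I].first, Candidates[I].second);
    if (S > BestScore) { // Strictly better than the limit / previous best.
      BestScore = S;
      Index = I;
    }
  }
  return Index;
}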
3554
3555 /// Checks if the instruction is marked for deletion.
3556 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3557
3558 /// Removes an instruction from its block and eventually deletes it.
3559 /// It's like Instruction::eraseFromParent() except that the actual deletion
3560 /// is delayed until BoUpSLP is destructed.
3562 DeletedInstructions.insert(I);
3563 }
3564
3565 /// Remove instructions from the parent function and clear the operands of \p
3566 /// DeadVals instructions, marking for deletion trivially dead operands.
3567 template <typename T>
3568 void removeInstructionsAndOperands(
3569 ArrayRef<T *> DeadVals,
3570 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3571 SmallVector<WeakTrackingVH> DeadInsts;
3572 for (T *V : DeadVals) {
3573 auto *I = cast<Instruction>(V);
3574 DeletedInstructions.insert(I);
3575 }
3576 DenseSet<Value *> Processed;
3577 for (T *V : DeadVals) {
3578 if (!V || !Processed.insert(V).second)
3579 continue;
3580 auto *I = cast<Instruction>(V);
3581 salvageDebugInfo(*I);
3582 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3583 for (Use &U : I->operands()) {
3584 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3585 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3586 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3587 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3588 return Entry->VectorizedValue == OpI;
3589 })))
3590 DeadInsts.push_back(OpI);
3591 }
3592 I->dropAllReferences();
3593 }
3594 for (T *V : DeadVals) {
3595 auto *I = cast<Instruction>(V);
3596 if (!I->getParent())
3597 continue;
3598 assert((I->use_empty() || all_of(I->uses(),
3599 [&](Use &U) {
3600 return isDeleted(
3601 cast<Instruction>(U.getUser()));
3602 })) &&
3603 "trying to erase instruction with users.");
3604 I->removeFromParent();
3605 SE->forgetValue(I);
3606 }
3607 // Process the dead instruction list until empty.
3608 while (!DeadInsts.empty()) {
3609 Value *V = DeadInsts.pop_back_val();
3610 Instruction *VI = cast_or_null<Instruction>(V);
3611 if (!VI || !VI->getParent())
3612 continue;
3613 assert(isInstructionTriviallyDead(VI, TLI) &&
3614 "Live instruction found in dead worklist!");
3615 assert(VI->use_empty() && "Instructions with uses are not dead.");
3616
3617 // Don't lose the debug info while deleting the instructions.
3618 salvageDebugInfo(*VI);
3619
3620 // Null out all of the instruction's operands to see if any operand
3621 // becomes dead as we go.
3622 for (Use &OpU : VI->operands()) {
3623 Value *OpV = OpU.get();
3624 if (!OpV)
3625 continue;
3626 OpU.set(nullptr);
3627
3628 if (!OpV->use_empty())
3629 continue;
3630
3631 // If the operand is an instruction that became dead as we nulled out
3632 // the operand, and if it is 'trivially' dead, delete it in a future
3633 // loop iteration.
3634 if (auto *OpI = dyn_cast<Instruction>(OpV))
3635 if (!DeletedInstructions.contains(OpI) &&
3636 (!OpI->getType()->isVectorTy() ||
3637 none_of(VectorValuesAndScales,
3638 [&](const std::tuple<Value *, unsigned, bool> &V) {
3639 return std::get<0>(V) == OpI;
3640 })) &&
3641 isInstructionTriviallyDead(OpI, TLI))
3642 DeadInsts.push_back(OpI);
3643 }
3644
3645 VI->removeFromParent();
3646 eraseInstruction(VI);
3647 SE->forgetValue(VI);
3648 }
3649 }
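// Illustrative sketch (not part of SLPVectorizer.cpp; Node and its members are
// hypothetical) of the delayed-deletion worklist pattern used above: drop the
// references of each erased node and enqueue any operand whose last use just
// disappeared, so whole dead expression trees are cleaned up iteratively.
#include <vector>

struct Node {
  std::vector<Node *> Operands;
  unsigned Uses = 0;
  bool Erased = false;
};

static void eraseDead(const std::vector<Node *> &Roots) {
  std::vector<Node *> Worklist(Roots.begin(), Roots.end());
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (N->Erased)
      continue;
    N->Erased = true;
    for (Node *Op : N->Operands)
      // Dropping this use may make the operand trivially dead.
      if (Op && Op->Uses > 0 && --Op->Uses == 0)
        Worklist.push_back(Op);
    N->Operands.clear();
  }
}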
3650
3651 /// Checks if the instruction was already analyzed for being possible
3652 /// reduction root.
3653 bool isAnalyzedReductionRoot(Instruction *I) const {
3654 return AnalyzedReductionsRoots.count(I);
3655 }
3656 /// Register given instruction as already analyzed for being possible
3657 /// reduction root.
3658 void analyzedReductionRoot(Instruction *I) {
3659 AnalyzedReductionsRoots.insert(I);
3660 }
3661 /// Checks if the provided list of reduced values was checked already for
3662 /// vectorization.
3663 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3664 return AnalyzedReductionVals.contains(hash_value(VL));
3665 }
3666 /// Adds the list of reduced values to list of already checked values for the
3667 /// vectorization.
3668 void analyzedReductionVals(ArrayRef<Value *> VL) {
3669 AnalyzedReductionVals.insert(hash_value(VL));
3670 }
3671 /// Clear the list of the analyzed reduction root instructions.
3672 void clearReductionData() {
3673 AnalyzedReductionsRoots.clear();
3674 AnalyzedReductionVals.clear();
3675 AnalyzedMinBWVals.clear();
3676 }
3677 /// Checks if the given value is gathered in one of the nodes.
3678 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3679 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3680 }
3681 /// Checks if the given value is gathered in one of the nodes.
3682 bool isGathered(const Value *V) const {
3683 return MustGather.contains(V);
3684 }
3685 /// Checks if the specified value was not schedule.
3686 bool isNotScheduled(const Value *V) const {
3687 return NonScheduledFirst.contains(V);
3688 }
3689
3690 /// Check if the value is vectorized in the tree.
3691 bool isVectorized(const Value *V) const {
3692 assert(V && "V cannot be nullptr.");
3693 return ScalarToTreeEntries.contains(V);
3694 }
3695
3696 ~BoUpSLP();
3697
3698private:
3699 /// Determine if a node \p E can be demoted to a smaller type with a
3700 /// truncation. We collect the entries that will be demoted in ToDemote.
3701 /// \param E Node for analysis
3702 /// \param ToDemote indices of the nodes to be demoted.
3703 bool collectValuesToDemote(
3704 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3705 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3706 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3707 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3708
3709 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3710 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3711 /// they have only one user and are reorderable).
3712 /// \param ReorderableGathers List of all gather nodes that require reordering
3713 /// (e.g., gather of extractelements or partially vectorizable loads).
3714 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3715 /// reordering, subset of \p NonVectorized.
3716 void buildReorderableOperands(
3717 TreeEntry *UserTE,
3718 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3719 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3720 SmallVectorImpl<TreeEntry *> &GatherOps);
3721
3722 /// Checks if the given \p TE is a gather node with clustered reused scalars
3723 /// and reorders it per given \p Mask.
3724 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3725
3726 /// Checks if all users of \p I are the part of the vectorization tree.
3727 bool areAllUsersVectorized(
3728 Instruction *I,
3729 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3730
3731 /// Return information about the vector formed for the specified index
3732 /// of a vector of (the same) instruction.
3733 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3734
3735 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3736 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3737 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3738 return const_cast<TreeEntry *>(
3739 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3740 }
3741
3742 /// Gets the root instruction for the given node. If the node is a strided
3743 /// load/store node with the reverse order, the root instruction is the last
3744 /// one.
3745 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3746
3747 /// \returns Cast context for the given graph node.
3748 TargetTransformInfo::CastContextHint
3749 getCastContextHint(const TreeEntry &TE) const;
3750
3751 /// \returns the cost of the vectorizable entry.
3752 InstructionCost getEntryCost(const TreeEntry *E,
3753 ArrayRef<Value *> VectorizedVals,
3754 SmallPtrSetImpl<Value *> &CheckedExtracts);
3755
3756 /// Checks if it is legal and profitable to build SplitVectorize node for the
3757 /// given \p VL.
3758 /// \param Op1 first homogeneous scalars.
3759 /// \param Op2 second homogeneous scalars.
3760 /// \param ReorderIndices indices to reorder the scalars.
3761 /// \returns true if the node was successfully built.
3762 bool canBuildSplitNode(ArrayRef<Value *> VL,
3763 const InstructionsState &LocalState,
3764 SmallVectorImpl<Value *> &Op1,
3765 SmallVectorImpl<Value *> &Op2,
3766 OrdersType &ReorderIndices) const;
3767
3768 /// This is the recursive part of buildTree.
3769 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3770 unsigned InterleaveFactor = 0);
3771
3772 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3773 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3774 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3775 /// returns false, setting \p CurrentOrder to either an empty vector or a
3776 /// non-identity permutation that allows reusing the extract instructions.
3777 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3778 /// extract order.
3779 bool canReuseExtract(ArrayRef<Value *> VL,
3780 SmallVectorImpl<unsigned> &CurrentOrder,
3781 bool ResizeAllowed = false) const;
3782
3783 /// Vectorize a single entry in the tree.
3784 Value *vectorizeTree(TreeEntry *E);
3785
3786 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3787 /// \p E.
3788 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3789
3790 /// Create a new vector from a list of scalar values. Produces a sequence
3791 /// which exploits values reused across lanes, and arranges the inserts
3792 /// for ease of later optimization.
3793 template <typename BVTy, typename ResTy, typename... Args>
3794 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3795
3796 /// Create a new vector from a list of scalar values. Produces a sequence
3797 /// which exploits values reused across lanes, and arranges the inserts
3798 /// for ease of later optimization.
3799 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3800
3801 /// Returns the instruction in the bundle, which can be used as a base point
3802 /// for scheduling. Usually it is the last instruction in the bundle, except
3803 /// for the case when all operands are external (in this case, it is the first
3804 /// instruction in the list).
3805 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3806
3807 /// Tries to find extractelement instructions with constant indices from fixed
3808 /// vector type and gather such instructions into a bunch, which is highly
3809 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3810 /// was successful, the matched scalars are replaced by poison values in \p VL
3811 /// for future analysis.
3812 std::optional<TargetTransformInfo::ShuffleKind>
3813 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3814 SmallVectorImpl<int> &Mask) const;
3815
3816 /// Tries to find extractelement instructions with constant indices from fixed
3817 /// vector type and gather such instructions into a bunch, which is highly
3818 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3819 /// was successful, the matched scalars are replaced by poison values in \p VL
3820 /// for future analysis.
3821 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3822 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3823 SmallVectorImpl<int> &Mask,
3824 unsigned NumParts) const;
3825
3826 /// Checks if the gathered \p VL can be represented as a single register
3827 /// shuffle(s) of previous tree entries.
3828 /// \param TE Tree entry checked for permutation.
3829 /// \param VL List of scalars (a subset of the TE scalar), checked for
3830 /// permutations. Must form single-register vector.
3831 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3832 /// commands to build the mask using the original vector value, without
3833 /// relying on the potential reordering.
3834 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3835 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3836 std::optional<TargetTransformInfo::ShuffleKind>
3837 isGatherShuffledSingleRegisterEntry(
3838 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3839 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3840 bool ForOrder);
3841
3842 /// Checks if the gathered \p VL can be represented as multi-register
3843 /// shuffle(s) of previous tree entries.
3844 /// \param TE Tree entry checked for permutation.
3845 /// \param VL List of scalars (a subset of the TE scalar), checked for
3846 /// permutations.
3847 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3848 /// commands to build the mask using the original vector value, without
3849 /// relying on the potential reordering.
3850 /// \returns per-register series of ShuffleKind, if gathered values can be
3851 /// represented as shuffles of previous tree entries. \p Mask is filled with
3852 /// the shuffle mask (also on per-register base).
3853 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3854 isGatherShuffledEntry(
3855 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3856 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3857 unsigned NumParts, bool ForOrder = false);
3858
3859 /// \returns the cost of gathering (inserting) the values in \p VL into a
3860 /// vector.
3861 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3862 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3863 Type *ScalarTy) const;
3864
3865 /// Set the Builder insert point to one after the last instruction in
3866 /// the bundle
3867 void setInsertPointAfterBundle(const TreeEntry *E);
3868
3869 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3870 /// specified, the starting vector value is poison.
3871 Value *
3872 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3873 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3874
3875 /// \returns whether the VectorizableTree is fully vectorizable and will
3876 /// be beneficial even if the tree height is tiny.
3877 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3878
3879 /// Run through the list of all gathered loads in the graph and try to find
3880 /// vector loads/masked gathers instead of regular gathers. Later these loads
3881 /// are reshuffled to build the final gathered nodes.
3882 void tryToVectorizeGatheredLoads(
3883 const SmallMapVector<
3884 std::tuple<BasicBlock *, Value *, Type *>,
3885 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3886 &GatheredLoads);
3887
3888 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3889 /// users of \p TE and collects the stores. It returns the map from the store
3890 /// pointers to the collected stores.
3891 SmallVector<SmallVector<StoreInst *>>
3892 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3893
3894 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3895 /// stores in \p StoresVec can form a vector instruction. If so it returns
3896 /// true and populates \p ReorderIndices with the shuffle indices of the
3897 /// stores when compared to the sorted vector.
3898 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3899 OrdersType &ReorderIndices) const;
3900
3901 /// Iterates through the users of \p TE, looking for scalar stores that can be
3902 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3903 /// their order and builds an order index vector for each store bundle. It
3904 /// returns all these order vectors found.
3905 /// We run this after the tree has formed; otherwise we may come across user
3906 /// instructions that are not yet in the tree.
3907 SmallVector<OrdersType, 1>
3908 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3909
3910 /// Tries to reorder the gathering node for better vectorization
3911 /// opportunities.
3912 void reorderGatherNode(TreeEntry &TE);
3913
3914 class TreeEntry {
3915 public:
3916 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3917 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3918
3919 /// \returns Common mask for reorder indices and reused scalars.
3920 SmallVector<int> getCommonMask() const {
3921 if (State == TreeEntry::SplitVectorize)
3922 return {};
3923 SmallVector<int> Mask;
3924 inversePermutation(ReorderIndices, Mask);
3925 ::addMask(Mask, ReuseShuffleIndices);
3926 return Mask;
3927 }
3928
3929 /// \returns The mask for split nodes.
3930 SmallVector<int> getSplitMask() const {
3931 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3932 "Expected only split vectorize node.");
3933 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3934 unsigned CommonVF = std::max<unsigned>(
3935 CombinedEntriesWithIndices.back().second,
3936 Scalars.size() - CombinedEntriesWithIndices.back().second);
3937 for (auto [Idx, I] : enumerate(ReorderIndices))
3938 Mask[I] =
3939 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3940 ? CommonVF - CombinedEntriesWithIndices.back().second
3941 : 0);
3942 return Mask;
3943 }
3944
3945 /// Updates (reorders) SplitVectorize node according to the given mask \p
3946 /// Mask and order \p MaskOrder.
3947 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3948 ArrayRef<int> MaskOrder);
3949
3950 /// \returns true if the scalars in VL are equal to this entry.
3951 bool isSame(ArrayRef<Value *> VL) const {
3952 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3953 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3954 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3955 return VL.size() == Mask.size() &&
3956 std::equal(VL.begin(), VL.end(), Mask.begin(),
3957 [Scalars](Value *V, int Idx) {
3958 return (isa<UndefValue>(V) &&
3959 Idx == PoisonMaskElem) ||
3960 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3961 });
3962 };
3963 if (!ReorderIndices.empty()) {
3964 // TODO: implement matching if the nodes are just reordered, still can
3965 // treat the vector as the same if the list of scalars matches VL
3966 // directly, without reordering.
3967 SmallVector<int> Mask;
3968 inversePermutation(ReorderIndices, Mask);
3969 if (VL.size() == Scalars.size())
3970 return IsSame(Scalars, Mask);
3971 if (VL.size() == ReuseShuffleIndices.size()) {
3972 ::addMask(Mask, ReuseShuffleIndices);
3973 return IsSame(Scalars, Mask);
3974 }
3975 return false;
3976 }
3977 return IsSame(Scalars, ReuseShuffleIndices);
3978 }
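// Illustrative sketch (not part of SLPVectorizer.cpp) of the mask-based
// comparison performed by isSame(): VL matches the stored scalars when every
// element equals the scalar selected through the combined reorder/reuse mask.
// The undef/poison special case of the real code is omitted for brevity.
#include <cstddef>
#include <vector>

constexpr int SketchPoisonElem = -1;

static bool sameThroughMask(const std::vector<int> &VL,
                            const std::vector<int> &Scalars,
                            const std::vector<int> &Mask) {
  if (VL.size() != Mask.size())
    return false;
  for (std::size_t I = 0; I != VL.size(); ++I)
    if (Mask[I] == SketchPoisonElem || VL[I] != Scalars[Mask[I]])
      return false;
  return true;
}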
3979
3980 /// \returns true if current entry has same operands as \p TE.
3981 bool hasEqualOperands(const TreeEntry &TE) const {
3982 if (TE.getNumOperands() != getNumOperands())
3983 return false;
3984 SmallBitVector Used(getNumOperands());
3985 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3986 unsigned PrevCount = Used.count();
3987 for (unsigned K = 0; K < E; ++K) {
3988 if (Used.test(K))
3989 continue;
3990 if (getOperand(K) == TE.getOperand(I)) {
3991 Used.set(K);
3992 break;
3993 }
3994 }
3995 // Check if we actually found the matching operand.
3996 if (PrevCount == Used.count())
3997 return false;
3998 }
3999 return true;
4000 }
4001
4002 /// \return Final vectorization factor for the node. Defined by the total
4003 /// number of vectorized scalars, including those used several times in the
4004 /// entry and counted in the \a ReuseShuffleIndices, if any.
4005 unsigned getVectorFactor() const {
4006 if (!ReuseShuffleIndices.empty())
4007 return ReuseShuffleIndices.size();
4008 return Scalars.size();
4009 };
4010
4011 /// Checks if the current node is a gather node.
4012 bool isGather() const { return State == NeedToGather; }
4013
4014 /// A vector of scalars.
4015 ValueList Scalars;
4016
4017 /// The Scalars are vectorized into this value. It is initialized to Null.
4018 WeakTrackingVH VectorizedValue = nullptr;
4019
4020 /// Do we need to gather this sequence or vectorize it
4021 /// (either with vector instruction or with scatter/gather
4022 /// intrinsics for store/load)?
4023 enum EntryState {
4024 Vectorize, ///< The node is regularly vectorized.
4025 ScatterVectorize, ///< Masked scatter/gather node.
4026 StridedVectorize, ///< Strided loads (and stores)
4027 CompressVectorize, ///< (Masked) load with compress.
4028 NeedToGather, ///< Gather/buildvector node.
4029 CombinedVectorize, ///< Vectorized node, combined with its user into more
4030 ///< complex node like select/cmp to minmax, mul/add to
4031 ///< fma, etc. Must be used for the following nodes in
4032 ///< the pattern, not the very first one.
4033 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4034 ///< independently and then combines back.
4035 };
4036 EntryState State;
4037
4038 /// List of combined opcodes supported by the vectorizer.
4039 enum CombinedOpcode {
4040 NotCombinedOp = -1,
4041 MinMax = Instruction::OtherOpsEnd + 1,
4042 FMulAdd,
4043 };
4044 CombinedOpcode CombinedOp = NotCombinedOp;
4045
4046 /// Does this sequence require some shuffling?
4047 SmallVector<int, 4> ReuseShuffleIndices;
4048
4049 /// Does this entry require reordering?
4050 SmallVector<unsigned, 4> ReorderIndices;
4051
4052 /// Points back to the VectorizableTree.
4053 ///
4054 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4055 /// to be a pointer and needs to be able to initialize the child iterator.
4056 /// Thus we need a reference back to the container to translate the indices
4057 /// to entries.
4058 VecTreeTy &Container;
4059
4060 /// The TreeEntry index containing the user of this entry.
4061 EdgeInfo UserTreeIndex;
4062
4063 /// The index of this treeEntry in VectorizableTree.
4064 unsigned Idx = 0;
4065
4066 /// For gather/buildvector/alt opcode nodes, which are combined from
4067 /// other nodes as a series of insertvector instructions.
4068 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4069
4070 private:
4071 /// The operands of each instruction in each lane Operands[op_index][lane].
4072 /// Note: This helps avoid the replication of the code that performs the
4073 /// reordering of operands during buildTreeRec() and vectorizeTree().
4074 SmallVector<ValueList, 2> Operands;
4075
4076 /// Copyable elements of the entry node.
4077 SmallPtrSet<const Value *, 4> CopyableElements;
4078
4079 /// MainOp and AltOp are recorded inside. S should be obtained from
4080 /// newTreeEntry.
4081 InstructionsState S = InstructionsState::invalid();
4082
4083 /// Interleaving factor for interleaved loads Vectorize nodes.
4084 unsigned InterleaveFactor = 0;
4085
4086 /// True if the node does not require scheduling.
4087 bool DoesNotNeedToSchedule = false;
4088
4089 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4090 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4091 if (Operands.size() < OpIdx + 1)
4092 Operands.resize(OpIdx + 1);
4093 assert(Operands[OpIdx].empty() && "Already resized?");
4094 assert(OpVL.size() <= Scalars.size() &&
4095 "Number of operands is greater than the number of scalars.");
4096 Operands[OpIdx].resize(OpVL.size());
4097 copy(OpVL, Operands[OpIdx].begin());
4098 }
4099
4100 public:
4101 /// Returns interleave factor for interleave nodes.
4102 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4103 /// Sets interleaving factor for the interleaving nodes.
4104 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4105
4106 /// Marks the node as one that does not require scheduling.
4107 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4108 /// Returns true if the node is marked as one that does not require
4109 /// scheduling.
4110 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4111
4112 /// Set this bundle's operands from \p Operands.
4113 void setOperands(ArrayRef<ValueList> Operands) {
4114 for (unsigned I : seq<unsigned>(Operands.size()))
4115 setOperand(I, Operands[I]);
4116 }
4117
4118 /// Reorders operands of the node to the given mask \p Mask.
4119 void reorderOperands(ArrayRef<int> Mask) {
4120 for (ValueList &Operand : Operands)
4121 reorderScalars(Operand, Mask);
4122 }
4123
4124 /// \returns the \p OpIdx operand of this TreeEntry.
4125 ValueList &getOperand(unsigned OpIdx) {
4126 assert(OpIdx < Operands.size() && "Off bounds");
4127 return Operands[OpIdx];
4128 }
4129
4130 /// \returns the \p OpIdx operand of this TreeEntry.
4131 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4132 assert(OpIdx < Operands.size() && "Off bounds");
4133 return Operands[OpIdx];
4134 }
4135
4136 /// \returns the number of operands.
4137 unsigned getNumOperands() const { return Operands.size(); }
4138
4139 /// \return the single \p OpIdx operand.
4140 Value *getSingleOperand(unsigned OpIdx) const {
4141 assert(OpIdx < Operands.size() && "Off bounds");
4142 assert(!Operands[OpIdx].empty() && "No operand available");
4143 return Operands[OpIdx][0];
4144 }
4145
4146 /// Some of the instructions in the list have alternate opcodes.
4147 bool isAltShuffle() const { return S.isAltShuffle(); }
4148
4149 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4150 return S.getMatchingMainOpOrAltOp(I);
4151 }
4152
4153 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4154 /// alternate) opcode as the main operation of this entry, the key is \p Op.
4155 /// Otherwise the key is the main operation.
4156 Value *isOneOf(Value *Op) const {
4157 auto *I = dyn_cast<Instruction>(Op);
4158 if (I && getMatchingMainOpOrAltOp(I))
4159 return Op;
4160 return S.getMainOp();
4161 }
4162
4163 void setOperations(const InstructionsState &S) {
4164 assert(S && "InstructionsState is invalid.");
4165 this->S = S;
4166 }
4167
4168 Instruction *getMainOp() const { return S.getMainOp(); }
4169
4170 Instruction *getAltOp() const { return S.getAltOp(); }
4171
4172 /// The main/alternate opcodes for the list of instructions.
4173 unsigned getOpcode() const { return S.getOpcode(); }
4174
4175 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4176
4177 bool hasState() const { return S.valid(); }
4178
4179 /// Add \p V to the list of copyable elements.
4180 void addCopyableElement(Value *V) {
4181 assert(S.isCopyableElement(V) && "Not a copyable element.");
4182 CopyableElements.insert(V);
4183 }
4184
4185 /// Returns true if \p V is a copyable element.
4186 bool isCopyableElement(Value *V) const {
4187 return CopyableElements.contains(V);
4188 }
4189
4190 /// Returns true if any scalar in the list is a copyable element.
4191 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4192
4193 /// Returns the state of the operations.
4194 const InstructionsState &getOperations() const { return S; }
4195
4196 /// When ReuseShuffleIndices is empty it just returns the position of \p
4197 /// V within the vector of Scalars. Otherwise, tries to remap it to its reuse index.
4198 unsigned findLaneForValue(Value *V) const {
4199 unsigned FoundLane = getVectorFactor();
4200 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4201 std::advance(It, 1)) {
4202 if (*It != V)
4203 continue;
4204 FoundLane = std::distance(Scalars.begin(), It);
4205 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4206 if (!ReorderIndices.empty())
4207 FoundLane = ReorderIndices[FoundLane];
4208 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4209 if (ReuseShuffleIndices.empty())
4210 break;
4211 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4212 RIt != ReuseShuffleIndices.end()) {
4213 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4214 break;
4215 }
4216 }
4217 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4218 return FoundLane;
4219 }
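// Illustrative sketch (not part of SLPVectorizer.cpp) of the lane remapping
// performed by findLaneForValue(): the position of the scalar is first
// translated through ReorderIndices (if present) and then located inside
// ReuseShuffleIndices (if present) to obtain the final vector lane.
#include <algorithm>
#include <vector>

static unsigned remapLane(unsigned ScalarPos,
                          const std::vector<unsigned> &ReorderIndices,
                          const std::vector<unsigned> &ReuseShuffleIndices) {
  unsigned Lane = ScalarPos;
  if (!ReorderIndices.empty())
    Lane = ReorderIndices[Lane];
  if (!ReuseShuffleIndices.empty()) {
    auto It = std::find(ReuseShuffleIndices.begin(),
                        ReuseShuffleIndices.end(), Lane);
    if (It != ReuseShuffleIndices.end())
      Lane = (unsigned)std::distance(ReuseShuffleIndices.begin(), It);
  }
  return Lane;
}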
4220
4221 /// Build a shuffle mask for graph entry which represents a merge of main
4222 /// and alternate operations.
4223 void
4224 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4225 SmallVectorImpl<int> &Mask,
4226 SmallVectorImpl<Value *> *OpScalars = nullptr,
4227 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4228
4229 /// Return true if this is a non-power-of-2 node.
4230 bool isNonPowOf2Vec() const {
4231 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4232 return IsNonPowerOf2;
4233 }
4234
4235 /// Return true if this is a node, which tries to vectorize number of
4236 /// elements, forming whole vectors.
4237 bool
4238 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4239 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4240 TTI, getValueType(Scalars.front()), Scalars.size());
4241 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4242 "Reshuffling not supported with non-power-of-2 vectors yet.");
4243 return IsNonPowerOf2;
4244 }
4245
4246 Value *getOrdered(unsigned Idx) const {
4247 if (ReorderIndices.empty())
4248 return Scalars[Idx];
4249 SmallVector<int> Mask;
4250 inversePermutation(ReorderIndices, Mask);
4251 return Scalars[Mask[Idx]];
4252 }
4253
4254#ifndef NDEBUG
4255 /// Debug printer.
4256 LLVM_DUMP_METHOD void dump() const {
4257 dbgs() << Idx << ".\n";
4258 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4259 dbgs() << "Operand " << OpI << ":\n";
4260 for (const Value *V : Operands[OpI])
4261 dbgs().indent(2) << *V << "\n";
4262 }
4263 dbgs() << "Scalars: \n";
4264 for (Value *V : Scalars)
4265 dbgs().indent(2) << *V << "\n";
4266 dbgs() << "State: ";
4267 if (S && hasCopyableElements())
4268 dbgs() << "[[Copyable]] ";
4269 switch (State) {
4270 case Vectorize:
4271 if (InterleaveFactor > 0) {
4272 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4273 << "\n";
4274 } else {
4275 dbgs() << "Vectorize\n";
4276 }
4277 break;
4278 case ScatterVectorize:
4279 dbgs() << "ScatterVectorize\n";
4280 break;
4281 case StridedVectorize:
4282 dbgs() << "StridedVectorize\n";
4283 break;
4284 case CompressVectorize:
4285 dbgs() << "CompressVectorize\n";
4286 break;
4287 case NeedToGather:
4288 dbgs() << "NeedToGather\n";
4289 break;
4290 case CombinedVectorize:
4291 dbgs() << "CombinedVectorize\n";
4292 break;
4293 case SplitVectorize:
4294 dbgs() << "SplitVectorize\n";
4295 break;
4296 }
4297 if (S) {
4298 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4299 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4300 } else {
4301 dbgs() << "MainOp: NULL\n";
4302 dbgs() << "AltOp: NULL\n";
4303 }
4304 dbgs() << "VectorizedValue: ";
4305 if (VectorizedValue)
4306 dbgs() << *VectorizedValue << "\n";
4307 else
4308 dbgs() << "NULL\n";
4309 dbgs() << "ReuseShuffleIndices: ";
4310 if (ReuseShuffleIndices.empty())
4311 dbgs() << "Empty";
4312 else
4313 for (int ReuseIdx : ReuseShuffleIndices)
4314 dbgs() << ReuseIdx << ", ";
4315 dbgs() << "\n";
4316 dbgs() << "ReorderIndices: ";
4317 for (unsigned ReorderIdx : ReorderIndices)
4318 dbgs() << ReorderIdx << ", ";
4319 dbgs() << "\n";
4320 dbgs() << "UserTreeIndex: ";
4321 if (UserTreeIndex)
4322 dbgs() << UserTreeIndex;
4323 else
4324 dbgs() << "<invalid>";
4325 dbgs() << "\n";
4326 if (!CombinedEntriesWithIndices.empty()) {
4327 dbgs() << "Combined entries: ";
4328 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4329 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4330 });
4331 dbgs() << "\n";
4332 }
4333 }
4334#endif
4335 };
4336
4337#ifndef NDEBUG
4338 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4339 InstructionCost VecCost, InstructionCost ScalarCost,
4340 StringRef Banner) const {
4341 dbgs() << "SLP: " << Banner << ":\n";
4342 E->dump();
4343 dbgs() << "SLP: Costs:\n";
4344 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4345 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4346 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4347 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4348 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4349 }
4350#endif
4351
4352 /// Create a new gather TreeEntry
4353 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4354 const InstructionsState &S,
4355 const EdgeInfo &UserTreeIdx,
4356 ArrayRef<int> ReuseShuffleIndices = {}) {
4357 auto Invalid = ScheduleBundle::invalid();
4358 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4359 }
4360
4361 /// Create a new VectorizableTree entry.
4362 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4363 const InstructionsState &S,
4364 const EdgeInfo &UserTreeIdx,
4365 ArrayRef<int> ReuseShuffleIndices = {},
4366 ArrayRef<unsigned> ReorderIndices = {},
4367 unsigned InterleaveFactor = 0) {
4368 TreeEntry::EntryState EntryState =
4369 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4370 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4371 ReuseShuffleIndices, ReorderIndices);
4372 if (E && InterleaveFactor > 0)
4373 E->setInterleave(InterleaveFactor);
4374 return E;
4375 }
4376
4377 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4378 TreeEntry::EntryState EntryState,
4379 ScheduleBundle &Bundle, const InstructionsState &S,
4380 const EdgeInfo &UserTreeIdx,
4381 ArrayRef<int> ReuseShuffleIndices = {},
4382 ArrayRef<unsigned> ReorderIndices = {}) {
4383 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4384 EntryState == TreeEntry::SplitVectorize)) ||
4385 (Bundle && EntryState != TreeEntry::NeedToGather &&
4386 EntryState != TreeEntry::SplitVectorize)) &&
4387 "Need to vectorize gather entry?");
4388 // Gathered loads still gathered? Do not create entry, use the original one.
4389 if (GatheredLoadsEntriesFirst.has_value() &&
4390 EntryState == TreeEntry::NeedToGather && S &&
4391 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4392 !UserTreeIdx.UserTE)
4393 return nullptr;
4394 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4395 TreeEntry *Last = VectorizableTree.back().get();
4396 Last->Idx = VectorizableTree.size() - 1;
4397 Last->State = EntryState;
4398 if (UserTreeIdx.UserTE)
4399 OperandsToTreeEntry.try_emplace(
4400 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4401 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4402 // for non-power-of-two vectors.
4403 assert(
4404 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4405 ReuseShuffleIndices.empty()) &&
4406 "Reshuffling scalars not yet supported for nodes with padding");
4407 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4408 ReuseShuffleIndices.end());
4409 if (ReorderIndices.empty()) {
4410 Last->Scalars.assign(VL.begin(), VL.end());
4411 if (S)
4412 Last->setOperations(S);
4413 } else {
4414 // Reorder scalars and build final mask.
4415 Last->Scalars.assign(VL.size(), nullptr);
4416 transform(ReorderIndices, Last->Scalars.begin(),
4417 [VL](unsigned Idx) -> Value * {
4418 if (Idx >= VL.size())
4419 return UndefValue::get(VL.front()->getType());
4420 return VL[Idx];
4421 });
4422 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4423 if (S)
4424 Last->setOperations(S);
4425 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4426 }
4427 if (EntryState == TreeEntry::SplitVectorize) {
4428 assert(S && "Split nodes must have operations.");
4429 Last->setOperations(S);
4430 SmallPtrSet<Value *, 4> Processed;
4431 for (Value *V : VL) {
4432 auto *I = dyn_cast<Instruction>(V);
4433 if (!I)
4434 continue;
4435 auto It = ScalarsInSplitNodes.find(V);
4436 if (It == ScalarsInSplitNodes.end()) {
4437 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4438 (void)Processed.insert(V);
4439 } else if (Processed.insert(V).second) {
4440 assert(!is_contained(It->getSecond(), Last) &&
4441 "Value already associated with the node.");
4442 It->getSecond().push_back(Last);
4443 }
4444 }
4445 } else if (!Last->isGather()) {
4446 if (isa<PHINode>(S.getMainOp()) ||
4447 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4448 (!S.areInstructionsWithCopyableElements() &&
4449 doesNotNeedToSchedule(VL)) ||
4450 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4451 Last->setDoesNotNeedToSchedule();
4452 SmallPtrSet<Value *, 4> Processed;
4453 for (Value *V : VL) {
4454 if (isa<PoisonValue>(V))
4455 continue;
4456 if (S.isCopyableElement(V)) {
4457 Last->addCopyableElement(V);
4458 continue;
4459 }
4460 auto It = ScalarToTreeEntries.find(V);
4461 if (It == ScalarToTreeEntries.end()) {
4462 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4463 (void)Processed.insert(V);
4464 } else if (Processed.insert(V).second) {
4465 assert(!is_contained(It->getSecond(), Last) &&
4466 "Value already associated with the node.");
4467 It->getSecond().push_back(Last);
4468 }
4469 }
4470 // Update the scheduler bundle to point to this TreeEntry.
4471 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4472 "Bundle and VL out of sync");
4473 if (!Bundle.getBundle().empty()) {
4474#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4475 auto *BundleMember = Bundle.getBundle().begin();
4476 SmallPtrSet<Value *, 4> Processed;
4477 for (Value *V : VL) {
4478 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4479 continue;
4480 ++BundleMember;
4481 }
4482 assert(BundleMember == Bundle.getBundle().end() &&
4483 "Bundle and VL out of sync");
4484#endif
4485 Bundle.setTreeEntry(Last);
4486 }
4487 } else {
4488 // Build a map for gathered scalars to the nodes where they are used.
4489 bool AllConstsOrCasts = true;
4490 for (Value *V : VL) {
4491 if (S && S.areInstructionsWithCopyableElements() &&
4492 S.isCopyableElement(V))
4493 Last->addCopyableElement(V);
4494 if (!isConstant(V)) {
4495 auto *I = dyn_cast<CastInst>(V);
4496 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4497 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4498 !UserTreeIdx.UserTE->isGather())
4499 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4500 }
4501 }
4502 if (AllConstsOrCasts)
4503 CastMaxMinBWSizes =
4504 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4505 MustGather.insert_range(VL);
4506 }
4507
4508 if (UserTreeIdx.UserTE)
4509 Last->UserTreeIndex = UserTreeIdx;
4510 return Last;
4511 }
4512
4513 /// -- Vectorization State --
4514 /// Holds all of the tree entries.
4515 TreeEntry::VecTreeTy VectorizableTree;
4516
4517#ifndef NDEBUG
4518 /// Debug printer.
4519 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4520 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4521 VectorizableTree[Id]->dump();
4522 dbgs() << "\n";
4523 }
4524 }
4525#endif
4526
4527 /// Get list of vector entries, associated with the value \p V.
4528 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4529 assert(V && "V cannot be nullptr.");
4530 auto It = ScalarToTreeEntries.find(V);
4531 if (It == ScalarToTreeEntries.end())
4532 return {};
4533 return It->getSecond();
4534 }
4535
4536 /// Get list of split vector entries, associated with the value \p V.
4537 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4538 assert(V && "V cannot be nullptr.");
4539 auto It = ScalarsInSplitNodes.find(V);
4540 if (It == ScalarsInSplitNodes.end())
4541 return {};
4542 return It->getSecond();
4543 }
4544
4545 /// Returns first vector node for value \p V, matching values \p VL.
4546 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4547 bool SameVF = false) const {
4548 assert(V && "V cannot be nullptr.");
4549 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4550 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4551 return TE;
4552 return nullptr;
4553 }
4554
4555 /// Check that the operand node of an alternate node does not generate a
4556 /// buildvector sequence. If it does, then it is probably not worth building
4557 /// an alternate shuffle if the number of buildvector operands plus the
4558 /// alternate instruction exceeds the number of buildvector instructions.
4559 /// \param S the instructions state of the analyzed values.
4560 /// \param VL list of the instructions with alternate opcodes.
4561 bool areAltOperandsProfitable(const InstructionsState &S,
4562 ArrayRef<Value *> VL) const;
4563
4564 /// Contains all the outputs of legality analysis for a list of values to
4565 /// vectorize.
4566 class ScalarsVectorizationLegality {
4567 InstructionsState S;
4568 bool IsLegal;
4569 bool TryToFindDuplicates;
4570 bool TrySplitVectorize;
4571
4572 public:
4573 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4574 bool TryToFindDuplicates = true,
4575 bool TrySplitVectorize = false)
4576 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4577 TrySplitVectorize(TrySplitVectorize) {
4578 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4579 "Inconsistent state");
4580 }
4581 const InstructionsState &getInstructionsState() const { return S; };
4582 bool isLegal() const { return IsLegal; }
4583 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4584 bool trySplitVectorize() const { return TrySplitVectorize; }
4585 };
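  // A minimal usage sketch, assuming a hypothetical caller that builds tree
  // nodes from the legality result:
  //
  //   ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
  //       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  //   if (!Legality.isLegal()) {
  //     if (Legality.trySplitVectorize())
  //       ; // attempt a SplitVectorize node instead
  //     else if (Legality.tryToFindDuplicates())
  //       ; // gather, after de-duplicating the scalars
  //     return;
  //   }
  //   const InstructionsState &S = Legality.getInstructionsState();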
4586
4587 /// Checks if the specified list of the instructions/values can be vectorized
4588 /// in general.
4589 ScalarsVectorizationLegality
4590 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4591 const EdgeInfo &UserTreeIdx,
4592 bool TryCopyableElementsVectorization) const;
4593
4594 /// Checks if the specified list of the instructions/values can be vectorized
4595 /// and fills required data before actual scheduling of the instructions.
4596 TreeEntry::EntryState getScalarsVectorizationState(
4597 const InstructionsState &S, ArrayRef<Value *> VL,
4598 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4599 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4600
4601 /// Maps a specific scalar to its tree entry(ies).
4602 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4603
4604 /// List of deleted non-profitable nodes.
4605 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4606
4607 /// List of nodes transformed to gather nodes, with their conservative
4608 /// gather/buildvector cost estimation.
4609 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4610
4611 /// Maps the operand index and entry to the corresponding tree entry.
4612 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4613 OperandsToTreeEntry;
4614
4615 /// Scalars, used in split vectorize nodes.
4616 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4617
4618 /// Maps a value to the proposed vectorizable size.
4619 SmallDenseMap<Value *, unsigned> InstrElementSize;
4620
4621 /// A list of scalars that we found that we need to keep as scalars.
4622 ValueSet MustGather;
4623
4624 /// A set of first non-schedulable values.
4625 ValueSet NonScheduledFirst;
4626
4627 /// A map between the vectorized entries and the last instructions in the
4628 /// bundles. The bundles are built in use order, not in the def order of the
4629 /// instructions. So, we cannot rely directly on the last instruction in the
4630 /// bundle being the last instruction in program order during the
4631 /// vectorization process, since the basic blocks are affected and these
4632 /// instructions need to be pre-gathered beforehand.
4633 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4634
4635 /// Keeps the mapping between the last instructions and their insertion
4636 /// points, i.e. the instruction right after the last instruction.
4637 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4638
4639 /// List of gather nodes that depend on other gather/vector nodes and should
4640 /// be emitted after the vector instruction emission process to correctly
4641 /// handle the order of the vector instructions and shuffles.
4642 SetVector<const TreeEntry *> PostponedGathers;
4643
4644 using ValueToGatherNodesMap =
4645 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4646 ValueToGatherNodesMap ValueToGatherNodes;
4647
4648 /// A list of the load entries (node indices) that can be vectorized using a
4649 /// strided or masked gather approach, but are first attempted to be
4650 /// represented as contiguous loads.
4651 SetVector<unsigned> LoadEntriesToVectorize;
4652
4653 /// True if the graph-node transformation mode is on.
4654 bool IsGraphTransformMode = false;
4655
4656 /// The index of the first gathered load entry in the VectorizeTree.
4657 std::optional<unsigned> GatheredLoadsEntriesFirst;
4658
4659 /// Maps compress entries to their mask data for the final codegen.
4660 SmallDenseMap<const TreeEntry *,
4661 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4662 CompressEntryToData;
4663
4664 /// This POD struct describes one external user in the vectorized tree.
4665 struct ExternalUser {
4666 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4667 : Scalar(S), User(U), E(E), Lane(L) {}
4668
4669 /// Which scalar in our function.
4670 Value *Scalar = nullptr;
4671
4672 /// Which user that uses the scalar.
4673 llvm::User *User = nullptr;
4674
4675 /// Vector node, the value is part of.
4676 const TreeEntry &E;
4677
4678 /// Which lane does the scalar belong to.
4679 unsigned Lane;
4680 };
4681 using UserList = SmallVector<ExternalUser, 16>;
4682
4683 /// Checks if two instructions may access the same memory.
4684 ///
4685 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4686 /// is invariant in the calling loop.
4687 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4688 Instruction *Inst2) {
4689 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4690 // First check if the result is already in the cache.
4691 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4692 auto Res = AliasCache.try_emplace(Key);
4693 if (!Res.second)
4694 return Res.first->second;
4695 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4696 // Store the result in the cache.
4697 Res.first->getSecond() = Aliased;
4698 return Aliased;
4699 }
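  // For illustration only, a possible usage pattern: when building memory
  // dependencies, the same location is typically checked against many
  // candidate instructions, so the cache plus BatchAAResults keeps repeated
  // queries cheap. SomeStore, MemoryInsts and recordMemoryDependency are
  // hypothetical stand-ins:
  //
  //   MemoryLocation Loc = MemoryLocation::get(SomeStore);
  //   for (Instruction *Other : MemoryInsts)
  //     if (isAliased(Loc, SomeStore, Other))
  //       recordMemoryDependency(SomeStore, Other);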
4700
4701 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4702
4703 /// Cache for alias results.
4704 /// TODO: consider moving this to the AliasAnalysis itself.
4705 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4706
4707 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4708 // globally through SLP because we don't perform any action which
4709 // invalidates capture results.
4710 BatchAAResults BatchAA;
4711
4712 /// Temporary store for deleted instructions. Instructions will be deleted
4713 /// eventually when the BoUpSLP is destructed. The deferral is required to
4714 /// ensure that there are no incorrect collisions in the AliasCache, which
4715 /// can happen if a new instruction is allocated at the same address as a
4716 /// previously deleted instruction.
4717 DenseSet<Instruction *> DeletedInstructions;
4718
4719 /// Set of the instructions already analyzed for reductions.
4720 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4721
4722 /// Set of hashes for the list of reduction values already being analyzed.
4723 DenseSet<size_t> AnalyzedReductionVals;
4724
4725 /// Values that have already been analyzed for minimal bitwidth and found to
4726 /// be non-profitable.
4727 DenseSet<Value *> AnalyzedMinBWVals;
4728
4729 /// A list of values that need to be extracted out of the tree.
4730 /// This list holds pairs of (Internal Scalar : External User). External User
4731 /// can be nullptr, it means that this Internal Scalar will be used later,
4732 /// after vectorization.
4733 UserList ExternalUses;
4734
4735 /// A list of GEPs which can be replaced by scalar GEPs instead of
4736 /// extractelement instructions.
4737 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4738
4739 /// A list of scalars to be extracted without a specific user because of too
4740 /// many uses.
4741 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4742
4743 /// Values used only by @llvm.assume calls.
4744 SmallPtrSet<const Value *, 32> EphValues;
4745
4746 /// Holds all of the instructions that we gathered, shuffle instructions and
4747 /// extractelements.
4748 SetVector<Instruction *> GatherShuffleExtractSeq;
4749
4750 /// A list of blocks that we are going to CSE.
4751 DenseSet<BasicBlock *> CSEBlocks;
4752
4753 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4754 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4755
4756 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4757 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4758 /// single instruction, while ScheduleBundle represents a batch of
4759 /// instructions that are going to be grouped together. ScheduleCopyableData
4760 /// models an extra user for "copyable" instructions.
4761 class ScheduleEntity {
4762 friend class ScheduleBundle;
4763 friend class ScheduleData;
4764 friend class ScheduleCopyableData;
4765
4766 protected:
4767 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4768 Kind getKind() const { return K; }
4769 ScheduleEntity(Kind K) : K(K) {}
4770
4771 private:
4772 /// Used for getting a "good" final ordering of instructions.
4773 int SchedulingPriority = 0;
4774 /// True if this instruction (or bundle) is scheduled (or considered as
4775 /// scheduled in the dry-run).
4776 bool IsScheduled = false;
4777 /// The kind of the ScheduleEntity.
4778 const Kind K = Kind::ScheduleData;
4779
4780 public:
4781 ScheduleEntity() = delete;
4782 /// Gets/sets the scheduling priority.
4783 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4784 int getSchedulingPriority() const { return SchedulingPriority; }
4785 bool isReady() const {
4786 if (const auto *SD = dyn_cast<ScheduleData>(this))
4787 return SD->isReady();
4788 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4789 return CD->isReady();
4790 return cast<ScheduleBundle>(this)->isReady();
4791 }
4792 /// Returns true if the dependency information has been calculated.
4793 /// Note that dependency validity can vary between instructions within
4794 /// a single bundle.
4795 bool hasValidDependencies() const {
4796 if (const auto *SD = dyn_cast<ScheduleData>(this))
4797 return SD->hasValidDependencies();
4798 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4799 return CD->hasValidDependencies();
4800 return cast<ScheduleBundle>(this)->hasValidDependencies();
4801 }
4802 /// Gets the number of unscheduled dependencies.
4803 int getUnscheduledDeps() const {
4804 if (const auto *SD = dyn_cast<ScheduleData>(this))
4805 return SD->getUnscheduledDeps();
4806 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4807 return CD->getUnscheduledDeps();
4808 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4809 }
4810 /// Increments the number of unscheduled dependencies.
4811 int incrementUnscheduledDeps(int Incr) {
4812 if (auto *SD = dyn_cast<ScheduleData>(this))
4813 return SD->incrementUnscheduledDeps(Incr);
4814 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4815 }
4816 /// Gets the number of dependencies.
4817 int getDependencies() const {
4818 if (const auto *SD = dyn_cast<ScheduleData>(this))
4819 return SD->getDependencies();
4820 return cast<ScheduleCopyableData>(this)->getDependencies();
4821 }
4822 /// Gets the instruction.
4823 Instruction *getInst() const {
4824 if (const auto *SD = dyn_cast<ScheduleData>(this))
4825 return SD->getInst();
4826 return cast<ScheduleCopyableData>(this)->getInst();
4827 }
4828
4829 /// Gets/sets if the bundle is scheduled.
4830 bool isScheduled() const { return IsScheduled; }
4831 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4832
4833 static bool classof(const ScheduleEntity *) { return true; }
4834
4835#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4836 void dump(raw_ostream &OS) const {
4837 if (const auto *SD = dyn_cast<ScheduleData>(this))
4838 return SD->dump(OS);
4839 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4840 return CD->dump(OS);
4841 return cast<ScheduleBundle>(this)->dump(OS);
4842 }
4843
4844 LLVM_DUMP_METHOD void dump() const {
4845 dump(dbgs());
4846 dbgs() << '\n';
4847 }
4848#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4849 };
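  // For illustration only: ScheduleEntity uses LLVM-style RTTI (a Kind tag
  // plus classof), so clients dispatch with dyn_cast/cast rather than virtual
  // calls, mirroring isReady() above. verifyEntity is a hypothetical helper:
  //
  //   void verifyEntity(ScheduleEntity *E) {
  //     if (auto *SD = dyn_cast<ScheduleData>(E))
  //       SD->verify();
  //     else if (auto *CD = dyn_cast<ScheduleCopyableData>(E))
  //       CD->verify();
  //     else
  //       cast<ScheduleBundle>(E)->verify();
  //   }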
4850
4851#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4852 friend inline raw_ostream &operator<<(raw_ostream &OS,
4853 const BoUpSLP::ScheduleEntity &SE) {
4854 SE.dump(OS);
4855 return OS;
4856 }
4857#endif
4858
4859 /// Contains all scheduling relevant data for an instruction.
4860 /// A ScheduleData either represents a single instruction or a member of an
4861 /// instruction bundle (= a group of instructions which is combined into a
4862 /// vector instruction).
4863 class ScheduleData final : public ScheduleEntity {
4864 public:
4865 // The initial value for the dependency counters. It means that the
4866 // dependencies are not calculated yet.
4867 enum { InvalidDeps = -1 };
4868
4869 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4870 static bool classof(const ScheduleEntity *Entity) {
4871 return Entity->getKind() == Kind::ScheduleData;
4872 }
4873
4874 void init(int BlockSchedulingRegionID, Instruction *I) {
4875 NextLoadStore = nullptr;
4876 IsScheduled = false;
4877 SchedulingRegionID = BlockSchedulingRegionID;
4878 clearDependencies();
4879 Inst = I;
4880 }
4881
4882 /// Verify basic self consistency properties
4883 void verify() {
4884 if (hasValidDependencies()) {
4885 assert(UnscheduledDeps <= Dependencies && "invariant");
4886 } else {
4887 assert(UnscheduledDeps == Dependencies && "invariant");
4888 }
4889
4890 if (IsScheduled) {
4891 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4892 "unexpected scheduled state");
4893 }
4894 }
4895
4896 /// Returns true if the dependency information has been calculated.
4897 /// Note that dependency validity can vary between instructions within
4898 /// a single bundle.
4899 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4900
4901 /// Returns true if it is ready for scheduling, i.e. it has no more
4902 /// unscheduled depending instructions/bundles.
4903 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4904
4905 /// Modifies the number of unscheduled dependencies for this instruction,
4906 /// and returns the number of remaining dependencies for the containing
4907 /// bundle.
4908 int incrementUnscheduledDeps(int Incr) {
4909 assert(hasValidDependencies() &&
4910 "increment of unscheduled deps would be meaningless");
4911 UnscheduledDeps += Incr;
4912 assert(UnscheduledDeps >= 0 &&
4913 "Expected valid number of unscheduled deps");
4914 return UnscheduledDeps;
4915 }
4916
4917 /// Sets the number of unscheduled dependencies to the number of
4918 /// dependencies.
4919 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4920
4921 /// Clears all dependency information.
4922 void clearDependencies() {
4923 clearDirectDependencies();
4924 MemoryDependencies.clear();
4925 ControlDependencies.clear();
4926 }
4927
4928 /// Clears all direct dependencies only, except for control and memory
4929 /// dependencies.
4930 /// Required for copyable elements to correctly handle control/memory deps
4931 /// and avoid extra recalculation of such deps.
4932 void clearDirectDependencies() {
4933 Dependencies = InvalidDeps;
4934 resetUnscheduledDeps();
4935 IsScheduled = false;
4936 }
4937
4938 /// Gets the number of unscheduled dependencies.
4939 int getUnscheduledDeps() const { return UnscheduledDeps; }
4940 /// Gets the number of dependencies.
4941 int getDependencies() const { return Dependencies; }
4942 /// Initializes the number of dependencies.
4943 void initDependencies() { Dependencies = 0; }
4944 /// Increments the number of dependencies.
4945 void incDependencies() { Dependencies++; }
4946
4947 /// Gets scheduling region ID.
4948 int getSchedulingRegionID() const { return SchedulingRegionID; }
4949
4950 /// Gets the instruction.
4951 Instruction *getInst() const { return Inst; }
4952
4953 /// Gets the list of memory dependencies.
4954 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4955 return MemoryDependencies;
4956 }
4957 /// Adds a memory dependency.
4958 void addMemoryDependency(ScheduleData *Dep) {
4959 MemoryDependencies.push_back(Dep);
4960 }
4961 /// Gets the list of control dependencies.
4962 ArrayRef<ScheduleData *> getControlDependencies() const {
4963 return ControlDependencies;
4964 }
4965 /// Adds a control dependency.
4966 void addControlDependency(ScheduleData *Dep) {
4967 ControlDependencies.push_back(Dep);
4968 }
4969 /// Gets/sets the next load/store instruction in the block.
4970 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4971 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4972
4973 void dump(raw_ostream &OS) const { OS << *Inst; }
4974
4975 LLVM_DUMP_METHOD void dump() const {
4976 dump(dbgs());
4977 dbgs() << '\n';
4978 }
4979
4980 private:
4981 Instruction *Inst = nullptr;
4982
4983 /// Single linked list of all memory instructions (e.g. load, store, call)
4984 /// in the block - until the end of the scheduling region.
4985 ScheduleData *NextLoadStore = nullptr;
4986
4987 /// The dependent memory instructions.
4988 /// This list is derived on demand in calculateDependencies().
4989 SmallVector<ScheduleData *> MemoryDependencies;
4990
4991 /// List of instructions which this instruction could be control dependent
4992 /// on. Allowing such nodes to be scheduled below this one could introduce
4993 /// a runtime fault which didn't exist in the original program.
4994 /// ex: this is a load or udiv following a readonly call which inf loops
4995 SmallVector<ScheduleData *> ControlDependencies;
4996
4997 /// This ScheduleData is in the current scheduling region if this matches
4998 /// the current SchedulingRegionID of BlockScheduling.
4999 int SchedulingRegionID = 0;
5000
5001 /// The number of dependencies. Consists of the number of users of the
5002 /// instruction plus the number of dependent memory instructions (if any).
5003 /// This value is calculated on demand.
5004 /// If InvalidDeps, the number of dependencies is not calculated yet.
5005 int Dependencies = InvalidDeps;
5006
5007 /// The number of dependencies minus the number of dependencies of scheduled
5008 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5009 /// for scheduling.
5010 /// Note that this is negative as long as Dependencies is not calculated.
5011 int UnscheduledDeps = InvalidDeps;
5012 };
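  // For illustration only, the counter protocol ScheduleData implements
  // (SomeInstr stands in for any Instruction * in the scheduling region):
  //
  //   ScheduleData SD;
  //   SD.init(/*BlockSchedulingRegionID=*/1, SomeInstr);
  //   assert(!SD.hasValidDependencies()); // Dependencies == InvalidDeps
  //   SD.initDependencies();              // Dependencies = 0
  //   SD.incDependencies();               // one user/memory dep discovered
  //   SD.resetUnscheduledDeps();          // UnscheduledDeps = Dependencies
  //   SD.incrementUnscheduledDeps(-1);    // that dependency was scheduled
  //   assert(SD.isReady());               // ready once the counter hits 0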
5013
5014#ifndef NDEBUG
5015 friend inline raw_ostream &operator<<(raw_ostream &OS,
5016 const BoUpSLP::ScheduleData &SD) {
5017 SD.dump(OS);
5018 return OS;
5019 }
5020#endif
5021
5022 class ScheduleBundle final : public ScheduleEntity {
5023 /// The schedule data for the instructions in the bundle.
5024 SmallVector<ScheduleEntity *> Bundle;
5025 /// True if this bundle is valid.
5026 bool IsValid = true;
5027 /// The TreeEntry that this instruction corresponds to.
5028 TreeEntry *TE = nullptr;
5029 ScheduleBundle(bool IsValid)
5030 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5031
5032 public:
5033 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5034 static bool classof(const ScheduleEntity *Entity) {
5035 return Entity->getKind() == Kind::ScheduleBundle;
5036 }
5037
5038 /// Verify basic self consistency properties
5039 void verify() const {
5040 for (const ScheduleEntity *SD : Bundle) {
5041 if (SD->hasValidDependencies()) {
5042 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5043 "invariant");
5044 } else {
5045 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5046 "invariant");
5047 }
5048
5049 if (isScheduled()) {
5050 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5051 "unexpected scheduled state");
5052 }
5053 }
5054 }
5055
5056 /// Returns the number of unscheduled dependencies in the bundle.
5057 int unscheduledDepsInBundle() const {
5058 assert(*this && "bundle must not be empty");
5059 int Sum = 0;
5060 for (const ScheduleEntity *BundleMember : Bundle) {
5061 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5062 return ScheduleData::InvalidDeps;
5063 Sum += BundleMember->getUnscheduledDeps();
5064 }
5065 return Sum;
5066 }
5067
5068 /// Returns true if the dependency information has been calculated.
5069 /// Note that depenendency validity can vary between instructions within
5070 /// a single bundle.
5071 bool hasValidDependencies() const {
5072 return all_of(Bundle, [](const ScheduleEntity *SD) {
5073 return SD->hasValidDependencies();
5074 });
5075 }
5076
5077 /// Returns true if it is ready for scheduling, i.e. it has no more
5078 /// unscheduled depending instructions/bundles.
5079 bool isReady() const {
5080 assert(*this && "bundle must not be empty");
5081 return unscheduledDepsInBundle() == 0 && !isScheduled();
5082 }
5083
5084 /// Returns the bundle of scheduling data, associated with the current
5085 /// instruction.
5086 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5087 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5088 /// Adds an instruction to the bundle.
5089 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5090
5091 /// Gets/sets the associated tree entry.
5092 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5093 TreeEntry *getTreeEntry() const { return TE; }
5094
5095 static ScheduleBundle invalid() { return {false}; }
5096
5097 operator bool() const { return IsValid; }
5098
5099#ifndef NDEBUG
5100 void dump(raw_ostream &OS) const {
5101 if (!*this) {
5102 OS << "[]";
5103 return;
5104 }
5105 OS << '[';
5106 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5108 OS << "<Copyable>";
5109 OS << *SD->getInst();
5110 });
5111 OS << ']';
5112 }
5113
5114 LLVM_DUMP_METHOD void dump() const {
5115 dump(dbgs());
5116 dbgs() << '\n';
5117 }
5118#endif // NDEBUG
5119 };
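  // For illustration only: a bundle's readiness aggregates its members, and a
  // member whose dependencies are not calculated yet poisons the sum, matching
  // unscheduledDepsInBundle() above (SomeBundle is a hypothetical bundle):
  //
  //   int Sum = 0;
  //   for (const ScheduleEntity *Member : SomeBundle.getBundle()) {
  //     if (Member->getUnscheduledDeps() == ScheduleData::InvalidDeps) {
  //       Sum = ScheduleData::InvalidDeps; // not calculated yet => not ready
  //       break;
  //     }
  //     Sum += Member->getUnscheduledDeps();
  //   }
  //   bool Ready = Sum == 0 && !SomeBundle.isScheduled();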
5120
5121#ifndef NDEBUG
5122 friend inline raw_ostream &operator<<(raw_ostream &OS,
5123 const BoUpSLP::ScheduleBundle &Bundle) {
5124 Bundle.dump(OS);
5125 return OS;
5126 }
5127#endif
5128
5129 /// Contains all scheduling relevant data for the copyable instruction.
5130 /// It models the virtual instructions, supposed to replace the original
5131 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5132 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5133 /// instruction %virt = add %0, 0.
5134 class ScheduleCopyableData final : public ScheduleEntity {
5135 /// The source schedule data for the instruction.
5136 Instruction *Inst = nullptr;
5137 /// The edge information for the instruction.
5138 const EdgeInfo EI;
5139 /// This ScheduleData is in the current scheduling region if this matches
5140 /// the current SchedulingRegionID of BlockScheduling.
5141 int SchedulingRegionID = 0;
5142 /// Bundle, this data is part of.
5143 ScheduleBundle &Bundle;
5144
5145 public:
5146 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5147 const EdgeInfo &EI, ScheduleBundle &Bundle)
5148 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5149 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5150 static bool classof(const ScheduleEntity *Entity) {
5151 return Entity->getKind() == Kind::ScheduleCopyableData;
5152 }
5153
5154 /// Verify basic self consistency properties
5155 void verify() {
5156 if (hasValidDependencies()) {
5157 assert(UnscheduledDeps <= Dependencies && "invariant");
5158 } else {
5159 assert(UnscheduledDeps == Dependencies && "invariant");
5160 }
5161
5162 if (IsScheduled) {
5163 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5164 "unexpected scheduled state");
5165 }
5166 }
5167
5168 /// Returns true if the dependency information has been calculated.
5169 /// Note that dependency validity can vary between instructions within
5170 /// a single bundle.
5171 bool hasValidDependencies() const {
5172 return Dependencies != ScheduleData::InvalidDeps;
5173 }
5174
5175 /// Returns true if it is ready for scheduling, i.e. it has no more
5176 /// unscheduled depending instructions/bundles.
5177 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5178
5179 /// Modifies the number of unscheduled dependencies for this instruction,
5180 /// and returns the number of remaining dependencies for the containing
5181 /// bundle.
5182 int incrementUnscheduledDeps(int Incr) {
5183 assert(hasValidDependencies() &&
5184 "increment of unscheduled deps would be meaningless");
5185 UnscheduledDeps += Incr;
5186 assert(UnscheduledDeps >= 0 && "invariant");
5187 return UnscheduledDeps;
5188 }
5189
5190 /// Sets the number of unscheduled dependencies to the number of
5191 /// dependencies.
5192 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5193
5194 /// Gets the number of unscheduled dependencies.
5195 int getUnscheduledDeps() const { return UnscheduledDeps; }
5196 /// Gets the number of dependencies.
5197 int getDependencies() const { return Dependencies; }
5198 /// Initializes the number of dependencies.
5199 void initDependencies() { Dependencies = 0; }
5200 /// Increments the number of dependencies.
5201 void incDependencies() { Dependencies++; }
5202
5203 /// Gets scheduling region ID.
5204 int getSchedulingRegionID() const { return SchedulingRegionID; }
5205
5206 /// Gets the instruction.
5207 Instruction *getInst() const { return Inst; }
5208
5209 /// Clears all dependency information.
5210 void clearDependencies() {
5211 Dependencies = ScheduleData::InvalidDeps;
5212 UnscheduledDeps = ScheduleData::InvalidDeps;
5213 IsScheduled = false;
5214 }
5215
5216 /// Gets the edge information.
5217 const EdgeInfo &getEdgeInfo() const { return EI; }
5218
5219 /// Gets the bundle.
5220 ScheduleBundle &getBundle() { return Bundle; }
5221 const ScheduleBundle &getBundle() const { return Bundle; }
5222
5223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5224 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5225
5226 LLVM_DUMP_METHOD void dump() const {
5227 dump(dbgs());
5228 dbgs() << '\n';
5229 }
5230#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5231
5232 private:
5233 /// The number of dependencies; InvalidDeps if not calculated yet. These
5234 /// nodes always have only a single dependency.
5235 int Dependencies = ScheduleData::InvalidDeps;
5236
5237 /// The number of dependencies minus the number of dependencies of scheduled
5238 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5239 /// for scheduling.
5240 /// Note that this is negative as long as Dependencies is not calculated.
5241 int UnscheduledDeps = ScheduleData::InvalidDeps;
5242 };
5243
5244#ifndef NDEBUG
5245 friend inline raw_ostream &
5246 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5247 SD.dump(OS);
5248 return OS;
5249 }
5250#endif
5251
5252 friend struct GraphTraits<BoUpSLP *>;
5253 friend struct DOTGraphTraits<BoUpSLP *>;
5254
5255 /// Contains all scheduling data for a basic block.
5256 /// It does not schedule instructions which are not memory read/write
5257 /// instructions and whose operands are either constants, or arguments, or
5258 /// phis, or instructions from other blocks, or whose users are phis or from
5259 /// other blocks. The resulting vector instructions can be placed at the
5260 /// beginning of the basic block without scheduling (if the operands do not
5261 /// need to be scheduled) or at the end of the block (if the users are outside
5262 /// of the block). This saves some compile time and memory used by the
5263 /// compiler.
5264 /// ScheduleData is assigned to each instruction between the boundaries of
5265 /// the tree entry, even to those which are not part of the graph. It is
5266 /// required to correctly follow the dependencies between the instructions
5267 /// and to schedule them correctly. ScheduleData is not allocated for
5268 /// instructions which do not require scheduling, like phis, nodes with only
5269 /// extractelements/insertelements, or nodes whose instructions have
5270 /// uses/operands outside of the block.
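 /// For example, given a block like (illustrative IR):
 ///   %p = phi i32 [ %a, %bb0 ], [ %b, %bb1 ]
 ///   %l0 = load i32, ptr %q0
 ///   %l1 = load i32, ptr %q1
 ///   store i32 %l0, ptr %out0
 ///   store i32 %l1, ptr %out1
 /// the phi gets no ScheduleData, while the loads and stores do, because their
 /// relative order constrains where the vectorized memory operations can be
 /// placed.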
5271 struct BlockScheduling {
5272 BlockScheduling(BasicBlock *BB)
5273 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5274
5275 void clear() {
5276 ScheduledBundles.clear();
5277 ScheduledBundlesList.clear();
5278 ScheduleCopyableDataMap.clear();
5279 ScheduleCopyableDataMapByInst.clear();
5280 ScheduleCopyableDataMapByInstUser.clear();
5281 ScheduleCopyableDataMapByUsers.clear();
5282 ReadyInsts.clear();
5283 ScheduleStart = nullptr;
5284 ScheduleEnd = nullptr;
5285 FirstLoadStoreInRegion = nullptr;
5286 LastLoadStoreInRegion = nullptr;
5287 RegionHasStackSave = false;
5288
5289 // Reduce the maximum schedule region size by the size of the
5290 // previous scheduling run.
5291 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5292 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5293 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5294 ScheduleRegionSize = 0;
5295
5296 // Make a new scheduling region, i.e. all existing ScheduleData is not
5297 // in the new region yet.
5298 ++SchedulingRegionID;
5299 }
5300
5301 ScheduleData *getScheduleData(Instruction *I) {
5302 if (!I)
5303 return nullptr;
5304 if (BB != I->getParent())
5305 // Avoid lookup if can't possibly be in map.
5306 return nullptr;
5307 ScheduleData *SD = ScheduleDataMap.lookup(I);
5308 if (SD && isInSchedulingRegion(*SD))
5309 return SD;
5310 return nullptr;
5311 }
5312
5313 ScheduleData *getScheduleData(Value *V) {
5314 return getScheduleData(dyn_cast<Instruction>(V));
5315 }
5316
5317 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5318 /// operand number) and value.
5319 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5320 const Value *V) const {
5321 if (ScheduleCopyableDataMap.empty())
5322 return nullptr;
5323 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5324 if (It == ScheduleCopyableDataMap.end())
5325 return nullptr;
5326 ScheduleCopyableData *SD = It->getSecond().get();
5327 if (!isInSchedulingRegion(*SD))
5328 return nullptr;
5329 return SD;
5330 }
5331
5332 /// Returns the ScheduleCopyableData for the given user \p User, operand
5333 /// number and operand \p V.
5334 SmallVector<ScheduleCopyableData *>
5335 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5336 const Value *V) {
5337 if (ScheduleCopyableDataMapByInstUser.empty())
5338 return {};
5339 const auto It = ScheduleCopyableDataMapByInstUser.find(
5340 std::make_pair(std::make_pair(User, OperandIdx), V));
5341 if (It == ScheduleCopyableDataMapByInstUser.end())
5342 return {};
5343 SmallVector<ScheduleCopyableData *> Res;
5344 for (ScheduleCopyableData *SD : It->getSecond()) {
5345 if (isInSchedulingRegion(*SD))
5346 Res.push_back(SD);
5347 }
5348 return Res;
5349 }
5350
5351 /// Returns true if all operands of the given instruction \p User are
5352 /// replaced by copyable data.
5353 /// \param User The user instruction.
5354 /// \param Op The operand, which might be replaced by the copyable data.
5355 /// \param SLP The SLP tree.
5356 /// \param NumOps The number of operands used. If the instruction uses the
5357 /// same operand several times, check for the first use, then the second,
5358 /// etc.
5359 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5360 Instruction *Op, BoUpSLP &SLP,
5361 unsigned NumOps) const {
5362 assert(NumOps > 0 && "No operands");
5363 if (ScheduleCopyableDataMap.empty())
5364 return false;
5365 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5366 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5367 if (Entries.empty())
5368 return false;
5369 unsigned CurNumOps = 0;
5370 for (const Use &U : User->operands()) {
5371 if (U.get() != Op)
5372 continue;
5373 ++CurNumOps;
5374 // Check all tree entries, if they have operands replaced by copyable
5375 // data.
5376 for (TreeEntry *TE : Entries) {
5377 unsigned Inc = 0;
5378 bool IsNonSchedulableWithParentPhiNode =
5379 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5380 TE->UserTreeIndex.UserTE->hasState() &&
5381 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5382 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5383 // Count the number of unique phi nodes that are the parents in the
5384 // parent entry, and exit once all the unique phis are processed.
5385 if (IsNonSchedulableWithParentPhiNode) {
5386 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5387 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5388 for (Value *V : ParentTE->Scalars) {
5389 auto *PHI = dyn_cast<PHINode>(V);
5390 if (!PHI)
5391 continue;
5392 if (ParentsUniqueUsers.insert(PHI).second &&
5393 is_contained(PHI->incoming_values(), User))
5394 ++Inc;
5395 }
5396 } else {
5397 Inc = count(TE->Scalars, User);
5398 }
5399
5400 // Check if the user is commutative.
5401 // The commutative ones are handled later, as their operands can be
5402 // reordered.
5403 // The same applies even to non-commutative cmps, because we can
5404 // potentially invert their predicate and, thus, reorder the operands.
5405 bool IsCommutativeUser =
5406 ::isCommutative(User) &&
5407 ::isCommutableOperand(User, User, U.getOperandNo());
5408 if (!IsCommutativeUser) {
5409 Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
5410 IsCommutativeUser =
5411 ::isCommutative(MainOp, User) &&
5412 ::isCommutableOperand(MainOp, User, U.getOperandNo());
5413 }
5414 // A commutative user with the same operands can be safely
5415 // considered non-commutative, since reordering the operands does not
5416 // change the semantics.
5417 assert(
5418 (!IsCommutativeUser ||
5419 (((::isCommutative(User) &&
5420 ::isCommutableOperand(User, User, 0) &&
5421 ::isCommutableOperand(User, User, 1)) ||
5422 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
5423 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5424 User, 0) &&
5425 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5426 User, 1))))) &&
5427 "Expected commutative user with 2 first commutable operands");
5428 bool IsCommutativeWithSameOps =
5429 IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
5430 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5431 !isa<CmpInst>(User)) {
5432 EdgeInfo EI(TE, U.getOperandNo());
5433 if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
5434 continue;
5435 return false;
5436 }
5437 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5438 .first->getSecond() += Inc;
5439 }
5440 }
5441 if (PotentiallyReorderedEntriesCount.empty())
5442 return true;
5443 // Check the commutative/cmp entries.
5444 for (auto &P : PotentiallyReorderedEntriesCount) {
5445 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5446 bool IsNonSchedulableWithParentPhiNode =
5447 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5448 P.first->UserTreeIndex.UserTE->hasState() &&
5449 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5450 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5451 auto *It = find(P.first->Scalars, User);
5452 do {
5453 assert(It != P.first->Scalars.end() &&
5454 "User is not in the tree entry");
5455 int Lane = std::distance(P.first->Scalars.begin(), It);
5456 assert(Lane >= 0 && "Lane is not found");
5457 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5458 Lane = P.first->ReorderIndices[Lane];
5459 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5460 "Couldn't find extract lane");
5461 // Count the number of unique phi nodes that are the parents in the
5462 // parent entry, and exit once all the unique phis are processed.
5463 if (IsNonSchedulableWithParentPhiNode) {
5464 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5465 Value *User = ParentTE->Scalars[Lane];
5466 if (!ParentsUniqueUsers.insert(User).second) {
5467 It =
5468 find(make_range(std::next(It), P.first->Scalars.end()), User);
5469 continue;
5470 }
5471 }
5472 for (unsigned OpIdx :
5474 P.first->getMainOp()))) {
5475 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5476 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5477 --P.getSecond();
5478 }
5479 // If parent node is schedulable, it will be handled correctly.
5480 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5481 } while (It != P.first->Scalars.end());
5482 }
5483 return all_of(PotentiallyReorderedEntriesCount,
5484 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5485 return P.second == NumOps - 1;
5486 });
5487 }
5488
5489 SmallVector<ScheduleCopyableData *>
5490 getScheduleCopyableData(const Instruction *I) const {
5491 if (ScheduleCopyableDataMapByInst.empty())
5492 return {};
5493 const auto It = ScheduleCopyableDataMapByInst.find(I);
5494 if (It == ScheduleCopyableDataMapByInst.end())
5495 return {};
5496 SmallVector<ScheduleCopyableData *> Res;
5497 for (ScheduleCopyableData *SD : It->getSecond()) {
5498 if (isInSchedulingRegion(*SD))
5499 Res.push_back(SD);
5500 }
5501 return Res;
5502 }
5503
5504 SmallVector<ScheduleCopyableData *>
5505 getScheduleCopyableDataUsers(const Instruction *User) const {
5506 if (ScheduleCopyableDataMapByUsers.empty())
5507 return {};
5508 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5509 if (It == ScheduleCopyableDataMapByUsers.end())
5510 return {};
5511 SmallVector<ScheduleCopyableData *> Res;
5512 for (ScheduleCopyableData *SD : It->getSecond()) {
5513 if (isInSchedulingRegion(*SD))
5514 Res.push_back(SD);
5515 }
5516 return Res;
5517 }
5518
5519 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5520 Instruction *I,
5521 int SchedulingRegionID,
5522 ScheduleBundle &Bundle) {
5523 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5524 ScheduleCopyableData *CD =
5525 ScheduleCopyableDataMap
5526 .try_emplace(std::make_pair(EI, I),
5527 std::make_unique<ScheduleCopyableData>(
5528 SchedulingRegionID, I, EI, Bundle))
5529 .first->getSecond()
5530 .get();
5531 ScheduleCopyableDataMapByInst[I].push_back(CD);
5532 if (EI.UserTE) {
5533 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5534 const auto *It = find(Op, I);
5535 assert(It != Op.end() && "Lane not set");
5536 SmallPtrSet<Instruction *, 4> Visited;
5537 do {
5538 int Lane = std::distance(Op.begin(), It);
5539 assert(Lane >= 0 && "Lane not set");
5540 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5541 !EI.UserTE->ReorderIndices.empty())
5542 Lane = EI.UserTE->ReorderIndices[Lane];
5543 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5544 "Couldn't find extract lane");
5545 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5546 if (!Visited.insert(In).second) {
5547 It = find(make_range(std::next(It), Op.end()), I);
5548 continue;
5549 }
5550 ScheduleCopyableDataMapByInstUser
5551 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5552 .first->getSecond()
5553 .push_back(CD);
5554 ScheduleCopyableDataMapByUsers.try_emplace(I)
5555 .first->getSecond()
5556 .insert(CD);
5557 // Remove extra deps for users that become non-immediate users of the
5558 // instruction. This may happen if a chain of the same copyable elements
5559 // appears in the tree.
5560 if (In == I) {
5561 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5562 if (ScheduleCopyableData *UserCD =
5563 getScheduleCopyableData(UserEI, In))
5564 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5565 }
5566 It = find(make_range(std::next(It), Op.end()), I);
5567 } while (It != Op.end());
5568 } else {
5569 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5570 CD);
5571 }
5572 return *CD;
5573 }
5574
5575 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5576 auto *I = dyn_cast<Instruction>(V);
5577 if (!I)
5578 return {};
5579 auto It = ScheduledBundles.find(I);
5580 if (It == ScheduledBundles.end())
5581 return {};
5582 return It->getSecond();
5583 }
5584
5585 /// Returns true if the entity is in the scheduling region.
5586 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5587 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5588 return Data->getSchedulingRegionID() == SchedulingRegionID;
5589 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5590 return CD->getSchedulingRegionID() == SchedulingRegionID;
5591 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5592 [&](const ScheduleEntity *BundleMember) {
5593 return isInSchedulingRegion(*BundleMember);
5594 });
5595 }
5596
5597 /// Marks an instruction as scheduled and puts all dependent ready
5598 /// instructions into the ready-list.
5599 template <typename ReadyListType>
5600 void schedule(const BoUpSLP &R, const InstructionsState &S,
5601 const EdgeInfo &EI, ScheduleEntity *Data,
5602 ReadyListType &ReadyList) {
5603 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5604 ArrayRef<ScheduleBundle *> Bundles) {
5605 // Handle the def-use chain dependencies.
5606
5607 // Decrement the unscheduled counter and insert to ready list if ready.
5608 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5609 if ((IsControl || Data->hasValidDependencies()) &&
5610 Data->incrementUnscheduledDeps(-1) == 0) {
5611 // There are no more unscheduled dependencies after
5612 // decrementing, so we can put the dependent instruction
5613 // into the ready list.
5614 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5615 ArrayRef<ScheduleBundle *> Bundles;
5616 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5617 CopyableBundle.push_back(&CD->getBundle());
5618 Bundles = CopyableBundle;
5619 } else {
5620 Bundles = getScheduleBundles(Data->getInst());
5621 }
5622 if (!Bundles.empty()) {
5623 for (ScheduleBundle *Bundle : Bundles) {
5624 if (Bundle->unscheduledDepsInBundle() == 0) {
5625 assert(!Bundle->isScheduled() &&
5626 "already scheduled bundle gets ready");
5627 ReadyList.insert(Bundle);
5629 << "SLP: gets ready: " << *Bundle << "\n");
5630 }
5631 }
5632 return;
5633 }
5634 assert(!Data->isScheduled() &&
5635 "already scheduled bundle gets ready");
5637 "Expected non-copyable data");
5638 ReadyList.insert(Data);
5639 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5640 }
5641 };
5642
5643 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5644 Instruction *I) {
5645 if (!ScheduleCopyableDataMap.empty()) {
5646 SmallVector<ScheduleCopyableData *> CopyableData =
5647 getScheduleCopyableData(User, OpIdx, I);
5648 for (ScheduleCopyableData *CD : CopyableData)
5649 DecrUnsched(CD, /*IsControl=*/false);
5650 if (!CopyableData.empty())
5651 return;
5652 }
5653 if (ScheduleData *OpSD = getScheduleData(I))
5654 DecrUnsched(OpSD, /*IsControl=*/false);
5655 };
5656
5657 // If BundleMember is a vector bundle, its operands may have been
5658 // reordered during buildTree(). We therefore need to get its operands
5659 // through the TreeEntry.
5660 if (!Bundles.empty()) {
5661 auto *In = BundleMember->getInst();
5662 // Count uses of each instruction operand.
5663 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5664 unsigned TotalOpCount = 0;
5665 if (isa<ScheduleCopyableData>(BundleMember)) {
5666 // Copyable data is used only once (uses itself).
5667 TotalOpCount = OperandsUses[In] = 1;
5668 } else {
5669 for (const Use &U : In->operands()) {
5670 if (auto *I = dyn_cast<Instruction>(U.get())) {
5671 auto Res = OperandsUses.try_emplace(I, 0);
5672 ++Res.first->getSecond();
5673 ++TotalOpCount;
5674 }
5675 }
5676 }
5677 // Decrement the unscheduled counter and insert to ready list if
5678 // ready.
5679 auto DecrUnschedForInst =
5680 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5681 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5682 &Checked) {
5683 if (!ScheduleCopyableDataMap.empty()) {
5684 const EdgeInfo EI = {UserTE, OpIdx};
5685 if (ScheduleCopyableData *CD =
5686 getScheduleCopyableData(EI, I)) {
5687 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5688 return;
5689 DecrUnsched(CD, /*IsControl=*/false);
5690 return;
5691 }
5692 }
5693 auto It = OperandsUses.find(I);
5694 assert(It != OperandsUses.end() && "Operand not found");
5695 if (It->second > 0) {
5696 if (ScheduleData *OpSD = getScheduleData(I)) {
5697 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5698 return;
5699 --It->getSecond();
5700 assert(TotalOpCount > 0 && "No more operands to decrement");
5701 --TotalOpCount;
5702 DecrUnsched(OpSD, /*IsControl=*/false);
5703 } else {
5704 --It->getSecond();
5705 assert(TotalOpCount > 0 && "No more operands to decrement");
5706 --TotalOpCount;
5707 }
5708 }
5709 };
5710
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5712 for (ScheduleBundle *Bundle : Bundles) {
5713 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5714 break;
5715 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5716 // Need to search for the lane since the tree entry can be
5717 // reordered.
5718 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5719 bool IsNonSchedulableWithParentPhiNode =
5720 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5721 Bundle->getTreeEntry()->UserTreeIndex &&
5722 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5723 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5724 TreeEntry::SplitVectorize &&
5725 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5726 Instruction::PHI;
5727 do {
5728 int Lane =
5729 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5730 assert(Lane >= 0 && "Lane not set");
5731 if (isa<StoreInst>(In) &&
5732 !Bundle->getTreeEntry()->ReorderIndices.empty())
5733 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5734 assert(Lane < static_cast<int>(
5735 Bundle->getTreeEntry()->Scalars.size()) &&
5736 "Couldn't find extract lane");
5737
5738 // Since the vectorization tree is being built recursively, this
5739 // assertion ensures that the tree entry has all operands set
5740 // before reaching this code. A couple of exceptions known at the
5741 // moment are extracts, whose second (immediate) operand is
5742 // not added. Since immediates do not affect scheduler behavior,
5743 // this is considered okay.
5744 assert(In &&
5746 In->getNumOperands() ==
5747 Bundle->getTreeEntry()->getNumOperands() ||
5748 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5749 "Missed TreeEntry operands?");
5750
5751 // Count the number of unique phi nodes that are the parents in the
5752 // parent entry, and exit once all the unique phis are processed.
5753 if (IsNonSchedulableWithParentPhiNode) {
5754 const TreeEntry *ParentTE =
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5756 Value *User = ParentTE->Scalars[Lane];
5757 if (!ParentsUniqueUsers.insert(User).second) {
5758 It = std::find(std::next(It),
5759 Bundle->getTreeEntry()->Scalars.end(), In);
5760 continue;
5761 }
5762 }
5763
5764 for (unsigned OpIdx :
5765 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5766 if (auto *I = dyn_cast<Instruction>(
5767 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5768 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5769 << *I << "\n");
5770 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5771 }
5772 // If parent node is schedulable, it will be handled correctly.
5773 if (Bundle->getTreeEntry()->isCopyableElement(In))
5774 break;
5775 It = std::find(std::next(It),
5776 Bundle->getTreeEntry()->Scalars.end(), In);
5777 } while (It != Bundle->getTreeEntry()->Scalars.end());
5778 }
5779 } else {
5780 // If BundleMember is a stand-alone instruction, no operand reordering
5781 // has taken place, so we directly access its operands.
5782 for (Use &U : BundleMember->getInst()->operands()) {
5783 if (auto *I = dyn_cast<Instruction>(U.get())) {
5785 << "SLP: check for readiness (def): " << *I << "\n");
5786 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5787 }
5788 }
5789 }
5790 // Handle the memory dependencies.
5791 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5792 if (!SD)
5793 return;
5794 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5795 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5796 if (!VisitedMemory.insert(MemoryDep).second)
5797 continue;
5798 // There are no more unscheduled dependencies after decrementing,
5799 // so we can put the dependent instruction into the ready list.
5800 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5801 << *MemoryDep << "\n");
5802 DecrUnsched(MemoryDep);
5803 }
5804 // Handle the control dependencies.
5805 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5806 for (ScheduleData *Dep : SD->getControlDependencies()) {
5807 if (!VisitedControl.insert(Dep).second)
5808 continue;
5809 // There are no more unscheduled dependencies after decrementing,
5810 // so we can put the dependent instruction into the ready list.
5812 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5813 DecrUnsched(Dep, /*IsControl=*/true);
5814 }
5815 };
5816 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5817 SD->setScheduled(/*Scheduled=*/true);
5818 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5819 SmallVector<ScheduleBundle *> Bundles;
5820 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5821 Instruction *In = SD->getInst();
5822 if (R.isVectorized(In)) {
5823 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5824 for (TreeEntry *TE : Entries) {
5826 In->getNumOperands() != TE->getNumOperands())
5827 continue;
5828 auto &BundlePtr =
5829 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5830 BundlePtr->setTreeEntry(TE);
5831 BundlePtr->add(SD);
5832 Bundles.push_back(BundlePtr.get());
5833 }
5834 }
5835 ProcessBundleMember(SD, Bundles);
5836 } else {
5837 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5838 Bundle.setScheduled(/*Scheduled=*/true);
5839 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5840 auto AreAllBundlesScheduled =
5841 [&](const ScheduleEntity *SD,
5842 ArrayRef<ScheduleBundle *> SDBundles) {
5843 if (isa<ScheduleCopyableData>(SD))
5844 return true;
5845 return !SDBundles.empty() &&
5846 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5847 return SDBundle->isScheduled();
5848 });
5849 };
5850 for (ScheduleEntity *SD : Bundle.getBundle()) {
5853 SDBundles = getScheduleBundles(SD->getInst());
5854 if (AreAllBundlesScheduled(SD, SDBundles)) {
5855 SD->setScheduled(/*Scheduled=*/true);
5856 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5857 : SDBundles);
5858 }
5859 }
5860 }
5861 }
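    // For illustration only: the machinery above is a dependency-counting
    // (Kahn-style) ready-list scheme. Stripped of bundles, copyable data and
    // memory/control dependencies, the core loop reduces to a generic sketch
    // like this, where Waiters[N] lists the entities whose counters drop once
    // N is scheduled:
    //
    //   void scheduleAll(std::vector<int> Deps,
    //                    const std::vector<std::vector<int>> &Waiters,
    //                    function_ref<void(int)> Emit) {
    //     std::queue<int> Ready;
    //     for (int N = 0, E = (int)Deps.size(); N != E; ++N)
    //       if (Deps[N] == 0)
    //         Ready.push(N);
    //     while (!Ready.empty()) {
    //       int N = Ready.front();
    //       Ready.pop();
    //       Emit(N);                 // corresponds to setScheduled(true)
    //       for (int W : Waiters[N])
    //         if (--Deps[W] == 0)
    //           Ready.push(W);       // corresponds to "gets ready" above
    //     }
    //   }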
5862
5863 /// Verify basic self consistency properties of the data structure.
5864 void verify() {
5865 if (!ScheduleStart)
5866 return;
5867
5868 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5869 ScheduleStart->comesBefore(ScheduleEnd) &&
5870 "Not a valid scheduling region?");
5871
5872 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5873 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5874 if (!Bundles.empty()) {
5875 for (ScheduleBundle *Bundle : Bundles) {
5876 assert(isInSchedulingRegion(*Bundle) &&
5877 "primary schedule data not in window?");
5878 Bundle->verify();
5879 }
5880 continue;
5881 }
5882 auto *SD = getScheduleData(I);
5883 if (!SD)
5884 continue;
5885 assert(isInSchedulingRegion(*SD) &&
5886 "primary schedule data not in window?");
5887 SD->verify();
5888 }
5889
5890 assert(all_of(ReadyInsts,
5891 [](const ScheduleEntity *Bundle) {
5892 return Bundle->isReady();
5893 }) &&
5894 "item in ready list not ready?");
5895 }
5896
5897 /// Put all instructions into the ReadyList which are ready for scheduling.
5898 template <typename ReadyListType>
5899 void initialFillReadyList(ReadyListType &ReadyList) {
5900 SmallPtrSet<ScheduleBundle *, 16> Visited;
5901 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5902 ScheduleData *SD = getScheduleData(I);
5903 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5904 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5905 !Bundles.empty()) {
5906 for (ScheduleBundle *Bundle : Bundles) {
5907 if (!Visited.insert(Bundle).second)
5908 continue;
5909 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5910 ReadyList.insert(Bundle);
5911 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5912 << *Bundle << "\n");
5913 }
5914 }
5915 continue;
5916 }
5917 ReadyList.insert(SD);
5919 << "SLP: initially in ready list: " << *SD << "\n");
5920 }
5921 }
5922 }
5923
5924 /// Build a bundle from the ScheduleData nodes corresponding to the
5925 /// scalar instruction for each lane.
5926 /// \param VL The list of scalar instructions.
5927 /// \param S The state of the instructions.
5928 /// \param EI The edge in the SLP graph or the user node/operand number.
5929 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5930 const InstructionsState &S, const EdgeInfo &EI);
5931
5932 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5933 /// cyclic dependencies. This is only a dry-run, no instructions are
5934 /// actually moved at this stage.
5935 /// \returns the scheduling bundle. The returned Optional value is not
5936 /// std::nullopt if \p VL is allowed to be scheduled.
5937 std::optional<ScheduleBundle *>
5938 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5939 const InstructionsState &S, const EdgeInfo &EI);
5940
5941 /// Allocates schedule data chunk.
5942 ScheduleData *allocateScheduleDataChunks();
5943
5944 /// Extends the scheduling region so that V is inside the region.
5945 /// \returns true if the region size is within the limit.
5946 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5947
5948 /// Initialize the ScheduleData structures for new instructions in the
5949 /// scheduling region.
5950 void initScheduleData(Instruction *FromI, Instruction *ToI,
5951 ScheduleData *PrevLoadStore,
5952 ScheduleData *NextLoadStore);
5953
5954 /// Updates the dependency information of a bundle and of all instructions/
5955 /// bundles which depend on the original bundle.
5956 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5957 BoUpSLP *SLP,
5958 ArrayRef<ScheduleData *> ControlDeps = {});
5959
 5960   /// Sets all instructions in the scheduling region to un-scheduled.
5961 void resetSchedule();
5962
5963 BasicBlock *BB;
5964
5965 /// Simple memory allocation for ScheduleData.
 5966   std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 5967
5968 /// The size of a ScheduleData array in ScheduleDataChunks.
5969 int ChunkSize;
5970
5971 /// The allocator position in the current chunk, which is the last entry
5972 /// of ScheduleDataChunks.
5973 int ChunkPos;
5974
5975 /// Attaches ScheduleData to Instruction.
5976 /// Note that the mapping survives during all vectorization iterations, i.e.
5977 /// ScheduleData structures are recycled.
5978 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5979
5980 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5981 /// number) and the operand instruction, represented as copyable element.
5982 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5983 std::unique_ptr<ScheduleCopyableData>>
5984 ScheduleCopyableDataMap;
5985
5986 /// Represents mapping between instruction and all related
 5987   /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5988 /// element). The SLP tree may contain several representations of the same
5989 /// instruction.
5990 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5991 ScheduleCopyableDataMapByInst;
5992
5993 /// Represents mapping between user value and operand number, the operand
5994 /// value and all related ScheduleCopyableData. The relation is 1:n, because
 5995   /// the same user may reference the same operand in different tree entries
 5996   /// and the operand may be modeled by different copyable data elements.
5997 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
 5998                 SmallVector<ScheduleCopyableData *>>
 5999       ScheduleCopyableDataMapByInstUser;
6000
6001 /// Represents mapping between instruction and all related
6002 /// ScheduleCopyableData. It represents the mapping between the actual
6003 /// instruction and the last copyable data element in the chain. E.g., if
6004 /// the graph models the following instructions:
6005 /// %0 = non-add instruction ...
6006 /// ...
6007 /// %4 = add %3, 1
6008 /// %5 = add %4, 1
6009 /// %6 = insertelement poison, %0, 0
6010 /// %7 = insertelement %6, %5, 1
6011 /// And the graph is modeled as:
6012 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6013 /// -> [1, 0] -> [%1, 0]
6014 ///
6015 /// this map will map %0 only to the copyable element <1>, which is the last
6016 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
6017 /// keep the map to <0>, not the %0.
6018 SmallDenseMap<const Instruction *,
6019 SmallSetVector<ScheduleCopyableData *, 4>>
6020 ScheduleCopyableDataMapByUsers;
6021
6022 /// Attaches ScheduleBundle to Instruction.
6023 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6024 ScheduledBundles;
6025 /// The list of ScheduleBundles.
6026 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6027
6028 /// The ready-list for scheduling (only used for the dry-run).
6029 SetVector<ScheduleEntity *> ReadyInsts;
6030
6031 /// The first instruction of the scheduling region.
6032 Instruction *ScheduleStart = nullptr;
6033
6034 /// The first instruction _after_ the scheduling region.
6035 Instruction *ScheduleEnd = nullptr;
6036
6037 /// The first memory accessing instruction in the scheduling region
6038 /// (can be null).
6039 ScheduleData *FirstLoadStoreInRegion = nullptr;
6040
6041 /// The last memory accessing instruction in the scheduling region
6042 /// (can be null).
6043 ScheduleData *LastLoadStoreInRegion = nullptr;
6044
6045 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6046 /// region? Used to optimize the dependence calculation for the
6047 /// common case where there isn't.
6048 bool RegionHasStackSave = false;
6049
6050 /// The current size of the scheduling region.
6051 int ScheduleRegionSize = 0;
6052
6053 /// The maximum size allowed for the scheduling region.
6054 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6055
6056 /// The ID of the scheduling region. For a new vectorization iteration this
 6057   /// is incremented, which "removes" all ScheduleData from the region.
6058 /// Make sure that the initial SchedulingRegionID is greater than the
6059 /// initial SchedulingRegionID in ScheduleData (which is 0).
6060 int SchedulingRegionID = 1;
6061 };
6062
6063 /// Attaches the BlockScheduling structures to basic blocks.
6064 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6065
6066 /// Performs the "real" scheduling. Done before vectorization is actually
6067 /// performed in a basic block.
6068 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6069
6070 /// List of users to ignore during scheduling and that don't need extracting.
6071 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6072
6073 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6074 /// sorted SmallVectors of unsigned.
6075 struct OrdersTypeDenseMapInfo {
6076 static OrdersType getEmptyKey() {
6077 OrdersType V;
6078 V.push_back(~1U);
6079 return V;
6080 }
6081
6082 static OrdersType getTombstoneKey() {
6083 OrdersType V;
6084 V.push_back(~2U);
6085 return V;
6086 }
6087
6088 static unsigned getHashValue(const OrdersType &V) {
6089 return static_cast<unsigned>(hash_combine_range(V));
6090 }
6091
6092 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6093 return LHS == RHS;
6094 }
6095 };
6096
6097 // Analysis and block reference.
6098 Function *F;
6099 ScalarEvolution *SE;
6100 TargetTransformInfo *TTI;
6101 TargetLibraryInfo *TLI;
6102 LoopInfo *LI;
6103 DominatorTree *DT;
6104 AssumptionCache *AC;
6105 DemandedBits *DB;
6106 const DataLayout *DL;
6107 OptimizationRemarkEmitter *ORE;
6108
6109 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6110 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6111
6112 /// Instruction builder to construct the vectorized tree.
6113 IRBuilder<TargetFolder> Builder;
6114
6115 /// A map of scalar integer values to the smallest bit width with which they
6116 /// can legally be represented. The values map to (width, signed) pairs,
6117 /// where "width" indicates the minimum bit width and "signed" is True if the
6118 /// value must be signed-extended, rather than zero-extended, back to its
6119 /// original width.
6120 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6121
6122 /// Final size of the reduced vector, if the current graph represents the
6123 /// input for the reduction and it was possible to narrow the size of the
6124 /// reduction.
6125 unsigned ReductionBitWidth = 0;
6126
6127 /// Canonical graph size before the transformations.
6128 unsigned BaseGraphSize = 1;
6129
6130 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6131 /// type sizes, used in the tree.
6132 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6133
 6134   /// Indices of the vectorized nodes, which are supposed to be the roots of the new
6135 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6136 DenseSet<unsigned> ExtraBitWidthNodes;
6137};
6138
6139template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
 6140   using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
 6141   using SecondInfo = DenseMapInfo<unsigned>;
 6142   static BoUpSLP::EdgeInfo getEmptyKey() {
 6143     return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6144 SecondInfo::getEmptyKey());
6145 }
6146
 6147   static BoUpSLP::EdgeInfo getTombstoneKey() {
 6148     return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6149 SecondInfo::getTombstoneKey());
6150 }
6151
6152 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6153 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6154 SecondInfo::getHashValue(Val.EdgeIdx));
6155 }
6156
6157 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6158 const BoUpSLP::EdgeInfo &RHS) {
6159 return LHS == RHS;
6160 }
6161};
6162
6163template <> struct llvm::GraphTraits<BoUpSLP *> {
6164 using TreeEntry = BoUpSLP::TreeEntry;
6165
6166 /// NodeRef has to be a pointer per the GraphWriter.
 6167   using NodeRef = TreeEntry *;
 6168
6169 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6170
6171 /// Add the VectorizableTree to the index iterator to be able to return
6172 /// TreeEntry pointers.
 6173   struct ChildIteratorType
 6174       : public iterator_adaptor_base<
6175 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6177
6181
6182 NodeRef operator*() { return I->UserTE; }
6183 };
6184
 6185   static NodeRef getEntryNode(BoUpSLP &R) {
 6186     return R.VectorizableTree[0].get();
6187 }
6188
 6189   static ChildIteratorType child_begin(NodeRef N) {
 6190     return {&N->UserTreeIndex, N->Container};
6191 }
6192
 6193   static ChildIteratorType child_end(NodeRef N) {
 6194     return {&N->UserTreeIndex + 1, N->Container};
6195 }
6196
6197 /// For the node iterator we just need to turn the TreeEntry iterator into a
6198 /// TreeEntry* iterator so that it dereferences to NodeRef.
 6199   class nodes_iterator {
 6200     using ItTy = ContainerTy::iterator;
6201 ItTy It;
6202
6203 public:
6204 nodes_iterator(const ItTy &It2) : It(It2) {}
6205 NodeRef operator*() { return It->get(); }
 6206     nodes_iterator operator++() {
 6207       ++It;
6208 return *this;
6209 }
6210 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6211 };
6212
 6213   static nodes_iterator nodes_begin(BoUpSLP *R) {
 6214     return nodes_iterator(R->VectorizableTree.begin());
6215 }
6216
 6217   static nodes_iterator nodes_end(BoUpSLP *R) {
 6218     return nodes_iterator(R->VectorizableTree.end());
6219 }
6220
6221 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6222};
6223
6224template <>
 6225 struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
 6226   using TreeEntry = BoUpSLP::TreeEntry;
6227
6228 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6229
6230 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6231 std::string Str;
6232 raw_string_ostream OS(Str);
6233 OS << Entry->Idx << ".\n";
6234 if (isSplat(Entry->Scalars))
6235 OS << "<splat> ";
6236 for (auto *V : Entry->Scalars) {
6237 OS << *V;
6238 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6239 return EU.Scalar == V;
6240 }))
6241 OS << " <extract>";
6242 OS << "\n";
6243 }
6244 return Str;
6245 }
6246
6247 static std::string getNodeAttributes(const TreeEntry *Entry,
6248 const BoUpSLP *) {
6249 if (Entry->isGather())
6250 return "color=red";
6251 if (Entry->State == TreeEntry::ScatterVectorize ||
6252 Entry->State == TreeEntry::StridedVectorize ||
6253 Entry->State == TreeEntry::CompressVectorize)
6254 return "color=blue";
6255 return "";
6256 }
6257};
6258
6261 for (auto *I : DeletedInstructions) {
6262 if (!I->getParent()) {
 6263       // Temporarily insert instructions back to erase them from the parent
 6264       // and from memory later.
6265 if (isa<PHINode>(I))
6266 // Phi nodes must be the very first instructions in the block.
6267 I->insertBefore(F->getEntryBlock(),
6268 F->getEntryBlock().getFirstNonPHIIt());
6269 else
6270 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6271 continue;
6272 }
6273 for (Use &U : I->operands()) {
6274 auto *Op = dyn_cast<Instruction>(U.get());
6275 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
 6276           wouldInstructionBeTriviallyDead(Op, TLI))
 6277         DeadInsts.emplace_back(Op);
6278 }
6279 I->dropAllReferences();
6280 }
6281 for (auto *I : DeletedInstructions) {
6282 assert(I->use_empty() &&
6283 "trying to erase instruction with users.");
6284 I->eraseFromParent();
6285 }
6286
6287 // Cleanup any dead scalar code feeding the vectorized instructions
 6288   RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
 6289
6290#ifdef EXPENSIVE_CHECKS
6291 // If we could guarantee that this call is not extremely slow, we could
6292 // remove the ifdef limitation (see PR47712).
6293 assert(!verifyFunction(*F, &dbgs()));
6294#endif
6295}
6296
6297/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
 6298/// contains the original mask for the scalars reused in the node. The procedure
 6299/// transforms this mask in accordance with the given \p Mask.
 6300 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
 6301   assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6302 "Expected non-empty mask.");
6303 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6304 Prev.swap(Reuses);
6305 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6306 if (Mask[I] != PoisonMaskElem)
6307 Reuses[Mask[I]] = Prev[I];
6308}
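// A small worked example of reorderReuses with illustrative values:
//   Reuses = [3, 2, 1, 0], Mask = [1, 0, 3, 2], Prev = [3, 2, 1, 0]
//   Reuses[Mask[0] = 1] = 3; Reuses[Mask[1] = 0] = 2;
//   Reuses[Mask[2] = 3] = 1; Reuses[Mask[3] = 2] = 0;
//   => Reuses = [2, 3, 0, 1]
// Poison mask elements leave the corresponding Reuses entries untouched.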
6309
 6310/// Reorders the given \p Order according to the given \p Mask. \p Order is
 6311/// the original order of the scalars. The procedure transforms the provided order
6312/// in accordance with the given \p Mask. If the resulting \p Order is just an
6313/// identity order, \p Order is cleared.
 6314 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
 6315                          bool BottomOrder = false) {
6316 assert(!Mask.empty() && "Expected non-empty mask.");
6317 unsigned Sz = Mask.size();
6318 if (BottomOrder) {
6319 SmallVector<unsigned> PrevOrder;
6320 if (Order.empty()) {
6321 PrevOrder.resize(Sz);
6322 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6323 } else {
6324 PrevOrder.swap(Order);
6325 }
6326 Order.assign(Sz, Sz);
6327 for (unsigned I = 0; I < Sz; ++I)
6328 if (Mask[I] != PoisonMaskElem)
6329 Order[I] = PrevOrder[Mask[I]];
6330 if (all_of(enumerate(Order), [&](const auto &Data) {
6331 return Data.value() == Sz || Data.index() == Data.value();
6332 })) {
6333 Order.clear();
6334 return;
6335 }
6336 fixupOrderingIndices(Order);
6337 return;
6338 }
6339 SmallVector<int> MaskOrder;
6340 if (Order.empty()) {
6341 MaskOrder.resize(Sz);
6342 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6343 } else {
6344 inversePermutation(Order, MaskOrder);
6345 }
6346 reorderReuses(MaskOrder, Mask);
6347 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6348 Order.clear();
6349 return;
6350 }
6351 Order.assign(Sz, Sz);
6352 for (unsigned I = 0; I < Sz; ++I)
6353 if (MaskOrder[I] != PoisonMaskElem)
6354 Order[MaskOrder[I]] = I;
6355 fixupOrderingIndices(Order);
6356}
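// An illustrative trace of reorderOrder without BottomOrder, assuming
// fixupOrderingIndices leaves an already-complete permutation unchanged:
//   Order = {} (identity), Mask = [2, 0, 1, 3], Sz = 4
//   MaskOrder = [0, 1, 2, 3]; reorderReuses(MaskOrder, Mask) -> [1, 2, 0, 3]
//   Not an identity mask, so Order is rebuilt via Order[MaskOrder[I]] = I
//   => Order = [2, 0, 1, 3]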
6357
6358std::optional<BoUpSLP::OrdersType>
6359BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6360 bool TopToBottom, bool IgnoreReorder) {
6361 assert(TE.isGather() && "Expected gather node only.");
6362 // Try to find subvector extract/insert patterns and reorder only such
6363 // patterns.
6364 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6365 Type *ScalarTy = GatheredScalars.front()->getType();
6366 size_t NumScalars = GatheredScalars.size();
6367 if (!isValidElementType(ScalarTy))
6368 return std::nullopt;
6369 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6370 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6371 SmallVector<int> ExtractMask;
6372 SmallVector<int> Mask;
 6373   SmallVector<SmallVector<const TreeEntry *>> Entries;
 6374   SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
 6375       tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
 6376   SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
 6377       isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6378 /*ForOrder=*/true);
6379 // No shuffled operands - ignore.
6380 if (GatherShuffles.empty() && ExtractShuffles.empty())
6381 return std::nullopt;
6382 OrdersType CurrentOrder(NumScalars, NumScalars);
6383 if (GatherShuffles.size() == 1 &&
6384 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6385 Entries.front().front()->isSame(TE.Scalars)) {
 6386     // If the fully matched node participates in whole-tree rotation, there is
 6387     // no need to consider the matching order - the whole tree is rotated.
6388 if (TopToBottom)
6389 return std::nullopt;
6390 // No need to keep the order for the same user node.
6391 if (Entries.front().front()->UserTreeIndex.UserTE ==
6392 TE.UserTreeIndex.UserTE)
6393 return std::nullopt;
6394 // No need to keep the order for the matched root node, if it can be freely
6395 // reordered.
6396 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6397 return std::nullopt;
 6398     // If shuffling only 2 elements and the matching node has reversed reuses,
 6399     // there is no need to count the order - both orders work fine.
6400 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6401 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6402 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6403 [](const auto &P) {
6404 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6405 }))
6406 return std::nullopt;
6407
6408 // Perfect match in the graph, will reuse the previously vectorized
6409 // node. Cost is 0.
6410 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6411 return CurrentOrder;
6412 }
6413 auto IsSplatMask = [](ArrayRef<int> Mask) {
6414 int SingleElt = PoisonMaskElem;
6415 return all_of(Mask, [&](int I) {
6416 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6417 SingleElt = I;
6418 return I == PoisonMaskElem || I == SingleElt;
6419 });
6420 };
6421 // Exclusive broadcast mask - ignore.
6422 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6423 (Entries.size() != 1 ||
6424 Entries.front().front()->ReorderIndices.empty())) ||
6425 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6426 return std::nullopt;
6427 SmallBitVector ShuffledSubMasks(NumParts);
6428 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6429 ArrayRef<int> Mask, int PartSz, int NumParts,
6430 function_ref<unsigned(unsigned)> GetVF) {
6431 for (int I : seq<int>(0, NumParts)) {
6432 if (ShuffledSubMasks.test(I))
6433 continue;
6434 const int VF = GetVF(I);
6435 if (VF == 0)
6436 continue;
6437 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6438 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6439 // Shuffle of at least 2 vectors - ignore.
6440 if (any_of(Slice, not_equal_to(NumScalars))) {
6441 llvm::fill(Slice, NumScalars);
6442 ShuffledSubMasks.set(I);
6443 continue;
6444 }
 6445       // Try to include as many elements from the mask as possible.
6446 int FirstMin = INT_MAX;
 6447       bool SecondVecFound = false;
6448 for (int K : seq<int>(Limit)) {
6449 int Idx = Mask[I * PartSz + K];
6450 if (Idx == PoisonMaskElem) {
6451 Value *V = GatheredScalars[I * PartSz + K];
6452 if (isConstant(V) && !isa<PoisonValue>(V)) {
6453 SecondVecFound = true;
6454 break;
6455 }
6456 continue;
6457 }
6458 if (Idx < VF) {
6459 if (FirstMin > Idx)
6460 FirstMin = Idx;
6461 } else {
6462 SecondVecFound = true;
6463 break;
6464 }
6465 }
6466 FirstMin = (FirstMin / PartSz) * PartSz;
6467 // Shuffle of at least 2 vectors - ignore.
6468 if (SecondVecFound) {
6469 llvm::fill(Slice, NumScalars);
6470 ShuffledSubMasks.set(I);
6471 continue;
6472 }
6473 for (int K : seq<int>(Limit)) {
6474 int Idx = Mask[I * PartSz + K];
6475 if (Idx == PoisonMaskElem)
6476 continue;
6477 Idx -= FirstMin;
6478 if (Idx >= PartSz) {
6479 SecondVecFound = true;
6480 break;
6481 }
6482 if (CurrentOrder[I * PartSz + Idx] >
6483 static_cast<unsigned>(I * PartSz + K) &&
6484 CurrentOrder[I * PartSz + Idx] !=
6485 static_cast<unsigned>(I * PartSz + Idx))
6486 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6487 }
6488 // Shuffle of at least 2 vectors - ignore.
6489 if (SecondVecFound) {
6490 llvm::fill(Slice, NumScalars);
6491 ShuffledSubMasks.set(I);
6492 continue;
6493 }
6494 }
6495 };
6496 int PartSz = getPartNumElems(NumScalars, NumParts);
6497 if (!ExtractShuffles.empty())
6498 TransformMaskToOrder(
6499 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6500 if (!ExtractShuffles[I])
6501 return 0U;
6502 unsigned VF = 0;
6503 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6504 for (unsigned Idx : seq<unsigned>(Sz)) {
6505 int K = I * PartSz + Idx;
6506 if (ExtractMask[K] == PoisonMaskElem)
6507 continue;
6508 if (!TE.ReuseShuffleIndices.empty())
6509 K = TE.ReuseShuffleIndices[K];
6510 if (K == PoisonMaskElem)
6511 continue;
6512 if (!TE.ReorderIndices.empty())
6513 K = std::distance(TE.ReorderIndices.begin(),
6514 find(TE.ReorderIndices, K));
6515 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6516 if (!EI)
6517 continue;
6518 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6519 ->getElementCount()
6520 .getKnownMinValue());
6521 }
6522 return VF;
6523 });
6524 // Check special corner case - single shuffle of the same entry.
6525 if (GatherShuffles.size() == 1 && NumParts != 1) {
6526 if (ShuffledSubMasks.any())
6527 return std::nullopt;
6528 PartSz = NumScalars;
6529 NumParts = 1;
6530 }
6531 if (!Entries.empty())
6532 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6533 if (!GatherShuffles[I])
6534 return 0U;
6535 return std::max(Entries[I].front()->getVectorFactor(),
6536 Entries[I].back()->getVectorFactor());
6537 });
6538 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6539 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6540 return std::nullopt;
6541 return std::move(CurrentOrder);
6542}
6543
6544static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6545 const TargetLibraryInfo &TLI,
6546 bool CompareOpcodes = true) {
6549 return false;
6550 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6551 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6552 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6553 (!GEP2 || GEP2->getNumOperands() == 2) &&
6554 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6555 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6556 !CompareOpcodes ||
6557 (GEP1 && GEP2 &&
6558 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6559}
6560
6561/// Calculates minimal alignment as a common alignment.
6562template <typename T>
 6563 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
 6564   Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6565 for (Value *V : VL)
6566 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6567 return CommonAlignment;
6568}
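// E.g., for a bundle of loads with alignments 16, 8 and 4 the common
// alignment computed above is Align(4), i.e. the minimum over the bundle.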
6569
6570/// Check if \p Order represents reverse order.
 6571 static bool isReverseOrder(ArrayRef<unsigned> Order) {
 6572   assert(!Order.empty() &&
6573 "Order is empty. Please check it before using isReverseOrder.");
6574 unsigned Sz = Order.size();
6575 return all_of(enumerate(Order), [&](const auto &Pair) {
6576 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6577 });
6578}
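// E.g., with Sz = 4 the order [3, 2, 1, 0] is reported as reversed, while
// [0, 1, 2, 3] is not; entries equal to Sz act as "don't care" positions.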
6579
6580/// Checks if the provided list of pointers \p Pointers represents the strided
6581/// pointers for type ElemTy. If they are not, nullptr is returned.
6582/// Otherwise, SCEV* of the stride value is returned.
 6583/// If `PointerOps` can be rearranged into the following sequence:
6584/// ```
6585/// %x + c_0 * stride,
6586/// %x + c_1 * stride,
6587/// %x + c_2 * stride
6588/// ...
6589/// ```
 6590/// where each `c_i` is a constant, then `Coeffs` will contain `c_0, c_1, c_2, ...`
6591/// and the SCEV of the `stride` will be returned.
6592static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6593 const DataLayout &DL, ScalarEvolution &SE,
6594 SmallVectorImpl<unsigned> &SortedIndices,
6595 SmallVectorImpl<int64_t> &Coeffs) {
6596 assert(Coeffs.size() == PointerOps.size() &&
6597 "Coeffs vector needs to be of correct size");
 6598   SmallVector<const SCEV *> SCEVs;
 6599   const SCEV *PtrSCEVLowest = nullptr;
6600 const SCEV *PtrSCEVHighest = nullptr;
6601 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6602 // addresses).
6603 for (Value *Ptr : PointerOps) {
6604 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6605 if (!PtrSCEV)
6606 return nullptr;
6607 SCEVs.push_back(PtrSCEV);
6608 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6609 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6610 continue;
6611 }
6612 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6613 if (isa<SCEVCouldNotCompute>(Diff))
6614 return nullptr;
6615 if (Diff->isNonConstantNegative()) {
6616 PtrSCEVLowest = PtrSCEV;
6617 continue;
6618 }
6619 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6620 if (isa<SCEVCouldNotCompute>(Diff1))
6621 return nullptr;
6622 if (Diff1->isNonConstantNegative()) {
6623 PtrSCEVHighest = PtrSCEV;
6624 continue;
6625 }
6626 }
6627 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6628 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6629 if (isa<SCEVCouldNotCompute>(Dist))
6630 return nullptr;
6631 int Size = DL.getTypeStoreSize(ElemTy);
6632 auto TryGetStride = [&](const SCEV *Dist,
6633 const SCEV *Multiplier) -> const SCEV * {
6634 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6635 if (M->getOperand(0) == Multiplier)
6636 return M->getOperand(1);
6637 if (M->getOperand(1) == Multiplier)
6638 return M->getOperand(0);
6639 return nullptr;
6640 }
6641 if (Multiplier == Dist)
6642 return SE.getConstant(Dist->getType(), 1);
6643 return SE.getUDivExactExpr(Dist, Multiplier);
6644 };
 6645   // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6646 const SCEV *Stride = nullptr;
6647 if (Size != 1 || SCEVs.size() > 2) {
6648 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6649 Stride = TryGetStride(Dist, Sz);
6650 if (!Stride)
6651 return nullptr;
6652 }
6653 if (!Stride || isa<SCEVConstant>(Stride))
6654 return nullptr;
6655 // Iterate through all pointers and check if all distances are
 6656   // unique multiples of Stride.
6657 using DistOrdPair = std::pair<int64_t, int>;
6658 auto Compare = llvm::less_first();
6659 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6660 int Cnt = 0;
6661 bool IsConsecutive = true;
6662 for (const auto [Idx, PtrSCEV] : enumerate(SCEVs)) {
6663 unsigned Dist = 0;
6664 if (PtrSCEV != PtrSCEVLowest) {
6665 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6666 const SCEV *Coeff = TryGetStride(Diff, Stride);
6667 if (!Coeff)
6668 return nullptr;
6669 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6670 if (!SC || isa<SCEVCouldNotCompute>(SC))
6671 return nullptr;
6672 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6673 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6674 SE.getMulExpr(Stride, SC)))
6675 ->isZero())
6676 return nullptr;
6677 Dist = SC->getAPInt().getZExtValue();
6678 } else {
6679 Coeffs[Idx] = 0;
6680 }
6681 // If the strides are not the same or repeated, we can't vectorize.
6682 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6683 return nullptr;
6684 auto Res = Offsets.emplace(Dist, Cnt);
6685 if (!Res.second)
6686 return nullptr;
6687 // Consecutive order if the inserted element is the last one.
6688 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6689 ++Cnt;
6690 }
6691 if (Offsets.size() != SCEVs.size())
6692 return nullptr;
6693 SortedIndices.clear();
6694 if (!IsConsecutive) {
6695 // Fill SortedIndices array only if it is non-consecutive.
6696 SortedIndices.resize(PointerOps.size());
6697 Cnt = 0;
6698 for (const std::pair<int64_t, int> &Pair : Offsets) {
6699 SortedIndices[Cnt] = Pair.second;
6700 ++Cnt;
6701 }
6702 }
6703 return Stride;
6704}
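// A rough illustration of calculateRtStride, assuming the pointers already
// appear in increasing order (so SortedIndices stays empty):
//   PointerOps = { p, p + s, p + 2*s, p + 3*s } with runtime byte stride s,
//   ElemTy = i8 (Size = 1):
//     Dist = 3*s, Multiplier = Size * (num_elems - 1) = 3  =>  Stride = s
//     Coeffs = [0, 1, 2, 3], Offsets = {0, 1, 2, 3} (consecutive)
//   and the SCEV of s is returned. If all distances were compile-time
//   constants, Stride would be a SCEVConstant and nullptr is returned instead.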
6705
6706static std::pair<InstructionCost, InstructionCost>
 6707 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 6708             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6709 Type *ScalarTy, VectorType *VecTy);
6710
6711/// Returns the cost of the shuffle instructions with the given \p Kind, vector
 6712/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6713/// subvector pattern.
 6714 static InstructionCost
 6715 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
 6716                VectorType *Tp, ArrayRef<int> Mask = {},
 6717                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
 6718                int Index = 0, VectorType *SubTp = nullptr,
 6719                ArrayRef<Value *> Args = {}) {
 6720   VectorType *DstTy = Tp;
6721 if (!Mask.empty())
6722 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6723
6724 if (Kind != TTI::SK_PermuteTwoSrc)
6725 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6726 Args);
6727 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6728 int NumSubElts;
6730 Mask, NumSrcElts, NumSubElts, Index)) {
6731 if (Index + NumSubElts > NumSrcElts &&
6732 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6733 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6734 TTI::TCK_RecipThroughput, Index, Tp);
6735 }
6736 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6737 Args);
6738}
6739
6740/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6741/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6742/// instead of a scalar.
6743static InstructionCost
 6744 getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
 6745                          VectorType *Ty, const APInt &DemandedElts, bool Insert,
6746 bool Extract, TTI::TargetCostKind CostKind,
6747 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6749 "ScalableVectorType is not supported.");
6750 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6751 getNumElements(Ty) &&
6752 "Incorrect usage.");
6753 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6754 assert(SLPReVec && "Only supported by REVEC.");
6755 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6756 // of CreateInsertElement.
6757 unsigned ScalarTyNumElements = VecTy->getNumElements();
6758 InstructionCost Cost = 0;
6759 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6760 if (!DemandedElts[I])
6761 continue;
 6762       if (Insert)
 6763         Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
 6764                                  I * ScalarTyNumElements, VecTy);
 6765       if (Extract)
 6766         Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
 6767                                  I * ScalarTyNumElements, VecTy);
6768 }
6769 return Cost;
6770 }
6771 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6772 CostKind, ForPoisonSrc, VL);
6773}
6774
6775/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6776/// is a FixedVectorType, a vector will be extracted instead of a scalar.
 6777 static InstructionCost getVectorInstrCost(
 6778     const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6779 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6780 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6781 if (Opcode == Instruction::ExtractElement) {
6782 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6783 assert(SLPReVec && "Only supported by REVEC.");
6784 assert(isa<VectorType>(Val) && "Val must be a vector type.");
 6785       return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
 6786                               cast<VectorType>(Val), {}, CostKind,
 6787                               Index * VecTy->getNumElements(), VecTy);
6788 }
6789 }
6790 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6791 ScalarUserAndIdx);
6792}
6793
6794/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6795/// is a FixedVectorType, a vector will be extracted instead of a scalar.
 6796 static InstructionCost getExtractWithExtendCost(
 6797     const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6798 VectorType *VecTy, unsigned Index,
 6799     TTI::TargetCostKind CostKind) {
 6800   if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6801 assert(SLPReVec && "Only supported by REVEC.");
6802 auto *SubTp =
6803 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
 6804     return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
 6805                             Index * ScalarTy->getNumElements(), SubTp) +
6806 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6807 CostKind);
6808 }
6809 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6810}
6811
6812/// Creates subvector insert. Generates shuffle using \p Generator or
6813/// using default shuffle.
 6814 static Value *createInsertVector(
 6815     IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6816 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6817 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6818 return Vec;
6819 const unsigned SubVecVF = getNumElements(V->getType());
6820 // Create shuffle, insertvector requires that index is multiple of
6821 // the subvector length.
6822 const unsigned VecVF = getNumElements(Vec->getType());
 6823   SmallVector<int> Mask(VecVF, PoisonMaskElem);
 6824   if (isa<PoisonValue>(Vec)) {
6825 auto *Begin = std::next(Mask.begin(), Index);
6826 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6827 Vec = Builder.CreateShuffleVector(V, Mask);
6828 return Vec;
6829 }
6830 std::iota(Mask.begin(), Mask.end(), 0);
6831 std::iota(std::next(Mask.begin(), Index),
6832 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6833 if (Generator)
6834 return Generator(Vec, V, Mask);
6835 // 1. Resize V to the size of Vec.
6836 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6837 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6838 V = Builder.CreateShuffleVector(V, ResizeMask);
6839 // 2. Insert V into Vec.
6840 return Builder.CreateShuffleVector(Vec, V, Mask);
6841}
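// An illustrative trace of the subvector-insert helper above, with no custom
// Generator: Vec = <8 x i32> (non-poison), V = <4 x i32>, Index = 4:
//   Mask       = [0, 1, 2, 3, 8, 9, 10, 11]
//   ResizeMask = [0, 1, 2, 3, poison, poison, poison, poison]
// V is first widened to 8 lanes with ResizeMask, then the final two-source
// shuffle keeps lanes 0-3 of Vec and places lanes 0-3 of V into lanes 4-7.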
6842
6843/// Generates subvector extract using \p Generator or using default shuffle.
 6844 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
 6845                                   unsigned SubVecVF, unsigned Index) {
6846 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6847 std::iota(Mask.begin(), Mask.end(), Index);
6848 return Builder.CreateShuffleVector(Vec, Mask);
6849}
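// E.g., extracting a 2-element subvector starting at lane 4 builds the mask
// [4, 5] and shuffles those lanes out of Vec.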
6850
6851/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6852/// with \p Order.
6853/// \return true if the mask represents strided access, false - otherwise.
 6854 static bool buildCompressMask(ArrayRef<Value *> PointerOps,
 6855                               ArrayRef<unsigned> Order, Type *ScalarTy,
6856 const DataLayout &DL, ScalarEvolution &SE,
6857 SmallVectorImpl<int> &CompressMask) {
6858 const unsigned Sz = PointerOps.size();
6859 CompressMask.assign(Sz, PoisonMaskElem);
 6860   // The first element is always set.
6861 CompressMask[0] = 0;
6862 // Check if the mask represents strided access.
6863 std::optional<unsigned> Stride = 0;
6864 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6865 for (unsigned I : seq<unsigned>(1, Sz)) {
6866 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6867 std::optional<int64_t> OptPos =
6868 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6869 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6870 return false;
6871 unsigned Pos = static_cast<unsigned>(*OptPos);
6872 CompressMask[I] = Pos;
6873 if (!Stride)
6874 continue;
6875 if (*Stride == 0) {
6876 *Stride = Pos;
6877 continue;
6878 }
6879 if (Pos != *Stride * I)
6880 Stride.reset();
6881 }
6882 return Stride.has_value();
6883}
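// Two illustrative cases for buildCompressMask, with element offsets computed
// by getPointersDiff from the first (ordered) pointer:
//   offsets {0, 2, 4, 6} -> CompressMask = [0, 2, 4, 6], stride 2, returns true
//   offsets {0, 1, 3, 6} -> CompressMask = [0, 1, 3, 6], no common stride,
//                           returns false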
6884
6885/// Checks if the \p VL can be transformed to a (masked)load + compress or
6886/// (masked) interleaved load.
 6887 static bool isMaskedLoadCompress(
 6888     ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 6889     ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
 6890     const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
 6891     const DominatorTree &DT, const TargetLibraryInfo &TLI,
6892 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6893 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6894 VectorType *&LoadVecTy) {
6895 InterleaveFactor = 0;
6896 Type *ScalarTy = VL.front()->getType();
6897 const size_t Sz = VL.size();
6898 auto *VecTy = getWidenedType(ScalarTy, Sz);
 6899   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 6900   SmallVector<int> Mask;
6901 if (!Order.empty())
6902 inversePermutation(Order, Mask);
6903 // Check external uses.
6904 for (const auto [I, V] : enumerate(VL)) {
6905 if (AreAllUsersVectorized(V))
6906 continue;
6907 InstructionCost ExtractCost =
6908 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6909 Mask.empty() ? I : Mask[I]);
6910 InstructionCost ScalarCost =
6911 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6912 if (ExtractCost <= ScalarCost)
6913 return false;
6914 }
6915 Value *Ptr0;
6916 Value *PtrN;
6917 if (Order.empty()) {
6918 Ptr0 = PointerOps.front();
6919 PtrN = PointerOps.back();
6920 } else {
6921 Ptr0 = PointerOps[Order.front()];
6922 PtrN = PointerOps[Order.back()];
6923 }
6924 std::optional<int64_t> Diff =
6925 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6926 if (!Diff)
6927 return false;
6928 const size_t MaxRegSize =
 6929       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
 6930           .getFixedValue();
6931 // Check for very large distances between elements.
6932 if (*Diff / Sz >= MaxRegSize / 8)
6933 return false;
6934 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6935 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6936 Align CommonAlignment = LI->getAlign();
6937 IsMasked = !isSafeToLoadUnconditionally(
6938 Ptr0, LoadVecTy, CommonAlignment, DL,
6939 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6940 &TLI);
6941 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6942 LI->getPointerAddressSpace()))
6943 return false;
6944 // TODO: perform the analysis of each scalar load for better
6945 // safe-load-unconditionally analysis.
6946 bool IsStrided =
6947 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6948 assert(CompressMask.size() >= 2 && "At least two elements are required");
6949 SmallVector<Value *> OrderedPointerOps(PointerOps);
6950 if (!Order.empty())
6951 reorderScalars(OrderedPointerOps, Mask);
6952 auto [ScalarGEPCost, VectorGEPCost] =
6953 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6954 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6955 // The cost of scalar loads.
6956 InstructionCost ScalarLoadsCost =
6957 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6958 [&](InstructionCost C, Value *V) {
6959 return C + TTI.getInstructionCost(cast<Instruction>(V),
6960 CostKind);
6961 }) +
6962 ScalarGEPCost;
6963 APInt DemandedElts = APInt::getAllOnes(Sz);
6964 InstructionCost GatherCost =
6965 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6966 /*Insert=*/true,
6967 /*Extract=*/false, CostKind) +
6968 ScalarLoadsCost;
6969 InstructionCost LoadCost = 0;
6970 if (IsMasked) {
6971 LoadCost = TTI.getMemIntrinsicInstrCost(
6972 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
6973 CommonAlignment,
6974 LI->getPointerAddressSpace()),
6975 CostKind);
6976 } else {
6977 LoadCost =
6978 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6979 LI->getPointerAddressSpace(), CostKind);
6980 }
6981 if (IsStrided && !IsMasked && Order.empty()) {
6982 // Check for potential segmented(interleaved) loads.
6983 VectorType *AlignedLoadVecTy = getWidenedType(
6984 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6985 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6986 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6987 &TLI))
6988 AlignedLoadVecTy = LoadVecTy;
6989 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6990 CommonAlignment,
6991 LI->getPointerAddressSpace())) {
6992 InstructionCost InterleavedCost =
6993 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6994 Instruction::Load, AlignedLoadVecTy,
6995 CompressMask[1], {}, CommonAlignment,
6996 LI->getPointerAddressSpace(), CostKind, IsMasked);
6997 if (InterleavedCost < GatherCost) {
6998 InterleaveFactor = CompressMask[1];
6999 LoadVecTy = AlignedLoadVecTy;
7000 return true;
7001 }
7002 }
7003 }
7004 InstructionCost CompressCost = ::getShuffleCost(
7005 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
7006 if (!Order.empty()) {
7007 SmallVector<int> NewMask(Sz, PoisonMaskElem);
7008 for (unsigned I : seq<unsigned>(Sz)) {
7009 NewMask[I] = CompressMask[Mask[I]];
7010 }
7011 CompressMask.swap(NewMask);
7012 }
7013 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7014 return TotalVecCost < GatherCost;
7015}
7016
7017/// Checks if the \p VL can be transformed to a (masked)load + compress or
7018/// (masked) interleaved load.
7019static bool
7022 const DataLayout &DL, ScalarEvolution &SE,
7023 AssumptionCache &AC, const DominatorTree &DT,
7024 const TargetLibraryInfo &TLI,
7025 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7026 bool IsMasked;
7027 unsigned InterleaveFactor;
7028 SmallVector<int> CompressMask;
7029 VectorType *LoadVecTy;
7030 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7031 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7032 CompressMask, LoadVecTy);
7033}
7034
7035/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7036/// PointerOps:
7037/// 1. Target with strided load support is detected.
7038/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7039/// potential stride <= MaxProfitableLoadStride and the potential stride is
7040/// power-of-2 (to avoid perf regressions for the very small number of loads)
7041/// and max distance > number of loads, or potential stride is -1.
7042/// 3. The loads are ordered, or number of unordered loads <=
7043/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7044/// to avoid extra costs for very expensive shuffles).
7045/// 4. Any pointer operand is an instruction with the users outside of the
7046/// current graph (for masked gathers extra extractelement instructions
7047/// might be required).
 7048 bool BoUpSLP::isStridedLoad(const ArrayRef<Value *> PointerOps, Type *ScalarTy,
 7049                             Align Alignment, const int64_t Diff,
7050 const size_t Sz) const {
7051 if (Diff % (Sz - 1) != 0)
7052 return false;
7053
7054 // Try to generate strided load node.
7055 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
7056 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
7057 return !isVectorized(U) && !MustGather.contains(U);
7058 });
7059 });
7060
7061 const uint64_t AbsoluteDiff = std::abs(Diff);
7062 auto *VecTy = getWidenedType(ScalarTy, Sz);
7063 if (IsAnyPointerUsedOutGraph ||
7064 (AbsoluteDiff > Sz &&
 7065        (Sz > MinProfitableStridedLoads ||
 7066         (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7067 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7068 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7069 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7070 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7071 return false;
7072 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7073 return false;
7074 return true;
7075 }
7076 return false;
7077}
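// A hedged numeric sketch of the checks above: 4 i32 loads at element offsets
// {0, 3, 6, 9} give Diff = 9, which is divisible by Sz - 1 = 3, so the
// candidate stride is 3 elements. The bundle is accepted as a strided load
// only if one of the profitability conditions listed in the function comment
// also holds and TTI->isLegalStridedLoadStore succeeds for <4 x i32>.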
7078
 7079 bool BoUpSLP::analyzeConstantStrideCandidate(
 7080     const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7081 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7082 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7083 const size_t Sz = PointerOps.size();
7084 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7085 // Go through `PointerOps` in sorted order and record offsets from
7086 // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
7087 // sortPtrAccesses only validates getPointersDiff for pairs relative to
7088 // PointerOps[0]. This is safe since only offset differences are used below.
7089 for (unsigned I : seq<unsigned>(Sz)) {
7090 Value *Ptr =
7091 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7092 std::optional<int64_t> Offset =
7093 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr, *DL, *SE);
7094 assert(Offset && "sortPtrAccesses should have validated this pointer");
7095 SortedOffsetsFromBase[I] = *Offset;
7096 }
7097
7098 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7099 // ```
7100 // [
7101 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
 7102   //  (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7103 // ...
7104 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7105 // GroupSize - 1}), // last group
7106 // ]
7107 // ```
 7108   // The distances between consecutive elements within each group should all
 7109   // equal `StrideWithinGroup`. The distances between the first elements of
 7110   // consecutive groups should all equal `StrideBetweenGroups`.
7111
7112 int64_t StrideWithinGroup =
7113 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7114 // Determine size of the first group. Later we will check that all other
7115 // groups have the same size.
7116 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7117 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7118 StrideWithinGroup;
7119 };
7120 auto Indices = seq<unsigned>(1, Sz);
7121 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7122 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7123
7124 unsigned VecSz = Sz;
7125 Type *NewScalarTy = ScalarTy;
7126
7127 // Quick detour: at this point we can say what the type of strided load would
7128 // be if all the checks pass. Check if this type is legal for the target.
7129 bool NeedsWidening = Sz != GroupSize;
7130 if (NeedsWidening) {
7131 if (Sz % GroupSize != 0)
7132 return false;
7133
7134 if (StrideWithinGroup != 1)
7135 return false;
7136 VecSz = Sz / GroupSize;
7137 NewScalarTy = Type::getIntNTy(
7138 SE->getContext(),
7139 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7140 }
7141
7142 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7143 return false;
7144
7145 int64_t StrideIntVal = StrideWithinGroup;
7146 if (NeedsWidening) {
7147 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7148 // Check that the strides between groups are all the same.
7149 unsigned CurrentGroupStartIdx = GroupSize;
7150 int64_t StrideBetweenGroups =
7151 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7152 StrideIntVal = StrideBetweenGroups;
7153 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7154 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7155 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7156 StrideBetweenGroups)
7157 return false;
7158 }
7159
7160 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7161 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7162 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7163 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7164 return GroupEndIdx - StartIdx == GroupSize;
7165 };
7166 for (unsigned I = 0; I < Sz; I += GroupSize) {
7167 if (!CheckGroup(I))
7168 return false;
7169 }
7170 }
7171
7172 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7173 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
7174 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7175 return true;
7176}
7177
 7178 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
 7179                                        Type *ScalarTy, Align CommonAlignment,
7180 SmallVectorImpl<unsigned> &SortedIndices,
7181 StridedPtrInfo &SPtrInfo) const {
7182 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
 7183   // is constant, we partition the `PointerOps` sequence into subsequences of
 7184   // pointers with the same offset. For each offset we record values from
 7185   // `PointerOps` and their indices in `PointerOps`.
 7186   SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
 7187       OffsetToPointerOpIdxMap;
7188 for (auto [Idx, Ptr] : enumerate(PointerOps)) {
7189 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7190 if (!PtrSCEV)
7191 return false;
7192
7193 const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
7194 int64_t Offset = 0;
7195 if (Add) {
7196 // `Offset` is non-zero.
7197 for (int I : seq<int>(Add->getNumOperands())) {
7198 const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
7199 if (!SC)
7200 continue;
7201 Offset = SC->getAPInt().getSExtValue();
7202 break;
7203 }
7204 }
7205 OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
7206 OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
7207 }
7208 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7209
7210 // Quick detour: at this point we can say what the type of strided load would
7211 // be if all the checks pass. Check if this type is legal for the target.
7212 const unsigned Sz = PointerOps.size();
7213 unsigned VecSz = Sz;
7214 Type *NewScalarTy = ScalarTy;
7215 if (NumOffsets > 1) {
7216 if (Sz % NumOffsets != 0)
7217 return false;
7218 VecSz = Sz / NumOffsets;
7219 NewScalarTy = Type::getIntNTy(
7220 SE->getContext(),
7221 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7222 }
7223 FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
7224 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7225 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7226 return false;
7227
7228 // Check if the offsets are contiguous and that each group has the required
7229 // size.
7230 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7231 for (auto [Idx, MapPair] : enumerate(OffsetToPointerOpIdxMap)) {
7232 if (MapPair.second.first.size() != VecSz)
7233 return false;
7234 SortedOffsetsV[Idx] = MapPair.first;
7235 }
7236 sort(SortedOffsetsV);
7237
7238 if (NumOffsets > 1) {
7239 for (int I : seq<int>(1, SortedOffsetsV.size())) {
7240 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
7241 return false;
7242 }
7243 }
7244
7245 // Introduce some notation for the explanations below. Let `PointerOps_j`
7246 // denote the subsequence of `PointerOps` with offsets equal to
 7247   // `SortedOffsetsV[j]`. Let `SortedIndices_j` be such that the sequence
7248 // ```
7249 // PointerOps_j[SortedIndices_j[0]],
7250 // PointerOps_j[SortedIndices_j[1]],
7251 // PointerOps_j[SortedIndices_j[2]],
7252 // ...
7253 // ```
7254 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7255 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7256 // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7257 // The entire sorted `PointerOps` looks like this:
7258 // ```
7259 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7260 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7261 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7262 // ...
7263 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7264 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7265 //
7266 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7267 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7268 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7269 // ...
7270 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7271 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7272 //
7273 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7274 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7275 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7276 // ...
7277 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7278 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7279 // ...
7280 // ...
7281 // ...
7282 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7283 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7284 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7285 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7286 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7287 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7288 // ...
7289 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7290 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7291 // ```
7292 // In order to be able to generate a strided load, we need the following
7293 // checks to pass:
7294 //
7295 // (1) for each `PointerOps_j` check that the distance
7296 // between adjacent pointers are all equal to the same value (stride).
7297 // (2) for each `PointerOps_j` check that coefficients calculated by
7298 // `calculateRtStride` are all the same.
7299 //
7300 // As we do that, also calculate SortedIndices. Since we should not modify
7301 // `SortedIndices` unless we know that all the checks succeed, record the
 7302   // indices into `SortedIndicesDraft`.
7303 SmallVector<unsigned> SortedIndicesDraft(Sz);
7304
7305 // Given sorted indices for a particular offset (as calculated by
7306 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7307 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7308 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7309 // \param `IndicesInAllPointerOps` vector of indices of the
7310 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7311 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7312 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7313 auto UpdateSortedIndices =
7314 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7315 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
7316 if (SortedIndicesForOffset.empty()) {
7317 SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
7318 std::iota(SortedIndicesForOffset.begin(),
7319 SortedIndicesForOffset.end(), 0);
7320 }
7321 for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
7322 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7323 IndicesInAllPointerOps[Idx];
7324 }
7325 };
7326
7327 int64_t LowestOffset = SortedOffsetsV[0];
7328 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7329
7330 SmallVector<int64_t> Coeffs0(VecSz);
7331 SmallVector<unsigned> SortedIndicesForOffset0;
7332 const SCEV *Stride0 = calculateRtStride(PointerOps0, ScalarTy, *DL, *SE,
7333 SortedIndicesForOffset0, Coeffs0);
7334 if (!Stride0)
7335 return false;
7336 unsigned NumCoeffs0 = Coeffs0.size();
7337 if (NumCoeffs0 * NumOffsets != Sz)
7338 return false;
7339 sort(Coeffs0);
7340
7341 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7342 OffsetToPointerOpIdxMap[LowestOffset].second;
7343 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7344
 7345   // Now that we know what the common stride and coefficients have to be,
 7346   // check the remaining `PointerOps_j`.
7347 SmallVector<int64_t> Coeffs;
7348 SmallVector<unsigned> SortedIndicesForOffset;
7349 for (int J : seq<int>(1, NumOffsets)) {
7350 Coeffs.clear();
7351 Coeffs.resize(VecSz);
7352 SortedIndicesForOffset.clear();
7353
7354 int64_t Offset = SortedOffsetsV[J];
7355 ArrayRef<Value *> PointerOpsForOffset =
7356 OffsetToPointerOpIdxMap[Offset].first;
7357 ArrayRef<unsigned> IndicesInAllPointerOps =
7358 OffsetToPointerOpIdxMap[Offset].second;
7359 const SCEV *StrideWithinGroup =
7360 calculateRtStride(PointerOpsForOffset, ScalarTy, *DL, *SE,
7361 SortedIndicesForOffset, Coeffs);
7362
7363 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7364 return false;
7365 if (Coeffs.size() != NumCoeffs0)
7366 return false;
7367 sort(Coeffs);
7368 if (Coeffs != Coeffs0)
7369 return false;
7370
7371 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7372 }
7373
7374 SortedIndices.clear();
7375 SortedIndices = SortedIndicesDraft;
7376 SPtrInfo.StrideSCEV = Stride0;
7377 SPtrInfo.Ty = StridedLoadTy;
7378 return true;
7379}
7380
 7381 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
 7382     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7383 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7384 unsigned *BestVF, bool TryRecursiveCheck) const {
7385 // Check that a vectorized load would load the same memory as a scalar
7386 // load. For example, we don't want to vectorize loads that are smaller
7387 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7388 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7389 // from such a struct, we read/write packed bits disagreeing with the
7390 // unvectorized version.
7391 if (BestVF)
7392 *BestVF = 0;
7394 return LoadsState::Gather;
7395 Type *ScalarTy = VL0->getType();
7396
7397 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7398 return LoadsState::Gather;
7399
7400 // Make sure all loads in the bundle are simple - we can't vectorize
7401 // atomic or volatile loads.
7402 PointerOps.clear();
7403 const size_t Sz = VL.size();
7404 PointerOps.resize(Sz);
7405 auto *POIter = PointerOps.begin();
7406 for (Value *V : VL) {
7407 auto *L = dyn_cast<LoadInst>(V);
7408 if (!L || !L->isSimple())
7409 return LoadsState::Gather;
7410 *POIter = L->getPointerOperand();
7411 ++POIter;
7412 }
7413
7414 Order.clear();
7415 // Check the order of pointer operands or that all pointers are the same.
7416 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7417
7418 auto *VecTy = getWidenedType(ScalarTy, Sz);
7419 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7420 if (!IsSorted) {
7421 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7422 SPtrInfo))
 7423       return LoadsState::StridedVectorize;
 7424
7425 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7426 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7427 return LoadsState::Gather;
7428
7429 if (!all_of(PointerOps, [&](Value *P) {
7430 return arePointersCompatible(P, PointerOps.front(), *TLI);
7431 }))
7432 return LoadsState::Gather;
7433
7434 } else {
7435 Value *Ptr0;
7436 Value *PtrN;
7437 if (Order.empty()) {
7438 Ptr0 = PointerOps.front();
7439 PtrN = PointerOps.back();
7440 } else {
7441 Ptr0 = PointerOps[Order.front()];
7442 PtrN = PointerOps[Order.back()];
7443 }
7444 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7445 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7446 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
7447 std::optional<int64_t> Diff0 =
7448 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr0, *DL, *SE);
7449 std::optional<int64_t> DiffN =
7450 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, PtrN, *DL, *SE);
7451 assert(Diff0 && DiffN &&
7452 "sortPtrAccesses should have validated these pointers");
7453 int64_t Diff = *DiffN - *Diff0;
7454 // Check that the sorted loads are consecutive.
7455 if (static_cast<uint64_t>(Diff) == Sz - 1)
7456 return LoadsState::Vectorize;
7457 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7458 *TLI, [&](Value *V) {
7459 return areAllUsersVectorized(
7460 cast<Instruction>(V), UserIgnoreList);
7461 }))
 7462       return LoadsState::CompressVectorize;
 7463     Align Alignment =
7464 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7465 ->getAlign();
7466 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7467 Diff, Ptr0, PtrN, SPtrInfo))
 7468       return LoadsState::StridedVectorize;
 7469   }
7470 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7471 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7472 return LoadsState::Gather;
 7473   // Correctly compare the cost of loads + shuffles against strided/masked
 7474   // gather loads. Returns true if the vectorized-loads + shuffles
 7475   // representation is better than just gather.
7476 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7477 unsigned *BestVF,
7478 bool ProfitableGatherPointers) {
7479 if (BestVF)
7480 *BestVF = 0;
7481 // Compare masked gather cost and loads + insert subvector costs.
7483 auto [ScalarGEPCost, VectorGEPCost] =
7484 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7485 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7486 // Estimate the cost of masked gather GEP. If not a splat, roughly
7487 // estimate as a buildvector, otherwise estimate as splat.
7488 APInt DemandedElts = APInt::getAllOnes(Sz);
7489 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7490 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7491 if (static_cast<unsigned>(count_if(
7492 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7493 any_of(PointerOps, [&](Value *V) {
7494 return getUnderlyingObject(V) !=
7495 getUnderlyingObject(PointerOps.front());
7496 }))
7497 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7498 DemandedElts, /*Insert=*/true,
7499 /*Extract=*/false, CostKind);
7500 else
7501 VectorGEPCost +=
7503 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7504 /*Insert=*/true, /*Extract=*/false, CostKind) +
7505 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7506 // The cost of scalar loads.
7507 InstructionCost ScalarLoadsCost =
7508 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7509 [&](InstructionCost C, Value *V) {
7510 return C + TTI.getInstructionCost(
7512 }) +
7513 ScalarGEPCost;
7514 // The cost of masked gather.
7515 InstructionCost MaskedGatherCost =
7516 TTI.getMemIntrinsicInstrCost(
7517 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7519 /*VariableMask=*/false, CommonAlignment),
7520 CostKind) +
7521 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7522 InstructionCost GatherCost =
7523 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7524 /*Insert=*/true,
7525 /*Extract=*/false, CostKind) +
7526 ScalarLoadsCost;
 7527 // The list of loads is small or we already performed the partial check -
 7528 // directly compare masked gather cost and gather cost.
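 // For example, with SLPCostThreshold = 0 the check below reduces to
 // MaskedGatherCost >= GatherCost, i.e. gathering is preferred whenever the
 // masked gather is not strictly cheaper.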
7529 constexpr unsigned ListLimit = 4;
7530 if (!TryRecursiveCheck || VL.size() < ListLimit)
7531 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7532
7533 // FIXME: The following code has not been updated for non-power-of-2
7534 // vectors (and not whole registers). The splitting logic here does not
7535 // cover the original vector if the vector factor is not a power of two.
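 // For example, with VL.size() = 6 and VF = 4 the slicing loop below
 // (Cnt + VF <= End) would leave the last two loads uncovered.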
7536 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7537 return false;
7538
7539 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7540 unsigned MinVF = getMinVF(2 * Sz);
7541 DemandedElts.clearAllBits();
7542 // Iterate through possible vectorization factors and check if vectorized +
7543 // shuffles is better than just gather.
7544 for (unsigned VF =
7545 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7546 VF >= MinVF;
7547 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7549 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7550 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7552 SmallVector<Value *> PointerOps;
7553 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7554 PointerOps, SPtrInfo, BestVF,
7555 /*TryRecursiveCheck=*/false);
7556 // Check that the sorted loads are consecutive.
7557 if (LS == LoadsState::Gather) {
7558 if (BestVF) {
7559 DemandedElts.setAllBits();
7560 break;
7561 }
7562 DemandedElts.setBits(Cnt, Cnt + VF);
7563 continue;
7564 }
 7565 // If the reorder is needed - consider it as a high-cost masked gather for now.
7566 if ((LS == LoadsState::Vectorize ||
7569 !Order.empty() && !isReverseOrder(Order))
7571 States.push_back(LS);
7572 }
7573 if (DemandedElts.isAllOnes())
7574 // All loads gathered - try smaller VF.
7575 continue;
 7576 // Can be vectorized later as a series of loads/insertelements.
7577 InstructionCost VecLdCost = 0;
7578 if (!DemandedElts.isZero()) {
7579 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7580 /*Insert=*/true,
7581 /*Extract=*/false, CostKind) +
7582 ScalarGEPCost;
7583 for (unsigned Idx : seq<unsigned>(VL.size()))
7584 if (DemandedElts[Idx])
7585 VecLdCost +=
7586 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7587 }
7588 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7589 for (auto [I, LS] : enumerate(States)) {
7590 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7591 InstructionCost VectorGEPCost =
7592 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7593 ? 0
7594 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7595 LI0->getPointerOperand(),
7596 Instruction::GetElementPtr, CostKind, ScalarTy,
7597 SubVecTy)
7598 .second;
7599 if (LS == LoadsState::ScatterVectorize) {
7600 if (static_cast<unsigned>(
7601 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7602 PointerOps.size() - 1 ||
7603 any_of(PointerOps, [&](Value *V) {
7604 return getUnderlyingObject(V) !=
7605 getUnderlyingObject(PointerOps.front());
7606 }))
7607 VectorGEPCost += getScalarizationOverhead(
7608 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7609 /*Insert=*/true, /*Extract=*/false, CostKind);
7610 else
7611 VectorGEPCost +=
7613 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7614 /*Insert=*/true, /*Extract=*/false, CostKind) +
7615 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7616 CostKind);
7617 }
7618 switch (LS) {
7620 VecLdCost +=
7621 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7622 LI0->getPointerAddressSpace(), CostKind,
7624 VectorGEPCost;
7625 break;
7627 VecLdCost += TTI.getMemIntrinsicInstrCost(
7629 Intrinsic::experimental_vp_strided_load,
7630 SubVecTy, LI0->getPointerOperand(),
7631 /*VariableMask=*/false, CommonAlignment),
7632 CostKind) +
7633 VectorGEPCost;
7634 break;
7636 VecLdCost += TTI.getMemIntrinsicInstrCost(
7638 Intrinsic::masked_load, SubVecTy,
7639 CommonAlignment, LI0->getPointerAddressSpace()),
7640 CostKind) +
7642 {}, CostKind);
7643 break;
7645 VecLdCost += TTI.getMemIntrinsicInstrCost(
7647 Intrinsic::masked_gather, SubVecTy,
7648 LI0->getPointerOperand(),
7649 /*VariableMask=*/false, CommonAlignment),
7650 CostKind) +
7651 VectorGEPCost;
7652 break;
7653 case LoadsState::Gather:
7654 // Gathers are already calculated - ignore.
7655 continue;
7656 }
7657 SmallVector<int> ShuffleMask(VL.size());
7658 for (int Idx : seq<int>(0, VL.size()))
7659 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
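 // For example, with VL.size() = 8, VF = 4 and I = 1 this builds the mask
 // <0, 1, 2, 3, 8, 9, 10, 11>, i.e. the subvector is inserted at offset
 // I * VF = 4.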
7660 if (I > 0)
7661 VecLdCost +=
7662 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7663 CostKind, I * VF, SubVecTy);
7664 }
7665 // If masked gather cost is higher - better to vectorize, so
7666 // consider it as a gather node. It will be better estimated
7667 // later.
7668 if (MaskedGatherCost >= VecLdCost &&
7669 VecLdCost - GatherCost < -SLPCostThreshold) {
7670 if (BestVF)
7671 *BestVF = VF;
7672 return true;
7673 }
7674 }
7675 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7676 };
7677 // TODO: need to improve analysis of the pointers, if not all of them are
7678 // GEPs or have > 2 operands, we end up with a gather node, which just
7679 // increases the cost.
7680 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7681 bool ProfitableGatherPointers =
7682 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7683 return L->isLoopInvariant(V);
7684 })) <= Sz / 2;
7685 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7687 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7688 (GEP && GEP->getNumOperands() == 2 &&
7689 isa<Constant, Instruction>(GEP->getOperand(1)));
7690 })) {
7691 // Check if potential masked gather can be represented as series
7692 // of loads + insertsubvectors.
7693 // If masked gather cost is higher - better to vectorize, so
7694 // consider it as a gather node. It will be better estimated
7695 // later.
7696 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
 7697 ProfitableGatherPointers))
 7698 return LoadsState::ScatterVectorize;
7699 }
7700
7701 return LoadsState::Gather;
7702}
7703
7705 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7706 const DataLayout &DL, ScalarEvolution &SE,
7707 SmallVectorImpl<unsigned> &SortedIndices) {
7708 assert(
7709 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7710 "Expected list of pointer operands.");
 7711 // Map from bases to a vector of (Ptr, Offset, OrigIdx), into which we
 7712 // insert each Ptr, sort, and return the sorted indices with values next to
 7713 // one another.
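 // For example, pointers p, p+2 and p+1 that share the base p are recorded
 // as (p, 0, 0), (p+2, 2, 1) and (p+1, 1, 2); the offsets are later used to
 // sort the cluster into consecutive order.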
7715 std::pair<BasicBlock *, Value *>,
7717 Bases;
7718 Bases
7719 .try_emplace(std::make_pair(
7721 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7722
7723 SortedIndices.clear();
7724 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7725 auto Key = std::make_pair(BBs[Cnt + 1],
7727 bool Found = any_of(Bases.try_emplace(Key).first->second,
7728 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7729 std::optional<int64_t> Diff =
7730 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7731 ElemTy, Ptr, DL, SE,
7732 /*StrictCheck=*/true);
7733 if (!Diff)
7734 return false;
7735
7736 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7737 return true;
7738 });
7739
7740 if (!Found) {
7741 // If we haven't found enough to usefully cluster, return early.
7742 if (Bases.size() > VL.size() / 2 - 1)
7743 return false;
7744
7745 // Not found already - add a new Base
7746 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7747 }
7748 }
7749
7750 if (Bases.size() == VL.size())
7751 return false;
7752
7753 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7754 Bases.front().second.size() == VL.size()))
7755 return false;
7756
 7757 // For each of the bases, sort the pointers by Offset and check if any of
 7758 // the bases become consecutively allocated.
7759 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7760 SmallPtrSet<Value *, 13> FirstPointers;
7761 SmallPtrSet<Value *, 13> SecondPointers;
7762 Value *P1 = Ptr1;
7763 Value *P2 = Ptr2;
7764 unsigned Depth = 0;
7765 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7766 if (P1 == P2 || Depth > RecursionMaxDepth)
7767 return false;
7768 FirstPointers.insert(P1);
7769 SecondPointers.insert(P2);
7770 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7771 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7772 ++Depth;
7773 }
7774 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7775 "Unable to find matching root.");
7776 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7777 };
7778 for (auto &Base : Bases) {
7779 for (auto &Vec : Base.second) {
7780 if (Vec.size() > 1) {
7782 int64_t InitialOffset = std::get<1>(Vec[0]);
7783 bool AnyConsecutive =
7784 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7785 return std::get<1>(P.value()) ==
7786 int64_t(P.index()) + InitialOffset;
7787 });
7788 // Fill SortedIndices array only if it looks worth-while to sort the
7789 // ptrs.
7790 if (!AnyConsecutive)
7791 return false;
7792 }
7793 }
7794 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7795 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7796 });
7797 }
7798
7799 for (auto &T : Bases)
7800 for (const auto &Vec : T.second)
7801 for (const auto &P : Vec)
7802 SortedIndices.push_back(std::get<2>(P));
7803
7804 assert(SortedIndices.size() == VL.size() &&
7805 "Expected SortedIndices to be the size of VL");
7806 return true;
7807}
7808
7809std::optional<BoUpSLP::OrdersType>
7810BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7811 assert(TE.isGather() && "Expected gather node only.");
7812 Type *ScalarTy = TE.Scalars[0]->getType();
7813
7815 Ptrs.reserve(TE.Scalars.size());
7817 BBs.reserve(TE.Scalars.size());
7818 for (Value *V : TE.Scalars) {
7819 auto *L = dyn_cast<LoadInst>(V);
7820 if (!L || !L->isSimple())
7821 return std::nullopt;
7822 Ptrs.push_back(L->getPointerOperand());
7823 BBs.push_back(L->getParent());
7824 }
7825
7826 BoUpSLP::OrdersType Order;
7827 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7828 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7829 return std::move(Order);
7830 return std::nullopt;
7831}
7832
7833/// Check if two insertelement instructions are from the same buildvector.
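/// For example, in the chain
///   %i0 = insertelement <4 x float> poison, float %a, i32 0
///   %i1 = insertelement <4 x float> %i0, float %b, i32 1
/// %i0 and %i1 are inserts into the same buildvector.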
7836 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7837 // Instructions must be from the same basic blocks.
7838 if (VU->getParent() != V->getParent())
7839 return false;
7840 // Checks if 2 insertelements are from the same buildvector.
7841 if (VU->getType() != V->getType())
7842 return false;
7843 // Multiple used inserts are separate nodes.
7844 if (!VU->hasOneUse() && !V->hasOneUse())
7845 return false;
7846 auto *IE1 = VU;
7847 auto *IE2 = V;
7848 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7849 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7850 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7851 return false;
7852 // Go through the vector operand of insertelement instructions trying to find
7853 // either VU as the original vector for IE2 or V as the original vector for
7854 // IE1.
7855 SmallBitVector ReusedIdx(
7856 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7857 bool IsReusedIdx = false;
7858 do {
7859 if (IE2 == VU && !IE1)
7860 return VU->hasOneUse();
7861 if (IE1 == V && !IE2)
7862 return V->hasOneUse();
7863 if (IE1 && IE1 != V) {
7864 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7865 IsReusedIdx |= ReusedIdx.test(Idx1);
7866 ReusedIdx.set(Idx1);
7867 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7868 IE1 = nullptr;
7869 else
7870 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7871 }
7872 if (IE2 && IE2 != VU) {
7873 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7874 IsReusedIdx |= ReusedIdx.test(Idx2);
7875 ReusedIdx.set(Idx2);
7876 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7877 IE2 = nullptr;
7878 else
7879 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7880 }
7881 } while (!IsReusedIdx && (IE1 || IE2));
7882 return false;
7883}
7884
7885/// Checks if the specified instruction \p I is an alternate operation for
7886/// the given \p MainOp and \p AltOp instructions.
7887static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7888 Instruction *AltOp,
7889 const TargetLibraryInfo &TLI);
7890
7891std::optional<BoUpSLP::OrdersType>
7892BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7893 bool IgnoreReorder) {
 7894 // No need to reorder if we need to shuffle reuses, we still need to shuffle
 7895 // the node.
7896 if (!TE.ReuseShuffleIndices.empty()) {
7897 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7898 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7899 "Reshuffling scalars not yet supported for nodes with padding");
7900
7901 if (isSplat(TE.Scalars))
7902 return std::nullopt;
7903 // Check if reuse shuffle indices can be improved by reordering.
 7904 // For this, check that the reuse mask is "clustered", i.e. each scalar value
 7905 // is used once in each submask of size <number_of_scalars>.
7906 // Example: 4 scalar values.
7907 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7908 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7909 // element 3 is used twice in the second submask.
7910 unsigned Sz = TE.Scalars.size();
7911 if (TE.isGather()) {
7912 if (std::optional<OrdersType> CurrentOrder =
7913 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7914 SmallVector<int> Mask;
7915 fixupOrderingIndices(*CurrentOrder);
7916 inversePermutation(*CurrentOrder, Mask);
7917 ::addMask(Mask, TE.ReuseShuffleIndices);
7918 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7919 unsigned Sz = TE.Scalars.size();
7920 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7921 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7922 if (Idx != PoisonMaskElem)
7923 Res[Idx + K * Sz] = I + K * Sz;
7924 }
7925 return std::move(Res);
7926 }
7927 }
7928 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7929 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7930 2 * TE.getVectorFactor())) == 1)
7931 return std::nullopt;
7932 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7933 return std::nullopt;
7934 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7935 Sz)) {
7936 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7937 if (TE.ReorderIndices.empty())
7938 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7939 else
7940 inversePermutation(TE.ReorderIndices, ReorderMask);
7941 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7942 unsigned VF = ReorderMask.size();
7943 OrdersType ResOrder(VF, VF);
7944 unsigned NumParts = divideCeil(VF, Sz);
7945 SmallBitVector UsedVals(NumParts);
7946 for (unsigned I = 0; I < VF; I += Sz) {
7947 int Val = PoisonMaskElem;
7948 unsigned UndefCnt = 0;
7949 unsigned Limit = std::min(Sz, VF - I);
7950 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7951 [&](int Idx) {
7952 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7953 Val = Idx;
7954 if (Idx == PoisonMaskElem)
7955 ++UndefCnt;
7956 return Idx != PoisonMaskElem && Idx != Val;
7957 }) ||
7958 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7959 UndefCnt > Sz / 2)
7960 return std::nullopt;
7961 UsedVals.set(Val);
7962 for (unsigned K = 0; K < NumParts; ++K) {
7963 unsigned Idx = Val + Sz * K;
7964 if (Idx < VF && I + K < VF)
7965 ResOrder[Idx] = I + K;
7966 }
7967 }
7968 return std::move(ResOrder);
7969 }
7970 unsigned VF = TE.getVectorFactor();
 7971 // Try to build the correct order for extractelement instructions.
7972 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7973 TE.ReuseShuffleIndices.end());
7974 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7975 all_of(TE.Scalars, [Sz](Value *V) {
7976 if (isa<PoisonValue>(V))
7977 return true;
7978 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7979 return Idx && *Idx < Sz;
7980 })) {
7981 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7982 "by BinaryOperator and CastInst.");
7983 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7984 if (TE.ReorderIndices.empty())
7985 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7986 else
7987 inversePermutation(TE.ReorderIndices, ReorderMask);
7988 for (unsigned I = 0; I < VF; ++I) {
7989 int &Idx = ReusedMask[I];
7990 if (Idx == PoisonMaskElem)
7991 continue;
7992 Value *V = TE.Scalars[ReorderMask[Idx]];
7993 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7994 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7995 }
7996 }
 7997 // Build the order of the VF size; we need to reorder the reuses shuffles,
 7998 // they are always of VF size.
7999 OrdersType ResOrder(VF);
8000 std::iota(ResOrder.begin(), ResOrder.end(), 0);
8001 auto *It = ResOrder.begin();
8002 for (unsigned K = 0; K < VF; K += Sz) {
8003 OrdersType CurrentOrder(TE.ReorderIndices);
8004 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
8005 if (SubMask.front() == PoisonMaskElem)
8006 std::iota(SubMask.begin(), SubMask.end(), 0);
8007 reorderOrder(CurrentOrder, SubMask);
8008 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
8009 std::advance(It, Sz);
8010 }
8011 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
8012 return Data.index() == Data.value();
8013 }))
8014 return std::nullopt; // No need to reorder.
8015 return std::move(ResOrder);
8016 }
8017 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8018 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8019 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
8020 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
8021 return std::nullopt;
8022 if (TE.State == TreeEntry::SplitVectorize ||
8023 ((TE.State == TreeEntry::Vectorize ||
8024 TE.State == TreeEntry::StridedVectorize ||
8025 TE.State == TreeEntry::CompressVectorize) &&
8027 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
8028 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8029 "Alternate instructions are only supported by "
8030 "BinaryOperator and CastInst.");
8031 return TE.ReorderIndices;
8032 }
8033 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8034 TE.isAltShuffle()) {
8035 assert(TE.ReuseShuffleIndices.empty() &&
8036 "ReuseShuffleIndices should be "
8037 "empty for alternate instructions.");
8038 SmallVector<int> Mask;
8039 TE.buildAltOpShuffleMask(
8040 [&](Instruction *I) {
8041 assert(TE.getMatchingMainOpOrAltOp(I) &&
8042 "Unexpected main/alternate opcode");
8043 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
8044 },
8045 Mask);
8046 const int VF = TE.getVectorFactor();
8047 OrdersType ResOrder(VF, VF);
8048 for (unsigned I : seq<unsigned>(VF)) {
8049 if (Mask[I] == PoisonMaskElem)
8050 continue;
8051 ResOrder[Mask[I] % VF] = I;
8052 }
8053 return std::move(ResOrder);
8054 }
8055 if (!TE.ReorderIndices.empty())
8056 return TE.ReorderIndices;
8057 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8058 if (!TE.ReorderIndices.empty())
8059 return TE.ReorderIndices;
8060
8061 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8062 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
8063 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
8064 continue;
8065 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
8066 if (!II)
8067 continue;
8068 Instruction *BVHead = nullptr;
8069 BasicBlock *BB = II->getParent();
8070 while (II && II->hasOneUse() && II->getParent() == BB) {
8071 BVHead = II;
8072 II = dyn_cast<InsertElementInst>(II->getOperand(0));
8073 }
8074 I = BVHead;
8075 }
8076
8077 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8078 assert(BB1 != BB2 && "Expected different basic blocks.");
8079 if (!DT->isReachableFromEntry(BB1))
8080 return false;
8081 if (!DT->isReachableFromEntry(BB2))
8082 return true;
8083 auto *NodeA = DT->getNode(BB1);
8084 auto *NodeB = DT->getNode(BB2);
8085 assert(NodeA && "Should only process reachable instructions");
8086 assert(NodeB && "Should only process reachable instructions");
8087 assert((NodeA == NodeB) ==
8088 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8089 "Different nodes should have different DFS numbers");
8090 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8091 };
8092 auto PHICompare = [&](unsigned I1, unsigned I2) {
8093 Value *V1 = TE.Scalars[I1];
8094 Value *V2 = TE.Scalars[I2];
8095 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8096 return false;
8097 if (isa<PoisonValue>(V1))
8098 return true;
8099 if (isa<PoisonValue>(V2))
8100 return false;
8101 if (V1->getNumUses() < V2->getNumUses())
8102 return true;
8103 if (V1->getNumUses() > V2->getNumUses())
8104 return false;
8105 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
8106 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
8107 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8108 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8109 FirstUserOfPhi2->getParent());
8110 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
8111 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
8112 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
8113 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
8114 if (IE1 && !IE2)
8115 return true;
8116 if (!IE1 && IE2)
8117 return false;
8118 if (IE1 && IE2) {
8119 if (UserBVHead[I1] && !UserBVHead[I2])
8120 return true;
8121 if (!UserBVHead[I1])
8122 return false;
8123 if (UserBVHead[I1] == UserBVHead[I2])
8124 return getElementIndex(IE1) < getElementIndex(IE2);
8125 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8126 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8127 UserBVHead[I2]->getParent());
8128 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8129 }
8130 if (EE1 && !EE2)
8131 return true;
8132 if (!EE1 && EE2)
8133 return false;
8134 if (EE1 && EE2) {
8135 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
8136 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
8137 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
8138 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
8139 if (!Inst2 && !P2)
8140 return Inst1 || P1;
8141 if (EE1->getOperand(0) == EE2->getOperand(0))
8142 return getElementIndex(EE1) < getElementIndex(EE2);
8143 if (!Inst1 && Inst2)
8144 return false;
8145 if (Inst1 && Inst2) {
8146 if (Inst1->getParent() != Inst2->getParent())
8147 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8148 return Inst1->comesBefore(Inst2);
8149 }
8150 if (!P1 && P2)
8151 return false;
8152 assert(P1 && P2 &&
8153 "Expected either instructions or arguments vector operands.");
8154 return P1->getArgNo() < P2->getArgNo();
8155 }
8156 return false;
8157 };
8158 OrdersType Phis(TE.Scalars.size());
8159 std::iota(Phis.begin(), Phis.end(), 0);
8160 stable_sort(Phis, PHICompare);
8161 if (isIdentityOrder(Phis))
8162 return std::nullopt; // No need to reorder.
8163 return std::move(Phis);
8164 }
8165 if (TE.isGather() &&
8166 (!TE.hasState() || !TE.isAltShuffle() ||
8167 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8168 allSameType(TE.Scalars)) {
8169 // TODO: add analysis of other gather nodes with extractelement
8170 // instructions and other values/instructions, not only undefs.
8171 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8173 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
8174 all_of(TE.Scalars, [](Value *V) {
8175 auto *EE = dyn_cast<ExtractElementInst>(V);
8176 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8177 })) {
8178 // Check that gather of extractelements can be represented as
8179 // just a shuffle of a single vector.
8180 OrdersType CurrentOrder;
8181 bool Reuse =
8182 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8183 if (Reuse || !CurrentOrder.empty())
8184 return std::move(CurrentOrder);
8185 }
8186 // If the gather node is <undef, v, .., poison> and
8187 // insertelement poison, v, 0 [+ permute]
8188 // is cheaper than
8189 // insertelement poison, v, n - try to reorder.
8190 // If rotating the whole graph, exclude the permute cost, the whole graph
8191 // might be transformed.
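 // For example, for <poison, poison, v, poison> the candidate order moves v
 // to lane 0 and is chosen if insert-at-0 plus the permute is cheaper than
 // insert-at-2.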
8192 int Sz = TE.Scalars.size();
8193 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
8194 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
8195 const auto *It = find_if_not(TE.Scalars, isConstant);
8196 if (It == TE.Scalars.begin())
8197 return OrdersType();
8198 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
8199 if (It != TE.Scalars.end()) {
8200 OrdersType Order(Sz, Sz);
8201 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8202 Order[Idx] = 0;
8203 fixupOrderingIndices(Order);
8204 SmallVector<int> Mask;
8205 inversePermutation(Order, Mask);
8206 InstructionCost PermuteCost =
8207 TopToBottom
8208 ? 0
8209 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
8210 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8211 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
8212 PoisonValue::get(Ty), *It);
8213 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8214 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
8215 PoisonValue::get(Ty), *It);
8216 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8217 OrdersType Order(Sz, Sz);
8218 Order[Idx] = 0;
8219 return std::move(Order);
8220 }
8221 }
8222 }
8223 if (isSplat(TE.Scalars))
8224 return std::nullopt;
8225 if (TE.Scalars.size() >= 3)
8226 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8227 return Order;
 8228 // Check if we can include the order of vectorized loads. For masked gathers
 8229 // do extra analysis later, so include such nodes into a special list.
8230 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8231 SmallVector<Value *> PointerOps;
8232 StridedPtrInfo SPtrInfo;
8233 OrdersType CurrentOrder;
8234 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
8235 CurrentOrder, PointerOps, SPtrInfo);
8238 return std::move(CurrentOrder);
8239 }
8240 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
 8241 // has been audited for correctness with non-power-of-two vectors.
8242 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
8243 if (std::optional<OrdersType> CurrentOrder =
8244 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8245 return CurrentOrder;
8246 }
8247 return std::nullopt;
8248}
8249
8250/// Checks if the given mask is a "clustered" mask with the same clusters of
8251/// size \p Sz, which are not identity submasks.
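/// For example, with \p Sz = 4 the mask <1,0,3,2,1,0,3,2> is such a mask,
/// while <0,1,2,3,0,1,2,3> is not, because its clusters are identity
/// submasks.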
8253 unsigned Sz) {
8254 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
8255 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
8256 return false;
8257 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8258 ArrayRef<int> Cluster = Mask.slice(I, Sz);
8259 if (Cluster != FirstCluster)
8260 return false;
8261 }
8262 return true;
8263}
8264
8265void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8266 // Reorder reuses mask.
8267 reorderReuses(TE.ReuseShuffleIndices, Mask);
8268 const unsigned Sz = TE.Scalars.size();
 8269 // For vectorized nodes and non-clustered reuses, no need to do anything else.
8270 if (!TE.isGather() ||
8272 Sz) ||
8273 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8274 return;
8275 SmallVector<int> NewMask;
8276 inversePermutation(TE.ReorderIndices, NewMask);
8277 addMask(NewMask, TE.ReuseShuffleIndices);
8278 // Clear reorder since it is going to be applied to the new mask.
8279 TE.ReorderIndices.clear();
8280 // Try to improve gathered nodes with clustered reuses, if possible.
8281 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8282 SmallVector<unsigned> NewOrder(Slice);
8283 inversePermutation(NewOrder, NewMask);
8284 reorderScalars(TE.Scalars, NewMask);
8285 // Fill the reuses mask with the identity submasks.
8286 for (auto *It = TE.ReuseShuffleIndices.begin(),
8287 *End = TE.ReuseShuffleIndices.end();
8288 It != End; std::advance(It, Sz))
8289 std::iota(It, std::next(It, Sz), 0);
8290}
8291
8293 ArrayRef<unsigned> SecondaryOrder) {
8294 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8295 "Expected same size of orders");
8296 size_t Sz = Order.size();
8297 SmallBitVector UsedIndices(Sz);
8298 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8299 if (Order[Idx] != Sz)
8300 UsedIndices.set(Order[Idx]);
8301 }
8302 if (SecondaryOrder.empty()) {
8303 for (unsigned Idx : seq<unsigned>(0, Sz))
8304 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8305 Order[Idx] = Idx;
8306 } else {
8307 for (unsigned Idx : seq<unsigned>(0, Sz))
8308 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8309 !UsedIndices.test(SecondaryOrder[Idx]))
8310 Order[Idx] = SecondaryOrder[Idx];
8311 }
8312}
8313
8316 return false;
8317
8318 constexpr unsigned TinyVF = 2;
8319 constexpr unsigned TinyTree = 10;
8320 constexpr unsigned PhiOpsLimit = 12;
8321 constexpr unsigned GatherLoadsLimit = 2;
8322 if (VectorizableTree.size() <= TinyTree)
8323 return true;
8324 if (VectorizableTree.front()->hasState() &&
8325 !VectorizableTree.front()->isGather() &&
8326 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8327 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8328 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8329 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8330 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8331 VectorizableTree.front()->ReorderIndices.empty()) {
 8332 // Check if the tree has only a single store and a single (unordered) load
 8333 // node, while other nodes are phis or geps/binops combined with phis,
 8334 // and/or a single gather load node.
8335 if (VectorizableTree.front()->hasState() &&
8336 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8337 VectorizableTree.front()->Scalars.size() == TinyVF &&
8338 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8339 return false;
 8340 // Single node, which requires reordering - skip.
8341 if (VectorizableTree.front()->hasState() &&
8342 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8343 VectorizableTree.front()->ReorderIndices.empty()) {
8344 const unsigned ReorderedSplitsCnt =
8345 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8346 return TE->State == TreeEntry::SplitVectorize &&
8347 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8348 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8349 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8350 });
8351 if (ReorderedSplitsCnt <= 1 &&
8352 static_cast<unsigned>(count_if(
8353 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8354 return ((!TE->isGather() &&
8355 (TE->ReorderIndices.empty() ||
8356 (TE->UserTreeIndex.UserTE &&
8357 TE->UserTreeIndex.UserTE->State ==
8358 TreeEntry::Vectorize &&
8359 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8360 .empty()))) ||
8361 (TE->isGather() && TE->ReorderIndices.empty() &&
8362 (!TE->hasState() || TE->isAltShuffle() ||
8363 TE->getOpcode() == Instruction::Load ||
8364 TE->getOpcode() == Instruction::ZExt ||
8365 TE->getOpcode() == Instruction::SExt))) &&
8366 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8367 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8368 return !isConstant(V) && isVectorized(V);
8369 }));
8370 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8371 return false;
8372 }
8373 bool HasPhis = false;
8374 bool HasLoad = true;
8375 unsigned GatherLoads = 0;
8376 for (const std::unique_ptr<TreeEntry> &TE :
8377 ArrayRef(VectorizableTree).drop_front()) {
8378 if (TE->State == TreeEntry::SplitVectorize)
8379 continue;
8380 if (!TE->hasState()) {
8381 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8383 continue;
8384 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8386 continue;
8387 return true;
8388 }
8389 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8390 if (!TE->isGather()) {
8391 HasLoad = false;
8392 continue;
8393 }
8394 if (HasLoad)
8395 return true;
8396 ++GatherLoads;
8397 if (GatherLoads >= GatherLoadsLimit)
8398 return true;
8399 }
8400 if (TE->getOpcode() == Instruction::GetElementPtr ||
8401 Instruction::isBinaryOp(TE->getOpcode()))
8402 continue;
8403 if (TE->getOpcode() != Instruction::PHI &&
8404 (!TE->hasCopyableElements() ||
8405 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8406 TE->Scalars.size() / 2))
8407 return true;
8408 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8409 TE->getNumOperands() > PhiOpsLimit)
8410 return false;
8411 HasPhis = true;
8412 }
8413 return !HasPhis;
8414 }
8415 return true;
8416}
8417
8418void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8419 ArrayRef<int> MaskOrder) {
8420 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8421 SmallVector<int> NewMask(getVectorFactor());
8422 SmallVector<int> NewMaskOrder(getVectorFactor());
8423 std::iota(NewMask.begin(), NewMask.end(), 0);
8424 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8425 if (Idx == 0) {
8426 copy(Mask, NewMask.begin());
8427 copy(MaskOrder, NewMaskOrder.begin());
8428 } else {
8429 assert(Idx == 1 && "Expected either 0 or 1 index.");
8430 unsigned Offset = CombinedEntriesWithIndices.back().second;
8431 for (unsigned I : seq<unsigned>(Mask.size())) {
8432 NewMask[I + Offset] = Mask[I] + Offset;
8433 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8434 }
8435 }
8436 reorderScalars(Scalars, NewMask);
8437 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8438 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8439 ReorderIndices.clear();
8440}
8441
8443 // Maps VF to the graph nodes.
8445 // ExtractElement gather nodes which can be vectorized and need to handle
8446 // their ordering.
8448
8449 // Phi nodes can have preferred ordering based on their result users
8451
8452 // AltShuffles can also have a preferred ordering that leads to fewer
8453 // instructions, e.g., the addsub instruction in x86.
8454 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8455
8456 // Maps a TreeEntry to the reorder indices of external users.
8458 ExternalUserReorderMap;
8459 // Find all reorderable nodes with the given VF.
 8460 // Currently these are vectorized stores, loads, extracts + some gathering
 8461 // of extracts.
8462 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8463 const std::unique_ptr<TreeEntry> &TE) {
8464 // Look for external users that will probably be vectorized.
8465 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8466 findExternalStoreUsersReorderIndices(TE.get());
8467 if (!ExternalUserReorderIndices.empty()) {
8468 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8469 ExternalUserReorderMap.try_emplace(TE.get(),
8470 std::move(ExternalUserReorderIndices));
8471 }
8472
8473 // Patterns like [fadd,fsub] can be combined into a single instruction in
8474 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8475 // to take into account their order when looking for the most used order.
8476 if (TE->hasState() && TE->isAltShuffle() &&
8477 TE->State != TreeEntry::SplitVectorize) {
8478 Type *ScalarTy = TE->Scalars[0]->getType();
8479 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8480 unsigned Opcode0 = TE->getOpcode();
8481 unsigned Opcode1 = TE->getAltOpcode();
8482 SmallBitVector OpcodeMask(
8483 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8484 // If this pattern is supported by the target then we consider the order.
8485 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8486 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8487 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8488 }
8489 // TODO: Check the reverse order too.
8490 }
8491
8492 bool IgnoreReorder =
8493 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8494 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8495 VectorizableTree.front()->getOpcode() == Instruction::Store);
8496 if (std::optional<OrdersType> CurrentOrder =
8497 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
 8498 // Do not include ordering for nodes used in the alt opcode vectorization,
 8499 // better to reorder them during the bottom-to-top stage. If we follow the
 8500 // order here, it causes reordering of the whole graph, though actually it is
 8501 // profitable just to reorder the subgraph that starts from the alternate
 8502 // opcode vectorization node. Such nodes already end up with the shuffle
 8503 // instruction and it is enough to change just this shuffle rather than
 8504 // rotate the scalars for the whole graph.
8505 unsigned Cnt = 0;
8506 const TreeEntry *UserTE = TE.get();
8507 while (UserTE && Cnt < RecursionMaxDepth) {
8508 if (!UserTE->UserTreeIndex)
8509 break;
8510 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8511 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8512 UserTE->UserTreeIndex.UserTE->Idx != 0)
8513 return;
8514 UserTE = UserTE->UserTreeIndex.UserTE;
8515 ++Cnt;
8516 }
8517 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8518 if (!(TE->State == TreeEntry::Vectorize ||
8519 TE->State == TreeEntry::StridedVectorize ||
8520 TE->State == TreeEntry::SplitVectorize ||
8521 TE->State == TreeEntry::CompressVectorize) ||
8522 !TE->ReuseShuffleIndices.empty())
8523 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8524 if (TE->State == TreeEntry::Vectorize &&
8525 TE->getOpcode() == Instruction::PHI)
8526 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8527 }
8528 });
8529
8530 // Reorder the graph nodes according to their vectorization factor.
8531 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8532 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8533 auto It = VFToOrderedEntries.find(VF);
8534 if (It == VFToOrderedEntries.end())
8535 continue;
 8536 // Try to find the most profitable order. We are just looking for the most
 8537 // used order and reorder scalar elements in the nodes according to this
 8538 // most used order.
8539 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8540 // Delete VF entry upon exit.
8541 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(It); });
8542
8543 // All operands are reordered and used only in this node - propagate the
8544 // most used order to the user node.
8547 OrdersUses;
8548 for (const TreeEntry *OpTE : OrderedEntries) {
 8549 // No need to reorder these nodes, still need to extend and to use shuffle,
 8550 // just need to merge the reordering shuffle and the reuse shuffle.
8551 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8552 OpTE->State != TreeEntry::SplitVectorize)
8553 continue;
8554 // Count number of orders uses.
8555 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8556 &PhisToOrders]() -> const OrdersType & {
8557 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8558 auto It = GathersToOrders.find(OpTE);
8559 if (It != GathersToOrders.end())
8560 return It->second;
8561 }
8562 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8563 auto It = AltShufflesToOrders.find(OpTE);
8564 if (It != AltShufflesToOrders.end())
8565 return It->second;
8566 }
8567 if (OpTE->State == TreeEntry::Vectorize &&
8568 OpTE->getOpcode() == Instruction::PHI) {
8569 auto It = PhisToOrders.find(OpTE);
8570 if (It != PhisToOrders.end())
8571 return It->second;
8572 }
8573 return OpTE->ReorderIndices;
8574 }();
8575 // First consider the order of the external scalar users.
8576 auto It = ExternalUserReorderMap.find(OpTE);
8577 if (It != ExternalUserReorderMap.end()) {
8578 const auto &ExternalUserReorderIndices = It->second;
 8579 // If the OpTE vector factor != number of scalars - use the natural order,
 8580 // it is an attempt to reorder a node with reused scalars but with
 8581 // external uses.
8582 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8583 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8584 ExternalUserReorderIndices.size();
8585 } else {
8586 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8587 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8588 }
8589 // No other useful reorder data in this entry.
8590 if (Order.empty())
8591 continue;
8592 }
8593 // Stores actually store the mask, not the order, need to invert.
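 // For example, the stored mask {2, 0, 1} inverts to the order {1, 2, 0}.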
8594 if (OpTE->State == TreeEntry::Vectorize &&
8595 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8596 assert(!OpTE->isAltShuffle() &&
8597 "Alternate instructions are only supported by BinaryOperator "
8598 "and CastInst.");
8599 SmallVector<int> Mask;
8600 inversePermutation(Order, Mask);
8601 unsigned E = Order.size();
8602 OrdersType CurrentOrder(E, E);
8603 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8604 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8605 });
8606 fixupOrderingIndices(CurrentOrder);
8607 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8608 } else {
8609 ++OrdersUses.try_emplace(Order, 0).first->second;
8610 }
8611 }
8612 if (OrdersUses.empty())
8613 continue;
8614 // Choose the most used order.
8615 unsigned IdentityCnt = 0;
8616 unsigned FilledIdentityCnt = 0;
8617 OrdersType IdentityOrder(VF, VF);
8618 for (auto &Pair : OrdersUses) {
8619 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8620 if (!Pair.first.empty())
8621 FilledIdentityCnt += Pair.second;
8622 IdentityCnt += Pair.second;
8623 combineOrders(IdentityOrder, Pair.first);
8624 }
8625 }
8626 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8627 unsigned Cnt = IdentityCnt;
8628 for (auto &Pair : OrdersUses) {
 8629 // Prefer the identity order. But if a filled identity (non-empty order) is
 8630 // found with the same number of uses as the new candidate order, we can
 8631 // choose this candidate order.
8632 if (Cnt < Pair.second ||
8633 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8634 Cnt == Pair.second && !BestOrder.empty() &&
8635 isIdentityOrder(BestOrder))) {
8636 combineOrders(Pair.first, BestOrder);
8637 BestOrder = Pair.first;
8638 Cnt = Pair.second;
8639 } else {
8640 combineOrders(BestOrder, Pair.first);
8641 }
8642 }
8643 // Set order of the user node.
8644 if (isIdentityOrder(BestOrder))
8645 continue;
8646 fixupOrderingIndices(BestOrder);
8647 SmallVector<int> Mask;
8648 inversePermutation(BestOrder, Mask);
8649 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8650 unsigned E = BestOrder.size();
8651 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8652 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8653 });
8654 // Do an actual reordering, if profitable.
8655 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8656 // Just do the reordering for the nodes with the given VF.
8657 if (TE->Scalars.size() != VF) {
8658 if (TE->ReuseShuffleIndices.size() == VF) {
8659 assert(TE->State != TreeEntry::SplitVectorize &&
8660 "Split vectorized not expected.");
8661 // Need to reorder the reuses masks of the operands with smaller VF to
8662 // be able to find the match between the graph nodes and scalar
8663 // operands of the given node during vectorization/cost estimation.
8664 assert(
8665 (!TE->UserTreeIndex ||
8666 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8667 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8668 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8669 "All users must be of VF size.");
8670 if (SLPReVec) {
8671 assert(SLPReVec && "Only supported by REVEC.");
8672 // ShuffleVectorInst does not do reorderOperands (and it should not
8673 // because ShuffleVectorInst supports only a limited set of
8674 // patterns). Only do reorderNodeWithReuses if the user is not
8675 // ShuffleVectorInst.
8676 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8677 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8678 continue;
8679 }
8680 // Update ordering of the operands with the smaller VF than the given
8681 // one.
8682 reorderNodeWithReuses(*TE, Mask);
8683 // Update orders in user split vectorize nodes.
8684 if (TE->UserTreeIndex &&
8685 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8686 TE->UserTreeIndex.UserTE->reorderSplitNode(
8687 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8688 }
8689 continue;
8690 }
8691 if ((TE->State == TreeEntry::SplitVectorize &&
8692 TE->ReuseShuffleIndices.empty()) ||
8693 ((TE->State == TreeEntry::Vectorize ||
8694 TE->State == TreeEntry::StridedVectorize ||
8695 TE->State == TreeEntry::CompressVectorize) &&
8697 InsertElementInst>(TE->getMainOp()) ||
8698 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8699 assert(
8700 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8701 TE->ReuseShuffleIndices.empty())) &&
8702 "Alternate instructions are only supported by BinaryOperator "
8703 "and CastInst.");
8704 // Build correct orders for extract{element,value}, loads,
8705 // stores and alternate (split) nodes.
8706 reorderOrder(TE->ReorderIndices, Mask);
8707 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8708 TE->reorderOperands(Mask);
8709 } else {
8710 // Reorder the node and its operands.
8711 TE->reorderOperands(Mask);
8712 assert(TE->ReorderIndices.empty() &&
8713 "Expected empty reorder sequence.");
8714 reorderScalars(TE->Scalars, Mask);
8715 }
8716 if (!TE->ReuseShuffleIndices.empty()) {
8717 // Apply reversed order to keep the original ordering of the reused
8718 // elements to avoid extra reorder indices shuffling.
8719 OrdersType CurrentOrder;
8720 reorderOrder(CurrentOrder, MaskOrder);
8721 SmallVector<int> NewReuses;
8722 inversePermutation(CurrentOrder, NewReuses);
8723 addMask(NewReuses, TE->ReuseShuffleIndices);
8724 TE->ReuseShuffleIndices.swap(NewReuses);
8725 } else if (TE->UserTreeIndex &&
8726 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8727 // Update orders in user split vectorize nodes.
8728 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8729 Mask, MaskOrder);
8730 }
8731 }
8732}
8733
8734void BoUpSLP::buildReorderableOperands(
8735 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8736 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8737 SmallVectorImpl<TreeEntry *> &GatherOps) {
8738 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8739 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8740 return OpData.first == I &&
8741 (OpData.second->State == TreeEntry::Vectorize ||
8742 OpData.second->State == TreeEntry::StridedVectorize ||
8743 OpData.second->State == TreeEntry::CompressVectorize ||
8744 OpData.second->State == TreeEntry::SplitVectorize);
8745 }))
8746 continue;
8747 // Do not request operands, if they do not exist.
8748 if (UserTE->hasState()) {
8749 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8750 UserTE->getOpcode() == Instruction::ExtractValue)
8751 continue;
8752 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8753 continue;
8754 if (UserTE->getOpcode() == Instruction::Store &&
8755 UserTE->State == TreeEntry::Vectorize && I == 1)
8756 continue;
8757 if (UserTE->getOpcode() == Instruction::Load &&
8758 (UserTE->State == TreeEntry::Vectorize ||
8759 UserTE->State == TreeEntry::StridedVectorize ||
8760 UserTE->State == TreeEntry::CompressVectorize))
8761 continue;
8762 }
8763 TreeEntry *TE = getOperandEntry(UserTE, I);
8764 assert(TE && "Expected operand entry.");
8765 if (!TE->isGather()) {
8766 // Add the node to the list of the ordered nodes with the identity
8767 // order.
8768 Edges.emplace_back(I, TE);
8769 // Add ScatterVectorize nodes to the list of operands, where just
8770 // reordering of the scalars is required. Similar to the gathers, so
8771 // simply add to the list of gathered ops.
8772 // If there are reused scalars, process this node as a regular vectorize
8773 // node, just reorder reuses mask.
8774 if (TE->State == TreeEntry::ScatterVectorize &&
8775 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8776 GatherOps.push_back(TE);
8777 continue;
8778 }
8779 if (ReorderableGathers.contains(TE))
8780 GatherOps.push_back(TE);
8781 }
8782}
8783
8784void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8785 struct TreeEntryCompare {
8786 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8787 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8788 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8789 return LHS->Idx < RHS->Idx;
8790 }
8791 };
8793 DenseSet<const TreeEntry *> GathersToOrders;
8794 // Find all reorderable leaf nodes with the given VF.
 8795 // Currently these are vectorized loads, extracts without alternate operands
 8796 // + some gathering of extracts.
8798 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8799 if (TE->State != TreeEntry::Vectorize &&
8800 TE->State != TreeEntry::StridedVectorize &&
8801 TE->State != TreeEntry::CompressVectorize &&
8802 TE->State != TreeEntry::SplitVectorize)
8803 NonVectorized.insert(TE.get());
8804 if (std::optional<OrdersType> CurrentOrder =
8805 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8806 Queue.push(TE.get());
8807 if (!(TE->State == TreeEntry::Vectorize ||
8808 TE->State == TreeEntry::StridedVectorize ||
8809 TE->State == TreeEntry::CompressVectorize ||
8810 TE->State == TreeEntry::SplitVectorize) ||
8811 !TE->ReuseShuffleIndices.empty())
8812 GathersToOrders.insert(TE.get());
8813 }
8814 }
8815
8816 // 1. Propagate order to the graph nodes, which use only reordered nodes.
 8817 // I.e., if the node has operands that are reordered, try to make at least
 8818 // one operand order the natural order and reorder the others + reorder the
 8819 // user node itself.
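 // For example, if both operands of a user share the same order, that order
 // can be applied to the user itself and the operands reset to the natural
 // order, removing the operand shuffles.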
8820 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8821 while (!Queue.empty()) {
8822 // 1. Filter out only reordered nodes.
8823 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8824 TreeEntry *TE = Queue.top();
8825 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8826 Queue.pop();
8827 SmallVector<TreeEntry *> OrderedOps(1, TE);
8828 while (!Queue.empty()) {
8829 TE = Queue.top();
8830 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8831 break;
8832 Queue.pop();
8833 OrderedOps.push_back(TE);
8834 }
8835 for (TreeEntry *TE : OrderedOps) {
8836 if (!(TE->State == TreeEntry::Vectorize ||
8837 TE->State == TreeEntry::StridedVectorize ||
8838 TE->State == TreeEntry::CompressVectorize ||
8839 TE->State == TreeEntry::SplitVectorize ||
8840 (TE->isGather() && GathersToOrders.contains(TE))) ||
8841 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8842 !Visited.insert(TE).second)
8843 continue;
 8844 // Build a map between user nodes and their operands' order to speed up the
 8845 // search. The graph currently does not provide this dependency directly.
8846 Users.first = TE->UserTreeIndex.UserTE;
8847 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8848 }
8849 if (Users.first) {
8850 auto &Data = Users;
8851 if (Data.first->State == TreeEntry::SplitVectorize) {
8852 assert(
8853 Data.second.size() <= 2 &&
8854 "Expected not greater than 2 operands for split vectorize node.");
8855 if (any_of(Data.second,
8856 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8857 continue;
8858 // Update orders in user split vectorize nodes.
8859 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8860 "Expected exactly 2 entries.");
8861 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8862 TreeEntry &OpTE = *VectorizableTree[P.first];
8863 OrdersType Order = OpTE.ReorderIndices;
8864 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8865 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8866 continue;
8867 const auto BestOrder =
8868 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8869 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8870 continue;
8871 Order = *BestOrder;
8872 }
8873 fixupOrderingIndices(Order);
8874 SmallVector<int> Mask;
8875 inversePermutation(Order, Mask);
8876 const unsigned E = Order.size();
8877 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8878 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8879 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8880 });
8881 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8882 // Clear ordering of the operand.
8883 if (!OpTE.ReorderIndices.empty()) {
8884 OpTE.ReorderIndices.clear();
8885 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8886 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8887 } else {
8888 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8889 reorderScalars(OpTE.Scalars, Mask);
8890 }
8891 }
8892 if (Data.first->ReuseShuffleIndices.empty() &&
8893 !Data.first->ReorderIndices.empty()) {
8894 // Insert user node to the list to try to sink reordering deeper in
8895 // the graph.
8896 Queue.push(Data.first);
8897 }
8898 continue;
8899 }
8900 // Check that operands are used only in the User node.
8901 SmallVector<TreeEntry *> GatherOps;
8902 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8903 GatherOps);
8904 // All operands are reordered and used only in this node - propagate the
8905 // most used order to the user node.
8908 OrdersUses;
 8909 // Do the analysis for each tree entry only once, otherwise the order of
 8910 // the same node may be considered several times, though it might not be
 8911 // profitable.
8914 for (const auto &Op : Data.second) {
8915 TreeEntry *OpTE = Op.second;
8916 if (!VisitedOps.insert(OpTE).second)
8917 continue;
8918 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8919 continue;
8920 const auto Order = [&]() -> const OrdersType {
8921 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8922 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8923 IgnoreReorder)
8924 .value_or(OrdersType(1));
8925 return OpTE->ReorderIndices;
8926 }();
8927 // The order is partially ordered, skip it in favor of fully non-ordered
8928 // orders.
8929 if (Order.size() == 1)
8930 continue;
8931
 8932 // Check that the reordering does not increase the number of shuffles, i.e.
 8933 // same-values nodes have the same parents or their parents have the same parents.
8934 if (!Order.empty() && !isIdentityOrder(Order)) {
8935 Value *Root = OpTE->hasState()
8936 ? OpTE->getMainOp()
8937 : *find_if_not(OpTE->Scalars, isConstant);
8938 auto GetSameNodesUsers = [&](Value *Root) {
8939 SmallSetVector<TreeEntry *, 4> Res;
8940 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8941 if (TE != OpTE && TE->UserTreeIndex &&
8942 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8943 TE->Scalars.size() == OpTE->Scalars.size() &&
8944 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8945 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8946 Res.insert(TE->UserTreeIndex.UserTE);
8947 }
8948 for (const TreeEntry *TE : getTreeEntries(Root)) {
8949 if (TE != OpTE && TE->UserTreeIndex &&
8950 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8951 TE->Scalars.size() == OpTE->Scalars.size() &&
8952 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8953 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8954 Res.insert(TE->UserTreeIndex.UserTE);
8955 }
8956 return Res.takeVector();
8957 };
8958 auto GetNumOperands = [](const TreeEntry *TE) {
8959 if (TE->State == TreeEntry::SplitVectorize)
8960 return TE->getNumOperands();
8961 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8962 return CI->arg_size();
8963 return TE->getNumOperands();
8964 };
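// NodeShouldBeReorderedWithOperands (below) reports whether every operand
// entry of the given node already carries a reordering or a reuse mask, i.e.
// whether reordering the node together with its operands is worthwhile.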
8965 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8966 const TreeEntry *TE) {
8967 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8968 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8969 ID = getVectorIntrinsicIDForCall(CI, TLI);
8970 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8971 if (ID != Intrinsic::not_intrinsic &&
8972 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8973 continue;
8974 const TreeEntry *Op = getOperandEntry(TE, Idx);
8975 if (Op->isGather() && Op->hasState()) {
8976 const TreeEntry *VecOp =
8977 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8978 if (VecOp)
8979 Op = VecOp;
8980 }
8981 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8982 return false;
8983 }
8984 return true;
8985 };
8986 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8987 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8988 if (!RevisitedOps.insert(UTE).second)
8989 return false;
8990 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8991 !UTE->ReuseShuffleIndices.empty() ||
8992 (UTE->UserTreeIndex &&
8993 UTE->UserTreeIndex.UserTE == Data.first) ||
8994 (Data.first->UserTreeIndex &&
8995 Data.first->UserTreeIndex.UserTE == UTE) ||
8996 (IgnoreReorder && UTE->UserTreeIndex &&
8997 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8998 NodeShouldBeReorderedWithOperands(UTE);
8999 }))
9000 continue;
9001 for (TreeEntry *UTE : Users) {
9002 Intrinsic::ID ID = Intrinsic::not_intrinsic;
9003 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
9004 ID = getVectorIntrinsicIDForCall(CI, TLI);
9005 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
9006 if (ID != Intrinsic::not_intrinsic &&
9007 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
9008 continue;
9009 const TreeEntry *Op = getOperandEntry(UTE, Idx);
9010 Visited.erase(Op);
9011 Queue.push(const_cast<TreeEntry *>(Op));
9012 }
9013 }
9014 }
9015 unsigned NumOps = count_if(
9016 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9017 return P.second == OpTE;
9018 });
9019 // Stores actually store the mask, not the order, need to invert.
9020 if (OpTE->State == TreeEntry::Vectorize &&
9021 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9022 assert(!OpTE->isAltShuffle() &&
9023 "Alternate instructions are only supported by BinaryOperator "
9024 "and CastInst.");
9025 SmallVector<int> Mask;
9026 inversePermutation(Order, Mask);
9027 unsigned E = Order.size();
9028 OrdersType CurrentOrder(E, E);
9029 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9030 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9031 });
9032 fixupOrderingIndices(CurrentOrder);
9033 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9034 } else {
9035 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9036 }
9037 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9038 const auto AllowsReordering = [&](const TreeEntry *TE) {
9039 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9040 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9041 (IgnoreReorder && TE->Idx == 0))
9042 return true;
9043 if (TE->isGather()) {
9044 if (GathersToOrders.contains(TE))
9045 return !getReorderingData(*TE, /*TopToBottom=*/false,
9046 IgnoreReorder)
9047 .value_or(OrdersType(1))
9048 .empty();
9049 return true;
9050 }
9051 return false;
9052 };
9053 if (OpTE->UserTreeIndex) {
9054 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9055 if (!VisitedUsers.insert(UserTE).second)
9056 continue;
9057 // May reorder user node if it requires reordering, has reused
9058 // scalars, is an alternate op vectorize node or its op nodes require
9059 // reordering.
9060 if (AllowsReordering(UserTE))
9061 continue;
9062 // Check if users allow reordering.
9063 // Currently look up just 1 level of operands to avoid increase of
9064 // the compile time.
9065 // Profitable to reorder if definitely more operands allow
9066 // reordering rather than those with natural order.
9067 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
9068 if (static_cast<unsigned>(count_if(
9069 Ops, [UserTE, &AllowsReordering](
9070 const std::pair<unsigned, TreeEntry *> &Op) {
9071 return AllowsReordering(Op.second) &&
9072 Op.second->UserTreeIndex.UserTE == UserTE;
9073 })) <= Ops.size() / 2)
9074 ++Res.first->second;
9075 }
9076 }
9077 if (OrdersUses.empty()) {
9078 Visited.insert_range(llvm::make_second_range(Data.second));
9079 continue;
9080 }
9081 // Choose the most used order.
9082 unsigned IdentityCnt = 0;
9083 unsigned VF = Data.second.front().second->getVectorFactor();
9084 OrdersType IdentityOrder(VF, VF);
9085 for (auto &Pair : OrdersUses) {
9086 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9087 IdentityCnt += Pair.second;
9088 combineOrders(IdentityOrder, Pair.first);
9089 }
9090 }
9091 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9092 unsigned Cnt = IdentityCnt;
9093 for (auto &Pair : OrdersUses) {
9094 // Prefer the identity order. But if a filled (non-empty) identity order is
9095 // found with the same number of uses as the new candidate order, we can
9096 // choose this candidate order.
9097 if (Cnt < Pair.second) {
9098 combineOrders(Pair.first, BestOrder);
9099 BestOrder = Pair.first;
9100 Cnt = Pair.second;
9101 } else {
9102 combineOrders(BestOrder, Pair.first);
9103 }
9104 }
9105 // Set order of the user node.
9106 if (isIdentityOrder(BestOrder)) {
9107 Visited.insert_range(llvm::make_second_range(Data.second));
9108 continue;
9109 }
9110 fixupOrderingIndices(BestOrder);
9111 // Erase operands from OrderedEntries list and adjust their orders.
9112 VisitedOps.clear();
9113 SmallVector<int> Mask;
9114 inversePermutation(BestOrder, Mask);
9115 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9116 unsigned E = BestOrder.size();
9117 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9118 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9119 });
9120 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9121 TreeEntry *TE = Op.second;
9122 if (!VisitedOps.insert(TE).second)
9123 continue;
9124 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9125 reorderNodeWithReuses(*TE, Mask);
9126 continue;
9127 }
9128 // Gathers are processed separately.
9129 if (TE->State != TreeEntry::Vectorize &&
9130 TE->State != TreeEntry::StridedVectorize &&
9131 TE->State != TreeEntry::CompressVectorize &&
9132 TE->State != TreeEntry::SplitVectorize &&
9133 (TE->State != TreeEntry::ScatterVectorize ||
9134 TE->ReorderIndices.empty()))
9135 continue;
9136 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9137 TE->ReorderIndices.empty()) &&
9138 "Non-matching sizes of user/operand entries.");
9139 reorderOrder(TE->ReorderIndices, Mask);
9140 if (IgnoreReorder && TE == VectorizableTree.front().get())
9141 IgnoreReorder = false;
9142 }
9143 // For gathers we just need to reorder their scalars.
9144 for (TreeEntry *Gather : GatherOps) {
9145 assert(Gather->ReorderIndices.empty() &&
9146 "Unexpected reordering of gathers.");
9147 if (!Gather->ReuseShuffleIndices.empty()) {
9148 // Just reorder reuses indices.
9149 reorderReuses(Gather->ReuseShuffleIndices, Mask);
9150 continue;
9151 }
9152 reorderScalars(Gather->Scalars, Mask);
9153 Visited.insert(Gather);
9154 }
9155 // Reorder operands of the user node and set the ordering for the user
9156 // node itself.
9157 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9158 return TE.isAltShuffle() &&
9159 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9160 TE.ReorderIndices.empty());
9161 };
9162 if (Data.first->State != TreeEntry::Vectorize ||
9163 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
9164 Data.first->getMainOp()) ||
9165 IsNotProfitableAltCodeNode(*Data.first))
9166 Data.first->reorderOperands(Mask);
9167 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
9168 IsNotProfitableAltCodeNode(*Data.first) ||
9169 Data.first->State == TreeEntry::StridedVectorize ||
9170 Data.first->State == TreeEntry::CompressVectorize) {
9171 reorderScalars(Data.first->Scalars, Mask);
9172 reorderOrder(Data.first->ReorderIndices, MaskOrder,
9173 /*BottomOrder=*/true);
9174 if (Data.first->ReuseShuffleIndices.empty() &&
9175 !Data.first->ReorderIndices.empty() &&
9176 !IsNotProfitableAltCodeNode(*Data.first)) {
9177 // Insert user node to the list to try to sink reordering deeper in
9178 // the graph.
9179 Queue.push(Data.first);
9180 }
9181 } else {
9182 reorderOrder(Data.first->ReorderIndices, Mask);
9183 }
9184 }
9185 }
9186 // If the reordering is unnecessary, just remove the reorder.
9187 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9188 VectorizableTree.front()->ReuseShuffleIndices.empty())
9189 VectorizableTree.front()->ReorderIndices.clear();
9190}
9191
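// Returns the instruction acting as the root of the entry: for reversed
// strided loads/stores this is the scalar selected by the first reorder index,
// otherwise simply the first scalar of the entry.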
9192Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9193 if (Entry.hasState() &&
9194 (Entry.getOpcode() == Instruction::Store ||
9195 Entry.getOpcode() == Instruction::Load) &&
9196 Entry.State == TreeEntry::StridedVectorize &&
9197 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9198 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
9199 return dyn_cast<Instruction>(Entry.Scalars.front());
9200}
9201
9202 void BoUpSLP::buildExternalUses(
9203 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
9204 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9205 DenseMap<Value *, unsigned> ScalarToExtUses;
9206 // Collect the values that we need to extract from the tree.
9207 for (auto &TEPtr : VectorizableTree) {
9208 TreeEntry *Entry = TEPtr.get();
9209
9210 // No need to handle users of gathered values.
9211 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9212 DeletedNodes.contains(Entry) ||
9213 TransformedToGatherNodes.contains(Entry))
9214 continue;
9215
9216 // For each lane:
9217 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9218 Value *Scalar = Entry->Scalars[Lane];
9219 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
9220 continue;
9221
9222 // All uses must be replaced already? No need to do it again.
9223 auto It = ScalarToExtUses.find(Scalar);
9224 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9225 continue;
9226
9227 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9228 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9229 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9230 << " from " << *Scalar << " for many users.\n");
9231 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9232 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9233 ExternalUsesWithNonUsers.insert(Scalar);
9234 continue;
9235 }
9236
9237 // Check if the scalar is externally used as an extra arg.
9238 const auto ExtI = ExternallyUsedValues.find(Scalar);
9239 if (ExtI != ExternallyUsedValues.end()) {
9240 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9241 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9242 << FoundLane << " from " << *Scalar << ".\n");
9243 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9244 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9245 continue;
9246 }
9247 for (User *U : Scalar->users()) {
9248 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9249
9250 Instruction *UserInst = dyn_cast<Instruction>(U);
9251 if (!UserInst || isDeleted(UserInst))
9252 continue;
9253
9254 // Ignore users in the user ignore list.
9255 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9256 continue;
9257
9258 // Skip in-tree scalars that become vectors
9259 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
9260 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9261 return !DeletedNodes.contains(UseEntry) &&
9262 !TransformedToGatherNodes.contains(UseEntry);
9263 })) {
9264 // Some in-tree scalars will remain as scalar in vectorized
9265 // instructions. If that is the case, the one in FoundLane will
9266 // be used.
9267 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9268 isa<LoadInst, StoreInst>(UserInst)) ||
9269 isa<CallInst>(UserInst)) ||
9270 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9271 if (DeletedNodes.contains(UseEntry) ||
9272 TransformedToGatherNodes.contains(UseEntry))
9273 return true;
9274 return UseEntry->State == TreeEntry::ScatterVectorize ||
9275 !doesInTreeUserNeedToExtract(
9276 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9277 TTI);
9278 })) {
9279 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9280 << ".\n");
9281 assert(none_of(UseEntries,
9282 [](TreeEntry *UseEntry) {
9283 return UseEntry->isGather();
9284 }) &&
9285 "Bad state");
9286 continue;
9287 }
9288 U = nullptr;
9289 if (It != ScalarToExtUses.end()) {
9290 ExternalUses[It->second].User = nullptr;
9291 break;
9292 }
9293 }
9294
9295 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9296 U = nullptr;
9297 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9298 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9299 << " from lane " << FoundLane << " from " << *Scalar
9300 << ".\n");
9301 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9302 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9303 ExternalUsesWithNonUsers.insert(Scalar);
9304 if (!U)
9305 break;
9306 }
9307 }
9308 }
9309}
9310
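// Collects, for every (basic block, value type, underlying pointer) key, the
// external stores that use the scalars of \p TE, keeping at most one store per
// lane and only stores whose pointer distance to the group is computable.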
9311 SmallVector<SmallVector<StoreInst *>>
9312 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9313 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
9314 SmallVector<StoreInst *>, 8>
9315 PtrToStoresMap;
9316 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9317 Value *V = TE->Scalars[Lane];
9318 // Don't iterate over the users of constant data.
9319 if (!isa<Instruction>(V))
9320 continue;
9321 // To save compilation time we don't visit if we have too many users.
9322 if (V->hasNUsesOrMore(UsesLimit))
9323 break;
9324
9325 // Collect stores per pointer object.
9326 for (User *U : V->users()) {
9327 auto *SI = dyn_cast<StoreInst>(U);
9328 // Test whether we can handle the store. V might be a global, which could
9329 // be used in a different function.
9330 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9331 !isValidElementType(SI->getValueOperand()->getType()))
9332 continue;
9333 // Skip entry if already vectorized.
9334 if (isVectorized(U))
9335 continue;
9336
9337 Value *Ptr =
9338 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9339 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9340 SI->getValueOperand()->getType(), Ptr}];
9341 // For now just keep one store per pointer object per lane.
9342 // TODO: Extend this to support multiple stores per pointer per lane
9343 if (StoresVec.size() > Lane)
9344 continue;
9345 if (!StoresVec.empty()) {
9346 std::optional<int64_t> Diff = getPointersDiff(
9347 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9348 SI->getValueOperand()->getType(),
9349 StoresVec.front()->getPointerOperand(), *DL, *SE,
9350 /*StrictCheck=*/true);
9351 // We failed to compare the pointers so just abandon this store.
9352 if (!Diff)
9353 continue;
9354 }
9355 StoresVec.push_back(SI);
9356 }
9357 }
9358 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9359 unsigned I = 0;
9360 for (auto &P : PtrToStoresMap) {
9361 Res[I].swap(P.second);
9362 ++I;
9363 }
9364 return Res;
9365}
9366
9367bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9368 OrdersType &ReorderIndices) const {
9369 // We check whether the stores in StoresVec can form a vector by sorting them
9370 // and checking whether they are consecutive.
9371
9372 // To avoid calling getPointersDiff() while sorting we create a vector of
9373 // pairs {store, offset from first} and sort this instead.
9374 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9375 StoreInst *S0 = StoresVec[0];
9376 StoreOffsetVec.emplace_back(0, 0);
9377 Type *S0Ty = S0->getValueOperand()->getType();
9378 Value *S0Ptr = S0->getPointerOperand();
9379 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9380 StoreInst *SI = StoresVec[Idx];
9381 std::optional<int64_t> Diff =
9382 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9383 SI->getPointerOperand(), *DL, *SE,
9384 /*StrictCheck=*/true);
9385 StoreOffsetVec.emplace_back(*Diff, Idx);
9386 }
9387
9388 // Check if the stores are consecutive by checking if their difference is 1.
9389 if (StoreOffsetVec.size() != StoresVec.size())
9390 return false;
9391 sort(StoreOffsetVec, llvm::less_first());
9392 unsigned Idx = 0;
9393 int64_t PrevDist = 0;
9394 for (const auto &P : StoreOffsetVec) {
9395 if (Idx > 0 && P.first != PrevDist + 1)
9396 return false;
9397 PrevDist = P.first;
9398 ++Idx;
9399 }
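// A minimal illustration: sorted offsets {-1, 0, 1, 2} pass this check, while
// {0, 1, 3, 4} fail because of the gap between 1 and 3.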
9400
9401 // Calculate the shuffle indices according to their offset against the sorted
9402 // StoreOffsetVec.
9403 ReorderIndices.assign(StoresVec.size(), 0);
9404 bool IsIdentity = true;
9405 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9406 ReorderIndices[P.second] = I;
9407 IsIdentity &= P.second == I;
9408 }
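// Worked example (illustrative): if the offsets of the stores relative to the
// first one are {0, -1, 2, 1}, they sort to {-1, 0, 1, 2} and the resulting
// ReorderIndices are {1, 0, 3, 2}.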
9409 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9410 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9411 // same convention here.
9412 if (IsIdentity)
9413 ReorderIndices.clear();
9414
9415 return true;
9416}
9417
9418#ifndef NDEBUG
9419 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9420 for (unsigned Idx : Order)
9421 dbgs() << Idx << ", ";
9422 dbgs() << "\n";
9423}
9424#endif
9425
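// For each group of user stores collected for \p TE, checks whether the group
// forms NumLanes consecutive stores and, if so, records the corresponding
// reorder indices as a candidate external reordering.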
9426 SmallVector<BoUpSLP::OrdersType, 1>
9427 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9428 unsigned NumLanes = TE->Scalars.size();
9429
9430 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9431
9432 // Holds the reorder indices for each candidate store vector that is a user of
9433 // the current TreeEntry.
9434 SmallVector<OrdersType, 1> ExternalReorderIndices;
9435
9436 // Now inspect the stores collected per pointer and look for vectorization
9437 // candidates. For each candidate calculate the reorder index vector and push
9438 // it into `ExternalReorderIndices`
9439 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9440 // If we have fewer than NumLanes stores, then we can't form a vector.
9441 if (StoresVec.size() != NumLanes)
9442 continue;
9443
9444 // If the stores are not consecutive then abandon this StoresVec.
9445 OrdersType ReorderIndices;
9446 if (!canFormVector(StoresVec, ReorderIndices))
9447 continue;
9448
9449 // We now know that the scalars in StoresVec can form a vector instruction,
9450 // so set the reorder indices.
9451 ExternalReorderIndices.push_back(ReorderIndices);
9452 }
9453 return ExternalReorderIndices;
9454}
9455
9456 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9457 const SmallDenseSet<Value *> &UserIgnoreLst) {
9458 deleteTree();
9459 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9460 "TreeEntryToStridedPtrInfoMap is not cleared");
9461 UserIgnoreList = &UserIgnoreLst;
9462 if (!allSameType(Roots))
9463 return;
9464 buildTreeRec(Roots, 0, EdgeInfo());
9465}
9466
9467 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9468 deleteTree();
9469 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9470 "TreeEntryToStridedPtrInfoMap is not cleared");
9471 if (!allSameType(Roots))
9472 return;
9473 buildTreeRec(Roots, 0, EdgeInfo());
9474}
9475
9476/// Tries to find subvector of loads and builds new vector of only loads if can
9477/// be profitable.
9478 static void gatherPossiblyVectorizableLoads(
9479 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9480 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9481 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9482 bool AddNew = true) {
9483 if (VL.empty())
9484 return;
9485 Type *ScalarTy = getValueType(VL.front());
9486 if (!isValidElementType(ScalarTy))
9487 return;
9488 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9489 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9490 for (Value *V : VL) {
9491 auto *LI = dyn_cast<LoadInst>(V);
9492 if (!LI)
9493 continue;
9494 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9495 continue;
9496 bool IsFound = false;
9497 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9498 assert(LI->getParent() == Data.front().first->getParent() &&
9499 LI->getType() == Data.front().first->getType() &&
9500 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9501 getUnderlyingObject(Data.front().first->getPointerOperand(),
9502 RecursionMaxDepth) &&
9503 "Expected loads with the same type, same parent and same "
9504 "underlying pointer.");
9505 std::optional<int64_t> Dist = getPointersDiff(
9506 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9507 Data.front().first->getPointerOperand(), DL, SE,
9508 /*StrictCheck=*/true);
9509 if (!Dist)
9510 continue;
9511 auto It = Map.find(*Dist);
9512 if (It != Map.end() && It->second != LI)
9513 continue;
9514 if (It == Map.end()) {
9515 Data.emplace_back(LI, *Dist);
9516 Map.try_emplace(*Dist, LI);
9517 }
9518 IsFound = true;
9519 break;
9520 }
9521 if (!IsFound) {
9522 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9523 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9524 }
9525 }
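// FindMatchingLoads (below) searches the already gathered load groups for one
// in the same block and of the same type whose pointer distance to the current
// cluster is computable; it reports which cluster elements are new (ToAdd),
// which are already present (Repeated), and the offset to apply when merging.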
9526 auto FindMatchingLoads =
9527 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9528 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9529 &GatheredLoads,
9530 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9531 int64_t &Offset, unsigned &Start) {
9532 if (Loads.empty())
9533 return GatheredLoads.end();
9534 LoadInst *LI = Loads.front().first;
9535 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9536 if (Idx < Start)
9537 continue;
9538 ToAdd.clear();
9539 if (LI->getParent() != Data.front().first->getParent() ||
9540 LI->getType() != Data.front().first->getType())
9541 continue;
9542 std::optional<int64_t> Dist =
9543 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9544 Data.front().first->getType(),
9545 Data.front().first->getPointerOperand(), DL, SE,
9546 /*StrictCheck=*/true);
9547 if (!Dist)
9548 continue;
9549 SmallSet<int64_t, 4> DataDists;
9550 SmallPtrSet<LoadInst *, 4> DataLoads;
9551 for (std::pair<LoadInst *, int64_t> P : Data) {
9552 DataDists.insert(P.second);
9553 DataLoads.insert(P.first);
9554 }
9555 // Found matching gathered loads - check if all loads are unique or
9556 // can be effectively vectorized.
9557 unsigned NumUniques = 0;
9558 for (auto [Cnt, Pair] : enumerate(Loads)) {
9559 bool Used = DataLoads.contains(Pair.first);
9560 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9561 ++NumUniques;
9562 ToAdd.insert(Cnt);
9563 } else if (Used) {
9564 Repeated.insert(Cnt);
9565 }
9566 }
9567 if (NumUniques > 0 &&
9568 (Loads.size() == NumUniques ||
9569 (Loads.size() - NumUniques >= 2 &&
9570 Loads.size() - NumUniques >= Loads.size() / 2 &&
9571 (has_single_bit(Data.size() + NumUniques) ||
9572 bit_ceil(Data.size()) <
9573 bit_ceil(Data.size() + NumUniques))))) {
9574 Offset = *Dist;
9575 Start = Idx + 1;
9576 return std::next(GatheredLoads.begin(), Idx);
9577 }
9578 }
9579 ToAdd.clear();
9580 return GatheredLoads.end();
9581 };
9582 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9583 unsigned Start = 0;
9584 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9585 int64_t Offset = 0;
9586 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9587 Offset, Start);
9588 while (It != GatheredLoads.end()) {
9589 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9590 for (unsigned Idx : LocalToAdd)
9591 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9592 ToAdd.insert_range(LocalToAdd);
9593 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9594 Start);
9595 }
9596 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9597 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9598 })) {
9599 auto AddNewLoads =
9600 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9601 for (unsigned Idx : seq<unsigned>(Data.size())) {
9602 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9603 continue;
9604 Loads.push_back(Data[Idx]);
9605 }
9606 };
9607 if (!AddNew) {
9608 LoadInst *LI = Data.front().first;
9609 It = find_if(
9610 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9611 return PD.front().first->getParent() == LI->getParent() &&
9612 PD.front().first->getType() == LI->getType();
9613 });
9614 while (It != GatheredLoads.end()) {
9615 AddNewLoads(*It);
9616 It = std::find_if(
9617 std::next(It), GatheredLoads.end(),
9618 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9619 return PD.front().first->getParent() == LI->getParent() &&
9620 PD.front().first->getType() == LI->getType();
9621 });
9622 }
9623 }
9624 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9625 AddNewLoads(GatheredLoads.emplace_back());
9626 }
9627 }
9628}
9629
9630void BoUpSLP::tryToVectorizeGatheredLoads(
9631 const SmallMapVector<
9632 std::tuple<BasicBlock *, Value *, Type *>,
9633 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9634 &GatheredLoads) {
9635 GatheredLoadsEntriesFirst = VectorizableTree.size();
9636
9637 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9638 LoadEntriesToVectorize.size());
9639 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9640 Set.insert_range(VectorizableTree[Idx]->Scalars);
9641
9642 // Sort loads by distance.
9643 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9644 const std::pair<LoadInst *, int64_t> &L2) {
9645 return L1.second > L2.second;
9646 };
9647
9648 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9649 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9650 Loads.size());
9651 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9652 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9653 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9654 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9655 };
9656
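// GetVectorizedRanges (below) greedily tries candidate vector factors from the
// widest full vector down, slicing the load sequence and querying
// canVectorizeLoads for each slice; vectorizable slices are recorded in the
// results (masked-gather candidates are collected separately and appended
// afterwards), and the remaining loads end up in NonVectorized.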
9657 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9658 BoUpSLP::ValueSet &VectorizedLoads,
9659 SmallVectorImpl<LoadInst *> &NonVectorized,
9660 bool Final, unsigned MaxVF) {
9661 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9662 unsigned StartIdx = 0;
9663 SmallVector<int> CandidateVFs;
9664 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9665 CandidateVFs.push_back(MaxVF);
9666 for (int NumElts = getFloorFullVectorNumberOfElements(
9667 *TTI, Loads.front()->getType(), MaxVF);
9668 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9669 *TTI, Loads.front()->getType(), NumElts - 1)) {
9670 CandidateVFs.push_back(NumElts);
9671 if (VectorizeNonPowerOf2 && NumElts > 2)
9672 CandidateVFs.push_back(NumElts - 1);
9673 }
9674
9675 if (Final && CandidateVFs.empty())
9676 return Results;
9677
9678 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9679 for (unsigned NumElts : CandidateVFs) {
9680 if (Final && NumElts > BestVF)
9681 continue;
9682 SmallVector<unsigned> MaskedGatherVectorized;
9683 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9684 ++Cnt) {
9685 ArrayRef<LoadInst *> Slice =
9686 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9687 if (VectorizedLoads.count(Slice.front()) ||
9688 VectorizedLoads.count(Slice.back()) ||
9689 areKnownNonVectorizableLoads(Slice))
9690 continue;
9691 // Check if it is profitable to try vectorizing gathered loads. It is
9692 // profitable if we have more than 3 consecutive loads or if we have
9693 // less but all users are vectorized or deleted.
9694 bool AllowToVectorize = false;
9695 // Check if it is profitable to vectorize 2-elements loads.
9696 if (NumElts == 2) {
9697 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9698 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9699 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9700 for (LoadInst *LI : Slice) {
9701 // If single use/user - allow to vectorize.
9702 if (LI->hasOneUse())
9703 continue;
9704 // 1. Check if number of uses equals number of users.
9705 // 2. All users are deleted.
9706 // 3. The load broadcasts are not allowed or the load is not
9707 // broadcasted.
9708 if (static_cast<unsigned int>(std::distance(
9709 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9710 return false;
9711 if (!IsLegalBroadcastLoad)
9712 continue;
9713 if (LI->hasNUsesOrMore(UsesLimit))
9714 return false;
9715 for (User *U : LI->users()) {
9716 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9717 continue;
9718 for (const TreeEntry *UTE : getTreeEntries(U)) {
9719 for (int I : seq<int>(UTE->getNumOperands())) {
9720 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9721 return V == LI || isa<PoisonValue>(V);
9722 }))
9723 // Found legal broadcast - do not vectorize.
9724 return false;
9725 }
9726 }
9727 }
9728 }
9729 return true;
9730 };
9731 AllowToVectorize = CheckIfAllowed(Slice);
9732 } else {
9733 AllowToVectorize =
9734 (NumElts >= 3 ||
9735 any_of(ValueToGatherNodes.at(Slice.front()),
9736 [=](const TreeEntry *TE) {
9737 return TE->Scalars.size() == 2 &&
9738 ((TE->Scalars.front() == Slice.front() &&
9739 TE->Scalars.back() == Slice.back()) ||
9740 (TE->Scalars.front() == Slice.back() &&
9741 TE->Scalars.back() == Slice.front()));
9742 })) &&
9743 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9744 Slice.size());
9745 }
9746 if (AllowToVectorize) {
9747 SmallVector<Value *> PointerOps;
9748 OrdersType CurrentOrder;
9749 // Try to build vector load.
9750 ArrayRef<Value *> Values(
9751 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9752 StridedPtrInfo SPtrInfo;
9753 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9754 PointerOps, SPtrInfo, &BestVF);
9755 if (LS != LoadsState::Gather ||
9756 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9757 if (LS == LoadsState::ScatterVectorize) {
9758 if (MaskedGatherVectorized.empty() ||
9759 Cnt >= MaskedGatherVectorized.back() + NumElts)
9760 MaskedGatherVectorized.push_back(Cnt);
9761 continue;
9762 }
9763 if (LS != LoadsState::Gather) {
9764 Results.emplace_back(Values, LS);
9765 VectorizedLoads.insert_range(Slice);
9766 // If we vectorized initial block, no need to try to vectorize it
9767 // again.
9768 if (Cnt == StartIdx)
9769 StartIdx += NumElts;
9770 }
9771 // Check if the whole array was vectorized already - exit.
9772 if (StartIdx >= Loads.size())
9773 break;
9774 // Erase last masked gather candidate, if another candidate within
9775 // the range is found to be better.
9776 if (!MaskedGatherVectorized.empty() &&
9777 Cnt < MaskedGatherVectorized.back() + NumElts)
9778 MaskedGatherVectorized.pop_back();
9779 Cnt += NumElts - 1;
9780 continue;
9781 }
9782 }
9783 if (!AllowToVectorize || BestVF == 0)
9784 registerNonVectorizableLoads(Slice);
9785 }
9786 // Mark masked gathers candidates as vectorized, if any.
9787 for (unsigned Cnt : MaskedGatherVectorized) {
9788 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9789 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9790 ArrayRef<Value *> Values(
9791 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9792 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9793 VectorizedLoads.insert_range(Slice);
9794 // If we vectorized initial block, no need to try to vectorize it again.
9795 if (Cnt == StartIdx)
9796 StartIdx += NumElts;
9797 }
9798 }
9799 for (LoadInst *LI : Loads) {
9800 if (!VectorizedLoads.contains(LI))
9801 NonVectorized.push_back(LI);
9802 }
9803 return Results;
9804 };
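// ProcessGatheredLoads (below) splits each group into runs of loads at
// consecutive distances (or keeps the whole group when masked gathers are
// legal), feeds the runs to GetVectorizedRanges, and builds tree entries for
// the resulting slices via buildTreeRec; loads that still cannot be vectorized
// are returned to the caller.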
9805 auto ProcessGatheredLoads =
9806 [&, &TTI = *TTI](
9807 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9808 bool Final = false) {
9809 SmallVector<LoadInst *> NonVectorized;
9810 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9811 GatheredLoads) {
9812 if (LoadsDists.size() <= 1) {
9813 NonVectorized.push_back(LoadsDists.back().first);
9814 continue;
9815 }
9816 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9817 LoadsDists);
9818 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9819 stable_sort(LocalLoadsDists, LoadSorter);
9820 SmallVector<LoadInst *> Loads;
9821 unsigned MaxConsecutiveDistance = 0;
9822 unsigned CurrentConsecutiveDist = 1;
9823 int64_t LastDist = LocalLoadsDists.front().second;
9824 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9825 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9826 if (isVectorized(L.first))
9827 continue;
9828 assert(LastDist >= L.second &&
9829 "Expected first distance always not less than second");
9830 if (static_cast<uint64_t>(LastDist - L.second) ==
9831 CurrentConsecutiveDist) {
9832 ++CurrentConsecutiveDist;
9833 MaxConsecutiveDistance =
9834 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9835 Loads.push_back(L.first);
9836 continue;
9837 }
9838 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9839 !Loads.empty())
9840 Loads.pop_back();
9841 CurrentConsecutiveDist = 1;
9842 LastDist = L.second;
9843 Loads.push_back(L.first);
9844 }
9845 if (Loads.size() <= 1)
9846 continue;
9847 if (AllowMaskedGather)
9848 MaxConsecutiveDistance = Loads.size();
9849 else if (MaxConsecutiveDistance < 2)
9850 continue;
9851 BoUpSLP::ValueSet VectorizedLoads;
9852 SmallVector<LoadInst *> SortedNonVectorized;
9853 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9854 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9855 Final, MaxConsecutiveDistance);
9856 if (!Results.empty() && !SortedNonVectorized.empty() &&
9857 OriginalLoads.size() == Loads.size() &&
9858 MaxConsecutiveDistance == Loads.size() &&
9859 any_of(Results,
9860 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9861 return P.second == LoadsState::ScatterVectorize;
9862 })) {
9863 VectorizedLoads.clear();
9864 SmallVector<LoadInst *> UnsortedNonVectorized;
9865 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9866 UnsortedResults =
9867 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9868 UnsortedNonVectorized, Final,
9869 OriginalLoads.size());
9870 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9871 SortedNonVectorized.swap(UnsortedNonVectorized);
9872 Results.swap(UnsortedResults);
9873 }
9874 }
9875 for (auto [Slice, _] : Results) {
9876 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9877 << Slice.size() << ")\n");
9878 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9879 for (Value *L : Slice)
9880 if (!isVectorized(L))
9881 SortedNonVectorized.push_back(cast<LoadInst>(L));
9882 continue;
9883 }
9884
9885 // Select maximum VF as a maximum of user gathered nodes and
9886 // distance between scalar loads in these nodes.
9887 unsigned MaxVF = Slice.size();
9888 unsigned UserMaxVF = 0;
9889 unsigned InterleaveFactor = 0;
9890 if (MaxVF == 2) {
9891 UserMaxVF = MaxVF;
9892 } else {
9893 // Found distance between segments of the interleaved loads.
9894 std::optional<unsigned> InterleavedLoadsDistance = 0;
9895 unsigned Order = 0;
9896 std::optional<unsigned> CommonVF = 0;
9897 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9898 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9899 for (auto [Idx, V] : enumerate(Slice)) {
9900 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9901 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9902 unsigned Pos =
9903 EntryToPosition.try_emplace(E, Idx).first->second;
9904 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9905 if (CommonVF) {
9906 if (*CommonVF == 0) {
9907 CommonVF = E->Scalars.size();
9908 continue;
9909 }
9910 if (*CommonVF != E->Scalars.size())
9911 CommonVF.reset();
9912 }
9913 // Check if the load is the part of the interleaved load.
9914 if (Pos != Idx && InterleavedLoadsDistance) {
9915 if (!DeinterleavedNodes.contains(E) &&
9916 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9917 if (isa<Constant>(V))
9918 return false;
9919 if (isVectorized(V))
9920 return true;
9921 const auto &Nodes = ValueToGatherNodes.at(V);
9922 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9923 !is_contained(Slice, V);
9924 })) {
9925 InterleavedLoadsDistance.reset();
9926 continue;
9927 }
9928 DeinterleavedNodes.insert(E);
9929 if (*InterleavedLoadsDistance == 0) {
9930 InterleavedLoadsDistance = Idx - Pos;
9931 continue;
9932 }
9933 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9934 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9935 InterleavedLoadsDistance.reset();
9936 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9937 }
9938 }
9939 }
9940 DeinterleavedNodes.clear();
9941 // Check if the large load represents interleaved load operation.
9942 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9943 CommonVF.value_or(0) != 0) {
9944 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9945 unsigned VF = *CommonVF;
9946 OrdersType Order;
9947 SmallVector<Value *> PointerOps;
9948 StridedPtrInfo SPtrInfo;
9949 // Segmented load detected - vectorize at maximum vector factor.
9950 if (InterleaveFactor <= Slice.size() &&
9951 TTI.isLegalInterleavedAccessType(
9952 getWidenedType(Slice.front()->getType(), VF),
9953 InterleaveFactor,
9954 cast<LoadInst>(Slice.front())->getAlign(),
9955 cast<LoadInst>(Slice.front())
9956 ->getPointerAddressSpace()) &&
9957 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9958 SPtrInfo) == LoadsState::Vectorize) {
9959 UserMaxVF = InterleaveFactor * VF;
9960 } else {
9961 InterleaveFactor = 0;
9962 }
9963 }
9964 // Cannot represent the loads as consecutive vectorizable nodes -
9965 // just exit.
9966 unsigned ConsecutiveNodesSize = 0;
9967 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9968 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9969 [&, Slice = Slice](const auto &P) {
9970 const auto *It = find_if(Slice, [&](Value *V) {
9971 return std::get<1>(P).contains(V);
9972 });
9973 if (It == Slice.end())
9974 return false;
9975 const TreeEntry &TE =
9976 *VectorizableTree[std::get<0>(P)];
9977 ArrayRef<Value *> VL = TE.Scalars;
9978 OrdersType Order;
9979 SmallVector<Value *> PointerOps;
9980 StridedPtrInfo SPtrInfo;
9981 LoadsState State = canVectorizeLoads(
9982 VL, VL.front(), Order, PointerOps, SPtrInfo);
9983 if (State == LoadsState::ScatterVectorize ||
9984 State == LoadsState::StridedVectorize)
9985 return false;
9986 ConsecutiveNodesSize += VL.size();
9987 size_t Start = std::distance(Slice.begin(), It);
9988 size_t Sz = Slice.size() - Start;
9989 return Sz < VL.size() ||
9990 Slice.slice(Start, VL.size()) != VL;
9991 }))
9992 continue;
9993 // Try to build long masked gather loads.
9994 UserMaxVF = bit_ceil(UserMaxVF);
9995 if (InterleaveFactor == 0 &&
9996 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9997 [&, Slice = Slice](unsigned Idx) {
9998 OrdersType Order;
9999 SmallVector<Value *> PointerOps;
10000 StridedPtrInfo SPtrInfo;
10001 return canVectorizeLoads(
10002 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10003 Slice[Idx * UserMaxVF], Order, PointerOps,
10004 SPtrInfo) == LoadsState::ScatterVectorize;
10005 }))
10006 UserMaxVF = MaxVF;
10007 if (Slice.size() != ConsecutiveNodesSize)
10008 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10009 }
10010 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10011 bool IsVectorized = true;
10012 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10013 ArrayRef<Value *> SubSlice =
10014 Slice.slice(I, std::min(VF, E - I));
10015 if (isVectorized(SubSlice.front()))
10016 continue;
10017 // Check if the subslice is a to-be-vectorized entry, which is not
10018 // equal to that entry itself.
10019 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10020 [&](const auto &P) {
10021 return !SubSlice.equals(
10022 VectorizableTree[std::get<0>(P)]
10023 ->Scalars) &&
10024 set_is_subset(SubSlice, std::get<1>(P));
10025 }))
10026 continue;
10027 unsigned Sz = VectorizableTree.size();
10028 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10029 if (Sz == VectorizableTree.size()) {
10030 IsVectorized = false;
10031 // Try non-interleaved vectorization with smaller vector
10032 // factor.
10033 if (InterleaveFactor > 0) {
10034 VF = 2 * (MaxVF / InterleaveFactor);
10035 InterleaveFactor = 0;
10036 }
10037 continue;
10038 }
10039 }
10040 if (IsVectorized)
10041 break;
10042 }
10043 }
10044 NonVectorized.append(SortedNonVectorized);
10045 }
10046 return NonVectorized;
10047 };
10048 for (const auto &GLs : GatheredLoads) {
10049 const auto &Ref = GLs.second;
10050 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10051 if (!Ref.empty() && !NonVectorized.empty() &&
10052 std::accumulate(
10053 Ref.begin(), Ref.end(), 0u,
10054 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10055 -> unsigned { return S + LoadsDists.size(); }) !=
10056 NonVectorized.size() &&
10057 IsMaskedGatherSupported(NonVectorized)) {
10058 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
10059 FinalGatheredLoads;
10060 for (LoadInst *LI : NonVectorized) {
10061 // Reinsert non-vectorized loads to other list of loads with the same
10062 // base pointers.
10063 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
10064 FinalGatheredLoads,
10065 /*AddNew=*/false);
10066 }
10067 // Final attempt to vectorize non-vectorized loads.
10068 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10069 }
10070 }
10071 // Try to vectorize postponed load entries, previously marked as gathered.
10072 for (unsigned Idx : LoadEntriesToVectorize) {
10073 const TreeEntry &E = *VectorizableTree[Idx];
10074 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10075 // Avoid reordering, if possible.
10076 if (!E.ReorderIndices.empty()) {
10077 // Build a mask out of the reorder indices and reorder scalars per this
10078 // mask.
10079 SmallVector<int> ReorderMask;
10080 inversePermutation(E.ReorderIndices, ReorderMask);
10081 reorderScalars(GatheredScalars, ReorderMask);
10082 }
10083 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10084 }
10085 // If no new entries were created, consider it as if no gathered loads
10086 // entries need to be handled.
10087 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10088 VectorizableTree.size())
10089 GatheredLoadsEntriesFirst.reset();
10090}
10091
10092/// Generates key/subkey pair for the given value to provide effective sorting
10093/// of the values and better detection of the vectorizable values sequences. The
10094/// keys/subkeys can be used for better sorting of the values themselves (keys)
10095/// and in values subgroups (subkeys).
10096static std::pair<size_t, size_t> generateKeySubkey(
10097 Value *V, const TargetLibraryInfo *TLI,
10098 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10099 bool AllowAlternate) {
10100 hash_code Key = hash_value(V->getValueID() + 2);
10101 hash_code SubKey = hash_value(0);
10102 // Sort the loads by the distance between the pointers.
10103 if (auto *LI = dyn_cast<LoadInst>(V)) {
10104 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
10105 if (LI->isSimple())
10106 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
10107 else
10108 Key = SubKey = hash_value(LI);
10109 } else if (isVectorLikeInstWithConstOps(V)) {
10110 // Sort extracts by the vector operands.
10111 if (isa<ExtractElementInst, UndefValue>(V))
10112 Key = hash_value(Value::UndefValueVal + 1);
10113 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
10114 if (!isUndefVector(EI->getVectorOperand()).all() &&
10115 !isa<UndefValue>(EI->getIndexOperand()))
10116 SubKey = hash_value(EI->getVectorOperand());
10117 }
10118 } else if (auto *I = dyn_cast<Instruction>(V)) {
10119 // Sort other instructions just by the opcodes except for CMPInst.
10120 // For CMP also sort by the predicate kind.
10121 if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
10122 isValidForAlternation(I->getOpcode())) {
10123 if (AllowAlternate)
10124 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
10125 else
10126 Key = hash_combine(hash_value(I->getOpcode()), Key);
10127 SubKey = hash_combine(
10128 hash_value(I->getOpcode()), hash_value(I->getType()),
10129 hash_value(isa<BinaryOperator>(I)
10130 ? I->getType()
10131 : cast<CastInst>(I)->getOperand(0)->getType()));
10132 // For casts, look through the only operand to improve compile time.
10133 if (isa<CastInst>(I)) {
10134 std::pair<size_t, size_t> OpVals =
10135 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
10136 /*AllowAlternate=*/true);
10137 Key = hash_combine(OpVals.first, Key);
10138 SubKey = hash_combine(OpVals.first, SubKey);
10139 }
10140 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
10141 CmpInst::Predicate Pred = CI->getPredicate();
10142 if (CI->isCommutative())
10143 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
10144 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
10145 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
10146 hash_value(SwapPred),
10147 hash_value(CI->getOperand(0)->getType()));
10148 } else if (auto *Call = dyn_cast<CallInst>(I)) {
10149 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
10150 if (isTriviallyVectorizable(ID)) {
10151 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
10152 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
10153 SubKey = hash_combine(hash_value(I->getOpcode()),
10154 hash_value(Call->getCalledFunction()));
10155 } else {
10157 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
10158 }
10159 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10160 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
10161 hash_value(Op.Tag), SubKey);
10162 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
10163 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
10164 SubKey = hash_value(Gep->getPointerOperand());
10165 else
10166 SubKey = hash_value(Gep);
10167 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
10168 !isa<ConstantInt>(I->getOperand(1))) {
10169 // Do not try to vectorize instructions with potentially high cost.
10170 SubKey = hash_value(I);
10171 } else {
10172 SubKey = hash_value(I->getOpcode());
10173 }
10174 Key = hash_combine(hash_value(I->getParent()), Key);
10175 }
10176 return std::make_pair(Key, SubKey);
10177}
10178
10179/// Checks if the specified instruction \p I is an main operation for the given
10180/// \p MainOp and \p AltOp instructions.
10181static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10182 Instruction *AltOp, const TargetLibraryInfo &TLI);
10183
10184bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
10185 ArrayRef<Value *> VL) const {
10186 Type *ScalarTy = S.getMainOp()->getType();
10187 unsigned Opcode0 = S.getOpcode();
10188 unsigned Opcode1 = S.getAltOpcode();
10189 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10190 // If this pattern is supported by the target then consider it profitable.
10191 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
10192 Opcode1, OpcodeMask))
10193 return true;
10194 SmallVector<ValueList> Operands;
10195 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
10196 Operands.emplace_back();
10197 // Prepare the operand vector.
10198 for (Value *V : VL) {
10199 if (isa<PoisonValue>(V)) {
10200 Operands.back().push_back(
10201 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
10202 continue;
10203 }
10204 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
10205 }
10206 }
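// For binary nodes, try to pair the operands of adjacent lanes so that the
// more compatible values end up in the same operand vector; findBestRootPair
// decides whether the operands of a lane should be swapped.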
10207 if (Operands.size() == 2) {
10208 // Try to find the best operand candidates.
10209 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
10210 SmallVector<std::pair<Value *, Value *>> Candidates(3);
10211 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
10212 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
10213 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
10214 std::optional<int> Res = findBestRootPair(Candidates);
10215 switch (Res.value_or(0)) {
10216 case 0:
10217 break;
10218 case 1:
10219 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
10220 break;
10221 case 2:
10222 std::swap(Operands[0][I], Operands[1][I]);
10223 break;
10224 default:
10225 llvm_unreachable("Unexpected index.");
10226 }
10227 }
10228 }
10229 DenseSet<unsigned> UniqueOpcodes;
10230 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
10231 unsigned NonInstCnt = 0;
10232 // Estimate number of instructions, required for the vectorized node and for
10233 // the buildvector node.
10234 unsigned UndefCnt = 0;
10235 // Count the number of extra shuffles, required for vector nodes.
10236 unsigned ExtraShuffleInsts = 0;
10237 // Check that operands do not contain same values and create either perfect
10238 // diamond match or shuffled match.
10239 if (Operands.size() == 2) {
10240 // Do not count same operands twice.
10241 if (Operands.front() == Operands.back()) {
10242 Operands.erase(Operands.begin());
10243 } else if (!allConstant(Operands.front()) &&
10244 all_of(Operands.front(), [&](Value *V) {
10245 return is_contained(Operands.back(), V);
10246 })) {
10247 Operands.erase(Operands.begin());
10248 ++ExtraShuffleInsts;
10249 }
10250 }
10251 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
10252 // Vectorize node, if:
10253 // 1. At least a single operand is constant or a splat.
10254 // 2. Operands have many loop invariants (the instructions are not loop
10255 // invariants).
10256 // 3. At least a single unique operand is supposed to be vectorized.
10257 return none_of(Operands,
10258 [&](ArrayRef<Value *> Op) {
10259 if (allConstant(Op) ||
10260 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
10261 getSameOpcode(Op, *TLI)))
10262 return false;
10263 DenseMap<Value *, unsigned> Uniques;
10264 for (Value *V : Op) {
10265 if (isa<Constant, ExtractElementInst>(V) ||
10266 isVectorized(V) || (L && L->isLoopInvariant(V))) {
10267 if (isa<UndefValue>(V))
10268 ++UndefCnt;
10269 continue;
10270 }
10271 auto Res = Uniques.try_emplace(V, 0);
10272 // Found first duplicate - need to add shuffle.
10273 if (!Res.second && Res.first->second == 1)
10274 ++ExtraShuffleInsts;
10275 ++Res.first->getSecond();
10276 if (auto *I = dyn_cast<Instruction>(V))
10277 UniqueOpcodes.insert(I->getOpcode());
10278 else if (Res.second)
10279 ++NonInstCnt;
10280 }
10281 return none_of(Uniques, [&](const auto &P) {
10282 return P.first->hasNUsesOrMore(P.second + 1) &&
10283 none_of(P.first->users(), [&](User *U) {
10284 return isVectorized(U) || Uniques.contains(U);
10285 });
10286 });
10287 }) ||
10288 // Do not vectorize node, if estimated number of vector instructions is
10289 // more than estimated number of buildvector instructions. Number of
10290 // vector operands is number of vector instructions + number of vector
10291 // instructions for operands (buildvectors). Number of buildvector
10292 // instructions is just number_of_operands * number_of_scalars.
10293 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10294 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10295 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10296}
10297
10298/// Builds the arguments types vector for the given call instruction with the
10299/// given \p ID for the specified vector factor.
10300 static SmallVector<Type *>
10301 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10302 const unsigned VF, unsigned MinBW,
10303 const TargetTransformInfo *TTI) {
10304 SmallVector<Type *> ArgTys;
10305 for (auto [Idx, Arg] : enumerate(CI->args())) {
10306 if (ID != Intrinsic::not_intrinsic) {
10307 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
10308 ArgTys.push_back(Arg->getType());
10309 continue;
10310 }
10311 if (MinBW > 0) {
10312 ArgTys.push_back(
10313 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10314 continue;
10315 }
10316 }
10317 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10318 }
10319 return ArgTys;
10320}
10321
10322/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10323/// function (if possible) calls. Returns invalid cost for the corresponding
10324/// calls, if they cannot be vectorized/will be scalarized.
10325static std::pair<InstructionCost, InstructionCost>
10326 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10327 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10328 ArrayRef<Type *> ArgTys) {
10329 auto Shape = VFShape::get(CI->getFunctionType(),
10330 ElementCount::getFixed(VecTy->getNumElements()),
10331 false /*HasGlobalPred*/);
10332 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10333 auto LibCost = InstructionCost::getInvalid();
10334 if (!CI->isNoBuiltin() && VecFunc) {
10335 // Calculate the cost of the vector library call.
10336 // If the corresponding vector call is cheaper, return its cost.
10337 LibCost =
10338 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10339 }
10340 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10341
10342 // Calculate the cost of the vector intrinsic call.
10343 FastMathFlags FMF;
10344 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10345 FMF = FPCI->getFastMathFlags();
10346 const InstructionCost ScalarLimit = 10000;
10347 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10348 LibCost.isValid() ? LibCost : ScalarLimit);
10349 auto IntrinsicCost =
10350 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10351 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10352 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10353 IntrinsicCost = InstructionCost::getInvalid();
10354
10355 return {IntrinsicCost, LibCost};
10356}
10357
10358BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10359 const InstructionsState &S, ArrayRef<Value *> VL,
10360 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10361 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10362 assert(S.getMainOp() &&
10363 "Expected instructions with same/alternate opcodes only.");
10364
10365 unsigned ShuffleOrOp =
10366 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10367 Instruction *VL0 = S.getMainOp();
10368 switch (ShuffleOrOp) {
10369 case Instruction::PHI: {
10370 // Too many operands - gather, most probably won't be vectorized.
10371 if (VL0->getNumOperands() > MaxPHINumOperands)
10372 return TreeEntry::NeedToGather;
10373 // Check for terminator values (e.g. invoke).
10374 for (Value *V : VL) {
10375 auto *PHI = dyn_cast<PHINode>(V);
10376 if (!PHI)
10377 continue;
10378 for (Value *Incoming : PHI->incoming_values()) {
10379 Instruction *Term = dyn_cast<Instruction>(Incoming);
10380 if (Term && Term->isTerminator()) {
10381 LLVM_DEBUG(dbgs()
10382 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10383 return TreeEntry::NeedToGather;
10384 }
10385 }
10386 }
10387
10388 return TreeEntry::Vectorize;
10389 }
10390 case Instruction::ExtractElement:
10391 if (any_of(VL, [&](Value *V) {
10392 auto *EI = dyn_cast<ExtractElementInst>(V);
10393 if (!EI)
10394 return true;
10395 return isVectorized(EI->getOperand(0));
10396 }))
10397 return TreeEntry::NeedToGather;
10398 [[fallthrough]];
10399 case Instruction::ExtractValue: {
10400 bool Reuse = canReuseExtract(VL, CurrentOrder);
10401 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10402 // non-full registers).
10403 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10404 return TreeEntry::NeedToGather;
10405 if (Reuse || !CurrentOrder.empty())
10406 return TreeEntry::Vectorize;
10407 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10408 return TreeEntry::NeedToGather;
10409 }
10410 case Instruction::InsertElement: {
10411 // Check that we have a buildvector and not a shuffle of 2 or more
10412 // different vectors.
10413 ValueSet SourceVectors;
10414 for (Value *V : VL) {
10415 if (isa<PoisonValue>(V)) {
10416 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10417 return TreeEntry::NeedToGather;
10418 }
10419 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10420 assert(getElementIndex(V) != std::nullopt &&
10421 "Non-constant or undef index?");
10422 }
10423
10424 if (count_if(VL, [&SourceVectors](Value *V) {
10425 return !SourceVectors.contains(V);
10426 }) >= 2) {
10427 // Found 2nd source vector - cancel.
10428 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10429 "different source vectors.\n");
10430 return TreeEntry::NeedToGather;
10431 }
10432
10433 if (any_of(VL, [&SourceVectors](Value *V) {
10434 // The last InsertElement can have multiple uses.
10435 return SourceVectors.contains(V) && !V->hasOneUse();
10436 })) {
10437 assert(SLPReVec && "Only supported by REVEC.");
10438 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10439 "multiple uses.\n");
10440 return TreeEntry::NeedToGather;
10441 }
10442
10443 return TreeEntry::Vectorize;
10444 }
10445 case Instruction::Load: {
10446 // Check that a vectorized load would load the same memory as a scalar
10447 // load. For example, we don't want to vectorize loads that are smaller
10448 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10449 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10450 // from such a struct, we read/write packed bits disagreeing with the
10451 // unvectorized version.
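// The IsGatheredNode helper below reports whether all loads in VL already
// belong to tree entries created for gathered loads; such loads are not
// vectorized again here.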
10452 auto IsGatheredNode = [&]() {
10453 if (!GatheredLoadsEntriesFirst)
10454 return false;
10455 return all_of(VL, [&](Value *V) {
10456 if (isa<PoisonValue>(V))
10457 return true;
10458 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10459 return TE->Idx >= *GatheredLoadsEntriesFirst;
10460 });
10461 });
10462 };
10463 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10464 case LoadsState::Vectorize:
10465 return TreeEntry::Vectorize;
10466 case LoadsState::CompressVectorize:
10467 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10468 // Delay slow vectorized nodes for better vectorization attempts.
10469 LoadEntriesToVectorize.insert(VectorizableTree.size());
10470 return TreeEntry::NeedToGather;
10471 }
10472 return IsGatheredNode() ? TreeEntry::NeedToGather
10473 : TreeEntry::CompressVectorize;
10474 case LoadsState::ScatterVectorize:
10475 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10476 // Delay slow vectorized nodes for better vectorization attempts.
10477 LoadEntriesToVectorize.insert(VectorizableTree.size());
10478 return TreeEntry::NeedToGather;
10479 }
10480 return IsGatheredNode() ? TreeEntry::NeedToGather
10481 : TreeEntry::ScatterVectorize;
10482 case LoadsState::StridedVectorize:
10483 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10484 // Delay slow vectorized nodes for better vectorization attempts.
10485 LoadEntriesToVectorize.insert(VectorizableTree.size());
10486 return TreeEntry::NeedToGather;
10487 }
10488 return IsGatheredNode() ? TreeEntry::NeedToGather
10489 : TreeEntry::StridedVectorize;
10490 case LoadsState::Gather:
10491#ifndef NDEBUG
10492 Type *ScalarTy = VL0->getType();
10493 if (DL->getTypeSizeInBits(ScalarTy) !=
10494 DL->getTypeAllocSizeInBits(ScalarTy))
10495 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10496 else if (any_of(VL, [](Value *V) {
10497 auto *LI = dyn_cast<LoadInst>(V);
10498 return !LI || !LI->isSimple();
10499 }))
10500 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10501 else
10502 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10503#endif // NDEBUG
10504 registerNonVectorizableLoads(VL);
10505 return TreeEntry::NeedToGather;
10506 }
10507 llvm_unreachable("Unexpected state of loads");
10508 }
10509 case Instruction::ZExt:
10510 case Instruction::SExt:
10511 case Instruction::FPToUI:
10512 case Instruction::FPToSI:
10513 case Instruction::FPExt:
10514 case Instruction::PtrToInt:
10515 case Instruction::IntToPtr:
10516 case Instruction::SIToFP:
10517 case Instruction::UIToFP:
10518 case Instruction::Trunc:
10519 case Instruction::FPTrunc:
10520 case Instruction::BitCast: {
10521 Type *SrcTy = VL0->getOperand(0)->getType();
10522 for (Value *V : VL) {
10523 if (isa<PoisonValue>(V))
10524 continue;
10525 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10526 if (Ty != SrcTy || !isValidElementType(Ty)) {
10527 LLVM_DEBUG(
10528 dbgs() << "SLP: Gathering casts with different src types.\n");
10529 return TreeEntry::NeedToGather;
10530 }
10531 }
10532 return TreeEntry::Vectorize;
10533 }
10534 case Instruction::ICmp:
10535 case Instruction::FCmp: {
10536 // Check that all of the compares have the same predicate.
10537 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10538 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10539 Type *ComparedTy = VL0->getOperand(0)->getType();
10540 for (Value *V : VL) {
10541 if (isa<PoisonValue>(V))
10542 continue;
10543 auto *Cmp = cast<CmpInst>(V);
10544 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10545 Cmp->getOperand(0)->getType() != ComparedTy) {
10546 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10547 return TreeEntry::NeedToGather;
10548 }
10549 }
10550 return TreeEntry::Vectorize;
10551 }
10552 case Instruction::Select:
10553 case Instruction::FNeg:
10554 case Instruction::Add:
10555 case Instruction::FAdd:
10556 case Instruction::Sub:
10557 case Instruction::FSub:
10558 case Instruction::Mul:
10559 case Instruction::FMul:
10560 case Instruction::UDiv:
10561 case Instruction::SDiv:
10562 case Instruction::FDiv:
10563 case Instruction::URem:
10564 case Instruction::SRem:
10565 case Instruction::FRem:
10566 case Instruction::Shl:
10567 case Instruction::LShr:
10568 case Instruction::AShr:
10569 case Instruction::And:
10570 case Instruction::Or:
10571 case Instruction::Xor:
10572 case Instruction::Freeze:
10573 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10574 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10575 auto *I = dyn_cast<Instruction>(V);
10576 return I && I->isBinaryOp() && !I->isFast();
10577 }))
10578 return TreeEntry::NeedToGather;
10579 return TreeEntry::Vectorize;
10580 case Instruction::GetElementPtr: {
10581 // We don't combine GEPs with complicated (nested) indexing.
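// For example, a GEP with a single index (getelementptr i32, ptr %base,
// i64 %i) is acceptable, while a multi-index GEP such as
// getelementptr %struct.S, ptr %base, i64 %i, i32 1 is rejected below.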
10582 for (Value *V : VL) {
10583 auto *I = dyn_cast<GetElementPtrInst>(V);
10584 if (!I)
10585 continue;
10586 if (I->getNumOperands() != 2) {
10587 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10588 return TreeEntry::NeedToGather;
10589 }
10590 }
10591
10592 // We can't combine several GEPs into one vector if they operate on
10593 // different types.
10594 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10595 for (Value *V : VL) {
10596 auto *GEP = dyn_cast<GEPOperator>(V);
10597 if (!GEP)
10598 continue;
10599 Type *CurTy = GEP->getSourceElementType();
10600 if (Ty0 != CurTy) {
10601 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10602 return TreeEntry::NeedToGather;
10603 }
10604 }
10605
10606 // We don't combine GEPs with non-constant indexes.
10607 Type *Ty1 = VL0->getOperand(1)->getType();
10608 for (Value *V : VL) {
10609 auto *I = dyn_cast<GetElementPtrInst>(V);
10610 if (!I)
10611 continue;
10612 auto *Op = I->getOperand(1);
10613 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10614 (Op->getType() != Ty1 &&
10615 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10616 Op->getType()->getScalarSizeInBits() >
10617 DL->getIndexSizeInBits(
10618 V->getType()->getPointerAddressSpace())))) {
10619 LLVM_DEBUG(
10620 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10621 return TreeEntry::NeedToGather;
10622 }
10623 }
10624
10625 return TreeEntry::Vectorize;
10626 }
10627 case Instruction::Store: {
10628 // Check if the stores are consecutive or if we need to swizzle them.
10629 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10630 // Avoid types that are padded when being allocated as scalars, while
10631 // being packed together in a vector (such as i1).
10632 if (DL->getTypeSizeInBits(ScalarTy) !=
10633 DL->getTypeAllocSizeInBits(ScalarTy)) {
10634 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10635 return TreeEntry::NeedToGather;
10636 }
10637 // Make sure all stores in the bundle are simple - we can't vectorize
10638 // atomic or volatile stores.
10639 for (Value *V : VL) {
10640 auto *SI = cast<StoreInst>(V);
10641 if (!SI->isSimple()) {
10642 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10643 return TreeEntry::NeedToGather;
10644 }
10645 PointerOps.push_back(SI->getPointerOperand());
10646 }
10647
10648 // Check the order of pointer operands.
10649 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10650 Value *Ptr0;
10651 Value *PtrN;
10652 if (CurrentOrder.empty()) {
10653 Ptr0 = PointerOps.front();
10654 PtrN = PointerOps.back();
10655 } else {
10656 Ptr0 = PointerOps[CurrentOrder.front()];
10657 PtrN = PointerOps[CurrentOrder.back()];
10658 }
10659 std::optional<int64_t> Dist =
10660 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10661 // Check that the sorted pointer operands are consecutive.
10662 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10663 return TreeEntry::Vectorize;
10664 }
10665
10666 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10667 return TreeEntry::NeedToGather;
10668 }
10669 case Instruction::Call: {
10670 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10671 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10672 auto *I = dyn_cast<Instruction>(V);
10673 return I && !I->isFast();
10674 }))
10675 return TreeEntry::NeedToGather;
10676 // Check if the calls are all to the same vectorizable intrinsic or
10677 // library function.
10678 CallInst *CI = cast<CallInst>(VL0);
10679 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10680
10681 VFShape Shape = VFShape::get(
10682 CI->getFunctionType(),
10683 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10684 false /*HasGlobalPred*/);
10685 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10686
10687 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10688 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10689 return TreeEntry::NeedToGather;
10690 }
10691 Function *F = CI->getCalledFunction();
10692 unsigned NumArgs = CI->arg_size();
10693 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10694 for (unsigned J = 0; J != NumArgs; ++J)
10695 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10696 ScalarArgs[J] = CI->getArgOperand(J);
10697 for (Value *V : VL) {
10698 CallInst *CI2 = dyn_cast<CallInst>(V);
10699 if (!CI2 || CI2->getCalledFunction() != F ||
10700 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10701 (VecFunc &&
10702 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10703 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10704 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10705 << "\n");
10706 return TreeEntry::NeedToGather;
10707 }
10708 // Some intrinsics have scalar arguments and should be same in order for
10709 // them to be vectorized.
10710 for (unsigned J = 0; J != NumArgs; ++J) {
10711 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10712 Value *A1J = CI2->getArgOperand(J);
10713 if (ScalarArgs[J] != A1J) {
10714 LLVM_DEBUG(dbgs()
10715 << "SLP: mismatched arguments in call:" << *CI
10716 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10717 return TreeEntry::NeedToGather;
10718 }
10719 }
10720 }
10721 // Verify that the bundle operands are identical between the two calls.
10722 if (CI->hasOperandBundles() &&
10723 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10724 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10725 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10726 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10727 << "!=" << *V << '\n');
10728 return TreeEntry::NeedToGather;
10729 }
10730 }
10731 SmallVector<Type *> ArgTys =
10732 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10733 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10734 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10735 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10736 return TreeEntry::NeedToGather;
10737
10738 return TreeEntry::Vectorize;
10739 }
10740 case Instruction::ShuffleVector: {
10741 if (!S.isAltShuffle()) {
10742 // REVEC can support non-alternate shuffles.
10743 if (SLPReVec && getShufflevectorNumGroups(VL))
10744 return TreeEntry::Vectorize;
10745 // If this is not an alternate sequence of opcode like add-sub
10746 // then do not vectorize this instruction.
10747 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10748 return TreeEntry::NeedToGather;
10749 }
10750 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10751 LLVM_DEBUG(
10752 dbgs()
10753 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10754 "the whole alt sequence is not profitable.\n");
10755 return TreeEntry::NeedToGather;
10756 }
10757
10758 return TreeEntry::Vectorize;
10759 }
10760 default:
10761 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10762 return TreeEntry::NeedToGather;
10763 }
10764}
10765
10766namespace {
10767 /// Correctly handles operands of the phi nodes based on the order of the
10768 /// incoming basic blocks/values of the \p Main PHINode.
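/// For example, if \p Main has incoming blocks [BB0, BB1] while another phi in
/// \p Phis lists its incoming values as [BB1, BB0], buildOperands() still
/// places both BB0 values into Operands[0] and both BB1 values into
/// Operands[1].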
10769class PHIHandler {
10770 DominatorTree &DT;
10771 PHINode *Main = nullptr;
10772 ArrayRef<Value *> Phis;
10773 SmallVector<SmallVector<Value *>> Operands;
10774
10775public:
10776 PHIHandler() = delete;
10777 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10778 : DT(DT), Main(Main), Phis(Phis),
10779 Operands(Main->getNumIncomingValues(),
10780 SmallVector<Value *>(Phis.size(), nullptr)) {}
10781 void buildOperands() {
10782 constexpr unsigned FastLimit = 4;
10783 if (Main->getNumIncomingValues() <= FastLimit) {
10784 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10785 BasicBlock *InBB = Main->getIncomingBlock(I);
10786 if (!DT.isReachableFromEntry(InBB)) {
10787 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10788 continue;
10789 }
10790 // Prepare the operand vector.
10791 for (auto [Idx, V] : enumerate(Phis)) {
10792 auto *P = dyn_cast<PHINode>(V);
10793 if (!P) {
10795 "Expected isa instruction or poison value.");
10796 Operands[I][Idx] = V;
10797 continue;
10798 }
10799 if (P->getIncomingBlock(I) == InBB)
10800 Operands[I][Idx] = P->getIncomingValue(I);
10801 else
10802 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10803 }
10804 }
10805 return;
10806 }
10807 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10808 Blocks;
10809 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10810 BasicBlock *InBB = Main->getIncomingBlock(I);
10811 if (!DT.isReachableFromEntry(InBB)) {
10812 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10813 continue;
10814 }
10815 Blocks.try_emplace(InBB).first->second.push_back(I);
10816 }
10817 for (auto [Idx, V] : enumerate(Phis)) {
10818 if (isa<PoisonValue>(V)) {
10819 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10820 Operands[I][Idx] = V;
10821 continue;
10822 }
10823 auto *P = cast<PHINode>(V);
10824 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10825 BasicBlock *InBB = P->getIncomingBlock(I);
10826 if (InBB == Main->getIncomingBlock(I)) {
10827 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10828 continue;
10829 Operands[I][Idx] = P->getIncomingValue(I);
10830 continue;
10831 }
10832 auto *It = Blocks.find(InBB);
10833 if (It == Blocks.end())
10834 continue;
10835 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10836 }
10837 }
10838 for (const auto &P : Blocks) {
10839 ArrayRef<unsigned> IncomingValues = P.second;
10840 if (IncomingValues.size() <= 1)
10841 continue;
10842 unsigned BasicI = IncomingValues.consume_front();
10843 for (unsigned I : IncomingValues) {
10844 assert(all_of(enumerate(Operands[I]),
10845 [&](const auto &Data) {
10846 return !Data.value() ||
10847 Data.value() == Operands[BasicI][Data.index()];
10848 }) &&
10849 "Expected empty operands list.");
10850 Operands[I] = Operands[BasicI];
10851 }
10852 }
10853 }
10854 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10855};
10856} // namespace
10857
10858 /// Returns the main/alternate instructions for the given \p VL. Unlike
10859 /// getSameOpcode, it supports non-compatible instructions for better
10860 /// SplitVectorize node support.
10861 /// \returns the first main/alt instructions if the list contains only poisons
10862 /// and instructions with exactly 2 opcodes; otherwise a pair of nullptrs.
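/// For example, VL = [add, sub, poison, add] yields {add, sub} (assuming the
/// adds share a parent block and so do the subs); a third opcode, same-opcode
/// instructions from different blocks, or a single opcode yield a pair of
/// nullptrs.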
10863static std::pair<Instruction *, Instruction *>
10864 getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10865 Instruction *MainOp = nullptr;
10866 Instruction *AltOp = nullptr;
10867 for (Value *V : VL) {
10868 if (isa<PoisonValue>(V))
10869 continue;
10870 auto *I = dyn_cast<Instruction>(V);
10871 if (!I)
10872 return {};
10873 if (!MainOp) {
10874 MainOp = I;
10875 continue;
10876 }
10877 if (MainOp->getOpcode() == I->getOpcode()) {
10878 if (I->getParent() != MainOp->getParent())
10879 return {};
10880 continue;
10881 }
10882 if (!AltOp) {
10883 AltOp = I;
10884 continue;
10885 }
10886 if (AltOp->getOpcode() == I->getOpcode()) {
10887 if (I->getParent() != AltOp->getParent())
10888 return {};
10889 continue;
10890 }
10891 return {};
10892 }
10893 if (!AltOp)
10894 return {};
10895 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10896 "Expected different main and alt instructions.");
10897 return std::make_pair(MainOp, AltOp);
10898}
10899
10900/// Checks that every instruction appears once in the list and if not, packs
10901/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10902/// unique scalars is extended by poison values to the whole register size.
10903///
10904/// \returns false if \p VL could not be uniquified, in which case \p VL is
10905/// unchanged and \p ReuseShuffleIndices is empty.
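/// For example, VL = [%a, %b, %a, %c] is packed into [%a, %b, %c] with
/// \p ReuseShuffleIndices = [0, 1, 0, 2]; with \p TryPad the unique scalars
/// may additionally be padded with poison up to a full register size.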
10906 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10907 SmallVectorImpl<int> &ReuseShuffleIndices,
10908 const TargetTransformInfo &TTI,
10909 const TargetLibraryInfo &TLI,
10910 const InstructionsState &S,
10911 const BoUpSLP::EdgeInfo &UserTreeIdx,
10912 bool TryPad = false) {
10913 // Check that every instruction appears once in this bundle.
10914 SmallVector<Value *> UniqueValues;
10915 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10916 for (Value *V : VL) {
10917 if (isConstant(V)) {
10918 // Constants are always considered distinct, even if the same constant
10919 // appears multiple times in VL.
10920 ReuseShuffleIndices.emplace_back(
10921 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10922 UniqueValues.emplace_back(V);
10923 continue;
10924 }
10925 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10926 ReuseShuffleIndices.emplace_back(Res.first->second);
10927 if (Res.second)
10928 UniqueValues.emplace_back(V);
10929 }
10930
10931 // Easy case: VL has unique values and a "natural" size
10932 size_t NumUniqueScalarValues = UniqueValues.size();
10933 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10934 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10935 if (NumUniqueScalarValues == VL.size() &&
10936 (VectorizeNonPowerOf2 || IsFullVectors)) {
10937 ReuseShuffleIndices.clear();
10938 return true;
10939 }
10940
10941 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10942 if ((UserTreeIdx.UserTE &&
10943 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10944 !hasFullVectorsOrPowerOf2(TTI, VL.front()->getType(), VL.size())) {
10945 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10946 "for nodes with padding.\n");
10947 ReuseShuffleIndices.clear();
10948 return false;
10949 }
10950
10951 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10952 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10953 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10954 return isa<UndefValue>(V) || !isConstant(V);
10955 }))) {
10956 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10957 S.getMainOp()->isSafeToRemove() &&
10958 (S.areInstructionsWithCopyableElements() ||
10959 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10960 // Find the number of elements, which forms full vectors.
10961 unsigned PWSz = getFullVectorNumberOfElements(
10962 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10963 PWSz = std::min<unsigned>(PWSz, VL.size());
10964 if (PWSz == VL.size()) {
10965 // We ended up with the same size after removing duplicates and
10966 // upgrading the resulting vector size to a "nice size". Just keep
10967 // the initial VL then.
10968 ReuseShuffleIndices.clear();
10969 } else {
10970 // Pad unique values with poison to grow the vector to a "nice" size
10971 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10972 UniqueValues.end());
10973 PaddedUniqueValues.append(
10974 PWSz - UniqueValues.size(),
10975 PoisonValue::get(UniqueValues.front()->getType()));
10976 // Check that the operations, extended with poisons/copyables, are still
10977 // valid for vectorization (div/rem are not allowed).
10978 if ((!S.areInstructionsWithCopyableElements() &&
10979 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10980 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10981 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10982 isa<CallInst>(S.getMainOp())))) {
10983 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10984 ReuseShuffleIndices.clear();
10985 return false;
10986 }
10987 VL = std::move(PaddedUniqueValues);
10988 }
10989 return true;
10990 }
10991 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10992 ReuseShuffleIndices.clear();
10993 return false;
10994 }
10995 VL = std::move(UniqueValues);
10996 return true;
10997}
10998
10999bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
11000 const InstructionsState &LocalState,
11001 SmallVectorImpl<Value *> &Op1,
11002 SmallVectorImpl<Value *> &Op2,
11003 OrdersType &ReorderIndices) const {
11004 constexpr unsigned SmallNodeSize = 4;
11005 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11006 !SplitAlternateInstructions)
11007 return false;
11008
11009 // Check if this is a duplicate of another split entry.
11010 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
11011 << ".\n");
11012 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11013 if (E->isSame(VL)) {
11014 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11015 << *LocalState.getMainOp() << ".\n");
11016 return false;
11017 }
11018 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11019 if (all_of(VL, [&](Value *V) {
11020 return isa<PoisonValue>(V) || Values.contains(V);
11021 })) {
11022 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11023 return false;
11024 }
11025 }
11026
11027 ReorderIndices.assign(VL.size(), VL.size());
11028 SmallBitVector Op1Indices(VL.size());
11029 for (auto [Idx, V] : enumerate(VL)) {
11030 auto *I = dyn_cast<Instruction>(V);
11031 if (!I) {
11032 Op1.push_back(V);
11033 Op1Indices.set(Idx);
11034 continue;
11035 }
11036 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11037 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
11038 *TLI)) ||
11039 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11040 !isAlternateInstruction(I, LocalState.getMainOp(),
11041 LocalState.getAltOp(), *TLI))) {
11042 Op1.push_back(V);
11043 Op1Indices.set(Idx);
11044 continue;
11045 }
11046 Op2.push_back(V);
11047 }
11048 Type *ScalarTy = getValueType(VL.front());
11049 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
11050 unsigned Opcode0 = LocalState.getOpcode();
11051 unsigned Opcode1 = LocalState.getAltOpcode();
11052 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11053 // Enable split node, only if all nodes do not form legal alternate
11054 // instruction (like X86 addsub).
11055 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
11056 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
11057 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11058 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11059 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
11060 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
11061 return false;
11062 // Enable split node, only if all nodes are power-of-2/full registers.
11063 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11064 for (unsigned Idx : seq<unsigned>(VL.size())) {
11065 if (Op1Indices.test(Idx)) {
11066 ReorderIndices[Op1Cnt] = Idx;
11067 ++Op1Cnt;
11068 } else {
11069 ReorderIndices[Op2Cnt] = Idx;
11070 ++Op2Cnt;
11071 }
11072 }
11073 if (isIdentityOrder(ReorderIndices))
11074 ReorderIndices.clear();
11075 SmallVector<int> Mask;
11076 if (!ReorderIndices.empty())
11077 inversePermutation(ReorderIndices, Mask);
11078 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11079 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
11080 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
11081 // Check for non-profitable single-register ops, which are better
11082 // represented as alternate ops.
11083 if (NumParts >= VL.size())
11084 return false;
11085 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11086 InstructionCost InsertCost = ::getShuffleCost(
11087 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
11088 FixedVectorType *SubVecTy =
11089 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
11090 InstructionCost NewShuffleCost =
11091 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
11092 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11093 (Mask.empty() || InsertCost >= NewShuffleCost))
11094 return false;
11095 if ((LocalState.getMainOp()->isBinaryOp() &&
11096 LocalState.getAltOp()->isBinaryOp() &&
11097 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11098 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11099 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11100 (LocalState.getMainOp()->isUnaryOp() &&
11101 LocalState.getAltOp()->isUnaryOp())) {
11102 InstructionCost OriginalVecOpsCost =
11103 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11104 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11105 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11106 for (unsigned Idx : seq<unsigned>(VL.size())) {
11107 if (isa<PoisonValue>(VL[Idx]))
11108 continue;
11109 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11110 }
11111 InstructionCost OriginalCost =
11112 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
11113 VecTy, OriginalMask, Kind);
11114 InstructionCost NewVecOpsCost =
11115 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11116 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11117 InstructionCost NewCost =
11118 NewVecOpsCost + InsertCost +
11119 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11120 VectorizableTree.front()->getOpcode() == Instruction::Store
11121 ? NewShuffleCost
11122 : 0);
11123 // If not profitable to split - exit.
11124 if (NewCost >= OriginalCost)
11125 return false;
11126 }
11127 return true;
11128}
11129
11130namespace {
11131 /// Accepts an incoming list of values, checks if it is able to model
11132 /// "copyable" values as compatible operations, and generates the list of
11133 /// values for scheduling and the list of operands for the new nodes.
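/// For example, in VL = [add %x, %y; %z] the lone scalar %z may be modeled as
/// the "copyable" operation add %z, 0, using the idempotent second operand
/// returned by selectBestIdempotentValue().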
11134class InstructionsCompatibilityAnalysis {
11135 DominatorTree &DT;
11136 const DataLayout &DL;
11137 const TargetTransformInfo &TTI;
11138 const TargetLibraryInfo &TLI;
11139 unsigned MainOpcode = 0;
11140 Instruction *MainOp = nullptr;
11141
11142 /// Checks if the opcode is supported as the main opcode for copyable
11143 /// elements.
11144 static bool isSupportedOpcode(const unsigned Opcode) {
11145 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11146 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11147 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11148 Opcode == Instruction::And || Opcode == Instruction::Or ||
11149 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11150 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11151 Opcode == Instruction::FDiv;
11152 }
11153
11154 /// Identifies the best candidate value, which represents main opcode
11155 /// operation.
11156 /// Currently the best candidate is the Add instruction with the parent
11157 /// block with the highest DFS incoming number (block, that dominates other).
11158 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11159 BasicBlock *Parent = nullptr;
11160 // Checks if the instruction has supported opcode.
11161 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11162 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11163 return false;
11164 return I && isSupportedOpcode(I->getOpcode()) &&
11165 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
11166 };
11167 // Exclude operand instructions immediately to improve compile time; they
11168 // cannot be scheduled anyway.
11169 SmallDenseSet<Value *, 8> Operands;
11170 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11171 bool AnyUndef = false;
11172 for (Value *V : VL) {
11173 auto *I = dyn_cast<Instruction>(V);
11174 if (!I) {
11175 AnyUndef |= isa<UndefValue>(V);
11176 continue;
11177 }
11178 if (!DT.isReachableFromEntry(I->getParent()))
11179 continue;
11180 if (Candidates.empty()) {
11181 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11182 Parent = I->getParent();
11183 Operands.insert(I->op_begin(), I->op_end());
11184 continue;
11185 }
11186 if (Parent == I->getParent()) {
11187 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11188 Operands.insert(I->op_begin(), I->op_end());
11189 continue;
11190 }
11191 auto *NodeA = DT.getNode(Parent);
11192 auto *NodeB = DT.getNode(I->getParent());
11193 assert(NodeA && "Should only process reachable instructions");
11194 assert(NodeB && "Should only process reachable instructions");
11195 assert((NodeA == NodeB) ==
11196 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11197 "Different nodes should have different DFS numbers");
11198 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11199 Candidates.clear();
11200 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11201 Parent = I->getParent();
11202 Operands.clear();
11203 Operands.insert(I->op_begin(), I->op_end());
11204 }
11205 }
11206 unsigned BestOpcodeNum = 0;
11207 MainOp = nullptr;
11208 bool UsedOutside = false;
11209 for (const auto &P : Candidates) {
11210 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
11211 if (UsedOutside && !PUsedOutside)
11212 continue;
11213 if (!UsedOutside && PUsedOutside)
11214 BestOpcodeNum = 0;
11215 if (P.second.size() < BestOpcodeNum)
11216 continue;
11217 // If it has inner dependencies, skip it.
11218 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11219 return Operands.contains(I);
11220 }))
11221 continue;
11222 UsedOutside = PUsedOutside;
11223 for (Instruction *I : P.second) {
11224 if (IsSupportedInstruction(I, AnyUndef)) {
11225 MainOp = I;
11226 BestOpcodeNum = P.second.size();
11227 break;
11228 }
11229 }
11230 }
11231 if (MainOp) {
11232 // Do not match, if any copyable is a terminator from the same block as
11233 // the main operation.
11234 if (any_of(VL, [&](Value *V) {
11235 auto *I = dyn_cast<Instruction>(V);
11236 return I && I->getParent() == MainOp->getParent() &&
11237 I->isTerminator();
11238 })) {
11239 MainOp = nullptr;
11240 return;
11241 }
11242 MainOpcode = MainOp->getOpcode();
11243 }
11244 }
11245
11246 /// Returns the idempotent value for the \p MainOp with the detected \p
11247 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11248 /// the operand itself, since V or V == V.
11249 Value *selectBestIdempotentValue() const {
11250 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11251 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
11252 !MainOp->isCommutative());
11253 }
11254
11255 /// Returns the value and operands for \p V: if it is an original instruction,
11256 /// its actual operands are returned; if it is a copyable element, it is
11257 /// represented as an idempotent instruction.
11258 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11259 if (isa<PoisonValue>(V))
11260 return {V, V};
11261 if (!S.isCopyableElement(V))
11262 return convertTo(cast<Instruction>(V), S).second;
11263 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11264 return {V, selectBestIdempotentValue()};
11265 }
11266
11267 /// Builds operands for the original instructions.
11268 void
11269 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11270 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11271
11272 unsigned ShuffleOrOp =
11273 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11274 Instruction *VL0 = S.getMainOp();
11275
11276 switch (ShuffleOrOp) {
11277 case Instruction::PHI: {
11278 auto *PH = cast<PHINode>(VL0);
11279
11280 // Keeps the reordered operands to avoid code duplication.
11281 PHIHandler Handler(DT, PH, VL);
11282 Handler.buildOperands();
11283 Operands.assign(PH->getNumOperands(), {});
11284 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11285 Operands[I].assign(Handler.getOperands(I).begin(),
11286 Handler.getOperands(I).end());
11287 return;
11288 }
11289 case Instruction::ExtractValue:
11290 case Instruction::ExtractElement:
11291 // This is a special case, as it does not gather, but at the same time
11292 // we are not extending buildTreeRec() towards the operands.
11293 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11294 return;
11295 case Instruction::InsertElement:
11296 Operands.assign(2, {VL.size(), nullptr});
11297 for (auto [Idx, V] : enumerate(VL)) {
11298 auto *IE = cast<InsertElementInst>(V);
11299 for (auto [OpIdx, Ops] : enumerate(Operands))
11300 Ops[Idx] = IE->getOperand(OpIdx);
11301 }
11302 return;
11303 case Instruction::Load:
11304 Operands.assign(
11305 1, {VL.size(),
11306 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11307 for (auto [V, Op] : zip(VL, Operands.back())) {
11308 auto *LI = dyn_cast<LoadInst>(V);
11309 if (!LI)
11310 continue;
11311 Op = LI->getPointerOperand();
11312 }
11313 return;
11314 case Instruction::ZExt:
11315 case Instruction::SExt:
11316 case Instruction::FPToUI:
11317 case Instruction::FPToSI:
11318 case Instruction::FPExt:
11319 case Instruction::PtrToInt:
11320 case Instruction::IntToPtr:
11321 case Instruction::SIToFP:
11322 case Instruction::UIToFP:
11323 case Instruction::Trunc:
11324 case Instruction::FPTrunc:
11325 case Instruction::BitCast:
11326 case Instruction::ICmp:
11327 case Instruction::FCmp:
11328 case Instruction::Select:
11329 case Instruction::FNeg:
11330 case Instruction::Add:
11331 case Instruction::FAdd:
11332 case Instruction::Sub:
11333 case Instruction::FSub:
11334 case Instruction::Mul:
11335 case Instruction::FMul:
11336 case Instruction::UDiv:
11337 case Instruction::SDiv:
11338 case Instruction::FDiv:
11339 case Instruction::URem:
11340 case Instruction::SRem:
11341 case Instruction::FRem:
11342 case Instruction::Shl:
11343 case Instruction::LShr:
11344 case Instruction::AShr:
11345 case Instruction::And:
11346 case Instruction::Or:
11347 case Instruction::Xor:
11348 case Instruction::Freeze:
11349 case Instruction::Store:
11350 case Instruction::ShuffleVector:
11351 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11352 for (auto [Idx, V] : enumerate(VL)) {
11353 auto *I = dyn_cast<Instruction>(V);
11354 if (!I) {
11355 for (auto [OpIdx, Ops] : enumerate(Operands))
11356 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11357 continue;
11358 }
11359 auto [Op, ConvertedOps] = convertTo(I, S);
11360 for (auto [OpIdx, Ops] : enumerate(Operands))
11361 Ops[Idx] = ConvertedOps[OpIdx];
11362 }
11363 return;
11364 case Instruction::GetElementPtr: {
11365 Operands.assign(2, {VL.size(), nullptr});
11366 // Need to cast all indices to the same type before vectorization to
11367 // avoid crash.
11368 // Required to be able to find correct matches between different gather
11369 // nodes and reuse the vectorized values rather than trying to gather them
11370 // again.
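// For example, if the bundle mixes GEPs whose indices are i32 and i64
// constants, the constants are cast below to the pointer's index type so
// that matching gather nodes see identical index vectors.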
11371 const unsigned IndexIdx = 1;
11372 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11373 Type *Ty =
11374 all_of(VL,
11375 [&](Value *V) {
11376 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11377 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11378 })
11379 ? VL0Ty
11380 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11381 ->getPointerOperandType()
11382 ->getScalarType());
11383 for (auto [Idx, V] : enumerate(VL)) {
11384 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11385 if (!GEP) {
11386 Operands[0][Idx] = V;
11387 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11388 continue;
11389 }
11390 Operands[0][Idx] = GEP->getPointerOperand();
11391 auto *Op = GEP->getOperand(IndexIdx);
11392 auto *CI = dyn_cast<ConstantInt>(Op);
11393 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11394 CI, Ty, CI->getValue().isSignBitSet(), DL)
11395 : Op;
11396 }
11397 return;
11398 }
11399 case Instruction::Call: {
11400 auto *CI = cast<CallInst>(VL0);
11401 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
11402 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11403 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
11404 continue;
11405 auto &Ops = Operands.emplace_back();
11406 for (Value *V : VL) {
11407 auto *I = dyn_cast<Instruction>(V);
11408 Ops.push_back(I ? I->getOperand(Idx)
11409 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11410 }
11411 }
11412 return;
11413 }
11414 default:
11415 break;
11416 }
11417 llvm_unreachable("Unexpected vectorization of the instructions.");
11418 }
11419
11420public:
11421 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11422 const TargetTransformInfo &TTI,
11423 const TargetLibraryInfo &TLI)
11424 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11425
11426 InstructionsState
11427 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11428 bool TryCopyableElementsVectorization,
11429 bool WithProfitabilityCheck = false,
11430 bool SkipSameCodeCheck = false) {
11431 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11432 ? InstructionsState::invalid()
11433 : getSameOpcode(VL, TLI);
11434 if (S)
11435 return S;
11436 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11437 return S;
11438 findAndSetMainInstruction(VL, R);
11439 if (!MainOp)
11440 return InstructionsState::invalid();
11441 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11442 if (!WithProfitabilityCheck)
11443 return S;
11444 // Check if it is profitable to vectorize the instruction.
11445 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11446 auto BuildCandidates =
11447 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11448 Value *V2) {
11449 if (V1 != V2 && isa<PHINode>(V1))
11450 return;
11451 auto *I1 = dyn_cast<Instruction>(V1);
11452 auto *I2 = dyn_cast<Instruction>(V2);
11453 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11454 I1->getParent() != I2->getParent())
11455 return;
11456 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11457 };
11458 if (VL.size() == 2) {
11459 // Check if the operands allow better vectorization.
11460 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11461 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11462 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11463 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11464 R.findBestRootPair(Candidates1) &&
11465 R.findBestRootPair(Candidates2);
11466 if (!Res && isCommutative(MainOp)) {
11467 Candidates1.clear();
11468 Candidates2.clear();
11469 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11470 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11471 Res = !Candidates1.empty() && !Candidates2.empty() &&
11472 R.findBestRootPair(Candidates1) &&
11473 R.findBestRootPair(Candidates2);
11474 }
11475 if (!Res)
11476 return InstructionsState::invalid();
11477 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11478 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11479 InstructionCost VectorCost;
11480 FixedVectorType *VecTy =
11481 getWidenedType(S.getMainOp()->getType(), VL.size());
11482 switch (MainOpcode) {
11483 case Instruction::Add:
11484 case Instruction::Sub:
11485 case Instruction::LShr:
11486 case Instruction::Shl:
11487 case Instruction::SDiv:
11488 case Instruction::UDiv:
11489 case Instruction::And:
11490 case Instruction::Or:
11491 case Instruction::Xor:
11492 case Instruction::FAdd:
11493 case Instruction::FMul:
11494 case Instruction::FSub:
11495 case Instruction::FDiv:
11496 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11497 break;
11498 default:
11499 llvm_unreachable("Unexpected instruction.");
11500 }
11501 if (VectorCost > ScalarCost)
11502 return InstructionsState::invalid();
11503 return S;
11504 }
11505 assert(Operands.size() == 2 && "Unexpected number of operands!");
11506 unsigned CopyableNum =
11507 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11508 if (CopyableNum < VL.size() / 2)
11509 return S;
11510 // Too many phi copyables - exit.
11511 const unsigned Limit = VL.size() / 24;
11512 if ((CopyableNum >= VL.size() - Limit ||
11513 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11514 CopyableNum >= MaxPHINumOperands) &&
11515 all_of(VL, [&](Value *V) {
11516 return isa<PHINode>(V) || !S.isCopyableElement(V);
11517 }))
11518 return InstructionsState::invalid();
11519 // Check profitability if number of copyables > VL.size() / 2.
11520 // 1. Reorder operands for better matching.
11521 if (isCommutative(MainOp)) {
11522 for (auto &Ops : Operands) {
11523 // Make instructions the first operands.
11524 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11525 std::swap(Ops.front(), Ops.back());
11526 continue;
11527 }
11528 // Make constants the second operands.
11529 if (isa<Constant>(Ops.front())) {
11530 std::swap(Ops.front(), Ops.back());
11531 continue;
11532 }
11533 }
11534 }
11535 // 2. Check, if operands can be vectorized.
11536 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11537 return InstructionsState::invalid();
11538 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11539 if (allConstant(Ops) || isSplat(Ops))
11540 return true;
11541 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11542 // single one is different.
11543 constexpr unsigned Limit = 4;
11544 if (Operands.front().size() >= Limit) {
11545 SmallDenseMap<const Value *, unsigned> Counters;
11546 for (Value *V : Ops) {
11547 if (isa<UndefValue>(V))
11548 continue;
11549 ++Counters[V];
11550 }
11551 if (Counters.size() == 2 &&
11552 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11553 return C.second == 1;
11554 }))
11555 return true;
11556 }
11557 // First operand not a constant or splat? Last attempt - check for
11558 // potential vectorization.
11559 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11560 InstructionsState OpS = Analysis.buildInstructionsState(
11561 Ops, R, /*TryCopyableElementsVectorization=*/true);
11562 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11563 return false;
11564 unsigned CopyableNum =
11565 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11566 return CopyableNum <= VL.size() / 2;
11567 };
11568 if (!CheckOperand(Operands.front()))
11569 return InstructionsState::invalid();
11570
11571 return S;
11572 }
11573
11574 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11575 ArrayRef<Value *> VL) {
11576 assert(S && "Invalid state!");
11577 SmallVector<BoUpSLP::ValueList> Operands;
11578 if (S.areInstructionsWithCopyableElements()) {
11579 MainOp = S.getMainOp();
11580 MainOpcode = S.getOpcode();
11581 Operands.assign(MainOp->getNumOperands(),
11582 BoUpSLP::ValueList(VL.size(), nullptr));
11583 for (auto [Idx, V] : enumerate(VL)) {
11584 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11585 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11586 Operands[OperandIdx][Idx] = Operand;
11587 }
11588 } else {
11589 buildOriginalOperands(S, VL, Operands);
11590 }
11591 return Operands;
11592 }
11593};
11594} // namespace
11595
11596BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11597 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11598 bool TryCopyableElementsVectorization) const {
11599 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11600
11601 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11602 InstructionsState S = Analysis.buildInstructionsState(
11603 VL, *this, TryCopyableElementsVectorization,
11604 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11605
11606 bool AreScatterAllGEPSameBlock = false;
11607 if (!S) {
11608 SmallVector<unsigned> SortedIndices;
11609 BasicBlock *BB = nullptr;
11610 bool IsScatterVectorizeUserTE =
11611 UserTreeIdx.UserTE &&
11612 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11613 AreScatterAllGEPSameBlock =
11614 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11615 VL.size() > 2 &&
11616 all_of(VL,
11617 [&BB](Value *V) {
11618 auto *I = dyn_cast<GetElementPtrInst>(V);
11619 if (!I)
11620 return doesNotNeedToBeScheduled(V);
11621 if (!BB)
11622 BB = I->getParent();
11623 return BB == I->getParent() && I->getNumOperands() == 2;
11624 }) &&
11625 BB &&
11626 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
11627 *SE, SortedIndices));
11628 if (!AreScatterAllGEPSameBlock) {
11629 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11630 "C,S,B,O, small shuffle. \n";
11631 dbgs() << "[";
11632 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11633 dbgs() << "]\n");
11634 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11635 /*TryToFindDuplicates=*/true,
11636 /*TrySplitVectorize=*/true);
11637 }
11638 // Reset S to make it GetElementPtr kind of node.
11639 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11640 assert(It != VL.end() && "Expected at least one GEP.");
11641 S = getSameOpcode(*It, *TLI);
11642 }
11643 assert(S && "Must be valid.");
11644
11645 // Don't handle vectors.
11646 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11647 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11648 // Do not try to pack to avoid extra instructions here.
11649 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11650 /*TryToFindDuplicates=*/false);
11651 }
11652
11653 // Check that all of the users of the scalars that we want to vectorize are
11654 // schedulable.
11655 BasicBlock *BB = S.getMainOp()->getParent();
11656
11657 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11658 !DT->isReachableFromEntry(BB)) {
11659 // Don't go into unreachable blocks. They may contain instructions with
11660 // dependency cycles which confuse the final scheduling.
11661 // Do not vectorize EH and non-returning blocks, not profitable in most
11662 // cases.
11663 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11664 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11665 }
11666
11667 // Don't go into catchswitch blocks, which can happen with PHIs.
11668 // Such blocks can only have PHIs and the catchswitch. There is no
11669 // place to insert a shuffle if we need to, so just avoid that issue.
11670 if (isa<CatchSwitchInst>(BB->getTerminator())) {
11671 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11672 // Do not try to pack to avoid extra instructions here.
11673 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11674 /*TryToFindDuplicates=*/false);
11675 }
11676
11677 // Don't handle scalable vectors
11678 if (S.getOpcode() == Instruction::ExtractElement &&
11679 isa<ScalableVectorType>(
11680 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11681 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11682 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11683 }
11684
11685 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11686 // a load), in which case peek through to include it in the tree, without
11687 // ballooning over-budget.
11688 if (Depth >= RecursionMaxDepth &&
11689 (S.isAltShuffle() || VL.size() < 4 ||
11690 !(match(S.getMainOp(), m_Load(m_Value())) ||
11691 all_of(VL, [&S](const Value *I) {
11692 return match(I,
11693 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11694 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11695 })))) {
11696 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11697 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11698 }
11699
11700 // Check if this is a duplicate of another entry.
11701 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11702 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11703 if (E->isSame(VL)) {
11704 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11705 << ".\n");
11706 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11707 }
11708 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11709 if (all_of(VL, [&](Value *V) {
11710 return isa<PoisonValue>(V) || Values.contains(V) ||
11711 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11712 LI->getLoopFor(S.getMainOp()->getParent()) &&
11713 isVectorized(V));
11714 })) {
11715 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11716 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11717 }
11718 }
11719
11720 // If all of the operands are identical or constant we have a simple solution.
11721 // If we deal with insert/extract instructions, they all must have constant
11722 // indices, otherwise we should gather them, not try to vectorize.
11723 // If alternate op node with 2 elements with gathered operands - do not
11724 // vectorize.
11725 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11726 if (!S || !S.isAltShuffle() || VL.size() > 2)
11727 return false;
11728 if (VectorizableTree.size() < MinTreeSize)
11729 return false;
11730 if (Depth >= RecursionMaxDepth - 1)
11731 return true;
11732 // Check if all operands are extracts, part of vector node or can build a
11733 // regular vectorize node.
11734 SmallVector<unsigned, 8> InstsCount;
11735 for (Value *V : VL) {
11736 auto *I = cast<Instruction>(V);
11737 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11738 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11739 }));
11740 }
11741 bool IsCommutative =
11742 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11743 if ((IsCommutative &&
11744 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11745 (!IsCommutative &&
11746 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11747 return true;
11748 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11749 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11750 auto *I1 = cast<Instruction>(VL.front());
11751 auto *I2 = cast<Instruction>(VL.back());
11752 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11753 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11754 I2->getOperand(Op));
11755 if (static_cast<unsigned>(count_if(
11756 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11757 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11758 })) >= S.getMainOp()->getNumOperands() / 2)
11759 return false;
11760 if (S.getMainOp()->getNumOperands() > 2)
11761 return true;
11762 if (IsCommutative) {
11763 // Check permuted operands.
11764 Candidates.clear();
11765 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11766 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11767 I2->getOperand((Op + 1) % E));
11768 if (any_of(
11769 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11770 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11771 }))
11772 return false;
11773 }
11774 return true;
11775 };
11776 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11777 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11778 if (!AreAllSameInsts || isSplat(VL) ||
11779 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11780 S.getMainOp()) &&
11781 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11782 NotProfitableForVectorization(VL)) {
11783 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11784 dbgs() << "[";
11785 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11786 dbgs() << "]\n");
11787 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11788 }
11789
11790 // Don't vectorize ephemeral values.
11791 if (!EphValues.empty()) {
11792 for (Value *V : VL) {
11793 if (EphValues.count(V)) {
11794 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11795 << ") is ephemeral.\n");
11796 // Do not try to pack to avoid extra instructions here.
11797 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11798 /*TryToFindDuplicates=*/false);
11799 }
11800 }
11801 }
11802
11803 // We now know that this is a vector of instructions of the same type from
11804 // the same block.
11805
11806 // Check that none of the instructions in the bundle are already in the tree
11807 // and the node may be not profitable for the vectorization as the small
11808 // alternate node.
11809 if (S.isAltShuffle()) {
11810 auto GetNumVectorizedExtracted = [&]() {
11811 APInt Extracted = APInt::getZero(VL.size());
11812 APInt Vectorized = APInt::getAllOnes(VL.size());
11813 for (auto [Idx, V] : enumerate(VL)) {
11814 auto *I = dyn_cast<Instruction>(V);
11815 if (!I || doesNotNeedToBeScheduled(I) ||
11816 all_of(I->operands(), [&](const Use &U) {
11817 return isa<ExtractElementInst>(U.get());
11818 }))
11819 continue;
11820 if (isVectorized(I))
11821 Vectorized.clearBit(Idx);
11822 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11823 Extracted.setBit(Idx);
11824 }
11825 return std::make_pair(Vectorized, Extracted);
11826 };
11827 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11828 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11829 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11830 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11831 // Rough cost estimation, if the vector code (+ potential extracts) is
11832 // more profitable than the scalar + buildvector.
11833 Type *ScalarTy = VL.front()->getType();
11834 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11835 InstructionCost VectorizeCostEstimate =
11836 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11837 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11838 /*Insert=*/false, /*Extract=*/true, Kind);
11839 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11840 *TTI, ScalarTy, VecTy, Vectorized,
11841 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11842 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11843 }
11844 if (PreferScalarize) {
11845 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11846 "node is not profitable.\n");
11847 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11848 }
11849 }
11850
11851 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11852 if (UserIgnoreList && !UserIgnoreList->empty()) {
11853 for (Value *V : VL) {
11854 if (UserIgnoreList->contains(V)) {
11855 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11856 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11857 }
11858 }
11859 }
11860
11861 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11862}
11863
11864void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11865 const EdgeInfo &UserTreeIdx,
11866 unsigned InterleaveFactor) {
11867 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11868
11869 SmallVector<int> ReuseShuffleIndices;
11870 SmallVector<Value *> VL(VLRef);
11871
11872 // Tries to build split node.
11873 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11874 SmallVector<Value *> Op1, Op2;
11875 OrdersType ReorderIndices;
11876 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11877 return false;
11878
11879 auto Invalid = ScheduleBundle::invalid();
11880 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11881 UserTreeIdx, {}, ReorderIndices);
11882 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11883 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11884 InstructionsState S = getSameOpcode(Op, *TLI);
11885 if (S && (isa<LoadInst>(S.getMainOp()) ||
11886 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11887 // Build gather node for loads, they will be gathered later.
11888 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11889 Idx == 0 ? 0 : Op1.size());
11890 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11891 } else {
11892 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11893 Idx == 0 ? 0 : Op1.size());
11894 buildTreeRec(Op, Depth, {TE, Idx});
11895 }
11896 };
11897 AddNode(Op1, 0);
11898 AddNode(Op2, 1);
11899 return true;
11900 };
11901
11902 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11903 bool AreConsts = false;
11904 for (Value *V : VL) {
11905 if (isa<PoisonValue>(V))
11906 continue;
11907 if (isa<Constant>(V)) {
11908 AreConsts = true;
11909 continue;
11910 }
11911 if (!isa<PHINode>(V))
11912 return false;
11913 }
11914 return AreConsts;
11915 };
11916 if (AreOnlyConstsWithPHIs(VL)) {
11917 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11918 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11919 return;
11920 }
11921
11922 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11923 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11924 InstructionsState S = Legality.getInstructionsState();
11925 if (!Legality.isLegal()) {
11926 if (Legality.trySplitVectorize()) {
11927 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11928 // Last chance to try to vectorize alternate node.
11929 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11930 return;
11931 }
11932 if (!S)
11933 Legality = getScalarsVectorizationLegality(
11934 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11935 if (!Legality.isLegal()) {
11936 if (Legality.tryToFindDuplicates())
11937 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11938 UserTreeIdx);
11939
11940 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11941 return;
11942 }
11943 S = Legality.getInstructionsState();
11944 }
11945
11946 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11947 if (S.isAltShuffle() && TrySplitNode(S))
11948 return;
11949
11950 // Check that every instruction appears once in this bundle.
11951 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11952 /*TryPad=*/true)) {
11953 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11954 return;
11955 }
11956
11957 // Perform specific checks for each particular instruction kind.
11958 bool IsScatterVectorizeUserTE =
11959 UserTreeIdx.UserTE &&
11960 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11961 OrdersType CurrentOrder;
11962 SmallVector<Value *> PointerOps;
11963 StridedPtrInfo SPtrInfo;
11964 TreeEntry::EntryState State = getScalarsVectorizationState(
11965 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11966 if (State == TreeEntry::NeedToGather) {
11967 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11968 return;
11969 }
11970
11971 Instruction *VL0 = S.getMainOp();
11972 BasicBlock *BB = VL0->getParent();
11973 auto &BSRef = BlocksSchedules[BB];
11974 if (!BSRef)
11975 BSRef = std::make_unique<BlockScheduling>(BB);
11976
11977 BlockScheduling &BS = *BSRef;
11978
11979 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11980 std::optional<ScheduleBundle *> BundlePtr =
11981 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11982#ifdef EXPENSIVE_CHECKS
11983 // Make sure we didn't break any internal invariants
11984 BS.verify();
11985#endif
11986 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11987 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11988 // Last chance to try to vectorize alternate node.
11989 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11990 return;
11991 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11992 NonScheduledFirst.insert(VL.front());
11993 if (S.getOpcode() == Instruction::Load &&
11994 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11995 registerNonVectorizableLoads(VL);
11996 return;
11997 }
11998 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11999 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
12000 ScheduleBundle Empty;
12001 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
12002 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
12003
12004 unsigned ShuffleOrOp =
12005 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
12006 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
12007 // Postpone the creation of PHI operand nodes.
12008 SmallVector<unsigned> PHIOps;
12009 for (unsigned I : seq<unsigned>(Operands.size())) {
12010 ArrayRef<Value *> Op = Operands[I];
12011 if (Op.empty())
12012 continue;
12013 InstructionsState S = getSameOpcode(Op, *TLI);
12014 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12015 buildTreeRec(Op, Depth + 1, {TE, I});
12016 else
12017 PHIOps.push_back(I);
12018 }
12019 for (unsigned I : PHIOps)
12020 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12021 };
12022 switch (ShuffleOrOp) {
12023 case Instruction::PHI: {
12024 TreeEntry *TE =
12025 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12026 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
12027 TE->dump());
12028
12029 TE->setOperands(Operands);
12030 CreateOperandNodes(TE, Operands);
12031 return;
12032 }
12033 case Instruction::ExtractValue:
12034 case Instruction::ExtractElement: {
12035 if (CurrentOrder.empty()) {
12036 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
12037 } else {
12038 LLVM_DEBUG({
12039 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
12040 "with order";
12041 for (unsigned Idx : CurrentOrder)
12042 dbgs() << " " << Idx;
12043 dbgs() << "\n";
12044 });
12045 fixupOrderingIndices(CurrentOrder);
12046 }
12047 // Insert new order with initial value 0, if it does not exist,
12048 // otherwise return the iterator to the existing one.
12049 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12050 ReuseShuffleIndices, CurrentOrder);
12051 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
12052 "(ExtractValueInst/ExtractElementInst).\n";
12053 TE->dump());
12054 // This is a special case, as it does not gather, but at the same time
12055 // we are not extending buildTreeRec() towards the operands.
12056 TE->setOperands(Operands);
12057 return;
12058 }
12059 case Instruction::InsertElement: {
12060 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
12061
12062 auto OrdCompare = [](const std::pair<int, int> &P1,
12063 const std::pair<int, int> &P2) {
12064 return P1.first > P2.first;
12065 };
12066 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
12067 decltype(OrdCompare)>
12068 Indices(OrdCompare);
12069 for (int I = 0, E = VL.size(); I < E; ++I) {
12070 unsigned Idx = *getElementIndex(VL[I]);
12071 Indices.emplace(Idx, I);
12072 }
12073 OrdersType CurrentOrder(VL.size(), VL.size());
12074 bool IsIdentity = true;
12075 for (int I = 0, E = VL.size(); I < E; ++I) {
12076 CurrentOrder[Indices.top().second] = I;
12077 IsIdentity &= Indices.top().second == I;
12078 Indices.pop();
12079 }
12080 if (IsIdentity)
12081 CurrentOrder.clear();
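// For illustration: inserts into lanes <2, 0, 1> (in bundle order) are
// popped in lane order 0, 1, 2 and produce CurrentOrder = {2, 0, 1}; a
// bundle already in ascending lane order is an identity and CurrentOrder
// is cleared.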
12082 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12083 {}, CurrentOrder);
12084 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
12085 TE->dump());
12086
12087 TE->setOperands(Operands);
12088 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
12089 return;
12090 }
12091 case Instruction::Load: {
12092 // Check that a vectorized load would load the same memory as a scalar
12093 // load. For example, we don't want to vectorize loads that are smaller
12094 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>},
12095 // LLVM treats loading/storing it as an i8 struct. If we vectorize
12096 // loads/stores from such a struct, we read/write packed bits that
12097 // disagree with the unvectorized version.
12098 TreeEntry *TE = nullptr;
12099 fixupOrderingIndices(CurrentOrder);
12100 switch (State) {
12101 case TreeEntry::Vectorize:
12102 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12103 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
12104 if (CurrentOrder.empty())
12105 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
12106 TE->dump());
12107 else
12108 LLVM_DEBUG(dbgs()
12109 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
12110 TE->dump());
12111 break;
12112 case TreeEntry::CompressVectorize:
12113 // Vectorizing non-consecutive loads with (masked)load + compress.
12114 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
12115 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12116 LLVM_DEBUG(
12117 dbgs()
12118 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12119 TE->dump());
12120 break;
12121 case TreeEntry::StridedVectorize:
12122 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12123 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
12124 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12125 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
12126 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
12127 TE->dump());
12128 break;
12129 case TreeEntry::ScatterVectorize:
12130 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12131 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
12132 UserTreeIdx, ReuseShuffleIndices);
12133 LLVM_DEBUG(
12134 dbgs()
12135 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12136 TE->dump());
12137 break;
12138 case TreeEntry::CombinedVectorize:
12139 case TreeEntry::SplitVectorize:
12140 case TreeEntry::NeedToGather:
12141 llvm_unreachable("Unexpected loads state.");
12142 }
12143 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12144 assert(Operands.size() == 1 && "Expected a single operand only");
12145 SmallVector<int> Mask;
12146 inversePermutation(CurrentOrder, Mask);
12147 reorderScalars(Operands.front(), Mask);
12148 }
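// For a jumbled bundle of consecutive loads the single (pointer) operand
// list is permuted into memory order above, so the emitted wide load stays
// consecutive and the jumbling is kept only in CurrentOrder.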
12149 TE->setOperands(Operands);
12150 if (State == TreeEntry::ScatterVectorize)
12151 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
12152 return;
12153 }
12154 case Instruction::ZExt:
12155 case Instruction::SExt:
12156 case Instruction::FPToUI:
12157 case Instruction::FPToSI:
12158 case Instruction::FPExt:
12159 case Instruction::PtrToInt:
12160 case Instruction::IntToPtr:
12161 case Instruction::SIToFP:
12162 case Instruction::UIToFP:
12163 case Instruction::Trunc:
12164 case Instruction::FPTrunc:
12165 case Instruction::BitCast: {
12166 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12167 std::make_pair(std::numeric_limits<unsigned>::min(),
12168 std::numeric_limits<unsigned>::max()));
12169 if (ShuffleOrOp == Instruction::ZExt ||
12170 ShuffleOrOp == Instruction::SExt) {
12171 CastMaxMinBWSizes = std::make_pair(
12172 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12173 PrevMaxBW),
12174 std::min<unsigned>(
12175 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12176 PrevMinBW));
12177 } else if (ShuffleOrOp == Instruction::Trunc) {
12178 CastMaxMinBWSizes = std::make_pair(
12179 std::max<unsigned>(
12180 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12181 PrevMaxBW),
12182 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12183 PrevMinBW));
12184 }
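// For illustration: a bundle of "zext i8 %x to i32" records the pair
// {max = 32, min = 8}, merged with any previously seen cast bundle; these
// bounds feed the later bitwidth-minimization analysis.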
12185 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12186 ReuseShuffleIndices);
12187 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
12188 TE->dump());
12189
12190 TE->setOperands(Operands);
12191 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12192 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12193 if (ShuffleOrOp == Instruction::Trunc) {
12194 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12195 } else if (ShuffleOrOp == Instruction::SIToFP ||
12196 ShuffleOrOp == Instruction::UIToFP) {
12197 unsigned NumSignBits =
12198 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12199 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
12200 APInt Mask = DB->getDemandedBits(OpI);
12201 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
12202 }
12203 if (NumSignBits * 2 >=
12204 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12205 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12206 }
12207 return;
12208 }
12209 case Instruction::ICmp:
12210 case Instruction::FCmp: {
12211 // Check that all of the compares have the same predicate.
12212 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12213 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12214 ReuseShuffleIndices);
12215 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
12216 TE->dump());
12217
12218 VLOperands Ops(VL, Operands, S, *this);
12219 if (cast<CmpInst>(VL0)->isCommutative()) {
12220 // Commutative predicate - collect + sort operands of the instructions
12221 // so that each side is more likely to have the same opcode.
12222 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
12223 "Commutative Predicate mismatch");
12224 Ops.reorder();
12225 Operands.front() = Ops.getVL(0);
12226 Operands.back() = Ops.getVL(1);
12227 } else {
12228 // Collect operands - commute if it uses the swapped predicate.
12229 for (auto [Idx, V] : enumerate(VL)) {
12230 if (isa<PoisonValue>(V))
12231 continue;
12232 auto *Cmp = cast<CmpInst>(V);
12233 if (Cmp->getPredicate() != P0)
12234 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12235 }
12236 }
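// For illustration, with main predicate sgt: a lane "icmp slt %c, %d" uses
// the swapped predicate, so its operands are exchanged above and every
// lane can then be vectorized as sgt.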
12237 TE->setOperands(Operands);
12238 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12239 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12240 if (ShuffleOrOp == Instruction::ICmp) {
12241 unsigned NumSignBits0 =
12242 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12243 if (NumSignBits0 * 2 >=
12244 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12245 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12246 unsigned NumSignBits1 =
12247 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
12248 if (NumSignBits1 * 2 >=
12249 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
12250 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
12251 }
12252 return;
12253 }
12254 case Instruction::Select:
12255 case Instruction::FNeg:
12256 case Instruction::Add:
12257 case Instruction::FAdd:
12258 case Instruction::Sub:
12259 case Instruction::FSub:
12260 case Instruction::Mul:
12261 case Instruction::FMul:
12262 case Instruction::UDiv:
12263 case Instruction::SDiv:
12264 case Instruction::FDiv:
12265 case Instruction::URem:
12266 case Instruction::SRem:
12267 case Instruction::FRem:
12268 case Instruction::Shl:
12269 case Instruction::LShr:
12270 case Instruction::AShr:
12271 case Instruction::And:
12272 case Instruction::Or:
12273 case Instruction::Xor:
12274 case Instruction::Freeze: {
12275 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12276 ReuseShuffleIndices);
12277 LLVM_DEBUG(
12278 dbgs() << "SLP: added a new TreeEntry "
12279 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12280 TE->dump());
12281
12282 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
12283 VLOperands Ops(VL, Operands, S, *this);
12284 Ops.reorder();
12285 Operands[0] = Ops.getVL(0);
12286 Operands[1] = Ops.getVL(1);
12287 }
12288 TE->setOperands(Operands);
12289 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12290 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12291 return;
12292 }
12293 case Instruction::GetElementPtr: {
12294 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12295 ReuseShuffleIndices);
12296 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12297 TE->dump());
12298 TE->setOperands(Operands);
12299
12300 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12301 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12302 return;
12303 }
12304 case Instruction::Store: {
12305 bool Consecutive = CurrentOrder.empty();
12306 if (!Consecutive)
12307 fixupOrderingIndices(CurrentOrder);
12308 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12309 ReuseShuffleIndices, CurrentOrder);
12310 if (Consecutive)
12311 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12312 TE->dump());
12313 else
12314 LLVM_DEBUG(
12315 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12316 TE->dump());
12317 TE->setOperands(Operands);
12318 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
12319 return;
12320 }
12321 case Instruction::Call: {
12322 // Check if the calls are all to the same vectorizable intrinsic or
12323 // library function.
12324 CallInst *CI = cast<CallInst>(VL0);
12325 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12326
12327 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12328 ReuseShuffleIndices);
12329 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12330 TE->dump());
12331 if (isCommutative(VL0)) {
12332 VLOperands Ops(VL, Operands, S, *this);
12333 Ops.reorder();
12334 Operands[0] = Ops.getVL(0);
12335 Operands[1] = Ops.getVL(1);
12336 }
12337 TE->setOperands(Operands);
12338 for (unsigned I : seq<unsigned>(CI->arg_size())) {
12339 // For scalar operands there is no need to create an entry, since they
12340 // are not vectorized.
12341 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
12342 continue;
12343 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12344 }
12345 return;
12346 }
12347 case Instruction::ShuffleVector: {
12348 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12349 ReuseShuffleIndices);
12350 if (S.isAltShuffle()) {
12351 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12352 TE->dump());
12353 } else {
12354 assert(SLPReVec && "Only supported by REVEC.");
12355 LLVM_DEBUG(
12356 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12357 TE->dump());
12358 }
12359
12360 // Reorder operands if reordering would enable vectorization.
12361 auto *CI = dyn_cast<CmpInst>(VL0);
12362 if (CI && any_of(VL, [](Value *V) {
12363 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
12364 })) {
12365 auto *MainCI = cast<CmpInst>(S.getMainOp());
12366 auto *AltCI = cast<CmpInst>(S.getAltOp());
12367 CmpInst::Predicate MainP = MainCI->getPredicate();
12368 CmpInst::Predicate AltP = AltCI->getPredicate();
12369 assert(MainP != AltP &&
12370 "Expected different main/alternate predicates.");
12371 // Collect operands - commute if it uses the swapped predicate or
12372 // alternate operation.
12373 for (auto [Idx, V] : enumerate(VL)) {
12374 if (isa<PoisonValue>(V))
12375 continue;
12376 auto *Cmp = cast<CmpInst>(V);
12377
12378 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
12379 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12380 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12381 } else {
12382 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12383 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12384 }
12385 }
12386 TE->setOperands(Operands);
12387 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12388 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12389 return;
12390 }
12391
12392 if (isa<BinaryOperator>(VL0) || CI) {
12393 VLOperands Ops(VL, Operands, S, *this);
12394 Ops.reorder();
12395 Operands[0] = Ops.getVL(0);
12396 Operands[1] = Ops.getVL(1);
12397 }
12398 TE->setOperands(Operands);
12399 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12400 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12401 return;
12402 }
12403 default:
12404 break;
12405 }
12406 llvm_unreachable("Unexpected vectorization of the instructions.");
12407}
12408
12409unsigned BoUpSLP::canMapToVector(Type *T) const {
12410 unsigned N = 1;
12411 Type *EltTy = T;
12412
12413 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12414 if (EltTy->isEmptyTy())
12415 return 0;
12416 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12417 // Check that struct is homogeneous.
12418 for (const auto *Ty : ST->elements())
12419 if (Ty != *ST->element_begin())
12420 return 0;
12421 N *= ST->getNumElements();
12422 EltTy = *ST->element_begin();
12423 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12424 N *= AT->getNumElements();
12425 EltTy = AT->getElementType();
12426 } else {
12427 auto *VT = cast<FixedVectorType>(EltTy);
12428 N *= VT->getNumElements();
12429 EltTy = VT->getElementType();
12430 }
12431 }
12432
12433 if (!isValidElementType(EltTy))
12434 return 0;
12435 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12436 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12437 VTSize != DL->getTypeStoreSizeInBits(T))
12438 return 0;
12439 return N;
12440}
12441
12442bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12443 SmallVectorImpl<unsigned> &CurrentOrder,
12444 bool ResizeAllowed) const {
12445 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12446 assert(It != VL.end() && "Expected at least one extract instruction.");
12447 auto *E0 = cast<Instruction>(*It);
12448 assert(
12449 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
12450 "Invalid opcode");
12451 // Check if all of the extracts come from the same vector and from the
12452 // correct offset.
12453 Value *Vec = E0->getOperand(0);
12454
12455 CurrentOrder.clear();
12456
12457 // We have to extract from a vector/aggregate with the same number of elements.
12458 unsigned NElts;
12459 if (E0->getOpcode() == Instruction::ExtractValue) {
12460 NElts = canMapToVector(Vec->getType());
12461 if (!NElts)
12462 return false;
12463 // Check if load can be rewritten as load of vector.
12464 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12465 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12466 return false;
12467 } else {
12468 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12469 }
12470
12471 unsigned E = VL.size();
12472 if (!ResizeAllowed && NElts != E)
12473 return false;
12474 SmallVector<int> Indices(E, PoisonMaskElem);
12475 unsigned MinIdx = NElts, MaxIdx = 0;
12476 for (auto [I, V] : enumerate(VL)) {
12477 auto *Inst = dyn_cast<Instruction>(V);
12478 if (!Inst)
12479 continue;
12480 if (Inst->getOperand(0) != Vec)
12481 return false;
12482 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12483 if (isa<UndefValue>(EE->getIndexOperand()))
12484 continue;
12485 std::optional<unsigned> Idx = getExtractIndex(Inst);
12486 if (!Idx)
12487 return false;
12488 const unsigned ExtIdx = *Idx;
12489 if (ExtIdx >= NElts)
12490 continue;
12491 Indices[I] = ExtIdx;
12492 if (MinIdx > ExtIdx)
12493 MinIdx = ExtIdx;
12494 if (MaxIdx < ExtIdx)
12495 MaxIdx = ExtIdx;
12496 }
12497 if (MaxIdx - MinIdx + 1 > E)
12498 return false;
12499 if (MaxIdx + 1 <= E)
12500 MinIdx = 0;
12501
12502 // Check that all of the indices extract from the correct offset.
12503 bool ShouldKeepOrder = true;
12504 // Assign to all items the initial value E so we can check if the extract
12505 // instruction index was used already.
12506 // Also, later we can check that all the indices are used and we have a
12507 // consecutive access in the extract instructions, by checking that no
12508 // element of CurrentOrder still has value E.
12509 CurrentOrder.assign(E, E);
12510 for (unsigned I = 0; I < E; ++I) {
12511 if (Indices[I] == PoisonMaskElem)
12512 continue;
12513 const unsigned ExtIdx = Indices[I] - MinIdx;
12514 if (CurrentOrder[ExtIdx] != E) {
12515 CurrentOrder.clear();
12516 return false;
12517 }
12518 ShouldKeepOrder &= ExtIdx == I;
12519 CurrentOrder[ExtIdx] = I;
12520 }
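// For illustration: extract indices <1, 0, 3, 2> (after the MinIdx
// adjustment) fill CurrentOrder = {1, 0, 3, 2} and make ShouldKeepOrder
// false; indices <0, 1, 2, 3> keep ShouldKeepOrder true, so the order is
// dropped below.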
12521 if (ShouldKeepOrder)
12522 CurrentOrder.clear();
12523
12524 return ShouldKeepOrder;
12525}
12526
12527bool BoUpSLP::areAllUsersVectorized(
12528 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12529 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12530 all_of(I->users(), [this](User *U) {
12531 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12532 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12533 });
12534}
12535
12536void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12537 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12538 SmallVectorImpl<Value *> *OpScalars,
12539 SmallVectorImpl<Value *> *AltScalars) const {
12540 unsigned Sz = Scalars.size();
12541 Mask.assign(Sz, PoisonMaskElem);
12542 SmallVector<int> OrderMask;
12543 if (!ReorderIndices.empty())
12544 inversePermutation(ReorderIndices, OrderMask);
12545 for (unsigned I = 0; I < Sz; ++I) {
12546 unsigned Idx = I;
12547 if (!ReorderIndices.empty())
12548 Idx = OrderMask[I];
12549 if (isa<PoisonValue>(Scalars[Idx]))
12550 continue;
12551 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12552 if (IsAltOp(OpInst)) {
12553 Mask[I] = Sz + Idx;
12554 if (AltScalars)
12555 AltScalars->push_back(OpInst);
12556 } else {
12557 Mask[I] = Idx;
12558 if (OpScalars)
12559 OpScalars->push_back(OpInst);
12560 }
12561 }
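// For illustration (no reorder/reuse indices): Scalars = {add, sub, add,
// sub} with IsAltOp matching the subs produces Mask = {0, Sz + 1, 2,
// Sz + 3}: main-op lanes select from the first vector, alternate lanes
// from the second.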
12562 if (!ReuseShuffleIndices.empty()) {
12563 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12564 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12565 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12566 });
12567 Mask.swap(NewMask);
12568 }
12569}
12570
12571 static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12572 Instruction *AltOp,
12573 const TargetLibraryInfo &TLI) {
12574 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12575}
12576
12577 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12578 Instruction *AltOp,
12579 const TargetLibraryInfo &TLI) {
12580 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12581 auto *AltCI = cast<CmpInst>(AltOp);
12582 CmpInst::Predicate MainP = MainCI->getPredicate();
12583 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12584 assert(MainP != AltP && "Expected different main/alternate predicates.");
12585 auto *CI = cast<CmpInst>(I);
12586 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12587 return false;
12588 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12589 return true;
12590 CmpInst::Predicate P = CI->getPredicate();
12591 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12592
12593 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12594 "CmpInst expected to match either main or alternate predicate or "
12595 "their swap.");
12596 return MainP != P && MainP != SwappedP;
12597 }
12598 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12599}
12600
12601TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12602 assert(!Ops.empty());
12603 const auto *Op0 = Ops.front();
12604
12605 const bool IsConstant = all_of(Ops, [](Value *V) {
12606 // TODO: We should allow undef elements here
12607 return isConstant(V) && !isa<UndefValue>(V);
12608 });
12609 const bool IsUniform = all_of(Ops, [=](Value *V) {
12610 // TODO: We should allow undef elements here
12611 return V == Op0;
12612 });
12613 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12614 // TODO: We should allow undef elements here
12615 if (auto *CI = dyn_cast<ConstantInt>(V))
12616 return CI->getValue().isPowerOf2();
12617 return false;
12618 });
12619 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12620 // TODO: We should allow undef elements here
12621 if (auto *CI = dyn_cast<ConstantInt>(V))
12622 return CI->getValue().isNegatedPowerOf2();
12623 return false;
12624 });
12625
12626 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12627 if (IsConstant && IsUniform)
12628 VK = TTI::OK_UniformConstantValue;
12629 else if (IsConstant)
12630 VK = TTI::OK_NonUniformConstantValue;
12631 else if (IsUniform)
12632 VK = TTI::OK_UniformValue;
12633
12634 TTI::OperandValueProperties VP = TTI::OP_None;
12635 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12636 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12637
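// For illustration: operands {4, 4, 4, 4} yield
// {OK_UniformConstantValue, OP_PowerOf2}, while {1, 2, 4, 8} yield
// {OK_NonUniformConstantValue, OP_PowerOf2}.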
12638 return {VK, VP};
12639}
12640
12641namespace {
12642/// The base class for shuffle instruction emission and shuffle cost estimation.
12643class BaseShuffleAnalysis {
12644protected:
12645 Type *ScalarTy = nullptr;
12646
12647 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12648
12649 /// V is expected to be a vectorized value.
12650 /// When REVEC is disabled, there is no difference between VF and
12651 /// VNumElements.
12652 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12653 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12654 /// of 8.
12655 unsigned getVF(Value *V) const {
12656 assert(V && "V cannot be nullptr");
12657 assert(isa<FixedVectorType>(V->getType()) &&
12658 "V does not have FixedVectorType");
12659 assert(ScalarTy && "ScalarTy cannot be nullptr");
12660 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12661 unsigned VNumElements =
12662 cast<FixedVectorType>(V->getType())->getNumElements();
12663 assert(VNumElements > ScalarTyNumElements &&
12664 "the number of elements of V is not large enough");
12665 assert(VNumElements % ScalarTyNumElements == 0 &&
12666 "the number of elements of V is not a vectorized value");
12667 return VNumElements / ScalarTyNumElements;
12668 }
12669
12670 /// Checks if the mask is an identity mask.
12671 /// \param IsStrict if is true the function returns false if mask size does
12672 /// not match vector size.
12673 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12674 bool IsStrict) {
12675 int Limit = Mask.size();
12676 int VF = VecTy->getNumElements();
12677 int Index = -1;
12678 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12679 return true;
12680 if (!IsStrict) {
12681 // Consider extract subvector starting from index 0.
12682 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12683 Index == 0)
12684 return true;
12685 // All VF-size submasks are identity (e.g.
12686 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12687 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12688 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12689 return all_of(Slice, equal_to(PoisonMaskElem)) ||
12690 ShuffleVectorInst::isIdentityMask(Slice, VF);
12691 }))
12692 return true;
12693 }
12694 return false;
12695 }
12696
12697 /// Tries to combine 2 different masks into single one.
12698 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12699 /// change the size of the vector, \p LocalVF is the original size of the
12700 /// shuffled vector.
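/// For illustration: LocalVF = 2, Mask = <1, 0>, ExtMask = <1, 0, 3, 2>
/// gives the combined mask <0, 1, 0, 1>: each ExtMask lane is redirected
/// through Mask and reduced modulo LocalVF.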
12701 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12702 ArrayRef<int> ExtMask) {
12703 unsigned VF = Mask.size();
12704 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12705 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12706 if (ExtMask[I] == PoisonMaskElem)
12707 continue;
12708 int MaskedIdx = Mask[ExtMask[I] % VF];
12709 NewMask[I] =
12710 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12711 }
12712 Mask.swap(NewMask);
12713 }
12714
12715 /// Looks through shuffles trying to reduce final number of shuffles in the
12716 /// code. The function looks through the previously emitted shuffle
12717 /// instructions and properly marks indices in the mask as undef.
12718 /// For example, given the code
12719 /// \code
12720 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12721 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12722 /// \endcode
12723 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12724 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12725 /// <0, 1, 2, 3> for the shuffle.
12726 /// If 2 operands are of different size, the smallest one will be resized and
12727 /// the mask recalculated properly.
12728 /// For example, given the code
12729 /// \code
12730 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12731 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12732 /// \endcode
12733 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12734 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12735 /// <0, 1, 2, 3> for the shuffle.
12736 /// So, it tries to transform permutations to simple vector merge, if
12737 /// possible.
12738 /// \param V The input vector which must be shuffled using the given \p Mask.
12739 /// If the better candidate is found, \p V is set to this best candidate
12740 /// vector.
12741 /// \param Mask The input mask for the shuffle. If the best candidate is found
12742 /// during looking-through-shuffles attempt, it is updated accordingly.
12743 /// \param SinglePermute true if the shuffle operation is originally a
12744 /// single-value-permutation. In this case the look-through-shuffles procedure
12745 /// may look for resizing shuffles as the best candidates.
12746 /// \return true if the shuffle results in the non-resizing identity shuffle
12747 /// (and thus can be ignored), false - otherwise.
12748 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12749 bool SinglePermute) {
12750 Value *Op = V;
12751 ShuffleVectorInst *IdentityOp = nullptr;
12752 SmallVector<int> IdentityMask;
12753 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12754 // Exit if not a fixed vector type or changing size shuffle.
12755 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12756 if (!SVTy)
12757 break;
12758 // Remember the identity or broadcast mask, if it is not a resizing
12759 // shuffle. If no better candidates are found, this Op and Mask will be
12760 // used in the final shuffle.
12761 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12762 if (!IdentityOp || !SinglePermute ||
12763 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12765 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask, IdentityMask.size()))) {
12766 IdentityOp = SV;
12767 // Store the current mask in IdentityMask so that we do not lose this
12768 // info later if IdentityOp is selected as the best candidate for the
12769 // permutation.
12770 IdentityMask.assign(Mask);
12771 }
12772 }
12773 // Remember the broadcast mask. If no better candidates are found, this Op
12774 // and Mask will be used in the final shuffle.
12775 // Zero splat can be used as identity too, since it might be used with
12776 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12777 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
12778 // is expensive, and the analysis finds out that the source vector is just
12779 // a broadcast, the original mask can be transformed to the identity mask
12780 // <0, 1, 2, 3>.
12781 // \code
12782 // %0 = shuffle %v, poison, zeroinitializer
12783 // %res = shuffle %0, poison, <3, 1, 2, 0>
12784 // \endcode
12785 // may be transformed to
12786 // \code
12787 // %0 = shuffle %v, poison, zeroinitializer
12788 // %res = shuffle %0, poison, <0, 1, 2, 3>
12789 // \endcode
12790 if (SV->isZeroEltSplat()) {
12791 IdentityOp = SV;
12792 IdentityMask.assign(Mask);
12793 }
12794 int LocalVF = Mask.size();
12795 if (auto *SVOpTy =
12796 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12797 LocalVF = SVOpTy->getNumElements();
12798 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12799 for (auto [Idx, I] : enumerate(Mask)) {
12800 if (I == PoisonMaskElem ||
12801 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12802 continue;
12803 ExtMask[Idx] = SV->getMaskValue(I);
12804 }
12805 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12806 SV->getOperand(0),
12807 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12808 .all();
12809 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12810 SV->getOperand(1),
12811 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12812 .all();
12813 if (!IsOp1Undef && !IsOp2Undef) {
12814 // Update mask and mark undef elems.
12815 for (int &I : Mask) {
12816 if (I == PoisonMaskElem)
12817 continue;
12818 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12819 PoisonMaskElem)
12820 I = PoisonMaskElem;
12821 }
12822 break;
12823 }
12824 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12825 combineMasks(LocalVF, ShuffleMask, Mask);
12826 Mask.swap(ShuffleMask);
12827 if (IsOp2Undef)
12828 Op = SV->getOperand(0);
12829 else
12830 Op = SV->getOperand(1);
12831 }
12832 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12833 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12834 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12835 if (IdentityOp) {
12836 V = IdentityOp;
12837 assert(Mask.size() == IdentityMask.size() &&
12838 "Expected masks of same sizes.");
12839 // Clear known poison elements.
12840 for (auto [I, Idx] : enumerate(Mask))
12841 if (Idx == PoisonMaskElem)
12842 IdentityMask[I] = PoisonMaskElem;
12843 Mask.swap(IdentityMask);
12844 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12845 return SinglePermute &&
12846 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12847 /*IsStrict=*/true) ||
12848 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12849 Shuffle->isZeroEltSplat() &&
12850 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12851 all_of(enumerate(Mask), [&](const auto &P) {
12852 return P.value() == PoisonMaskElem ||
12853 Shuffle->getShuffleMask()[P.index()] == 0;
12854 })));
12855 }
12856 V = Op;
12857 return false;
12858 }
12859 V = Op;
12860 return true;
12861 }
12862
12863 /// Smart shuffle instruction emission, walks through shuffles trees and
12864 /// tries to find the best matching vector for the actual shuffle
12865 /// instruction.
12866 template <typename T, typename ShuffleBuilderTy>
12867 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12868 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12869 assert(V1 && "Expected at least one vector value.");
12870 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12871 SmallVector<int> NewMask(Mask);
12872 if (ScalarTyNumElements != 1) {
12873 assert(SLPReVec && "FixedVectorType is not expected.");
12874 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12875 Mask = NewMask;
12876 }
12877 if (V2)
12878 Builder.resizeToMatch(V1, V2);
12879 int VF = Mask.size();
12880 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12881 VF = FTy->getNumElements();
12882 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12883 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12884 .all()) {
12885 // Peek through shuffles.
12886 Value *Op1 = V1;
12887 Value *Op2 = V2;
12888 int VF =
12889 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12890 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12891 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12892 for (int I = 0, E = Mask.size(); I < E; ++I) {
12893 if (Mask[I] < VF)
12894 CombinedMask1[I] = Mask[I];
12895 else
12896 CombinedMask2[I] = Mask[I] - VF;
12897 }
12898 Value *PrevOp1;
12899 Value *PrevOp2;
12900 do {
12901 PrevOp1 = Op1;
12902 PrevOp2 = Op2;
12903 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12904 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12905 // Check if we have 2 resizing shuffles - need to peek through operands
12906 // again.
12907 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12908 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12909 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12910 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12911 if (I == PoisonMaskElem)
12912 continue;
12913 ExtMask1[Idx] = SV1->getMaskValue(I);
12914 }
12915 SmallBitVector UseMask1 = buildUseMask(
12916 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12917 ->getNumElements(),
12918 ExtMask1, UseMask::SecondArg);
12919 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12920 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12921 if (I == PoisonMaskElem)
12922 continue;
12923 ExtMask2[Idx] = SV2->getMaskValue(I);
12924 }
12925 SmallBitVector UseMask2 = buildUseMask(
12926 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12927 ->getNumElements(),
12928 ExtMask2, UseMask::SecondArg);
12929 if (SV1->getOperand(0)->getType() ==
12930 SV2->getOperand(0)->getType() &&
12931 SV1->getOperand(0)->getType() != SV1->getType() &&
12932 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12933 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12934 Op1 = SV1->getOperand(0);
12935 Op2 = SV2->getOperand(0);
12936 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12937 int LocalVF = ShuffleMask1.size();
12938 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12939 LocalVF = FTy->getNumElements();
12940 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12941 CombinedMask1.swap(ShuffleMask1);
12942 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12943 LocalVF = ShuffleMask2.size();
12944 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12945 LocalVF = FTy->getNumElements();
12946 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12947 CombinedMask2.swap(ShuffleMask2);
12948 }
12949 }
12950 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12951 Builder.resizeToMatch(Op1, Op2);
12952 VF = std::max(cast<VectorType>(Op1->getType())
12953 ->getElementCount()
12954 .getKnownMinValue(),
12955 cast<VectorType>(Op2->getType())
12956 ->getElementCount()
12957 .getKnownMinValue());
12958 for (int I = 0, E = Mask.size(); I < E; ++I) {
12959 if (CombinedMask2[I] != PoisonMaskElem) {
12960 assert(CombinedMask1[I] == PoisonMaskElem &&
12961 "Expected undefined mask element");
12962 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12963 }
12964 }
12965 if (Op1 == Op2 &&
12966 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12967 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12968 isa<ShuffleVectorInst>(Op1) &&
12969 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12970 ArrayRef(CombinedMask1))))
12971 return Builder.createIdentity(Op1);
12972 return Builder.createShuffleVector(
12973 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12974 CombinedMask1);
12975 }
12976 if (isa<PoisonValue>(V1))
12977 return Builder.createPoison(
12978 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12979 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12980 assert(V1 && "Expected non-null value after looking through shuffles.");
12981
12982 if (!IsIdentity)
12983 return Builder.createShuffleVector(V1, NewMask);
12984 return Builder.createIdentity(V1);
12985 }
12986
12987 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12988 /// shuffle emission.
12989 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12990 ArrayRef<int> Mask) {
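// For illustration: with Mask = {1, poison, 0, poison} the used lanes 0
// and 2 of CommonMask are reset to 0 and 2 (they now come directly from
// the just-emitted shuffle), while the poison lanes keep their previous
// values.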
12991 for (unsigned I : seq<unsigned>(CommonMask.size()))
12992 if (Mask[I] != PoisonMaskElem)
12993 CommonMask[I] = I;
12994 }
12995};
12996} // namespace
12997
12998/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
12999static std::pair<InstructionCost, InstructionCost>
13000 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
13001 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
13002 Type *ScalarTy, VectorType *VecTy) {
13003 InstructionCost ScalarCost = 0;
13004 InstructionCost VecCost = 0;
13005 // Here we differentiate two cases: (1) when Ptrs represent a regular
13006 // vectorization tree node (as they are pointer arguments of scattered
13007 // loads) or (2) when Ptrs are the arguments of loads or stores being
13008 // vectorized as a plain wide unit-stride load/store since all the
13009 // loads/stores are known to be from/to adjacent locations.
13010 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13011 // Case 2: estimate costs for pointer related costs when vectorizing to
13012 // a wide load/store.
13013 // Scalar cost is estimated as a set of pointers with known relationship
13014 // between them.
13015 // For vector code we will use BasePtr as argument for the wide load/store
13016 // but we also need to account all the instructions which are going to
13017 // stay in vectorized code due to uses outside of these scalar
13018 // loads/stores.
13019 ScalarCost = TTI.getPointersChainCost(
13020 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13021 CostKind);
13022
13023 SmallVector<const Value *> PtrsRetainedInVecCode;
13024 for (Value *V : Ptrs) {
13025 if (V == BasePtr) {
13026 PtrsRetainedInVecCode.push_back(V);
13027 continue;
13028 }
13029 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13030 // For simplicity assume Ptr stays in the vectorized code if it's not a
13031 // GEP instruction. We don't care, since its cost is considered free.
13032 // TODO: We should check for any uses outside of vectorizable tree
13033 // rather than just single use.
13034 if (!Ptr || !Ptr->hasOneUse())
13035 PtrsRetainedInVecCode.push_back(V);
13036 }
13037
13038 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
13039 // If all pointers stay in vectorized code then we don't have
13040 // any savings on that.
13041 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
13042 }
13043 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
13044 TTI::PointersChainInfo::getKnownStride(),
13045 VecTy, CostKind);
13046 } else {
13047 // Case 1: Ptrs are the arguments of loads that we are going to transform
13048 // into masked gather load intrinsic.
13049 // All the scalar GEPs will be removed as a result of vectorization.
13050 // For any external uses of some lanes extract element instructions will
13051 // be generated (which cost is estimated separately).
13052 TTI::PointersChainInfo PtrsInfo =
13053 all_of(Ptrs,
13054 [](const Value *V) {
13055 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13056 return Ptr && !Ptr->hasAllConstantIndices();
13057 })
13058 ? TTI::PointersChainInfo::getUnknownStride()
13059 : TTI::PointersChainInfo::getKnownStride();
13060
13061 ScalarCost =
13062 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
13063 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
13064 if (!BaseGEP) {
13065 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
13066 if (It != Ptrs.end())
13067 BaseGEP = cast<GEPOperator>(*It);
13068 }
13069 if (BaseGEP) {
13070 SmallVector<const Value *> Indices(BaseGEP->indices());
13071 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
13072 BaseGEP->getPointerOperand(), Indices, VecTy,
13073 CostKind);
13074 }
13075 }
13076
13077 return std::make_pair(ScalarCost, VecCost);
13078}
13079
13080void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
13081 assert(TE.isGather() && TE.ReorderIndices.empty() &&
13082 "Expected gather node without reordering.");
13083 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
13084 SmallSet<size_t, 2> LoadKeyUsed;
13085
13086 // Do not reorder the node if it is small (just 2 elements), all-constant,
13087 // or all its instructions already have the same opcode.
13088 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
13089 all_of(TE.Scalars, isConstant))
13090 return;
13091
13092 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
13093 return VectorizableTree[Idx]->isSame(TE.Scalars);
13094 }))
13095 return;
13096
13097 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
13098 Key = hash_combine(hash_value(LI->getParent()), Key);
13099 Value *Ptr =
13100 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
13101 if (LoadKeyUsed.contains(Key)) {
13102 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
13103 if (LIt != LoadsMap.end()) {
13104 for (LoadInst *RLI : LIt->second) {
13105 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
13106 LI->getType(), LI->getPointerOperand(), *DL, *SE,
13107 /*StrictCheck=*/true))
13108 return hash_value(RLI->getPointerOperand());
13109 }
13110 for (LoadInst *RLI : LIt->second) {
13112 LI->getPointerOperand(), *TLI)) {
13113 hash_code SubKey = hash_value(RLI->getPointerOperand());
13114 return SubKey;
13115 }
13116 }
13117 if (LIt->second.size() > 2) {
13118 hash_code SubKey =
13119 hash_value(LIt->second.back()->getPointerOperand());
13120 return SubKey;
13121 }
13122 }
13123 }
13124 LoadKeyUsed.insert(Key);
13125 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
13126 return hash_value(LI->getPointerOperand());
13127 };
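// For illustration: a gather like {load a[0], %x, load a[1], %y} is
// re-clustered below so that the two related loads become adjacent, giving
// them a chance to form a vectorizable sub-node.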
13128 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
13129 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
13130 bool IsOrdered = true;
13131 unsigned NumInstructions = 0;
13132 // Try to "cluster" scalar instructions, to be able to build extra vectorized
13133 // nodes.
13134 for (auto [I, V] : enumerate(TE.Scalars)) {
13135 size_t Key = 1, Idx = 1;
13136 if (auto *Inst = dyn_cast<Instruction>(V);
13137 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
13138 !isDeleted(Inst) && !isVectorized(V)) {
13139 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
13140 /*AllowAlternate=*/false);
13141 ++NumInstructions;
13142 }
13143 auto &Container = SortedValues[Key];
13144 if (IsOrdered && !KeyToIndex.contains(V) &&
13145 !(isa<Constant, ExtractElementInst>(V) ||
13146 isVectorLikeInstWithConstOps(V)) &&
13147 ((Container.contains(Idx) &&
13148 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
13149 (!Container.empty() && !Container.contains(Idx) &&
13150 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
13151 IsOrdered = false;
13152 auto &KTI = KeyToIndex[V];
13153 if (KTI.empty())
13154 Container[Idx].push_back(V);
13155 KTI.push_back(I);
13156 }
13157 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
13158 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13159 if (!IsOrdered && NumInstructions > 1) {
13160 unsigned Cnt = 0;
13161 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
13162 for (const auto &D : SortedValues) {
13163 for (const auto &P : D.second) {
13164 unsigned Sz = 0;
13165 for (Value *V : P.second) {
13166 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
13167 for (auto [K, Idx] : enumerate(Indices)) {
13168 TE.ReorderIndices[Cnt + K] = Idx;
13169 TE.Scalars[Cnt + K] = V;
13170 }
13171 Sz += Indices.size();
13172 Cnt += Indices.size();
13173 }
13174 if (Sz > 1 && isa<Instruction>(P.second.front())) {
13175 const unsigned SubVF = getFloorFullVectorNumberOfElements(
13176 *TTI, TE.Scalars.front()->getType(), Sz);
13177 SubVectors.emplace_back(Cnt - Sz, SubVF);
13178 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
13179 DemandedElts.clearBit(I);
13180 } else if (!P.second.empty() && isConstant(P.second.front())) {
13181 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
13182 DemandedElts.clearBit(I);
13183 }
13184 }
13185 }
13186 }
13187 // Reuses always require shuffles, so consider it as profitable.
13188 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
13189 return;
13190 // Do simple cost estimation.
13191 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13192 InstructionCost Cost = 0;
13193 auto *ScalarTy = TE.Scalars.front()->getType();
13194 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
13195 for (auto [Idx, Sz] : SubVectors) {
13196 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
13197 Idx, getWidenedType(ScalarTy, Sz));
13198 }
13199 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13200 /*Insert=*/true,
13201 /*Extract=*/false, CostKind);
13202 int Sz = TE.Scalars.size();
13203 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
13204 TE.ReorderIndices.end());
13205 for (unsigned I : seq<unsigned>(Sz)) {
13206 Value *V = TE.getOrdered(I);
13207 if (isa<PoisonValue>(V)) {
13208 ReorderMask[I] = PoisonMaskElem;
13209 } else if (isConstant(V) || DemandedElts[I]) {
13210 ReorderMask[I] = I + TE.ReorderIndices.size();
13211 }
13212 }
13213 Cost += ::getShuffleCost(*TTI,
13214 any_of(ReorderMask, [&](int I) { return I >= Sz; })
13215 ? TTI::SK_PermuteTwoSrc
13216 : TTI::SK_PermuteSingleSrc,
13217 VecTy, ReorderMask);
13218 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13219 ReorderMask.assign(Sz, PoisonMaskElem);
13220 for (unsigned I : seq<unsigned>(Sz)) {
13221 Value *V = TE.getOrdered(I);
13222 if (isConstant(V)) {
13223 DemandedElts.clearBit(I);
13224 if (!isa<PoisonValue>(V))
13225 ReorderMask[I] = I;
13226 } else {
13227 ReorderMask[I] = I + Sz;
13228 }
13229 }
13230 InstructionCost BVCost =
13231 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13232 /*Insert=*/true, /*Extract=*/false, CostKind);
13233 if (!DemandedElts.isAllOnes())
13234 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
13235 if (Cost >= BVCost) {
13236 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
13237 reorderScalars(TE.Scalars, Mask);
13238 TE.ReorderIndices.clear();
13239 }
13240}
13241
13242/// Check if we can convert fadd/fsub sequence to FMAD.
13243/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
13244 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
13245 const InstructionsState &S,
13246 DominatorTree &DT, const DataLayout &DL,
13247 const TargetTransformInfo &TTI,
13248 const TargetLibraryInfo &TLI) {
13249 assert(all_of(VL,
13250 [](Value *V) {
13251 return V->getType()->getScalarType()->isFloatingPointTy();
13252 }) &&
13253 "Can only convert to FMA for floating point types");
13254 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
13255
13256 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
13257 FastMathFlags FMF;
13258 FMF.set();
13259 for (Value *V : VL) {
13260 auto *I = dyn_cast<Instruction>(V);
13261 if (!I)
13262 continue;
13263 if (S.isCopyableElement(I))
13264 continue;
13265 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
13266 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
13267 continue;
13268 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13269 FMF &= FPCI->getFastMathFlags();
13270 }
13271 return FMF.allowContract();
13272 };
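// For illustration: a bundle of "fadd contract (fmul contract %a, %b), %c"
// lanes is a candidate, because both the fadd and the feeding fmul allow
// contraction, so the pair may be costed below as a single llvm.fmuladd.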
13273 if (!CheckForContractable(VL))
13274 return InstructionCost::getInvalid();
13275 // fmul also should be contractable
13276 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
13277 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
13278
13279 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
13280 if (!OpS.valid())
13281 return InstructionCost::getInvalid();
13282
13283 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13284 return InstructionCost::getInvalid();
13285 if (!CheckForContractable(Operands.front()))
13286 return InstructionCost::getInvalid();
13287 // Compare the costs.
13288 InstructionCost FMulPlusFAddCost = 0;
13289 InstructionCost FMACost = 0;
13290 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13291 FastMathFlags FMF;
13292 FMF.set();
13293 for (Value *V : VL) {
13294 auto *I = dyn_cast<Instruction>(V);
13295 if (!I)
13296 continue;
13297 if (!S.isCopyableElement(I))
13298 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13299 FMF &= FPCI->getFastMathFlags();
13300 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13301 }
13302 unsigned NumOps = 0;
13303 for (auto [V, Op] : zip(VL, Operands.front())) {
13304 if (S.isCopyableElement(V))
13305 continue;
13306 auto *I = dyn_cast<Instruction>(Op);
13307 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13308 if (auto *OpI = dyn_cast<Instruction>(V))
13309 FMACost += TTI.getInstructionCost(OpI, CostKind);
13310 if (I)
13311 FMACost += TTI.getInstructionCost(I, CostKind);
13312 continue;
13313 }
13314 ++NumOps;
13315 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13316 FMF &= FPCI->getFastMathFlags();
13317 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13318 }
13319 Type *Ty = VL.front()->getType();
13320 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13321 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13322 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13323}
13324
13325 void BoUpSLP::transformNodes() {
13326 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13327 BaseGraphSize = VectorizableTree.size();
13328 // Turn graph transforming mode on and off, when done.
13329 class GraphTransformModeRAAI {
13330 bool &SavedIsGraphTransformMode;
13331
13332 public:
13333 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13334 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13335 IsGraphTransformMode = true;
13336 }
13337 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13338 } TransformContext(IsGraphTransformMode);
13339 // Operands are profitable if they are:
13340 // 1. At least one constant
13341 // or
13342 // 2. Splats
13343 // or
13344 // 3. Results in good vectorization opportunity, i.e. may generate vector
13345 // nodes and reduce cost of the graph.
13346 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13347 const InstructionsState &S) {
13349 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13350 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13351 I2->getOperand(Op));
13352 return all_of(
13353 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13354 return all_of(Cand,
13355 [](const std::pair<Value *, Value *> &P) {
13356 return isa<Constant>(P.first) ||
13357 isa<Constant>(P.second) || P.first == P.second;
13358 }) ||
13359 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
13360 });
13361 };
13362
13363 // Try to reorder gather nodes for better vectorization opportunities.
13364 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13365 TreeEntry &E = *VectorizableTree[Idx];
13366 if (E.isGather())
13367 reorderGatherNode(E);
13368 }
13369
13370 // Better to use the full gathered-loads analysis if there are only 2
13371 // gathered load nodes, each having fewer than 16 elements.
13372 constexpr unsigned VFLimit = 16;
13373 bool ForceLoadGather =
13374 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13375 return TE->isGather() && TE->hasState() &&
13376 TE->getOpcode() == Instruction::Load &&
13377 TE->getVectorFactor() < VFLimit;
13378 }) == 2;
13379
13380 // Checks if the scalars are used in other node.
13381 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13382 function_ref<bool(Value *)> CheckContainer) {
13383 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13384 if (isa<PoisonValue>(V))
13385 return true;
13386 auto *I = dyn_cast<Instruction>(V);
13387 if (!I)
13388 return false;
13389 return is_contained(TE->Scalars, I) || CheckContainer(I);
13390 });
13391 };
13392 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13393 if (E.hasState()) {
13394 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13395 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13396 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13397 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13398 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13399 return is_contained(TEs, TE);
13400 });
13401 });
13402 }))
13403 return true;
13404 ;
13405 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13406 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13407 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13408 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13409 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13410 return is_contained(TEs, TE);
13411 });
13412 });
13413 }))
13414 return true;
13415 } else {
13416 // Check if the gather node is a full copy of a split node.
13417 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13418 if (It != E.Scalars.end()) {
13419 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13420 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13421 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13422 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13423 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13424 return is_contained(TEs, TE);
13425 });
13426 });
13427 }))
13428 return true;
13429 }
13430 }
13431 return false;
13432 };
13433 // The tree may grow here, so iterate over nodes, built before.
13434 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13435 TreeEntry &E = *VectorizableTree[Idx];
13436 if (E.isGather()) {
13437 ArrayRef<Value *> VL = E.Scalars;
13438 const unsigned Sz = getVectorElementSize(VL.front());
13439 unsigned MinVF = getMinVF(2 * Sz);
13440 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13441 // same opcode and same parent block or all constants.
13442 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13443 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13444 // We use allSameOpcode instead of isAltShuffle because we don't
13445 // want to use interchangeable instructions here.
13446 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13447 allConstant(VL) || isSplat(VL))
13448 continue;
13449 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13450 continue;
13451 // Check if the node is a copy of other vector nodes.
13452 if (CheckForSameVectorNodes(E))
13453 continue;
13454 // Try to find vectorizable sequences and transform them into a series of
13455 // insertvector instructions.
13456 unsigned StartIdx = 0;
13457 unsigned End = VL.size();
13458 SmallBitVector Processed(End);
13459 for (unsigned VF = getFloorFullVectorNumberOfElements(
13460 *TTI, VL.front()->getType(), VL.size() - 1);
13461 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13462 *TTI, VL.front()->getType(), VF - 1)) {
13463 if (StartIdx + VF > End)
13464 continue;
13465 SmallVector<std::pair<unsigned, unsigned>> Slices;
13466 bool AllStrided = true;
13467 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13468 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13469 // If any instruction is vectorized already - do not try again.
13470 // Reuse the existing node, if it fully matches the slice.
13471 if ((Processed.test(Cnt) || isVectorized(Slice.front())) &&
13472 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13473 continue;
13474 // Constants are already handled effectively - skip.
13475 if (allConstant(Slice))
13476 continue;
13477 // Do not try to vectorize small splats (smaller than a vector register
13478 // and with only a single non-undef element).
13479 bool IsSplat = isSplat(Slice);
13480 bool IsTwoRegisterSplat = true;
13481 if (IsSplat && VF == 2) {
13482 unsigned NumRegs2VF = ::getNumberOfParts(
13483 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13484 IsTwoRegisterSplat = NumRegs2VF == 2;
13485 }
13486 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13487 count(Slice, Slice.front()) ==
13488 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13489 : 1)) {
13490 if (IsSplat)
13491 continue;
13492 InstructionsState S = getSameOpcode(Slice, *TLI);
13493 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13494 (S.getOpcode() == Instruction::Load &&
13496 (S.getOpcode() != Instruction::Load &&
13497 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13498 continue;
13499 if (VF == 2) {
13500 // Try to vectorize reduced values, or if all users are vectorized.
13501 // For expensive instructions, extra extracts might be profitable.
13502 if ((!UserIgnoreList || E.Idx != 0) &&
13503 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13505 !all_of(Slice, [&](Value *V) {
13506 if (isa<PoisonValue>(V))
13507 return true;
13508 return areAllUsersVectorized(cast<Instruction>(V),
13509 UserIgnoreList);
13510 }))
13511 continue;
13512 if (S.getOpcode() == Instruction::Load) {
13513 OrdersType Order;
13514 SmallVector<Value *> PointerOps;
13515 StridedPtrInfo SPtrInfo;
13516 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13517 PointerOps, SPtrInfo);
13518 AllStrided &= Res == LoadsState::StridedVectorize ||
13520 Res == LoadsState::Gather;
13521 // Do not vectorize gathers.
13522 if (Res == LoadsState::ScatterVectorize ||
13523 Res == LoadsState::Gather) {
13524 if (Res == LoadsState::Gather) {
13526 // If this is a reduction and the scalars from the root node are
13527 // being analyzed - mark as a non-vectorizable reduction.
13528 if (UserIgnoreList && E.Idx == 0)
13529 analyzedReductionVals(Slice);
13530 }
13531 continue;
13532 }
13533 } else if (S.getOpcode() == Instruction::ExtractElement ||
13534 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13536 !CheckOperandsProfitability(
13537 S.getMainOp(),
13540 S))) {
13541 // Do not vectorize extractelements (handled effectively
13542 // already). Do not vectorize non-profitable instructions (with
13543 // low cost and non-vectorizable operands).
13544 continue;
13545 }
13546 }
13547 }
13548 Slices.emplace_back(Cnt, Slice.size());
13549 }
13550 // Do not try to vectorize if all slices are strided or gathered with
13551 // vector factor 2 and there are more than 2 slices. It is better to handle
13552 // them in the gathered-loads analysis, which may result in better vectorization.
13553 if (VF == 2 && AllStrided && Slices.size() > 2)
13554 continue;
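// Record a vectorized slice as a combined sub-entry of this gather node and
// shrink the not-yet-processed [StartIdx, End) range accordingly.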
13555 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13556 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13557 Processed.set(Cnt, Cnt + Sz);
13558 if (StartIdx == Cnt)
13559 StartIdx = Cnt + Sz;
13560 if (End == Cnt + Sz)
13561 End = Cnt;
13562 };
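// Build a sub-tree for every collected slice; if the attempt only produces
// another gather node, roll it back.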
13563 for (auto [Cnt, Sz] : Slices) {
13564 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13565 const TreeEntry *SameTE = nullptr;
13566 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13567 It != Slice.end()) {
13568 // If any instruction is vectorized already - do not try again.
13569 SameTE = getSameValuesTreeEntry(*It, Slice);
13570 }
13571 unsigned PrevSize = VectorizableTree.size();
13572 [[maybe_unused]] unsigned PrevEntriesSize =
13573 LoadEntriesToVectorize.size();
13574 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13575 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13576 VectorizableTree[PrevSize]->isGather() &&
13577 VectorizableTree[PrevSize]->hasState() &&
13578 VectorizableTree[PrevSize]->getOpcode() !=
13579 Instruction::ExtractElement &&
13580 !isSplat(Slice)) {
13581 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13582 analyzedReductionVals(Slice);
13583 VectorizableTree.pop_back();
13584 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13585 "LoadEntriesToVectorize expected to remain the same");
13586 continue;
13587 }
13588 AddCombinedNode(PrevSize, Cnt, Sz);
13589 }
13590 }
13591 // Restore ordering, if no extra vectorization happened.
13592 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13593 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13594 reorderScalars(E.Scalars, Mask);
13595 E.ReorderIndices.clear();
13596 }
13597 }
13598 if (!E.hasState())
13599 continue;
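// Post-process vectorized nodes: convert reversed consecutive loads/stores
// into strided accesses, detect interleaved stores, recognize min/max
// patterns in selects, and combine fmul + fadd/fsub into fmuladd where
// profitable.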
13600 switch (E.getOpcode()) {
13601 case Instruction::Load: {
13602 // No need to reorder masked gather loads, just reorder the scalar
13603 // operands.
13604 if (E.State != TreeEntry::Vectorize)
13605 break;
13606 Type *ScalarTy = E.getMainOp()->getType();
13607 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13608 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13609 // Check if profitable to represent consecutive load + reverse as strided
13610 // load with stride -1.
13611 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13612 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13613 SmallVector<int> Mask;
13614 inversePermutation(E.ReorderIndices, Mask);
13615 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13616 InstructionCost OriginalVecCost =
13617 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13618 BaseLI->getPointerAddressSpace(), CostKind,
13620 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13621 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13622 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13623 VecTy, BaseLI->getPointerOperand(),
13624 /*VariableMask=*/false, CommonAlignment,
13625 BaseLI),
13626 CostKind);
13627 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13628 // Strided load is more profitable than consecutive load + reverse -
13629 // transform the node to strided load.
13630 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13631 ->getPointerOperand()
13632 ->getType());
13633 StridedPtrInfo SPtrInfo;
13634 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13635 SPtrInfo.Ty = VecTy;
13636 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13637 E.State = TreeEntry::StridedVectorize;
13638 }
13639 }
13640 break;
13641 }
13642 case Instruction::Store: {
13643 Type *ScalarTy =
13644 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13645 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13646 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13647 // Check if profitable to represent consecutive store + reverse as strided
13648 // store with stride -1.
13649 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13650 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13651 SmallVector<int> Mask;
13652 inversePermutation(E.ReorderIndices, Mask);
13653 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13654 InstructionCost OriginalVecCost =
13655 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13656 BaseSI->getPointerAddressSpace(), CostKind,
13658 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13659 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13660 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13661 VecTy, BaseSI->getPointerOperand(),
13662 /*VariableMask=*/false, CommonAlignment,
13663 BaseSI),
13664 CostKind);
13665 if (StridedCost < OriginalVecCost)
13666 // Strided store is more profitable than reverse + consecutive store -
13667 // transform the node to strided store.
13668 E.State = TreeEntry::StridedVectorize;
13669 } else if (!E.ReorderIndices.empty()) {
13670 // Check for interleaved stores.
13671 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13672 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13673 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13674 if (Mask.size() < 4)
13675 return 0u;
13676 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13678 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13679 TTI.isLegalInterleavedAccessType(
13680 VecTy, Factor, BaseSI->getAlign(),
13681 BaseSI->getPointerAddressSpace()))
13682 return Factor;
13683 }
13684
13685 return 0u;
13686 };
13687 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13688 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13689 if (InterleaveFactor != 0)
13690 E.setInterleave(InterleaveFactor);
13691 }
13692 break;
13693 }
13694 case Instruction::Select: {
13695 if (E.State != TreeEntry::Vectorize)
13696 break;
13697 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13698 if (MinMaxID == Intrinsic::not_intrinsic)
13699 break;
13700 // This node is a minmax node.
13701 E.CombinedOp = TreeEntry::MinMax;
13702 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13703 if (SelectOnly && CondEntry->UserTreeIndex &&
13704 CondEntry->State == TreeEntry::Vectorize) {
13705 // The condition node is part of the combined minmax node.
13706 CondEntry->State = TreeEntry::CombinedVectorize;
13707 }
13708 break;
13709 }
13710 case Instruction::FSub:
13711 case Instruction::FAdd: {
13712 // Check if possible to convert (a*b)+c to fma.
13713 if (E.State != TreeEntry::Vectorize ||
13714 !E.getOperations().isAddSubLikeOp())
13715 break;
13716 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13717 .isValid())
13718 break;
13719 // This node is a fmuladd node.
13720 E.CombinedOp = TreeEntry::FMulAdd;
13721 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13722 if (FMulEntry->UserTreeIndex &&
13723 FMulEntry->State == TreeEntry::Vectorize) {
13724 // The FMul node is part of the combined fmuladd node.
13725 FMulEntry->State = TreeEntry::CombinedVectorize;
13726 }
13727 break;
13728 }
13729 default:
13730 break;
13731 }
13732 }
13733
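// If no load entries were deferred for later vectorization, exit early for
// graphs that are too small to benefit from the gathered-loads analysis below.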
13734 if (LoadEntriesToVectorize.empty()) {
13735 // Single load node - exit.
13736 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13737 VectorizableTree.front()->getOpcode() == Instruction::Load)
13738 return;
13739 // Small graph with small VF - exit.
13740 constexpr unsigned SmallTree = 3;
13741 constexpr unsigned SmallVF = 2;
13742 if ((VectorizableTree.size() <= SmallTree &&
13743 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13744 (VectorizableTree.size() <= 2 && UserIgnoreList))
13745 return;
13746
13747 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13748 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13749 getCanonicalGraphSize() <= SmallTree &&
13750 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13751 [](const std::unique_ptr<TreeEntry> &TE) {
13752 return TE->isGather() && TE->hasState() &&
13753 TE->getOpcode() == Instruction::Load &&
13754 !allSameBlock(TE->Scalars);
13755 }) == 1)
13756 return;
13757 }
13758
13759 // A list of loads to be gathered during the vectorization process. We can
13760 // try to vectorize them at the end, if profitable.
13761 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13763 GatheredLoads;
13764
13765 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13766 TreeEntry &E = *TE;
13767 if (E.isGather() &&
13768 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13769 (!E.hasState() && any_of(E.Scalars,
13770 [&](Value *V) {
13771 return isa<LoadInst>(V) &&
13772 !isVectorized(V) &&
13773 !isDeleted(cast<Instruction>(V));
13774 }))) &&
13775 !isSplat(E.Scalars)) {
13776 for (Value *V : E.Scalars) {
13777 auto *LI = dyn_cast<LoadInst>(V);
13778 if (!LI)
13779 continue;
13780 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13781 continue;
13783 *this, V, *DL, *SE, *TTI,
13784 GatheredLoads[std::make_tuple(
13785 LI->getParent(),
13786 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13787 LI->getType())]);
13788 }
13789 }
13790 }
13791 // Try to vectorize gathered loads if this is not just a gather of loads.
13792 if (!GatheredLoads.empty())
13793 tryToVectorizeGatheredLoads(GatheredLoads);
13794}
13795
13796 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13797 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
13798 /// emission: the actual shuffle instruction is generated only if it is really
13799 /// required. Otherwise, shuffle instruction emission is delayed until the end
13800 /// of the process, to reduce the number of emitted instructions and to enable
13801 /// further analysis/transformations.
13802class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13803 bool IsFinalized = false;
13804 SmallVector<int> CommonMask;
13806 const TargetTransformInfo &TTI;
13807 InstructionCost Cost = 0;
13808 SmallDenseSet<Value *> VectorizedVals;
13809 BoUpSLP &R;
13810 SmallPtrSetImpl<Value *> &CheckedExtracts;
13811 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13812 /// While set, we are still trying to estimate the cost for the same nodes and
13813 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13814 /// This may help to better estimate the cost if the same nodes must be permuted
13815 /// and allows moving most of the long shuffle cost estimation to TTI.
13816 bool SameNodesEstimated = true;
13817
13818 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13819 if (Ty->getScalarType()->isPointerTy()) {
13822 IntegerType::get(Ty->getContext(),
13823 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13824 Ty->getScalarType());
13825 if (auto *VTy = dyn_cast<VectorType>(Ty))
13826 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13827 return Res;
13828 }
13829 return Constant::getAllOnesValue(Ty);
13830 }
13831
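/// Estimates the cost of building a vector from \p VL: free for all-undef
/// lists and, when no \p Root is given, for all-constant lists; a broadcast
/// for splats; a full gather otherwise.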
13832 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13833 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13834 return TTI::TCC_Free;
13835 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13836 InstructionCost GatherCost = 0;
13837 SmallVector<Value *> Gathers(VL);
13838 if (!Root && isSplat(VL)) {
13839 // Found a broadcast of a single scalar; calculate the cost as the cost
13840 // of the broadcast.
13841 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13842 assert(It != VL.end() && "Expected at least one non-undef value.");
13843 // Add broadcast for non-identity shuffle only.
13844 bool NeedShuffle =
13845 count(VL, *It) > 1 &&
13846 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13847 if (!NeedShuffle) {
13848 if (isa<FixedVectorType>(ScalarTy)) {
13849 assert(SLPReVec && "FixedVectorType is not expected.");
13850 return TTI.getShuffleCost(
13851 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13852 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13853 cast<FixedVectorType>(ScalarTy));
13854 }
13855 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13856 CostKind, std::distance(VL.begin(), It),
13857 PoisonValue::get(VecTy), *It);
13858 }
13859
13860 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13861 transform(VL, ShuffleMask.begin(), [](Value *V) {
13862 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13863 });
13864 InstructionCost InsertCost =
13865 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13866 PoisonValue::get(VecTy), *It);
13867 return InsertCost + ::getShuffleCost(TTI,
13869 VecTy, ShuffleMask, CostKind,
13870 /*Index=*/0, /*SubTp=*/nullptr,
13871 /*Args=*/*It);
13872 }
13873 return GatherCost +
13874 (all_of(Gathers, IsaPred<UndefValue>)
13876 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13877 ScalarTy));
13878 };
13879
13880 /// Compute the cost of creating a vector containing the extracted values from
13881 /// \p VL.
13883 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13884 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13885 unsigned NumParts) {
13886 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13887 unsigned NumElts =
13888 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13889 auto *EE = dyn_cast<ExtractElementInst>(V);
13890 if (!EE)
13891 return Sz;
13892 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13893 if (!VecTy)
13894 return Sz;
13895 return std::max(Sz, VecTy->getNumElements());
13896 });
13897 // FIXME: this must be moved to TTI for better estimation.
13898 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
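// Checks whether the extracts for one destination register come from at most
// two source registers; if so, remaps the sub-mask to per-register element
// indices and reports the resulting shuffle kind.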
13899 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13901 SmallVectorImpl<unsigned> &SubVecSizes)
13902 -> std::optional<TTI::ShuffleKind> {
13903 if (NumElts <= EltsPerVector)
13904 return std::nullopt;
13905 int OffsetReg0 =
13906 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13907 [](int S, int I) {
13908 if (I == PoisonMaskElem)
13909 return S;
13910 return std::min(S, I);
13911 }),
13912 EltsPerVector);
13913 int OffsetReg1 = OffsetReg0;
13914 DenseSet<int> RegIndices;
13915 // Check if we are trying to permute the same single or two input vectors.
13917 int FirstRegId = -1;
13918 Indices.assign(1, OffsetReg0);
13919 for (auto [Pos, I] : enumerate(Mask)) {
13920 if (I == PoisonMaskElem)
13921 continue;
13922 int Idx = I - OffsetReg0;
13923 int RegId =
13924 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13925 if (FirstRegId < 0)
13926 FirstRegId = RegId;
13927 RegIndices.insert(RegId);
13928 if (RegIndices.size() > 2)
13929 return std::nullopt;
13930 if (RegIndices.size() == 2) {
13931 ShuffleKind = TTI::SK_PermuteTwoSrc;
13932 if (Indices.size() == 1) {
13933 OffsetReg1 = alignDown(
13934 std::accumulate(
13935 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13936 [&](int S, int I) {
13937 if (I == PoisonMaskElem)
13938 return S;
13939 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13940 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13941 if (RegId == FirstRegId)
13942 return S;
13943 return std::min(S, I);
13944 }),
13945 EltsPerVector);
13946 unsigned Index = OffsetReg1 % NumElts;
13947 Indices.push_back(Index);
13948 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13949 }
13950 Idx = I - OffsetReg1;
13951 }
13952 I = (Idx % NumElts) % EltsPerVector +
13953 (RegId == FirstRegId ? 0 : EltsPerVector);
13954 }
13955 return ShuffleKind;
13956 };
13957 InstructionCost Cost = 0;
13958
13959 // Process extracts in blocks of EltsPerVector to check if the source vector
13960 // operand can be re-used directly. If not, add the cost of creating a
13961 // shuffle to extract the values into a vector register.
13962 for (unsigned Part : seq<unsigned>(NumParts)) {
13963 if (!ShuffleKinds[Part])
13964 continue;
13965 ArrayRef<int> MaskSlice = Mask.slice(
13966 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13967 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13968 copy(MaskSlice, SubMask.begin());
13970 SmallVector<unsigned, 2> SubVecSizes;
13971 std::optional<TTI::ShuffleKind> RegShuffleKind =
13972 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13973 if (!RegShuffleKind) {
13974 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13976 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13977 Cost +=
13978 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13979 getWidenedType(ScalarTy, NumElts), MaskSlice);
13980 continue;
13981 }
13982 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13983 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13984 Cost +=
13985 ::getShuffleCost(TTI, *RegShuffleKind,
13986 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13987 }
13988 const unsigned BaseVF = getFullVectorNumberOfElements(
13989 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13990 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13991 assert((Idx + SubVecSize) <= BaseVF &&
13992 "SK_ExtractSubvector index out of range");
13994 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13995 Idx, getWidenedType(ScalarTy, SubVecSize));
13996 }
13997 // Second attempt: check if a plain permute is estimated to be cheaper than
13998 // the subvector extracts.
13999 SubMask.assign(NumElts, PoisonMaskElem);
14000 copy(MaskSlice, SubMask.begin());
14001 InstructionCost OriginalCost = ::getShuffleCost(
14002 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
14003 if (OriginalCost < Cost)
14004 Cost = OriginalCost;
14005 }
14006 return Cost;
14007 }
14008 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
14009 /// mask \p Mask, register number \p Part, that includes \p SliceSize
14010 /// elements.
14011 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
14012 ArrayRef<int> Mask, unsigned Part,
14013 unsigned SliceSize) {
14014 if (SameNodesEstimated) {
14015 // Delay the cost estimation if the same nodes are being reshuffled.
14016 // If we already requested the cost of reshuffling E1 and E2 before, there is
14017 // no need to estimate another cost with the sub-Mask; instead, include this
14018 // sub-Mask into the CommonMask to estimate it later and avoid double cost
14019 // estimation.
14020 if ((InVectors.size() == 2 &&
14021 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
14022 cast<const TreeEntry *>(InVectors.back()) == E2) ||
14023 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
14024 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
14025 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
14026 [](int Idx) { return Idx == PoisonMaskElem; }) &&
14027 "Expected all poisoned elements.");
14028 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
14029 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
14030 return;
14031 }
14032 // Found non-matching nodes - need to estimate the cost for the matched
14033 // nodes and transform the mask.
14034 Cost += createShuffle(InVectors.front(),
14035 InVectors.size() == 1 ? nullptr : InVectors.back(),
14036 CommonMask);
14037 transformMaskAfterShuffle(CommonMask, CommonMask);
14038 } else if (InVectors.size() == 2) {
14039 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14040 transformMaskAfterShuffle(CommonMask, CommonMask);
14041 }
14042 SameNodesEstimated = false;
14043 if (!E2 && InVectors.size() == 1) {
14044 unsigned VF = E1.getVectorFactor();
14045 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
14046 VF = std::max(VF, getVF(V1));
14047 } else {
14048 const auto *E = cast<const TreeEntry *>(InVectors.front());
14049 VF = std::max(VF, E->getVectorFactor());
14050 }
14051 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14052 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14053 CommonMask[Idx] = Mask[Idx] + VF;
14054 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
14055 transformMaskAfterShuffle(CommonMask, CommonMask);
14056 } else {
14057 auto P = InVectors.front();
14058 Cost += createShuffle(&E1, E2, Mask);
14059 unsigned VF = Mask.size();
14060 if (Value *V1 = dyn_cast<Value *>(P)) {
14061 VF = std::max(VF,
14062 getNumElements(V1->getType()));
14063 } else {
14064 const auto *E = cast<const TreeEntry *>(P);
14065 VF = std::max(VF, E->getVectorFactor());
14066 }
14067 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14068 if (Mask[Idx] != PoisonMaskElem)
14069 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
14070 Cost += createShuffle(P, InVectors.front(), CommonMask);
14071 transformMaskAfterShuffle(CommonMask, CommonMask);
14072 }
14073 }
14074
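/// Helper that implements the shuffle-builder callbacks in terms of TTI
/// costs: empty and identity masks are free, anything else is priced as a
/// single-source or two-source permute.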
14075 class ShuffleCostBuilder {
14076 const TargetTransformInfo &TTI;
14077
14078 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14079 int Index = -1;
14080 return Mask.empty() ||
14081 (VF == Mask.size() &&
14084 Index == 0);
14085 }
14086
14087 public:
14088 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14089 ~ShuffleCostBuilder() = default;
14090 InstructionCost createShuffleVector(Value *V1, Value *,
14091 ArrayRef<int> Mask) const {
14092 // Empty mask or identity mask are free.
14093 unsigned VF =
14094 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14095 if (isEmptyOrIdentity(Mask, VF))
14096 return TTI::TCC_Free;
14097 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14098 cast<VectorType>(V1->getType()), Mask);
14099 }
14100 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14101 // Empty mask or identity mask are free.
14102 unsigned VF =
14103 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14104 if (isEmptyOrIdentity(Mask, VF))
14105 return TTI::TCC_Free;
14106 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
14107 cast<VectorType>(V1->getType()), Mask);
14108 }
14109 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14110 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14111 return TTI::TCC_Free;
14112 }
14113 void resizeToMatch(Value *&, Value *&) const {}
14114 };
14115
14116 /// Smart shuffle instruction emission: walks through the shuffle trees and
14117 /// tries to find the best matching vector for the actual shuffle
14118 /// instruction.
14120 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14122 ArrayRef<int> Mask) {
14123 ShuffleCostBuilder Builder(TTI);
14124 SmallVector<int> CommonMask(Mask);
14125 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14126 unsigned CommonVF = Mask.size();
14127 InstructionCost ExtraCost = 0;
14128 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14129 unsigned VF) -> InstructionCost {
14130 if (E.isGather() && allConstant(E.Scalars))
14131 return TTI::TCC_Free;
14132 Type *EScalarTy = E.Scalars.front()->getType();
14133 bool IsSigned = true;
14134 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
14135 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
14136 IsSigned = It->second.second;
14137 }
14138 if (EScalarTy != ScalarTy) {
14139 unsigned CastOpcode = Instruction::Trunc;
14140 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14141 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14142 if (DstSz > SrcSz)
14143 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14144 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
14145 getWidenedType(EScalarTy, VF),
14146 TTI::CastContextHint::None, CostKind);
14147 }
14148 return TTI::TCC_Free;
14149 };
14150 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14151 if (isa<Constant>(V))
14152 return TTI::TCC_Free;
14153 auto *VecTy = cast<VectorType>(V->getType());
14154 Type *EScalarTy = VecTy->getElementType();
14155 if (EScalarTy != ScalarTy) {
14156 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
14157 unsigned CastOpcode = Instruction::Trunc;
14158 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14159 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14160 if (DstSz > SrcSz)
14161 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14162 return TTI.getCastInstrCost(
14163 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
14164 VecTy, TTI::CastContextHint::None, CostKind);
14165 }
14166 return TTI::TCC_Free;
14167 };
14168 if (!V1 && !V2 && !P2.isNull()) {
14169 // Shuffle 2 entry nodes.
14170 const TreeEntry *E = cast<const TreeEntry *>(P1);
14171 unsigned VF = E->getVectorFactor();
14172 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14173 CommonVF = std::max(VF, E2->getVectorFactor());
14174 assert(all_of(Mask,
14175 [=](int Idx) {
14176 return Idx < 2 * static_cast<int>(CommonVF);
14177 }) &&
14178 "All elements in mask must be less than 2 * CommonVF.");
14179 if (E->Scalars.size() == E2->Scalars.size()) {
14180 SmallVector<int> EMask = E->getCommonMask();
14181 SmallVector<int> E2Mask = E2->getCommonMask();
14182 if (!EMask.empty() || !E2Mask.empty()) {
14183 for (int &Idx : CommonMask) {
14184 if (Idx == PoisonMaskElem)
14185 continue;
14186 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14187 Idx = EMask[Idx];
14188 else if (Idx >= static_cast<int>(CommonVF))
14189 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14190 E->Scalars.size();
14191 }
14192 }
14193 CommonVF = E->Scalars.size();
14194 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14195 GetNodeMinBWAffectedCost(*E2, CommonVF);
14196 } else {
14197 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14198 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14199 }
14200 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14201 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14202 } else if (!V1 && P2.isNull()) {
14203 // Shuffle single entry node.
14204 const TreeEntry *E = cast<const TreeEntry *>(P1);
14205 unsigned VF = E->getVectorFactor();
14206 CommonVF = VF;
14207 assert(
14208 all_of(Mask,
14209 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14210 "All elements in mask must be less than CommonVF.");
14211 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14212 SmallVector<int> EMask = E->getCommonMask();
14213 assert(!EMask.empty() && "Expected non-empty common mask.");
14214 for (int &Idx : CommonMask) {
14215 if (Idx != PoisonMaskElem)
14216 Idx = EMask[Idx];
14217 }
14218 CommonVF = E->Scalars.size();
14219 } else if (unsigned Factor = E->getInterleaveFactor();
14220 Factor > 0 && E->Scalars.size() != Mask.size() &&
14222 Factor)) {
14223 // Deinterleaved nodes are free.
14224 std::iota(CommonMask.begin(), CommonMask.end(), 0);
14225 }
14226 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14227 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14228 // Not identity/broadcast? Try to see if the original vector is better.
14229 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14230 CommonVF == CommonMask.size() &&
14231 any_of(enumerate(CommonMask),
14232 [](const auto &&P) {
14233 return P.value() != PoisonMaskElem &&
14234 static_cast<unsigned>(P.value()) != P.index();
14235 }) &&
14236 any_of(CommonMask,
14237 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14238 SmallVector<int> ReorderMask;
14239 inversePermutation(E->ReorderIndices, ReorderMask);
14240 ::addMask(CommonMask, ReorderMask);
14241 }
14242 } else if (V1 && P2.isNull()) {
14243 // Shuffle single vector.
14244 ExtraCost += GetValueMinBWAffectedCost(V1);
14245 CommonVF = getVF(V1);
14246 assert(
14247 all_of(Mask,
14248 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14249 "All elements in mask must be less than CommonVF.");
14250 } else if (V1 && !V2) {
14251 // Shuffle vector and tree node.
14252 unsigned VF = getVF(V1);
14253 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14254 CommonVF = std::max(VF, E2->getVectorFactor());
14255 assert(all_of(Mask,
14256 [=](int Idx) {
14257 return Idx < 2 * static_cast<int>(CommonVF);
14258 }) &&
14259 "All elements in mask must be less than 2 * CommonVF.");
14260 if (E2->Scalars.size() == VF && VF != CommonVF) {
14261 SmallVector<int> E2Mask = E2->getCommonMask();
14262 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14263 for (int &Idx : CommonMask) {
14264 if (Idx == PoisonMaskElem)
14265 continue;
14266 if (Idx >= static_cast<int>(CommonVF))
14267 Idx = E2Mask[Idx - CommonVF] + VF;
14268 }
14269 CommonVF = VF;
14270 }
14271 ExtraCost += GetValueMinBWAffectedCost(V1);
14272 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14273 ExtraCost += GetNodeMinBWAffectedCost(
14274 *E2, std::min(CommonVF, E2->getVectorFactor()));
14275 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14276 } else if (!V1 && V2) {
14277 // Shuffle vector and tree node.
14278 unsigned VF = getVF(V2);
14279 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
14280 CommonVF = std::max(VF, E1->getVectorFactor());
14281 assert(all_of(Mask,
14282 [=](int Idx) {
14283 return Idx < 2 * static_cast<int>(CommonVF);
14284 }) &&
14285 "All elements in mask must be less than 2 * CommonVF.");
14286 if (E1->Scalars.size() == VF && VF != CommonVF) {
14287 SmallVector<int> E1Mask = E1->getCommonMask();
14288 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14289 for (int &Idx : CommonMask) {
14290 if (Idx == PoisonMaskElem)
14291 continue;
14292 if (Idx >= static_cast<int>(CommonVF))
14293 Idx = E1Mask[Idx - CommonVF] + VF;
14294 else
14295 Idx = E1Mask[Idx];
14296 }
14297 CommonVF = VF;
14298 }
14299 ExtraCost += GetNodeMinBWAffectedCost(
14300 *E1, std::min(CommonVF, E1->getVectorFactor()));
14301 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14302 ExtraCost += GetValueMinBWAffectedCost(V2);
14303 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14304 } else {
14305 assert(V1 && V2 && "Expected both vectors.");
14306 unsigned VF = getVF(V1);
14307 CommonVF = std::max(VF, getVF(V2));
14308 assert(all_of(Mask,
14309 [=](int Idx) {
14310 return Idx < 2 * static_cast<int>(CommonVF);
14311 }) &&
14312 "All elements in mask must be less than 2 * CommonVF.");
14313 ExtraCost +=
14314 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14315 if (V1->getType() != V2->getType()) {
14316 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14317 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14318 } else {
14319 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14320 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14321 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14322 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14323 }
14324 }
14325 InVectors.front() =
14326 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14327 if (InVectors.size() == 2)
14328 InVectors.pop_back();
14329 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14330 V1, V2, CommonMask, Builder, ScalarTy);
14331 }
14332
14333public:
14335 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14336 SmallPtrSetImpl<Value *> &CheckedExtracts)
14337 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14338 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14339 CheckedExtracts(CheckedExtracts) {}
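/// Accounts for extractelement instructions feeding this gather node: takes
/// credit for extracts that become dead after vectorization and adds the cost
/// of the shuffles required to reuse their source vectors as inputs.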
14340 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14341 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14342 unsigned NumParts, bool &UseVecBaseAsInput) {
14343 UseVecBaseAsInput = false;
14344 if (Mask.empty())
14345 return nullptr;
14346 Value *VecBase = nullptr;
14347 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14348 if (!E->ReorderIndices.empty()) {
14349 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14350 E->ReorderIndices.end());
14351 reorderScalars(VL, ReorderMask);
14352 }
14353 // Check if it can be considered reused, i.e. the same extractelements were
14354 // vectorized already.
14355 bool PrevNodeFound = any_of(
14356 ArrayRef(R.VectorizableTree).take_front(E->Idx),
14357 [&](const std::unique_ptr<TreeEntry> &TE) {
14358 return ((TE->hasState() && !TE->isAltShuffle() &&
14359 TE->getOpcode() == Instruction::ExtractElement) ||
14360 TE->isGather()) &&
14361 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
14362 return VL.size() > Data.index() &&
14363 (Mask[Data.index()] == PoisonMaskElem ||
14364 isa<UndefValue>(VL[Data.index()]) ||
14365 Data.value() == VL[Data.index()]);
14366 });
14367 });
14368 SmallPtrSet<Value *, 4> UniqueBases;
14369 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14370 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14371 for (unsigned Part : seq<unsigned>(NumParts)) {
14372 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14373 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14374 for (auto [I, V] :
14375 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
14376 // Ignore non-extractelement scalars.
14377 if (isa<UndefValue>(V) ||
14378 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14379 continue;
14380 // If all users of the instruction are going to be vectorized and this
14381 // instruction itself is not going to be vectorized, consider this
14382 // instruction as dead and remove its cost from the final cost of the
14383 // vectorized tree.
14384 // Also, avoid adjusting the cost for extractelements with multiple uses
14385 // in different graph entries.
14386 auto *EE = cast<ExtractElementInst>(V);
14387 VecBase = EE->getVectorOperand();
14388 UniqueBases.insert(VecBase);
14389 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14390 if (!CheckedExtracts.insert(V).second ||
14391 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
14392 any_of(VEs,
14393 [&](const TreeEntry *TE) {
14394 return R.DeletedNodes.contains(TE) ||
14395 R.TransformedToGatherNodes.contains(TE);
14396 }) ||
14397 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
14398 !R.isVectorized(EE) &&
14399 count_if(E->Scalars, [&](Value *V) { return V == EE; }) !=
14400 count_if(E->UserTreeIndex.UserTE->Scalars,
14401 [&](Value *V) { return V == EE; })) ||
14402 any_of(EE->users(),
14403 [&](User *U) {
14404 return isa<GetElementPtrInst>(U) &&
14405 !R.areAllUsersVectorized(cast<Instruction>(U),
14406 &VectorizedVals);
14407 }) ||
14408 (!VEs.empty() && !is_contained(VEs, E)))
14409 continue;
14410 std::optional<unsigned> EEIdx = getExtractIndex(EE);
14411 if (!EEIdx)
14412 continue;
14413 unsigned Idx = *EEIdx;
14414 // Take credit for the instruction that will become dead.
14415 if (EE->hasOneUse() || !PrevNodeFound) {
14416 Instruction *Ext = EE->user_back();
14417 if (isa<SExtInst, ZExtInst>(Ext) &&
14419 // Use getExtractWithExtendCost() to calculate the cost of
14420 // extractelement/ext pair.
14421 Cost -= TTI.getExtractWithExtendCost(
14422 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
14423 Idx, CostKind);
14424 // Add back the cost of s|zext which is subtracted separately.
14425 Cost += TTI.getCastInstrCost(
14426 Ext->getOpcode(), Ext->getType(), EE->getType(),
14428 continue;
14429 }
14430 }
14431 APInt &DemandedElts =
14432 VectorOpsToExtracts
14433 .try_emplace(VecBase,
14434 APInt::getZero(getNumElements(VecBase->getType())))
14435 .first->getSecond();
14436 DemandedElts.setBit(Idx);
14437 }
14438 }
14439 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14441 DemandedElts, /*Insert=*/false,
14442 /*Extract=*/true, CostKind);
14443 // Check that the gather of extractelements can be represented as just a
14444 // shuffle of one or two vectors the scalars are extracted from.
14445 // We found a bunch of extractelement instructions that must be gathered
14446 // into a vector and can be represented as a permutation of elements in a
14447 // single input vector or in 2 input vectors.
14448 // Already accounted for if the same extractelements were vectorized before.
14449 if (!PrevNodeFound)
14450 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14451 InVectors.assign(1, E);
14452 CommonMask.assign(Mask.begin(), Mask.end());
14453 transformMaskAfterShuffle(CommonMask, CommonMask);
14454 SameNodesEstimated = false;
14455 if (NumParts != 1 && UniqueBases.size() != 1) {
14456 UseVecBaseAsInput = true;
14457 VecBase =
14458 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14459 }
14460 return VecBase;
14461 }
14462 /// Checks if the specified entry \p E needs to be delayed because of its
14463 /// dependency nodes.
14464 std::optional<InstructionCost>
14465 needToDelay(const TreeEntry *,
14467 // No need to delay the cost estimation during analysis.
14468 return std::nullopt;
14469 }
14470 /// Reset the builder to handle perfect diamond match.
14472 IsFinalized = false;
14473 CommonMask.clear();
14474 InVectors.clear();
14475 Cost = 0;
14476 VectorizedVals.clear();
14477 SameNodesEstimated = true;
14478 }
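/// Adds 2 input tree entries and the mask for their shuffling.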
14479 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14480 if (&E1 == &E2) {
14481 assert(all_of(Mask,
14482 [&](int Idx) {
14483 return Idx < static_cast<int>(E1.getVectorFactor());
14484 }) &&
14485 "Expected single vector shuffle mask.");
14486 add(E1, Mask);
14487 return;
14488 }
14489 if (InVectors.empty()) {
14490 CommonMask.assign(Mask.begin(), Mask.end());
14491 InVectors.assign({&E1, &E2});
14492 return;
14493 }
14494 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14495 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14496 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14497 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14498 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14499 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14500 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14501 }
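/// Adds a single input tree entry and the mask for its shuffling.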
14502 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14503 if (InVectors.empty()) {
14504 CommonMask.assign(Mask.begin(), Mask.end());
14505 InVectors.assign(1, &E1);
14506 return;
14507 }
14508 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14509 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14510 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14511 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14512 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14513 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14514 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14515 if (!SameNodesEstimated && InVectors.size() == 1)
14516 InVectors.emplace_back(&E1);
14517 }
14518 /// Adds 2 input vectors and the mask for their shuffling.
14519 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14520 // May only be reached when shuffling 2 vectors with extractelements, which
14521 // is already handled in adjustExtracts.
14522 assert(InVectors.size() == 1 &&
14523 all_of(enumerate(CommonMask),
14524 [&](auto P) {
14525 if (P.value() == PoisonMaskElem)
14526 return Mask[P.index()] == PoisonMaskElem;
14527 auto *EI = cast<ExtractElementInst>(
14528 cast<const TreeEntry *>(InVectors.front())
14529 ->getOrdered(P.index()));
14530 return EI->getVectorOperand() == V1 ||
14531 EI->getVectorOperand() == V2;
14532 }) &&
14533 "Expected extractelement vectors.");
14534 }
14535 /// Adds another one input vector and the mask for the shuffling.
14536 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14537 if (InVectors.empty()) {
14538 assert(CommonMask.empty() && !ForExtracts &&
14539 "Expected empty input mask/vectors.");
14540 CommonMask.assign(Mask.begin(), Mask.end());
14541 InVectors.assign(1, V1);
14542 return;
14543 }
14544 if (ForExtracts) {
14545 // No need to add vectors here, already handled them in adjustExtracts.
14546 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14547 !CommonMask.empty() &&
14548 all_of(enumerate(CommonMask),
14549 [&](auto P) {
14550 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14551 ->getOrdered(P.index());
14552 if (P.value() == PoisonMaskElem)
14553 return P.value() == Mask[P.index()] ||
14554 isa<UndefValue>(Scalar);
14555 if (isa<Constant>(V1))
14556 return true;
14557 auto *EI = cast<ExtractElementInst>(Scalar);
14558 return EI->getVectorOperand() == V1;
14559 }) &&
14560 "Expected only tree entry for extractelement vectors.");
14561 return;
14562 }
14563 assert(!InVectors.empty() && !CommonMask.empty() &&
14564 "Expected only tree entries from extracts/reused buildvectors.");
14565 unsigned VF = getVF(V1);
14566 if (InVectors.size() == 2) {
14567 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14568 transformMaskAfterShuffle(CommonMask, CommonMask);
14569 VF = std::max<unsigned>(VF, CommonMask.size());
14570 } else if (const auto *InTE =
14571 InVectors.front().dyn_cast<const TreeEntry *>()) {
14572 VF = std::max(VF, InTE->getVectorFactor());
14573 } else {
14574 VF = std::max(
14575 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14576 ->getNumElements());
14577 }
14578 InVectors.push_back(V1);
14579 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14580 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14581 CommonMask[Idx] = Mask[Idx] + VF;
14582 }
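/// Accounts for the cost of building the given scalars as a vector and returns
/// a placeholder constant vector, since no real IR is emitted during cost
/// estimation.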
14583 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14584 Value *Root = nullptr) {
14585 Cost += getBuildVectorCost(VL, Root);
14586 if (!Root) {
14587 // FIXME: Need to find a way to avoid use of getNullValue here.
14589 unsigned VF = VL.size();
14590 if (MaskVF != 0)
14591 VF = std::min(VF, MaskVF);
14592 Type *VLScalarTy = VL.front()->getType();
14593 for (Value *V : VL.take_front(VF)) {
14594 Type *ScalarTy = VLScalarTy->getScalarType();
14595 if (isa<PoisonValue>(V)) {
14596 Vals.push_back(PoisonValue::get(ScalarTy));
14597 continue;
14598 }
14599 if (isa<UndefValue>(V)) {
14600 Vals.push_back(UndefValue::get(ScalarTy));
14601 continue;
14602 }
14603 Vals.push_back(Constant::getNullValue(ScalarTy));
14604 }
14605 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14606 assert(SLPReVec && "FixedVectorType is not expected.");
14607 // When REVEC is enabled, we need to expand vector types into scalar
14608 // types.
14609 Vals = replicateMask(Vals, VecTy->getNumElements());
14610 }
14611 return ConstantVector::get(Vals);
14612 }
14615 cast<FixedVectorType>(Root->getType())->getNumElements()),
14616 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14617 }
14619 /// Finalize emission of the shuffles.
14621 ArrayRef<int> ExtMask,
14622 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14623 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14626 Action = {}) {
14627 IsFinalized = true;
14628 if (Action) {
14629 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14630 if (InVectors.size() == 2)
14631 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14632 else
14633 Cost += createShuffle(Vec, nullptr, CommonMask);
14634 transformMaskAfterShuffle(CommonMask, CommonMask);
14635 assert(VF > 0 &&
14636 "Expected vector length for the final value before action.");
14637 Value *V = cast<Value *>(Vec);
14638 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14639 Cost += createShuffle(V1, V2, Mask);
14640 return V1;
14641 });
14642 InVectors.front() = V;
14643 }
14644 if (!SubVectors.empty()) {
14645 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14646 if (InVectors.size() == 2)
14647 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14648 else
14649 Cost += createShuffle(Vec, nullptr, CommonMask);
14650 transformMaskAfterShuffle(CommonMask, CommonMask);
14651 // Add subvectors permutation cost.
14652 if (!SubVectorsMask.empty()) {
14653 assert(SubVectorsMask.size() <= CommonMask.size() &&
14654 "Expected same size of masks for subvectors and common mask.");
14655 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14656 copy(SubVectorsMask, SVMask.begin());
14657 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14658 if (I2 != PoisonMaskElem) {
14659 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14660 I1 = I2 + CommonMask.size();
14661 }
14662 }
14664 getWidenedType(ScalarTy, CommonMask.size()),
14665 SVMask, CostKind);
14666 }
14667 for (auto [E, Idx] : SubVectors) {
14668 Type *EScalarTy = E->Scalars.front()->getType();
14669 bool IsSigned = true;
14670 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14671 EScalarTy =
14672 IntegerType::get(EScalarTy->getContext(), It->second.first);
14673 IsSigned = It->second.second;
14674 }
14675 if (ScalarTy != EScalarTy) {
14676 unsigned CastOpcode = Instruction::Trunc;
14677 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14678 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14679 if (DstSz > SrcSz)
14680 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14681 Cost += TTI.getCastInstrCost(
14682 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14683 getWidenedType(EScalarTy, E->getVectorFactor()),
14685 }
14688 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14689 getWidenedType(ScalarTy, E->getVectorFactor()));
14690 if (!CommonMask.empty()) {
14691 std::iota(std::next(CommonMask.begin(), Idx),
14692 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14693 Idx);
14694 }
14695 }
14696 }
14697
14698 if (!ExtMask.empty()) {
14699 if (CommonMask.empty()) {
14700 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14701 } else {
14702 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14703 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14704 if (ExtMask[I] == PoisonMaskElem)
14705 continue;
14706 NewMask[I] = CommonMask[ExtMask[I]];
14707 }
14708 CommonMask.swap(NewMask);
14709 }
14710 }
14711 if (CommonMask.empty()) {
14712 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14713 return Cost;
14714 }
14715 return Cost +
14716 createShuffle(InVectors.front(),
14717 InVectors.size() == 2 ? InVectors.back() : nullptr,
14718 CommonMask);
14719 }
14720
14722 assert((IsFinalized || CommonMask.empty()) &&
14723 "Shuffle construction must be finalized.");
14724 }
14725};
14726
14727const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14728 unsigned Idx) const {
14729 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14730 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14731 return Op;
14732}
14733
14734TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14735 if (TE.State == TreeEntry::ScatterVectorize ||
14736 TE.State == TreeEntry::StridedVectorize)
14738 if (TE.State == TreeEntry::CompressVectorize)
14740 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14741 !TE.isAltShuffle()) {
14742 if (TE.ReorderIndices.empty())
14744 SmallVector<int> Mask;
14745 inversePermutation(TE.ReorderIndices, Mask);
14746 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14748 }
14750}
14751
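// Computes the cost of vectorizing tree entry E: typically the vector cost
// minus the cost of the scalars it replaces, including any shuffle cost
// implied by the reordering/reuse masks (CommonCost).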
14753BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14754 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14755 ArrayRef<Value *> VL = E->Scalars;
14756
14757 Type *ScalarTy = getValueType(VL[0]);
14758 if (!isValidElementType(ScalarTy))
14759 return InstructionCost::getInvalid();
14761
14762 // If we have computed a smaller type for the expression, update VecTy so
14763 // that the costs will be accurate.
14764 auto It = MinBWs.find(E);
14765 Type *OrigScalarTy = ScalarTy;
14766 if (It != MinBWs.end()) {
14767 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14768 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14769 if (VecTy)
14770 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14771 }
14772 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14773 unsigned EntryVF = E->getVectorFactor();
14774 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14775
14776 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
14777 if (allConstant(VL))
14778 return 0;
14779 if (isa<InsertElementInst>(VL[0]))
14780 return InstructionCost::getInvalid();
14781 if (isa<CmpInst>(VL.front()))
14782 ScalarTy = VL.front()->getType();
14783 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14784 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14785 }
14786 if (E->State == TreeEntry::SplitVectorize) {
14787 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14788 "Expected exactly 2 combined entries.");
14789 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14790 InstructionCost VectorCost = 0;
14791 if (E->ReorderIndices.empty()) {
14792 VectorCost = ::getShuffleCost(
14793 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14794 E->CombinedEntriesWithIndices.back().second,
14796 ScalarTy,
14797 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14798 ->getVectorFactor()));
14799 } else {
14800 unsigned CommonVF =
14801 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14802 ->getVectorFactor(),
14803 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14804 ->getVectorFactor());
14805 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14806 getWidenedType(ScalarTy, CommonVF),
14807 E->getSplitMask(), CostKind);
14808 }
14809 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14810 return VectorCost;
14811 }
14812 InstructionCost CommonCost = 0;
14813 SmallVector<int> Mask;
14814 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14815 (E->State != TreeEntry::StridedVectorize ||
14816 !isReverseOrder(E->ReorderIndices))) {
14817 SmallVector<int> NewMask;
14818 if (E->getOpcode() == Instruction::Store) {
14819 // For stores the order is actually a mask.
14820 NewMask.resize(E->ReorderIndices.size());
14821 copy(E->ReorderIndices, NewMask.begin());
14822 } else {
14823 inversePermutation(E->ReorderIndices, NewMask);
14824 }
14825 ::addMask(Mask, NewMask);
14826 }
14827 if (!E->ReuseShuffleIndices.empty())
14828 ::addMask(Mask, E->ReuseShuffleIndices);
14829 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14830 CommonCost =
14831 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14832 assert((E->State == TreeEntry::Vectorize ||
14833 E->State == TreeEntry::ScatterVectorize ||
14834 E->State == TreeEntry::StridedVectorize ||
14835 E->State == TreeEntry::CompressVectorize) &&
14836 "Unhandled state");
14837 assert(E->getOpcode() &&
14838 ((allSameType(VL) && allSameBlock(VL)) ||
14839 (E->getOpcode() == Instruction::GetElementPtr &&
14840 E->getMainOp()->getType()->isPointerTy()) ||
14841 E->hasCopyableElements()) &&
14842 "Invalid VL");
14843 Instruction *VL0 = E->getMainOp();
14844 unsigned ShuffleOrOp =
14845 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14846 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14847 ShuffleOrOp = E->CombinedOp;
14848 SmallSetVector<Value *, 16> UniqueValues;
14849 SmallVector<unsigned, 16> UniqueIndexes;
14850 for (auto [Idx, V] : enumerate(VL))
14851 if (UniqueValues.insert(V))
14852 UniqueIndexes.push_back(Idx);
14853 const unsigned Sz = UniqueValues.size();
14854 SmallBitVector UsedScalars(Sz, false);
14855 for (unsigned I = 0; I < Sz; ++I) {
14856 if (isa<Instruction>(UniqueValues[I]) &&
14857 !E->isCopyableElement(UniqueValues[I]) &&
14858 getTreeEntries(UniqueValues[I]).front() == E)
14859 continue;
14860 UsedScalars.set(I);
14861 }
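// Pick the cast context hint from the way the operand is (or will be)
// vectorized, e.g. strided or masked loads, to get more precise cast costs.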
14862 auto GetCastContextHint = [&](Value *V) {
14863 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14864 return getCastContextHint(*OpTEs.front());
14865 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14866 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14867 !SrcState.isAltShuffle())
14870 };
14871 auto GetCostDiff =
14872 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14873 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14874 // Calculate the cost of this instruction.
14875 InstructionCost ScalarCost = 0;
14876 if (isa<CastInst, CallInst>(VL0)) {
14877 // For some of the instructions there is no need to calculate the cost for
14878 // each particular instruction; we can use the cost of a single
14879 // instruction x the total number of scalar instructions.
14880 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14881 } else {
14882 for (unsigned I = 0; I < Sz; ++I) {
14883 if (UsedScalars.test(I))
14884 continue;
14885 ScalarCost += ScalarEltCost(I);
14886 }
14887 }
14888
14889 InstructionCost VecCost = VectorCost(CommonCost);
14890 // Check if the current node must be resized, if the parent node is not
14891 // resized.
14892 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14893 E->Idx != 0 &&
14894 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14895 const EdgeInfo &EI = E->UserTreeIndex;
14896 if (!EI.UserTE->hasState() ||
14897 EI.UserTE->getOpcode() != Instruction::Select ||
14898 EI.EdgeIdx != 0) {
14899 auto UserBWIt = MinBWs.find(EI.UserTE);
14900 Type *UserScalarTy =
14901 (EI.UserTE->isGather() ||
14902 EI.UserTE->State == TreeEntry::SplitVectorize)
14903 ? EI.UserTE->Scalars.front()->getType()
14904 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14905 if (UserBWIt != MinBWs.end())
14906 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14907 UserBWIt->second.first);
14908 if (ScalarTy != UserScalarTy) {
14909 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14910 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14911 unsigned VecOpcode;
14912 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14913 if (BWSz > SrcBWSz)
14914 VecOpcode = Instruction::Trunc;
14915 else
14916 VecOpcode =
14917 It->second.second ? Instruction::SExt : Instruction::ZExt;
14918 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14919 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14920 CostKind);
14921 }
14922 }
14923 }
14924 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14925 ScalarCost, "Calculated costs for Tree"));
14926 return VecCost - ScalarCost;
14927 };
14928 // Calculate cost difference from vectorizing set of GEPs.
14929 // Negative value means vectorizing is profitable.
14930 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14931 assert((E->State == TreeEntry::Vectorize ||
14932 E->State == TreeEntry::StridedVectorize ||
14933 E->State == TreeEntry::CompressVectorize) &&
14934 "Entry state expected to be Vectorize, StridedVectorize or "
14935 "MaskedLoadCompressVectorize here.");
14936 InstructionCost ScalarCost = 0;
14937 InstructionCost VecCost = 0;
14938 std::tie(ScalarCost, VecCost) = getGEPCosts(
14939 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14940 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14941 "Calculated GEPs cost for Tree"));
14942
14943 return VecCost - ScalarCost;
14944 };
14945
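// GetMinMaxCost: when the scalar(s) form a select(cmp) min/max idiom, e.g.
//   %c = icmp slt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
// the cost is modeled as the matching min/max intrinsic (here llvm.smin);
// pointer types are first canonicalized to integers of the same width.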
14946 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14947 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14948 if (MinMaxID == Intrinsic::not_intrinsic)
14949 return InstructionCost::getInvalid();
14950 Type *CanonicalType = Ty;
14951 if (CanonicalType->isPtrOrPtrVectorTy())
14952 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14953 CanonicalType->getContext(),
14954 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14955
14956 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14957 {CanonicalType, CanonicalType});
14958 InstructionCost IntrinsicCost =
14959 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14960 // If the selects are the only uses of the compares, they will be
14961 // dead and we can adjust the cost by removing their cost.
14962 if (VI && SelectOnly) {
14963 assert((!Ty->isVectorTy() || SLPReVec) &&
14964 "Expected only for scalar type.");
14965 auto *CI = cast<CmpInst>(VI->getOperand(0));
14966 IntrinsicCost -= TTI->getCmpSelInstrCost(
14967 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14968 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14969 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14970 }
14971 return IntrinsicCost;
14972 };
14973 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14974 Instruction *VI) {
14975 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14976 return Cost;
14977 };
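// Dispatch on the (possibly combined) opcode of the node. Most cases build a
// GetScalarCost/GetVectorCost pair and return their difference through
// GetCostDiff; a few (PHI, InsertElement, GetElementPtr, Freeze) compute the
// result directly.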
14978 switch (ShuffleOrOp) {
14979 case Instruction::PHI: {
14980 // Count reused scalars.
14981 InstructionCost ScalarCost = 0;
14982 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14983 for (Value *V : UniqueValues) {
14984 auto *PHI = dyn_cast<PHINode>(V);
14985 if (!PHI)
14986 continue;
14987
14988 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14989 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14990 Value *Op = PHI->getIncomingValue(I);
14991 Operands[I] = Op;
14992 }
14993 if (const TreeEntry *OpTE =
14994 getSameValuesTreeEntry(Operands.front(), Operands))
14995 if (CountedOps.insert(OpTE).second &&
14996 !OpTE->ReuseShuffleIndices.empty())
14997 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14998 OpTE->Scalars.size());
14999 }
15000
15001 return CommonCost - ScalarCost;
15002 }
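// ExtractValue/ExtractElement nodes: after vectorization the scalar extracts
// become dead, so the vector cost credits back the extraction overhead of the
// demanded source elements (an extract with a single s/zext user is handled
// via getExtractWithExtendCost).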
15003 case Instruction::ExtractValue:
15004 case Instruction::ExtractElement: {
15005 APInt DemandedElts;
15006 VectorType *SrcVecTy = nullptr;
15007 auto GetScalarCost = [&](unsigned Idx) {
15008 if (isa<PoisonValue>(UniqueValues[Idx]))
15009 return InstructionCost(TTI::TCC_Free);
15010
15011 auto *I = cast<Instruction>(UniqueValues[Idx]);
15012 if (!SrcVecTy) {
15013 if (ShuffleOrOp == Instruction::ExtractElement) {
15014 auto *EE = cast<ExtractElementInst>(I);
15015 SrcVecTy = EE->getVectorOperandType();
15016 } else {
15017 auto *EV = cast<ExtractValueInst>(I);
15018 Type *AggregateTy = EV->getAggregateOperand()->getType();
15019 unsigned NumElts;
15020 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
15021 NumElts = ATy->getNumElements();
15022 else
15023 NumElts = AggregateTy->getStructNumElements();
15024 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
15025 }
15026 }
15027 if (I->hasOneUse()) {
15028 Instruction *Ext = I->user_back();
15029 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
15030 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
15031 // Use getExtractWithExtendCost() to calculate the cost of
15032 // extractelement/ext pair.
15033 InstructionCost Cost = TTI->getExtractWithExtendCost(
15034 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
15035 CostKind);
15036 // Subtract the cost of s|zext which is subtracted separately.
15037 Cost -= TTI->getCastInstrCost(
15038 Ext->getOpcode(), Ext->getType(), I->getType(),
15039 TTI::getCastContextHint(Ext), CostKind, Ext);
15040 return Cost;
15041 }
15042 }
15043 if (DemandedElts.isZero())
15044 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
15045 DemandedElts.setBit(*getExtractIndex(I));
15046 return InstructionCost(TTI::TCC_Free);
15047 };
15048 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15049 return CommonCost - (DemandedElts.isZero()
15050 ? TTI::TCC_Free
15051 : TTI.getScalarizationOverhead(
15052 SrcVecTy, DemandedElts, /*Insert=*/false,
15053 /*Extract=*/true, CostKind));
15054 };
15055 return GetCostDiff(GetScalarCost, GetVectorCost);
15056 }
15057 case Instruction::InsertElement: {
15058 assert(E->ReuseShuffleIndices.empty() &&
15059 "Unique insertelements only are expected.");
15060 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
15061 unsigned const NumElts = SrcVecTy->getNumElements();
15062 unsigned const NumScalars = VL.size();
15063
15064 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
15065
15066 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15067 unsigned OffsetBeg = *getElementIndex(VL.front());
15068 unsigned OffsetEnd = OffsetBeg;
15069 InsertMask[OffsetBeg] = 0;
15070 for (auto [I, V] : enumerate(VL.drop_front())) {
15071 unsigned Idx = *getElementIndex(V);
15072 if (OffsetBeg > Idx)
15073 OffsetBeg = Idx;
15074 else if (OffsetEnd < Idx)
15075 OffsetEnd = Idx;
15076 InsertMask[Idx] = I + 1;
15077 }
15078 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
15079 if (NumOfParts > 0 && NumOfParts < NumElts)
15080 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
15081 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15082 VecScalarsSz;
15083 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15084 unsigned InsertVecSz = std::min<unsigned>(
15085 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
15086 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15087 bool IsWholeSubvector =
15088 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
15089 // Check if we can safely insert a subvector. If it is not possible, just
15090 // generate a whole-sized vector and shuffle the source vector and the new
15091 // subvector.
15092 if (OffsetBeg + InsertVecSz > VecSz) {
15093 // Align OffsetBeg to generate correct mask.
15094 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
15095 InsertVecSz = VecSz;
15096 }
15097
15098 APInt DemandedElts = APInt::getZero(NumElts);
15099 // TODO: Add support for Instruction::InsertValue.
15100 SmallVector<int> Mask;
15101 if (!E->ReorderIndices.empty()) {
15102 inversePermutation(E->ReorderIndices, Mask);
15103 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
15104 } else {
15105 Mask.assign(VecSz, PoisonMaskElem);
15106 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
15107 }
15108 bool IsIdentity = true;
15109 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
15110 Mask.swap(PrevMask);
15111 for (unsigned I = 0; I < NumScalars; ++I) {
15112 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
15113 DemandedElts.setBit(InsertIdx);
15114 IsIdentity &= InsertIdx - OffsetBeg == I;
15115 Mask[InsertIdx - OffsetBeg] = I;
15116 }
15117 assert(Offset < NumElts && "Failed to find vector index offset");
15118
15119 InstructionCost Cost = TTI::TCC_Free;
15120 Cost -=
15121 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
15122 /*Insert*/ true, /*Extract*/ false, CostKind);
15123
15124 // First cost - resize to actual vector size if not identity shuffle or
15125 // need to shift the vector.
15126 // Do not calculate the cost if the actual size is the register size and
15127 // we can merge this shuffle with the following SK_Select.
15128 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
15129 if (!IsIdentity)
15130 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15131 InsertVecTy, Mask);
15132 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15133 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15134 }));
15135 // Second cost - permutation with subvector, if some elements are from the
15136 // initial vector or inserting a subvector.
15137 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
15138 // subvector of ActualVecTy.
15139 SmallBitVector InMask =
15140 isUndefVector(FirstInsert->getOperand(0),
15141 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
15142 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
15143 if (InsertVecSz != VecSz) {
15144 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
15145 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
15146 CostKind, OffsetBeg - Offset, InsertVecTy);
15147 } else {
15148 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
15149 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
15150 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
15151 I <= End; ++I)
15152 if (Mask[I] != PoisonMaskElem)
15153 Mask[I] = I + VecSz;
15154 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
15155 Mask[I] =
15156 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
15157 Cost +=
15158 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
15159 }
15160 }
15161 return Cost;
15162 }
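// Casts also account for minimum-bitwidth demotion (MinBWs): if the source
// or this node was demoted, the vector cast can degenerate into a free
// bitcast, a trunc, or an s/zext of the demoted types, e.g. a scalar
// zext i8 -> i32 over 4 lanes is costed as <4 x i8> -> <4 x i32>.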
15163 case Instruction::ZExt:
15164 case Instruction::SExt:
15165 case Instruction::FPToUI:
15166 case Instruction::FPToSI:
15167 case Instruction::FPExt:
15168 case Instruction::PtrToInt:
15169 case Instruction::IntToPtr:
15170 case Instruction::SIToFP:
15171 case Instruction::UIToFP:
15172 case Instruction::Trunc:
15173 case Instruction::FPTrunc:
15174 case Instruction::BitCast: {
15175 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15176 Type *SrcScalarTy = VL0->getOperand(0)->getType();
15177 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
15178 unsigned Opcode = ShuffleOrOp;
15179 unsigned VecOpcode = Opcode;
15180 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15181 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15182 // Check if the values are candidates to demote.
15183 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
15184 if (SrcIt != MinBWs.end()) {
15185 SrcBWSz = SrcIt->second.first;
15186 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
15187 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
15188 SrcVecTy =
15189 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
15190 }
15191 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15192 if (BWSz == SrcBWSz) {
15193 VecOpcode = Instruction::BitCast;
15194 } else if (BWSz < SrcBWSz) {
15195 VecOpcode = Instruction::Trunc;
15196 } else if (It != MinBWs.end()) {
15197 assert(BWSz > SrcBWSz && "Invalid cast!");
15198 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15199 } else if (SrcIt != MinBWs.end()) {
15200 assert(BWSz > SrcBWSz && "Invalid cast!");
15201 VecOpcode =
15202 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15203 }
15204 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15205 !SrcIt->second.second) {
15206 VecOpcode = Instruction::UIToFP;
15207 }
15208 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
15209 assert(Idx == 0 && "Expected 0 index only");
15210 return TTI->getCastInstrCost(Opcode, VL0->getType(),
15211 VL0->getOperand(0)->getType(),
15212 TTI::getCastContextHint(VL0), CostKind, VL0);
15213 };
15214 auto GetVectorCost = [=](InstructionCost CommonCost) {
15215 // Do not count cost here if minimum bitwidth is in effect and it is just
15216 // a bitcast (here it is just a noop).
15217 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15218 return CommonCost;
15219 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
15220 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
15221
15222 bool IsArithmeticExtendedReduction =
15223 E->Idx == 0 && UserIgnoreList &&
15224 all_of(*UserIgnoreList, [](Value *V) {
15225 auto *I = cast<Instruction>(V);
15226 return is_contained({Instruction::Add, Instruction::FAdd,
15227 Instruction::Mul, Instruction::FMul,
15228 Instruction::And, Instruction::Or,
15229 Instruction::Xor},
15230 I->getOpcode());
15231 });
15232 if (IsArithmeticExtendedReduction &&
15233 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15234 return CommonCost;
15235 return CommonCost +
15236 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
15237 VecOpcode == Opcode ? VI : nullptr);
15238 };
15239 return GetCostDiff(GetScalarCost, GetVectorCost);
15240 }
15241 case Instruction::FCmp:
15242 case Instruction::ICmp:
15243 case Instruction::Select: {
15244 CmpPredicate VecPred, SwappedVecPred;
15245 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
15246 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
15247 match(VL0, MatchCmp))
15248 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
15249 else
15250 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
15251 ? CmpInst::BAD_FCMP_PREDICATE
15252 : CmpInst::BAD_ICMP_PREDICATE;
15253 auto GetScalarCost = [&](unsigned Idx) {
15254 if (isa<PoisonValue>(UniqueValues[Idx]))
15255 return InstructionCost(TTI::TCC_Free);
15256
15257 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15258 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
15259 ? CmpInst::BAD_FCMP_PREDICATE
15260 : CmpInst::BAD_ICMP_PREDICATE;
15261 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
15262 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
15263 !match(VI, MatchCmp)) ||
15264 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
15265 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
15266 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
15267 ? CmpInst::BAD_FCMP_PREDICATE
15268 : CmpInst::BAD_ICMP_PREDICATE;
15269
15270 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
15271 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
15272 CostKind, getOperandInfo(VI->getOperand(0)),
15273 getOperandInfo(VI->getOperand(1)), VI);
15274 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
15275 if (IntrinsicCost.isValid())
15276 ScalarCost = IntrinsicCost;
15277
15278 return ScalarCost;
15279 };
15280 auto GetVectorCost = [&](InstructionCost CommonCost) {
15281 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15282
15283 InstructionCost VecCost =
15284 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
15285 CostKind, getOperandInfo(E->getOperand(0)),
15286 getOperandInfo(E->getOperand(1)), VL0);
15287 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
15288 auto *CondType =
15289 getWidenedType(SI->getCondition()->getType(), VL.size());
15290 unsigned CondNumElements = CondType->getNumElements();
15291 unsigned VecTyNumElements = getNumElements(VecTy);
15292 assert(VecTyNumElements >= CondNumElements &&
15293 VecTyNumElements % CondNumElements == 0 &&
15294 "Cannot vectorize Instruction::Select");
15295 if (CondNumElements != VecTyNumElements) {
15296 // When the return type is i1 but the source is fixed vector type, we
15297 // need to duplicate the condition value.
15298 VecCost += ::getShuffleCost(
15299 *TTI, TTI::SK_PermuteSingleSrc, CondType,
15300 createReplicatedMask(VecTyNumElements / CondNumElements,
15301 CondNumElements));
15302 }
15303 }
15304 return VecCost + CommonCost;
15305 };
15306 return GetCostDiff(GetScalarCost, GetVectorCost);
15307 }
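// TreeEntry::MinMax and TreeEntry::FMulAdd are combined opcodes: the whole
// node is costed as a single intrinsic (llvm.smin/umin/smax/umax/fmin/fmax
// or llvm.fmuladd) instead of its component compare+select or fmul+fadd
// instructions.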
15308 case TreeEntry::MinMax: {
15309 auto GetScalarCost = [&](unsigned Idx) {
15310 return GetMinMaxCost(OrigScalarTy);
15311 };
15312 auto GetVectorCost = [&](InstructionCost CommonCost) {
15313 InstructionCost VecCost = GetMinMaxCost(VecTy);
15314 return VecCost + CommonCost;
15315 };
15316 return GetCostDiff(GetScalarCost, GetVectorCost);
15317 }
15318 case TreeEntry::FMulAdd: {
15319 auto GetScalarCost = [&](unsigned Idx) {
15320 if (isa<PoisonValue>(UniqueValues[Idx]))
15321 return InstructionCost(TTI::TCC_Free);
15322 return GetFMulAddCost(E->getOperations(),
15323 cast<Instruction>(UniqueValues[Idx]));
15324 };
15325 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15326 FastMathFlags FMF;
15327 FMF.set();
15328 for (Value *V : E->Scalars) {
15329 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
15330 FMF &= FPCI->getFastMathFlags();
15331 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
15332 FMF &= FPCIOp->getFastMathFlags();
15333 }
15334 }
15335 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15336 {VecTy, VecTy, VecTy}, FMF);
15337 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15338 return VecCost + CommonCost;
15339 };
15340 return GetCostDiff(GetScalarCost, GetVectorCost);
15341 }
15342 case Instruction::FNeg:
15343 case Instruction::Add:
15344 case Instruction::FAdd:
15345 case Instruction::Sub:
15346 case Instruction::FSub:
15347 case Instruction::Mul:
15348 case Instruction::FMul:
15349 case Instruction::UDiv:
15350 case Instruction::SDiv:
15351 case Instruction::FDiv:
15352 case Instruction::URem:
15353 case Instruction::SRem:
15354 case Instruction::FRem:
15355 case Instruction::Shl:
15356 case Instruction::LShr:
15357 case Instruction::AShr:
15358 case Instruction::And:
15359 case Instruction::Or:
15360 case Instruction::Xor: {
15361 auto GetScalarCost = [&](unsigned Idx) {
15362 if (isa<PoisonValue>(UniqueValues[Idx]))
15363 return InstructionCost(TTI::TCC_Free);
15364
15365 // We cannot retrieve the operand from UniqueValues[Idx] because an
15366 // interchangeable instruction may be used. The order and the actual
15367 // operand might differ from what is retrieved from UniqueValues[Idx].
15368 unsigned Lane = UniqueIndexes[Idx];
15369 Value *Op1 = E->getOperand(0)[Lane];
15370 Value *Op2;
15371 SmallVector<const Value *, 2> Operands(1, Op1);
15372 if (isa<UnaryOperator>(UniqueValues[Idx])) {
15373 Op2 = Op1;
15374 } else {
15375 Op2 = E->getOperand(1)[Lane];
15376 Operands.push_back(Op2);
15377 }
15378 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
15379 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
15380 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15381 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
15382 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
15383 I && (ShuffleOrOp == Instruction::FAdd ||
15384 ShuffleOrOp == Instruction::FSub)) {
15385 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15386 if (IntrinsicCost.isValid())
15387 ScalarCost = IntrinsicCost;
15388 }
15389 return ScalarCost;
15390 };
15391 auto GetVectorCost = [=](InstructionCost CommonCost) {
15392 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15393 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15394 ArrayRef<Value *> Ops = E->getOperand(I);
15395 if (all_of(Ops, [&](Value *Op) {
15396 auto *CI = dyn_cast<ConstantInt>(Op);
15397 return CI && CI->getValue().countr_one() >= It->second.first;
15398 }))
15399 return CommonCost;
15400 }
15401 }
15402 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
15403 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
15404 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
15405 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
15406 Op2Info, {}, nullptr, TLI) +
15407 CommonCost;
15408 };
15409 return GetCostDiff(GetScalarCost, GetVectorCost);
15410 }
15411 case Instruction::GetElementPtr: {
15412 return CommonCost + GetGEPCostDiff(VL, VL0);
15413 }
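// Loads are costed according to how the node will be emitted: a plain (or
// interleaved) wide load for Vectorize, llvm.experimental.vp.strided.load
// for StridedVectorize, a masked load plus a compressing shuffle for
// CompressVectorize, and llvm.masked.gather for ScatterVectorize.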
15414 case Instruction::Load: {
15415 auto GetScalarCost = [&](unsigned Idx) {
15416 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
15417 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15418 VI->getAlign(), VI->getPointerAddressSpace(),
15419 CostKind, TTI::OperandValueInfo(), VI);
15420 };
15421 auto *LI0 = cast<LoadInst>(VL0);
15422 auto GetVectorCost = [&](InstructionCost CommonCost) {
15423 InstructionCost VecLdCost;
15424 switch (E->State) {
15425 case TreeEntry::Vectorize:
15426 if (unsigned Factor = E->getInterleaveFactor()) {
15427 VecLdCost = TTI->getInterleavedMemoryOpCost(
15428 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15429 LI0->getPointerAddressSpace(), CostKind);
15430
15431 } else {
15432 VecLdCost = TTI->getMemoryOpCost(
15433 Instruction::Load, VecTy, LI0->getAlign(),
15434 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15435 }
15436 break;
15437 case TreeEntry::StridedVectorize: {
15438 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
15439 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15440 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15441 Align CommonAlignment =
15442 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15443 VecLdCost = TTI->getMemIntrinsicInstrCost(
15444 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15445 StridedLoadTy, LI0->getPointerOperand(),
15446 /*VariableMask=*/false, CommonAlignment),
15447 CostKind);
15448 if (StridedLoadTy != VecTy)
15449 VecLdCost +=
15450 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15451 getCastContextHint(*E), CostKind);
15452
15453 break;
15454 }
15455 case TreeEntry::CompressVectorize: {
15456 bool IsMasked;
15457 unsigned InterleaveFactor;
15458 SmallVector<int> CompressMask;
15459 VectorType *LoadVecTy;
15460 SmallVector<Value *> Scalars(VL);
15461 if (!E->ReorderIndices.empty()) {
15462 SmallVector<int> Mask(E->ReorderIndices.begin(),
15463 E->ReorderIndices.end());
15464 reorderScalars(Scalars, Mask);
15465 }
15466 SmallVector<Value *> PointerOps(Scalars.size());
15467 for (auto [I, V] : enumerate(Scalars))
15468 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15469 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15470 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15471 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15472 CompressMask, LoadVecTy);
15473 assert(IsVectorized && "Failed to vectorize load");
15474 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15475 InterleaveFactor, IsMasked);
15476 Align CommonAlignment = LI0->getAlign();
15477 if (InterleaveFactor) {
15478 VecLdCost = TTI->getInterleavedMemoryOpCost(
15479 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15480 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15481 } else if (IsMasked) {
15482 VecLdCost = TTI->getMemIntrinsicInstrCost(
15483 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15484 CommonAlignment,
15485 LI0->getPointerAddressSpace()),
15486 CostKind);
15487 // TODO: include this cost into CommonCost.
15488 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15489 LoadVecTy, CompressMask, CostKind);
15490 } else {
15491 VecLdCost = TTI->getMemoryOpCost(
15492 Instruction::Load, LoadVecTy, CommonAlignment,
15493 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15494 // TODO: include this cost into CommonCost.
15495 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15496 LoadVecTy, CompressMask, CostKind);
15497 }
15498 break;
15499 }
15500 case TreeEntry::ScatterVectorize: {
15501 Align CommonAlignment =
15502 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15503 VecLdCost = TTI->getMemIntrinsicInstrCost(
15504 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15505 LI0->getPointerOperand(),
15506 /*VariableMask=*/false, CommonAlignment),
15507 CostKind);
15508 break;
15509 }
15510 case TreeEntry::CombinedVectorize:
15511 case TreeEntry::SplitVectorize:
15512 case TreeEntry::NeedToGather:
15513 llvm_unreachable("Unexpected vectorization state.");
15514 }
15515 return VecLdCost + CommonCost;
15516 };
15517
15518 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15519 // If this node generates a masked gather load, then it is not a terminal
15520 // node. Hence the address operand cost is estimated separately.
15521 if (E->State == TreeEntry::ScatterVectorize)
15522 return Cost;
15523
15524 // Estimate cost of GEPs since this tree node is a terminator.
15525 SmallVector<Value *> PointerOps(VL.size());
15526 for (auto [I, V] : enumerate(VL))
15527 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15528 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15529 }
15530 case Instruction::Store: {
15531 bool IsReorder = !E->ReorderIndices.empty();
15532 auto GetScalarCost = [=](unsigned Idx) {
15533 auto *VI = cast<StoreInst>(VL[Idx]);
15534 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15535 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15536 VI->getAlign(), VI->getPointerAddressSpace(),
15537 CostKind, OpInfo, VI);
15538 };
15539 auto *BaseSI =
15540 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15541 auto GetVectorCost = [=](InstructionCost CommonCost) {
15542 // We know that we can merge the stores. Calculate the cost.
15543 InstructionCost VecStCost;
15544 if (E->State == TreeEntry::StridedVectorize) {
15545 Align CommonAlignment =
15546 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15547 VecStCost = TTI->getMemIntrinsicInstrCost(
15548 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15549 VecTy, BaseSI->getPointerOperand(),
15550 /*VariableMask=*/false, CommonAlignment),
15551 CostKind);
15552 } else {
15553 assert(E->State == TreeEntry::Vectorize &&
15554 "Expected either strided or consecutive stores.");
15555 if (unsigned Factor = E->getInterleaveFactor()) {
15556 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15557 "No reused shuffles expected");
15558 CommonCost = 0;
15559 VecStCost = TTI->getInterleavedMemoryOpCost(
15560 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15561 BaseSI->getPointerAddressSpace(), CostKind);
15562 } else {
15563 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15564 VecStCost = TTI->getMemoryOpCost(
15565 Instruction::Store, VecTy, BaseSI->getAlign(),
15566 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15567 }
15568 }
15569 return VecStCost + CommonCost;
15570 };
15571 SmallVector<Value *> PointerOps(VL.size());
15572 for (auto [I, V] : enumerate(VL)) {
15573 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15574 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15575 }
15576
15577 return GetCostDiff(GetScalarCost, GetVectorCost) +
15578 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15579 }
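// Calls are costed as the cheaper of a vector intrinsic and a vector library
// call (getVectorCallCosts picks the minimum of the two).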
15580 case Instruction::Call: {
15581 auto GetScalarCost = [&](unsigned Idx) {
15582 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15583 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15584 if (ID != Intrinsic::not_intrinsic) {
15585 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15586 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15587 }
15588 return TTI->getCallInstrCost(CI->getCalledFunction(),
15589 CI->getFunctionType()->getReturnType(),
15590 CI->getFunctionType()->params(), CostKind);
15591 };
15592 auto GetVectorCost = [=](InstructionCost CommonCost) {
15593 auto *CI = cast<CallInst>(VL0);
15594 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15595 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15596 CI, ID, VecTy->getNumElements(),
15597 It != MinBWs.end() ? It->second.first : 0, TTI);
15598 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15599 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15600 };
15601 return GetCostDiff(GetScalarCost, GetVectorCost);
15602 }
15603 case Instruction::ShuffleVector: {
15604 if (!SLPReVec || E->isAltShuffle())
15605 assert(E->isAltShuffle() &&
15606 ((Instruction::isBinaryOp(E->getOpcode()) &&
15607 Instruction::isBinaryOp(E->getAltOpcode())) ||
15608 (Instruction::isCast(E->getOpcode()) &&
15609 Instruction::isCast(E->getAltOpcode())) ||
15610 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15611 "Invalid Shuffle Vector Operand");
15612 // Try to find the previous shuffle node with the same operands and same
15613 // main/alternate ops.
15614 auto TryFindNodeWithEqualOperands = [=]() {
15615 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15616 if (TE.get() == E)
15617 break;
15618 if (TE->hasState() && TE->isAltShuffle() &&
15619 ((TE->getOpcode() == E->getOpcode() &&
15620 TE->getAltOpcode() == E->getAltOpcode()) ||
15621 (TE->getOpcode() == E->getAltOpcode() &&
15622 TE->getAltOpcode() == E->getOpcode())) &&
15623 TE->hasEqualOperands(*E))
15624 return true;
15625 }
15626 return false;
15627 };
15628 auto GetScalarCost = [&](unsigned Idx) {
15629 if (isa<PoisonValue>(UniqueValues[Idx]))
15630 return InstructionCost(TTI::TCC_Free);
15631
15632 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15633 assert(E->getMatchingMainOpOrAltOp(VI) &&
15634 "Unexpected main/alternate opcode");
15635 (void)E;
15636 return TTI->getInstructionCost(VI, CostKind);
15637 };
15638 // Need to clear CommonCost since the final shuffle cost is included into
15639 // vector cost.
15640 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15641 // VecCost is equal to sum of the cost of creating 2 vectors
15642 // and the cost of creating shuffle.
15643 InstructionCost VecCost = 0;
15644 if (TryFindNodeWithEqualOperands()) {
15645 LLVM_DEBUG({
15646 dbgs() << "SLP: diamond match for alternate node found.\n";
15647 E->dump();
15648 });
15649 // No need to add new vector costs here since we're going to reuse the
15650 // same main/alternate vector ops, just do different shuffling.
15651 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15652 VecCost =
15653 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15654 VecCost +=
15655 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15656 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15657 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15658 VecCost = TTIRef.getCmpSelInstrCost(
15659 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15660 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15661 VL0);
15662 VecCost += TTIRef.getCmpSelInstrCost(
15663 E->getOpcode(), VecTy, MaskTy,
15664 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15665 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15666 E->getAltOp());
15667 } else {
15668 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15669 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15670 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15671 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15672 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15673 unsigned SrcBWSz =
15674 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15675 if (SrcIt != MinBWs.end()) {
15676 SrcBWSz = SrcIt->second.first;
15677 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15678 SrcTy = getWidenedType(SrcSclTy, VL.size());
15679 }
15680 if (BWSz <= SrcBWSz) {
15681 if (BWSz < SrcBWSz)
15682 VecCost =
15683 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15684 TTI::CastContextHint::None, CostKind);
15685 LLVM_DEBUG({
15686 dbgs()
15687 << "SLP: alternate extension, which should be truncated.\n";
15688 E->dump();
15689 });
15690 return VecCost;
15691 }
15692 }
15693 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15694 TTI::CastContextHint::None, CostKind);
15695 VecCost +=
15696 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15697 TTI::CastContextHint::None, CostKind);
15698 }
15699 SmallVector<int> Mask;
15700 E->buildAltOpShuffleMask(
15701 [&](Instruction *I) {
15702 assert(E->getMatchingMainOpOrAltOp(I) &&
15703 "Unexpected main/alternate opcode");
15704 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15705 *TLI);
15706 },
15707 Mask);
15708 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15709 FinalVecTy, Mask, CostKind);
15710 // Patterns like [fadd,fsub] can be combined into a single instruction
15711 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15712 // need to take into account their order when looking for the most used
15713 // order.
15714 unsigned Opcode0 = E->getOpcode();
15715 unsigned Opcode1 = E->getAltOpcode();
15716 SmallBitVector OpcodeMask(
15717 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15718 // If this pattern is supported by the target then we consider the
15719 // order.
15720 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15721 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15722 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15723 return AltVecCost < VecCost ? AltVecCost : VecCost;
15724 }
15725 // TODO: Check the reverse order too.
15726 return VecCost;
15727 };
15728 if (SLPReVec && !E->isAltShuffle())
15729 return GetCostDiff(
15730 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15731 // If a group uses mask in order, the shufflevector can be
15732 // eliminated by instcombine. Then the cost is 0.
15733 assert(isa<ShuffleVectorInst>(VL.front()) &&
15734 "Not supported shufflevector usage.");
15735 auto *SV = cast<ShuffleVectorInst>(VL.front());
15736 unsigned SVNumElements =
15737 cast<FixedVectorType>(SV->getOperand(0)->getType())
15738 ->getNumElements();
15739 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15740 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15741 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15742 int NextIndex = 0;
15743 if (!all_of(Group, [&](Value *V) {
15744 assert(isa<ShuffleVectorInst>(V) &&
15745 "Not supported shufflevector usage.");
15746 auto *SV = cast<ShuffleVectorInst>(V);
15747 int Index;
15748 [[maybe_unused]] bool IsExtractSubvectorMask =
15749 SV->isExtractSubvectorMask(Index);
15750 assert(IsExtractSubvectorMask &&
15751 "Not supported shufflevector usage.");
15752 if (NextIndex != Index)
15753 return false;
15754 NextIndex += SV->getShuffleMask().size();
15755 return true;
15756 }))
15757 return ::getShuffleCost(
15758 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15759 calculateShufflevectorMask(E->Scalars));
15760 }
15761 return TTI::TCC_Free;
15762 });
15763 return GetCostDiff(GetScalarCost, GetVectorCost);
15764 }
15765 case Instruction::Freeze:
15766 return CommonCost;
15767 default:
15768 llvm_unreachable("Unknown instruction");
15769 }
15770}
15771
15772bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15773 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15774 << VectorizableTree.size() << " is fully vectorizable.\n");
15775
15776 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15777 SmallVector<int> Mask;
15778 return TE->isGather() &&
15779 !any_of(TE->Scalars,
15780 [this](Value *V) { return EphValues.contains(V); }) &&
15781 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15782 TE->Scalars.size() < Limit ||
15783 (((TE->hasState() &&
15784 TE->getOpcode() == Instruction::ExtractElement) ||
15785 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15786 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15787 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15788 !TE->isAltShuffle()) ||
15789 any_of(TE->Scalars, IsaPred<LoadInst>));
15790 };
15791
15792 // We only handle trees of heights 1 and 2.
15793 if (VectorizableTree.size() == 1 &&
15794 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15795 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15796 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15797 (ForReduction &&
15798 AreVectorizableGathers(VectorizableTree[0].get(),
15799 VectorizableTree[0]->Scalars.size()) &&
15800 VectorizableTree[0]->getVectorFactor() > 2)))
15801 return true;
15802
15803 if (VectorizableTree.size() != 2)
15804 return false;
15805
15806 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15807 // with the second gather nodes if they have fewer scalar operands than the
15808 // initial tree element (it may be profitable to shuffle the second gather)
15809 // or they are extractelements, which form a shuffle.
15810 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15811 AreVectorizableGathers(VectorizableTree[1].get(),
15812 VectorizableTree[0]->Scalars.size()))
15813 return true;
15814
15815 // Gathering cost would be too much for tiny trees.
15816 if (VectorizableTree[0]->isGather() ||
15817 (VectorizableTree[1]->isGather() &&
15818 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15819 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15820 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15821 return false;
15822
15823 return true;
15824}
15825
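// Illustration of the pattern matched below: a chain of or/shl over zext'ed
// loads, e.g.
//   %z0 = zext i8 %l0 to i32
//   %z1 = zext i8 %l1 to i32
//   %s1 = shl i32 %z1, 8
//   %o1 = or i32 %z0, %s1
// which the backend is usually able to fold into a single wider load, so the
// SLP vectorizer should leave it alone.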
15826static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15827 TargetTransformInfo *TTI,
15828 bool MustMatchOrInst) {
15829 // Look past the root to find a source value. Arbitrarily follow the
15830 // path through operand 0 of any 'or'. Also, peek through optional
15831 // shift-left-by-multiple-of-8-bits.
15832 Value *ZextLoad = Root;
15833 const APInt *ShAmtC;
15834 bool FoundOr = false;
15835 while (!isa<ConstantExpr>(ZextLoad) &&
15836 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15837 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15838 ShAmtC->urem(8) == 0))) {
15839 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15840 ZextLoad = BinOp->getOperand(0);
15841 if (BinOp->getOpcode() == Instruction::Or)
15842 FoundOr = true;
15843 }
15844 // Check if the input is an extended load of the required or/shift expression.
15845 Value *Load;
15846 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15847 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15848 return false;
15849
15850 // Require that the total load bit width is a legal integer type.
15851 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15852 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15853 Type *SrcTy = Load->getType();
15854 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15855 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15856 return false;
15857
15858 // Everything matched - assume that we can fold the whole sequence using
15859 // load combining.
15860 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15861 << *(cast<Instruction>(Root)) << "\n");
15862
15863 return true;
15864}
15865
15866 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15867 if (RdxKind != RecurKind::Or)
15868 return false;
15869
15870 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15871 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15872 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15873 /* MatchOr */ false);
15874}
15875
15876 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15877 // Peek through a final sequence of stores and check if all operations are
15878 // likely to be load-combined.
15879 unsigned NumElts = Stores.size();
15880 for (Value *Scalar : Stores) {
15881 Value *X;
15882 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15883 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15884 return false;
15885 }
15886 return true;
15887}
15888
15889bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15890 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15891 return true;
15892
15893 // Graph is empty - do nothing.
15894 if (VectorizableTree.empty()) {
15895 assert(ExternalUses.empty() && "We shouldn't have any external users");
15896
15897 return true;
15898 }
15899
15900 // No need to vectorize inserts of gathered values.
15901 if (VectorizableTree.size() == 2 &&
15902 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15903 VectorizableTree[1]->isGather() &&
15904 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15905 !(isSplat(VectorizableTree[1]->Scalars) ||
15906 allConstant(VectorizableTree[1]->Scalars))))
15907 return true;
15908
15909 // If the graph includes only PHI nodes and gathers, it is definitely not
15910 // profitable for the vectorization, we can skip it, if the cost threshold is
15911 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15912 // gathers/buildvectors.
15913 constexpr int Limit = 4;
15914 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15915 !VectorizableTree.empty() &&
15916 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15917 return (TE->isGather() &&
15918 (!TE->hasState() ||
15919 TE->getOpcode() != Instruction::ExtractElement) &&
15920 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15921 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15922 }))
15923 return true;
15924
15925 // Do not vectorize small tree of phis only, if all vector phis are also
15926 // gathered.
15927 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15928 VectorizableTree.size() <= Limit &&
15929 all_of(VectorizableTree,
15930 [&](const std::unique_ptr<TreeEntry> &TE) {
15931 return (TE->isGather() &&
15932 (!TE->hasState() ||
15933 TE->getOpcode() != Instruction::ExtractElement) &&
15934 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15935 Limit) ||
15936 (TE->hasState() &&
15937 (TE->getOpcode() == Instruction::InsertElement ||
15938 (TE->getOpcode() == Instruction::PHI &&
15939 all_of(TE->Scalars, [&](Value *V) {
15940 return isa<PoisonValue>(V) || MustGather.contains(V);
15941 }))));
15942 }) &&
15943 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15944 return TE->State == TreeEntry::Vectorize &&
15945 TE->getOpcode() == Instruction::PHI;
15946 }))
15947 return true;
15948
15949 // If the tree contains only phis, buildvectors, split nodes and
15950 // small nodes with reuses, we can skip it.
15951 SmallVector<const TreeEntry *> StoreLoadNodes;
15952 unsigned NumGathers = 0;
15953 constexpr int LimitTreeSize = 36;
15954 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15955 all_of(VectorizableTree,
15956 [&](const std::unique_ptr<TreeEntry> &TE) {
15957 if (!TE->isGather() && TE->hasState() &&
15958 (TE->getOpcode() == Instruction::Load ||
15959 TE->getOpcode() == Instruction::Store)) {
15960 StoreLoadNodes.push_back(TE.get());
15961 return true;
15962 }
15963 if (TE->isGather())
15964 ++NumGathers;
15965 return TE->State == TreeEntry::SplitVectorize ||
15966 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15967 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15968 VectorizableTree.size() > LimitTreeSize) ||
15969 (TE->isGather() &&
15970 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15971 (TE->hasState() &&
15972 (TE->getOpcode() == Instruction::PHI ||
15973 (TE->hasCopyableElements() &&
15974 static_cast<unsigned>(count_if(
15975 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15976 TE->Scalars.size() / 2) ||
15977 ((!TE->ReuseShuffleIndices.empty() ||
15978 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15979 TE->Scalars.size() == 2)));
15980 }) &&
15981 (StoreLoadNodes.empty() ||
15982 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15983 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15984 return TE->getOpcode() == Instruction::Store ||
15985 all_of(TE->Scalars, [&](Value *V) {
15986 return !isa<LoadInst>(V) ||
15987 areAllUsersVectorized(cast<Instruction>(V));
15988 });
15989 })))))
15990 return true;
15991
15992 // If the tree contains only a buildvector, 2 non-buildvector nodes (whose
15993 // user is the root tree node) and other buildvectors, we can skip it.
15994 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15995 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15996 VectorizableTree.size() >= Limit &&
15997 count_if(ArrayRef(VectorizableTree).drop_front(),
15998 [&](const std::unique_ptr<TreeEntry> &TE) {
15999 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
16000 TE->UserTreeIndex.UserTE->Idx == 0;
16001 }) == 2)
16002 return true;
16003
16004 // If the tree contains only vectorization of the phi node from the
16005 // buildvector - skip it.
16006 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16007 VectorizableTree.size() > 2 &&
16008 VectorizableTree.front()->State == TreeEntry::Vectorize &&
16009 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16010 VectorizableTree[1]->State == TreeEntry::Vectorize &&
16011 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
16012 all_of(
16013 ArrayRef(VectorizableTree).drop_front(2),
16014 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
16015 return true;
16016
16017 // We can vectorize the tree if its size is greater than or equal to the
16018 // minimum size specified by the MinTreeSize command line option.
16019 if (VectorizableTree.size() >= MinTreeSize)
16020 return false;
16021
16022 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
16023 // can vectorize it if we can prove it fully vectorizable.
16024 if (isFullyVectorizableTinyTree(ForReduction))
16025 return false;
16026
16027 // Check if any of the gather node forms an insertelement buildvector
16028 // somewhere.
16029 bool IsAllowedSingleBVNode =
16030 VectorizableTree.size() > 1 ||
16031 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16032 !VectorizableTree.front()->isAltShuffle() &&
16033 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16034 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16035 allSameBlock(VectorizableTree.front()->Scalars));
16036 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16037 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
16038 return isa<ExtractElementInst, Constant>(V) ||
16039 (IsAllowedSingleBVNode &&
16040 !V->hasNUsesOrMore(UsesLimit) &&
16041 any_of(V->users(), IsaPred<InsertElementInst>));
16042 });
16043 }))
16044 return false;
16045
16046 if (VectorizableTree.back()->isGather() &&
16047 VectorizableTree.back()->hasState() &&
16048 VectorizableTree.back()->isAltShuffle() &&
16049 VectorizableTree.back()->getVectorFactor() > 2 &&
16050 allSameBlock(VectorizableTree.back()->Scalars) &&
16051 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16052 TTI->getScalarizationOverhead(
16053 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
16054 VectorizableTree.back()->getVectorFactor()),
16055 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
16056 /*Insert=*/true, /*Extract=*/false,
16057 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
16058 return false;
16059
16060 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
16061 // vectorizable.
16062 return true;
16063}
16064
16065 bool BoUpSLP::isTreeNotExtendable() const {
16066 if (getCanonicalGraphSize() != getTreeSize()) {
16067 constexpr unsigned SmallTree = 3;
16068 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16069 getCanonicalGraphSize() <= SmallTree &&
16070 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
16071 [](const std::unique_ptr<TreeEntry> &TE) {
16072 return TE->isGather() && TE->hasState() &&
16073 TE->getOpcode() == Instruction::Load &&
16074 !allSameBlock(TE->Scalars);
16075 }) == 1)
16076 return true;
16077 return false;
16078 }
16079 bool Res = false;
16080 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
16081 TreeEntry &E = *VectorizableTree[Idx];
16082 if (E.State == TreeEntry::SplitVectorize)
16083 return false;
16084 if (!E.isGather())
16085 continue;
16086 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16087 (!E.hasState() &&
16089 (isa<ExtractElementInst>(E.Scalars.front()) &&
16090 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
16091 return false;
16092 if (isSplat(E.Scalars) || allConstant(E.Scalars))
16093 continue;
16094 Res = true;
16095 }
16096 return Res;
16097}
16098
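// Spill-cost model: for every vectorized operand that stays live across a
// non-vectorized call (either within one block or along the predecessor
// paths between blocks), add the target's cost of keeping that vector value
// live over a call, e.g. a spill/reload of a <4 x i32> register.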
16099 InstructionCost BoUpSLP::getSpillCost() {
16100 // Walk from the bottom of the tree to the top, tracking which values are
16101 // live. When we see a call instruction that is not part of our tree,
16102 // query TTI to see if there is a cost to keeping values live over it
16103 // (for example, if spills and fills are required).
16104
16105 const TreeEntry *Root = VectorizableTree.front().get();
16106 if (Root->isGather())
16107 return 0;
16108
16109 InstructionCost Cost = 0;
16110 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
16111 EntriesToOperands;
16112 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
16113 SmallPtrSet<const Instruction *, 8> LastInstructions;
16114 for (const auto &TEPtr : VectorizableTree) {
16115 if (!TEPtr->isGather()) {
16116 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
16117 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
16118 LastInstructions.insert(LastInst);
16119 }
16120 if (TEPtr->UserTreeIndex)
16121 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
16122 }
16123
16124 auto NoCallIntrinsic = [this](const Instruction *I) {
16125 const auto *II = dyn_cast<IntrinsicInst>(I);
16126 if (!II)
16127 return false;
16128 if (II->isAssumeLikeIntrinsic())
16129 return true;
16130 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
16131 InstructionCost IntrCost =
16132 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
16133 InstructionCost CallCost = TTI->getCallInstrCost(
16134 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
16135 return IntrCost < CallCost;
16136 };
16137
16138 // Maps the last instruction of an entry to the last instruction of one of
16139 // its operand entries and a flag. If the flag is true, there are no calls
16140 // in between these instructions.
16141 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
16142 CheckedInstructions;
16143 unsigned Budget = 0;
16144 const unsigned BudgetLimit =
16145 ScheduleRegionSizeBudget / VectorizableTree.size();
16146 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
16147 const Instruction *Last) {
16148 assert(First->getParent() == Last->getParent() &&
16149 "Expected instructions in same block.");
16150 if (auto It = CheckedInstructions.find(Last);
16151 It != CheckedInstructions.end()) {
16152 const Instruction *Checked = It->second.getPointer();
16153 if (Checked == First || Checked->comesBefore(First))
16154 return It->second.getInt() != 0;
16155 Last = Checked;
16156 } else if (Last == First || Last->comesBefore(First)) {
16157 return true;
16158 }
16159 BasicBlock::reverse_iterator InstIt =
16160 ++First->getIterator().getReverse(),
16161 PrevInstIt =
16162 Last->getIterator().getReverse();
16163 SmallVector<const Instruction *> LastInstsInRange;
16164 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
16165 // Debug information does not impact spill cost.
16166 // Vectorized calls, represented as vector intrinsics, do not impact spill
16167 // cost.
16168 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
16169 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
16170 for (const Instruction *LastInst : LastInstsInRange)
16171 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
16172 return false;
16173 }
16174 if (LastInstructions.contains(&*PrevInstIt))
16175 LastInstsInRange.push_back(&*PrevInstIt);
16176
16177 ++PrevInstIt;
16178 ++Budget;
16179 }
16180 for (const Instruction *LastInst : LastInstsInRange)
16181 CheckedInstructions.try_emplace(
16182 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
16183 Budget <= BudgetLimit ? 1 : 0);
16184 return Budget <= BudgetLimit;
16185 };
16186 auto AddCosts = [&](const TreeEntry *Op) {
16187 Type *ScalarTy = Op->Scalars.front()->getType();
16188 auto It = MinBWs.find(Op);
16189 if (It != MinBWs.end())
16190 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
16191 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
16192 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
16193 if (ScalarTy->isVectorTy()) {
16194 // Handle revec dead vector instructions.
16195 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
16196 }
16197 };
16198 // Memoize the relationship between blocks, i.e. whether there is (at least
16199 // one) non-vectorized call between the blocks. This allows skipping the
16200 // analysis of the same block paths multiple times.
16201 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
16202 ParentOpParentToPreds;
16203 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
16204 BasicBlock *OpParent) {
16205 auto Key = std::make_pair(Root, OpParent);
16206 if (auto It = ParentOpParentToPreds.find(Key);
16207 It != ParentOpParentToPreds.end())
16208 return It->second;
16209 SmallVector<BasicBlock *> Worklist;
16210 if (Pred)
16211 Worklist.push_back(Pred);
16212 else
16213 Worklist.append(pred_begin(Root), pred_end(Root));
16214 SmallPtrSet<const BasicBlock *, 16> Visited;
16215 SmallSet<std::pair<BasicBlock *, BasicBlock *>, 4>
16216 ParentsPairsToAdd;
16217 bool Res = false;
16219 for (const auto &KeyPair : ParentsPairsToAdd) {
16220 assert(!ParentOpParentToPreds.contains(KeyPair) &&
16221 "Should not have been added before.");
16222 ParentOpParentToPreds.try_emplace(KeyPair, Res);
16223 }
16224 });
16225 while (!Worklist.empty()) {
16226 BasicBlock *BB = Worklist.pop_back_val();
16227 if (BB == OpParent || !Visited.insert(BB).second)
16228 continue;
16229 auto Pair = std::make_pair(BB, OpParent);
16230 if (auto It = ParentOpParentToPreds.find(Pair);
16231 It != ParentOpParentToPreds.end()) {
16232 Res = It->second;
16233 return Res;
16234 }
16235 ParentsPairsToAdd.insert(Pair);
16236 unsigned BlockSize = BB->size();
16237 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
16238 return Res;
16239 Budget += BlockSize;
16240 if (Budget > BudgetLimit)
16241 return Res;
16242 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
16243 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
16244 BB->getTerminator()))
16245 return Res;
16246 Worklist.append(pred_begin(BB), pred_end(BB));
16247 }
16248 Res = true;
16249 return Res;
16250 };
16251 SmallVector<const TreeEntry *> LiveEntries(1, Root);
16252 while (!LiveEntries.empty()) {
16253 const TreeEntry *Entry = LiveEntries.pop_back_val();
16254 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
16255 if (Operands.empty())
16256 continue;
16257 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
16258 BasicBlock *Parent = LastInst->getParent();
16259 for (const TreeEntry *Op : Operands) {
16260 if (!Op->isGather())
16261 LiveEntries.push_back(Op);
16262 if (Entry->State == TreeEntry::SplitVectorize ||
16263 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
16264 (Op->isGather() && allConstant(Op->Scalars)))
16265 continue;
16266 Budget = 0;
16267 BasicBlock *Pred = nullptr;
16268 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
16269 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16270 BasicBlock *OpParent;
16271 Instruction *OpLastInst;
16272 if (Op->isGather()) {
16273 assert(Entry->getOpcode() == Instruction::PHI &&
16274 "Expected phi node only.");
16275 OpParent = cast<PHINode>(Entry->getMainOp())
16276 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16277 OpLastInst = OpParent->getTerminator();
16278 for (Value *V : Op->Scalars) {
16279 auto *Inst = dyn_cast<Instruction>(V);
16280 if (!Inst)
16281 continue;
16282 if (isVectorized(V)) {
16283 OpParent = Inst->getParent();
16284 OpLastInst = Inst;
16285 break;
16286 }
16287 }
16288 } else {
16289 OpLastInst = EntriesToLastInstruction.at(Op);
16290 OpParent = OpLastInst->getParent();
16291 }
16292 // Check the call instructions within the same basic blocks.
16293 if (OpParent == Parent) {
16294 if (Entry->getOpcode() == Instruction::PHI) {
16295 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16296 AddCosts(Op);
16297 continue;
16298 }
16299 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16300 AddCosts(Op);
16301 continue;
16302 }
16303 // Check for call instruction in between blocks.
16304 // 1. Check entry's block to the head.
16305 if (Entry->getOpcode() != Instruction::PHI &&
16306 !CheckForNonVecCallsInSameBlock(
16307 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
16308 LastInst)) {
16309 AddCosts(Op);
16310 continue;
16311 }
16312 // 2. Check op's block from the end.
16313 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16314 OpParent->getTerminator())) {
16315 AddCosts(Op);
16316 continue;
16317 }
16318 // 3. Check the predecessors of entry's block till op's block.
16319 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16320 AddCosts(Op);
16321 continue;
16322 }
16323 }
16324 }
16325
16326 return Cost;
16327}
16328
16329 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
16330 /// the buildvector sequence.
16332 const InsertElementInst *IE2) {
16333 if (IE1 == IE2)
16334 return false;
16335 const auto *I1 = IE1;
16336 const auto *I2 = IE2;
16337 const InsertElementInst *PrevI1;
16338 const InsertElementInst *PrevI2;
16339 unsigned Idx1 = *getElementIndex(IE1);
16340 unsigned Idx2 = *getElementIndex(IE2);
16341 do {
16342 if (I2 == IE1)
16343 return true;
16344 if (I1 == IE2)
16345 return false;
16346 PrevI1 = I1;
16347 PrevI2 = I2;
16348 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16349 getElementIndex(I1).value_or(Idx2) != Idx2)
16350 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
16351 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16352 getElementIndex(I2).value_or(Idx1) != Idx1)
16353 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
16354 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16355 llvm_unreachable("Two different buildvectors not expected.");
16356}
16357
16358namespace {
16359/// Returns the incoming Value * if the requested type is Value * too, or a
16360/// default value otherwise.
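// Editorial note (not part of the original source): ValueSelect::get<Value *>(V)
// forwards V unchanged, while e.g. ValueSelect::get<const TreeEntry *>(V) ignores
// V and returns a default-constructed value (nullptr). This lets the templated
// shuffle-action helper below treat the base vector uniformly whether it operates
// on IR values or on cost-model entities.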
16361struct ValueSelect {
16362 template <typename U>
16363 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16364 return V;
16365 }
16366 template <typename U>
16367 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16368 return U();
16369 }
16370};
16371} // namespace
16372
16373/// Does the analysis of the provided shuffle masks and performs the requested
16374/// actions on the vectors with the given shuffle masks. It tries to do it in
16375/// several steps.
16376/// 1. If the Base vector is not an undef vector, resize the very first mask to
16377/// have a common VF and perform the action for 2 input vectors (including the
16378/// non-undef Base). Other shuffle masks are combined with the result of the
16379/// first stage and processed as a shuffle of 2 elements.
16380/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
16381/// the action only for 1 vector with the given mask, if it is not the identity
16382/// mask.
16383/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
16384/// vectors, combining the masks properly between the steps.
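/// Illustrative example (editorial sketch, not part of the original source):
/// with an undef Base and two inputs of equal VF 4, masks {0,P,2,P} for the first
/// vector and {P,1,P,3} for the second (P = poison), the masks are merged into
/// the single two-source mask {0,5,2,7} (second-source lanes are offset by
/// Vec1VF = 4) and Action is invoked once for both vectors.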
16385template <typename T>
16386static T *performExtractsShuffleAction(
16387 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16388 function_ref<unsigned(T *)> GetVF,
16389 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16390 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16391 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16392 SmallVector<int> Mask(ShuffleMask.begin()->second);
16393 auto VMIt = std::next(ShuffleMask.begin());
16394 T *Prev = nullptr;
16395 SmallBitVector UseMask =
16396 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16397 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
16398 if (!IsBaseUndef.all()) {
16399 // Base is not undef, need to combine it with the next subvectors.
16400 std::pair<T *, bool> Res =
16401 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16402 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
16403 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16404 if (Mask[Idx] == PoisonMaskElem)
16405 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16406 else
16407 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16408 }
16409 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16410 assert((!V || GetVF(V) == Mask.size()) &&
16411 "Expected base vector of VF number of elements.");
16412 Prev = Action(Mask, {nullptr, Res.first});
16413 } else if (ShuffleMask.size() == 1) {
16414 // Base is undef and only 1 vector is shuffled - perform the action only for
16415 // single vector, if the mask is not the identity mask.
16416 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16417 /*ForSingleMask=*/true);
16418 if (Res.second)
16419 // Identity mask is found.
16420 Prev = Res.first;
16421 else
16422 Prev = Action(Mask, {ShuffleMask.begin()->first});
16423 } else {
16424 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
16425 // shuffles step by step, combining shuffle between the steps.
16426 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16427 unsigned Vec2VF = GetVF(VMIt->first);
16428 if (Vec1VF == Vec2VF) {
16429 // No need to resize the input vectors since they are of the same size, we
16430 // can shuffle them directly.
16431 ArrayRef<int> SecMask = VMIt->second;
16432 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16433 if (SecMask[I] != PoisonMaskElem) {
16434 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16435 Mask[I] = SecMask[I] + Vec1VF;
16436 }
16437 }
16438 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16439 } else {
16440 // Vectors of different sizes - resize and reshuffle.
16441 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16442 /*ForSingleMask=*/false);
16443 std::pair<T *, bool> Res2 =
16444 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16445 ArrayRef<int> SecMask = VMIt->second;
16446 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16447 if (Mask[I] != PoisonMaskElem) {
16448 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16449 if (Res1.second)
16450 Mask[I] = I;
16451 } else if (SecMask[I] != PoisonMaskElem) {
16452 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16453 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16454 }
16455 }
16456 Prev = Action(Mask, {Res1.first, Res2.first});
16457 }
16458 VMIt = std::next(VMIt);
16459 }
16460 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16461 // Perform requested actions for the remaining masks/vectors.
16462 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16463 // Shuffle other input vectors, if any.
16464 std::pair<T *, bool> Res =
16465 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16466 ArrayRef<int> SecMask = VMIt->second;
16467 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16468 if (SecMask[I] != PoisonMaskElem) {
16469 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16470 "Multiple uses of scalars.");
16471 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16472 } else if (Mask[I] != PoisonMaskElem) {
16473 Mask[I] = I;
16474 }
16475 }
16476 Prev = Action(Mask, {Prev, Res.first});
16477 }
16478 return Prev;
16479}
16480
16482 ArrayRef<Value *> VectorizedVals) {
16484 SmallPtrSet<Value *, 4> CheckedExtracts;
16485 SmallPtrSet<const TreeEntry *, 4> GatheredLoadsNodes;
16486 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16487 << VectorizableTree.size() << ".\n");
16488 InstructionCost Cost = 0;
16489 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16490 TreeEntry &TE = *Ptr;
16491 // No need to count the cost for combined entries, they are combined and
16492 // just skip their cost.
16493 if (TE.State == TreeEntry::CombinedVectorize) {
16494 LLVM_DEBUG(
16495 dbgs() << "SLP: Skipping cost for combined node that starts with "
16496 << *TE.Scalars[0] << ".\n";
16497 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16498 NodesCosts.try_emplace(&TE);
16499 continue;
16500 }
16501 if (TE.hasState() &&
16502 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16503 if (const TreeEntry *E =
16504 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16505 E && E->getVectorFactor() == TE.getVectorFactor()) {
16506 // Some gather nodes might be absolutely the same as some vectorizable
16507 // nodes after reordering, need to handle it.
16508 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16509 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16510 << "SLP: Current total cost = " << Cost << "\n");
16511 NodesCosts.try_emplace(&TE);
16512 continue;
16513 }
16514 }
16515
16516 // Exclude cost of gather loads nodes which are not used. These nodes were
16517 // built as part of the final attempt to vectorize gathered loads.
16518 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16519 "Expected gather nodes with users only.");
16520
16521 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16522 Cost += C;
16523 NodesCosts.try_emplace(&TE, C);
16524 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16525 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16526 << "SLP: Current total cost = " << Cost << "\n");
16527 // Add gathered loads nodes to the set for later processing.
16528 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
16529 TE.getOpcode() == Instruction::Load)
16530 GatheredLoadsNodes.insert(&TE);
16531 }
16532 // Bail out if the cost threshold is negative and cost already below it.
16533 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
16534 Cost < -SLPCostThreshold)
16535 return Cost;
16536 // Bail out, if gathered loads nodes are found.
16537 // TODO: add analysis for gathered load to include their cost correctly into
16538 // the related subtrees.
16539 if (!GatheredLoadsNodes.empty())
16540 return Cost;
16541 // A narrow, non-profitable tree in a loop? Skip it, it may cause regressions.
16542 constexpr unsigned PartLimit = 2;
16543 const unsigned Sz =
16544 getVectorElementSize(VectorizableTree.front()->Scalars.front());
16545 const unsigned MinVF = getMinVF(Sz);
16546 if (Cost >= -SLPCostThreshold &&
16547 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
16548 (!VectorizableTree.front()->hasState() ||
16549 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
16550 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
16551 return Cost;
16552 SmallVector<std::pair<InstructionCost, SmallVector<unsigned>>> SubtreeCosts(
16553 VectorizableTree.size());
16554 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16555 TreeEntry &TE = *Ptr;
16556 InstructionCost C = NodesCosts.at(&TE);
16557 SubtreeCosts[TE.Idx].first += C;
16558 const TreeEntry *UserTE = TE.UserTreeIndex.UserTE;
16559 while (UserTE) {
16560 SubtreeCosts[UserTE->Idx].first += C;
16561 SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx);
16562 UserTE = UserTE->UserTreeIndex.UserTE;
16563 }
16564 }
16565 using CostIndicesTy =
16566 std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
16567 struct FirstGreater {
16568 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
16569 return LHS.second.first < RHS.second.first ||
16570 (LHS.second.first == RHS.second.first &&
16571 LHS.first->Idx < RHS.first->Idx);
16572 }
16573 };
16574 std::priority_queue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
16575 Worklist;
16576 for (const auto [Idx, P] : enumerate(SubtreeCosts))
16577 Worklist.emplace(VectorizableTree[Idx].get(), P);
16578
16579 // Narrow store trees with non-profitable immediate values - exit.
16580 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
16581 VectorizableTree.front()->hasState() &&
16582 VectorizableTree.front()->getOpcode() == Instruction::Store &&
16583 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
16584 return Cost;
16585
16586 bool Changed = false;
16587 while (!Worklist.empty() && Worklist.top().second.first > 0) {
16588 TreeEntry *TE = Worklist.top().first;
16589 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
16590 // Exit early if the parent node is split node and any of scalars is
16591 // used in other split nodes.
16592 (TE->UserTreeIndex &&
16593 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
16594 any_of(TE->Scalars, [&](Value *V) {
16595 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
16596 return Entries.size() > 1;
16597 }))) {
16598 Worklist.pop();
16599 continue;
16600 }
16601
16602 // Calculate the gather cost of the root node.
16603 InstructionCost SubtreeCost = Worklist.top().second.first;
16604 if (SubtreeCost < TE->Scalars.size()) {
16605 Worklist.pop();
16606 continue;
16607 }
16608 if (!TransformedToGatherNodes.empty()) {
16609 for (unsigned Idx : Worklist.top().second.second) {
16610 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
16611 if (It != TransformedToGatherNodes.end()) {
16612 SubtreeCost -= SubtreeCosts[Idx].first;
16613 SubtreeCost += It->second;
16614 }
16615 }
16616 }
16617 if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
16618 Worklist.pop();
16619 continue;
16620 }
16621 const unsigned Sz = TE->Scalars.size();
16622 APInt DemandedElts = APInt::getAllOnes(Sz);
16623 for (auto [Idx, V] : enumerate(TE->Scalars)) {
16624 if (isConstant(V))
16625 DemandedElts.clearBit(Idx);
16626 }
16628
16629 Type *ScalarTy = getValueType(TE->Scalars.front());
16630 auto *VecTy = getWidenedType(ScalarTy, Sz);
16631 const unsigned EntryVF = TE->getVectorFactor();
16632 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
16633 InstructionCost GatherCost = getScalarizationOverhead(
16634 *TTI, ScalarTy, VecTy, DemandedElts,
16635 /*Insert=*/true, /*Extract=*/false, CostKind);
16636 SmallVector<int> Mask;
16637 if (!TE->ReorderIndices.empty() &&
16638 TE->State != TreeEntry::CompressVectorize &&
16639 (TE->State != TreeEntry::StridedVectorize ||
16640 !isReverseOrder(TE->ReorderIndices))) {
16641 SmallVector<int> NewMask;
16642 if (TE->getOpcode() == Instruction::Store) {
16643 // For stores the order is actually a mask.
16644 NewMask.resize(TE->ReorderIndices.size());
16645 copy(TE->ReorderIndices, NewMask.begin());
16646 } else {
16647 inversePermutation(TE->ReorderIndices, NewMask);
16648 }
16649 ::addMask(Mask, NewMask);
16650 }
16651 if (!TE->ReuseShuffleIndices.empty())
16652 ::addMask(Mask, TE->ReuseShuffleIndices);
16653 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
16654 GatherCost +=
16655 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
16656 // If all scalars are reused in gather node(s) or other vector nodes, there
16657 // might be extra cost for inserting them.
16658 if (all_of(TE->Scalars, [&](Value *V) {
16659 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
16660 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
16661 }))
16662 GatherCost *= 2;
16663 // Erase subtree if it is non-profitable.
16664 if (SubtreeCost > GatherCost) {
16665 // If the remaining tree is just a buildvector - exit, it will cause
16666 // endless attempts to vectorize.
16667 if (VectorizableTree.front()->hasState() &&
16668 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16669 TE->Idx == 1)
16671
16672 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
16673 << TE->Idx << " with cost "
16674 << Worklist.top().second.first << " and gather cost "
16675 << GatherCost << ".\n");
16676 if (TE->UserTreeIndex) {
16677 TransformedToGatherNodes.try_emplace(TE, GatherCost);
16678 NodesCosts.erase(TE);
16679 } else {
16680 DeletedNodes.insert(TE);
16681 TransformedToGatherNodes.erase(TE);
16682 NodesCosts.erase(TE);
16683 }
16684 for (unsigned Idx : Worklist.top().second.second) {
16685 TreeEntry &ChildTE = *VectorizableTree[Idx];
16686 DeletedNodes.insert(&ChildTE);
16687 TransformedToGatherNodes.erase(&ChildTE);
16688 NodesCosts.erase(&ChildTE);
16689 }
16690 Changed = true;
16691 }
16692 Worklist.pop();
16693 }
16694 if (!Changed)
16695 return SubtreeCosts.front().first;
16696
16697 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16698 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
16699 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
16700 continue;
16701 }
16702 if (DeletedNodes.contains(TE.get()))
16703 continue;
16704 if (!NodesCosts.contains(TE.get())) {
16705 InstructionCost C =
16706 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
16707 NodesCosts.try_emplace(TE.get(), C);
16708 }
16709 }
16710
16711 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
16712 InstructionCost NewCost = 0;
16713 for (const auto &P : NodesCosts) {
16714 NewCost += P.second;
16715 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
16716 << shortBundleName(P.first->Scalars, P.first->Idx)
16717 << ".\n"
16718 << "SLP: Current total cost = " << Cost << "\n");
16719 }
16720 if (NewCost >= Cost) {
16721 DeletedNodes.clear();
16722 TransformedToGatherNodes.clear();
16723 NewCost = Cost;
16724 }
16725 return NewCost;
16726}
16727
16728namespace {
16729/// Data type for handling buildvector sequences with the reused scalars from
16730/// other tree entries.
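// Editorial sketch (not part of the original source): for a 4-wide buildvector
// whose lanes 0-1 are supplied by tree entry E1 and lanes 2-3 by entry E2,
// InsertElements records the insertelement instructions of the sequence and
// ValueMasks maps E1 -> {0,1,P,P} and E2 -> {P,P,0,1} (P = poison lane), i.e.
// each mask element holds the lane of the source entry used at that position.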
16731template <typename T> struct ShuffledInsertData {
16732 /// List of insertelements to be replaced by shuffles.
16733 SmallVector<InsertElementInst *> InsertElements;
16734 /// The parent vectors and shuffle mask for the given list of inserts.
16735 MapVector<T, SmallVector<int>> ValueMasks;
16736};
16737} // namespace
16738
16740 ArrayRef<Value *> VectorizedVals,
16741 InstructionCost ReductionCost) {
16742 InstructionCost Cost = TreeCost + ReductionCost;
16743
16744 if (Cost >= -SLPCostThreshold &&
16745 none_of(ExternalUses, [](const ExternalUser &EU) {
16746 return isa_and_nonnull<InsertElementInst>(EU.User);
16747 }))
16748 return Cost;
16749
16750 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16751 InstructionCost ExtractCost = 0;
16752 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16753 SmallVector<APInt> DemandedElts;
16754 SmallDenseSet<Value *, 4> UsedInserts;
16755 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
16756 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16757 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
16758 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16759 // Keep track {Scalar, Index, User} tuple.
16760 // On AArch64, this helps in fusing a mov instruction, associated with
16761 // extractelement, with fmul in the backend so that extractelement is free.
16762 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
16763 for (ExternalUser &EU : ExternalUses) {
16764 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16765 }
16766 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16767 for (ExternalUser &EU : ExternalUses) {
16768 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16769 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16770 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16771 else dbgs() << " User: nullptr\n");
16772 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16773
16774 // Uses by ephemeral values are free (because the ephemeral value will be
16775 // removed prior to code generation, and so the extraction will be
16776 // removed as well).
16777 if (EphValues.count(EU.User))
16778 continue;
16779
16780 // Check if the scalar for the given user or all users is accounted already.
16781 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16782 (EU.User &&
16783 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16784 continue;
16785
16786 // The user is in an unreachable block or in an EH pad (rarely executed), or
16787 // its block is terminated with an unreachable instruction.
16788 if (BasicBlock *UserParent =
16789 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16790 UserParent &&
16791 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16792 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16793 continue;
16794
16795 // We only add extract cost once for the same scalar.
16796 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16797 !ExtractCostCalculated.insert(EU.Scalar).second)
16798 continue;
16799
16800 // No extract cost for vector "scalar" if REVEC is disabled
16801 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16802 continue;
16803
16804 // If found user is an insertelement, do not calculate extract cost but try
16805 // to detect it as a final shuffled/identity match.
16806 // TODO: what if a user is insertvalue when REVEC is enabled?
16807 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16808 VU && VU->getOperand(1) == EU.Scalar) {
16809 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16810 if (!UsedInserts.insert(VU).second)
16811 continue;
16812 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16813 if (InsertIdx) {
16814 const TreeEntry *ScalarTE = &EU.E;
16815 auto *It = find_if(
16816 ShuffledInserts,
16817 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16818 // Checks if 2 insertelements are from the same buildvector.
16819 InsertElementInst *VecInsert = Data.InsertElements.front();
16821 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16822 Value *Op0 = II->getOperand(0);
16823 if (isVectorized(II) && !isVectorized(Op0))
16824 return nullptr;
16825 return Op0;
16826 });
16827 });
16828 int VecId = -1;
16829 if (It == ShuffledInserts.end()) {
16830 auto &Data = ShuffledInserts.emplace_back();
16831 Data.InsertElements.emplace_back(VU);
16832 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16833 VecId = ShuffledInserts.size() - 1;
16834 auto It = MinBWs.find(ScalarTE);
16835 if (It != MinBWs.end() &&
16836 VectorCasts
16837 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16838 .second) {
16839 unsigned BWSz = It->second.first;
16840 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16841 unsigned VecOpcode;
16842 if (DstBWSz < BWSz)
16843 VecOpcode = Instruction::Trunc;
16844 else
16845 VecOpcode =
16846 It->second.second ? Instruction::SExt : Instruction::ZExt;
16848 InstructionCost C = TTI->getCastInstrCost(
16849 VecOpcode, FTy,
16850 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16851 FTy->getNumElements()),
16853 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16854 << " for extending externally used vector with "
16855 "non-equal minimum bitwidth.\n");
16856 Cost += C;
16857 }
16858 } else {
16859 if (isFirstInsertElement(VU, It->InsertElements.front()))
16860 It->InsertElements.front() = VU;
16861 VecId = std::distance(ShuffledInserts.begin(), It);
16862 }
16863 int InIdx = *InsertIdx;
16864 SmallVectorImpl<int> &Mask =
16865 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16866 if (Mask.empty())
16867 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16868 Mask[InIdx] = EU.Lane;
16869 DemandedElts[VecId].setBit(InIdx);
16870 continue;
16871 }
16872 }
16873 }
16874
16876 // If we plan to rewrite the tree in a smaller type, we will need to sign
16877 // extend the extracted value back to the original type. Here, we account
16878 // for the extract and the added cost of the sign extend if needed.
16879 InstructionCost ExtraCost = TTI::TCC_Free;
16880 auto *ScalarTy = EU.Scalar->getType();
16881 const unsigned BundleWidth = EU.E.getVectorFactor();
16882 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16883 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16884 const TreeEntry *Entry = &EU.E;
16885 auto It = MinBWs.find(Entry);
16886 if (It != MinBWs.end()) {
16887 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16888 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16889 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16890 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16891 ? Instruction::ZExt
16892 : Instruction::SExt;
16893 VecTy = getWidenedType(MinTy, BundleWidth);
16894 ExtraCost =
16895 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16896 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16897 << ExtraCost << "\n");
16898 } else {
16899 ExtraCost =
16900 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16901 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16902 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16903 << *VecTy << ": " << ExtraCost << "\n");
16904 }
16905 // Leave the scalar instructions as is if they are cheaper than extracts.
16906 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16907 Entry->getOpcode() == Instruction::Load) {
16908 // Checks if the user of the external scalar is phi in loop body.
16909 auto IsPhiInLoop = [&](const ExternalUser &U) {
16910 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16911 auto *I = cast<Instruction>(U.Scalar);
16912 const Loop *L = LI->getLoopFor(Phi->getParent());
16913 return L && (Phi->getParent() == I->getParent() ||
16914 L == LI->getLoopFor(I->getParent()));
16915 }
16916 return false;
16917 };
16918 if (!ValueToExtUses) {
16919 ValueToExtUses.emplace();
16920 for (const auto &P : enumerate(ExternalUses)) {
16921 // Ignore phis in loops.
16922 if (IsPhiInLoop(P.value()))
16923 continue;
16924
16925 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16926 }
16927 }
16928 // Can use original instruction, if no operands vectorized or they are
16929 // marked as externally used already.
16930 auto *Inst = cast<Instruction>(EU.Scalar);
16931 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16932 auto OperandIsScalar = [&](Value *V) {
16933 if (!isVectorized(V)) {
16934 // Some extractelements might be not vectorized, but
16935 // transformed into shuffle and removed from the function,
16936 // consider it here.
16937 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16938 return !EE->hasOneUse() || !MustGather.contains(EE);
16939 return true;
16940 }
16941 return ValueToExtUses->contains(V);
16942 };
16943 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16944 bool CanBeUsedAsScalarCast = false;
16945 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16946 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16947 Op && all_of(Op->operands(), OperandIsScalar)) {
16948 InstructionCost OpCost =
16949 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16950 ? TTI->getInstructionCost(Op, CostKind)
16951 : 0;
16952 if (ScalarCost + OpCost <= ExtraCost) {
16953 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16954 ScalarCost += OpCost;
16955 }
16956 }
16957 }
16958 if (CanBeUsedAsScalar) {
16959 bool KeepScalar = ScalarCost <= ExtraCost;
16960 // Try to keep the original scalar if the user is a phi node from the same
16961 // block as the root phis currently being vectorized. This keeps better
16962 // ordering info for the PHIs being vectorized.
16963 bool IsProfitablePHIUser =
16964 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16965 VectorizableTree.front()->Scalars.size() > 2)) &&
16966 VectorizableTree.front()->hasState() &&
16967 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16968 !Inst->hasNUsesOrMore(UsesLimit) &&
16969 none_of(Inst->users(),
16970 [&](User *U) {
16971 auto *PHIUser = dyn_cast<PHINode>(U);
16972 return (!PHIUser ||
16973 PHIUser->getParent() !=
16974 cast<Instruction>(
16975 VectorizableTree.front()->getMainOp())
16976 ->getParent()) &&
16977 !isVectorized(U);
16978 }) &&
16979 count_if(Entry->Scalars, [&](Value *V) {
16980 return ValueToExtUses->contains(V);
16981 }) <= 2;
16982 if (IsProfitablePHIUser) {
16983 KeepScalar = true;
16984 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16985 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16986 (!GatheredLoadsEntriesFirst.has_value() ||
16987 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16988 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16989 return ValueToExtUses->contains(V);
16990 });
16991 auto It = ExtractsCount.find(Entry);
16992 if (It != ExtractsCount.end()) {
16993 assert(ScalarUsesCount >= It->getSecond().size() &&
16994 "Expected total number of external uses not less than "
16995 "number of scalar uses.");
16996 ScalarUsesCount -= It->getSecond().size();
16997 }
16998 // Keep the original scalar if the number of externally used instructions in
16999 // the same entry is not a power of 2. It may help to do some extra
17000 // vectorization for now.
17001 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
17002 }
17003 if (KeepScalar) {
17004 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
17005 for (Value *V : Inst->operands()) {
17006 auto It = ValueToExtUses->find(V);
17007 if (It != ValueToExtUses->end()) {
17008 // Replace all uses to avoid compiler crash.
17009 ExternalUses[It->second].User = nullptr;
17010 }
17011 }
17012 ExtraCost = ScalarCost;
17013 if (!IsPhiInLoop(EU))
17014 ExtractsCount[Entry].insert(Inst);
17015 if (CanBeUsedAsScalarCast) {
17016 ScalarOpsFromCasts.insert(Inst->getOperand(0));
17017 // Update the users of the operands of the cast operand to avoid
17018 // compiler crash.
17019 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
17020 for (Value *V : IOp->operands()) {
17021 auto It = ValueToExtUses->find(V);
17022 if (It != ValueToExtUses->end()) {
17023 // Replace all uses to avoid compiler crash.
17024 ExternalUses[It->second].User = nullptr;
17025 }
17026 }
17027 }
17028 }
17029 }
17030 }
17031 }
17032
17033 ExtractCost += ExtraCost;
17034 }
17035 // Insert externals for extract of operands of casts to be emitted as scalars
17036 // instead of extractelement.
17037 for (Value *V : ScalarOpsFromCasts) {
17038 ExternalUsesAsOriginalScalar.insert(V);
17039 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
17040 const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
17041 return TransformedToGatherNodes.contains(TE) ||
17042 DeletedNodes.contains(TE);
17043 });
17044 if (It != TEs.end()) {
17045 const TreeEntry *UserTE = *It;
17046 ExternalUses.emplace_back(V, nullptr, *UserTE,
17047 UserTE->findLaneForValue(V));
17048 }
17049 }
17050 }
17051 // Add reduced value cost, if resized.
17052 if (!VectorizedVals.empty()) {
17053 const TreeEntry &Root = *VectorizableTree.front();
17054 auto BWIt = MinBWs.find(&Root);
17055 if (BWIt != MinBWs.end()) {
17056 Type *DstTy = Root.Scalars.front()->getType();
17057 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
17058 unsigned SrcSz =
17059 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17060 if (OriginalSz != SrcSz) {
17061 unsigned Opcode = Instruction::Trunc;
17062 if (OriginalSz > SrcSz)
17063 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17064 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
17065 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
17066 assert(SLPReVec && "Only supported by REVEC.");
17067 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
17068 }
17069 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
17072 }
17073 }
17074 }
17075
17076 // Buildvector with externally used scalars, which should remain as scalars,
17077 // should not be vectorized, the compiler may hang.
17078 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
17079 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
17080 VectorizableTree[1]->hasState() &&
17081 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17082 all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
17083 return ExternalUsesAsOriginalScalar.contains(V);
17084 }))
17085 return InstructionCost::getInvalid();
17086
17087 Cost += ExtractCost;
17088 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
17089 bool ForSingleMask) {
17090 InstructionCost C = 0;
17091 unsigned VF = Mask.size();
17092 unsigned VecVF = TE->getVectorFactor();
17093 bool HasLargeIndex =
17094 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
17095 if ((VF != VecVF && HasLargeIndex) ||
17097
17098 if (HasLargeIndex) {
17099 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
17100 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
17101 OrigMask.begin());
17103 getWidenedType(TE->getMainOp()->getType(), VecVF),
17104 OrigMask);
17105 LLVM_DEBUG(
17106 dbgs() << "SLP: Adding cost " << C
17107 << " for final shuffle of insertelement external users.\n";
17108 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17109 Cost += C;
17110 return std::make_pair(TE, true);
17111 }
17112
17113 if (!ForSingleMask) {
17114 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
17115 for (unsigned I = 0; I < VF; ++I) {
17116 if (Mask[I] != PoisonMaskElem)
17117 ResizeMask[Mask[I]] = Mask[I];
17118 }
17119 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
17122 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
17123 LLVM_DEBUG(
17124 dbgs() << "SLP: Adding cost " << C
17125 << " for final shuffle of insertelement external users.\n";
17126 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17127
17128 Cost += C;
17129 }
17130 }
17131 return std::make_pair(TE, false);
17132 };
17133 // Calculate the cost of the reshuffled vectors, if any.
17134 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
17135 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
17136 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
17137 unsigned VF = 0;
17138 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
17140 assert((TEs.size() == 1 || TEs.size() == 2) &&
17141 "Expected exactly 1 or 2 tree entries.");
17142 if (TEs.size() == 1) {
17143 if (VF == 0)
17144 VF = TEs.front()->getVectorFactor();
17145 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17146 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
17147 !all_of(enumerate(Mask), [=](const auto &Data) {
17148 return Data.value() == PoisonMaskElem ||
17149 (Data.index() < VF &&
17150 static_cast<int>(Data.index()) == Data.value());
17151 })) {
17154 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17155 << " for final shuffle of insertelement "
17156 "external users.\n";
17157 TEs.front()->dump();
17158 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17159 Cost += C;
17160 }
17161 } else {
17162 if (VF == 0) {
17163 if (TEs.front() &&
17164 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17165 VF = TEs.front()->getVectorFactor();
17166 else
17167 VF = Mask.size();
17168 }
17169 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17170 InstructionCost C =
17171 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
17172 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17173 << " for final shuffle of vector node and external "
17174 "insertelement users.\n";
17175 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17176 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17177 Cost += C;
17178 }
17179 VF = Mask.size();
17180 return TEs.back();
17181 };
17182 performExtractsShuffleAction<const TreeEntry *>(
17183 MutableArrayRef(Vector.data(), Vector.size()), Base,
17184 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
17185 EstimateShufflesCost);
17186 InstructionCost InsertCost = TTI->getScalarizationOverhead(
17187 cast<FixedVectorType>(
17188 ShuffledInserts[I].InsertElements.front()->getType()),
17189 DemandedElts[I],
17190 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
17191 Cost -= InsertCost;
17192 }
17193
17194 // Add the cost for reduced value resize (if required).
17195 if (ReductionBitWidth != 0) {
17196 assert(UserIgnoreList && "Expected reduction tree.");
17197 const TreeEntry &E = *VectorizableTree.front();
17198 auto It = MinBWs.find(&E);
17199 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
17200 unsigned SrcSize = It->second.first;
17201 unsigned DstSize = ReductionBitWidth;
17202 unsigned Opcode = Instruction::Trunc;
17203 if (SrcSize < DstSize) {
17204 bool IsArithmeticExtendedReduction =
17205 all_of(*UserIgnoreList, [](Value *V) {
17206 auto *I = cast<Instruction>(V);
17207 return is_contained({Instruction::Add, Instruction::FAdd,
17208 Instruction::Mul, Instruction::FMul,
17209 Instruction::And, Instruction::Or,
17210 Instruction::Xor},
17211 I->getOpcode());
17212 });
17213 if (IsArithmeticExtendedReduction)
17214 Opcode =
17215 Instruction::BitCast; // Handle it by getExtendedReductionCost
17216 else
17217 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17218 }
17219 if (Opcode != Instruction::BitCast) {
17220 auto *SrcVecTy =
17221 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
17222 auto *DstVecTy =
17223 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
17224 TTI::CastContextHint CCH = getCastContextHint(E);
17225 InstructionCost CastCost;
17226 switch (E.getOpcode()) {
17227 case Instruction::SExt:
17228 case Instruction::ZExt:
17229 case Instruction::Trunc: {
17230 const TreeEntry *OpTE = getOperandEntry(&E, 0);
17231 CCH = getCastContextHint(*OpTE);
17232 break;
17233 }
17234 default:
17235 break;
17236 }
17237 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
17239 Cost += CastCost;
17240 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
17241 << " for final resize for reduction from " << SrcVecTy
17242 << " to " << DstVecTy << "\n";
17243 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17244 }
17245 }
17246 }
17247
17248 std::optional<InstructionCost> SpillCost;
17249 if (Cost < -SLPCostThreshold) {
17250 SpillCost = getSpillCost();
17251 Cost += *SpillCost;
17252 }
17253#ifndef NDEBUG
17254 SmallString<256> Str;
17255 {
17256 raw_svector_ostream OS(Str);
17257 OS << "SLP: Spill Cost = ";
17258 if (SpillCost)
17259 OS << *SpillCost;
17260 else
17261 OS << "<skipped>";
17262 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
17263 << "SLP: Total Cost = " << Cost << ".\n";
17264 }
17265 LLVM_DEBUG(dbgs() << Str);
17266 if (ViewSLPTree)
17267 ViewGraph(this, "SLP" + F->getName(), false, Str);
17268#endif
17269
17270 return Cost;
17271}
17272
17273/// Tries to find extractelement instructions with constant indices from fixed
17274/// vector type and gather such instructions into a bunch, which most likely
17275/// can be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17276/// successful, the matched scalars are replaced by poison values in \p VL for
17277/// future analysis.
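// Editorial sketch (not part of the original source): for
//   VL = { extractelement <4 x i32> %v, i32 0, extractelement <4 x i32> %v, i32 2,
//          poison, poison }
// the analysis reports TTI::SK_PermuteSingleSrc with Mask = {0, 2, PoisonMaskElem,
// PoisonMaskElem} and replaces the matched extracts in VL with poison values.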
17278std::optional<TTI::ShuffleKind>
17279BoUpSLP::tryToGatherSingleRegisterExtractElements(
17280 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
17281 // Scan list of gathered scalars for extractelements that can be represented
17282 // as shuffles.
17283 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
17284 SmallVector<int> UndefVectorExtracts;
17285 for (int I = 0, E = VL.size(); I < E; ++I) {
17286 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17287 if (!EI) {
17288 if (isa<UndefValue>(VL[I]))
17289 UndefVectorExtracts.push_back(I);
17290 continue;
17291 }
17292 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
17293 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
17294 continue;
17295 std::optional<unsigned> Idx = getExtractIndex(EI);
17296 // Undefined index.
17297 if (!Idx) {
17298 UndefVectorExtracts.push_back(I);
17299 continue;
17300 }
17301 if (Idx >= VecTy->getNumElements()) {
17302 UndefVectorExtracts.push_back(I);
17303 continue;
17304 }
17305 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
17306 ExtractMask.reset(*Idx);
17307 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
17308 UndefVectorExtracts.push_back(I);
17309 continue;
17310 }
17311 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
17312 }
17313 // Sort the vector operands by the maximum number of uses in extractelements.
17315 VectorOpToIdx.takeVector();
17316 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
17317 return P1.second.size() > P2.second.size();
17318 });
17319 // Find the best pair of the vectors or a single vector.
17320 const int UndefSz = UndefVectorExtracts.size();
17321 unsigned SingleMax = 0;
17322 unsigned PairMax = 0;
17323 if (!Vectors.empty()) {
17324 SingleMax = Vectors.front().second.size() + UndefSz;
17325 if (Vectors.size() > 1) {
17326 auto *ItNext = std::next(Vectors.begin());
17327 PairMax = SingleMax + ItNext->second.size();
17328 }
17329 }
17330 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
17331 return std::nullopt;
17332 // Check if better to perform a shuffle of 2 vectors or just of a single
17333 // vector.
17334 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
17335 SmallVector<Value *> GatheredExtracts(
17336 VL.size(), PoisonValue::get(VL.front()->getType()));
17337 if (SingleMax >= PairMax && SingleMax) {
17338 for (int Idx : Vectors.front().second)
17339 std::swap(GatheredExtracts[Idx], VL[Idx]);
17340 } else if (!Vectors.empty()) {
17341 for (unsigned Idx : {0, 1})
17342 for (int Idx : Vectors[Idx].second)
17343 std::swap(GatheredExtracts[Idx], VL[Idx]);
17344 }
17345 // Add extracts from undefs too.
17346 for (int Idx : UndefVectorExtracts)
17347 std::swap(GatheredExtracts[Idx], VL[Idx]);
17348 // Check that gather of extractelements can be represented as just a
17349 // shuffle of a single/two vectors the scalars are extracted from.
17350 std::optional<TTI::ShuffleKind> Res =
17351 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
17352 if (!Res || all_of(Mask, equal_to(PoisonMaskElem))) {
17353 // TODO: try to check other subsets if possible.
17354 // Restore the original VL if attempt was not successful.
17355 copy(SavedVL, VL.begin());
17356 return std::nullopt;
17357 }
17358 // Restore unused scalars from mask, if some of the extractelements were not
17359 // selected for shuffle.
17360 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
17361 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
17362 isa<UndefValue>(GatheredExtracts[I])) {
17363 std::swap(VL[I], GatheredExtracts[I]);
17364 continue;
17365 }
17366 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17367 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
17368 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
17369 is_contained(UndefVectorExtracts, I))
17370 continue;
17371 }
17372 return Res;
17373}
17374
17375/// Tries to find extractelement instructions with constant indices from fixed
17376/// vector type and gather such instructions into a bunch, which most likely
17377/// can be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17378/// successful, the matched scalars are replaced by poison values in \p VL for
17379/// future analysis.
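// Editorial note (not part of the original source): the multi-register variant
// below slices VL into NumParts contiguous parts (see getPartNumElems) and runs
// the single-register analysis on each slice, e.g. 8 scalars with NumParts = 2
// are analyzed as two 4-element sub-gathers, each with its own optional
// ShuffleKind result.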
17380SmallVector<std::optional<TTI::ShuffleKind>>
17381BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
17382 SmallVectorImpl<int> &Mask,
17383 unsigned NumParts) const {
17384 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
17385 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
17386 Mask.assign(VL.size(), PoisonMaskElem);
17387 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17388 for (unsigned Part : seq<unsigned>(NumParts)) {
17389 // Scan list of gathered scalars for extractelements that can be represented
17390 // as shuffles.
17391 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
17392 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17393 SmallVector<int> SubMask;
17394 std::optional<TTI::ShuffleKind> Res =
17395 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
17396 ShufflesRes[Part] = Res;
17397 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
17398 }
17399 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
17400 return Res.has_value();
17401 }))
17402 ShufflesRes.clear();
17403 return ShufflesRes;
17404}
17405
17406std::optional<TargetTransformInfo::ShuffleKind>
17407BoUpSLP::isGatherShuffledSingleRegisterEntry(
17408 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
17409 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
17410 Entries.clear();
17411 if (TE->Idx == 0)
17412 return std::nullopt;
17413 // TODO: currently checking only for Scalars in the tree entry, need to count
17414 // reused elements too for better cost estimation.
17415 auto GetUserEntry = [&](const TreeEntry *TE) {
17416 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17417 TE = TE->UserTreeIndex.UserTE;
17418 if (TE == VectorizableTree.front().get())
17419 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
17420 return TE->UserTreeIndex;
17421 };
17422 auto HasGatherUser = [&](const TreeEntry *TE) {
17423 while (TE->Idx != 0 && TE->UserTreeIndex) {
17424 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17425 return true;
17426 TE = TE->UserTreeIndex.UserTE;
17427 }
17428 return false;
17429 };
17430 const EdgeInfo TEUseEI = GetUserEntry(TE);
17431 if (!TEUseEI)
17432 return std::nullopt;
17433 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
17434 const BasicBlock *TEInsertBlock = nullptr;
17435 // Main node of PHI entries keeps the correct order of operands/incoming
17436 // blocks.
17437 if (auto *PHI = dyn_cast_or_null<PHINode>(
17438 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
17439 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
17440 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
17441 TEInsertPt = TEInsertBlock->getTerminator();
17442 } else {
17443 TEInsertBlock = TEInsertPt->getParent();
17444 }
17445 if (!DT->isReachableFromEntry(TEInsertBlock))
17446 return std::nullopt;
17447 auto *NodeUI = DT->getNode(TEInsertBlock);
17448 assert(NodeUI && "Should only process reachable instructions");
17449 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
17450 auto CheckOrdering = [&](const Instruction *InsertPt) {
17451 // Argument InsertPt is an instruction where vector code for some other
17452 // tree entry (one that shares one or more scalars with TE) is going to be
17453 // generated. This lambda returns true if insertion point of vector code
17454 // for the TE dominates that point (otherwise dependency is the other way
17455 // around). The other node is not limited to be of a gather kind. Gather
17456 // nodes are not scheduled and their vector code is inserted before their
17457 // first user. If user is PHI, that is supposed to be at the end of a
17458 // predecessor block. Otherwise it is the last instruction among scalars of
17459 // the user node. So, instead of checking dependency between instructions
17460 // themselves, we check dependency between their insertion points for vector
17461 // code (since each scalar instruction ends up as a lane of a vector
17462 // instruction).
17463 const BasicBlock *InsertBlock = InsertPt->getParent();
17464 auto *NodeEUI = DT->getNode(InsertBlock);
17465 if (!NodeEUI)
17466 return false;
17467 assert((NodeUI == NodeEUI) ==
17468 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
17469 "Different nodes should have different DFS numbers");
17470 // Check the order of the gather nodes users.
17471 if (TEInsertPt->getParent() != InsertBlock &&
17472 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
17473 return false;
17474 if (TEInsertPt->getParent() == InsertBlock &&
17475 TEInsertPt->comesBefore(InsertPt))
17476 return false;
17477 return true;
17478 };
17479 // Find all tree entries used by the gathered values. If no common entries
17480 // found - not a shuffle.
17481 // Here we build a set of tree nodes for each gathered value and try to
17482 // find the intersection between these sets. If we have at least one common
17483 // tree node for each gathered value - we have just a permutation of the
17484 // single vector. If we have 2 different sets, we're in situation where we
17485 // have a permutation of 2 input vectors.
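// Editorial sketch (not part of the original source): for VL = {a, b, c, d}
// where {a, b} are vectorized in tree entry E1 and {c, d} in entry E2, UsedTEs
// ends up as {{E1}, {E2}} and UsedValuesEntry maps a,b -> 0 and c,d -> 1, i.e.
// the gather is a permutation of 2 input vectors.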
17486 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
17487 SmallDenseMap<Value *, int> UsedValuesEntry;
17488 SmallPtrSet<const Value *, 16> VisitedValue;
17489 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
17490 // The node is reused - exit.
17491 if ((TEPtr->getVectorFactor() != VL.size() &&
17492 TEPtr->Scalars.size() != VL.size()) ||
17493 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
17494 return false;
17495 UsedTEs.clear();
17496 UsedTEs.emplace_back().insert(TEPtr);
17497 for (Value *V : VL) {
17498 if (isConstant(V))
17499 continue;
17500 UsedValuesEntry.try_emplace(V, 0);
17501 }
17502 return true;
17503 };
17504 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
17505 unsigned EdgeIdx) {
17506 const TreeEntry *Ptr1 = User1;
17507 const TreeEntry *Ptr2 = User2;
17508 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
17509 while (Ptr2) {
17510 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
17511 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
17512 Ptr2 = Ptr2->UserTreeIndex.UserTE;
17513 }
17514 while (Ptr1) {
17515 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
17516 Ptr1 = Ptr1->UserTreeIndex.UserTE;
17517 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
17518 return Idx < It->second;
17519 }
17520 return false;
17521 };
17522 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
17523 Instruction *InsertPt) {
17524 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
17525 !TEUseEI.UserTE->isCopyableElement(
17526 const_cast<Instruction *>(TEInsertPt)) &&
17527 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17528 InsertPt->getNextNode() == TEInsertPt &&
17529 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
17530 !isUsedOutsideBlock(InsertPt));
17531 };
17532 for (Value *V : VL) {
17533 if (isConstant(V) || !VisitedValue.insert(V).second)
17534 continue;
17535 // Build a list of tree entries where V is used.
17536 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17537 SmallVector<TreeEntry *> GatherNodes(
17538 ValueToGatherNodes.lookup(V).takeVector());
17539 if (TransformedToGatherNodes.contains(TE)) {
17540 for (TreeEntry *E : getSplitTreeEntries(V)) {
17541 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17542 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17543 continue;
17544 GatherNodes.push_back(E);
17545 }
17546 for (TreeEntry *E : getTreeEntries(V)) {
17547 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17548 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17549 continue;
17550 GatherNodes.push_back(E);
17551 }
17552 }
17553 for (const TreeEntry *TEPtr : GatherNodes) {
17554 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
17555 continue;
17556 assert(any_of(TEPtr->Scalars,
17557 [&](Value *V) { return GatheredScalars.contains(V); }) &&
17558 "Must contain at least single gathered value.");
17559 assert(TEPtr->UserTreeIndex &&
17560 "Expected only single user of a gather node.");
17561 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17562
17563 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17564 UseEI.UserTE->hasState())
17565 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
17566 : nullptr;
17567 Instruction *InsertPt =
17568 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
17569 : &getLastInstructionInBundle(UseEI.UserTE);
17570 if (TEInsertPt == InsertPt) {
17571 // Check nodes, which might be emitted first.
17572 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17573 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17574 TEUseEI.UserTE->isAltShuffle()) &&
17575 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
17576 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17577 (UseEI.UserTE->hasState() &&
17578 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17579 !UseEI.UserTE->isAltShuffle()) ||
17580 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
17581 continue;
17582 }
17583
17584 // If the schedulable insertion point is used in multiple entries - just
17585 // exit, no known ordering at this point, available only after real
17586 // scheduling.
17587 if (!doesNotNeedToBeScheduled(InsertPt) &&
17588 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17589 continue;
17590 // If the users are the PHI nodes with the same incoming blocks - skip.
17591 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17592 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17593 UseEI.UserTE->State == TreeEntry::Vectorize &&
17594 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17595 TEUseEI.UserTE != UseEI.UserTE)
17596 continue;
17597 // If 2 gathers are operands of the same entry (regardless of whether
17598 // the user is a PHI or not), compare operand indices and use the earlier
17599 // one as the base.
17600 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17601 continue;
17602 // If the user instruction is used for some reason in different
17603 // vectorized nodes - make it depend on index.
17604 if (TEUseEI.UserTE != UseEI.UserTE &&
17605 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17606 HasGatherUser(TEUseEI.UserTE)))
17607 continue;
17608 // If the user node is the operand of the other user node - skip.
17609 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17610 continue;
17611 }
17612
17613 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17614 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17615 UseEI.UserTE->doesNotNeedToSchedule() &&
17616 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
17617 continue;
17618 // Check if the user node of the TE comes after user node of TEPtr,
17619 // otherwise TEPtr depends on TE.
17620 if ((TEInsertBlock != InsertPt->getParent() ||
17621 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17622 (!CheckOrdering(InsertPt) ||
17623 (UseEI.UserTE->hasCopyableElements() &&
17624 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17625 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
17626 continue;
17627 // The node is reused - exit.
17628 if (CheckAndUseSameNode(TEPtr))
17629 break;
17630 // If the parent node is copyable with its last instruction used outside the
17631 // block, and that instruction immediately follows the last instruction of
17632 // TEPtr, exit to preserve the def-use chain.
17633 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17634 continue;
17635 VToTEs.insert(TEPtr);
17636 }
17637 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17638 const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
17639 return MTE != TE && MTE != TEUseEI.UserTE &&
17640 !DeletedNodes.contains(MTE) &&
17641 !TransformedToGatherNodes.contains(MTE);
17642 });
17643 if (It != VTEs.end()) {
17644 const TreeEntry *VTE = *It;
17645 if (none_of(TE->CombinedEntriesWithIndices,
17646 [&](const auto &P) { return P.first == VTE->Idx; })) {
17647 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17648 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17649 continue;
17650 }
17651 // The node is reused - exit.
17652 if (CheckAndUseSameNode(VTE))
17653 break;
17654 VToTEs.insert(VTE);
17655 }
17656 }
17657 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17658 const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
17659 return TE != MainTE && !DeletedNodes.contains(TE) &&
17660 !TransformedToGatherNodes.contains(TE);
17661 });
17662 if (It != VTEs.end()) {
17663 const TreeEntry *VTE = *It;
17664 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17665 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17666 VTEs = VTEs.drop_front();
17667 // Iterate through all vectorized nodes.
17668 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
17669 return MTE->State == TreeEntry::Vectorize;
17670 });
17671 if (MIt == VTEs.end())
17672 continue;
17673 VTE = *MIt;
17674 }
17675 if (none_of(TE->CombinedEntriesWithIndices,
17676 [&](const auto &P) { return P.first == VTE->Idx; })) {
17677 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17678 if (&LastBundleInst == TEInsertPt ||
17679 !CheckOrdering(&LastBundleInst) ||
17680 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17681 continue;
17682 }
17683 // The node is reused - exit.
17684 if (CheckAndUseSameNode(VTE))
17685 break;
17686 VToTEs.insert(VTE);
17687 }
17688 }
17689 if (VToTEs.empty())
17690 continue;
17691 if (UsedTEs.empty()) {
17692 // The first iteration, just insert the list of nodes to vector.
17693 UsedTEs.push_back(VToTEs);
17694 UsedValuesEntry.try_emplace(V, 0);
17695 } else {
17696 // Need to check if there are any previously used tree nodes which use V.
17697 // If there are no such nodes, consider that we have one more input
17698 // vector.
17699 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17700 unsigned Idx = 0;
17701 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17702 // Do we have a non-empty intersection of previously listed tree entries
17703 // and tree entries using current V?
17704 set_intersect(VToTEs, Set);
17705 if (!VToTEs.empty()) {
17706 // Yes, write the new subset and continue analysis for the next
17707 // scalar.
17708 Set.swap(VToTEs);
17709 break;
17710 }
17711 VToTEs = SavedVToTEs;
17712 ++Idx;
17713 }
17714 // No non-empty intersection found - need to add a second set of possible
17715 // source vectors.
17716 if (Idx == UsedTEs.size()) {
17717 // If the number of input vectors is greater than 2 - not a permutation,
17718 // fall back to the regular gather.
17719 // TODO: support multiple reshuffled nodes.
17720 if (UsedTEs.size() == 2)
17721 continue;
17722 UsedTEs.push_back(SavedVToTEs);
17723 Idx = UsedTEs.size() - 1;
17724 }
17725 UsedValuesEntry.try_emplace(V, Idx);
17726 }
17727 }
17728
17729 if (UsedTEs.empty()) {
17730 Entries.clear();
17731 return std::nullopt;
17732 }
17733
17734 unsigned VF = 0;
17735 if (UsedTEs.size() == 1) {
17736 // Keep the order to avoid non-determinism.
17737 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17738 UsedTEs.front().end());
17739 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17740 return TE1->Idx < TE2->Idx;
17741 });
17742 // Try to find the perfect match in another gather node at first.
17743 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17744 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17745 });
17746 if (It != FirstEntries.end() &&
17747 ((*It)->getVectorFactor() == VL.size() ||
17748 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17749 TE->ReuseShuffleIndices.size() == VL.size() &&
17750 (*It)->isSame(TE->Scalars)))) {
17751 Entries.push_back(*It);
17752 if ((*It)->getVectorFactor() == VL.size()) {
17753 std::iota(std::next(Mask.begin(), Part * VL.size()),
17754 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17755 } else {
17756 SmallVector<int> CommonMask = TE->getCommonMask();
17757 copy(CommonMask, Mask.begin());
17758 }
17759 // Clear undef scalars.
17760 for (unsigned I : seq<unsigned>(VL.size()))
17761 if (isa<PoisonValue>(VL[I]))
17762 Mask[Part * VL.size() + I] = PoisonMaskElem;
17764 }
17765 // No perfect match, just shuffle, so choose the first tree node from the
17766 // tree.
17767 Entries.push_back(FirstEntries.front());
17768 // Update mapping between values and corresponding tree entries.
17769 for (auto &P : UsedValuesEntry)
17770 P.second = 0;
17771 VF = FirstEntries.front()->getVectorFactor();
17772 } else {
17773 // Try to find nodes with the same vector factor.
17774 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17775 // Keep the order of tree nodes to avoid non-determinism.
17776 DenseMap<int, const TreeEntry *> VFToTE;
17777 for (const TreeEntry *TE : UsedTEs.front()) {
17778 unsigned VF = TE->getVectorFactor();
17779 auto It = VFToTE.find(VF);
17780 if (It != VFToTE.end()) {
17781 if (It->second->Idx > TE->Idx)
17782 It->getSecond() = TE;
17783 continue;
17784 }
17785 VFToTE.try_emplace(VF, TE);
17786 }
17787 // Same, keep the order to avoid non-determinism.
17788 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17789 UsedTEs.back().end());
17790 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17791 return TE1->Idx < TE2->Idx;
17792 });
17793 for (const TreeEntry *TE : SecondEntries) {
17794 auto It = VFToTE.find(TE->getVectorFactor());
17795 if (It != VFToTE.end()) {
17796 VF = It->first;
17797 Entries.push_back(It->second);
17798 Entries.push_back(TE);
17799 break;
17800 }
17801 }
17802 // No 2 source vectors with the same vector factor - just choose 2 with max
17803 // index.
17804 if (Entries.empty()) {
17805 Entries.push_back(*llvm::max_element(
17806 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17807 return TE1->Idx < TE2->Idx;
17808 }));
17809 Entries.push_back(SecondEntries.front());
17810 VF = std::max(Entries.front()->getVectorFactor(),
17811 Entries.back()->getVectorFactor());
17812 } else {
17813 VF = Entries.front()->getVectorFactor();
17814 }
17815 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17816 for (const TreeEntry *E : Entries)
17817 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17818 E->Scalars.end());
17819 // Update mapping between values and corresponding tree entries.
17820 for (auto &P : UsedValuesEntry) {
17821 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17822 if (ValuesToEntries[Idx].contains(P.first)) {
17823 P.second = Idx;
17824 break;
17825 }
17826 }
17827 }
17828
17829 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17830 // Checks if the 2 PHIs are compatible in terms of how likely they are to
17831 // be vectorized together.
17832 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17833 auto *PHI = cast<PHINode>(V);
17834 auto *PHI1 = cast<PHINode>(V1);
17835 // Check that all incoming values are compatible/from same parent (if they
17836 // are instructions).
17837 // The incoming values are compatible if they all are constants, or
17838 // instructions with the same/alternate opcodes from the same basic block.
17839 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17840 Value *In = PHI->getIncomingValue(I);
17841 Value *In1 = PHI1->getIncomingValue(I);
17842 if (isConstant(In) && isConstant(In1))
17843 continue;
17844 if (!getSameOpcode({In, In1}, *TLI))
17845 return false;
17846 if (cast<Instruction>(In)->getParent() !=
17847 cast<Instruction>(In1)->getParent())
17848 return false;
17849 }
17850 return true;
17851 };
17852 // Check if the value can be ignored during analysis for shuffled gathers.
17853 // We suppose it is better to ignore instructions which do not form splats,
17854 // are not vectorized/not extractelements (these instructions will be handled
17855 // by extractelements processing) or may form a vector node in the future.
17856 auto MightBeIgnored = [=](Value *V) {
17857 auto *I = dyn_cast<Instruction>(V);
17858 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17859 !isVectorLikeInstWithConstOps(I) &&
17860 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17861 };
17862 // Check that the neighbor instruction may form a full vector node with the
17863 // current instruction V. This is possible if they have the same/alternate
17864 // opcode and the same parent basic block.
17865 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17866 Value *V1 = VL[Idx];
17867 bool UsedInSameVTE = false;
17868 auto It = UsedValuesEntry.find(V1);
17869 if (It != UsedValuesEntry.end())
17870 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17871 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17872 getSameOpcode({V, V1}, *TLI) &&
17873 cast<Instruction>(V)->getParent() ==
17874 cast<Instruction>(V1)->getParent() &&
17875 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17876 };
17877 // Build a shuffle mask for better cost estimation and vector emission.
17878 SmallBitVector UsedIdxs(Entries.size());
17879 SmallVector<std::pair<unsigned, int>> EntryLanes;
17880 for (int I = 0, E = VL.size(); I < E; ++I) {
17881 Value *V = VL[I];
17882 auto It = UsedValuesEntry.find(V);
17883 if (It == UsedValuesEntry.end())
17884 continue;
17885 // Do not try to shuffle scalars if they are constants, or instructions
17886 // that can be vectorized as a result of the subsequent buildvector
17887 // vectorization.
17888 if (isConstant(V) || (MightBeIgnored(V) &&
17889 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17890 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17891 continue;
17892 unsigned Idx = It->second;
17893 EntryLanes.emplace_back(Idx, I);
17894 UsedIdxs.set(Idx);
17895 }
17896 // Iterate through all shuffled scalars and select entries, which can be used
17897 // for final shuffle.
17898 SmallVector<const TreeEntry *> TempEntries;
17899 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17900 if (!UsedIdxs.test(I))
17901 continue;
17902 // Fix the entry number for the given scalar. If it is the first entry, set
17903 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17904 // These indices are used when calculating final shuffle mask as the vector
17905 // offset.
17906 for (std::pair<unsigned, int> &Pair : EntryLanes)
17907 if (Pair.first == I)
17908 Pair.first = TempEntries.size();
17909 TempEntries.push_back(Entries[I]);
17910 }
17911 Entries.swap(TempEntries);
17912 if (EntryLanes.size() == Entries.size() &&
17913 !VL.equals(ArrayRef(TE->Scalars)
17914 .slice(Part * VL.size(),
17915 std::min<int>(VL.size(), TE->Scalars.size())))) {
17916 // We may have only 1 or 2 entries here. If the number of scalars is equal
17917 // to the number of entries, there is no need to do the analysis, it is not
17918 // very profitable. Since VL is not the same as TE->Scalars, it means we
17919 // already have some shuffles before. Cut off the non-profitable case.
17920 Entries.clear();
17921 return std::nullopt;
17922 }
17923 // Build the final mask, check for the identity shuffle, if possible.
17924 bool IsIdentity = Entries.size() == 1;
17925 // Pair.first is the offset to the vector, while Pair.second is the index of
17926 // the scalar in the list.
17927 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17928 unsigned Idx = Part * VL.size() + Pair.second;
17929 Mask[Idx] =
17930 Pair.first * VF +
17931 (ForOrder ? std::distance(
17932 Entries[Pair.first]->Scalars.begin(),
17933 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17934 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17935 IsIdentity &= Mask[Idx] == Pair.second;
17936 }
17937 if (ForOrder || IsIdentity || Entries.empty()) {
17938 switch (Entries.size()) {
17939 case 1:
17940 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17941 return TargetTransformInfo::SK_PermuteSingleSrc;
17942 break;
17943 case 2:
17944 if (EntryLanes.size() > 2 || VL.size() <= 2)
17945 return TargetTransformInfo::SK_PermuteTwoSrc;
17946 break;
17947 default:
17948 break;
17949 }
17950 } else if (!isa<VectorType>(VL.front()->getType()) &&
17951 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17952 // Do the cost estimation to check if the shuffle is more beneficial than a buildvector.
17953 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17954 std::next(Mask.begin(), (Part + 1) * VL.size()));
17955 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17956 for (int Idx : SubMask) {
17957 if (Idx == PoisonMaskElem)
17958 continue;
17959 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17960 MinElement = Idx;
17961 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17962 MaxElement = Idx;
17963 }
17964 assert(MaxElement >= 0 && MinElement >= 0 &&
17965 MaxElement % VF >= MinElement % VF &&
17966 "Expected at least single element.");
17967 unsigned NewVF = std::max<unsigned>(
17968 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17969 (MaxElement % VF) -
17970 (MinElement % VF) + 1));
17971 if (NewVF < VF) {
17972 for (int &Idx : SubMask) {
17973 if (Idx == PoisonMaskElem)
17974 continue;
17975 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17976 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17977 }
17978 } else {
17979 NewVF = VF;
17980 }
17981
17982 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17983 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17984 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17985 auto GetShuffleCost = [&,
17986 &TTI = *TTI](ArrayRef<int> Mask,
17987 ArrayRef<const TreeEntry *> Entries,
17988 VectorType *VecTy) -> InstructionCost {
17989 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17990 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17991 Mask, Entries.front()->getInterleaveFactor()))
17992 return TTI::TCC_Free;
17993 return ::getShuffleCost(TTI,
17994 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17995 : TTI::SK_PermuteSingleSrc,
17996 VecTy, Mask, CostKind);
17997 };
17998 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17999 InstructionCost FirstShuffleCost = 0;
18000 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18001 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18002 FirstShuffleCost = ShuffleCost;
18003 } else {
18004 // Transform mask to include only the first entry.
18005 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18006 bool IsIdentity = true;
18007 for (auto [I, Idx] : enumerate(FirstMask)) {
18008 if (Idx >= static_cast<int>(NewVF)) {
18009 Idx = PoisonMaskElem;
18010 } else {
18011 DemandedElts.clearBit(I);
18012 if (Idx != PoisonMaskElem)
18013 IsIdentity &= static_cast<int>(I) == Idx;
18014 }
18015 }
18016 if (!IsIdentity)
18017 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18018 FirstShuffleCost += getScalarizationOverhead(
18019 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18020 /*Extract=*/false, CostKind);
18021 }
18022 InstructionCost SecondShuffleCost = 0;
18023 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18024 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18025 SecondShuffleCost = ShuffleCost;
18026 } else {
18027 // Transform mask to include only the second entry.
18028 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18029 bool IsIdentity = true;
18030 for (auto [I, Idx] : enumerate(SecondMask)) {
18031 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18032 Idx = PoisonMaskElem;
18033 } else {
18034 DemandedElts.clearBit(I);
18035 if (Idx != PoisonMaskElem) {
18036 Idx -= NewVF;
18037 IsIdentity &= static_cast<int>(I) == Idx;
18038 }
18039 }
18040 }
18041 if (!IsIdentity)
18042 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18043 SecondShuffleCost += getScalarizationOverhead(
18044 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18045 /*Extract=*/false, CostKind);
18046 }
18047 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18048 for (auto [I, Idx] : enumerate(SubMask))
18049 if (Idx == PoisonMaskElem)
18050 DemandedElts.clearBit(I);
18051 InstructionCost BuildVectorCost = getScalarizationOverhead(
18052 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18053 /*Extract=*/false, CostKind);
18054 const TreeEntry *BestEntry = nullptr;
18055 if (FirstShuffleCost < ShuffleCost) {
18056 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18057 std::next(Mask.begin(), (Part + 1) * VL.size()),
18058 [&](int &Idx) {
18059 if (Idx >= static_cast<int>(VF))
18060 Idx = PoisonMaskElem;
18061 });
18062 BestEntry = Entries.front();
18063 ShuffleCost = FirstShuffleCost;
18064 }
18065 if (SecondShuffleCost < ShuffleCost) {
18066 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18067 std::next(Mask.begin(), (Part + 1) * VL.size()),
18068 [&](int &Idx) {
18069 if (Idx < static_cast<int>(VF))
18070 Idx = PoisonMaskElem;
18071 else
18072 Idx -= VF;
18073 });
18074 BestEntry = Entries[1];
18075 ShuffleCost = SecondShuffleCost;
18076 }
18077 if (BuildVectorCost >= ShuffleCost) {
18078 if (BestEntry) {
18079 Entries.clear();
18080 Entries.push_back(BestEntry);
18081 }
18082 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
18083 : TargetTransformInfo::SK_PermuteSingleSrc;
18084 }
18085 }
18086 Entries.clear();
18087 // Clear the corresponding mask elements.
18088 std::fill(std::next(Mask.begin(), Part * VL.size()),
18089 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
18090 return std::nullopt;
18091}
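// Illustrative sketch (hypothetical values, not taken from a real run): for a
// gathered part VL = {%a, %b, %c, %d} where %a and %b are lanes 0 and 1 of an
// already built tree entry E0 with VF == 4, and %c and %d are lanes 2 and 3 of
// entry E1 with the same VF, Entries becomes {E0, E1}, the Part slice of Mask
// is filled with {0, 1, VF + 2, VF + 3} == {0, 1, 6, 7}, and SK_PermuteTwoSrc
// is reported, provided the cost checks above do not prefer a buildvector.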
18092
18093SmallVector<std::optional<TTI::ShuffleKind>>
18094BoUpSLP::isGatherShuffledEntry(
18095 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
18096 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
18097 bool ForOrder) {
18098 assert(NumParts > 0 && NumParts < VL.size() &&
18099 "Expected positive number of registers.");
18100 Entries.clear();
18101 // No need to check for the topmost gather node.
18102 if (TE == VectorizableTree.front().get() &&
18103 (!GatheredLoadsEntriesFirst.has_value() ||
18104 none_of(ArrayRef(VectorizableTree).drop_front(),
18105 [](const std::unique_ptr<TreeEntry> &TE) {
18106 return !TE->isGather();
18107 })))
18108 return {};
18109 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
18110 // implemented yet.
18111 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
18112 return {};
18113 Mask.assign(VL.size(), PoisonMaskElem);
18114 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
18115 "Expected only single user of the gather node.");
18116 assert(VL.size() % NumParts == 0 &&
18117 "Number of scalars must be divisible by NumParts.");
18118 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
18119 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18120 (TE->Idx == 0 ||
18121 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
18122 isSplat(TE->Scalars) ||
18123 (TE->hasState() &&
18124 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
18125 return {};
18126 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18127 SmallVector<std::optional<TTI::ShuffleKind>> Res;
18128 for (unsigned Part : seq<unsigned>(NumParts)) {
18129 ArrayRef<Value *> SubVL =
18130 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
18131 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18132 std::optional<TTI::ShuffleKind> SubRes =
18133 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
18134 ForOrder);
18135 if (!SubRes)
18136 SubEntries.clear();
18137 Res.push_back(SubRes);
18138 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
18139 SubEntries.front()->getVectorFactor() == VL.size() &&
18140 (SubEntries.front()->isSame(TE->Scalars) ||
18141 SubEntries.front()->isSame(VL))) {
18142 SmallVector<const TreeEntry *> LocalSubEntries;
18143 LocalSubEntries.swap(SubEntries);
18144 Entries.clear();
18145 Res.clear();
18146 std::iota(Mask.begin(), Mask.end(), 0);
18147 // Clear undef scalars.
18148 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
18149 if (isa<PoisonValue>(VL[I]))
18150 Mask[I] = PoisonMaskElem;
18151 Entries.emplace_back(1, LocalSubEntries.front());
18152 Res.push_back(TTI::SK_PermuteSingleSrc);
18153 return Res;
18154 }
18155 }
18156 if (all_of(Res,
18157 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
18158 Entries.clear();
18159 return {};
18160 }
18161 return Res;
18162}
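// Illustrative sketch (hypothetical sizes): for VL with 8 scalars and
// NumParts == 2, SliceSize is 4 and the two 4-element sub-slices are matched
// independently; on return, Res holds one std::optional<TTI::ShuffleKind> per
// part and Entries holds the per-part source tree entries (an empty vector
// for parts that could not be matched and have to be gathered from scalars).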
18163
18164InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
18165 Type *ScalarTy) const {
18166 const unsigned VF = VL.size();
18167 auto *VecTy = getWidenedType(ScalarTy, VF);
18168 // Find the cost of inserting/extracting values from the vector.
18169 // Check if the same elements are inserted several times and count them as
18170 // shuffle candidates.
18171 APInt DemandedElements = APInt::getZero(VF);
18172 InstructionCost Cost;
18173 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18174 auto EstimateInsertCost = [&](unsigned I, Value *V) {
18175 DemandedElements.setBit(I);
18176 if (V->getType() != ScalarTy)
18177 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
18178 TTI::CastContextHint::None, CostKind);
18179 };
18180 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
18181 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
18182 for (auto [I, V] : enumerate(VL)) {
18183 // No need to shuffle duplicates for constants.
18184 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
18185 continue;
18186
18187 if (isConstant(V)) {
18188 ConstantShuffleMask[I] = I + VF;
18189 continue;
18190 }
18191 EstimateInsertCost(I, V);
18192 }
18193 // FIXME: add a cost for constant vector materialization.
18194 bool IsAnyNonUndefConst =
18195 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
18196 // 1. Shuffle input source vector and constant vector.
18197 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18198 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
18199 ConstantShuffleMask);
18200 }
18201
18202 // 2. Insert unique non-constants.
18203 if (!DemandedElements.isZero())
18204 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
18205 /*Insert=*/true,
18206 /*Extract=*/false, CostKind,
18207 ForPoisonSrc && !IsAnyNonUndefConst, VL);
18208 return Cost;
18209}
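// Illustrative sketch (hypothetical scalars): for VL = {%x, 7, poison, %y}
// with ForPoisonSrc == false, lane 1 is a non-undef constant, so
// ConstantShuffleMask becomes {0, 1 + VF, 2, 3} == {0, 5, 2, 3} and a
// two-source shuffle blending in the constant vector is costed; lanes 0 and 3
// are non-constant, so only their insertion is counted via
// getScalarizationOverhead, and the poison lane 2 is skipped entirely.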
18210
18211Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18212 auto It = EntryToLastInstruction.find(E);
18213 if (It != EntryToLastInstruction.end())
18214 return *cast<Instruction>(It->second);
18215 Instruction *Res = nullptr;
18216 // Get the basic block this bundle is in. All instructions in the bundle
18217 // should be in this block (except for extractelement-like instructions with
18218 // constant indices or gathered loads or copyables).
18219 Instruction *Front;
18220 unsigned Opcode;
18221 if (E->hasState()) {
18222 Front = E->getMainOp();
18223 Opcode = E->getOpcode();
18224 } else {
18225 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
18226 Opcode = Front->getOpcode();
18227 }
18228 auto *BB = Front->getParent();
18229 assert(
18230 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18231 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18232 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18233 all_of(E->Scalars,
18234 [=](Value *V) -> bool {
18235 if (Opcode == Instruction::GetElementPtr &&
18236 !isa<GetElementPtrInst>(V))
18237 return true;
18238 auto *I = dyn_cast<Instruction>(V);
18239 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18240 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18241 })) &&
18242 "Expected gathered loads or GEPs or instructions from same basic "
18243 "block.");
18244
18245 auto FindLastInst = [&]() {
18246 Instruction *LastInst = Front;
18247 for (Value *V : E->Scalars) {
18248 auto *I = dyn_cast<Instruction>(V);
18249 if (!I)
18250 continue;
18251 if (E->isCopyableElement(I))
18252 continue;
18253 if (LastInst->getParent() == I->getParent()) {
18254 if (LastInst->comesBefore(I))
18255 LastInst = I;
18256 continue;
18257 }
18258 assert(((Opcode == Instruction::GetElementPtr &&
18259 !isa<GetElementPtrInst>(I)) ||
18260 E->State == TreeEntry::SplitVectorize ||
18261 (isVectorLikeInstWithConstOps(LastInst) &&
18262 isVectorLikeInstWithConstOps(I)) ||
18263 (GatheredLoadsEntriesFirst.has_value() &&
18264 Opcode == Instruction::Load && E->isGather() &&
18265 E->Idx < *GatheredLoadsEntriesFirst)) &&
18266 "Expected vector-like or non-GEP in GEP node insts only.");
18267 if (!DT->isReachableFromEntry(LastInst->getParent())) {
18268 LastInst = I;
18269 continue;
18270 }
18271 if (!DT->isReachableFromEntry(I->getParent()))
18272 continue;
18273 auto *NodeA = DT->getNode(LastInst->getParent());
18274 auto *NodeB = DT->getNode(I->getParent());
18275 assert(NodeA && "Should only process reachable instructions");
18276 assert(NodeB && "Should only process reachable instructions");
18277 assert((NodeA == NodeB) ==
18278 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18279 "Different nodes should have different DFS numbers");
18280 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18281 LastInst = I;
18282 }
18283 BB = LastInst->getParent();
18284 return LastInst;
18285 };
18286
18287 auto FindFirstInst = [&]() {
18288 Instruction *FirstInst = Front;
18289 for (Value *V : E->Scalars) {
18290 auto *I = dyn_cast<Instruction>(V);
18291 if (!I)
18292 continue;
18293 if (E->isCopyableElement(I))
18294 continue;
18295 if (FirstInst->getParent() == I->getParent()) {
18296 if (I->comesBefore(FirstInst))
18297 FirstInst = I;
18298 continue;
18299 }
18300 assert(((Opcode == Instruction::GetElementPtr &&
18301 !isa<GetElementPtrInst>(I)) ||
18302 (isVectorLikeInstWithConstOps(FirstInst) &&
18303 isVectorLikeInstWithConstOps(I))) &&
18304 "Expected vector-like or non-GEP in GEP node insts only.");
18305 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
18306 FirstInst = I;
18307 continue;
18308 }
18309 if (!DT->isReachableFromEntry(I->getParent()))
18310 continue;
18311 auto *NodeA = DT->getNode(FirstInst->getParent());
18312 auto *NodeB = DT->getNode(I->getParent());
18313 assert(NodeA && "Should only process reachable instructions");
18314 assert(NodeB && "Should only process reachable instructions");
18315 assert((NodeA == NodeB) ==
18316 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18317 "Different nodes should have different DFS numbers");
18318 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18319 FirstInst = I;
18320 }
18321 return FirstInst;
18322 };
18323
18324 if (E->State == TreeEntry::SplitVectorize) {
18325 Res = FindLastInst();
18326 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
18327 for (auto *E : Entries) {
18328 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
18329 if (!I)
18330 I = &getLastInstructionInBundle(E);
18331 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
18332 Res = I;
18333 }
18334 }
18335 EntryToLastInstruction.try_emplace(E, Res);
18336 return *Res;
18337 }
18338
18339 // Set the insert point for gathered loads to the very first load.
18340 if (GatheredLoadsEntriesFirst.has_value() &&
18341 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18342 Opcode == Instruction::Load) {
18343 Res = FindFirstInst();
18344 EntryToLastInstruction.try_emplace(E, Res);
18345 return *Res;
18346 }
18347
18348 // Set the insert point to the beginning of the basic block if the entry
18349 // should not be scheduled.
18350 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18351 if (E->isGather())
18352 return nullptr;
18353 // It was found previously that the instructions do not need to be scheduled.
18354 const auto *It = BlocksSchedules.find(BB);
18355 if (It == BlocksSchedules.end())
18356 return nullptr;
18357 for (Value *V : E->Scalars) {
18358 auto *I = dyn_cast<Instruction>(V);
18359 if (!I || isa<PHINode>(I) ||
18360 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
18361 continue;
18362 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
18363 if (Bundles.empty())
18364 continue;
18365 const auto *It = find_if(
18366 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18367 if (It != Bundles.end())
18368 return *It;
18369 }
18370 return nullptr;
18371 };
18372 const ScheduleBundle *Bundle = FindScheduleBundle(E);
18373 if (!E->isGather() && !Bundle) {
18374 if ((Opcode == Instruction::GetElementPtr &&
18375 any_of(E->Scalars,
18376 [](Value *V) {
18377 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
18378 })) ||
18379 (all_of(E->Scalars,
18380 [&](Value *V) {
18381 return isa<PoisonValue>(V) ||
18382 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
18383 E->isCopyableElement(V) ||
18384 (!isVectorLikeInstWithConstOps(V) &&
18385 isUsedOutsideBlock(V));
18386 }) &&
18387 (!E->doesNotNeedToSchedule() ||
18388 any_of(E->Scalars,
18389 [&](Value *V) {
18390 if (!isa<Instruction>(V) ||
18391 (E->hasCopyableElements() && E->isCopyableElement(V)))
18392 return false;
18393 return !areAllOperandsNonInsts(V);
18394 }) ||
18395 none_of(E->Scalars, [&](Value *V) {
18396 if (!isa<Instruction>(V) ||
18397 (E->hasCopyableElements() && E->isCopyableElement(V)))
18398 return false;
18399 return MustGather.contains(V);
18400 }))))
18401 Res = FindLastInst();
18402 else
18403 Res = FindFirstInst();
18404 EntryToLastInstruction.try_emplace(E, Res);
18405 return *Res;
18406 }
18407
18408 // Find the last instruction. The common case should be that BB has been
18409 // scheduled, and the last instruction is VL.back(). So we start with
18410 // VL.back() and iterate over schedule data until we reach the end of the
18411 // bundle. The end of the bundle is marked by null ScheduleData.
18412 if (Bundle) {
18413 assert(!E->isGather() && "Gathered instructions should not be scheduled");
18414 Res = Bundle->getBundle().back()->getInst();
18415 EntryToLastInstruction.try_emplace(E, Res);
18416 return *Res;
18417 }
18418
18419 // LastInst can still be null at this point if there's either not an entry
18420 // for BB in BlocksSchedules or there's no ScheduleData available for
18421 // VL.back(). This can be the case if buildTreeRec aborts for various
18422 // reasons (e.g., the maximum recursion depth is reached, the maximum region
18423 // size is reached, etc.). ScheduleData is initialized in the scheduling
18424 // "dry-run".
18425 //
18426 // If this happens, we can still find the last instruction by brute force. We
18427 // iterate forwards from Front (inclusive) until we either see all
18428 // instructions in the bundle or reach the end of the block. If Front is the
18429 // last instruction in program order, LastInst will be set to Front, and we
18430 // will visit all the remaining instructions in the block.
18431 //
18432 // One of the reasons we exit early from buildTreeRec is to place an upper
18433 // bound on compile-time. Thus, taking an additional compile-time hit here is
18434 // not ideal. However, this should be exceedingly rare since it requires that
18435 // we both exit early from buildTreeRec and that the bundle be out-of-order
18436 // (causing us to iterate all the way to the end of the block).
18437 if (!Res)
18438 Res = FindLastInst();
18439 assert(Res && "Failed to find last instruction in bundle");
18440 EntryToLastInstruction.try_emplace(E, Res);
18441 return *Res;
18442}
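// Illustrative sketch (hypothetical IR): for a scheduled bundle of two adds
// in one basic block
// \code
//   %a = add i32 %x, 1
//   %b = add i32 %y, 1   ; last bundle member
// \endcode
// the ScheduleBundle path above returns the last bundle member (%b here); the
// FindFirstInst/FindLastInst fallbacks are only needed for gathered,
// non-schedulable or cross-block entries.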
18443
18444void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
18445 auto *Front = E->getMainOp();
18446 Instruction *LastInst = &getLastInstructionInBundle(E);
18447 assert(LastInst && "Failed to find last instruction in bundle");
18448 BasicBlock::iterator LastInstIt = LastInst->getIterator();
18449 // If the instruction is a PHI, set the insert point after all the PHIs.
18450 bool IsPHI = isa<PHINode>(LastInst);
18451 if (IsPHI) {
18452 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
18453 if (LastInstIt != LastInst->getParent()->end() &&
18454 LastInstIt->getParent()->isLandingPad())
18455 LastInstIt = std::next(LastInstIt);
18456 }
18457 if (IsPHI ||
18458 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
18459 (E->doesNotNeedToSchedule() ||
18460 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
18461 isUsedOutsideBlock(LastInst)))) ||
18462 (GatheredLoadsEntriesFirst.has_value() &&
18463 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18464 E->getOpcode() == Instruction::Load)) {
18465 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
18466 } else {
18467 // Set the insertion point after the last instruction in the bundle. Set the
18468 // debug location to Front.
18469 Builder.SetInsertPoint(
18470 LastInst->getParent(),
18471 LastInst->getNextNode()->getIterator());
18472 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
18473 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18474 } else {
18475 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
18476 PoisonValue::get(Builder.getPtrTy()),
18477 MaybeAlign());
18478 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18479 eraseInstruction(Res);
18480 LastInstructionToPos.try_emplace(LastInst, Res);
18481 }
18482 }
18483 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
18484}
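// Note on the fallback above (a summary of the code, not additional
// behavior): when the insertion point must be strictly after the last bundle
// instruction, a throw-away load of a poison pointer is created, immediately
// marked as erased and cached in LastInstructionToPos, so that later bundles
// anchored at the same instruction reuse one stable position instead of
// creating new placeholders.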
18485
18486Value *BoUpSLP::gather(
18487 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
18488 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
18489 // List of instructions/lanes from current block and/or the blocks which are
18490 // part of the current loop. These instructions will be inserted at the end to
18491 // make it possible to optimize loops and hoist invariant instructions out of
18492 // the loop's body with better chances for success.
18493 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
18494 SmallSet<int, 4> PostponedIndices;
18495 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
18496 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
18497 SmallPtrSet<BasicBlock *, 4> Visited;
18498 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
18499 InsertBB = InsertBB->getSinglePredecessor();
18500 return InsertBB && InsertBB == InstBB;
18501 };
18502 for (int I = 0, E = VL.size(); I < E; ++I) {
18503 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
18504 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
18505 isVectorized(Inst) ||
18506 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
18507 PostponedIndices.insert(I).second)
18508 PostponedInsts.emplace_back(Inst, I);
18509 }
18510
18511 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
18512 Type *Ty) {
18513 Value *Scalar = V;
18514 if (Scalar->getType() != Ty) {
18515 assert(Scalar->getType()->isIntOrIntVectorTy() &&
18516 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
18517 Value *V = Scalar;
18518 if (auto *CI = dyn_cast<CastInst>(Scalar);
18519 isa_and_present<SExtInst, ZExtInst>(CI)) {
18520 Value *Op = CI->getOperand(0);
18521 if (auto *IOp = dyn_cast<Instruction>(Op);
18522 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
18523 V = Op;
18524 }
18525 Scalar = Builder.CreateIntCast(
18526 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
18527 }
18528
18529 Instruction *InsElt;
18530 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
18531 assert(SLPReVec && "FixedVectorType is not expected.");
18532 Vec =
18533 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
18534 auto *II = dyn_cast<Instruction>(Vec);
18535 if (!II)
18536 return Vec;
18537 InsElt = II;
18538 } else {
18539 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
18540 InsElt = dyn_cast<InsertElementInst>(Vec);
18541 if (!InsElt)
18542 return Vec;
18543 }
18544 GatherShuffleExtractSeq.insert(InsElt);
18545 CSEBlocks.insert(InsElt->getParent());
18546 // Add to our 'need-to-extract' list.
18547 if (isa<Instruction>(V)) {
18548 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
18549 const auto *It = find_if(Entries, [&](const TreeEntry *E) {
18550 return !TransformedToGatherNodes.contains(E) &&
18551 !DeletedNodes.contains(E);
18552 });
18553 if (It != Entries.end()) {
18554 // Find which lane we need to extract.
18555 User *UserOp = nullptr;
18556 if (Scalar != V) {
18557 if (auto *SI = dyn_cast<Instruction>(Scalar))
18558 UserOp = SI;
18559 } else {
18560 if (V->getType()->isVectorTy()) {
18561 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
18562 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18563 // Find shufflevector, caused by resize.
18564 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18565 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
18566 if (SV->getOperand(0) == V)
18567 return SV;
18568 if (SV->getOperand(1) == V)
18569 return SV;
18570 }
18571 return nullptr;
18572 };
18573 InsElt = nullptr;
18574 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18575 InsElt = User;
18576 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18577 InsElt = User;
18578 assert(InsElt &&
18579 "Failed to find shufflevector, caused by resize.");
18580 }
18581 }
18582 UserOp = InsElt;
18583 }
18584 if (UserOp) {
18585 unsigned FoundLane = (*It)->findLaneForValue(V);
18586 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
18587 }
18588 }
18589 }
18590 return Vec;
18591 };
18592 auto *VecTy = getWidenedType(ScalarTy, VL.size());
18593 Value *Vec = PoisonValue::get(VecTy);
18594 SmallVector<int> NonConsts;
18595 SmallVector<int> Mask(VL.size());
18596 std::iota(Mask.begin(), Mask.end(), 0);
18597 Value *OriginalRoot = Root;
18598 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
18599 SV && isa<PoisonValue>(SV->getOperand(1)) &&
18600 SV->getOperand(0)->getType() == VecTy) {
18601 Root = SV->getOperand(0);
18602 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18603 }
18604 // Insert constant values first.
18605 for (int I = 0, E = VL.size(); I < E; ++I) {
18606 if (PostponedIndices.contains(I))
18607 continue;
18608 if (!isConstant(VL[I])) {
18609 NonConsts.push_back(I);
18610 continue;
18611 }
18612 if (isa<PoisonValue>(VL[I]))
18613 continue;
18614 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18615 Mask[I] = I + E;
18616 }
18617 if (Root) {
18618 if (isa<PoisonValue>(Vec)) {
18619 Vec = OriginalRoot;
18620 } else {
18621 Vec = CreateShuffle(Root, Vec, Mask);
18622 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
18623 OI && OI->use_empty() &&
18624 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18625 return TE->VectorizedValue == OI;
18626 }))
18627 eraseInstruction(OI);
18628 }
18629 }
18630 // Insert non-constant values.
18631 for (int I : NonConsts)
18632 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18633 // Append instructions which are (or may be) part of the loop at the end to
18634 // make it possible to hoist non-loop-based instructions.
18635 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18636 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18637
18638 return Vec;
18639}
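// Illustrative sketch (hypothetical 4-lane gather {%x, 7, poison, %inloop},
// no Root, %inloop defined inside the enclosing loop):
// \code
//   %v0 = insertelement <4 x i32> poison, i32 7, i32 1      ; constants first
//   %v1 = insertelement <4 x i32> %v0, i32 %x, i32 0        ; then non-constants
//   %v2 = insertelement <4 x i32> %v1, i32 %inloop, i32 3   ; postponed values last
// \endcode
// Postponing loop-resident scalars keeps the loop-invariant prefix of the
// buildvector hoistable, as explained at the top of the function.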
18640
18641/// Merges shuffle masks and emits final shuffle instruction, if required. It
18642/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
18643/// when the actual shuffle instruction is generated only if this is actually
18644/// required. Otherwise, the shuffle instruction emission is delayed till the
18645/// end of the process, to reduce the number of emitted instructions and further
18646/// analysis/transformations.
18647/// The class also will look through the previously emitted shuffle instructions
18648/// and properly mark indices in mask as undef.
18649/// For example, given the code
18650/// \code
18651/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
18652/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
18653/// \endcode
18654 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
18655/// look through %s1 and %s2 and emit
18656/// \code
18657/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18658/// \endcode
18659/// instead.
18660/// If 2 operands are of different size, the smallest one will be resized and
18661/// the mask recalculated properly.
18662/// For example, given the code
18663/// \code
18664/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
18665/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
18666/// \endcode
18667 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
18668/// look through %s1 and %s2 and emit
18669/// \code
18670/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18671/// \endcode
18672/// instead.
18673class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
18674 bool IsFinalized = false;
18675 /// Combined mask for all applied operands and masks. It is built during
18676 /// analysis and actual emission of shuffle vector instructions.
18677 SmallVector<int> CommonMask;
18678 /// List of operands for the shuffle vector instruction. It holds at most 2
18679 /// operands; if a 3rd one is going to be added, the first 2 are combined into
18680 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
18681 /// resulting shuffle and the second operand is set to be the newly added
18682 /// operand. The \p CommonMask is transformed in the proper way after that.
18683 SmallVector<Value *, 2> InVectors;
18684 IRBuilderBase &Builder;
18685 BoUpSLP &R;
18686
18687 class ShuffleIRBuilder {
18688 IRBuilderBase &Builder;
18689 /// Holds all of the instructions that we gathered.
18690 SetVector<Instruction *> &GatherShuffleExtractSeq;
18691 /// A list of blocks that we are going to CSE.
18692 DenseSet<BasicBlock *> &CSEBlocks;
18693 /// Data layout.
18694 const DataLayout &DL;
18695
18696 public:
18697 ShuffleIRBuilder(IRBuilderBase &Builder,
18698 SetVector<Instruction *> &GatherShuffleExtractSeq,
18699 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
18700 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18701 CSEBlocks(CSEBlocks), DL(DL) {}
18702 ~ShuffleIRBuilder() = default;
18703 /// Creates shufflevector for the 2 operands with the given mask.
18704 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
18705 if (V1->getType() != V2->getType()) {
18706 assert(V2->getType()->isIntOrIntVectorTy() &&
18707 V1->getType()->isIntOrIntVectorTy() &&
18708 "Expected integer vector types only.");
18709 if (V1->getType() != V2->getType()) {
18710 if (cast<VectorType>(V2->getType())
18711 ->getElementType()
18712 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
18713 ->getElementType()
18714 ->getIntegerBitWidth())
18715 V2 = Builder.CreateIntCast(
18716 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
18717 else
18718 V1 = Builder.CreateIntCast(
18719 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
18720 }
18721 }
18722 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18723 if (auto *I = dyn_cast<Instruction>(Vec)) {
18724 GatherShuffleExtractSeq.insert(I);
18725 CSEBlocks.insert(I->getParent());
18726 }
18727 return Vec;
18728 }
18729 /// Creates a permutation of the single vector operand with the given mask,
18730 /// if it is not an identity mask.
18731 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18732 if (Mask.empty())
18733 return V1;
18734 unsigned VF = Mask.size();
18735 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18736 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18737 return V1;
18738 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18739 if (auto *I = dyn_cast<Instruction>(Vec)) {
18740 GatherShuffleExtractSeq.insert(I);
18741 CSEBlocks.insert(I->getParent());
18742 }
18743 return Vec;
18744 }
18745 Value *createIdentity(Value *V) { return V; }
18746 Value *createPoison(Type *Ty, unsigned VF) {
18747 return PoisonValue::get(getWidenedType(Ty, VF));
18748 }
18749 /// Resizes the 2 input vectors to match in size, if they are not equal
18750 /// yet. The smaller vector is resized to the size of the larger vector.
18751 void resizeToMatch(Value *&V1, Value *&V2) {
18752 if (V1->getType() == V2->getType())
18753 return;
18754 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18755 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18756 int VF = std::max(V1VF, V2VF);
18757 int MinVF = std::min(V1VF, V2VF);
18758 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18759 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18760 0);
18761 Value *&Op = MinVF == V1VF ? V1 : V2;
18762 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18763 if (auto *I = dyn_cast<Instruction>(Op)) {
18764 GatherShuffleExtractSeq.insert(I);
18765 CSEBlocks.insert(I->getParent());
18766 }
18767 if (MinVF == V1VF)
18768 V1 = Op;
18769 else
18770 V2 = Op;
18771 }
18772 };
18773
18774 /// Smart shuffle instruction emission, walks through shuffle trees and
18775 /// tries to find the best matching vector for the actual shuffle
18776 /// instruction.
18777 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18778 assert(V1 && "Expected at least one vector value.");
18779 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18780 R.CSEBlocks, *R.DL);
18781 return BaseShuffleAnalysis::createShuffle<Value *>(
18782 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18783 }
18784
18785 /// Cast value \p V to the vector type with the same number of elements, but
18786 /// the base type \p ScalarTy.
18787 Value *castToScalarTyElem(Value *V,
18788 std::optional<bool> IsSigned = std::nullopt) {
18789 auto *VecTy = cast<VectorType>(V->getType());
18790 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18791 if (VecTy->getElementType() == ScalarTy->getScalarType())
18792 return V;
18793 return Builder.CreateIntCast(
18794 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18795 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18796 }
18797
18798 Value *getVectorizedValue(const TreeEntry &E) {
18799 Value *Vec = E.VectorizedValue;
18800 if (!Vec->getType()->isIntOrIntVectorTy())
18801 return Vec;
18802 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18803 return !isa<PoisonValue>(V) &&
18804 !isKnownNonNegative(
18805 V, SimplifyQuery(*R.DL));
18806 }));
18807 }
18808
18809public:
18810 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18811 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18812
18813 /// Adjusts extractelements after reusing them.
18814 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18815 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18816 unsigned NumParts, bool &UseVecBaseAsInput) {
18817 UseVecBaseAsInput = false;
18818 SmallPtrSet<Value *, 4> UniqueBases;
18819 Value *VecBase = nullptr;
18820 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18821 if (!E->ReorderIndices.empty()) {
18822 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18823 E->ReorderIndices.end());
18824 reorderScalars(VL, ReorderMask);
18825 }
18826 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18827 int Idx = Mask[I];
18828 if (Idx == PoisonMaskElem)
18829 continue;
18830 auto *EI = cast<ExtractElementInst>(VL[I]);
18831 VecBase = EI->getVectorOperand();
18832 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18833 VecBase = TEs.front()->VectorizedValue;
18834 assert(VecBase && "Expected vectorized value.");
18835 UniqueBases.insert(VecBase);
18836 // If the only use is vectorized, the extractelement itself can be
18837 // deleted.
18838 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18839 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
18840 !R.isVectorized(EI) &&
18841 count_if(E->Scalars, [&](Value *V) { return V == EI; }) !=
18842 count_if(E->UserTreeIndex.UserTE->Scalars,
18843 [&](Value *V) { return V == EI; })) ||
18844 (NumParts != 1 && count(VL, EI) > 1) ||
18845 any_of(EI->users(), [&](User *U) {
18846 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18847 return UTEs.empty() || UTEs.size() > 1 ||
18848 any_of(UTEs,
18849 [&](const TreeEntry *TE) {
18850 return R.DeletedNodes.contains(TE) ||
18851 R.TransformedToGatherNodes.contains(TE);
18852 }) ||
18853 (isa<GetElementPtrInst>(U) &&
18854 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18855 (!UTEs.empty() &&
18856 count_if(R.VectorizableTree,
18857 [&](const std::unique_ptr<TreeEntry> &TE) {
18858 return TE->UserTreeIndex.UserTE ==
18859 UTEs.front() &&
18860 is_contained(VL, EI);
18861 }) != 1);
18862 }))
18863 continue;
18864 R.eraseInstruction(EI);
18865 }
18866 if (NumParts == 1 || UniqueBases.size() == 1) {
18867 assert(VecBase && "Expected vectorized value.");
18868 return castToScalarTyElem(VecBase);
18869 }
18870 UseVecBaseAsInput = true;
18871 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18872 for (auto [I, Idx] : enumerate(Mask))
18873 if (Idx != PoisonMaskElem)
18874 Idx = I;
18875 };
18876 // Perform multi-register vector shuffle, joining them into a single virtual
18877 // long vector.
18878 // Need to shuffle each part independently and then insert all these parts
18879 // into a long virtual vector register, forming the original vector.
18880 Value *Vec = nullptr;
18881 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18882 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18883 for (unsigned Part : seq<unsigned>(NumParts)) {
18884 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18885 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18886 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18887 constexpr int MaxBases = 2;
18888 SmallVector<Value *, MaxBases> Bases(MaxBases);
18889 auto VLMask = zip(SubVL, SubMask);
18890 const unsigned VF = std::accumulate(
18891 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18892 if (std::get<1>(D) == PoisonMaskElem)
18893 return S;
18894 Value *VecOp =
18895 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18896 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18897 !TEs.empty())
18898 VecOp = TEs.front()->VectorizedValue;
18899 assert(VecOp && "Expected vectorized value.");
18900 const unsigned Size =
18901 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18902 return std::max(S, Size);
18903 });
18904 for (const auto [V, I] : VLMask) {
18905 if (I == PoisonMaskElem)
18906 continue;
18907 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18908 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18909 VecOp = TEs.front()->VectorizedValue;
18910 assert(VecOp && "Expected vectorized value.");
18911 VecOp = castToScalarTyElem(VecOp);
18912 Bases[I / VF] = VecOp;
18913 }
18914 if (!Bases.front())
18915 continue;
18916 Value *SubVec;
18917 if (Bases.back()) {
18918 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18919 TransformToIdentity(SubMask);
18920 } else {
18921 SubVec = Bases.front();
18922 }
18923 if (!Vec) {
18924 Vec = SubVec;
18925 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18926 [&](unsigned P) {
18927 ArrayRef<int> SubMask =
18928 Mask.slice(P * SliceSize,
18929 getNumElems(Mask.size(),
18930 SliceSize, P));
18931 return all_of(SubMask, [](int Idx) {
18932 return Idx == PoisonMaskElem;
18933 });
18934 })) &&
18935 "Expected first part or all previous parts masked.");
18936 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18937 } else {
18938 unsigned NewVF =
18939 cast<FixedVectorType>(Vec->getType())->getNumElements();
18940 if (Vec->getType() != SubVec->getType()) {
18941 unsigned SubVecVF =
18942 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18943 NewVF = std::max(NewVF, SubVecVF);
18944 }
18945 // Adjust SubMask.
18946 for (int &Idx : SubMask)
18947 if (Idx != PoisonMaskElem)
18948 Idx += NewVF;
18949 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18950 Vec = createShuffle(Vec, SubVec, VecMask);
18951 TransformToIdentity(VecMask);
18952 }
18953 }
18954 copy(VecMask, Mask.begin());
18955 return Vec;
18956 }
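// Illustrative note on the multi-register path above: each of the NumParts
// slices is shuffled from its own (at most two) extract sources, the
// per-part results are chained into one long virtual vector, and Mask is
// rewritten in place so that, on return, it indexes into that combined
// vector rather than into the original vector operands.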
18957 /// Checks if the specified entry \p E needs to be delayed because of its
18958 /// dependency nodes.
18959 std::optional<Value *>
18960 needToDelay(const TreeEntry *E,
18961 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18962 // No need to delay emission if all deps are ready.
18963 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18964 return all_of(
18965 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18966 }))
18967 return std::nullopt;
18968 // Postpone gather emission; it will be emitted after the end of the
18969 // process to keep the correct order.
18970 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18971 return Builder.CreateAlignedLoad(
18972 ResVecTy,
18973 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18974 MaybeAlign());
18975 }
18976 /// Reset the builder to handle perfect diamond match.
18977 void resetForSameNode() {
18978 IsFinalized = false;
18979 CommonMask.clear();
18980 InVectors.clear();
18981 }
18982 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18983 /// shuffling.
18984 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18985 Value *V1 = getVectorizedValue(E1);
18986 Value *V2 = getVectorizedValue(E2);
18987 add(V1, V2, Mask);
18988 }
18989 /// Adds single input vector (in form of tree entry) and the mask for its
18990 /// shuffling.
18991 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18992 Value *V1 = getVectorizedValue(E1);
18993 add(V1, Mask);
18994 }
18995 /// Adds 2 input vectors and the mask for their shuffling.
18996 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18997 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18998 assert(isa<FixedVectorType>(V1->getType()) &&
18999 isa<FixedVectorType>(V2->getType()) &&
19000 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
19001 V1 = castToScalarTyElem(V1);
19002 V2 = castToScalarTyElem(V2);
19003 if (InVectors.empty()) {
19004 InVectors.push_back(V1);
19005 InVectors.push_back(V2);
19006 CommonMask.assign(Mask.begin(), Mask.end());
19007 return;
19008 }
19009 Value *Vec = InVectors.front();
19010 if (InVectors.size() == 2) {
19011 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19012 transformMaskAfterShuffle(CommonMask, CommonMask);
19013 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
19014 Mask.size()) {
19015 Vec = createShuffle(Vec, nullptr, CommonMask);
19016 transformMaskAfterShuffle(CommonMask, CommonMask);
19017 }
19018 V1 = createShuffle(V1, V2, Mask);
19019 unsigned VF = std::max(getVF(V1), getVF(Vec));
19020 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19021 if (Mask[Idx] != PoisonMaskElem)
19022 CommonMask[Idx] = Idx + VF;
19023 InVectors.front() = Vec;
19024 if (InVectors.size() == 2)
19025 InVectors.back() = V1;
19026 else
19027 InVectors.push_back(V1);
19028 }
19029 /// Adds one more input vector and the mask for the shuffling.
19030 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
19031 assert(isa<FixedVectorType>(V1->getType()) &&
19032 "castToScalarTyElem expects V1 to be FixedVectorType");
19033 V1 = castToScalarTyElem(V1);
19034 if (InVectors.empty()) {
19035 InVectors.push_back(V1);
19036 CommonMask.assign(Mask.begin(), Mask.end());
19037 return;
19038 }
19039 const auto *It = find(InVectors, V1);
19040 if (It == InVectors.end()) {
19041 if (InVectors.size() == 2 ||
19042 InVectors.front()->getType() != V1->getType()) {
19043 Value *V = InVectors.front();
19044 if (InVectors.size() == 2) {
19045 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19046 transformMaskAfterShuffle(CommonMask, CommonMask);
19047 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
19048 CommonMask.size()) {
19049 V = createShuffle(InVectors.front(), nullptr, CommonMask);
19050 transformMaskAfterShuffle(CommonMask, CommonMask);
19051 }
19052 unsigned VF = std::max(CommonMask.size(), Mask.size());
19053 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19054 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
19055 CommonMask[Idx] = V->getType() != V1->getType()
19056 ? Idx + VF
19057 : Mask[Idx] + getVF(V1);
19058 if (V->getType() != V1->getType())
19059 V1 = createShuffle(V1, nullptr, Mask);
19060 InVectors.front() = V;
19061 if (InVectors.size() == 2)
19062 InVectors.back() = V1;
19063 else
19064 InVectors.push_back(V1);
19065 return;
19066 }
19067 // Check if second vector is required if the used elements are already
19068 // used from the first one.
19069 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19070 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
19071 InVectors.push_back(V1);
19072 break;
19073 }
19074 }
19075 unsigned VF = 0;
19076 for (Value *V : InVectors)
19077 VF = std::max(VF, getVF(V));
19078 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19079 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
19080 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19081 }
19082 /// Adds one more input vector and the mask for the shuffling.
19083 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
19084 SmallVector<int> NewMask;
19085 inversePermutation(Order, NewMask);
19086 add(V1, NewMask);
19087 }
19088 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
19089 Value *Root = nullptr) {
19090 return R.gather(VL, Root, ScalarTy,
19091 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
19092 return createShuffle(V1, V2, Mask);
19093 });
19094 }
19095 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
19096 /// Finalize emission of the shuffles.
19097 /// \param Action the action (if any) to be performed before final applying of
19098 /// \param Action the action (if any) to be performed before finally applying the \p ExtMask mask.
19099 Value *finalize(
19100 ArrayRef<int> ExtMask,
19101 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19102 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
19103 function_ref<void(Value *&, SmallVectorImpl<int> &,
19104 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
19105 Action = {}) {
19106 IsFinalized = true;
19107 if (Action) {
19108 Value *Vec = InVectors.front();
19109 if (InVectors.size() == 2) {
19110 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19111 InVectors.pop_back();
19112 } else {
19113 Vec = createShuffle(Vec, nullptr, CommonMask);
19114 }
19115 transformMaskAfterShuffle(CommonMask, CommonMask);
19116 assert(VF > 0 &&
19117 "Expected vector length for the final value before action.");
19118 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
19119 if (VecVF < VF) {
19120 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
19121 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
19122 Vec = createShuffle(Vec, nullptr, ResizeMask);
19123 }
19124 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
19125 return createShuffle(V1, V2, Mask);
19126 });
19127 InVectors.front() = Vec;
19128 }
19129 if (!SubVectors.empty()) {
19130 Value *Vec = InVectors.front();
19131 if (InVectors.size() == 2) {
19132 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19133 InVectors.pop_back();
19134 } else {
19135 Vec = createShuffle(Vec, nullptr, CommonMask);
19136 }
19137 transformMaskAfterShuffle(CommonMask, CommonMask);
19138 auto CreateSubVectors = [&](Value *Vec,
19139 SmallVectorImpl<int> &CommonMask) {
19140 for (auto [E, Idx] : SubVectors) {
19141 Value *V = getVectorizedValue(*E);
19142 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
19143 // Use the scalar version of ScalarTy to correctly handle shuffles
19144 // for revectorization. The revectorization mode operates on
19145 // vectors, but here we need to operate on the scalars, because the
19146 // masks were already transformed for the vector elements and we don't
19147 // need to do this transformation again.
19148 Type *OrigScalarTy = ScalarTy;
19149 ScalarTy = ScalarTy->getScalarType();
19150 Vec = createInsertVector(
19151 Builder, Vec, V, InsertionIndex,
19152 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
19153 _3));
19154 ScalarTy = OrigScalarTy;
19155 if (!CommonMask.empty()) {
19156 std::iota(std::next(CommonMask.begin(), Idx),
19157 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
19158 Idx);
19159 }
19160 }
19161 return Vec;
19162 };
19163 if (SubVectorsMask.empty()) {
19164 Vec = CreateSubVectors(Vec, CommonMask);
19165 } else {
19166 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
19167 copy(SubVectorsMask, SVMask.begin());
19168 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
19169 if (I2 != PoisonMaskElem) {
19170 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
19171 I1 = I2 + CommonMask.size();
19172 }
19173 }
19174 Value *InsertVec =
19175 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
19176 Vec = createShuffle(InsertVec, Vec, SVMask);
19177 transformMaskAfterShuffle(CommonMask, SVMask);
19178 }
19179 InVectors.front() = Vec;
19180 }
19181
19182 if (!ExtMask.empty()) {
19183 if (CommonMask.empty()) {
19184 CommonMask.assign(ExtMask.begin(), ExtMask.end());
19185 } else {
19186 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
19187 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
19188 if (ExtMask[I] == PoisonMaskElem)
19189 continue;
19190 NewMask[I] = CommonMask[ExtMask[I]];
19191 }
19192 CommonMask.swap(NewMask);
19193 }
19194 }
19195 if (CommonMask.empty()) {
19196 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
19197 return InVectors.front();
19198 }
19199 if (InVectors.size() == 2)
19200 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19201 return createShuffle(InVectors.front(), nullptr, CommonMask);
19202 }
19203
19204 ~ShuffleInstructionBuilder() {
19205 assert((IsFinalized || CommonMask.empty()) &&
19206 "Shuffle construction must be finalized.");
19207 }
19208};
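// Usage sketch for the builder above (hypothetical flow): add(E1, E2, Mask)
// only records the two vectorized values and the combined mask; adding a
// third operand first folds the recorded pair into a single shufflevector and
// rebases CommonMask; finalize() then emits the remaining shuffle (or returns
// the single input unchanged when CommonMask is empty), after optionally
// running the Action callback and inserting the SubVectors entries.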
19209
19210Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
19211 return vectorizeTree(getOperandEntry(E, NodeIdx));
19212}
19213
19214template <typename BVTy, typename ResTy, typename... Args>
19215ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
19216 Args &...Params) {
19217 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
19218 "Expected gather node.");
19219 unsigned VF = E->getVectorFactor();
19220
19221 bool NeedFreeze = false;
19222 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
19223 // Clear values, to be replaced by insertvector instructions.
19224 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19225 for_each(MutableArrayRef(GatheredScalars)
19226 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
19227 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
19228 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19229 E->CombinedEntriesWithIndices.size());
19230 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
19231 [&](const auto &P) {
19232 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19233 });
19234 // Build a mask out of the reorder indices and reorder scalars per this
19235 // mask.
19236 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
19237 E->ReorderIndices.end());
19238 if (!ReorderMask.empty())
19239 reorderScalars(GatheredScalars, ReorderMask);
19240 SmallVector<int> SubVectorsMask;
19241 inversePermutation(E->ReorderIndices, SubVectorsMask);
19242 // Transform non-clustered elements in the mask to poison (-1).
19243 // "Clustered" operations will be reordered using this mask later.
19244 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
19245 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
19246 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
19247 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
19248 } else {
19249 SubVectorsMask.clear();
19250 }
19251 SmallVector<Value *> StoredGS(GatheredScalars);
19252 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
19253 unsigned I, unsigned SliceSize,
19254 bool IsNotPoisonous) {
19255 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
19256 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19257 }))
19258 return false;
19259 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19260 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19261 if (UserTE->getNumOperands() != 2)
19262 return false;
19263 if (!IsNotPoisonous) {
19264 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
19265 [=](const std::unique_ptr<TreeEntry> &TE) {
19266 return TE->UserTreeIndex.UserTE == UserTE &&
19267 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19268 });
19269 if (It == VectorizableTree.end())
19270 return false;
19271 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
19272 if (!(*It)->ReorderIndices.empty()) {
19273 inversePermutation((*It)->ReorderIndices, ReorderMask);
19274 reorderScalars(GS, ReorderMask);
19275 }
19276 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
19277 Value *V0 = std::get<0>(P);
19278 Value *V1 = std::get<1>(P);
19279 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
19280 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
19281 is_contained(E->Scalars, V1));
19282 }))
19283 return false;
19284 }
19285 int Idx;
19286 if ((Mask.size() < InputVF &&
19287 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
19288 Idx == 0) ||
19289 (Mask.size() == InputVF &&
19290 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
19291 std::iota(
19292 std::next(Mask.begin(), I * SliceSize),
19293 std::next(Mask.begin(),
19294 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19295 0);
19296 } else {
19297 unsigned IVal =
19298 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
19299 std::fill(
19300 std::next(Mask.begin(), I * SliceSize),
19301 std::next(Mask.begin(),
19302 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19303 IVal);
19304 }
19305 return true;
19306 };
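// Illustration (hypothetical values): for a splat entry that can reuse an
// existing vector, if the mask is an extract-subvector of the source
// starting at index 0 (or a full identity), the relevant slice is rewritten
// to the identity <0,1,2,3>; otherwise every element of the slice is filled
// with the first used lane of the mask, e.g. a mask that only references
// lane 2 turns the slice into <2,2,2,2>.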
19307 BVTy ShuffleBuilder(ScalarTy, Params...);
19308 ResTy Res = ResTy();
19309 SmallVector<int> Mask;
19310 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
19311 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
19312 Value *ExtractVecBase = nullptr;
19313 bool UseVecBaseAsInput = false;
19314 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
19315 SmallVector<SmallVector<const TreeEntry *>> Entries;
19316 Type *OrigScalarTy = GatheredScalars.front()->getType();
19317 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
19318 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
19319 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
19320 // Check for gathered extracts.
19321 bool Resized = false;
19322 ExtractShuffles =
19323 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
19324 if (!ExtractShuffles.empty()) {
19325 SmallVector<const TreeEntry *> ExtractEntries;
19326 for (auto [Idx, I] : enumerate(ExtractMask)) {
19327 if (I == PoisonMaskElem)
19328 continue;
19329 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
19330 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
19331 !TEs.empty())
19332 ExtractEntries.append(TEs.begin(), TEs.end());
19333 }
19334 if (std::optional<ResTy> Delayed =
19335 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19336 // Delay emission of gathers which are not ready yet.
19337 PostponedGathers.insert(E);
19338 // Postpone gather emission, will be emitted after the end of the
19339 // process to keep correct order.
19340 return *Delayed;
19341 }
19342 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
19343 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19344 ExtractVecBase = VecBase;
19345 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
19346 if (VF == VecBaseTy->getNumElements() &&
19347 GatheredScalars.size() != VF) {
19348 Resized = true;
19349 GatheredScalars.append(VF - GatheredScalars.size(),
19350 PoisonValue::get(OrigScalarTy));
19351 NumParts =
19352 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
19353 }
19354 }
19355 }
19356 // Gather extracts after we check for full matched gathers only.
19357 if (!ExtractShuffles.empty() || !E->hasState() ||
19358 E->getOpcode() != Instruction::Load ||
19359 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19360 any_of(E->Scalars, IsaPred<LoadInst>)) &&
19361 any_of(E->Scalars,
19362 [this](Value *V) {
19363 return isa<LoadInst>(V) && isVectorized(V);
19364 })) ||
19365 (E->hasState() && E->isAltShuffle()) ||
19366 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
19367 isSplat(E->Scalars) ||
19368 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
19369 GatherShuffles =
19370 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19371 }
19372 if (!GatherShuffles.empty()) {
19373 if (std::optional<ResTy> Delayed =
19374 ShuffleBuilder.needToDelay(E, Entries)) {
19375 // Delay emission of gathers which are not ready yet.
19376 PostponedGathers.insert(E);
19377 // Postpone gather emission, will be emitted after the end of the
19378 // process to keep correct order.
19379 return *Delayed;
19380 }
19381 if (GatherShuffles.size() == 1 &&
19382 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
19383 Entries.front().front()->isSame(E->Scalars)) {
19384 // Perfect match in the graph, will reuse the previously vectorized
19385 // node. Cost is 0.
19386 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
19387 << shortBundleName(E->Scalars, E->Idx) << ".\n");
19388 // Restore the mask for previous partially matched values.
19389 Mask.resize(E->Scalars.size());
19390 const TreeEntry *FrontTE = Entries.front().front();
19391 if (FrontTE->ReorderIndices.empty() &&
19392 ((FrontTE->ReuseShuffleIndices.empty() &&
19393 E->Scalars.size() == FrontTE->Scalars.size()) ||
19394 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19395 std::iota(Mask.begin(), Mask.end(), 0);
19396 } else {
19397 for (auto [I, V] : enumerate(E->Scalars)) {
19398 if (isa<PoisonValue>(V)) {
19399 Mask[I] = PoisonMaskElem;
19400 continue;
19401 }
19402 Mask[I] = FrontTE->findLaneForValue(V);
19403 }
19404 }
19405 // Reset the builder(s) to correctly handle perfect diamond matched
19406 // nodes.
19407 ShuffleBuilder.resetForSameNode();
19408 ShuffleBuilder.add(*FrontTE, Mask);
19409 // Full matched entry found, no need to insert subvectors.
19410 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19411 return Res;
19412 }
19413 if (!Resized) {
19414 if (GatheredScalars.size() != VF &&
19415 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
19416 return any_of(TEs, [&](const TreeEntry *TE) {
19417 return TE->getVectorFactor() == VF;
19418 });
19419 }))
19420 GatheredScalars.append(VF - GatheredScalars.size(),
19421 PoisonValue::get(OrigScalarTy));
19422 }
19423 // Remove shuffled elements from list of gathers.
19424 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19425 if (Mask[I] != PoisonMaskElem)
19426 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19427 }
19428 }
19429 }
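// At this point ExtractShuffles/GatherShuffles describe how much of the
// gather can be served by existing vectors; any scalar still left in
// GatheredScalars (i.e. not replaced by poison above) has to be
// materialized explicitly further down.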
19430 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
19431 SmallVectorImpl<int> &ReuseMask,
19432 bool IsRootPoison) {
19433 // For splats we can emit broadcasts instead of gathers, so try to find
19434 // such sequences.
19435 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
19436 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
19437 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
19438 SmallVector<int> UndefPos;
19439 DenseMap<Value *, unsigned> UniquePositions;
19440 // Gather unique non-const values and all constant values.
19441 // For repeated values, just shuffle them.
19442 int NumNonConsts = 0;
19443 int SinglePos = 0;
19444 for (auto [I, V] : enumerate(Scalars)) {
19445 if (isa<UndefValue>(V)) {
19446 if (!isa<PoisonValue>(V)) {
19447 ReuseMask[I] = I;
19448 UndefPos.push_back(I);
19449 }
19450 continue;
19451 }
19452 if (isConstant(V)) {
19453 ReuseMask[I] = I;
19454 continue;
19455 }
19456 ++NumNonConsts;
19457 SinglePos = I;
19458 Value *OrigV = V;
19459 Scalars[I] = PoisonValue::get(OrigScalarTy);
19460 if (IsSplat) {
19461 Scalars.front() = OrigV;
19462 ReuseMask[I] = 0;
19463 } else {
19464 const auto Res = UniquePositions.try_emplace(OrigV, I);
19465 Scalars[Res.first->second] = OrigV;
19466 ReuseMask[I] = Res.first->second;
19467 }
19468 }
19469 if (NumNonConsts == 1) {
19470 // Restore single insert element.
19471 if (IsSplat) {
19472 ReuseMask.assign(VF, PoisonMaskElem);
19473 std::swap(Scalars.front(), Scalars[SinglePos]);
19474 if (!UndefPos.empty() && UndefPos.front() == 0)
19475 Scalars.front() = UndefValue::get(OrigScalarTy);
19476 }
19477 ReuseMask[SinglePos] = SinglePos;
19478 } else if (!UndefPos.empty() && IsSplat) {
19479 // For undef values, try to replace them with the simple broadcast.
19480 // We can do it if the broadcasted value is guaranteed to be
19481 // non-poisonous, or by freezing the incoming scalar value first.
19482 auto *It = find_if(Scalars, [this, E](Value *V) {
19483 return !isa<UndefValue>(V) &&
19484 (isGuaranteedNotToBePoison(V, AC) ||
19485 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
19486 // Check if the value is already used in the same operation in
19487 // one of the nodes.
19488 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19489 is_contained(E->UserTreeIndex.UserTE->Scalars,
19490 U.getUser());
19491 })));
19492 });
19493 if (It != Scalars.end()) {
19494 // Replace undefs by the non-poisoned scalars and emit broadcast.
19495 int Pos = std::distance(Scalars.begin(), It);
19496 for (int I : UndefPos) {
19497 // Set the undef position to the non-poisoned scalar.
19498 ReuseMask[I] = Pos;
19499 // Replace the undef with poison; in the mask it has already been
19500 // replaced by the non-poisoned scalar.
19501 if (I != Pos)
19502 Scalars[I] = PoisonValue::get(OrigScalarTy);
19503 }
19504 } else {
19505 // Replace undefs by the poisons, emit broadcast and then emit
19506 // freeze.
19507 for (int I : UndefPos) {
19508 ReuseMask[I] = PoisonMaskElem;
19509 if (isa<UndefValue>(Scalars[I]))
19510 Scalars[I] = PoisonValue::get(OrigScalarTy);
19511 }
19512 NeedFreeze = true;
19513 }
19514 }
19515 };
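// Worked example for TryPackScalars (hypothetical scalars): packing
// {x, 3, x, undef} with IsRootPoison=true keeps the constant, deduplicates
// the repeated x and records the reuse in the mask, producing scalars
// roughly {x, 3, poison, undef} with ReuseMask {0, 1, 0, 3}; the final
// broadcast/shuffle is then emitted from this compacted form.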
19516 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
19517 bool IsNonPoisoned = true;
19518 bool IsUsedInExpr = true;
19519 Value *Vec1 = nullptr;
19520 if (!ExtractShuffles.empty()) {
19521 // Gather of extractelements can be represented as just a shuffle of
19522 // one or two vectors that the scalars are extracted from.
19523 // Find input vectors.
19524 Value *Vec2 = nullptr;
19525 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19526 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
19527 ExtractMask[I] = PoisonMaskElem;
19528 }
19529 if (UseVecBaseAsInput) {
19530 Vec1 = ExtractVecBase;
19531 } else {
19532 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19533 if (ExtractMask[I] == PoisonMaskElem)
19534 continue;
19535 if (isa<UndefValue>(StoredGS[I]))
19536 continue;
19537 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
19538 Value *VecOp = EI->getVectorOperand();
19539 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
19540 !TEs.empty() && TEs.front()->VectorizedValue)
19541 VecOp = TEs.front()->VectorizedValue;
19542 if (!Vec1) {
19543 Vec1 = VecOp;
19544 } else if (Vec1 != VecOp) {
19545 assert((!Vec2 || Vec2 == VecOp) &&
19546 "Expected only 1 or 2 vectors shuffle.");
19547 Vec2 = VecOp;
19548 }
19549 }
19550 }
19551 if (Vec2) {
19552 IsUsedInExpr = false;
19553 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
19554 isGuaranteedNotToBePoison(Vec2, AC);
19555 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19556 } else if (Vec1) {
19557 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
19558 IsUsedInExpr &= FindReusedSplat(
19559 ExtractMask,
19560 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
19561 ExtractMask.size(), IsNotPoisonedVec);
19562 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
19563 IsNonPoisoned &= IsNotPoisonedVec;
19564 } else {
19565 IsUsedInExpr = false;
19566 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
19567 /*ForExtracts=*/true);
19568 }
19569 }
19570 if (!GatherShuffles.empty()) {
19571 unsigned SliceSize =
19572 getPartNumElems(E->Scalars.size(),
19573 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
19574 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19575 for (const auto [I, TEs] : enumerate(Entries)) {
19576 if (TEs.empty()) {
19577 assert(!GatherShuffles[I] &&
19578 "No shuffles with empty entries list expected.");
19579 continue;
19580 }
19581 assert((TEs.size() == 1 || TEs.size() == 2) &&
19582 "Expected shuffle of 1 or 2 entries.");
19583 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
19584 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
19585 VecMask.assign(VecMask.size(), PoisonMaskElem);
19586 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19587 if (TEs.size() == 1) {
19588 bool IsNotPoisonedVec =
19589 TEs.front()->VectorizedValue
19590 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
19591 : true;
19592 IsUsedInExpr &=
19593 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19594 SliceSize, IsNotPoisonedVec);
19595 ShuffleBuilder.add(*TEs.front(), VecMask);
19596 IsNonPoisoned &= IsNotPoisonedVec;
19597 } else {
19598 IsUsedInExpr = false;
19599 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19600 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19601 IsNonPoisoned &=
19602 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
19603 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
19604 }
19605 }
19606 }
19607 // Try to figure out the best way to combine the values: build a shuffle
19608 // and insert elements, or just build several shuffles.
19609 // Insert non-constant scalars.
19610 SmallVector<Value *> NonConstants(GatheredScalars);
19611 int EMSz = ExtractMask.size();
19612 int MSz = Mask.size();
19613 // Try to build a constant vector and shuffle with it only if we
19614 // currently have a single permutation and more than one scalar constant.
19615 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19616 bool IsIdentityShuffle =
19617 ((UseVecBaseAsInput ||
19618 all_of(ExtractShuffles,
19619 [](const std::optional<TTI::ShuffleKind> &SK) {
19620 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19621 TTI::SK_PermuteSingleSrc;
19622 })) &&
19623 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19624 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
19625 (!GatherShuffles.empty() &&
19626 all_of(GatherShuffles,
19627 [](const std::optional<TTI::ShuffleKind> &SK) {
19628 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19629 TTI::SK_PermuteSingleSrc;
19630 }) &&
19631 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19632 ShuffleVectorInst::isIdentityMask(Mask, MSz));
19633 bool EnoughConstsForShuffle =
19634 IsSingleShuffle &&
19635 (none_of(GatheredScalars,
19636 [](Value *V) {
19637 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19638 }) ||
19639 any_of(GatheredScalars,
19640 [](Value *V) {
19641 return isa<Constant>(V) && !isa<UndefValue>(V);
19642 })) &&
19643 (!IsIdentityShuffle ||
19644 (GatheredScalars.size() == 2 &&
19645 any_of(GatheredScalars,
19646 [](Value *V) { return !isa<UndefValue>(V); })) ||
19647 count_if(GatheredScalars, [](Value *V) {
19648 return isa<Constant>(V) && !isa<PoisonValue>(V);
19649 }) > 1);
19650 // NonConstants contains just the non-constant values; GatheredScalars
19651 // contains only the constants used to build the final vector to shuffle.
19652 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19653 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19654 NonConstants[I] = PoisonValue::get(OrigScalarTy);
19655 else
19656 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19657 }
19658 // Generate constants for final shuffle and build a mask for them.
19659 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
19660 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19661 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19662 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
19663 ShuffleBuilder.add(BV, BVMask);
19664 }
19665 if (all_of(NonConstants, [=](Value *V) {
19666 return isa<PoisonValue>(V) ||
19667 (IsSingleShuffle && ((IsIdentityShuffle &&
19668 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
19669 }))
19670 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19671 SubVectorsMask);
19672 else
19673 Res = ShuffleBuilder.finalize(
19674 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
19675 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
19676 bool IsSplat = isSplat(NonConstants);
19677 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19678 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
19679 auto CheckIfSplatIsProfitable = [&]() {
19680 // Estimate the cost of splatting + shuffle and compare with
19681 // insert + shuffle.
19682 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19683 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19684 if (isa<ExtractElementInst>(V) || isVectorized(V))
19685 return false;
19686 InstructionCost SplatCost = TTI->getVectorInstrCost(
19687 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
19688 PoisonValue::get(VecTy), V);
19689 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19690 for (auto [Idx, I] : enumerate(BVMask))
19691 if (I != PoisonMaskElem)
19692 NewMask[Idx] = Mask.size();
19693 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19694 NewMask, CostKind);
19695 InstructionCost BVCost = TTI->getVectorInstrCost(
19696 Instruction::InsertElement, VecTy, CostKind,
19697 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
19698 // Shuffle required?
19699 if (count(BVMask, PoisonMaskElem) <
19700 static_cast<int>(BVMask.size() - 1)) {
19701 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19702 for (auto [Idx, I] : enumerate(BVMask))
19703 if (I != PoisonMaskElem)
19704 NewMask[Idx] = I;
19705 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19706 VecTy, NewMask, CostKind);
19707 }
19708 return SplatCost <= BVCost;
19709 };
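// Illustration with hypothetical costs: if broadcasting the splat value
// costs 1 and the required two-source shuffle costs 2, while inserting the
// value directly into Vec costs 1 and only needs a cheaper single-source
// shuffle, then BVCost wins (SplatCost > BVCost) and the plain gather path
// below is taken instead of the splat + shuffle sequence.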
19710 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19711 for (auto [Idx, I] : enumerate(BVMask))
19712 if (I != PoisonMaskElem)
19713 Mask[Idx] = I;
19714 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19715 } else {
19716 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19717 SmallVector<Value *> Values(NonConstants.size(),
19718 PoisonValue::get(ScalarTy));
19719 Values[0] = V;
19720 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
19721 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
19722 transform(BVMask, SplatMask.begin(), [](int I) {
19723 return I == PoisonMaskElem ? PoisonMaskElem : 0;
19724 });
19725 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
19726 BV = CreateShuffle(BV, nullptr, SplatMask);
19727 for (auto [Idx, I] : enumerate(BVMask))
19728 if (I != PoisonMaskElem)
19729 Mask[Idx] = BVMask.size() + Idx;
19730 Vec = CreateShuffle(Vec, BV, Mask);
19731 for (auto [Idx, I] : enumerate(Mask))
19732 if (I != PoisonMaskElem)
19733 Mask[Idx] = Idx;
19734 }
19735 });
19736 } else if (!allConstant(GatheredScalars)) {
19737 // Gather unique scalars and all constants.
19738 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
19739 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
19740 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19741 ShuffleBuilder.add(BV, ReuseMask);
19742 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19743 SubVectorsMask);
19744 } else {
19745 // Gather all constants.
19746 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19747 for (auto [I, V] : enumerate(GatheredScalars)) {
19748 if (!isa<PoisonValue>(V))
19749 Mask[I] = I;
19750 }
19751 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19752 ShuffleBuilder.add(BV, Mask);
19753 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19754 SubVectorsMask);
19755 }
19756
19757 if (NeedFreeze)
19758 Res = ShuffleBuilder.createFreeze(Res);
19759 return Res;
19760}
19761
19762Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19763 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19764 (void)vectorizeTree(VectorizableTree[EIdx].get());
19765 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19766 Builder, *this);
19767}
19768
19769/// \returns \p I after propagating metadata from \p VL only for instructions in
19770/// \p VL.
19771 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19772 SmallVector<Value *> Insts;
19773 for (Value *V : VL)
19774 if (isa<Instruction>(V))
19775 Insts.push_back(V);
19776 return llvm::propagateMetadata(Inst, Insts);
19777}
19778
19779 static DebugLoc getDebugLocFromPHI(PHINode &PN) {
19780 if (DebugLoc DL = PN.getDebugLoc())
19781 return DL;
19782 return DebugLoc::getUnknown();
19783}
19784
19785Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19786 IRBuilderBase::InsertPointGuard Guard(Builder);
19787
19788 Value *V = E->Scalars.front();
19789 Type *ScalarTy = V->getType();
19790 if (!isa<CmpInst>(V))
19791 ScalarTy = getValueType(V);
19792 auto It = MinBWs.find(E);
19793 if (It != MinBWs.end()) {
19794 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19795 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19796 if (VecTy)
19797 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19798 }
19799 if (E->VectorizedValue)
19800 return E->VectorizedValue;
19801 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19802 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
19803 // Set insert point for non-reduction initial nodes.
19804 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19805 setInsertPointAfterBundle(E);
19806 Value *Vec = createBuildVector(E, ScalarTy);
19807 E->VectorizedValue = Vec;
19808 return Vec;
19809 }
19810 if (E->State == TreeEntry::SplitVectorize) {
19811 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19812 "Expected exactly 2 combined entries.");
19813 setInsertPointAfterBundle(E);
19814 TreeEntry &OpTE1 =
19815 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19816 assert(OpTE1.isSame(
19817 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19818 "Expected same first part of scalars.");
19819 Value *Op1 = vectorizeTree(&OpTE1);
19820 TreeEntry &OpTE2 =
19821 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19822 assert(
19823 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19824 "Expected same second part of scalars.");
19825 Value *Op2 = vectorizeTree(&OpTE2);
19826 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19827 bool IsSigned = false;
19828 auto It = MinBWs.find(OpE);
19829 if (It != MinBWs.end())
19830 IsSigned = It->second.second;
19831 else
19832 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19833 if (isa<PoisonValue>(V))
19834 return false;
19835 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19836 });
19837 return IsSigned;
19838 };
19839 if (cast<VectorType>(Op1->getType())->getElementType() !=
19840 ScalarTy->getScalarType()) {
19841 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19842 Op1 = Builder.CreateIntCast(
19843 Op1,
19844 getWidenedType(
19845 ScalarTy,
19846 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19847 GetOperandSignedness(&OpTE1));
19848 }
19849 if (cast<VectorType>(Op2->getType())->getElementType() !=
19850 ScalarTy->getScalarType()) {
19851 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19852 Op2 = Builder.CreateIntCast(
19853 Op2,
19854 getWidenedType(
19855 ScalarTy,
19856 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19857 GetOperandSignedness(&OpTE2));
19858 }
19859 if (E->ReorderIndices.empty()) {
19860 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19861 std::iota(
19862 Mask.begin(),
19863 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19864 0);
19865 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19866 if (ScalarTyNumElements != 1) {
19867 assert(SLPReVec && "Only supported by REVEC.");
19868 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19869 }
19870 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19871 Vec = createInsertVector(Builder, Vec, Op2,
19872 E->CombinedEntriesWithIndices.back().second *
19873 ScalarTyNumElements);
19874 E->VectorizedValue = Vec;
19875 return Vec;
19876 }
19877 unsigned CommonVF =
19878 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19879 if (getNumElements(Op1->getType()) != CommonVF) {
19880 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19881 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19882 0);
19883 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19884 }
19885 if (getNumElements(Op2->getType()) != CommonVF) {
19886 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19887 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19888 0);
19889 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19890 }
19891 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19892 E->VectorizedValue = Vec;
19893 return Vec;
19894 }
19895
19896 bool IsReverseOrder =
19897 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19898 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19899 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19900 if (E->getOpcode() == Instruction::Store &&
19901 E->State == TreeEntry::Vectorize) {
19902 ArrayRef<int> Mask =
19903 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19904 E->ReorderIndices.size());
19905 ShuffleBuilder.add(V, Mask);
19906 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19907 E->State == TreeEntry::CompressVectorize) {
19908 ShuffleBuilder.addOrdered(V, {});
19909 } else {
19910 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19911 }
19912 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19913 E->CombinedEntriesWithIndices.size());
19914 transform(
19915 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19916 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19917 });
19918 assert(
19919 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19920 "Expected either combined subnodes or reordering");
19921 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19922 };
19923
19924 assert(!E->isGather() && "Unhandled state");
19925 unsigned ShuffleOrOp =
19926 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19927 Instruction *VL0 = E->getMainOp();
19928 auto GetOperandSignedness = [&](unsigned Idx) {
19929 const TreeEntry *OpE = getOperandEntry(E, Idx);
19930 bool IsSigned = false;
19931 auto It = MinBWs.find(OpE);
19932 if (It != MinBWs.end())
19933 IsSigned = It->second.second;
19934 else
19935 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19936 if (isa<PoisonValue>(V))
19937 return false;
19938 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19939 });
19940 return IsSigned;
19941 };
19942 switch (ShuffleOrOp) {
19943 case Instruction::PHI: {
19944 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19945 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19946 "PHI reordering is free.");
19947 auto *PH = cast<PHINode>(VL0);
19948 Builder.SetInsertPoint(PH->getParent(),
19949 PH->getParent()->getFirstNonPHIIt());
19950 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19951 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19952 Value *V = NewPhi;
19953
19954 // Adjust the insertion point once all PHIs have been generated.
19955 Builder.SetInsertPoint(PH->getParent(),
19956 PH->getParent()->getFirstInsertionPt());
19957 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19958
19959 V = FinalShuffle(V, E);
19960
19961 E->VectorizedValue = V;
19962 // If phi node is fully emitted - exit.
19963 if (NewPhi->getNumIncomingValues() != 0)
19964 return NewPhi;
19965
19966 // PHINodes may have multiple entries from the same block. We want to
19967 // visit every block once.
19968 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19969
19970 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19971 BasicBlock *IBB = PH->getIncomingBlock(I);
19972
19973 // Stop emission if all incoming values are generated.
19974 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19975 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19976 return NewPhi;
19977 }
19978
19979 if (!VisitedBBs.insert(IBB).second) {
19980 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19981 NewPhi->addIncoming(VecOp, IBB);
19982 TreeEntry *OpTE = getOperandEntry(E, I);
19983 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19984 OpTE->VectorizedValue = VecOp;
19985 continue;
19986 }
19987
19988 Builder.SetInsertPoint(IBB->getTerminator());
19989 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19990 Value *Vec = vectorizeOperand(E, I);
19991 if (VecTy != Vec->getType()) {
19992 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19993 MinBWs.contains(getOperandEntry(E, I))) &&
19994 "Expected item in MinBWs.");
19995 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19996 }
19997 NewPhi->addIncoming(Vec, IBB);
19998 }
19999
20000 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
20001 "Invalid number of incoming values");
20002 assert(E->VectorizedValue && "Expected vectorized value.");
20003 return E->VectorizedValue;
20004 }
20005
20006 case Instruction::ExtractElement: {
20007 Value *V = E->getSingleOperand(0);
20008 setInsertPointAfterBundle(E);
20009 V = FinalShuffle(V, E);
20010 E->VectorizedValue = V;
20011 return V;
20012 }
20013 case Instruction::ExtractValue: {
20014 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
20015 Builder.SetInsertPoint(LI);
20016 Value *Ptr = LI->getPointerOperand();
20017 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
20018 Value *NewV = ::propagateMetadata(V, E->Scalars);
20019 NewV = FinalShuffle(NewV, E);
20020 E->VectorizedValue = NewV;
20021 return NewV;
20022 }
20023 case Instruction::InsertElement: {
20024 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20025 if (const TreeEntry *OpE = getOperandEntry(E, 1);
20026 OpE && !OpE->isGather() && OpE->hasState() &&
20027 !OpE->hasCopyableElements())
20028 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
20029 else
20030 setInsertPointAfterBundle(E);
20031 Value *V = vectorizeOperand(E, 1);
20032 ArrayRef<Value *> Op = E->getOperand(1);
20033 Type *ScalarTy = Op.front()->getType();
20034 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
20035 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20036 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
20037 assert(Res.first > 0 && "Expected item in MinBWs.");
20038 V = Builder.CreateIntCast(
20039 V,
20040 getWidenedType(
20041 ScalarTy,
20042 cast<FixedVectorType>(V->getType())->getNumElements()),
20043 Res.second);
20044 }
20045
20046 // Create InsertVector shuffle if necessary
20047 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
20048 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
20049 }));
20050 const unsigned NumElts =
20051 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
20052 const unsigned NumScalars = E->Scalars.size();
20053
20054 unsigned Offset = *getElementIndex(VL0);
20055 assert(Offset < NumElts && "Failed to find vector index offset");
20056
20057 // Create shuffle to resize vector
20058 SmallVector<int> Mask;
20059 if (!E->ReorderIndices.empty()) {
20060 inversePermutation(E->ReorderIndices, Mask);
20061 Mask.append(NumElts - NumScalars, PoisonMaskElem);
20062 } else {
20063 Mask.assign(NumElts, PoisonMaskElem);
20064 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
20065 }
20066 // Create InsertVector shuffle if necessary
20067 bool IsIdentity = true;
20068 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
20069 Mask.swap(PrevMask);
20070 for (unsigned I = 0; I < NumScalars; ++I) {
20071 Value *Scalar = E->Scalars[PrevMask[I]];
20072 unsigned InsertIdx = *getElementIndex(Scalar);
20073 IsIdentity &= InsertIdx - Offset == I;
20074 Mask[InsertIdx - Offset] = I;
20075 }
20076 if (!IsIdentity || NumElts != NumScalars) {
20077 Value *V2 = nullptr;
20078 bool IsVNonPoisonous =
20079 isGuaranteedNotToBePoison(V, AC);
20080 SmallVector<int> InsertMask(Mask);
20081 if (NumElts != NumScalars && Offset == 0) {
20082 // Follow all insert element instructions from the current buildvector
20083 // sequence.
20084 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
20085 do {
20086 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
20087 if (!InsertIdx)
20088 break;
20089 if (InsertMask[*InsertIdx] == PoisonMaskElem)
20090 InsertMask[*InsertIdx] = *InsertIdx;
20091 if (!Ins->hasOneUse())
20092 break;
20093 Ins = dyn_cast_or_null<InsertElementInst>(
20094 Ins->getUniqueUndroppedUser());
20095 } while (Ins);
20096 SmallBitVector UseMask =
20097 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20098 SmallBitVector IsFirstPoison =
20099 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20100 SmallBitVector IsFirstUndef =
20101 isUndefVector(FirstInsert->getOperand(0), UseMask);
20102 if (!IsFirstPoison.all()) {
20103 unsigned Idx = 0;
20104 for (unsigned I = 0; I < NumElts; I++) {
20105 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
20106 IsFirstUndef.test(I)) {
20107 if (IsVNonPoisonous) {
20108 InsertMask[I] = I < NumScalars ? I : 0;
20109 continue;
20110 }
20111 if (!V2)
20112 V2 = UndefValue::get(V->getType());
20113 if (Idx >= NumScalars)
20114 Idx = NumScalars - 1;
20115 InsertMask[I] = NumScalars + Idx;
20116 ++Idx;
20117 } else if (InsertMask[I] != PoisonMaskElem &&
20118 Mask[I] == PoisonMaskElem) {
20119 InsertMask[I] = PoisonMaskElem;
20120 }
20121 }
20122 } else {
20123 InsertMask = Mask;
20124 }
20125 }
20126 if (!V2)
20127 V2 = PoisonValue::get(V->getType());
20128 V = Builder.CreateShuffleVector(V, V2, InsertMask);
20129 if (auto *I = dyn_cast<Instruction>(V)) {
20130 GatherShuffleExtractSeq.insert(I);
20131 CSEBlocks.insert(I->getParent());
20132 }
20133 }
20134
20135 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
20136 for (unsigned I = 0; I < NumElts; I++) {
20137 if (Mask[I] != PoisonMaskElem)
20138 InsertMask[Offset + I] = I;
20139 }
20140 SmallBitVector UseMask =
20141 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20142 SmallBitVector IsFirstUndef =
20143 isUndefVector(FirstInsert->getOperand(0), UseMask);
20144 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20145 NumElts != NumScalars) {
20146 if (IsFirstUndef.all()) {
20147 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
20148 SmallBitVector IsFirstPoison =
20149 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20150 if (!IsFirstPoison.all()) {
20151 for (unsigned I = 0; I < NumElts; I++) {
20152 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
20153 InsertMask[I] = I + NumElts;
20154 }
20155 }
20156 V = Builder.CreateShuffleVector(
20157 V,
20158 IsFirstPoison.all() ? PoisonValue::get(V->getType())
20159 : FirstInsert->getOperand(0),
20160 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
20161 if (auto *I = dyn_cast<Instruction>(V)) {
20162 GatherShuffleExtractSeq.insert(I);
20163 CSEBlocks.insert(I->getParent());
20164 }
20165 }
20166 } else {
20167 SmallBitVector IsFirstPoison =
20168 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20169 for (unsigned I = 0; I < NumElts; I++) {
20170 if (InsertMask[I] == PoisonMaskElem)
20171 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
20172 else
20173 InsertMask[I] += NumElts;
20174 }
20175 V = Builder.CreateShuffleVector(
20176 FirstInsert->getOperand(0), V, InsertMask,
20177 cast<Instruction>(E->Scalars.back())->getName());
20178 if (auto *I = dyn_cast<Instruction>(V)) {
20179 GatherShuffleExtractSeq.insert(I);
20180 CSEBlocks.insert(I->getParent());
20181 }
20182 }
20183 }
20184
20185 ++NumVectorInstructions;
20186 E->VectorizedValue = V;
20187 return V;
20188 }
20189 case Instruction::ZExt:
20190 case Instruction::SExt:
20191 case Instruction::FPToUI:
20192 case Instruction::FPToSI:
20193 case Instruction::FPExt:
20194 case Instruction::PtrToInt:
20195 case Instruction::IntToPtr:
20196 case Instruction::SIToFP:
20197 case Instruction::UIToFP:
20198 case Instruction::Trunc:
20199 case Instruction::FPTrunc:
20200 case Instruction::BitCast: {
20201 setInsertPointAfterBundle(E);
20202
20203 Value *InVec = vectorizeOperand(E, 0);
20204
20205 auto *CI = cast<CastInst>(VL0);
20206 Instruction::CastOps VecOpcode = CI->getOpcode();
20207 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
20208 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
20209 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
20210 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20211 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
20212 // Check if the values are candidates to demote.
20213 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
20214 if (SrcIt != MinBWs.end())
20215 SrcBWSz = SrcIt->second.first;
20216 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
20217 if (BWSz == SrcBWSz) {
20218 VecOpcode = Instruction::BitCast;
20219 } else if (BWSz < SrcBWSz) {
20220 VecOpcode = Instruction::Trunc;
20221 } else if (It != MinBWs.end()) {
20222 assert(BWSz > SrcBWSz && "Invalid cast!");
20223 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20224 } else if (SrcIt != MinBWs.end()) {
20225 assert(BWSz > SrcBWSz && "Invalid cast!");
20226 VecOpcode =
20227 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20228 }
20229 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20230 !SrcIt->second.second) {
20231 VecOpcode = Instruction::UIToFP;
20232 }
20233 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20234 ? InVec
20235 : Builder.CreateCast(VecOpcode, InVec, VecTy);
20236 V = FinalShuffle(V, E);
20237
20238 E->VectorizedValue = V;
20239 ++NumVectorInstructions;
20240 return V;
20241 }
20242 case Instruction::FCmp:
20243 case Instruction::ICmp: {
20244 setInsertPointAfterBundle(E);
20245
20246 Value *L = vectorizeOperand(E, 0);
20247 Value *R = vectorizeOperand(E, 1);
20248 if (L->getType() != R->getType()) {
20249 assert((getOperandEntry(E, 0)->isGather() ||
20250 getOperandEntry(E, 1)->isGather() ||
20251 MinBWs.contains(getOperandEntry(E, 0)) ||
20252 MinBWs.contains(getOperandEntry(E, 1))) &&
20253 "Expected item in MinBWs.");
20254 if (cast<VectorType>(L->getType())
20255 ->getElementType()
20256 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
20257 ->getElementType()
20258 ->getIntegerBitWidth()) {
20259 Type *CastTy = R->getType();
20260 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
20261 } else {
20262 Type *CastTy = L->getType();
20263 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
20264 }
20265 }
20266
20267 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
20268 Value *V = Builder.CreateCmp(P0, L, R);
20269 propagateIRFlags(V, E->Scalars, VL0);
20270 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
20271 ICmp->setSameSign(/*B=*/false);
20272 // Do not cast for cmps.
20273 VecTy = cast<FixedVectorType>(V->getType());
20274 V = FinalShuffle(V, E);
20275
20276 E->VectorizedValue = V;
20277 ++NumVectorInstructions;
20278 return V;
20279 }
20280 case Instruction::Select: {
20281 setInsertPointAfterBundle(E);
20282
20283 Value *Cond = vectorizeOperand(E, 0);
20284 Value *True = vectorizeOperand(E, 1);
20285 Value *False = vectorizeOperand(E, 2);
20286 if (True->getType() != VecTy || False->getType() != VecTy) {
20287 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
20288 getOperandEntry(E, 2)->isGather() ||
20289 MinBWs.contains(getOperandEntry(E, 1)) ||
20290 MinBWs.contains(getOperandEntry(E, 2))) &&
20291 "Expected item in MinBWs.");
20292 if (True->getType() != VecTy)
20293 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
20294 if (False->getType() != VecTy)
20295 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
20296 }
20297
20298 unsigned CondNumElements = getNumElements(Cond->getType());
20299 unsigned TrueNumElements = getNumElements(True->getType());
20300 assert(TrueNumElements >= CondNumElements &&
20301 TrueNumElements % CondNumElements == 0 &&
20302 "Cannot vectorize Instruction::Select");
20303 assert(TrueNumElements == getNumElements(False->getType()) &&
20304 "Cannot vectorize Instruction::Select");
20305 if (CondNumElements != TrueNumElements) {
20306 // When the return type is i1 but the source is a fixed vector type, we
20307 // need to duplicate the condition value.
20308 Cond = Builder.CreateShuffleVector(
20309 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
20310 CondNumElements));
20311 }
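// Example (hypothetical REVEC types): with a <2 x i1> condition and
// <4 x float> true/false operands, the replicated mask is <0,0,1,1>, so
// each condition bit governs one original scalar lane group.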
20312 assert(getNumElements(Cond->getType()) == TrueNumElements &&
20313 "Cannot vectorize Instruction::Select");
20314 Value *V =
20315 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
20316 V = FinalShuffle(V, E);
20317
20318 E->VectorizedValue = V;
20319 ++NumVectorInstructions;
20320 return V;
20321 }
20322 case Instruction::FNeg: {
20323 setInsertPointAfterBundle(E);
20324
20325 Value *Op = vectorizeOperand(E, 0);
20326
20327 Value *V = Builder.CreateUnOp(
20328 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
20329 propagateIRFlags(V, E->Scalars, VL0);
20330 if (auto *I = dyn_cast<Instruction>(V))
20331 V = ::propagateMetadata(I, E->Scalars);
20332
20333 V = FinalShuffle(V, E);
20334
20335 E->VectorizedValue = V;
20336 ++NumVectorInstructions;
20337
20338 return V;
20339 }
20340 case Instruction::Freeze: {
20341 setInsertPointAfterBundle(E);
20342
20343 Value *Op = vectorizeOperand(E, 0);
20344
20345 if (Op->getType() != VecTy) {
20346 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20347 MinBWs.contains(getOperandEntry(E, 0))) &&
20348 "Expected item in MinBWs.");
20349 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
20350 }
20351 Value *V = Builder.CreateFreeze(Op);
20352 V = FinalShuffle(V, E);
20353
20354 E->VectorizedValue = V;
20355 ++NumVectorInstructions;
20356
20357 return V;
20358 }
20359 case Instruction::Add:
20360 case Instruction::FAdd:
20361 case Instruction::Sub:
20362 case Instruction::FSub:
20363 case Instruction::Mul:
20364 case Instruction::FMul:
20365 case Instruction::UDiv:
20366 case Instruction::SDiv:
20367 case Instruction::FDiv:
20368 case Instruction::URem:
20369 case Instruction::SRem:
20370 case Instruction::FRem:
20371 case Instruction::Shl:
20372 case Instruction::LShr:
20373 case Instruction::AShr:
20374 case Instruction::And:
20375 case Instruction::Or:
20376 case Instruction::Xor: {
20377 setInsertPointAfterBundle(E);
20378
20379 Value *LHS = vectorizeOperand(E, 0);
20380 Value *RHS = vectorizeOperand(E, 1);
20381 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20382 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
20383 ArrayRef<Value *> Ops = E->getOperand(I);
20384 if (all_of(Ops, [&](Value *Op) {
20385 auto *CI = dyn_cast<ConstantInt>(Op);
20386 return CI && CI->getValue().countr_one() >= It->second.first;
20387 })) {
20388 V = FinalShuffle(I == 0 ? RHS : LHS, E);
20389 E->VectorizedValue = V;
20390 ++NumVectorInstructions;
20391 return V;
20392 }
20393 }
20394 }
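// The loop above handles a demoted 'and' whose constant operand has at
// least MinBW trailing ones: e.g. when demoting to i8, 'and x, 255' is an
// identity on the narrowed value, so the other operand is returned directly
// after the final shuffle.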
20395 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
20396 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20397 getOperandEntry(E, 1)->isGather() ||
20398 MinBWs.contains(getOperandEntry(E, 0)) ||
20399 MinBWs.contains(getOperandEntry(E, 1))) &&
20400 "Expected item in MinBWs.");
20401 if (LHS->getType() != VecTy)
20402 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
20403 if (RHS->getType() != VecTy)
20404 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
20405 }
20406
20407 Value *V = Builder.CreateBinOp(
20408 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
20409 RHS);
20410 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
20411 if (auto *I = dyn_cast<Instruction>(V)) {
20412 V = ::propagateMetadata(I, E->Scalars);
20413 // Drop nuw flags for abs(sub(commutative), true).
20414 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
20415 any_of(E->Scalars, [E](Value *V) {
20416 return isa<PoisonValue>(V) ||
20417 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20418 isCommutative(cast<Instruction>(V));
20419 }))
20420 I->setHasNoUnsignedWrap(/*b=*/false);
20421 }
20422
20423 V = FinalShuffle(V, E);
20424
20425 E->VectorizedValue = V;
20426 ++NumVectorInstructions;
20427
20428 return V;
20429 }
20430 case Instruction::Load: {
20431 // Loads are inserted at the head of the tree because we don't want to
20432 // sink them all the way down past store instructions.
20433 setInsertPointAfterBundle(E);
20434
20435 LoadInst *LI = cast<LoadInst>(VL0);
20436 Instruction *NewLI;
20437 FixedVectorType *StridedLoadTy = nullptr;
20438 Value *PO = LI->getPointerOperand();
20439 if (E->State == TreeEntry::Vectorize) {
20440 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
20441 } else if (E->State == TreeEntry::CompressVectorize) {
20442 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
20443 CompressEntryToData.at(E);
20444 Align CommonAlignment = LI->getAlign();
20445 if (IsMasked) {
20446 unsigned VF = getNumElements(LoadVecTy);
20447 SmallVector<Constant *> MaskValues(
20448 VF / getNumElements(LI->getType()),
20449 ConstantInt::getFalse(VecTy->getContext()));
20450 for (int I : CompressMask)
20451 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
20452 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20453 assert(SLPReVec && "Only supported by REVEC.");
20454 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
20455 }
20456 Constant *MaskValue = ConstantVector::get(MaskValues);
20457 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
20458 MaskValue);
20459 } else {
20460 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
20461 }
20462 NewLI = ::propagateMetadata(NewLI, E->Scalars);
20463 // TODO: include this cost into CommonCost.
20464 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20465 assert(SLPReVec && "FixedVectorType is not expected.");
20466 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
20467 CompressMask);
20468 }
20469 NewLI =
20470 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
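// Compressed-load illustration (hypothetical layout): if the node needs
// lanes at offsets {0, 2, 3, 5} of a 6-element region, the code above loads
// the whole region (optionally as a masked load) and then applies
// CompressMask, e.g. <0, 2, 3, 5>, to pack the used lanes together.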
20471 } else if (E->State == TreeEntry::StridedVectorize) {
20472 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
20473 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
20474 PO = IsReverseOrder ? PtrN : Ptr0;
20475 Type *StrideTy = DL->getIndexType(PO->getType());
20476 Value *StrideVal;
20477 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
20478 StridedLoadTy = SPtrInfo.Ty;
20479 assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry.");
20480 unsigned StridedLoadEC =
20481 StridedLoadTy->getElementCount().getKnownMinValue();
20482
20483 Value *Stride = SPtrInfo.StrideVal;
20484 if (!Stride) {
20485 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
20486 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
20487 SCEVExpander Expander(*SE, "strided-load-vec");
20488 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
20489 &*Builder.GetInsertPoint());
20490 }
20491 Value *NewStride =
20492 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
20493 StrideVal = Builder.CreateMul(
20494 NewStride, ConstantInt::getSigned(
20495 StrideTy, (IsReverseOrder ? -1 : 1) *
20496 static_cast<int>(
20497 DL->getTypeAllocSize(ScalarTy))));
20498 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20499 auto *Inst = Builder.CreateIntrinsic(
20500 Intrinsic::experimental_vp_strided_load,
20501 {StridedLoadTy, PO->getType(), StrideTy},
20502 {PO, StrideVal,
20503 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
20504 Builder.getInt32(StridedLoadEC)});
20505 Inst->addParamAttr(
20506 /*ArgNo=*/0,
20507 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20508 NewLI = Inst;
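// Illustrative IR for the strided case (hypothetical types and operands):
//   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr align 4 %base, i64 %stride.bytes,
//            <4 x i1> splat (i1 true), i32 4)
// where %stride.bytes is the element stride scaled by the type size and
// negated when the scalars are in reverse order.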
20509 } else {
20510 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
20511 Value *VecPtr = vectorizeOperand(E, 0);
20512 if (isa<FixedVectorType>(ScalarTy)) {
20513 assert(SLPReVec && "FixedVectorType is not expected.");
20514 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
20515 // need to expand VecPtr if ScalarTy is a vector type.
20516 unsigned ScalarTyNumElements =
20517 cast<FixedVectorType>(ScalarTy)->getNumElements();
20518 unsigned VecTyNumElements =
20519 cast<FixedVectorType>(VecTy)->getNumElements();
20520 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
20521 "Cannot expand getelementptr.");
20522 unsigned VF = VecTyNumElements / ScalarTyNumElements;
20523 SmallVector<Constant *> Indices(VecTyNumElements);
20524 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
20525 return Builder.getInt64(I % ScalarTyNumElements);
20526 });
20527 VecPtr = Builder.CreateGEP(
20528 VecTy->getElementType(),
20529 Builder.CreateShuffleVector(
20530 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
20531 ConstantVector::get(Indices));
20532 }
20533 // Use the minimum alignment of the gathered loads.
20534 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20535 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
20536 }
20537 Value *V = E->State == TreeEntry::CompressVectorize
20538 ? NewLI
20539 : ::propagateMetadata(NewLI, E->Scalars);
20540
20541 if (StridedLoadTy != VecTy)
20542 V = Builder.CreateBitOrPointerCast(V, VecTy);
20543 V = FinalShuffle(V, E);
20544 E->VectorizedValue = V;
20545 ++NumVectorInstructions;
20546 return V;
20547 }
20548 case Instruction::Store: {
20549 auto *SI = cast<StoreInst>(VL0);
20550
20551 setInsertPointAfterBundle(E);
20552
20553 Value *VecValue = vectorizeOperand(E, 0);
20554 if (VecValue->getType() != VecTy)
20555 VecValue =
20556 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
20557 VecValue = FinalShuffle(VecValue, E);
20558
20559 Value *Ptr = SI->getPointerOperand();
20560 Instruction *ST;
20561 if (E->State == TreeEntry::Vectorize) {
20562 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
20563 } else {
20564 assert(E->State == TreeEntry::StridedVectorize &&
20565 "Expected either strided or consecutive stores.");
20566 if (!E->ReorderIndices.empty()) {
20567 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
20568 Ptr = SI->getPointerOperand();
20569 }
20570 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
20571 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
20572 auto *Inst = Builder.CreateIntrinsic(
20573 Intrinsic::experimental_vp_strided_store,
20574 {VecTy, Ptr->getType(), StrideTy},
20575 {VecValue, Ptr,
20576 ConstantInt::getSigned(
20577 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20578 Builder.getAllOnesMask(VecTy->getElementCount()),
20579 Builder.getInt32(E->Scalars.size())});
20580 Inst->addParamAttr(
20581 /*ArgNo=*/1,
20582 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20583 ST = Inst;
20584 }
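// Illustrative IR for the strided-store case (hypothetical i32 elements, so
// the stride operand is -4 bytes):
//   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
//            <4 x i32> %val, ptr align 4 %ptr, i64 -4,
//            <4 x i1> splat (i1 true), i32 4)
// with the base pointer adjusted via ReorderIndices when the bundle is
// reordered.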
20585
20586 Value *V = ::propagateMetadata(ST, E->Scalars);
20587
20588 E->VectorizedValue = V;
20589 ++NumVectorInstructions;
20590 return V;
20591 }
20592 case Instruction::GetElementPtr: {
20593 auto *GEP0 = cast<GetElementPtrInst>(VL0);
20594 setInsertPointAfterBundle(E);
20595
20596 Value *Op0 = vectorizeOperand(E, 0);
20597
20598 SmallVector<Value *> OpVecs;
20599 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20600 Value *OpVec = vectorizeOperand(E, J);
20601 OpVecs.push_back(OpVec);
20602 }
20603
20604 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20605 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
20606 SmallVector<Value *> GEPs;
20607 for (Value *V : E->Scalars) {
20608 if (isa<GetElementPtrInst>(V))
20609 GEPs.push_back(V);
20610 }
20611 V = ::propagateMetadata(I, GEPs);
20612 }
20613
20614 V = FinalShuffle(V, E);
20615
20616 E->VectorizedValue = V;
20617 ++NumVectorInstructions;
20618
20619 return V;
20620 }
20621 case Instruction::Call: {
20622 CallInst *CI = cast<CallInst>(VL0);
20623 setInsertPointAfterBundle(E);
20624
20625 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20626
20627 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20628 CI, ID, VecTy->getNumElements(),
20629 It != MinBWs.end() ? It->second.first : 0, TTI);
20630 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
20631 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20632 VecCallCosts.first <= VecCallCosts.second;
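// The intrinsic form is only used when it is at least as cheap as a
// matching vector library call (looked up through VFDatabase below);
// otherwise the call is emitted against the library variant, e.g. a sin
// call could map to a target math-library vector routine if one is
// registered.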
20633
20634 Value *ScalarArg = nullptr;
20635 SmallVector<Value *> OpVecs;
20636 SmallVector<Type *, 2> TysForDecl;
20637 // Add return type if intrinsic is overloaded on it.
20638 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
20639 TysForDecl.push_back(VecTy);
20640 auto *CEI = cast<CallInst>(VL0);
20641 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
20642 // Some intrinsics have scalar arguments. This argument should not be
20643 // vectorized.
20644 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
20645 ScalarArg = CEI->getArgOperand(I);
20646 // If we decided to reduce the bitwidth of the abs intrinsic, its second
20647 // argument must be set to false (do not return poison if value is signed min).
20648 if (ID == Intrinsic::abs && It != MinBWs.end() &&
20649 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
20650 ScalarArg = Builder.getFalse();
20651 OpVecs.push_back(ScalarArg);
20652 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20653 TysForDecl.push_back(ScalarArg->getType());
20654 continue;
20655 }
20656
20657 Value *OpVec = vectorizeOperand(E, I);
20658 ScalarArg = CEI->getArgOperand(I);
20659 if (cast<VectorType>(OpVec->getType())->getElementType() !=
20660 ScalarArg->getType()->getScalarType() &&
20661 It == MinBWs.end()) {
20662 auto *CastTy =
20663 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
20664 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
20665 } else if (It != MinBWs.end()) {
20666 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
20667 }
20668 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
20669 OpVecs.push_back(OpVec);
20670 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20671 TysForDecl.push_back(OpVec->getType());
20672 }
20673
20674 Function *CF;
20675 if (!UseIntrinsic) {
20676 VFShape Shape =
20677 VFShape::get(CI->getFunctionType(),
20678 ElementCount::getFixed(VecTy->getNumElements()),
20679 false /*HasGlobalPred*/);
20680 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
20681 } else {
20682 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
20683 }
20684
20685 SmallVector<OperandBundleDef, 1> OpBundles;
20686 CI->getOperandBundlesAsDefs(OpBundles);
20687 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
20688
20689 propagateIRFlags(V, E->Scalars, VL0);
20690 V = FinalShuffle(V, E);
20691
20692 E->VectorizedValue = V;
20693 ++NumVectorInstructions;
20694 return V;
20695 }
20696 case Instruction::ShuffleVector: {
20697 Value *V;
20698 if (SLPReVec && !E->isAltShuffle()) {
20699 setInsertPointAfterBundle(E);
20700 Value *Src = vectorizeOperand(E, 0);
20701 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
20702 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
20703 SmallVector<int> NewMask(ThisMask.size());
20704 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
20705 return SVSrc->getShuffleMask()[Mask];
20706 });
20707 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
20708 SVSrc->getOperand(1), NewMask);
20709 } else {
20710 V = Builder.CreateShuffleVector(Src, ThisMask);
20711 }
20712 propagateIRFlags(V, E->Scalars, VL0);
20713 if (auto *I = dyn_cast<Instruction>(V))
20714 V = ::propagateMetadata(I, E->Scalars);
20715 V = FinalShuffle(V, E);
20716 } else {
20717 assert(E->isAltShuffle() &&
20718 ((Instruction::isBinaryOp(E->getOpcode()) &&
20719 Instruction::isBinaryOp(E->getAltOpcode())) ||
20720 (Instruction::isCast(E->getOpcode()) &&
20721 Instruction::isCast(E->getAltOpcode())) ||
20722 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
20723 "Invalid Shuffle Vector Operand");
20724
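// For alternate-opcode nodes both the main and the alternate operation are
// emitted over the whole vector and then blended with a shuffle whose mask is
// built by buildAltOpShuffleMask below, e.g. an <add, sub, add, sub> node
// becomes (a sketch, assuming 4 x i32 scalars):
//   %v0 = add <4 x i32> %lhs, %rhs
//   %v1 = sub <4 x i32> %lhs, %rhs
//   %v  = shufflevector <4 x i32> %v0, <4 x i32> %v1,
//                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>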
20725 Value *LHS = nullptr, *RHS = nullptr;
20726 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
20727 setInsertPointAfterBundle(E);
20728 LHS = vectorizeOperand(E, 0);
20729 RHS = vectorizeOperand(E, 1);
20730 } else {
20731 setInsertPointAfterBundle(E);
20732 LHS = vectorizeOperand(E, 0);
20733 }
20734 if (LHS && RHS &&
20735 ((Instruction::isBinaryOp(E->getOpcode()) &&
20736 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
20737 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
20738 assert((It != MinBWs.end() ||
20739 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
20740 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
20741 MinBWs.contains(getOperandEntry(E, 0)) ||
20742 MinBWs.contains(getOperandEntry(E, 1))) &&
20743 "Expected item in MinBWs.");
20744 Type *CastTy = VecTy;
20745 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20746 if (cast<VectorType>(LHS->getType())
20747 ->getElementType()
20748 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20749 ->getElementType()
20750 ->getIntegerBitWidth())
20751 CastTy = RHS->getType();
20752 else
20753 CastTy = LHS->getType();
20754 }
20755 if (LHS->getType() != CastTy)
20756 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20757 if (RHS->getType() != CastTy)
20758 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20759 }
20760
20761 Value *V0, *V1;
20762 if (Instruction::isBinaryOp(E->getOpcode())) {
20763 V0 = Builder.CreateBinOp(
20764 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20765 V1 = Builder.CreateBinOp(
20766 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20767 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20768 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20769 auto *AltCI = cast<CmpInst>(E->getAltOp());
20770 CmpInst::Predicate AltPred = AltCI->getPredicate();
20771 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20772 } else {
20773 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20774 unsigned SrcBWSz = DL->getTypeSizeInBits(
20775 cast<VectorType>(LHS->getType())->getElementType());
20776 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20777 if (BWSz <= SrcBWSz) {
20778 if (BWSz < SrcBWSz)
20779 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20780 assert(LHS->getType() == VecTy &&
20781 "Expected same type as operand.");
20782 if (auto *I = dyn_cast<Instruction>(LHS))
20783 LHS = ::propagateMetadata(I, E->Scalars);
20784 LHS = FinalShuffle(LHS, E);
20785 E->VectorizedValue = LHS;
20786 ++NumVectorInstructions;
20787 return LHS;
20788 }
20789 }
20790 V0 = Builder.CreateCast(
20791 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20792 V1 = Builder.CreateCast(
20793 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20794 }
20795 // Add V0 and V1 to later analysis to try to find and remove matching
20796 // instruction, if any.
20797 for (Value *V : {V0, V1}) {
20798 if (auto *I = dyn_cast<Instruction>(V)) {
20799 GatherShuffleExtractSeq.insert(I);
20800 CSEBlocks.insert(I->getParent());
20801 }
20802 }
20803
20804 // Create shuffle to take alternate operations from the vector.
20805 // Also, gather up main and alt scalar ops to propagate IR flags to
20806 // each vector operation.
20807 ValueList OpScalars, AltScalars;
20808 SmallVector<int> Mask;
20809 E->buildAltOpShuffleMask(
20810 [E, this](Instruction *I) {
20811 assert(E->getMatchingMainOpOrAltOp(I) &&
20812 "Unexpected main/alternate opcode");
20813 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20814 *TLI);
20815 },
20816 Mask, &OpScalars, &AltScalars);
20817
20818 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20819 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20820 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20821 // Drop nuw flags for abs(sub(commutative), true).
20822 if (auto *I = dyn_cast<Instruction>(Vec);
20823 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20824 any_of(E->Scalars, [E](Value *V) {
20825 if (isa<PoisonValue>(V))
20826 return false;
20827 if (E->hasCopyableElements() && E->isCopyableElement(V))
20828 return false;
20829 auto *IV = cast<Instruction>(V);
20830 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20831 }))
20832 I->setHasNoUnsignedWrap(/*b=*/false);
20833 };
20834 DropNuwFlag(V0, E->getOpcode());
20835 DropNuwFlag(V1, E->getAltOpcode());
20836
20837 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20838 assert(SLPReVec && "FixedVectorType is not expected.");
20839 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20840 }
20841 V = Builder.CreateShuffleVector(V0, V1, Mask);
20842 if (auto *I = dyn_cast<Instruction>(V)) {
20843 V = ::propagateMetadata(I, E->Scalars);
20844 GatherShuffleExtractSeq.insert(I);
20845 CSEBlocks.insert(I->getParent());
20846 }
20847 }
20848
20849 E->VectorizedValue = V;
20850 ++NumVectorInstructions;
20851
20852 return V;
20853 }
20854 default:
20855 llvm_unreachable("unknown inst");
20856 }
20857 return nullptr;
20858}
20859
20860 Value *BoUpSLP::vectorizeTree() {
20861 ExtraValueToDebugLocsMap ExternallyUsedValues;
20862 return vectorizeTree(ExternallyUsedValues);
20863}
20864
20865 Value *BoUpSLP::vectorizeTree(
20866 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20867 Instruction *ReductionRoot,
20868 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20869 // Clean the Entry-to-LastInstruction table. It can be affected after
20870 // scheduling, so it needs to be rebuilt.
20871 EntryToLastInstruction.clear();
20872 // All blocks must be scheduled before any instructions are inserted.
20873 for (auto &BSIter : BlocksSchedules)
20874 scheduleBlock(*this, BSIter.second.get());
20875 // Cache last instructions for the nodes to avoid side effects, which may
20876 // appear during vectorization, like extra uses, etc.
20877 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20878 if (TE->isGather() || DeletedNodes.contains(TE.get()))
20879 continue;
20880 (void)getLastInstructionInBundle(TE.get());
20881 }
20882
20883 if (ReductionRoot)
20884 Builder.SetInsertPoint(ReductionRoot->getParent(),
20885 ReductionRoot->getIterator());
20886 else
20887 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20888
20889 // Vectorize gather operands of the nodes with the external uses only.
20890 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20891 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20892 if (DeletedNodes.contains(TE.get()))
20893 continue;
20894 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20895 TE->UserTreeIndex.UserTE->hasState() &&
20896 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20897 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20898 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20899 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20900 all_of(TE->UserTreeIndex.UserTE->Scalars,
20901 [](Value *V) { return isUsedOutsideBlock(V); })) {
20902 Instruction &LastInst =
20903 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20904 GatherEntries.emplace_back(TE.get(), &LastInst);
20905 }
20906 }
20907 for (auto &Entry : GatherEntries) {
20908 IRBuilderBase::InsertPointGuard Guard(Builder);
20909 Builder.SetInsertPoint(Entry.second);
20910 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20911 (void)vectorizeTree(Entry.first);
20912 }
20913 // Emit gathered loads first to emit better code for the users of those
20914 // gathered loads.
20915 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20916 if (DeletedNodes.contains(TE.get()))
20917 continue;
20918 if (GatheredLoadsEntriesFirst.has_value() &&
20919 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20920 (!TE->isGather() || TE->UserTreeIndex)) {
20921 assert((TE->UserTreeIndex ||
20922 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20923 "Expected gathered load node.");
20924 (void)vectorizeTree(TE.get());
20925 }
20926 }
20927 (void)vectorizeTree(VectorizableTree[0].get());
20928 // Run through the list of postponed gathers and emit them, replacing the temp
20929 // emitted allocas with actual vector instructions.
20930 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20931 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20932 for (const TreeEntry *E : PostponedNodes) {
20933 auto *TE = const_cast<TreeEntry *>(E);
20934 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20935 TE->VectorizedValue = nullptr;
20936 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20937 // If the user is a PHI node, its vector code has to be inserted right before
20938 // the block terminator. Since the node was delayed, there were some unresolved
20939 // dependencies at the moment the stub instruction was emitted. If any of
20940 // these dependencies turn out to be an operand of another PHI coming from
20941 // this same block, the position of the stub instruction becomes invalid:
20942 // the source vector that is supposed to feed this gather node was inserted
20943 // at the end of the block [after the stub instruction]. So we need to
20944 // adjust the insertion point again, to the end of the block.
20945 if (isa<PHINode>(UserI) ||
20946 (TE->UserTreeIndex.UserTE->hasState() &&
20947 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20948 // Insert before all users.
20949 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20950 for (User *U : PrevVec->users()) {
20951 if (U == UserI)
20952 continue;
20953 auto *UI = dyn_cast<Instruction>(U);
20954 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20955 continue;
20956 if (UI->comesBefore(InsertPt))
20957 InsertPt = UI;
20958 }
20959 Builder.SetInsertPoint(InsertPt);
20960 } else {
20961 Builder.SetInsertPoint(PrevVec);
20962 }
20963 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20964 Value *Vec = vectorizeTree(TE);
20965 if (auto *VecI = dyn_cast<Instruction>(Vec);
20966 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20967 Builder.GetInsertPoint()->comesBefore(VecI))
20968 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20969 Builder.GetInsertPoint());
20970 if (Vec->getType() != PrevVec->getType()) {
20971 assert(Vec->getType()->isIntOrIntVectorTy() &&
20972 PrevVec->getType()->isIntOrIntVectorTy() &&
20973 "Expected integer vector types only.");
20974 std::optional<bool> IsSigned;
20975 for (Value *V : TE->Scalars) {
20976 if (isVectorized(V)) {
20977 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20978 auto It = MinBWs.find(MNTE);
20979 if (It != MinBWs.end()) {
20980 IsSigned = IsSigned.value_or(false) || It->second.second;
20981 if (*IsSigned)
20982 break;
20983 }
20984 }
20985 if (IsSigned.value_or(false))
20986 break;
20987 // Scan through gather nodes.
20988 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20989 auto It = MinBWs.find(BVE);
20990 if (It != MinBWs.end()) {
20991 IsSigned = IsSigned.value_or(false) || It->second.second;
20992 if (*IsSigned)
20993 break;
20994 }
20995 }
20996 if (IsSigned.value_or(false))
20997 break;
20998 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20999 IsSigned =
21000 IsSigned.value_or(false) ||
21001 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
21002 continue;
21003 }
21004 if (IsSigned.value_or(false))
21005 break;
21006 }
21007 }
21008 if (IsSigned.value_or(false)) {
21009 // Final attempt - check user node.
21010 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
21011 if (It != MinBWs.end())
21012 IsSigned = It->second.second;
21013 }
21014 assert(IsSigned &&
21015 "Expected user node or perfect diamond match in MinBWs.");
21016 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
21017 }
21018 PrevVec->replaceAllUsesWith(Vec);
21019 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
21020 // Replace the stub vector node, if it was already used for one of the
21021 // buildvector nodes.
21022 auto It = PostponedValues.find(PrevVec);
21023 if (It != PostponedValues.end()) {
21024 for (TreeEntry *VTE : It->getSecond())
21025 VTE->VectorizedValue = Vec;
21026 }
21027 eraseInstruction(PrevVec);
21028 }
21029
21030 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
21031 << " values .\n");
21032
21033 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
21034 // Maps vector instruction to original insertelement instruction
21035 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
21036 // Maps extract Scalar to the corresponding extractelement instruction in the
21037 // basic block. Only one extractelement per block should be emitted.
21038 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
21039 ScalarToEEs;
21040 SmallDenseSet<Value *, 4> UsedInserts;
21041 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
21042 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
21043 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
21044 // Extract all of the elements with the external uses.
21045 for (const auto &ExternalUse : ExternalUses) {
21046 Value *Scalar = ExternalUse.Scalar;
21047 llvm::User *User = ExternalUse.User;
21048
21049 // Skip users that we already RAUWed. This happens when one instruction
21050 // has multiple uses of the same value.
21051 if (User && !is_contained(Scalar->users(), User))
21052 continue;
21053 const TreeEntry *E = &ExternalUse.E;
21054 assert(E && "Invalid scalar");
21055 assert(!E->isGather() && "Extracting from a gather list");
21056 // Non-instruction pointers are not deleted, just skip them.
21057 if (E->getOpcode() == Instruction::GetElementPtr &&
21058 !isa<GetElementPtrInst>(Scalar))
21059 continue;
21060
21061 Value *Vec = E->VectorizedValue;
21062 assert(Vec && "Can't find vectorizable value");
21063
21064 Value *Lane = Builder.getInt32(ExternalUse.Lane);
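// ExtractAndExtendIfNeeded emits (or reuses - at most one per basic block) an
// extractelement for the requested lane and sign/zero-extends the result back
// to the original scalar type when the vector was narrowed by the
// minimum-bitwidth analysis.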
21065 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21066 if (Scalar->getType() != Vec->getType()) {
21067 Value *Ex = nullptr;
21068 Value *ExV = nullptr;
21069 auto *Inst = dyn_cast<Instruction>(Scalar);
21070 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
21071 auto It = ScalarToEEs.find(Scalar);
21072 if (It != ScalarToEEs.end()) {
21073 // No need to emit many extracts, just move the only one in the
21074 // current block.
21075 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
21076 : Builder.GetInsertBlock());
21077 if (EEIt != It->second.end()) {
21078 Value *PrevV = EEIt->second.first;
21079 if (auto *I = dyn_cast<Instruction>(PrevV);
21080 I && !ReplaceInst &&
21081 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21082 Builder.GetInsertPoint()->comesBefore(I)) {
21083 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
21084 Builder.GetInsertPoint());
21085 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
21086 CI->moveAfter(I);
21087 }
21088 Ex = PrevV;
21089 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21090 }
21091 }
21092 if (!Ex) {
21093 // "Reuse" the existing extract to improve final codegen.
21094 if (ReplaceInst) {
21095 // Leave the instruction as is if it is the cheaper extract and all
21096 // of its operands are scalar.
21097 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
21098 IgnoredExtracts.insert(EE);
21099 Ex = EE;
21100 } else {
21101 auto *CloneInst = Inst->clone();
21102 CloneInst->insertBefore(Inst->getIterator());
21103 if (Inst->hasName())
21104 CloneInst->takeName(Inst);
21105 Ex = CloneInst;
21106 }
21107 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
21108 ES && isa<Instruction>(Vec)) {
21109 Value *V = ES->getVectorOperand();
21110 auto *IVec = cast<Instruction>(Vec);
21111 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
21112 V = ETEs.front()->VectorizedValue;
21113 if (auto *IV = dyn_cast<Instruction>(V);
21114 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21115 IV->comesBefore(IVec))
21116 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
21117 else
21118 Ex = Builder.CreateExtractElement(Vec, Lane);
21119 } else if (auto *VecTy =
21120 dyn_cast<FixedVectorType>(Scalar->getType())) {
21121 assert(SLPReVec && "FixedVectorType is not expected.");
21122 unsigned VecTyNumElements = VecTy->getNumElements();
21123 // When REVEC is enabled, we need to extract a vector.
21124 // Note: The element size of Scalar may be different from the
21125 // element size of Vec.
21126 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
21127 ExternalUse.Lane * VecTyNumElements);
21128 } else {
21129 Ex = Builder.CreateExtractElement(Vec, Lane);
21130 }
21131 // If necessary, sign-extend or zero-extend ScalarRoot
21132 // to the larger type.
21133 ExV = Ex;
21134 if (Scalar->getType() != Ex->getType())
21135 ExV = Builder.CreateIntCast(
21136 Ex, Scalar->getType(),
21137 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
21138 auto *I = dyn_cast<Instruction>(Ex);
21139 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
21140 : &F->getEntryBlock(),
21141 std::make_pair(Ex, ExV));
21142 }
21143 // The then-branch of the previous if may produce constants, since
21144 // operand 0 might be a constant.
21145 if (auto *ExI = dyn_cast<Instruction>(Ex);
21146 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
21147 GatherShuffleExtractSeq.insert(ExI);
21148 CSEBlocks.insert(ExI->getParent());
21149 }
21150 return ExV;
21151 }
21152 assert(isa<FixedVectorType>(Scalar->getType()) &&
21153 isa<InsertElementInst>(Scalar) &&
21154 "In-tree scalar of vector type is not insertelement?");
21155 auto *IE = cast<InsertElementInst>(Scalar);
21156 VectorToInsertElement.try_emplace(Vec, IE);
21157 return Vec;
21158 };
21159 // If User == nullptr, the Scalar remains as scalar in vectorized
21160 // instructions or is used as extra arg. Generate ExtractElement instruction
21161 // and update the record for this scalar in ExternallyUsedValues.
21162 if (!User) {
21163 if (!ScalarsWithNullptrUser.insert(Scalar).second)
21164 continue;
21165 assert(
21166 (ExternallyUsedValues.count(Scalar) ||
21167 ExternalUsesWithNonUsers.count(Scalar) ||
21168 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21169 any_of(
21170 Scalar->users(),
21171 [&, TTI = TTI](llvm::User *U) {
21172 if (ExternalUsesAsOriginalScalar.contains(U))
21173 return true;
21174 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21175 return !UseEntries.empty() &&
21176 (E->State == TreeEntry::Vectorize ||
21177 E->State == TreeEntry::StridedVectorize ||
21178 E->State == TreeEntry::CompressVectorize) &&
21179 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21180 return (UseEntry->State == TreeEntry::Vectorize ||
21181 UseEntry->State ==
21182 TreeEntry::StridedVectorize ||
21183 UseEntry->State ==
21184 TreeEntry::CompressVectorize) &&
21185 doesInTreeUserNeedToExtract(
21186 Scalar, getRootEntryInstruction(*UseEntry),
21187 TLI, TTI);
21188 });
21189 })) &&
21190 "Scalar with nullptr User must be registered in "
21191 "ExternallyUsedValues map or remain as scalar in vectorized "
21192 "instructions");
21193 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21194 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
21195 if (PHI->getParent()->isLandingPad())
21196 Builder.SetInsertPoint(
21197 PHI->getParent(),
21198 std::next(
21199 PHI->getParent()->getLandingPadInst()->getIterator()));
21200 else
21201 Builder.SetInsertPoint(PHI->getParent(),
21202 PHI->getParent()->getFirstNonPHIIt());
21203 } else {
21204 Builder.SetInsertPoint(VecI->getParent(),
21205 std::next(VecI->getIterator()));
21206 }
21207 } else {
21208 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21209 }
21210 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21211 // Required to update internally referenced instructions.
21212 if (Scalar != NewInst) {
21213 assert((!isa<ExtractElementInst>(Scalar) ||
21214 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
21215 "Extractelements should not be replaced.");
21216 Scalar->replaceAllUsesWith(NewInst);
21217 }
21218 continue;
21219 }
21220
21221 if (auto *VU = dyn_cast<InsertElementInst>(User);
21222 VU && VU->getOperand(1) == Scalar) {
21223 // Skip if the scalar is another vector op or Vec is not an instruction.
21224 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
21225 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
21226 if (!UsedInserts.insert(VU).second)
21227 continue;
21228 // Need to use original vector, if the root is truncated.
21229 auto BWIt = MinBWs.find(E);
21230 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
21231 auto *ScalarTy = FTy->getElementType();
21232 auto Key = std::make_pair(Vec, ScalarTy);
21233 auto VecIt = VectorCasts.find(Key);
21234 if (VecIt == VectorCasts.end()) {
21235 IRBuilderBase::InsertPointGuard Guard(Builder);
21236 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
21237 if (IVec->getParent()->isLandingPad())
21238 Builder.SetInsertPoint(IVec->getParent(),
21239 std::next(IVec->getParent()
21240 ->getLandingPadInst()
21241 ->getIterator()));
21242 else
21243 Builder.SetInsertPoint(
21244 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
21245 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
21246 Builder.SetInsertPoint(IVec->getNextNode());
21247 }
21248 Vec = Builder.CreateIntCast(
21249 Vec,
21250 getWidenedType(
21251 ScalarTy,
21252 cast<FixedVectorType>(Vec->getType())->getNumElements()),
21253 BWIt->second.second);
21254 VectorCasts.try_emplace(Key, Vec);
21255 } else {
21256 Vec = VecIt->second;
21257 }
21258 }
21259
21260 std::optional<unsigned> InsertIdx = getElementIndex(VU);
21261 if (InsertIdx) {
21262 auto *It = find_if(
21263 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
21264 // Checks if 2 insertelements are from the same buildvector.
21265 InsertElementInst *VecInsert = Data.InsertElements.front();
21266 return areTwoInsertFromSameBuildVector(
21267 VU, VecInsert,
21268 [](InsertElementInst *II) { return II->getOperand(0); });
21269 });
21270 unsigned Idx = *InsertIdx;
21271 if (It == ShuffledInserts.end()) {
21272 (void)ShuffledInserts.emplace_back();
21273 It = std::next(ShuffledInserts.begin(),
21274 ShuffledInserts.size() - 1);
21275 }
21276 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
21277 if (Mask.empty())
21278 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
21279 Mask[Idx] = ExternalUse.Lane;
21280 It->InsertElements.push_back(cast<InsertElementInst>(User));
21281 continue;
21282 }
21283 }
21284 }
21285 }
21286
21287 // Generate extracts for out-of-tree users.
21288 // Find the insertion point for the extractelement lane.
21289 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21290 if (PHINode *PH = dyn_cast<PHINode>(User)) {
21291 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
21292 if (PH->getIncomingValue(I) == Scalar) {
21293 Instruction *IncomingTerminator =
21294 PH->getIncomingBlock(I)->getTerminator();
21295 if (isa<CatchSwitchInst>(IncomingTerminator)) {
21296 Builder.SetInsertPoint(VecI->getParent(),
21297 std::next(VecI->getIterator()));
21298 } else {
21299 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
21300 }
21301 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21302 PH->setOperand(I, NewInst);
21303 }
21304 }
21305 } else {
21306 Builder.SetInsertPoint(cast<Instruction>(User));
21307 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21308 User->replaceUsesOfWith(Scalar, NewInst);
21309 }
21310 } else {
21311 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21312 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21313 User->replaceUsesOfWith(Scalar, NewInst);
21314 }
21315
21316 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
21317 }
21318
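// CreateShuffle splits a combined two-source mask into one mask per source:
// indices below VF select from V1, indices of VF and above select from V2
// (rebased by -VF). E.g. for VF = 4 (a sketch):
//   Mask          = <0, 5, 2, 7>
//   CombinedMask1 = <0, -, 2, ->   (poison where the lane comes from V2)
//   CombinedMask2 = <-, 1, -, 3>
// ShuffleInstructionBuilder then emits the minimal shuffle sequence.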
21319 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
21320 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
21321 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
21322 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
21323 for (int I = 0, E = Mask.size(); I < E; ++I) {
21324 if (Mask[I] < VF)
21325 CombinedMask1[I] = Mask[I];
21326 else
21327 CombinedMask2[I] = Mask[I] - VF;
21328 }
21329 ShuffleInstructionBuilder ShuffleBuilder(
21330 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
21331 ShuffleBuilder.add(V1, CombinedMask1);
21332 if (V2)
21333 ShuffleBuilder.add(V2, CombinedMask2);
21334 return ShuffleBuilder.finalize({}, {}, {});
21335 };
21336
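// ResizeToVF brings a vectorized value to the vector factor of the mask: when
// the sizes differ it either applies the mask directly (if some index reaches
// beyond the mask size) or emits a resizing shuffle; the returned flag tells
// the caller whether the mask has already been applied.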
21337 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
21338 bool ForSingleMask) {
21339 unsigned VF = Mask.size();
21340 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
21341 if (VF != VecVF) {
21342 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
21343 Vec = CreateShuffle(Vec, nullptr, Mask);
21344 return std::make_pair(Vec, true);
21345 }
21346 if (!ForSingleMask) {
21347 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
21348 for (unsigned I = 0; I < VF; ++I) {
21349 if (Mask[I] != PoisonMaskElem)
21350 ResizeMask[Mask[I]] = Mask[I];
21351 }
21352 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
21353 }
21354 }
21355
21356 return std::make_pair(Vec, false);
21357 };
21358 // Perform shuffling of the vectorized tree entries for better handling of
21359 // external extracts.
21360 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
21361 // Find the first and the last instruction in the list of insertelements.
21362 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
21363 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
21364 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
21365 Builder.SetInsertPoint(LastInsert);
21366 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
21367 Value *NewInst = performExtractsShuffleAction<Value>(
21368 MutableArrayRef(Vector.data(), Vector.size()),
21369 FirstInsert->getOperand(0),
21370 [](Value *Vec) {
21371 return cast<VectorType>(Vec->getType())
21372 ->getElementCount()
21373 .getKnownMinValue();
21374 },
21375 ResizeToVF,
21376 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
21377 ArrayRef<Value *> Vals) {
21378 assert((Vals.size() == 1 || Vals.size() == 2) &&
21379 "Expected exactly 1 or 2 input values.");
21380 if (Vals.size() == 1) {
21381 // Do not create shuffle if the mask is a simple identity
21382 // non-resizing mask.
21383 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
21384 ->getNumElements() ||
21385 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
21386 return CreateShuffle(Vals.front(), nullptr, Mask);
21387 return Vals.front();
21388 }
21389 return CreateShuffle(Vals.front() ? Vals.front()
21390 : FirstInsert->getOperand(0),
21391 Vals.back(), Mask);
21392 });
21393 auto It = ShuffledInserts[I].InsertElements.rbegin();
21394 // Rebuild buildvector chain.
21395 InsertElementInst *II = nullptr;
21396 if (It != ShuffledInserts[I].InsertElements.rend())
21397 II = *It;
21398 SmallVector<Instruction *> Inserts;
21399 while (It != ShuffledInserts[I].InsertElements.rend()) {
21400 assert(II && "Must be an insertelement instruction.");
21401 if (*It == II)
21402 ++It;
21403 else
21404 Inserts.push_back(cast<Instruction>(II));
21405 II = dyn_cast<InsertElementInst>(II->getOperand(0));
21406 }
21407 for (Instruction *II : reverse(Inserts)) {
21408 II->replaceUsesOfWith(II->getOperand(0), NewInst);
21409 if (auto *NewI = dyn_cast<Instruction>(NewInst))
21410 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
21411 II->moveAfter(NewI);
21412 NewInst = II;
21413 }
21414 LastInsert->replaceAllUsesWith(NewInst);
21415 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
21416 IE->replaceUsesOfWith(IE->getOperand(0),
21417 PoisonValue::get(IE->getOperand(0)->getType()));
21418 IE->replaceUsesOfWith(IE->getOperand(1),
21419 PoisonValue::get(IE->getOperand(1)->getType()));
21420 eraseInstruction(IE);
21421 }
21422 CSEBlocks.insert(LastInsert->getParent());
21423 }
21424
21425 SmallVector<Instruction *> RemovedInsts;
21426 // For each vectorized value:
21427 for (auto &TEPtr : VectorizableTree) {
21428 TreeEntry *Entry = TEPtr.get();
21429
21430 // No need to handle users of gathered values.
21431 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
21432 DeletedNodes.contains(Entry) ||
21433 TransformedToGatherNodes.contains(Entry))
21434 continue;
21435
21436 assert(Entry->VectorizedValue && "Can't find vectorizable value");
21437
21438 // For each lane:
21439 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
21440 Value *Scalar = Entry->Scalars[Lane];
21441
21442 if (Entry->getOpcode() == Instruction::GetElementPtr &&
21443 !isa<GetElementPtrInst>(Scalar))
21444 continue;
21445 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
21446 EE && IgnoredExtracts.contains(EE))
21447 continue;
21448 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
21449 continue;
21450#ifndef NDEBUG
21451 Type *Ty = Scalar->getType();
21452 if (!Ty->isVoidTy()) {
21453 for (User *U : Scalar->users()) {
21454 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
21455
21456 // It is legal to delete users in the ignorelist.
21457 assert((isVectorized(U) ||
21458 (UserIgnoreList && UserIgnoreList->contains(U)) ||
21461 "Deleting out-of-tree value");
21462 }
21463 }
21464#endif
21465 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
21466 auto *I = cast<Instruction>(Scalar);
21467 RemovedInsts.push_back(I);
21468 }
21469 }
21470
21471 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
21472 // new vector instruction.
21473 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
21474 V->mergeDIAssignID(RemovedInsts);
21475
21476 // Clear up reduction references, if any.
21477 if (UserIgnoreList) {
21478 for (Instruction *I : RemovedInsts) {
21479 const TreeEntry *IE = getTreeEntries(I).front();
21480 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(I);
21481 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
21482 IE = SplitEntries.front();
21483 if (IE->Idx != 0 &&
21484 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
21485 (ValueToGatherNodes.lookup(I).contains(
21486 VectorizableTree.front().get()) ||
21487 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
21488 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
21489 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
21490 IE->UserTreeIndex &&
21491 is_contained(VectorizableTree.front()->Scalars, I)) &&
21492 !(GatheredLoadsEntriesFirst.has_value() &&
21493 IE->Idx >= *GatheredLoadsEntriesFirst &&
21494 VectorizableTree.front()->isGather() &&
21495 is_contained(VectorizableTree.front()->Scalars, I)) &&
21496 !(!VectorizableTree.front()->isGather() &&
21497 VectorizableTree.front()->isCopyableElement(I)))
21498 continue;
21499 SmallVector<SelectInst *> LogicalOpSelects;
21500 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
21501 // Do not replace condition of the logical op in form select <cond>.
21502 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
21503 (match(U.getUser(), m_LogicalAnd()) ||
21504 match(U.getUser(), m_LogicalOr())) &&
21505 U.getOperandNo() == 0;
21506 if (IsPoisoningLogicalOp) {
21507 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
21508 return false;
21509 }
21510 return UserIgnoreList->contains(U.getUser());
21511 });
21512 // Replace conditions of the poisoning logical ops with the non-poison
21513 // constant value.
21514 for (SelectInst *SI : LogicalOpSelects)
21515 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
21516 }
21517 }
21518 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
21519 // cache correctness.
21520 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
21521 // - instructions are not deleted until later.
21522 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
21523
21524 Builder.ClearInsertionPoint();
21525 InstrElementSize.clear();
21526
21527 const TreeEntry &RootTE = *VectorizableTree.front();
21528 Value *Vec = RootTE.VectorizedValue;
21529 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
21530 It != MinBWs.end() &&
21531 ReductionBitWidth != It->second.first) {
21532 IRBuilder<>::InsertPointGuard Guard(Builder);
21533 Builder.SetInsertPoint(ReductionRoot->getParent(),
21534 ReductionRoot->getIterator());
21535 Vec = Builder.CreateIntCast(
21536 Vec,
21537 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
21538 cast<VectorType>(Vec->getType())->getElementCount()),
21539 It->second.second);
21540 }
21541 return Vec;
21542}
21543
21544 void BoUpSLP::optimizeGatherSequence() {
21545 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
21546 << " gather sequences instructions.\n");
21547 // LICM InsertElementInst sequences.
21548 for (Instruction *I : GatherShuffleExtractSeq) {
21549 if (isDeleted(I))
21550 continue;
21551
21552 // Check if this block is inside a loop.
21553 Loop *L = LI->getLoopFor(I->getParent());
21554 if (!L)
21555 continue;
21556
21557 // Check if it has a preheader.
21558 BasicBlock *PreHeader = L->getLoopPreheader();
21559 if (!PreHeader)
21560 continue;
21561
21562 // If the vector or the element that we insert into it are
21563 // instructions that are defined in this basic block then we can't
21564 // hoist this instruction.
21565 if (any_of(I->operands(), [L](Value *V) {
21566 auto *OpI = dyn_cast<Instruction>(V);
21567 return OpI && L->contains(OpI);
21568 }))
21569 continue;
21570
21571 // We can hoist this instruction. Move it to the pre-header.
21572 I->moveBefore(PreHeader->getTerminator()->getIterator());
21573 CSEBlocks.insert(PreHeader);
21574 }
21575
21576 // Make a list of all reachable blocks in our CSE queue.
21577 SmallVector<const DomTreeNode *, 8> CSEWorkList;
21578 CSEWorkList.reserve(CSEBlocks.size());
21579 for (BasicBlock *BB : CSEBlocks)
21580 if (DomTreeNode *N = DT->getNode(BB)) {
21581 assert(DT->isReachableFromEntry(N));
21582 CSEWorkList.push_back(N);
21583 }
21584
21585 // Sort blocks by domination. This ensures we visit a block after all blocks
21586 // dominating it are visited.
21587 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
21588 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21589 "Different nodes should have different DFS numbers");
21590 return A->getDFSNumIn() < B->getDFSNumIn();
21591 });
21592
21593 // Less defined shuffles can be replaced by the more defined copies.
21594 // Of two shuffles with the same vector operands, one is less defined if each
21595 // of its mask indices is either the same as in the other one or undef. E.g.
21596 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
21597 // poison, <0, 0, 0, 0>.
21598 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
21599 Instruction *I2,
21600 SmallVectorImpl<int> &NewMask) {
21601 if (I1->getType() != I2->getType())
21602 return false;
21603 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
21604 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
21605 if (!SI1 || !SI2)
21606 return I1->isIdenticalTo(I2);
21607 if (SI1->isIdenticalTo(SI2))
21608 return true;
21609 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
21610 if (SI1->getOperand(I) != SI2->getOperand(I))
21611 return false;
21612 // Check if the second instruction is more defined than the first one.
21613 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
21614 ArrayRef<int> SM1 = SI1->getShuffleMask();
21615 // Count trailing undefs in the mask to check the final number of used
21616 // registers.
21617 unsigned LastUndefsCnt = 0;
21618 for (int I = 0, E = NewMask.size(); I < E; ++I) {
21619 if (SM1[I] == PoisonMaskElem)
21620 ++LastUndefsCnt;
21621 else
21622 LastUndefsCnt = 0;
21623 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
21624 NewMask[I] != SM1[I])
21625 return false;
21626 if (NewMask[I] == PoisonMaskElem)
21627 NewMask[I] = SM1[I];
21628 }
21629 // Check if the last undefs actually change the final number of used vector
21630 // registers.
21631 return SM1.size() - LastUndefsCnt > 1 &&
21632 ::getNumberOfParts(*TTI, SI1->getType()) ==
21633 ::getNumberOfParts(
21634 *TTI, getWidenedType(SI1->getType()->getElementType(),
21635 SM1.size() - LastUndefsCnt));
21636 };
21637 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
21638 // instructions. TODO: We can further optimize this scan if we split the
21639 // instructions into different buckets based on the insert lane.
21640 SmallVector<Instruction *, 16> Visited;
21641 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21642 assert(*I &&
21643 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
21644 "Worklist not sorted properly!");
21645 BasicBlock *BB = (*I)->getBlock();
21646 // For all instructions in blocks containing gather sequences:
21647 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
21648 if (isDeleted(&In))
21649 continue;
21650 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
21651 !GatherShuffleExtractSeq.contains(&In))
21652 continue;
21653
21654 // Check if we can replace this instruction with any of the
21655 // visited instructions.
21656 bool Replaced = false;
21657 for (Instruction *&V : Visited) {
21658 SmallVector<int> NewMask;
21659 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
21660 DT->dominates(V->getParent(), In.getParent())) {
21661 In.replaceAllUsesWith(V);
21662 eraseInstruction(&In);
21663 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
21664 if (!NewMask.empty())
21665 SI->setShuffleMask(NewMask);
21666 Replaced = true;
21667 break;
21668 }
21669 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
21670 GatherShuffleExtractSeq.contains(V) &&
21671 IsIdenticalOrLessDefined(V, &In, NewMask) &&
21672 DT->dominates(In.getParent(), V->getParent())) {
21673 In.moveAfter(V);
21674 V->replaceAllUsesWith(&In);
21675 eraseInstruction(V);
21676 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
21677 if (!NewMask.empty())
21678 SI->setShuffleMask(NewMask);
21679 V = &In;
21680 Replaced = true;
21681 break;
21682 }
21683 }
21684 if (!Replaced) {
21685 assert(!is_contained(Visited, &In));
21686 Visited.push_back(&In);
21687 }
21688 }
21689 }
21690 CSEBlocks.clear();
21691 GatherShuffleExtractSeq.clear();
21692}
21693
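// buildBundle wraps the schedulable instructions of VL into a single
// ScheduleBundle; copyable elements get a dedicated ScheduleCopyableData model
// instead of the regular per-instruction ScheduleData.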
21694BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
21695 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
21696 auto &BundlePtr =
21697 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
21698 for (Value *V : VL) {
21699 if (S.isNonSchedulable(V))
21700 continue;
21701 auto *I = cast<Instruction>(V);
21702 if (S.isCopyableElement(V)) {
21703 // Add a copyable element model.
21704 ScheduleCopyableData &SD =
21705 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
21706 // Group the instructions to a bundle.
21707 BundlePtr->add(&SD);
21708 continue;
21709 }
21710 ScheduleData *BundleMember = getScheduleData(V);
21711 assert(BundleMember && "no ScheduleData for bundle member "
21712 "(maybe not in same basic block)");
21713 // Group the instructions to a bundle.
21714 BundlePtr->add(BundleMember);
21715 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
21716 BundlePtr.get());
21717 }
21718 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
21719 return *BundlePtr;
21720}
21721
21722 // Groups the instructions into a bundle (which is then a single scheduling
21723 // entity) and schedules instructions until the bundle gets ready.
21724std::optional<BoUpSLP::ScheduleBundle *>
21725BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
21726 const InstructionsState &S,
21727 const EdgeInfo &EI) {
21728 // No need to schedule PHIs, insertelement, extractelement and extractvalue
21729 // instructions.
21730 if (isa<PHINode>(S.getMainOp()) ||
21731 isVectorLikeInstWithConstOps(S.getMainOp()))
21732 return nullptr;
21733 // If the parent node is non-schedulable and the current node is copyable, and
21734 // any of the parent instructions are used outside several basic blocks or in
21735 // a bin-op node - cancel scheduling, as it may cause wrong def-use deps in the
21736 // analysis, leading to a crash.
21737 // Non-scheduled nodes may not have a related ScheduleData model, which may
21738 // lead to a skipped dep analysis.
21739 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21740 EI.UserTE->doesNotNeedToSchedule() &&
21741 EI.UserTE->getOpcode() != Instruction::PHI &&
21742 any_of(EI.UserTE->Scalars, [](Value *V) {
21743 auto *I = dyn_cast<Instruction>(V);
21744 if (!I || I->hasOneUser())
21745 return false;
21746 for (User *U : I->users()) {
21747 auto *UI = cast<Instruction>(U);
21748 if (isa<BinaryOperator>(UI))
21749 return true;
21750 }
21751 return false;
21752 }))
21753 return std::nullopt;
21754 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21755 EI.UserTE->hasCopyableElements() &&
21756 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21757 all_of(VL, [&](Value *V) {
21758 if (S.isCopyableElement(V))
21759 return true;
21760 return isUsedOutsideBlock(V);
21761 }))
21762 return std::nullopt;
21763 // If any instruction is used only outside the block and its operand is placed
21764 // immediately before it, do not schedule; it may cause a wrong def-use chain.
21765 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
21766 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
21767 return false;
21768 if (isUsedOutsideBlock(V)) {
21769 for (Value *Op : cast<Instruction>(V)->operands()) {
21770 auto *I = dyn_cast<Instruction>(Op);
21771 if (!I)
21772 continue;
21773 return SLP->isVectorized(I) && I->getNextNode() == V;
21774 }
21775 }
21776 return false;
21777 }))
21778 return std::nullopt;
21779 if (S.areInstructionsWithCopyableElements() && EI) {
21780 bool IsNonSchedulableWithParentPhiNode =
21781 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
21782 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
21783 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21784 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21785 if (IsNonSchedulableWithParentPhiNode) {
21786 SmallSet<std::pair<Value *, Value *>, 4> Values;
21787 for (const auto [Idx, V] :
21788 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
21789 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
21790 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
21791 auto *I = dyn_cast<Instruction>(Op);
21792 if (!I || !isCommutative(I))
21793 continue;
21794 if (!Values.insert(std::make_pair(V, Op)).second)
21795 return std::nullopt;
21796 }
21797 }
21798 }
21799 bool HasCopyables = S.areInstructionsWithCopyableElements();
21800 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21801 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21802 // If all operands were replaced by copyables, the operands of this node
21803 // might not be, so the dependencies of the schedule data that was replaced
21804 // by copyable schedule data need to be recalculated.
21805 SmallVector<ScheduleData *> ControlDependentMembers;
21806 for (Value *V : VL) {
21807 auto *I = dyn_cast<Instruction>(V);
21808 if (!I || (HasCopyables && S.isCopyableElement(V)))
21809 continue;
21810 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21811 for (const Use &U : I->operands()) {
21812 unsigned &NumOps =
21813 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21814 .first->getSecond();
21815 ++NumOps;
21816 if (auto *Op = dyn_cast<Instruction>(U.get());
21817 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21818 if (ScheduleData *OpSD = getScheduleData(Op);
21819 OpSD && OpSD->hasValidDependencies())
21820 // TODO: investigate how to improve it instead of early exiting.
21821 return std::nullopt;
21822 }
21823 }
21824 }
21825 return nullptr;
21826 }
21827
21828 // Initialize the instruction bundle.
21829 Instruction *OldScheduleEnd = ScheduleEnd;
21830 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21831
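// TryScheduleBundleImpl recalculates dependencies that copyable elements may
// have invalidated, resets the schedule if the region was extended at the
// lower end, and then schedules ready entities until the new bundle itself
// becomes ready.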
21832 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21833 // Clear deps or recalculate the region if the memory instruction is
21834 // copyable. It may have memory deps, which must be recalculated.
21835 SmallVector<ScheduleData *> ControlDependentMembers;
21836 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21837 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21838 for (ScheduleEntity *SE : Bundle.getBundle()) {
21839 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21840 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21841 BundleMember && BundleMember->hasValidDependencies()) {
21842 BundleMember->clearDirectDependencies();
21843 if (RegionHasStackSave ||
21844 !isGuaranteedToTransferExecutionToSuccessor(
21845 BundleMember->getInst()))
21846 ControlDependentMembers.push_back(BundleMember);
21847 }
21848 continue;
21849 }
21850 auto *SD = cast<ScheduleData>(SE);
21851 if (SD->hasValidDependencies() &&
21852 (!S.areInstructionsWithCopyableElements() ||
21853 !S.isCopyableElement(SD->getInst())) &&
21854 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21855 EI.UserTE->hasState() &&
21856 (!EI.UserTE->hasCopyableElements() ||
21857 !EI.UserTE->isCopyableElement(SD->getInst())))
21858 SD->clearDirectDependencies();
21859 for (const Use &U : SD->getInst()->operands()) {
21860 unsigned &NumOps =
21861 UserOpToNumOps
21862 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21863 .first->getSecond();
21864 ++NumOps;
21865 if (auto *Op = dyn_cast<Instruction>(U.get());
21866 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21867 *SLP, NumOps)) {
21868 if (ScheduleData *OpSD = getScheduleData(Op);
21869 OpSD && OpSD->hasValidDependencies()) {
21870 OpSD->clearDirectDependencies();
21871 if (RegionHasStackSave ||
21872 !isGuaranteedToTransferExecutionToSuccessor(Op))
21873 ControlDependentMembers.push_back(OpSD);
21874 }
21875 }
21876 }
21877 }
21878 };
21879 // The scheduling region got new instructions at the lower end (or it is a
21880 // new region for the first bundle). This makes it necessary to
21881 // recalculate all dependencies.
21882 // It is seldom that this needs to be done a second time after adding the
21883 // initial bundle to the region.
21884 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21885 for_each(ScheduleDataMap, [&](auto &P) {
21886 if (BB != P.first->getParent())
21887 return;
21888 ScheduleData *SD = P.second;
21889 if (isInSchedulingRegion(*SD))
21890 SD->clearDependencies();
21891 });
21892 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21893 for_each(P.second, [&](ScheduleCopyableData *SD) {
21894 if (isInSchedulingRegion(*SD))
21895 SD->clearDependencies();
21896 });
21897 });
21898 ReSchedule = true;
21899 }
21900 // Check if the bundle data already has deps for copyable elements. In this
21901 // case we need to reset the deps and recalculate them.
21902 if (Bundle && !Bundle.getBundle().empty()) {
21903 if (S.areInstructionsWithCopyableElements() ||
21904 !ScheduleCopyableDataMap.empty())
21905 CheckIfNeedToClearDeps(Bundle);
21906 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21907 << BB->getName() << "\n");
21908 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21909 ControlDependentMembers);
21910 } else if (!ControlDependentMembers.empty()) {
21911 ScheduleBundle Invalid = ScheduleBundle::invalid();
21912 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21913 ControlDependentMembers);
21914 }
21915
21916 if (ReSchedule) {
21917 resetSchedule();
21918 initialFillReadyList(ReadyInsts);
21919 }
21920
21921 // Now try to schedule the new bundle or (if no bundle) just calculate
21922 // dependencies. As soon as the bundle is "ready" it means that there are no
21923 // cyclic dependencies and we can schedule it. Note that it's important that
21924 // we don't "schedule" the bundle yet.
21925 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21926 !ReadyInsts.empty()) {
21927 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21928 assert(Picked->isReady() && "must be ready to schedule");
21929 schedule(*SLP, S, EI, Picked, ReadyInsts);
21930 if (Picked == &Bundle)
21931 break;
21932 }
21933 };
21934
21935 // Make sure that the scheduling region contains all
21936 // instructions of the bundle.
21937 for (Value *V : VL) {
21938 if (S.isNonSchedulable(V))
21939 continue;
21940 if (!extendSchedulingRegion(V, S)) {
21941 // If the scheduling region got new instructions at the lower end (or it
21942 // is a new region for the first bundle), it is necessary to recalculate
21943 // all dependencies.
21944 // Otherwise the compiler may crash trying to incorrectly calculate
21945 // dependencies and emit instructions in the wrong order at the actual
21946 // scheduling.
21947 ScheduleBundle Invalid = ScheduleBundle::invalid();
21948 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21949 return std::nullopt;
21950 }
21951 }
21952
21953 bool ReSchedule = false;
21954 for (Value *V : VL) {
21955 if (S.isNonSchedulable(V))
21956 continue;
21957 SmallVector<ScheduleCopyableData *> CopyableData =
21958 getScheduleCopyableData(cast<Instruction>(V));
21959 if (!CopyableData.empty()) {
21960 for (ScheduleCopyableData *SD : CopyableData)
21961 ReadyInsts.remove(SD);
21962 }
21963 ScheduleData *BundleMember = getScheduleData(V);
21964 assert((BundleMember || S.isCopyableElement(V)) &&
21965 "no ScheduleData for bundle member (maybe not in same basic block)");
21966 if (!BundleMember)
21967 continue;
21968
21969 // Make sure we don't leave the pieces of the bundle in the ready list when
21970 // the whole bundle might not be ready.
21971 ReadyInsts.remove(BundleMember);
21972 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21973 !Bundles.empty()) {
21974 for (ScheduleBundle *B : Bundles)
21975 ReadyInsts.remove(B);
21976 }
21977
21978 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21979 continue;
21980 // A bundle member was scheduled as a single instruction before and now
21981 // needs to be scheduled as part of the bundle. We just get rid of the
21982 // existing schedule.
21983 // A bundle member may also have had its deps calculated before it became a
21984 // copyable element - in that case we need to reschedule.
21985 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21986 << " was already scheduled\n");
21987 ReSchedule = true;
21988 }
21989
21990 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21991 TryScheduleBundleImpl(ReSchedule, Bundle);
21992 if (!Bundle.isReady()) {
21993 for (ScheduleEntity *BD : Bundle.getBundle()) {
21994 // Copyable data scheduling is just removed.
21995 if (isa<ScheduleCopyableData>(BD))
21996 continue;
21997 if (BD->isReady()) {
21998 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21999 if (Bundles.empty()) {
22000 ReadyInsts.insert(BD);
22001 continue;
22002 }
22003 for (ScheduleBundle *B : Bundles)
22004 if (B->isReady())
22005 ReadyInsts.insert(B);
22006 }
22007 }
22008 ScheduledBundlesList.pop_back();
22009 SmallVector<ScheduleData *> ControlDependentMembers;
22010 for (Value *V : VL) {
22011 if (S.isNonSchedulable(V))
22012 continue;
22013 auto *I = cast<Instruction>(V);
22014 if (S.isCopyableElement(I)) {
22015 // Remove the copyable data from the scheduling region and restore
22016 // previous mappings.
22017 auto KV = std::make_pair(EI, I);
22018 assert(ScheduleCopyableDataMap.contains(KV) &&
22019 "no ScheduleCopyableData for copyable element");
22020 ScheduleCopyableData *SD =
22021 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
22022 ScheduleCopyableDataMapByUsers[I].remove(SD);
22023 if (EI.UserTE) {
22024 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22025 const auto *It = find(Op, I);
22026 assert(It != Op.end() && "Lane not set");
22027 SmallPtrSet<Instruction *, 4> Visited;
22028 do {
22029 int Lane = std::distance(Op.begin(), It);
22030 assert(Lane >= 0 && "Lane not set");
22031 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22032 !EI.UserTE->ReorderIndices.empty())
22033 Lane = EI.UserTE->ReorderIndices[Lane];
22034 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22035 "Couldn't find extract lane");
22036 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22037 if (!Visited.insert(In).second) {
22038 It = find(make_range(std::next(It), Op.end()), I);
22039 break;
22040 }
22041 ScheduleCopyableDataMapByInstUser
22042 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
22043 .pop_back();
22044 It = find(make_range(std::next(It), Op.end()), I);
22045 } while (It != Op.end());
22046 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
22047 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
22048 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
22049 }
22050 if (ScheduleCopyableDataMapByUsers[I].empty())
22051 ScheduleCopyableDataMapByUsers.erase(I);
22052 ScheduleCopyableDataMap.erase(KV);
22053 // Need to recalculate dependencies for the actual schedule data.
22054 if (ScheduleData *OpSD = getScheduleData(I);
22055 OpSD && OpSD->hasValidDependencies()) {
22056 OpSD->clearDirectDependencies();
22057 if (RegionHasStackSave ||
22058 !isGuaranteedToTransferExecutionToSuccessor(I))
22059 ControlDependentMembers.push_back(OpSD);
22060 }
22061 continue;
22062 }
22063 ScheduledBundles.find(I)->getSecond().pop_back();
22064 }
22065 if (!ControlDependentMembers.empty()) {
22066 ScheduleBundle Invalid = ScheduleBundle::invalid();
22067 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
22068 ControlDependentMembers);
22069 }
22070 return std::nullopt;
22071 }
22072 return &Bundle;
22073}
22074
22075BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22076 // Allocate a new ScheduleData for the instruction.
22077 if (ChunkPos >= ChunkSize) {
22078 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
22079 ChunkPos = 0;
22080 }
22081 return &(ScheduleDataChunks.back()[ChunkPos++]);
22082}
22083
22084bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
22085 Value *V, const InstructionsState &S) {
22086 auto *I = dyn_cast<Instruction>(V);
22087 assert(I && "bundle member must be an instruction");
22088 if (getScheduleData(I))
22089 return true;
22090 if (!ScheduleStart) {
22091 // It's the first instruction in the new region.
22092 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
22093 ScheduleStart = I;
22094 ScheduleEnd = I->getNextNode();
22095 assert(ScheduleEnd && "tried to vectorize a terminator?");
22096 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
22097 return true;
22098 }
22099 // Search up and down at the same time, because we don't know if the new
22100 // instruction is above or below the existing scheduling region.
22101 // Ignore debug info (and other "AssumeLike" intrinsics) so they are not
22102 // counted against the budget. Otherwise debug info could affect codegen.
22103 BasicBlock::reverse_iterator UpIter =
22104 ++ScheduleStart->getIterator().getReverse();
22105 BasicBlock::reverse_iterator UpperEnd = BB->rend();
22106 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
22107 BasicBlock::iterator LowerEnd = BB->end();
22108 auto IsAssumeLikeIntr = [](const Instruction &I) {
22109 if (auto *II = dyn_cast<IntrinsicInst>(&I))
22110 return II->isAssumeLikeIntrinsic();
22111 return false;
22112 };
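// Walk upwards and downwards at the same time until the new instruction is
// reached from one side or the region-size budget is exhausted; the side that
// reaches it decides whether the region start or the region end is extended.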
22113 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22114 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22115 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
22116 &*DownIter != I) {
22117 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
22118 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
22119 return false;
22120 }
22121
22122 ++UpIter;
22123 ++DownIter;
22124
22125 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22126 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22127 }
22128 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
22129 assert(I->getParent() == ScheduleStart->getParent() &&
22130 "Instruction is in wrong basic block.");
22131 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
22132 ScheduleStart = I;
22133 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
22134 << "\n");
22135 return true;
22136 }
22137 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
22138 "Expected to reach top of the basic block or instruction down the "
22139 "lower end.");
22140 assert(I->getParent() == ScheduleEnd->getParent() &&
22141 "Instruction is in wrong basic block.");
22142 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
22143 nullptr);
22144 ScheduleEnd = I->getNextNode();
22145 assert(ScheduleEnd && "tried to vectorize a terminator?");
22146 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
22147 return true;
22148}
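// Illustrative sketch (not part of the pass): the simultaneous upward/downward
// walk performed by extendSchedulingRegion, reduced to plain pointers over an
// int sequence (the assume-like-intrinsic skipping is omitted).
// [RegionBegin, RegionEnd) models the current scheduling region inside the
// block [BlockBegin, BlockEnd); the budget of 8 and the name demoExtendWindow
// are assumptions made for this example only.
namespace {
[[maybe_unused]] bool demoExtendWindow(const int *BlockBegin,
                                       const int *BlockEnd,
                                       const int *RegionBegin,
                                       const int *RegionEnd, int Target) {
  const int *Up = RegionBegin; // Walks backwards towards BlockBegin.
  const int *Down = RegionEnd; // Walks forwards towards BlockEnd.
  unsigned Budget = 8;         // Plays the role of ScheduleRegionSizeLimit.
  while (Up != BlockBegin || Down != BlockEnd) {
    if (Up != BlockBegin && *(Up - 1) == Target)
      return true; // Would extend the region upwards.
    if (Down != BlockEnd && *Down == Target)
      return true; // Would extend the region downwards.
    if (Budget-- == 0)
      return false; // Same bail-out as the region size limit above.
    if (Up != BlockBegin)
      --Up;
    if (Down != BlockEnd)
      ++Down;
  }
  return false; // Target is not in this block at all.
}
} // namespace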
22149
22150void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
22151 Instruction *ToI,
22152 ScheduleData *PrevLoadStore,
22153 ScheduleData *NextLoadStore) {
22154 ScheduleData *CurrentLoadStore = PrevLoadStore;
22155 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
22156 // No need to allocate data for non-schedulable instructions.
22157 if (isa<PHINode>(I))
22158 continue;
22159 ScheduleData *SD = ScheduleDataMap.lookup(I);
22160 if (!SD) {
22161 SD = allocateScheduleDataChunks();
22162 ScheduleDataMap[I] = SD;
22163 }
22164 assert(!isInSchedulingRegion(*SD) &&
22165 "new ScheduleData already in scheduling region");
22166 SD->init(SchedulingRegionID, I);
22167
22168 auto CanIgnoreLoad = [](const Instruction *I) {
22169 const auto *LI = dyn_cast<LoadInst>(I);
22170 // If there is a simple load marked as invariant, we can ignore it.
22171 // But, in the (unlikely) case of non-simple invariant load,
22172 // we should not ignore it.
22173 return LI && LI->isSimple() &&
22174 LI->getMetadata(LLVMContext::MD_invariant_load);
22175 };
22176
22177 if (I->mayReadOrWriteMemory() &&
22178 // Simple InvariantLoad does not depend on other memory accesses.
22179 !CanIgnoreLoad(I) &&
22180 (!isa<IntrinsicInst>(I) ||
22181 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
22182         cast<IntrinsicInst>(I)->getIntrinsicID() !=
22183             Intrinsic::pseudoprobe))) {
22184 // Update the linked list of memory accessing instructions.
22185 if (CurrentLoadStore) {
22186 CurrentLoadStore->setNextLoadStore(SD);
22187 } else {
22188 FirstLoadStoreInRegion = SD;
22189 }
22190 CurrentLoadStore = SD;
22191 }
22192
22193     if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22194         match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22195       RegionHasStackSave = true;
22196 }
22197 if (NextLoadStore) {
22198 if (CurrentLoadStore)
22199 CurrentLoadStore->setNextLoadStore(NextLoadStore);
22200 } else {
22201 LastLoadStoreInRegion = CurrentLoadStore;
22202 }
22203}
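// Illustrative sketch (not part of the pass): the "next load/store" chaining
// that initScheduleData performs above. Only nodes that access memory are
// threaded onto the list, so later memory-dependency scans can skip everything
// else. DemoMemNode and demoChainMemoryOps are assumptions made for this
// example only.
namespace {
struct DemoMemNode {
  bool AccessesMemory = false;
  DemoMemNode *NextLoadStore = nullptr;
};

// Links every memory-accessing node to the next one; returns the first such
// node, i.e. the analogue of FirstLoadStoreInRegion.
[[maybe_unused]] DemoMemNode *demoChainMemoryOps(DemoMemNode *Nodes,
                                                 unsigned NumNodes) {
  DemoMemNode *First = nullptr;
  DemoMemNode *Current = nullptr;
  for (unsigned Idx = 0; Idx != NumNodes; ++Idx) {
    DemoMemNode &N = Nodes[Idx];
    if (!N.AccessesMemory)
      continue;
    if (Current)
      Current->NextLoadStore = &N;
    else
      First = &N;
    Current = &N;
  }
  return First;
}
} // namespace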
22204
22205void BoUpSLP::BlockScheduling::calculateDependencies(
22206 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
22207 ArrayRef<ScheduleData *> ControlDeps) {
22208 SmallVector<ScheduleEntity *> WorkList;
22209 auto ProcessNode = [&](ScheduleEntity *SE) {
22210 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
22211 if (CD->hasValidDependencies())
22212 return;
22213 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
22214 CD->initDependencies();
22215 CD->resetUnscheduledDeps();
22216 const EdgeInfo &EI = CD->getEdgeInfo();
22217 if (EI.UserTE) {
22218 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22219 const auto *It = find(Op, CD->getInst());
22220 assert(It != Op.end() && "Lane not set");
22221 SmallPtrSet<Instruction *, 4> Visited;
22222 do {
22223 int Lane = std::distance(Op.begin(), It);
22224 assert(Lane >= 0 && "Lane not set");
22225 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22226 !EI.UserTE->ReorderIndices.empty())
22227 Lane = EI.UserTE->ReorderIndices[Lane];
22228 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22229 "Couldn't find extract lane");
22230 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22231 if (EI.UserTE->isCopyableElement(In)) {
22232           // We may not have related copyable scheduling data if the
22233           // instruction is non-schedulable.
22234 if (ScheduleCopyableData *UseSD =
22235 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
22236 CD->incDependencies();
22237 if (!UseSD->isScheduled())
22238 CD->incrementUnscheduledDeps(1);
22239 if (!UseSD->hasValidDependencies() ||
22240 (InsertInReadyList && UseSD->isReady()))
22241 WorkList.push_back(UseSD);
22242 }
22243 } else if (Visited.insert(In).second) {
22244 if (ScheduleData *UseSD = getScheduleData(In)) {
22245 CD->incDependencies();
22246 if (!UseSD->isScheduled())
22247 CD->incrementUnscheduledDeps(1);
22248 if (!UseSD->hasValidDependencies() ||
22249 (InsertInReadyList && UseSD->isReady()))
22250 WorkList.push_back(UseSD);
22251 }
22252 }
22253 It = find(make_range(std::next(It), Op.end()), CD->getInst());
22254 } while (It != Op.end());
22255 if (CD->isReady() && CD->getDependencies() == 0 &&
22256 (EI.UserTE->hasState() &&
22257 (EI.UserTE->getMainOp()->getParent() !=
22258 CD->getInst()->getParent() ||
22259 (isa<PHINode>(EI.UserTE->getMainOp()) &&
22260 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
22261 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
22262 auto *IU = dyn_cast<Instruction>(U);
22263 if (!IU)
22264 return true;
22265 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
22266 })))))) {
22267 // If no uses in the block - mark as having pseudo-use, which cannot
22268 // be scheduled.
22269 // Prevents incorrect def-use tracking between external user and
22270 // actual instruction.
22271 CD->incDependencies();
22272 CD->incrementUnscheduledDeps(1);
22273 }
22274 }
22275 return;
22276 }
22277 auto *BundleMember = cast<ScheduleData>(SE);
22278 if (BundleMember->hasValidDependencies())
22279 return;
22280 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
22281 BundleMember->initDependencies();
22282 BundleMember->resetUnscheduledDeps();
22283 // Handle def-use chain dependencies.
22284 SmallDenseMap<Value *, unsigned> UserToNumOps;
22285 for (User *U : BundleMember->getInst()->users()) {
22286 if (isa<PHINode>(U))
22287 continue;
22288 if (ScheduleData *UseSD = getScheduleData(U)) {
22289 // The operand is a copyable element - skip.
22290 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
22291 ++NumOps;
22292 if (areAllOperandsReplacedByCopyableData(
22293 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
22294 continue;
22295 BundleMember->incDependencies();
22296 if (!UseSD->isScheduled())
22297 BundleMember->incrementUnscheduledDeps(1);
22298 if (!UseSD->hasValidDependencies() ||
22299 (InsertInReadyList && UseSD->isReady()))
22300 WorkList.push_back(UseSD);
22301 }
22302 }
22303 for (ScheduleCopyableData *UseSD :
22304 getScheduleCopyableDataUsers(BundleMember->getInst())) {
22305 BundleMember->incDependencies();
22306 if (!UseSD->isScheduled())
22307 BundleMember->incrementUnscheduledDeps(1);
22308 if (!UseSD->hasValidDependencies() ||
22309 (InsertInReadyList && UseSD->isReady()))
22310 WorkList.push_back(UseSD);
22311 }
22312
22313 SmallPtrSet<const Instruction *, 4> Visited;
22314 auto MakeControlDependent = [&](Instruction *I) {
22315 // Do not mark control dependent twice.
22316 if (!Visited.insert(I).second)
22317 return;
22318 auto *DepDest = getScheduleData(I);
22319 assert(DepDest && "must be in schedule window");
22320 DepDest->addControlDependency(BundleMember);
22321 BundleMember->incDependencies();
22322 if (!DepDest->isScheduled())
22323 BundleMember->incrementUnscheduledDeps(1);
22324 if (!DepDest->hasValidDependencies() ||
22325 (InsertInReadyList && DepDest->isReady()))
22326 WorkList.push_back(DepDest);
22327 };
22328
22329 // Any instruction which isn't safe to speculate at the beginning of the
22330     // block is control dependent on any early exit or non-willreturn call
22331     // which precedes it.
22332 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
22333 for (Instruction *I = BundleMember->getInst()->getNextNode();
22334 I != ScheduleEnd; I = I->getNextNode()) {
22335 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
22336 continue;
22337
22338 // Add the dependency
22339 MakeControlDependent(I);
22340
22341         if (!isGuaranteedToTransferExecutionToSuccessor(I))
22342           // Everything past here must be control dependent on I.
22343           break;
22344 }
22345 }
22346
22347 if (RegionHasStackSave) {
22348       // If we have an inalloca alloca instruction, it needs to be scheduled
22349       // after any preceding stacksave. We also need to prevent any alloca
22350       // from reordering above a preceding stackrestore.
22351 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
22352 match(BundleMember->getInst(),
22353               m_Intrinsic<Intrinsic::stackrestore>())) {
22354       for (Instruction *I = BundleMember->getInst()->getNextNode();
22355 I != ScheduleEnd; I = I->getNextNode()) {
22356         if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22357             match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22358           // Any allocas past here must be control dependent on I, and I
22359           // must be memory dependent on BundleMember->Inst.
22360           break;
22361
22362 if (!isa<AllocaInst>(I))
22363 continue;
22364
22365 // Add the dependency
22366 MakeControlDependent(I);
22367 }
22368 }
22369
22370     // In addition to the cases handled just above, we need to prevent
22371     // allocas and loads/stores from moving below a stacksave or a
22372     // stackrestore. Avoiding moving allocas below a stackrestore is
22373     // currently thought to be conservative. Moving loads/stores below a
22374     // stackrestore can lead to incorrect code.
22375 if (isa<AllocaInst>(BundleMember->getInst()) ||
22376 BundleMember->getInst()->mayReadOrWriteMemory()) {
22377 for (Instruction *I = BundleMember->getInst()->getNextNode();
22378 I != ScheduleEnd; I = I->getNextNode()) {
22379         if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
22380             !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22381           continue;
22382
22383 // Add the dependency
22384 MakeControlDependent(I);
22385 break;
22386 }
22387 }
22388 }
22389
22390 // Handle the memory dependencies (if any).
22391 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
22392 if (!NextLoadStore)
22393 return;
22394 Instruction *SrcInst = BundleMember->getInst();
22395 assert(SrcInst->mayReadOrWriteMemory() &&
22396 "NextLoadStore list for non memory effecting bundle?");
22397 MemoryLocation SrcLoc = getLocation(SrcInst);
22398 bool SrcMayWrite = SrcInst->mayWriteToMemory();
22399 unsigned NumAliased = 0;
22400 unsigned DistToSrc = 1;
22401 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
22402
22403 for (ScheduleData *DepDest = NextLoadStore; DepDest;
22404 DepDest = DepDest->getNextLoadStore()) {
22405 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
22406
22407 // We have two limits to reduce the complexity:
22408 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
22409 // SLP->isAliased (which is the expensive part in this loop).
22410 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
22411 // the whole loop (even if the loop is fast, it's quadratic).
22412 // It's important for the loop break condition (see below) to
22413 // check this limit even between two read-only instructions.
22414 if (DistToSrc >= MaxMemDepDistance ||
22415 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
22416 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
22417 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
22418
22419 // We increment the counter only if the locations are aliased
22420 // (instead of counting all alias checks). This gives a better
22421 // balance between reduced runtime and accurate dependencies.
22422 NumAliased++;
22423
22424 DepDest->addMemoryDependency(BundleMember);
22425 BundleMember->incDependencies();
22426 if (!DepDest->isScheduled())
22427 BundleMember->incrementUnscheduledDeps(1);
22428 if (!DepDest->hasValidDependencies() ||
22429 (InsertInReadyList && DepDest->isReady()))
22430 WorkList.push_back(DepDest);
22431 }
22432
22433 // Example, explaining the loop break condition: Let's assume our
22434 // starting instruction is i0 and MaxMemDepDistance = 3.
22435 //
22436 // +--------v--v--v
22437 // i0,i1,i2,i3,i4,i5,i6,i7,i8
22438 // +--------^--^--^
22439 //
22440 // MaxMemDepDistance let us stop alias-checking at i3 and we add
22441 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
22442 // Previously we already added dependencies from i3 to i6,i7,i8
22443 // (because of MaxMemDepDistance). As we added a dependency from
22444 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
22445 // and we can abort this loop at i6.
22446 if (DistToSrc >= 2 * MaxMemDepDistance)
22447 break;
22448 DistToSrc++;
22449 }
22450 };
22451
22452 assert((Bundle || !ControlDeps.empty()) &&
22453 "expected at least one instruction to schedule");
22454 if (Bundle)
22455 WorkList.push_back(Bundle.getBundle().front());
22456 WorkList.append(ControlDeps.begin(), ControlDeps.end());
22457 SmallPtrSet<ScheduleBundle *, 16> Visited;
22458 while (!WorkList.empty()) {
22459 ScheduleEntity *SD = WorkList.pop_back_val();
22460 SmallVector<ScheduleBundle *, 1> CopyableBundle;
22461     ArrayRef<ScheduleBundle *> Bundles;
22462     if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
22463 CopyableBundle.push_back(&CD->getBundle());
22464 Bundles = CopyableBundle;
22465 } else {
22466 Bundles = getScheduleBundles(SD->getInst());
22467 }
22468 if (Bundles.empty()) {
22469 if (!SD->hasValidDependencies())
22470 ProcessNode(SD);
22471 if (InsertInReadyList && SD->isReady()) {
22472 ReadyInsts.insert(SD);
22473 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
22474 }
22475 continue;
22476 }
22477 for (ScheduleBundle *Bundle : Bundles) {
22478 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
22479 continue;
22480 assert(isInSchedulingRegion(*Bundle) &&
22481 "ScheduleData not in scheduling region");
22482 for_each(Bundle->getBundle(), ProcessNode);
22483 }
22484 if (InsertInReadyList && SD->isReady()) {
22485 for (ScheduleBundle *Bundle : Bundles) {
22486 assert(isInSchedulingRegion(*Bundle) &&
22487 "ScheduleData not in scheduling region");
22488 if (!Bundle->isReady())
22489 continue;
22490 ReadyInsts.insert(Bundle);
22491 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
22492 << "\n");
22493 }
22494 }
22495 }
22496}
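// Illustrative sketch (not part of the pass): the dependency bookkeeping that
// calculateDependencies maintains above with incDependencies and
// incrementUnscheduledDeps. Every node tracks how many of its dependencies are
// still unscheduled; scheduling one node decrements the counter of each node
// that depends on it, and a node whose counter reaches zero becomes ready.
// DemoDepNode and demoMarkScheduled are assumptions made for this example
// only.
namespace {
struct DemoDepNode {
  unsigned UnscheduledDeps = 0;              // Dependencies not scheduled yet.
  SmallVector<DemoDepNode *, 4> DependsOnMe; // Nodes whose counters we feed.
  bool Scheduled = false;
};

// Marks N as scheduled and returns every node that just became ready, i.e.
// whose last outstanding dependency was N.
[[maybe_unused]] SmallVector<DemoDepNode *, 4>
demoMarkScheduled(DemoDepNode &N) {
  SmallVector<DemoDepNode *, 4> NowReady;
  N.Scheduled = true;
  for (DemoDepNode *User : N.DependsOnMe) {
    assert(User->UnscheduledDeps > 0 && "dependency counted twice?");
    if (--User->UnscheduledDeps == 0 && !User->Scheduled)
      NowReady.push_back(User);
  }
  return NowReady;
}
} // namespace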
22497
22498void BoUpSLP::BlockScheduling::resetSchedule() {
22499 assert(ScheduleStart &&
22500 "tried to reset schedule on block which has not been scheduled");
22501 for_each(ScheduleDataMap, [&](auto &P) {
22502 if (BB != P.first->getParent())
22503 return;
22504 ScheduleData *SD = P.second;
22505 if (isInSchedulingRegion(*SD)) {
22506 SD->setScheduled(/*Scheduled=*/false);
22507 SD->resetUnscheduledDeps();
22508 }
22509 });
22510 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
22511 for_each(P.second, [&](ScheduleCopyableData *SD) {
22512 if (isInSchedulingRegion(*SD)) {
22513 SD->setScheduled(/*Scheduled=*/false);
22514 SD->resetUnscheduledDeps();
22515 }
22516 });
22517 });
22518 for_each(ScheduledBundles, [&](auto &P) {
22519 for_each(P.second, [&](ScheduleBundle *Bundle) {
22520 if (isInSchedulingRegion(*Bundle))
22521 Bundle->setScheduled(/*Scheduled=*/false);
22522 });
22523 });
22524 // Reset schedule data for copyable elements.
22525 for (auto &P : ScheduleCopyableDataMap) {
22526 if (isInSchedulingRegion(*P.second)) {
22527 P.second->setScheduled(/*Scheduled=*/false);
22528 P.second->resetUnscheduledDeps();
22529 }
22530 }
22531 ReadyInsts.clear();
22532}
22533
22534void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
22535 if (!BS->ScheduleStart)
22536 return;
22537
22538 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
22539
22540 // A key point - if we got here, pre-scheduling was able to find a valid
22541 // scheduling of the sub-graph of the scheduling window which consists
22542 // of all vector bundles and their transitive users. As such, we do not
22543 // need to reschedule anything *outside of* that subgraph.
22544
22545 BS->resetSchedule();
22546
22547 // For the real scheduling we use a more sophisticated ready-list: it is
22548 // sorted by the original instruction location. This lets the final schedule
22549 // be as close as possible to the original instruction order.
22550 // WARNING: If changing this order causes a correctness issue, that means
22551 // there is some missing dependence edge in the schedule data graph.
22552 struct ScheduleDataCompare {
22553 bool operator()(const ScheduleEntity *SD1,
22554 const ScheduleEntity *SD2) const {
22555 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
22556 }
22557 };
22558 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
22559
22560 // Ensure that all dependency data is updated (for nodes in the sub-graph)
22561 // and fill the ready-list with initial instructions.
22562 int Idx = 0;
22563 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22564 I = I->getNextNode()) {
22565 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22566 if (!Bundles.empty()) {
22567 for (ScheduleBundle *Bundle : Bundles) {
22568 Bundle->setSchedulingPriority(Idx++);
22569 if (!Bundle->hasValidDependencies())
22570 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
22571 }
22572 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
22573 for (ScheduleCopyableData *SD : reverse(SDs)) {
22574 ScheduleBundle &Bundle = SD->getBundle();
22575 Bundle.setSchedulingPriority(Idx++);
22576 if (!Bundle.hasValidDependencies())
22577 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22578 }
22579 continue;
22580 }
22581     SmallVector<ScheduleCopyableData *> CopyableData =
22582         BS->getScheduleCopyableDataUsers(I);
22583 if (ScheduleData *SD = BS->getScheduleData(I)) {
22584 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
22585 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
22586 SDTEs.front()->doesNotNeedToSchedule() ||
22587               SDTEs.front()->isCopyableElement(I)) &&
22588              "scheduler and vectorizer bundle mismatch");
22589 SD->setSchedulingPriority(Idx++);
22590 if (!SD->hasValidDependencies() &&
22591 (!CopyableData.empty() ||
22592 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
22593 assert(TE->isGather() && "expected gather node");
22594 return TE->hasState() && TE->hasCopyableElements() &&
22595 TE->isCopyableElement(I);
22596 }))) {
22597 // Need to calculate deps for these nodes to correctly handle copyable
22598 // dependencies, even if they were cancelled.
22599 // If copyables bundle was cancelled, the deps are cleared and need to
22600 // recalculate them.
22601 ScheduleBundle Bundle;
22602 Bundle.add(SD);
22603 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22604 }
22605 }
22606 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
22607 ScheduleBundle &Bundle = SD->getBundle();
22608 Bundle.setSchedulingPriority(Idx++);
22609 if (!Bundle.hasValidDependencies())
22610 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22611 }
22612 }
22613 BS->initialFillReadyList(ReadyInsts);
22614
22615 Instruction *LastScheduledInst = BS->ScheduleEnd;
22616
22617 // Do the "real" scheduling.
22618 SmallPtrSet<Instruction *, 16> Scheduled;
22619 while (!ReadyInsts.empty()) {
22620 auto *Picked = *ReadyInsts.begin();
22621 ReadyInsts.erase(ReadyInsts.begin());
22622
22623 // Move the scheduled instruction(s) to their dedicated places, if not
22624 // there yet.
22625 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
22626 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
22627 Instruction *PickedInst = BundleMember->getInst();
22628       // If a copyable must be scheduled as part of something else, skip it.
22629 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
22630 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
22631 (!IsCopyable && !Scheduled.insert(PickedInst).second))
22632 continue;
22633 if (PickedInst->getNextNode() != LastScheduledInst)
22634 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22635 LastScheduledInst = PickedInst;
22636 }
22637 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
22638 LastScheduledInst);
22639 } else {
22640 auto *SD = cast<ScheduleData>(Picked);
22641 Instruction *PickedInst = SD->getInst();
22642 if (PickedInst->getNextNode() != LastScheduledInst)
22643 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22644 LastScheduledInst = PickedInst;
22645 }
22646 auto Invalid = InstructionsState::invalid();
22647 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
22648 }
22649
22650 // Check that we didn't break any of our invariants.
22651#ifdef EXPENSIVE_CHECKS
22652 BS->verify();
22653#endif
22654
22655#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
22656 // Check that all schedulable entities got scheduled
22657 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22658 I = I->getNextNode()) {
22659 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22660 assert(all_of(Bundles,
22661 [](const ScheduleBundle *Bundle) {
22662 return Bundle->isScheduled();
22663 }) &&
22664 "must be scheduled at this point");
22665 }
22666#endif
22667
22668 // Avoid duplicate scheduling of the block.
22669 BS->ScheduleStart = nullptr;
22670}
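// Illustrative sketch (not part of the pass): how the ScheduleDataCompare set
// in scheduleBlock behaves. The comparator orders entities by descending
// scheduling priority, so *ReadyInsts.begin() is always the ready entity that
// originally appeared last in the region; the region is then re-emitted
// bottom-up, each picked instruction being placed directly above the
// previously placed one. The plain int priorities and demoPickNext are
// assumptions made for this example only.
namespace {
struct DemoPrioCompare {
  bool operator()(int A, int B) const { return B < A; } // Descending order.
};

[[maybe_unused]] int demoPickNext(std::set<int, DemoPrioCompare> &Ready) {
  assert(!Ready.empty() && "nothing ready to schedule");
  int Picked = *Ready.begin(); // Highest priority, i.e. latest original slot.
  Ready.erase(Ready.begin());
  return Picked;
}
} // namespace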
22671
22672unsigned BoUpSLP::getVectorElementSize(Value *V) {
22673  // If V is a store, just return the width of the stored value (or value
22674 // truncated just before storing) without traversing the expression tree.
22675 // This is the common case.
22676 if (auto *Store = dyn_cast<StoreInst>(V))
22677 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
22678
22679 if (auto *IEI = dyn_cast<InsertElementInst>(V))
22680 return getVectorElementSize(IEI->getOperand(1));
22681
22682 auto E = InstrElementSize.find(V);
22683 if (E != InstrElementSize.end())
22684 return E->second;
22685
22686 // If V is not a store, we can traverse the expression tree to find loads
22687 // that feed it. The type of the loaded value may indicate a more suitable
22688 // width than V's type. We want to base the vector element size on the width
22689 // of memory operations where possible.
22690   SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
22691   SmallPtrSet<Instruction *, 16> Visited;
22692   if (auto *I = dyn_cast<Instruction>(V)) {
22693 Worklist.emplace_back(I, I->getParent(), 0);
22694 Visited.insert(I);
22695 }
22696
22697 // Traverse the expression tree in bottom-up order looking for loads. If we
22698 // encounter an instruction we don't yet handle, we give up.
22699 auto Width = 0u;
22700 Value *FirstNonBool = nullptr;
22701 while (!Worklist.empty()) {
22702 auto [I, Parent, Level] = Worklist.pop_back_val();
22703
22704 // We should only be looking at scalar instructions here. If the current
22705 // instruction has a vector type, skip.
22706 auto *Ty = I->getType();
22707 if (isa<VectorType>(Ty))
22708 continue;
22709 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
22710 FirstNonBool = I;
22711 if (Level > RecursionMaxDepth)
22712 continue;
22713
22714 // If the current instruction is a load, update MaxWidth to reflect the
22715 // width of the loaded value.
22716     if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
22717       Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
22718
22719 // Otherwise, we need to visit the operands of the instruction. We only
22720 // handle the interesting cases from buildTree here. If an operand is an
22721 // instruction we haven't yet visited and from the same basic block as the
22722 // user or the use is a PHI node, we add it to the worklist.
22723     else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
22724                  BinaryOperator, UnaryOperator>(I)) {
22725       for (Use &U : I->operands()) {
22726 if (auto *J = dyn_cast<Instruction>(U.get()))
22727 if (Visited.insert(J).second &&
22728 (isa<PHINode>(I) || J->getParent() == Parent)) {
22729 Worklist.emplace_back(J, J->getParent(), Level + 1);
22730 continue;
22731 }
22732 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
22733 FirstNonBool = U.get();
22734 }
22735 } else {
22736 break;
22737 }
22738 }
22739
22740 // If we didn't encounter a memory access in the expression tree, or if we
22741 // gave up for some reason, just return the width of V. Otherwise, return the
22742 // maximum width we found.
22743 if (!Width) {
22744 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
22745 V = FirstNonBool;
22746 Width = DL->getTypeSizeInBits(V->getType());
22747 }
22748
22749 for (Instruction *I : Visited)
22750 InstrElementSize[I] = Width;
22751
22752 return Width;
22753}
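// Illustrative sketch (not part of the pass): the bottom-up walk in
// getVectorElementSize, reduced to a toy expression node. The traversal keeps
// the widest width found at any load-like leaf and falls back to the root's
// own width when no such leaf is reachable (the boolean special-casing above
// is omitted). DemoExprNode and demoElementWidth are assumptions made for this
// example only.
namespace {
struct DemoExprNode {
  unsigned OwnWidthInBits = 32;
  bool IsLoadLike = false; // Models loads/extracts, whose widths we prefer.
  SmallVector<DemoExprNode *, 2> Operands;
};

[[maybe_unused]] unsigned demoElementWidth(DemoExprNode &Root) {
  SmallVector<DemoExprNode *, 8> Worklist(1, &Root);
  SmallPtrSet<DemoExprNode *, 8> Visited;
  unsigned Width = 0;
  while (!Worklist.empty()) {
    DemoExprNode *N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue;
    if (N->IsLoadLike)
      Width = std::max(Width, N->OwnWidthInBits); // Base width on memory ops.
    else
      Worklist.append(N->Operands.begin(), N->Operands.end());
  }
  return Width ? Width : Root.OwnWidthInBits; // Fall back to the root's width.
}
} // namespace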
22754
22755bool BoUpSLP::collectValuesToDemote(
22756 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
22757     SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
22758     const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
22759 bool &IsProfitableToDemote, bool IsTruncRoot) const {
22760 // We can always demote constants.
22761 if (all_of(E.Scalars, IsaPred<Constant>))
22762 return true;
22763
22764 unsigned OrigBitWidth =
22765 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
22766 if (OrigBitWidth == BitWidth) {
22767 MaxDepthLevel = 1;
22768 return true;
22769 }
22770
22771 // Check if the node was analyzed already and must keep its original bitwidth.
22772 if (NodesToKeepBWs.contains(E.Idx))
22773 return false;
22774
22775 // If the value is not a vectorized instruction in the expression and not used
22776 // by the insertelement instruction and not used in multiple vector nodes, it
22777 // cannot be demoted.
22778 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
22779 if (isa<PoisonValue>(R))
22780 return false;
22781 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22782 });
22783 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
22784 if (isa<PoisonValue>(V))
22785 return true;
22786 if (getTreeEntries(V).size() > 1)
22787 return false;
22788     // For the last shuffle of sext/zext with many uses, we need to check the
22789     // extra bit for unsigned values; otherwise we may end up with incorrect
22790     // casting for reused scalars.
22791 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
22792 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22793 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22794 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22795 return true;
22796 }
22797 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22798 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22799 if (IsSignedNode)
22800 ++BitWidth1;
22801 if (auto *I = dyn_cast<Instruction>(V)) {
22802 APInt Mask = DB->getDemandedBits(I);
22803 unsigned BitWidth2 =
22804 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22805 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22806 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22807 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22808 break;
22809 BitWidth2 *= 2;
22810 }
22811 BitWidth1 = std::min(BitWidth1, BitWidth2);
22812 }
22813 BitWidth = std::max(BitWidth, BitWidth1);
22814 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22815 };
22816 auto FinalAnalysis = [&, TTI = TTI]() {
22817 if (!IsProfitableToDemote)
22818 return false;
22819 bool Res = all_of(
22820 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22821 // Demote gathers.
22822 if (Res && E.isGather()) {
22823 if (E.hasState()) {
22824 if (const TreeEntry *SameTE =
22825 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22826 SameTE)
22827 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22828 ToDemote, Visited, NodesToKeepBWs,
22829 MaxDepthLevel, IsProfitableToDemote,
22830 IsTruncRoot)) {
22831 ToDemote.push_back(E.Idx);
22832 return true;
22833 }
22834 }
22835 // Check possible extractelement instructions bases and final vector
22836 // length.
22837 SmallPtrSet<Value *, 4> UniqueBases;
22838 for (Value *V : E.Scalars) {
22839 auto *EE = dyn_cast<ExtractElementInst>(V);
22840 if (!EE)
22841 continue;
22842 UniqueBases.insert(EE->getVectorOperand());
22843 }
22844 const unsigned VF = E.Scalars.size();
22845 Type *OrigScalarTy = E.Scalars.front()->getType();
22846 if (UniqueBases.size() <= 2 ||
22847 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22848           ::getNumberOfParts(
22849               *TTI,
22850               getWidenedType(
22851                   IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22852                   VF))) {
22853 ToDemote.push_back(E.Idx);
22854 return true;
22855 }
22856 }
22857 return Res;
22858 };
22859 if (E.isGather() || !Visited.insert(&E).second ||
22860 any_of(E.Scalars, [&](Value *V) {
22861 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22862 return isa<InsertElementInst>(U) && !isVectorized(U);
22863 });
22864 }))
22865 return FinalAnalysis();
22866
22867 if (any_of(E.Scalars, [&](Value *V) {
22868 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22869 return isVectorized(U) ||
22870 (E.Idx == 0 && UserIgnoreList &&
22871 UserIgnoreList->contains(U)) ||
22872 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22873 !U->getType()->isScalableTy() &&
22874 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22875 }) && !IsPotentiallyTruncated(V, BitWidth);
22876 }))
22877 return false;
22878
22879 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22880 bool &NeedToExit) {
22881 NeedToExit = false;
22882 unsigned InitLevel = MaxDepthLevel;
22883 for (const TreeEntry *Op : Operands) {
22884 unsigned Level = InitLevel;
22885 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22886 ToDemote, Visited, NodesToKeepBWs, Level,
22887 IsProfitableToDemote, IsTruncRoot)) {
22888 if (!IsProfitableToDemote)
22889 return false;
22890 NeedToExit = true;
22891 if (!FinalAnalysis())
22892 return false;
22893 continue;
22894 }
22895 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22896 }
22897 return true;
22898 };
22899 auto AttemptCheckBitwidth =
22900 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22901 // Try all bitwidth < OrigBitWidth.
22902 NeedToExit = false;
22903 unsigned BestFailBitwidth = 0;
22904 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22905 if (Checker(BitWidth, OrigBitWidth))
22906 return true;
22907 if (BestFailBitwidth == 0 && FinalAnalysis())
22908 BestFailBitwidth = BitWidth;
22909 }
22910 if (BitWidth >= OrigBitWidth) {
22911 if (BestFailBitwidth == 0) {
22912 BitWidth = OrigBitWidth;
22913 return false;
22914 }
22915 MaxDepthLevel = 1;
22916 BitWidth = BestFailBitwidth;
22917 NeedToExit = true;
22918 return true;
22919 }
22920 return false;
22921 };
22922 auto TryProcessInstruction =
22923 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22924 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22925 if (Operands.empty()) {
22926 if (!IsTruncRoot)
22927 MaxDepthLevel = 1;
22928 for (Value *V : E.Scalars)
22929 (void)IsPotentiallyTruncated(V, BitWidth);
22930 } else {
22931 // Several vectorized uses? Check if we can truncate it, otherwise -
22932 // exit.
22933 if (any_of(E.Scalars, [&](Value *V) {
22934 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22935 }))
22936 return false;
22937 bool NeedToExit = false;
22938 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22939 return false;
22940 if (NeedToExit)
22941 return true;
22942 if (!ProcessOperands(Operands, NeedToExit))
22943 return false;
22944 if (NeedToExit)
22945 return true;
22946 }
22947
22948 ++MaxDepthLevel;
22949 // Record the entry that we can demote.
22950 ToDemote.push_back(E.Idx);
22951 return IsProfitableToDemote;
22952 };
22953
22954 if (E.State == TreeEntry::SplitVectorize)
22955 return TryProcessInstruction(
22956 BitWidth,
22957 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22958 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22959
22960 if (E.isAltShuffle()) {
22961 // Combining these opcodes may lead to incorrect analysis, skip for now.
22962 auto IsDangerousOpcode = [](unsigned Opcode) {
22963 switch (Opcode) {
22964 case Instruction::Shl:
22965 case Instruction::AShr:
22966 case Instruction::LShr:
22967 case Instruction::UDiv:
22968 case Instruction::SDiv:
22969 case Instruction::URem:
22970 case Instruction::SRem:
22971 return true;
22972 default:
22973 break;
22974 }
22975 return false;
22976 };
22977 if (IsDangerousOpcode(E.getAltOpcode()))
22978 return FinalAnalysis();
22979 }
22980
22981 switch (E.getOpcode()) {
22982
22983 // We can always demote truncations and extensions. Since truncations can
22984 // seed additional demotion, we save the truncated value.
22985 case Instruction::Trunc:
22986 if (IsProfitableToDemoteRoot)
22987 IsProfitableToDemote = true;
22988 return TryProcessInstruction(BitWidth);
22989 case Instruction::ZExt:
22990 case Instruction::SExt:
22991 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22992 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22993 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22994 return false;
22995 IsProfitableToDemote = true;
22996 return TryProcessInstruction(BitWidth);
22997
22998 // We can demote certain binary operations if we can demote both of their
22999 // operands.
23000 case Instruction::Add:
23001 case Instruction::Sub:
23002 case Instruction::Mul:
23003 case Instruction::And:
23004 case Instruction::Or:
23005 case Instruction::Xor: {
23006 return TryProcessInstruction(
23007 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
23008 }
23009 case Instruction::Freeze:
23010 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
23011 case Instruction::Shl: {
23012     // If we are truncating the result of this SHL, and if it's a shift by an
23013     // in-range amount, we can always perform a SHL in a smaller type.
23014 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
23015 return all_of(E.Scalars, [&](Value *V) {
23016 if (isa<PoisonValue>(V))
23017 return true;
23018 if (E.isCopyableElement(V))
23019 return true;
23020 auto *I = cast<Instruction>(V);
23021 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23022 return AmtKnownBits.getMaxValue().ult(BitWidth);
23023 });
23024 };
23025 return TryProcessInstruction(
23026 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
23027 }
23028 case Instruction::LShr: {
23029 // If this is a truncate of a logical shr, we can truncate it to a smaller
23030 // lshr iff we know that the bits we would otherwise be shifting in are
23031 // already zeros.
23032 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23033 return all_of(E.Scalars, [&](Value *V) {
23034 if (isa<PoisonValue>(V))
23035 return true;
23036 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23037 if (E.isCopyableElement(V))
23038 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
23039 auto *I = cast<Instruction>(V);
23040 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23041 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23042 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
23043 SimplifyQuery(*DL));
23044 });
23045 };
23046 return TryProcessInstruction(
23047 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23048 LShrChecker);
23049 }
23050 case Instruction::AShr: {
23051 // If this is a truncate of an arithmetic shr, we can truncate it to a
23052 // smaller ashr iff we know that all the bits from the sign bit of the
23053 // original type and the sign bit of the truncate type are similar.
23054 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23055 return all_of(E.Scalars, [&](Value *V) {
23056 if (isa<PoisonValue>(V))
23057 return true;
23058 auto *I = cast<Instruction>(V);
23059 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23060 unsigned ShiftedBits = OrigBitWidth - BitWidth;
23061 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23062 ShiftedBits <
23063 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23064 });
23065 };
23066 return TryProcessInstruction(
23067 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23068 AShrChecker);
23069 }
23070 case Instruction::UDiv:
23071 case Instruction::URem: {
23072 // UDiv and URem can be truncated if all the truncated bits are zero.
23073 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23074 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23075 return all_of(E.Scalars, [&](Value *V) {
23076 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23077 if (E.hasCopyableElements() && E.isCopyableElement(V))
23078 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23079 auto *I = cast<Instruction>(V);
23080 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
23081 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23082 });
23083 };
23084 return TryProcessInstruction(
23085 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
23086 }
23087
23088 // We can demote selects if we can demote their true and false values.
23089 case Instruction::Select: {
23090 return TryProcessInstruction(
23091 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
23092 }
23093
23094 // We can demote phis if we can demote all their incoming operands.
23095 case Instruction::PHI: {
23096 const unsigned NumOps = E.getNumOperands();
23097     SmallVector<const TreeEntry *> Ops(NumOps);
23098     transform(seq<unsigned>(0, NumOps), Ops.begin(),
23099 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
23100
23101 return TryProcessInstruction(BitWidth, Ops);
23102 }
23103
23104 case Instruction::Call: {
23105 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
23106 if (!IC)
23107 break;
23108     Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
23109     if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
23110 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
23111 break;
23112 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
23113 function_ref<bool(unsigned, unsigned)> CallChecker;
23114 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23115 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23116 return all_of(E.Scalars, [&](Value *V) {
23117 auto *I = cast<Instruction>(V);
23118 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
23119 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23120 return MaskedValueIsZero(I->getOperand(0), Mask,
23121 SimplifyQuery(*DL)) &&
23122 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23123 }
23124 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
23125 "Expected min/max intrinsics only.");
23126 unsigned SignBits = OrigBitWidth - BitWidth;
23127 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23128 unsigned Op0SignBits =
23129 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23130 unsigned Op1SignBits =
23131 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
23132 return SignBits <= Op0SignBits &&
23133 ((SignBits != Op0SignBits &&
23134 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23135 MaskedValueIsZero(I->getOperand(0), Mask,
23136 SimplifyQuery(*DL))) &&
23137 SignBits <= Op1SignBits &&
23138 ((SignBits != Op1SignBits &&
23139 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
23140 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
23141 });
23142 };
23143 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23144 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23145 return all_of(E.Scalars, [&](Value *V) {
23146 auto *I = cast<Instruction>(V);
23147 unsigned SignBits = OrigBitWidth - BitWidth;
23148 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23149 unsigned Op0SignBits =
23150 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23151 return SignBits <= Op0SignBits &&
23152 ((SignBits != Op0SignBits &&
23153 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23154 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
23155 });
23156 };
23157 if (ID != Intrinsic::abs) {
23158 Operands.push_back(getOperandEntry(&E, 1));
23159 CallChecker = CompChecker;
23160 } else {
23161 CallChecker = AbsChecker;
23162 }
23163 InstructionCost BestCost =
23164 std::numeric_limits<InstructionCost::CostType>::max();
23165 unsigned BestBitWidth = BitWidth;
23166 unsigned VF = E.Scalars.size();
23167 // Choose the best bitwidth based on cost estimations.
23168 auto Checker = [&](unsigned BitWidth, unsigned) {
23169 unsigned MinBW = PowerOf2Ceil(BitWidth);
23170 SmallVector<Type *> ArgTys =
23171 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
23172 auto VecCallCosts = getVectorCallCosts(
23173 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
23174 TTI, TLI, ArgTys);
23175 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
23176 if (Cost < BestCost) {
23177 BestCost = Cost;
23178 BestBitWidth = BitWidth;
23179 }
23180 return false;
23181 };
23182 [[maybe_unused]] bool NeedToExit;
23183 (void)AttemptCheckBitwidth(Checker, NeedToExit);
23184 BitWidth = BestBitWidth;
23185 return TryProcessInstruction(BitWidth, Operands, CallChecker);
23186 }
23187
23188 // Otherwise, conservatively give up.
23189 default:
23190 break;
23191 }
23192 MaxDepthLevel = 1;
23193 return FinalAnalysis();
23194}
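// Illustrative sketch (not part of the pass): the idea behind the LShr checker
// in collectValuesToDemote above. A logical shift right can be evaluated in a
// narrower type when the bits above the narrow width are already zero and the
// shift amount stays below the narrow width; the sketch verifies that on
// concrete 32-bit values. demoLShrFitsIn is an assumption made for this
// example only.
namespace {
[[maybe_unused]] bool demoLShrFitsIn(uint32_t Value, uint32_t ShiftAmt,
                                     unsigned NarrowBits) {
  assert(NarrowBits > 0 && NarrowBits < 32 && "expected a narrower width");
  uint32_t HighMask = ~uint32_t(0) << NarrowBits; // Bits we would shift in.
  bool HighBitsZero = (Value & HighMask) == 0;
  bool AmtInRange = ShiftAmt < NarrowBits;
  if (!HighBitsZero || !AmtInRange)
    return false;
  // Under these conditions the narrow computation reproduces the wide one.
  uint32_t Narrow = (Value & ~HighMask) >> ShiftAmt;
  return Narrow == (Value >> ShiftAmt);
}
} // namespace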
23195
23196static RecurKind getRdxKind(Value *V);
23197
23198void BoUpSLP::computeMinimumValueSizes() {
23199   // We only attempt to truncate integer expressions.
23200 bool IsStoreOrInsertElt =
23201 VectorizableTree.front()->hasState() &&
23202 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
23203 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
23204 if ((IsStoreOrInsertElt || UserIgnoreList) &&
23205 ExtraBitWidthNodes.size() <= 1 &&
23206 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
23207 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
23208 return;
23209
23210 unsigned NodeIdx = 0;
23211 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
23212 NodeIdx = 1;
23213
23214 // Ensure the roots of the vectorizable tree don't form a cycle.
23215 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
23216 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
23217 "Unexpected tree is graph.");
23218
23219   // If the first value node for the store/insertelement is sext/zext/trunc,
23220   // skip it and resize to the final type.
23221 bool IsTruncRoot = false;
23222 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
23223 SmallVector<unsigned> RootDemotes;
23224 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
23225 if (NodeIdx != 0 &&
23226 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23227 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23228 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
23229 IsTruncRoot = true;
23230 RootDemotes.push_back(NodeIdx);
23231 IsProfitableToDemoteRoot = true;
23232 ++NodeIdx;
23233 }
23234
23235   // The reduction was analyzed already and found not profitable - exit.
23236 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
23237 return;
23238
23239 SmallVector<unsigned> ToDemote;
23240 auto ComputeMaxBitWidth =
23241 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
23242 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
23243 ToDemote.clear();
23244 // Check if the root is trunc and the next node is gather/buildvector, then
23245 // keep trunc in scalars, which is free in most cases.
23246 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
23247 !NodesToKeepBWs.contains(E.Idx) &&
23248 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
23249 all_of(E.Scalars, [&](Value *V) {
23250 return V->hasOneUse() || isa<Constant>(V) ||
23251 (!V->hasNUsesOrMore(UsesLimit) &&
23252 none_of(V->users(), [&](User *U) {
23253 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
23254 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23255 if (TEs.empty() || is_contained(TEs, UserTE))
23256 return false;
23257 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23258 SelectInst>(U) ||
23259 isa<SIToFPInst, UIToFPInst>(U) ||
23260 (UserTE->hasState() &&
23261 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23262 SelectInst>(UserTE->getMainOp()) ||
23263 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
23264 return true;
23265 unsigned UserTESz = DL->getTypeSizeInBits(
23266 UserTE->Scalars.front()->getType());
23267 if (all_of(TEs, [&](const TreeEntry *TE) {
23268 auto It = MinBWs.find(TE);
23269 return It != MinBWs.end() &&
23270 It->second.first > UserTESz;
23271 }))
23272 return true;
23273 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
23274 }));
23275 })) {
23276 ToDemote.push_back(E.Idx);
23277 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23278 auto It = MinBWs.find(UserTE);
23279 if (It != MinBWs.end())
23280 return It->second.first;
23281 unsigned MaxBitWidth =
23282 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
23283 MaxBitWidth = bit_ceil(MaxBitWidth);
23284 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23285 MaxBitWidth = 8;
23286 return MaxBitWidth;
23287 }
23288
23289 if (!E.hasState())
23290 return 0u;
23291
23292 unsigned VF = E.getVectorFactor();
23293 Type *ScalarTy = E.Scalars.front()->getType();
23294 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
23295 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
23296 if (!TreeRootIT)
23297 return 0u;
23298
23299 if (any_of(E.Scalars,
23300 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
23301 return 0u;
23302
23303 unsigned NumParts = ::getNumberOfParts(
23304 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
23305
23306 // The maximum bit width required to represent all the values that can be
23307 // demoted without loss of precision. It would be safe to truncate the roots
23308 // of the expression to this width.
23309 unsigned MaxBitWidth = 1u;
23310
23311 // True if the roots can be zero-extended back to their original type,
23312 // rather than sign-extended. We know that if the leading bits are not
23313 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
23314 // True.
23315 // Determine if the sign bit of all the roots is known to be zero. If not,
23316 // IsKnownPositive is set to False.
23317 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
23318 if (isa<PoisonValue>(R))
23319 return true;
23320 KnownBits Known = computeKnownBits(R, *DL);
23321 return Known.isNonNegative();
23322 });
23323
23324 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
23325 E.UserTreeIndex.UserTE->hasState() &&
23326 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
23327 MaxBitWidth =
23328 std::min(DL->getTypeSizeInBits(
23329 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
23330 DL->getTypeSizeInBits(ScalarTy));
23331
23332 // We first check if all the bits of the roots are demanded. If they're not,
23333 // we can truncate the roots to this narrower type.
23334 for (Value *Root : E.Scalars) {
23335 if (isa<PoisonValue>(Root))
23336 continue;
23337 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
23338 TypeSize NumTypeBits =
23339 DL->getTypeSizeInBits(Root->getType()->getScalarType());
23340 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23341 // If we can't prove that the sign bit is zero, we must add one to the
23342 // maximum bit width to account for the unknown sign bit. This preserves
23343 // the existing sign bit so we can safely sign-extend the root back to the
23344 // original type. Otherwise, if we know the sign bit is zero, we will
23345 // zero-extend the root instead.
23346 //
23347 // FIXME: This is somewhat suboptimal, as there will be cases where adding
23348 // one to the maximum bit width will yield a larger-than-necessary
23349 // type. In general, we need to add an extra bit only if we can't
23350 // prove that the upper bit of the original type is equal to the
23351 // upper bit of the proposed smaller type. If these two bits are
23352 // the same (either zero or one) we know that sign-extending from
23353 // the smaller type will result in the same value. Here, since we
23354 // can't yet prove this, we are just making the proposed smaller
23355 // type larger to ensure correctness.
23356 if (!IsKnownPositive)
23357 ++BitWidth1;
23358
23359 auto *I = dyn_cast<Instruction>(Root);
23360 if (!I) {
23361 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
23362 continue;
23363 }
23364 APInt Mask = DB->getDemandedBits(I);
23365 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23366 MaxBitWidth =
23367 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
23368 }
23369
23370 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23371 MaxBitWidth = 8;
23372
23373     // If the original type is large, but the reduced type does not improve
23374     // register usage - ignore it.
23375 if (NumParts > 1 &&
23376 NumParts ==
23377             ::getNumberOfParts(
23378                 *TTI, getWidenedType(IntegerType::get(F->getContext(),
23379 bit_ceil(MaxBitWidth)),
23380 VF)))
23381 return 0u;
23382
23383 unsigned Opcode = E.getOpcode();
23384 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
23385 Opcode == Instruction::SExt ||
23386 Opcode == Instruction::ZExt || NumParts > 1;
23387 // Conservatively determine if we can actually truncate the roots of the
23388 // expression. Collect the values that can be demoted in ToDemote and
23389 // additional roots that require investigating in Roots.
23390     DenseSet<const TreeEntry *> Visited;
23391     unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
23392 bool NeedToDemote = IsProfitableToDemote;
23393
23394 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
23395 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
23396 NeedToDemote, IsTruncRoot) ||
23397 (MaxDepthLevel <= Limit &&
23398 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
23399 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
23400 DL->getTypeSizeInBits(TreeRootIT) /
23401 DL->getTypeSizeInBits(
23402 E.getMainOp()->getOperand(0)->getType()) >
23403 2)))))
23404 return 0u;
23405 // Round MaxBitWidth up to the next power-of-two.
23406 MaxBitWidth = bit_ceil(MaxBitWidth);
23407
23408 return MaxBitWidth;
23409 };
23410
23411 // If we can truncate the root, we must collect additional values that might
23412 // be demoted as a result. That is, those seeded by truncations we will
23413 // modify.
23414 // Add reduction ops sizes, if any.
23415 if (UserIgnoreList &&
23416 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
23417     // Convert vector_reduce_add(ZExt(<n x i1>)) to
23418     // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
23419 if (all_of(*UserIgnoreList,
23420 [](Value *V) {
23421 return isa<PoisonValue>(V) ||
23422 cast<Instruction>(V)->getOpcode() == Instruction::Add;
23423 }) &&
23424 VectorizableTree.front()->State == TreeEntry::Vectorize &&
23425 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
23426 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
23427 Builder.getInt1Ty()) {
23428 ReductionBitWidth = 1;
23429 } else {
23430 for (Value *V : *UserIgnoreList) {
23431 if (isa<PoisonValue>(V))
23432 continue;
23433 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
23434 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
23435 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23436 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
23437 ++BitWidth1;
23438 unsigned BitWidth2 = BitWidth1;
23440 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
23441 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23442 }
23443 ReductionBitWidth =
23444 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
23445 }
23446 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
23447 ReductionBitWidth = 8;
23448
23449 ReductionBitWidth = bit_ceil(ReductionBitWidth);
23450 }
23451 }
23452 bool IsTopRoot = NodeIdx == 0;
23453 while (NodeIdx < VectorizableTree.size() &&
23454 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23455 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23456 RootDemotes.push_back(NodeIdx);
23457 ++NodeIdx;
23458 IsTruncRoot = true;
23459 }
23460 bool IsSignedCmp = false;
23461 if (UserIgnoreList &&
23462 all_of(*UserIgnoreList,
23464 m_SMax(m_Value(), m_Value())))))
23465 IsSignedCmp = true;
23466 while (NodeIdx < VectorizableTree.size()) {
23467 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
23468 unsigned Limit = 2;
23469 if (IsTopRoot &&
23470 ReductionBitWidth ==
23471 DL->getTypeSizeInBits(
23472 VectorizableTree.front()->Scalars.front()->getType()))
23473 Limit = 3;
23474 unsigned MaxBitWidth = ComputeMaxBitWidth(
23475 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23476 IsTruncRoot, IsSignedCmp);
23477 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23478 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23479 ReductionBitWidth = bit_ceil(MaxBitWidth);
23480 else if (MaxBitWidth == 0)
23481 ReductionBitWidth = 0;
23482 }
23483
23484 for (unsigned Idx : RootDemotes) {
23485 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
23486 uint32_t OrigBitWidth =
23487 DL->getTypeSizeInBits(V->getType()->getScalarType());
23488 if (OrigBitWidth > MaxBitWidth) {
23489 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
23490 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23491 }
23492 return false;
23493 }))
23494 ToDemote.push_back(Idx);
23495 }
23496 RootDemotes.clear();
23497 IsTopRoot = false;
23498 IsProfitableToDemoteRoot = true;
23499
23500 if (ExtraBitWidthNodes.empty()) {
23501 NodeIdx = VectorizableTree.size();
23502 } else {
23503 unsigned NewIdx = 0;
23504 do {
23505 NewIdx = *ExtraBitWidthNodes.begin();
23506 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
23507 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23508 NodeIdx = NewIdx;
23509 IsTruncRoot =
23510 NodeIdx < VectorizableTree.size() &&
23511 VectorizableTree[NodeIdx]->UserTreeIndex &&
23512 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23513 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23514 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23515 Instruction::Trunc &&
23516 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23517 IsSignedCmp =
23518 NodeIdx < VectorizableTree.size() &&
23519 VectorizableTree[NodeIdx]->UserTreeIndex &&
23520 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23521 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23522 Instruction::ICmp &&
23523 any_of(
23524 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23525 [&](Value *V) {
23526 auto *IC = dyn_cast<ICmpInst>(V);
23527 return IC && (IC->isSigned() ||
23528 !isKnownNonNegative(IC->getOperand(0),
23529 SimplifyQuery(*DL)) ||
23530 !isKnownNonNegative(IC->getOperand(1),
23531 SimplifyQuery(*DL)));
23532 });
23533 }
23534
23535 // If the maximum bit width we compute is less than the width of the roots'
23536 // type, we can proceed with the narrowing. Otherwise, do nothing.
23537 if (MaxBitWidth == 0 ||
23538 MaxBitWidth >=
23539 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
23540 ->getBitWidth()) {
23541 if (UserIgnoreList)
23542 AnalyzedMinBWVals.insert_range(TreeRoot);
23543 NodesToKeepBWs.insert_range(ToDemote);
23544 continue;
23545 }
23546
23547   // Finally, map the values we can demote to the maximum bit width we
23548   // computed.
23549 for (unsigned Idx : ToDemote) {
23550 TreeEntry *TE = VectorizableTree[Idx].get();
23551 if (MinBWs.contains(TE))
23552 continue;
23553 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
23554 if (isa<PoisonValue>(R))
23555 return false;
23556 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23557 });
23558 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
23559 }
23560 }
23561}
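// Illustrative sketch (not part of the pass): the per-value bit-width estimate
// used in computeMinimumValueSizes above. Known sign bits shrink the required
// width, one extra bit is kept when the value may be negative (so
// sign-extension stays correct), and the result gets the same floor of 8 bits
// and power-of-two rounding as MaxBitWidth. demoRequiredBitWidth and its
// parameters are assumptions made for this example only.
namespace {
[[maybe_unused]] unsigned demoRequiredBitWidth(unsigned OrigBits,
                                               unsigned KnownSignBits,
                                               bool MayBeNegative) {
  assert(KnownSignBits >= 1 && KnownSignBits <= OrigBits && "invalid input");
  unsigned Needed = OrigBits - KnownSignBits; // Bits that actually vary.
  if (MayBeNegative)
    ++Needed;                                 // Preserve the sign bit.
  Needed = std::max(Needed, 1u);
  if (Needed < 8)
    Needed = 8;                               // Floor, as for MaxBitWidth.
  return bit_ceil(Needed);                    // Round up to a power of two.
}
} // namespace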
23562
23563PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
23564   auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
23565 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
23566   auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
23567   auto *AA = &AM.getResult<AAManager>(F);
23568 auto *LI = &AM.getResult<LoopAnalysis>(F);
23569 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
23570 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
23571 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
23572   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
23573
23574 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
23575 if (!Changed)
23576 return PreservedAnalyses::all();
23577
23578   PreservedAnalyses PA;
23579   PA.preserveSet<CFGAnalyses>();
23580   return PA;
23581}
23582
23583bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
23584                                 TargetTransformInfo *TTI_,
23585 TargetLibraryInfo *TLI_, AAResults *AA_,
23586 LoopInfo *LI_, DominatorTree *DT_,
23587 AssumptionCache *AC_, DemandedBits *DB_,
23588                                 OptimizationRemarkEmitter *ORE_) {
23589   if (!RunSLPVectorization)
23590     return false;
23591 SE = SE_;
23592 TTI = TTI_;
23593 TLI = TLI_;
23594 AA = AA_;
23595 LI = LI_;
23596 DT = DT_;
23597 AC = AC_;
23598 DB = DB_;
23599 DL = &F.getDataLayout();
23600
23601 Stores.clear();
23602 GEPs.clear();
23603 bool Changed = false;
23604
23605 // If the target claims to have no vector registers don't attempt
23606 // vectorization.
23607 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
23608 LLVM_DEBUG(
23609 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
23610 return false;
23611 }
23612
23613 // Don't vectorize when the attribute NoImplicitFloat is used.
23614 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
23615 return false;
23616
23617 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
23618
23619 // Use the bottom up slp vectorizer to construct chains that start with
23620 // store instructions.
23621 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
23622
23623 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
23624 // delete instructions.
23625
23626 // Update DFS numbers now so that we can use them for ordering.
23627 DT->updateDFSNumbers();
23628
23629 // Scan the blocks in the function in post order.
23630 for (auto *BB : post_order(&F.getEntryBlock())) {
23632 continue;
23633
23634 // Start new block - clear the list of reduction roots.
23635 R.clearReductionData();
23636 collectSeedInstructions(BB);
23637
23638 // Vectorize trees that end at stores.
23639 if (!Stores.empty()) {
23640 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
23641 << " underlying objects.\n");
23642 Changed |= vectorizeStoreChains(R);
23643 }
23644
23645 // Vectorize trees that end at reductions.
23646 Changed |= vectorizeChainsInBlock(BB, R);
23647
23648 // Vectorize the index computations of getelementptr instructions. This
23649 // is primarily intended to catch gather-like idioms ending at
23650 // non-consecutive loads.
23651 if (!GEPs.empty()) {
23652 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
23653 << " underlying objects.\n");
23654 Changed |= vectorizeGEPIndices(BB, R);
23655 }
23656 }
23657
23658 if (Changed) {
23659 R.optimizeGatherSequence();
23660 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
23661 }
23662 return Changed;
23663}
23664
23665std::optional<bool>
23666SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
23667 unsigned Idx, unsigned MinVF,
23668 unsigned &Size) {
23669 Size = 0;
23670 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
23671 << "\n");
23672 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23673 unsigned VF = Chain.size();
23674
23675 if (!has_single_bit(Sz) ||
23676 !hasFullVectorsOrPowerOf2(
23677 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
23678 VF) ||
23679 VF < 2 || VF < MinVF) {
23680 // Check if vectorizing with a non-power-of-2 VF should be considered. At
23681 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
23682 // all vector lanes are used.
23683 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
23684 return false;
23685 }
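// Illustrative reading of the early exit above, assuming VectorizeNonPowerOf2
// is enabled: with MinVF == 4, a chain with VF == 3 is still analyzed because
// VF + 1 == MinVF (nearly all lanes of a 4-wide vector would be used), while
// a chain with VF == 2 bails out here.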
23686
23687 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
23688 << "\n");
23689
23690 SetVector<Value *> ValOps;
23691 for (Value *V : Chain)
23692 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
23693 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
23694 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
23695 InstructionsState S = Analysis.buildInstructionsState(
23696 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
23697 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
23698 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
23699 bool IsAllowedSize =
23700 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
23701 ValOps.size()) ||
23702 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
23703 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23704 (!S.getMainOp()->isSafeToRemove() ||
23705 any_of(ValOps.getArrayRef(),
23706 [&](Value *V) {
23707 return !isa<ExtractElementInst>(V) &&
23708 (V->getNumUses() > Chain.size() ||
23709 any_of(V->users(), [&](User *U) {
23710 return !Stores.contains(U);
23711 }));
23712 }))) ||
23713 (ValOps.size() > Chain.size() / 2 && !S)) {
23714 Size = (!IsAllowedSize && S) ? 1 : 2;
23715 return false;
23716 }
23717 }
23718 if (R.isLoadCombineCandidate(Chain))
23719 return true;
23720 R.buildTree(Chain);
23721 // Check if the tree is tiny and the store itself or its value operand was not vectorized.
23722 if (R.isTreeTinyAndNotFullyVectorizable()) {
23723 if (R.isGathered(Chain.front()) ||
23724 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
23725 return std::nullopt;
23726 Size = R.getCanonicalGraphSize();
23727 return false;
23728 }
23729 if (R.isProfitableToReorder()) {
23730 R.reorderTopToBottom();
23731 R.reorderBottomToTop();
23732 }
23733 R.transformNodes();
23734 R.computeMinimumValueSizes();
23735
23736 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
23737 R.buildExternalUses();
23738
23739 Size = R.getCanonicalGraphSize();
23740 if (S && S.getOpcode() == Instruction::Load)
23741 Size = 2; // Cut off small trees that would become masked gathers.
23742 InstructionCost Cost = R.getTreeCost(TreeCost);
23743
23744 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
23745 if (Cost < -SLPCostThreshold) {
23746 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
23747
23748 using namespace ore;
23749
23750 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
23751 cast<StoreInst>(Chain[0]))
23752 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
23753 << " and with tree size "
23754 << NV("TreeSize", R.getTreeSize()));
23755
23756 R.vectorizeTree();
23757 return true;
23758 }
23759
23760 return false;
23761}
23762
23763 /// Checks that the tree sizes are roughly uniform (variance below ~1% of the squared mean).
23764static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
23765 bool First) {
23766 unsigned Num = 0;
23767 uint64_t Sum = std::accumulate(
23768 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23769 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23770 unsigned Size = First ? Val.first : Val.second;
23771 if (Size == 1)
23772 return V;
23773 ++Num;
23774 return V + Size;
23775 });
23776 if (Num == 0)
23777 return true;
23778 uint64_t Mean = Sum / Num;
23779 if (Mean == 0)
23780 return true;
23781 uint64_t Dev = std::accumulate(
23782 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23783 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23784 unsigned P = First ? Val.first : Val.second;
23785 if (P == 1)
23786 return V;
23787 return V + (P - Mean) * (P - Mean);
23788 }) /
23789 Num;
23790 return Dev * 96 / (Mean * Mean) == 0;
23791}
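// Illustrative standalone rendering of the uniformity test above: sizes equal
// to 1 are ignored, and a slice is accepted only when the variance of the
// remaining sizes stays below roughly 1% of the squared mean
// (Dev * 96 < Mean * Mean). Plain C++ with no LLVM dependencies; the function
// name is hypothetical.
#include <cstdint>
#include <vector>

static bool treeSizesLookUniform(const std::vector<unsigned> &Sizes) {
  uint64_t Sum = 0, Num = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      Sum += S;
      ++Num;
    }
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      uint64_t D = S > Mean ? S - Mean : Mean - S;
      Dev += D * D;
    }
  Dev /= Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
// E.g. {4, 4, 5, 4} passes, while {2, 8, 2, 8} does not.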
23792
23793namespace {
23794
23795/// A group of stores that we'll try to bundle together using vector ops.
23796/// They are ordered using the signed distance of their address operand to the
23797/// address of this group's BaseInstr.
23798class RelatedStoreInsts {
23799public:
23800 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23801 : AllStores(AllStores) {
23802 reset(BaseInstrIdx);
23803 }
23804
23805 void reset(unsigned NewBaseInstr) {
23806 assert(NewBaseInstr < AllStores.size() &&
23807 "Instruction index out of bounds");
23808 BaseInstrIdx = NewBaseInstr;
23809 Instrs.clear();
23810 insertOrLookup(NewBaseInstr, 0);
23811 }
23812
23813 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23814 /// \p PtrDist.
23815 /// Does nothing if there is already a store with that \p PtrDist.
23816 /// \returns The previously associated Instruction index, or std::nullopt
23817 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23818 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23819 return Inserted ? std::nullopt : std::make_optional(It->second);
23820 }
23821
23822 using DistToInstMap = std::map<int64_t, unsigned>;
23823 const DistToInstMap &getStores() const { return Instrs; }
23824
23825 /// If \p SI is related to this group of stores, return the distance of its
23826 /// pointer operand to that of the group's BaseInstr.
23827 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23828 ScalarEvolution &SE) const {
23829 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23830 return getPointersDiff(
23831 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23832 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23833 /*StrictCheck=*/true);
23834 }
23835
23836 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23837 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23838 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23839 int64_t DistFromCurBase) {
23840 DistToInstMap PrevSet = std::move(Instrs);
23841 reset(NewBaseInstIdx);
23842
23843 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23844 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23845 // reference.
23846 for (auto [Dist, InstIdx] : PrevSet) {
23847 if (InstIdx >= MinSafeIdx)
23848 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23849 }
23850 }
23851
23852 /// Remove all stores that have been vectorized from this group.
23853 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23854 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23855 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23856 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23857 });
23858
23859 // Get a forward iterator pointing after the last vectorized store and erase
23860 // all stores before it so we don't try to vectorize them again.
23861 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23862 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23863 }
23864
23865private:
23866 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23867 unsigned BaseInstrIdx;
23868
23869 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23870 DistToInstMap Instrs;
23871
23872 /// Reference to all the stores in the BB being analyzed.
23873 ArrayRef<StoreInst *> AllStores;
23874};
23875
23876} // end anonymous namespace
23877
23878bool SLPVectorizerPass::vectorizeStores(
23879 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23880 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23881 &Visited) {
23882 // We may run into multiple chains that merge into a single chain. We mark the
23883 // stores that we vectorized so that we don't visit the same store twice.
23884 BoUpSLP::ValueSet VectorizedStores;
23885 bool Changed = false;
23886
23887 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23888 int64_t PrevDist = -1;
23889 BoUpSLP::ValueList Operands;
23890 // Collect the chain into a list.
23891 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23892 auto &[Dist, InstIdx] = Data;
23893 if (Operands.empty() || Dist - PrevDist == 1) {
23894 Operands.push_back(Stores[InstIdx]);
23895 PrevDist = Dist;
23896 if (Idx != StoreSeq.size() - 1)
23897 continue;
23898 }
23899 llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
23900 Operands.clear();
23901 Operands.push_back(Stores[InstIdx]);
23902 PrevDist = Dist;
23903 });
23904
23905 if (Operands.size() <= 1 ||
23906 !Visited
23907 .insert({Operands.front(),
23908 cast<StoreInst>(Operands.front())->getValueOperand(),
23909 Operands.back(),
23910 cast<StoreInst>(Operands.back())->getValueOperand(),
23911 Operands.size()})
23912 .second)
23913 continue;
23914
23915 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23916 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23917 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23918
23919 unsigned MaxVF =
23920 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23921 auto *Store = cast<StoreInst>(Operands[0]);
23922 Type *StoreTy = Store->getValueOperand()->getType();
23923 Type *ValueTy = StoreTy;
23924 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23925 ValueTy = Trunc->getSrcTy();
23926 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23927 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23928 // we need to use the element type of StoreTy and ValueTy to retrieve the
23929 // VF and then transform it back.
23930 // Remember: VF is defined as the number we want to vectorize, not the
23931 // number of elements in the final vector.
23932 Type *StoreScalarTy = StoreTy->getScalarType();
23933 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23934 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23935 ValueTy->getScalarType()));
23936 MinVF /= getNumElements(StoreTy);
23937 MinVF = std::max<unsigned>(2, MinVF);
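// Illustrative numbers for the computation above (hypothetical, the actual
// result is target dependent): with REVEC and StoreTy == <4 x i16>, suppose
// the scalar query returns a minimum of 8 i16 lanes; dividing by the 4
// elements already present in each stored vector gives MinVF == 2, i.e. at
// least two <4 x i16> stores per bundle.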
23938
23939 if (MaxVF < MinVF) {
23940 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23941 << ") < "
23942 << "MinVF (" << MinVF << ")\n");
23943 continue;
23944 }
23945
23946 unsigned NonPowerOf2VF = 0;
23947 if (VectorizeNonPowerOf2) {
23948 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23949 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23950 // lanes are used.
23951 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23952 if (has_single_bit(CandVF + 1)) {
23953 NonPowerOf2VF = CandVF;
23954 assert(NonPowerOf2VF != MaxVF &&
23955 "Non-power-of-2 VF should not be equal to MaxVF");
23956 }
23957 }
23958
23959 // MaxRegVF represents the number of instructions (scalar, or vector in
23960 // case of revec) that can be vectorized to naturally fit in a vector
23961 // register.
23962 unsigned MaxRegVF = MaxVF;
23963
23964 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23965 if (MaxVF < MinVF) {
23966 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23967 << ") < "
23968 << "MinVF (" << MinVF << ")\n");
23969 continue;
23970 }
23971
23972 SmallVector<unsigned> CandidateVFs;
23973 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23974 VF = divideCeil(VF, 2))
23975 CandidateVFs.push_back(VF);
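// Example of the candidate set built above: with MaxVF == 16 and MinVF == 2
// the loop produces {16, 8, 4, 2}; a non-power-of-2 start such as 9 would
// yield {9, 5, 3, 2} because divideCeil rounds up at each step.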
23976
23977 unsigned End = Operands.size();
23978 unsigned Repeat = 0;
23979 constexpr unsigned MaxAttempts = 4;
23980 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23981 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23982 P.first = P.second = 1;
23983 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23984 auto IsNotVectorized = [](bool First,
23985 const std::pair<unsigned, unsigned> &P) {
23986 return First ? P.first > 0 : P.second > 0;
23987 };
23988 auto IsVectorized = [](bool First,
23989 const std::pair<unsigned, unsigned> &P) {
23990 return First ? P.first == 0 : P.second == 0;
23991 };
23992 auto VFIsProfitable = [](bool First, unsigned Size,
23993 const std::pair<unsigned, unsigned> &P) {
23994 return First ? Size >= P.first : Size >= P.second;
23995 };
23996 auto FirstSizeSame = [](unsigned Size,
23997 const std::pair<unsigned, unsigned> &P) {
23998 return Size == P.first;
23999 };
24000 while (true) {
24001 ++Repeat;
24002 bool RepeatChanged = false;
24003 bool AnyProfitableGraph = false;
24004 for (unsigned VF : CandidateVFs) {
24005 AnyProfitableGraph = false;
24006 unsigned FirstUnvecStore =
24007 std::distance(RangeSizes.begin(),
24008 find_if(RangeSizes, std::bind(IsNotVectorized,
24009 VF >= MaxRegVF, _1)));
24010
24011 // Form slices of size VF starting from FirstUnvecStore and try to
24012 // vectorize them.
24013 while (FirstUnvecStore < End) {
24014 unsigned FirstVecStore = std::distance(
24015 RangeSizes.begin(),
24016 find_if(RangeSizes.drop_front(FirstUnvecStore),
24017 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
24018 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
24019 for (unsigned SliceStartIdx = FirstUnvecStore;
24020 SliceStartIdx + VF <= MaxSliceEnd;) {
24021 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
24022 VF >= MaxRegVF)) {
24023 ++SliceStartIdx;
24024 continue;
24025 }
24026 ArrayRef<Value *> Slice =
24027 ArrayRef(Operands).slice(SliceStartIdx, VF);
24028 assert(all_of(Slice,
24029 [&](Value *V) {
24030 return cast<StoreInst>(V)
24031 ->getValueOperand()
24032 ->getType() ==
24033 cast<StoreInst>(Slice.front())
24034 ->getValueOperand()
24035 ->getType();
24036 }) &&
24037 "Expected all operands of same type.");
24038 if (!NonSchedulable.empty()) {
24039 auto [NonSchedSizeMax, NonSchedSizeMin] =
24040 NonSchedulable.lookup(Slice.front());
24041 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24042 // VF is too ambitious. Try to vectorize another slice before
24043 // trying a smaller VF.
24044 SliceStartIdx += NonSchedSizeMax;
24045 continue;
24046 }
24047 }
24048 unsigned TreeSize;
24049 std::optional<bool> Res =
24050 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
24051 if (!Res) {
24052 // Update the range of non-schedulable VFs for slices starting
24053 // at SliceStartIdx.
24054 NonSchedulable
24055 .try_emplace(Slice.front(), std::make_pair(VF, VF))
24056 .first->getSecond()
24057 .second = VF;
24058 } else if (*Res) {
24059 // Mark the vectorized stores so that we don't vectorize them
24060 // again.
24061 VectorizedStores.insert_range(Slice);
24062 // Record that this attempt made progress so the surrounding loops know
24063 // vectorization succeeded for this slice.
24064 AnyProfitableGraph = RepeatChanged = Changed = true;
24065 // If we vectorized the initial block, there is no need to try to
24066 // vectorize it again.
24067 for (std::pair<unsigned, unsigned> &P :
24068 RangeSizes.slice(SliceStartIdx, VF))
24069 P.first = P.second = 0;
24070 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24071 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
24072 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
24073 P.first = P.second = 0;
24074 FirstUnvecStore = SliceStartIdx + VF;
24075 }
24076 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24077 for (std::pair<unsigned, unsigned> &P :
24078 RangeSizes.slice(SliceStartIdx + VF,
24079 MaxSliceEnd - (SliceStartIdx + VF)))
24080 P.first = P.second = 0;
24081 if (MaxSliceEnd == End)
24082 End = SliceStartIdx;
24083 MaxSliceEnd = SliceStartIdx;
24084 }
24085 SliceStartIdx += VF;
24086 continue;
24087 }
24088 if (VF > 2 && Res &&
24089 !all_of(RangeSizes.slice(SliceStartIdx, VF),
24090 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
24091 _1))) {
24092 SliceStartIdx += VF;
24093 continue;
24094 }
24095 // For very big VFs, check that we're not rebuilding the same
24096 // trees, just with a larger number of elements.
24097 if (VF > MaxRegVF && TreeSize > 1 &&
24098 all_of(RangeSizes.slice(SliceStartIdx, VF),
24099 std::bind(FirstSizeSame, TreeSize, _1))) {
24100 SliceStartIdx += VF;
24101 while (SliceStartIdx != MaxSliceEnd &&
24102 RangeSizes[SliceStartIdx].first == TreeSize)
24103 ++SliceStartIdx;
24104 continue;
24105 }
24106 if (TreeSize > 1) {
24107 for (std::pair<unsigned, unsigned> &P :
24108 RangeSizes.slice(SliceStartIdx, VF)) {
24109 if (VF >= MaxRegVF)
24110 P.second = std::max(P.second, TreeSize);
24111 else
24112 P.first = std::max(P.first, TreeSize);
24113 }
24114 }
24115 ++SliceStartIdx;
24116 AnyProfitableGraph = true;
24117 }
24118 if (FirstUnvecStore >= End)
24119 break;
24120 if (MaxSliceEnd - FirstUnvecStore < VF &&
24121 MaxSliceEnd - FirstUnvecStore >= MinVF)
24122 AnyProfitableGraph = true;
24123 FirstUnvecStore = std::distance(
24124 RangeSizes.begin(),
24125 find_if(RangeSizes.drop_front(MaxSliceEnd),
24126 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
24127 }
24128 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
24129 break;
24130 }
24131 // All values vectorized - exit.
24132 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
24133 return P.first == 0 && P.second == 0;
24134 }))
24135 break;
24136 // Check if we have exhausted all attempts or no further attempts are needed.
24137 if (Repeat >= MaxAttempts ||
24138 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24139 break;
24140 constexpr unsigned StoresLimit = 64;
24141 const unsigned MaxTotalNum = std::min<unsigned>(
24142 Operands.size(),
24143 static_cast<unsigned>(
24144 End -
24145 std::distance(
24146 RangeSizes.begin(),
24147 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
24148 1));
24149 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
24150 unsigned Limit =
24151 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
24152 CandidateVFs.clear();
24153 if (bit_floor(Limit) == VF)
24154 CandidateVFs.push_back(Limit);
24155 if (VF > MaxTotalNum || VF >= StoresLimit)
24156 break;
24157 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
24158 if (P.first != 0)
24159 P.first = std::max(P.second, P.first);
24160 }
24161 // Last attempt to vectorize the maximum number of elements, if all previous
24162 // attempts were unsuccessful because of cost issues.
24163 CandidateVFs.push_back(VF);
24164 }
24165 }
24166 };
24167
24168 /// Groups of stores to vectorize
24169 SmallVector<RelatedStoreInsts> SortedStores;
24170
24171 // Inserts the specified store SI with the given index Idx to the set of the
24172 // stores. If a store with the same distance has already been seen, stop
24173 // inserting and try to vectorize the stores collected so far. If some stores
24174 // from this sequence were not vectorized, try to vectorize them together
24175 // with the new store later. This logic is applied only to the stores that
24176 // come before the previous store with the same distance.
24177 // Example:
24178 // 1. store x, %p
24179 // 2. store y, %p+1
24180 // 3. store z, %p+2
24181 // 4. store a, %p
24182 // 5. store b, %p+3
24183 // - Scan this from the last to first store. The very first bunch of stores is
24184 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
24185 // vector).
24186 // - The next store in the list - #1 - has the same distance from store #5 as
24187 // the store #4.
24188 // - Try to vectorize sequence of stores 4,2,3,5.
24189 // - If all these stores are vectorized - just drop them.
24190 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
24191 // - Start new stores sequence.
24192 // The new bunch of stores is {1, {1, 0}}.
24193 // - Add the stores from previous sequence, that were not vectorized.
24194 // Here we consider the stores in reversed order relative to how they appear
24195 // in the IR (Stores are reversed already, see vectorizeStoreChains()).
24196 // Store #3 can be added -> comes after store #4 with the same distance as
24197 // store #1.
24198 // Store #5 cannot be added - comes before store #4.
24199 // This logic improves compile time: we assume that stores coming after the
24200 // previous store with the same distance most likely have memory dependencies,
24201 // so there is no need to waste compile time trying to vectorize them.
24202 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
24203 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
24204 std::optional<int64_t> PtrDist;
24205 auto *RelatedStores = find_if(
24206 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
24207 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
24208 return PtrDist.has_value();
24209 });
24210
24211 // We did not find a comparable store, start a new group.
24212 if (RelatedStores == SortedStores.end()) {
24213 SortedStores.emplace_back(Idx, Stores);
24214 return;
24215 }
24216
24217 // If there is already a store in the group with the same PtrDiff, try to
24218 // vectorize the existing instructions before adding the current store.
24219 // Otherwise, insert this store and keep collecting.
24220 if (std::optional<unsigned> PrevInst =
24221 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
24222 TryToVectorize(RelatedStores->getStores());
24223 RelatedStores->clearVectorizedStores(VectorizedStores);
24224 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
24225 /*NewBaseInstIdx=*/Idx,
24226 /*DistFromCurBase=*/*PtrDist);
24227 }
24228 };
24229 Type *PrevValTy = nullptr;
24230 for (auto [I, SI] : enumerate(Stores)) {
24231 if (R.isDeleted(SI))
24232 continue;
24233 if (!PrevValTy)
24234 PrevValTy = SI->getValueOperand()->getType();
24235 // Check that we do not try to vectorize stores of different types.
24236 if (PrevValTy != SI->getValueOperand()->getType()) {
24237 for (RelatedStoreInsts &StoreSeq : SortedStores)
24238 TryToVectorize(StoreSeq.getStores());
24239 SortedStores.clear();
24240 PrevValTy = SI->getValueOperand()->getType();
24241 }
24242 FillStoresSet(I, SI);
24243 }
24244
24245 // Final vectorization attempt.
24246 for (RelatedStoreInsts &StoreSeq : SortedStores)
24247 TryToVectorize(StoreSeq.getStores());
24248
24249 return Changed;
24250}
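// Illustrative sketch of the slicing step performed by the TryToVectorize
// lambda above: the distance -> index map is walked in ascending distance
// order and cut into maximal runs of consecutive distances, each run being a
// candidate store chain. Plain C++ with no LLVM containers; the function name
// is hypothetical.
#include <cstdint>
#include <map>
#include <vector>

static std::vector<std::vector<unsigned>>
splitIntoConsecutiveRuns(const std::map<int64_t, unsigned> &DistToIdx) {
  std::vector<std::vector<unsigned>> Runs;
  int64_t PrevDist = 0;
  for (const auto &[Dist, Idx] : DistToIdx) {
    // Start a new run unless this store is exactly one element past the
    // previous one.
    if (Runs.empty() || Dist != PrevDist + 1)
      Runs.emplace_back();
    Runs.back().push_back(Idx);
    PrevDist = Dist;
  }
  return Runs;
}
// E.g. distances {-3, -2, 0, 1, 2} produce two runs: {-3, -2} and {0, 1, 2}.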
24251
24252void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24253 // Initialize the collections. We will make a single pass over the block.
24254 Stores.clear();
24255 GEPs.clear();
24256
24257 // Visit the store and getelementptr instructions in BB and organize them in
24258 // Stores and GEPs according to the underlying objects of their pointer
24259 // operands.
24260 for (Instruction &I : *BB) {
24261 // Ignore store instructions that are not simple or whose stored value
24262 // does not have a valid element type.
24263 if (auto *SI = dyn_cast<StoreInst>(&I)) {
24264 if (!SI->isSimple())
24265 continue;
24266 if (!isValidElementType(SI->getValueOperand()->getType()))
24267 continue;
24268 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
24269 }
24270
24271 // Ignore getelementptr instructions that have more than one index, a
24272 // constant index, or a pointer operand that doesn't point to a scalar
24273 // type.
24274 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
24275 if (GEP->getNumIndices() != 1)
24276 continue;
24277 Value *Idx = GEP->idx_begin()->get();
24278 if (isa<Constant>(Idx))
24279 continue;
24280 if (!isValidElementType(Idx->getType()))
24281 continue;
24282 if (GEP->getType()->isVectorTy())
24283 continue;
24284 GEPs[GEP->getPointerOperand()].push_back(GEP);
24285 }
24286 }
24287}
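// Illustrative sketch of the seed bucketing above: stores are grouped by the
// underlying object of their pointer operand so that later phases only try
// to chain stores that could possibly be adjacent. Plain C++ stand-in; the
// string key merely imitates whatever getUnderlyingObject returns in the
// real pass.
#include <string>
#include <unordered_map>
#include <vector>

struct SeedStore {
  std::string BaseObject; // stand-in for the underlying object
  unsigned InstrIdx;      // position of the store within the block
};

static std::unordered_map<std::string, std::vector<unsigned>>
bucketSeedsByBase(const std::vector<SeedStore> &Seeds) {
  std::unordered_map<std::string, std::vector<unsigned>> Buckets;
  for (const SeedStore &S : Seeds)
    Buckets[S.BaseObject].push_back(S.InstrIdx);
  return Buckets;
}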
24288
24289bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
24290 bool MaxVFOnly) {
24291 if (VL.size() < 2)
24292 return false;
24293
24294 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
24295 << VL.size() << ".\n");
24296
24297 // Check that all of the parts are instructions of the same type,
24298 // we permit an alternate opcode via InstructionsState.
24299 InstructionsState S = getSameOpcode(VL, *TLI);
24300 if (!S)
24301 return false;
24302
24303 Instruction *I0 = S.getMainOp();
24304 // Make sure invalid types (including vector types) are rejected before
24305 // determining the vectorization factor for scalar instructions.
24306 for (Value *V : VL) {
24307 Type *Ty = V->getType();
24309 // NOTE: the following will give the user an internal LLVM type name,
24310 // which may not be useful.
24311 R.getORE()->emit([&]() {
24312 std::string TypeStr;
24313 llvm::raw_string_ostream OS(TypeStr);
24314 Ty->print(OS);
24315 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
24316 << "Cannot SLP vectorize list: type "
24317 << TypeStr + " is unsupported by vectorizer";
24318 });
24319 return false;
24320 }
24321 }
24322
24323 Type *ScalarTy = getValueType(VL[0]);
24324 unsigned Sz = R.getVectorElementSize(I0);
24325 unsigned MinVF = R.getMinVF(Sz);
24326 unsigned MaxVF = std::max<unsigned>(
24327 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
24328 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
24329 if (MaxVF < 2) {
24330 R.getORE()->emit([&]() {
24331 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
24332 << "Cannot SLP vectorize list: vectorization factor "
24333 << "less than 2 is not supported";
24334 });
24335 return false;
24336 }
24337
24338 bool Changed = false;
24339 bool CandidateFound = false;
24340 InstructionCost MinCost = SLPCostThreshold.getValue();
24341
24342 unsigned NextInst = 0, MaxInst = VL.size();
24343 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24344 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
24345 // No actual vectorization should happen if the number of parts is the same
24346 // as the provided vectorization factor (i.e. the scalar type is used for
24347 // vector code during codegen).
24348 auto *VecTy = getWidenedType(ScalarTy, VF);
24349 if (TTI->getNumberOfParts(VecTy) == VF)
24350 continue;
24351 for (unsigned I = NextInst; I < MaxInst; ++I) {
24352 unsigned ActualVF = std::min(MaxInst - I, VF);
24353
24354 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
24355 continue;
24356
24357 if (MaxVFOnly && ActualVF < MaxVF)
24358 break;
24359 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24360 break;
24361
24362 SmallVector<Value *> Ops(ActualVF, nullptr);
24363 unsigned Idx = 0;
24364 for (Value *V : VL.drop_front(I)) {
24365 // Check that a previous iteration of this loop did not delete the
24366 // Value.
24367 if (auto *Inst = dyn_cast<Instruction>(V);
24368 !Inst || !R.isDeleted(Inst)) {
24369 Ops[Idx] = V;
24370 ++Idx;
24371 if (Idx == ActualVF)
24372 break;
24373 }
24374 }
24375 // Not enough vectorizable instructions - exit.
24376 if (Idx != ActualVF)
24377 break;
24378
24379 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
24380 << "\n");
24381
24382 R.buildTree(Ops);
24383 if (R.isTreeTinyAndNotFullyVectorizable())
24384 continue;
24385 if (R.isProfitableToReorder()) {
24386 R.reorderTopToBottom();
24387 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
24388 }
24389 R.transformNodes();
24390 R.computeMinimumValueSizes();
24391 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
24392 R.buildExternalUses();
24393
24394 InstructionCost Cost = R.getTreeCost(TreeCost);
24395 CandidateFound = true;
24396 MinCost = std::min(MinCost, Cost);
24397
24398 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24399 << " for VF=" << ActualVF << "\n");
24400 if (Cost < -SLPCostThreshold) {
24401 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
24402 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
24404 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
24405 << " and with tree size "
24406 << ore::NV("TreeSize", R.getTreeSize()));
24407
24408 R.vectorizeTree();
24409 // Move to the next bundle.
24410 I += VF - 1;
24411 NextInst = I + 1;
24412 Changed = true;
24413 }
24414 }
24415 }
24416
24417 if (!Changed && CandidateFound) {
24418 R.getORE()->emit([&]() {
24419 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
24420 << "List vectorization was possible but not beneficial with cost "
24421 << ore::NV("Cost", MinCost) << " >= "
24422 << ore::NV("Treshold", -SLPCostThreshold);
24423 });
24424 } else if (!Changed) {
24425 R.getORE()->emit([&]() {
24426 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
24427 << "Cannot SLP vectorize list: vectorization was impossible"
24428 << " with available vectorization factors";
24429 });
24430 }
24431 return Changed;
24432}
24433
24434namespace {
24435
24436/// Model horizontal reductions.
24437///
24438/// A horizontal reduction is a tree of reduction instructions that has values
24439/// that can be put into a vector as its leaves. For example:
24440///
24441/// mul mul mul mul
24442/// \ / \ /
24443/// + +
24444/// \ /
24445/// +
24446/// This tree has "mul" as its leaf values and "+" as its reduction
24447/// instructions. A reduction can feed into a store or a binary operation
24448/// feeding a phi.
24449/// ...
24450/// \ /
24451/// +
24452/// |
24453/// phi +=
24454///
24455/// Or:
24456/// ...
24457/// \ /
24458/// +
24459/// |
24460/// *p =
24461///
24462class HorizontalReduction {
24463 using ReductionOpsType = SmallVector<Value *, 16>;
24464 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24465 ReductionOpsListType ReductionOps;
24466 /// List of possibly reduced values.
24467 SmallVector<SmallVector<Value *>> ReducedVals;
24468 /// Maps reduced value to the corresponding reduction operation.
24469 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24470 WeakTrackingVH ReductionRoot;
24471 /// The type of reduction operation.
24472 RecurKind RdxKind;
24473 /// Checks if the optimization of original scalar identity operations on
24474 /// matched horizontal reductions is enabled and allowed.
24475 bool IsSupportedHorRdxIdentityOp = false;
24476 /// The minimum number of the reduced values.
24477 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
24478 /// Contains vector values for reduction including their scale factor and
24479 /// signedness.
24481
24482 static bool isCmpSelMinMax(Instruction *I) {
24483 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
24484 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
24485 }
24486
24487 // And/or are potentially poison-safe logical patterns like:
24488 // select x, y, false
24489 // select x, true, y
24490 static bool isBoolLogicOp(Instruction *I) {
24491 return isa<SelectInst>(I) &&
24492 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
24493 }
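// Concrete forms matched above (illustrative IR):
//   %and = select i1 %x, i1 %y, i1 false   ; poison-safe logical and
//   %or  = select i1 %x, i1 true, i1 %y    ; poison-safe logical or
// Unlike plain 'and'/'or', these do not propagate poison from %y when %x
// already determines the result.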
24494
24495 /// Checks if instruction is associative and can be vectorized.
24496 static bool isVectorizable(RecurKind Kind, Instruction *I,
24497 bool TwoElementReduction = false) {
24498 if (Kind == RecurKind::None)
24499 return false;
24500
24501 // Integer ops that map to select instructions or intrinsics are fine.
24502 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
24503 isBoolLogicOp(I))
24504 return true;
24505
24506 // No need to check for associativity if there are only 2 reduced values.
24507 if (TwoElementReduction)
24508 return true;
24509
24510 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24511 // FP min/max are associative except for NaN and -0.0. We do not
24512 // have to rule out -0.0 here because the intrinsic semantics do not
24513 // specify a fixed result for it.
24514 return I->getFastMathFlags().noNaNs();
24515 }
24516
24517 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24518 return true;
24519
24520 return I->isAssociative();
24521 }
24522
24523 static Value *getRdxOperand(Instruction *I, unsigned Index) {
24524 // Poison-safe 'or' takes the form: select X, true, Y
24525 // To make that work with the normal operand processing, we skip the
24526 // true value operand.
24527 // TODO: Change the code and data structures to handle this without a hack.
24528 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
24529 return I->getOperand(2);
24530 return I->getOperand(Index);
24531 }
24532
24533 /// Creates reduction operation with the current opcode.
24534 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
24535 Value *RHS, const Twine &Name, bool UseSelect) {
24536 Type *OpTy = LHS->getType();
24537 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
24538 switch (Kind) {
24539 case RecurKind::Or: {
24540 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24541 return Builder.CreateSelectWithUnknownProfile(
24542 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
24543 RHS, DEBUG_TYPE, Name);
24544 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24545 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24546 Name);
24547 }
24548 case RecurKind::And: {
24549 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24550 return Builder.CreateSelectWithUnknownProfile(
24551 LHS, RHS,
24552 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
24553 DEBUG_TYPE, Name);
24554 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24555 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24556 Name);
24557 }
24558 case RecurKind::Add:
24559 case RecurKind::Mul:
24560 case RecurKind::Xor:
24561 case RecurKind::FAdd:
24562 case RecurKind::FMul: {
24563 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24564 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24565 Name);
24566 }
24567 case RecurKind::SMax:
24568 case RecurKind::SMin:
24569 case RecurKind::UMax:
24570 case RecurKind::UMin:
24571 if (UseSelect) {
24572 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
24573 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
24574 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
24575 Name);
24576 }
24577 [[fallthrough]];
24578 case RecurKind::FMax:
24579 case RecurKind::FMin:
24580 case RecurKind::FMaximum:
24581 case RecurKind::FMinimum:
24582 case RecurKind::FMaximumNum:
24583 case RecurKind::FMinimumNum: {
24584 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
24585 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
24586 }
24587 default:
24588 llvm_unreachable("Unknown reduction operation.");
24589 }
24590 }
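// Illustrative result of the helper above for RecurKind::UMax with
// UseSelect == true (cmp + select form):
//   %cmp = icmp ugt i32 %lhs, %rhs
//   %max = select i1 %cmp, i32 %lhs, i32 %rhs
// With UseSelect == false it falls through to the llvm.umax intrinsic via
// CreateBinaryIntrinsic instead.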
24591
24592 /// Creates reduction operation with the current opcode with the IR flags
24593 /// from \p ReductionOps, dropping nuw/nsw flags.
24594 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
24595 Value *RHS, const Twine &Name,
24596 const ReductionOpsListType &ReductionOps) {
24597 bool UseSelect = ReductionOps.size() == 2 ||
24598 // Logical or/and.
24599 (ReductionOps.size() == 1 &&
24600 any_of(ReductionOps.front(), IsaPred<SelectInst>));
24601 assert((!UseSelect || ReductionOps.size() != 2 ||
24602 isa<SelectInst>(ReductionOps[1][0])) &&
24603 "Expected cmp + select pairs for reduction");
24604 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
24605 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
24606 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
24607 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
24608 /*IncludeWrapFlags=*/false);
24609 propagateIRFlags(Op, ReductionOps[1], nullptr,
24610 /*IncludeWrapFlags=*/false);
24611 return Op;
24612 }
24613 }
24614 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
24615 return Op;
24616 }
24617
24618public:
24619 static RecurKind getRdxKind(Value *V) {
24620 auto *I = dyn_cast<Instruction>(V);
24621 if (!I)
24622 return RecurKind::None;
24623 if (match(I, m_Add(m_Value(), m_Value())))
24624 return RecurKind::Add;
24625 if (match(I, m_Mul(m_Value(), m_Value())))
24626 return RecurKind::Mul;
24627 if (match(I, m_And(m_Value(), m_Value())) ||
24628 match(I, m_LogicalAnd(m_Value(), m_Value())))
24629 return RecurKind::And;
24630 if (match(I, m_Or(m_Value(), m_Value())) ||
24631 match(I, m_LogicalOr(m_Value(), m_Value())))
24632 return RecurKind::Or;
24633 if (match(I, m_Xor(m_Value(), m_Value())))
24634 return RecurKind::Xor;
24635 if (match(I, m_FAdd(m_Value(), m_Value())))
24636 return RecurKind::FAdd;
24637 if (match(I, m_FMul(m_Value(), m_Value())))
24638 return RecurKind::FMul;
24639
24641 return RecurKind::FMax;
24643 return RecurKind::FMin;
24644
24645 if (match(I, m_FMaximum(m_Value(), m_Value())))
24646 return RecurKind::FMaximum;
24647 if (match(I, m_FMinimum(m_Value(), m_Value())))
24648 return RecurKind::FMinimum;
24649 // This matches either cmp+select or intrinsics. SLP is expected to handle
24650 // either form.
24651 // TODO: If we are canonicalizing to intrinsics, we can remove several
24652 // special-case paths that deal with selects.
24653 if (match(I, m_SMax(m_Value(), m_Value())))
24654 return RecurKind::SMax;
24655 if (match(I, m_SMin(m_Value(), m_Value())))
24656 return RecurKind::SMin;
24657 if (match(I, m_UMax(m_Value(), m_Value())))
24658 return RecurKind::UMax;
24659 if (match(I, m_UMin(m_Value(), m_Value())))
24660 return RecurKind::UMin;
24661
24662 if (auto *Select = dyn_cast<SelectInst>(I)) {
24663 // Try harder: look for a min/max pattern based on instructions producing
24664 // the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
24665 // During the intermediate stages of SLP, it's very common to have
24666 // patterns like this (since optimizeGatherSequence is run only once
24667 // at the end):
24668 // %1 = extractelement <2 x i32> %a, i32 0
24669 // %2 = extractelement <2 x i32> %a, i32 1
24670 // %cond = icmp sgt i32 %1, %2
24671 // %3 = extractelement <2 x i32> %a, i32 0
24672 // %4 = extractelement <2 x i32> %a, i32 1
24673 // %select = select i1 %cond, i32 %3, i32 %4
24674 CmpPredicate Pred;
24675 Instruction *L1;
24676 Instruction *L2;
24677
24678 Value *LHS = Select->getTrueValue();
24679 Value *RHS = Select->getFalseValue();
24680 Value *Cond = Select->getCondition();
24681
24682 // TODO: Support inverse predicates.
24683 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
24686 return RecurKind::None;
24687 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
24690 return RecurKind::None;
24691 } else {
24693 return RecurKind::None;
24694 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
24697 return RecurKind::None;
24698 }
24699
24700 switch (Pred) {
24701 default:
24702 return RecurKind::None;
24703 case CmpInst::ICMP_SGT:
24704 case CmpInst::ICMP_SGE:
24705 return RecurKind::SMax;
24706 case CmpInst::ICMP_SLT:
24707 case CmpInst::ICMP_SLE:
24708 return RecurKind::SMin;
24709 case CmpInst::ICMP_UGT:
24710 case CmpInst::ICMP_UGE:
24711 return RecurKind::UMax;
24712 case CmpInst::ICMP_ULT:
24713 case CmpInst::ICMP_ULE:
24714 return RecurKind::UMin;
24715 }
24716 }
24717 return RecurKind::None;
24718 }
24719
24720 /// Get the index of the first operand.
24721 static unsigned getFirstOperandIndex(Instruction *I) {
24722 return isCmpSelMinMax(I) ? 1 : 0;
24723 }
24724
24725private:
24726 /// Total number of operands in the reduction operation.
24727 static unsigned getNumberOfOperands(Instruction *I) {
24728 return isCmpSelMinMax(I) ? 3 : 2;
24729 }
24730
24731 /// Checks if the instruction is in basic block \p BB.
24732 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
24733 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24734 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24735 auto *Sel = cast<SelectInst>(I);
24736 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
24737 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24738 }
24739 return I->getParent() == BB;
24740 }
24741
24742 /// Expected number of uses for reduction operations/reduced values.
24743 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24744 if (IsCmpSelMinMax) {
24745 // The SelectInst must be used twice, while the condition op must have a
24746 // single use only.
24747 if (auto *Sel = dyn_cast<SelectInst>(I))
24748 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24749 return I->hasNUses(2);
24750 }
24751
24752 // Arithmetic reduction operation must be used once only.
24753 return I->hasOneUse();
24754 }
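// Illustrative IR for the cmp + select case above:
//   %c  = icmp sgt i32 %s1, %v       ; %c has a single use (the select)
//   %s2 = select i1 %c, i32 %s1, i32 %v
// Inside a min/max reduction chain %s1 is used exactly twice: once by the
// compare %c and once as a select operand of %s2.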
24755
24756 /// Initializes the list of reduction operations.
24757 void initReductionOps(Instruction *I) {
24758 if (isCmpSelMinMax(I))
24759 ReductionOps.assign(2, ReductionOpsType());
24760 else
24761 ReductionOps.assign(1, ReductionOpsType());
24762 }
24763
24764 /// Add all reduction operations for the reduction instruction \p I.
24765 void addReductionOps(Instruction *I) {
24766 if (isCmpSelMinMax(I)) {
24767 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
24768 ReductionOps[1].emplace_back(I);
24769 } else {
24770 ReductionOps[0].emplace_back(I);
24771 }
24772 }
24773
24774 static bool isGoodForReduction(ArrayRef<Value *> Data) {
24775 int Sz = Data.size();
24776 auto *I = dyn_cast<Instruction>(Data.front());
24777 return Sz > 1 || isConstant(Data.front()) ||
24778 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
24779 }
24780
24781public:
24782 HorizontalReduction() = default;
24783 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
24784 : ReductionRoot(I), ReductionLimit(2) {
24785 RdxKind = HorizontalReduction::getRdxKind(I);
24786 ReductionOps.emplace_back().push_back(I);
24787 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
24788 for (Value *V : Ops)
24789 ReducedValsToOps[V].push_back(I);
24790 }
24791
24792 bool matchReductionForOperands() const {
24793 // Analyze "regular" integer/FP types for reductions - no target-specific
24794 // types or pointers.
24795 assert(ReductionRoot && "Reduction root is not set!");
24796 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
24797 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24798 return Ops.size() == 2;
24799 })))
24800 return false;
24801
24802 return true;
24803 }
24804
24805 /// Try to find a reduction tree.
24806 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24807 ScalarEvolution &SE, const DataLayout &DL,
24808 const TargetLibraryInfo &TLI) {
24809 RdxKind = HorizontalReduction::getRdxKind(Root);
24810 if (!isVectorizable(RdxKind, Root))
24811 return false;
24812
24813 // Analyze "regular" integer/FP types for reductions - no target-specific
24814 // types or pointers.
24815 Type *Ty = Root->getType();
24816 if (!isValidElementType(Ty) || Ty->isPointerTy())
24817 return false;
24818
24819 // Though the ultimate reduction may have multiple uses, its condition must
24820 // have only a single use.
24821 if (auto *Sel = dyn_cast<SelectInst>(Root))
24822 if (!Sel->getCondition()->hasOneUse())
24823 return false;
24824
24825 ReductionRoot = Root;
24826
24827 // Iterate through all the operands of the possible reduction tree and
24828 // gather all the reduced values, sorting them by their value id.
24829 BasicBlock *BB = Root->getParent();
24830 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24831 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24832 1, std::make_pair(Root, 0));
24833 // Checks if the operands of the \p TreeN instruction are also reduction
24834 // operations or should be treated as reduced values or an extra argument,
24835 // which is not part of the reduction.
24836 auto CheckOperands = [&](Instruction *TreeN,
24837 SmallVectorImpl<Value *> &PossibleReducedVals,
24838 SmallVectorImpl<Instruction *> &ReductionOps,
24839 unsigned Level) {
24840 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24841 getNumberOfOperands(TreeN)))) {
24842 Value *EdgeVal = getRdxOperand(TreeN, I);
24843 ReducedValsToOps[EdgeVal].push_back(TreeN);
24844 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24845 // If the edge is not an instruction, differs from the main reduction
24846 // opcode, or has too many uses, treat it as a possible reduced value.
24847 // Also, do not try to reduce constant values if the operation is not
24848 // foldable.
24849 if (!EdgeInst || Level > RecursionMaxDepth ||
24850 getRdxKind(EdgeInst) != RdxKind ||
24851 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24852 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24853 !isVectorizable(RdxKind, EdgeInst) ||
24854 (R.isAnalyzedReductionRoot(EdgeInst) &&
24855 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24856 PossibleReducedVals.push_back(EdgeVal);
24857 continue;
24858 }
24859 ReductionOps.push_back(EdgeInst);
24860 }
24861 };
24862 // Try to regroup reduced values so that it gets more profitable to try to
24863 // reduce them. Values are grouped by their value ids, instructions by their
24864 // instruction op id and/or alternate op id, with extra analysis for
24865 // loads (grouping them by the distance between pointers) and cmp
24866 // instructions (grouping them by the predicate).
24867 SmallMapVector<
24868 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24869 8>
24870 PossibleReducedVals;
24871 initReductionOps(Root);
24872 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24873 SmallSet<size_t, 2> LoadKeyUsed;
24874
24875 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24877 Value *Ptr =
24879 if (!LoadKeyUsed.insert(Key).second) {
24880 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24881 if (LIt != LoadsMap.end()) {
24882 for (LoadInst *RLI : LIt->second) {
24883 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24884 LI->getType(), LI->getPointerOperand(), DL, SE,
24885 /*StrictCheck=*/true))
24886 return hash_value(RLI->getPointerOperand());
24887 }
24888 for (LoadInst *RLI : LIt->second) {
24890 LI->getPointerOperand(), TLI)) {
24891 hash_code SubKey = hash_value(RLI->getPointerOperand());
24892 return SubKey;
24893 }
24894 }
24895 if (LIt->second.size() > 2) {
24896 hash_code SubKey =
24897 hash_value(LIt->second.back()->getPointerOperand());
24898 return SubKey;
24899 }
24900 }
24901 }
24902 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24903 .first->second.push_back(LI);
24904 return hash_value(LI->getPointerOperand());
24905 };
24906
24907 while (!Worklist.empty()) {
24908 auto [TreeN, Level] = Worklist.pop_back_val();
24909 SmallVector<Value *> PossibleRedVals;
24910 SmallVector<Instruction *> PossibleReductionOps;
24911 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24912 addReductionOps(TreeN);
24913 // Add reduction values. The values are sorted for better vectorization
24914 // results.
24915 for (Value *V : PossibleRedVals) {
24916 size_t Key, Idx;
24917 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24918 /*AllowAlternate=*/false);
24919 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24920 }
24921 for (Instruction *I : reverse(PossibleReductionOps))
24922 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24923 }
24924 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24925 // Sort values by the total number of value kinds to start the reduction
24926 // from the longest possible sequences of reduced values.
24927 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24928 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24929 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24930 for (auto &Slice : PossibleRedVals) {
24931 PossibleRedValsVect.emplace_back();
24932 auto RedValsVect = Slice.second.takeVector();
24933 stable_sort(RedValsVect, llvm::less_second());
24934 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24935 PossibleRedValsVect.back().append(Data.second, Data.first);
24936 }
24937 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24938 return P1.size() > P2.size();
24939 });
24940 bool First = true;
24941 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24942 if (First) {
24943 First = false;
24944 ReducedVals.emplace_back();
24945 } else if (!isGoodForReduction(Data)) {
24946 auto *LI = dyn_cast<LoadInst>(Data.front());
24947 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24948 if (!LI || !LastLI ||
24949 getUnderlyingObject(LI->getPointerOperand()) !=
24950 getUnderlyingObject(LastLI->getPointerOperand()))
24951 ReducedVals.emplace_back();
24952 }
24953 ReducedVals.back().append(Data.rbegin(), Data.rend());
24954 }
24955 }
24956 // Sort the reduced values by the number of same/alternate opcodes and/or
24957 // pointer operands.
24958 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24959 return P1.size() > P2.size();
24960 });
24961 return true;
24962 }
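// Illustrative sketch of the ordering step above: candidate reduced values
// are bucketed by a (key, sub-key) pair and the buckets are then sorted by
// size so that the longest uniform run is reduced first. Plain C++; the hash
// key simply stands in for what generateKeySubkey produces in the pass, and
// the function name is hypothetical.
#include <algorithm>
#include <cstddef>
#include <map>
#include <utility>
#include <vector>

template <typename ValueT>
static std::vector<std::vector<ValueT>>
groupAndOrderBySize(const std::vector<std::pair<std::size_t, ValueT>> &Keyed) {
  std::map<std::size_t, std::vector<ValueT>> Buckets;
  for (const auto &Entry : Keyed)
    Buckets[Entry.first].push_back(Entry.second);
  std::vector<std::vector<ValueT>> Groups;
  for (auto &Entry : Buckets)
    Groups.push_back(std::move(Entry.second));
  // Longest group first, mirroring the stable_sort on ReducedVals.
  std::stable_sort(
      Groups.begin(), Groups.end(),
      [](const auto &A, const auto &B) { return A.size() > B.size(); });
  return Groups;
}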
24963
24964 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24965 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24966 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24967 DominatorTree &DT) {
24968 constexpr unsigned RegMaxNumber = 4;
24969 constexpr unsigned RedValsMaxNumber = 128;
24970 // If there are a sufficient number of reduction values, reduce
24971 // to a nearby power-of-2. We can safely generate oversized
24972 // vectors and rely on the backend to split them to legal sizes.
24973 if (unsigned NumReducedVals = std::accumulate(
24974 ReducedVals.begin(), ReducedVals.end(), 0,
24975 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24976 if (!isGoodForReduction(Vals))
24977 return Num;
24978 return Num + Vals.size();
24979 });
24980 NumReducedVals < ReductionLimit &&
24981 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24982 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24983 })) {
24984 for (ReductionOpsType &RdxOps : ReductionOps)
24985 for (Value *RdxOp : RdxOps)
24986 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24987 return nullptr;
24988 }
24989
24990 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24991 TargetFolder(DL));
24992 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24993
24994 // Track the reduced values in case they are replaced by extractelement
24995 // instructions because of the vectorization.
24996 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24997 ReducedVals.front().size());
24998
24999 // The compare instruction of a min/max is the insertion point for new
25000 // instructions and may be replaced with a new compare instruction.
25001 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
25002 assert(isa<SelectInst>(RdxRootInst) &&
25003 "Expected min/max reduction to have select root instruction");
25004 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
25005 assert(isa<Instruction>(ScalarCond) &&
25006 "Expected min/max reduction to have compare condition");
25007 return cast<Instruction>(ScalarCond);
25008 };
25009
25010 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
25011 return isBoolLogicOp(cast<Instruction>(V));
25012 });
25013 // Return new VectorizedTree, based on previous value.
25014 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
25015 if (VectorizedTree) {
25016 // Update the final value in the reduction.
25017 Builder.SetCurrentDebugLocation(
25018 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
25019 if (AnyBoolLogicOp) {
25020 auto It = ReducedValsToOps.find(VectorizedTree);
25021 auto It1 = ReducedValsToOps.find(Res);
25022 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
25023 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
25024 (It != ReducedValsToOps.end() &&
25025 any_of(It->getSecond(), [&](Instruction *I) {
25026 return isBoolLogicOp(I) &&
25027 getRdxOperand(I, 0) == VectorizedTree;
25028 }))) {
25029 ;
25030 } else if (isGuaranteedNotToBePoison(Res, AC) ||
25031 (It1 != ReducedValsToOps.end() &&
25032 any_of(It1->getSecond(), [&](Instruction *I) {
25033 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
25034 }))) {
25035 std::swap(VectorizedTree, Res);
25036 } else {
25037 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
25038 }
25039 }
25040
25041 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
25042 ReductionOps);
25043 }
25044 // Initialize the final value in the reduction.
25045 return Res;
25046 };
25047 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25048 ReductionOps.front().size());
25049 for (ReductionOpsType &RdxOps : ReductionOps)
25050 for (Value *RdxOp : RdxOps) {
25051 if (!RdxOp)
25052 continue;
25053 IgnoreList.insert(RdxOp);
25054 }
25055 // Intersect the fast-math-flags from all reduction operations.
25056 FastMathFlags RdxFMF;
25057 RdxFMF.set();
25058 for (Value *U : IgnoreList)
25059 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
25060 RdxFMF &= FPMO->getFastMathFlags();
25061 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
25062
25063 // Need to track reduced values, as they may be changed during the
25064 // vectorization of subvectors.
25065 for (ArrayRef<Value *> Candidates : ReducedVals)
25066 for (Value *V : Candidates)
25067 TrackedVals.try_emplace(V, V);
25068
25069 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25070 Value *V) -> unsigned & {
25071 auto *It = MV.find(V);
25072 assert(It != MV.end() && "Unable to find given key.");
25073 return It->second;
25074 };
25075
25076 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
25077 // List of the values that were reduced in other trees as part of gather
25078 // nodes and thus require an extract if fully vectorized in other trees.
25079 SmallPtrSet<Value *, 4> RequiredExtract;
25080 WeakTrackingVH VectorizedTree = nullptr;
25081 bool CheckForReusedReductionOps = false;
25082 // Try to vectorize elements based on their type.
25083 SmallVector<InstructionsState> States;
25084 SmallVector<SmallVector<Value *>> LocalReducedVals;
25085 // Try to merge consecutive reduced values into a single vectorizable group
25086 // and check if they can be vectorized as copyables.
25087 for (ArrayRef<Value *> RV : ReducedVals) {
25088 // Loads are not very compatible with undefs.
25089 if (isa<UndefValue>(RV.front()) &&
25090 (States.empty() || !States.back() ||
25091 States.back().getOpcode() == Instruction::Load)) {
25092 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25093 States.push_back(InstructionsState::invalid());
25094 continue;
25095 }
25096 if (!LocalReducedVals.empty() &&
25097 isa<UndefValue>(LocalReducedVals.back().front()) &&
25098 isa<LoadInst>(RV.front())) {
25099 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25100 States.push_back(getSameOpcode(RV, TLI));
25101 continue;
25102 }
25103 SmallVector<Value *> Ops;
25104 if (!LocalReducedVals.empty())
25105 Ops = LocalReducedVals.back();
25106 Ops.append(RV.begin(), RV.end());
25107 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
25108 InstructionsState OpS =
25109 Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
25110 if (LocalReducedVals.empty()) {
25111 LocalReducedVals.push_back(Ops);
25112 States.push_back(OpS);
25113 continue;
25114 }
25115 if (OpS) {
25116 LocalReducedVals.back().swap(Ops);
25117 States.back() = OpS;
25118 continue;
25119 }
25120 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25121 States.push_back(getSameOpcode(RV, TLI));
25122 }
25123 ReducedVals.swap(LocalReducedVals);
25124 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
25125 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
25126 InstructionsState S = States[I];
25127 SmallVector<Value *> Candidates;
25128 Candidates.reserve(2 * OrigReducedVals.size());
25129 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
25130 for (Value *ReducedVal : OrigReducedVals) {
25131 Value *RdxVal = TrackedVals.at(ReducedVal);
25132 // Check if the reduction value was not overridden by the extractelement
25133 // instruction because of the vectorization, and exclude it if it is not
25134 // compatible with other values.
25135 // Also check if the instruction was folded to a constant/other value.
25136 auto *Inst = dyn_cast<Instruction>(RdxVal);
25137 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
25138 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
25139 !S.isCopyableElement(Inst)))) ||
25140 (S && !Inst && !isa<PoisonValue>(RdxVal) &&
25141 !S.isCopyableElement(RdxVal)))
25142 continue;
25143 Candidates.push_back(RdxVal);
25144 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
25145 }
25146 bool ShuffledExtracts = false;
25147 // Try to handle shuffled extractelements.
25148 if (S && S.getOpcode() == Instruction::ExtractElement &&
25149 !S.isAltShuffle() && I + 1 < E) {
25150 SmallVector<Value *> CommonCandidates(Candidates);
25151 for (Value *RV : ReducedVals[I + 1]) {
25152 Value *RdxVal = TrackedVals.at(RV);
25153 // Check if the reduction value was not overridden by the
25154 // extractelement instruction because of the vectorization and
25155 // exclude it, if it is not compatible with other values.
25156 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
25157 if (!Inst)
25158 continue;
25159 CommonCandidates.push_back(RdxVal);
25160 TrackedToOrig.try_emplace(RdxVal, RV);
25161 }
25162 SmallVector<int> Mask;
25163 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
25164 ++I;
25165 Candidates.swap(CommonCandidates);
25166 ShuffledExtracts = true;
25167 }
25168 }
25169
25170 // Emit code for constant values.
25171 if (Candidates.size() > 1 && allConstant(Candidates)) {
25172 Value *Res = Candidates.front();
25173 Value *OrigV = TrackedToOrig.at(Candidates.front());
25174 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25175 for (Value *VC : ArrayRef(Candidates).drop_front()) {
25176 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
25177 Value *OrigV = TrackedToOrig.at(VC);
25178 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25179 if (auto *ResI = dyn_cast<Instruction>(Res))
25180 V.analyzedReductionRoot(ResI);
25181 }
25182 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25183 continue;
25184 }
25185
25186 unsigned NumReducedVals = Candidates.size();
25187 if (NumReducedVals < ReductionLimit &&
25188 (NumReducedVals < 2 || !isSplat(Candidates)))
25189 continue;
25190
25191 // Check if we support repeated scalar values processing (optimization of
25192 // original scalar identity operations on matched horizontal reductions).
25193 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25194 RdxKind != RecurKind::FMul &&
25195 RdxKind != RecurKind::FMulAdd;
25196 // Gather same values.
25197 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25198 if (IsSupportedHorRdxIdentityOp)
25199 for (Value *V : Candidates) {
25200 Value *OrigV = TrackedToOrig.at(V);
25201 ++SameValuesCounter.try_emplace(OrigV).first->second;
25202 }
25203 // Used to check if the reduced values are used the same number of times. In
25204 // that case the compiler may produce better code. E.g. if the reduced values
25205 // are aabbccdd (8 x values), then the first node of the tree will be a node
25206 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
25207 // Plus, the final reduction will be performed on <8 x aabbccdd>.
25208 // Instead, the compiler may build the <4 x abcd> tree immediately and then
25209 // compute reduction(4 x abcd) * 2.
25210 // Currently this only handles add/fadd/xor. and/or/min/max do not require
25211 // this analysis, and other operations may require an extra estimation of
25212 // the profitability.
25213 bool SameScaleFactor = false;
25214 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25215 SameValuesCounter.size() != Candidates.size();
25216 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
25217 if (OptReusedScalars) {
25218 SameScaleFactor =
25219 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25220 RdxKind == RecurKind::Xor) &&
25221 all_of(drop_begin(SameValuesCounter),
25222 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
25223 return P.second == SameValuesCounter.front().second;
25224 });
25225 Candidates.resize(SameValuesCounter.size());
25226 transform(SameValuesCounter, Candidates.begin(),
25227 [&](const auto &P) { return TrackedVals.at(P.first); });
25228 NumReducedVals = Candidates.size();
25229 // Have a reduction of the same element.
25230 if (NumReducedVals == 1) {
25231 Value *OrigV = TrackedToOrig.at(Candidates.front());
25232 unsigned Cnt = At(SameValuesCounter, OrigV);
25233 Value *RedVal =
25234 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
25235 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25236 VectorizedVals.try_emplace(OrigV, Cnt);
25237 ExternallyUsedValues.insert(OrigV);
25238 continue;
25239 }
25240 }
25241
25242 unsigned MaxVecRegSize = V.getMaxVecRegSize();
25243 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
25244 const unsigned MaxElts = std::clamp<unsigned>(
25245 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
25246 RegMaxNumber * RedValsMaxNumber);
25247
25248 unsigned ReduxWidth = NumReducedVals;
25249 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
25250 unsigned NumParts, NumRegs;
25251 Type *ScalarTy = Candidates.front()->getType();
25252 ReduxWidth =
25253 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
25254 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25255 NumParts = ::getNumberOfParts(TTI, Tp);
25256 NumRegs =
25257 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25258 while (NumParts > NumRegs) {
25259 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
25260 ReduxWidth = bit_floor(ReduxWidth - 1);
25261 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25262 NumParts = ::getNumberOfParts(TTI, Tp);
25263 NumRegs =
25264 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25265 }
25266 if (NumParts > NumRegs / 2)
25267 ReduxWidth = bit_floor(ReduxWidth);
25268 return ReduxWidth;
25269 };
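The GetVectorFactor lambda above repeatedly shrinks the candidate reduction width until the widened vector type fits the target's register budget. The following standalone sketch (not part of this file; the helper name, the byte-based cost model, and the sample sizes are invented for illustration) shows the same narrowing idea:

// Minimal sketch of the width-narrowing loop: halve the candidate factor
// (rounded down to a power of two) until the widened type needs no more
// register "parts" than the target provides.
#include <bit>
#include <cstdio>

static unsigned pickVectorFactor(unsigned NumElts, unsigned EltBits,
                                 unsigned RegBits, unsigned NumRegs) {
  auto PartsFor = [&](unsigned W) {
    return (W * EltBits + RegBits - 1) / RegBits; // registers needed for W lanes
  };
  unsigned Width = NumElts;
  while (Width > 1 && PartsFor(Width) > NumRegs)
    Width = std::bit_floor(Width - 1); // next lower power of two
  return Width;
}

int main() {
  // 24 x i32 candidates, 128-bit registers, budget of 4 registers -> width 16.
  std::printf("%u\n", pickVectorFactor(24, 32, 128, 4));
}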
25270 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
25271 ReduxWidth = GetVectorFactor(ReduxWidth);
25272 ReduxWidth = std::min(ReduxWidth, MaxElts);
25273
25274 unsigned Start = 0;
25275 unsigned Pos = Start;
25276 // Restarts vectorization attempt with lower vector factor.
25277 unsigned PrevReduxWidth = ReduxWidth;
25278 bool CheckForReusedReductionOpsLocal = false;
25279 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
25280 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
25281 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25282 // Check if any of the reduction ops are gathered. If so, it is worth
25283 // trying again with a smaller number of reduction ops.
25284 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25285 }
25286 ++Pos;
25287 if (Pos < NumReducedVals - ReduxWidth + 1)
25288 return IsAnyRedOpGathered;
25289 Pos = Start;
25290 --ReduxWidth;
25291 if (ReduxWidth > 1)
25292 ReduxWidth = GetVectorFactor(ReduxWidth);
25293 return IsAnyRedOpGathered;
25294 };
25295 bool AnyVectorized = false;
25296 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25297 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25298 ReduxWidth >= ReductionLimit) {
25299 // Dependency in tree of the reduction ops - drop this attempt, try
25300 // later.
25301 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25302 Start == 0) {
25303 CheckForReusedReductionOps = true;
25304 break;
25305 }
25306 PrevReduxWidth = ReduxWidth;
25307 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
25308 // Been analyzed already - skip.
25309 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
25310 (!has_single_bit(ReduxWidth) &&
25311 (IgnoredCandidates.contains(
25312 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
25313 IgnoredCandidates.contains(
25314 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
25315 bit_floor(ReduxWidth))))) ||
25316 V.areAnalyzedReductionVals(VL)) {
25317 (void)AdjustReducedVals(/*IgnoreVL=*/true);
25318 continue;
25319 }
25320 // Early exit if any of the reduction values were deleted during
25321 // previous vectorization attempts.
25322 if (any_of(VL, [&V](Value *RedVal) {
25323 auto *RedValI = dyn_cast<Instruction>(RedVal);
25324 return RedValI && V.isDeleted(RedValI);
25325 }))
25326 break;
25327 V.buildTree(VL, IgnoreList);
25328 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
25329 if (!AdjustReducedVals())
25330 V.analyzedReductionVals(VL);
25331 continue;
25332 }
25333 if (V.isLoadCombineReductionCandidate(RdxKind)) {
25334 if (!AdjustReducedVals())
25335 V.analyzedReductionVals(VL);
25336 continue;
25337 }
25338 V.reorderTopToBottom();
25339 // No need to reorder the root node at all for reassociative reduction.
25340 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
25341 VL.front()->getType()->isIntOrIntVectorTy() ||
25342 ReductionLimit > 2);
25343 // Keep extracted other reduction values, if they are used in the
25344 // vectorization trees.
25345 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
25346 ExternallyUsedValues);
25347 // The reduction root is used as the insertion point for new
25348 // instructions, so set it as externally used to prevent it from being
25349 // deleted.
25350 LocalExternallyUsedValues.insert(ReductionRoot);
25351 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
25352 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
25353 continue;
25354 for (Value *V : ReducedVals[Cnt])
25355 if (isa<Instruction>(V))
25356 LocalExternallyUsedValues.insert(TrackedVals[V]);
25357 }
25358 if (!IsSupportedHorRdxIdentityOp) {
25359 // Number of uses of the candidates in the vector of values.
25360 assert(SameValuesCounter.empty() &&
25361 "Reused values counter map is not empty");
25362 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25363 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25364 continue;
25365 Value *V = Candidates[Cnt];
25366 Value *OrigV = TrackedToOrig.at(V);
25367 ++SameValuesCounter.try_emplace(OrigV).first->second;
25368 }
25369 }
25370 V.transformNodes();
25371 V.computeMinimumValueSizes();
25372 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
25373
25374 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
25375 // Gather externally used values.
25376 SmallPtrSet<Value *, 4> Visited;
25377 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25378 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25379 continue;
25380 Value *RdxVal = Candidates[Cnt];
25381 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
25382 RdxVal = It->second;
25383 if (!Visited.insert(RdxVal).second)
25384 continue;
25385 // Check if the scalar was vectorized as part of the vectorization
25386 // tree but not the top node.
25387 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
25388 LocalExternallyUsedValues.insert(RdxVal);
25389 continue;
25390 }
25391 Value *OrigV = TrackedToOrig.at(RdxVal);
25392 unsigned NumOps =
25393 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
25394 if (NumOps != ReducedValsToOps.at(OrigV).size())
25395 LocalExternallyUsedValues.insert(RdxVal);
25396 }
25397 // Do not need the list of reused scalars in regular mode anymore.
25398 if (!IsSupportedHorRdxIdentityOp)
25399 SameValuesCounter.clear();
25400 for (Value *RdxVal : VL)
25401 if (RequiredExtract.contains(RdxVal))
25402 LocalExternallyUsedValues.insert(RdxVal);
25403 V.buildExternalUses(LocalExternallyUsedValues);
25404
25405 // Estimate cost.
25406 InstructionCost ReductionCost =
25407 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
25408 InstructionCost Cost = V.getTreeCost(TreeCost, VL, ReductionCost);
25409 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
25410 << " for reduction\n");
25411 if (!Cost.isValid())
25412 break;
25413 if (Cost >= -SLPCostThreshold) {
25414 V.getORE()->emit([&]() {
25415 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
25416 ReducedValsToOps.at(VL[0]).front())
25417 << "Vectorizing horizontal reduction is possible "
25418 << "but not beneficial with cost " << ore::NV("Cost", Cost)
25419 << " and threshold "
25420 << ore::NV("Threshold", -SLPCostThreshold);
25421 });
25422 if (!AdjustReducedVals()) {
25423 V.analyzedReductionVals(VL);
25424 unsigned Offset = Pos == Start ? Pos : Pos - 1;
25425 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
25426 // Add subvectors of VL to the list of the analyzed values.
25427 for (unsigned VF = getFloorFullVectorNumberOfElements(
25428 *TTI, VL.front()->getType(), ReduxWidth - 1);
25429 VF >= ReductionLimit;
25430 VF = getFloorFullVectorNumberOfElements(
25431 *TTI, VL.front()->getType(), VF - 1)) {
25432 if (has_single_bit(VF) &&
25433 V.getCanonicalGraphSize() != V.getTreeSize())
25434 continue;
25435 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
25436 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
25437 }
25438 }
25439 }
25440 continue;
25441 }
25442
25443 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
25444 << Cost << ". (HorRdx)\n");
25445 V.getORE()->emit([&]() {
25446 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
25447 ReducedValsToOps.at(VL[0]).front())
25448 << "Vectorized horizontal reduction with cost "
25449 << ore::NV("Cost", Cost) << " and with tree size "
25450 << ore::NV("TreeSize", V.getTreeSize());
25451 });
25452
25453 Builder.setFastMathFlags(RdxFMF);
25454
25455 // Emit a reduction. If the root is a select (min/max idiom), the insert
25456 // point is the compare condition of that select.
25457 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
25458 Instruction *InsertPt = RdxRootInst;
25459 if (IsCmpSelMinMax)
25460 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25461
25462 // Vectorize a tree.
25463 Value *VectorizedRoot = V.vectorizeTree(
25464 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
25465 // Update TrackedToOrig mapping, since the tracked values might be
25466 // updated.
25467 for (Value *RdxVal : Candidates) {
25468 Value *OrigVal = TrackedToOrig.at(RdxVal);
25469 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
25470 if (TransformedRdxVal != RdxVal)
25471 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
25472 }
25473
25474 Builder.SetInsertPoint(InsertPt);
25475
25476 // To prevent poison from leaking across what used to be sequential,
25477 // safe, scalar boolean logic operations, the reduction operand must be
25478 // frozen.
25479 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
25480 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
25481
25482 // Emit code to correctly handle reused reduced values, if required.
25483 if (OptReusedScalars && !SameScaleFactor) {
25484 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
25485 SameValuesCounter, TrackedToOrig);
25486 }
25487
25488 Type *ScalarTy = VL.front()->getType();
25489 Type *VecTy = VectorizedRoot->getType();
25490 Type *RedScalarTy = VecTy->getScalarType();
25491 VectorValuesAndScales.emplace_back(
25492 VectorizedRoot,
25493 OptReusedScalars && SameScaleFactor
25494 ? SameValuesCounter.front().second
25495 : 1,
25496 RedScalarTy != ScalarTy->getScalarType()
25497 ? V.isSignedMinBitwidthRootNode()
25498 : true);
25499
25500 // Count vectorized reduced values to exclude them from final reduction.
25501 for (Value *RdxVal : VL) {
25502 Value *OrigV = TrackedToOrig.at(RdxVal);
25503 if (IsSupportedHorRdxIdentityOp) {
25504 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
25505 continue;
25506 }
25507 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25508 if (!V.isVectorized(RdxVal))
25509 RequiredExtract.insert(RdxVal);
25510 }
25511 Pos += ReduxWidth;
25512 Start = Pos;
25513 ReduxWidth = NumReducedVals - Pos;
25514 if (ReduxWidth > 1)
25515 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25516 AnyVectorized = true;
25517 }
25518 if (OptReusedScalars && !AnyVectorized) {
25519 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
25520 Value *RdxVal = TrackedVals.at(P.first);
25521 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
25522 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25523 VectorizedVals.try_emplace(P.first, P.second);
25524 }
25525 continue;
25526 }
25527 }
25528 if (!VectorValuesAndScales.empty())
25529 VectorizedTree = GetNewVectorizedTree(
25530 VectorizedTree,
25531 emitReduction(Builder, *TTI, ReductionRoot->getType()));
25532
25533 if (!VectorizedTree) {
25534 if (!CheckForReusedReductionOps) {
25535 for (ReductionOpsType &RdxOps : ReductionOps)
25536 for (Value *RdxOp : RdxOps)
25537 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
25538 }
25539 return nullptr;
25540 }
25541
25542 // Reorder operands of bool logical op in the natural order to avoid
25543 // possible problem with poison propagation. If not possible to reorder
25544 // (both operands are originally RHS), emit an extra freeze instruction
25545 // for the LHS operand.
25546 // I.e., if we have original code like this:
25547 // RedOp1 = select i1 ?, i1 LHS, i1 false
25548 // RedOp2 = select i1 RHS, i1 ?, i1 false
25549
25550 // Then, we swap LHS/RHS to create a new op that matches the poison
25551 // semantics of the original code.
25552
25553 // If we have original code like this and both values could be poison:
25554 // RedOp1 = select i1 ?, i1 LHS, i1 false
25555 // RedOp2 = select i1 ?, i1 RHS, i1 false
25556
25557 // Then, we must freeze LHS in the new op.
25558 auto FixBoolLogicalOps =
25559 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
25560 Instruction *RedOp2, bool InitStep) {
25561 if (!AnyBoolLogicOp)
25562 return;
25563 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
25564 getRdxOperand(RedOp1, 0) == LHS ||
25565 isGuaranteedNotToBePoison(LHS, AC)))
25566 return;
25567 bool NeedFreeze = LHS != VectorizedTree;
25568 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
25569 getRdxOperand(RedOp2, 0) == RHS ||
25570 isGuaranteedNotToBePoison(RHS, AC))) {
25571 // If RedOp2 was used as a second operand - do not swap.
25572 if ((InitStep || RHS != VectorizedTree) &&
25573 getRdxOperand(RedOp2, 0) == RHS &&
25574 ((isBoolLogicOp(RedOp1) &&
25575 getRdxOperand(RedOp1, 1) == RedOp2) ||
25576 any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
25577 return any_of(Ops, [&](Value *Op) {
25578 auto *OpI = dyn_cast<Instruction>(Op);
25579 return OpI && isBoolLogicOp(OpI) &&
25580 getRdxOperand(OpI, 1) == RedOp2;
25581 });
25582 }))) {
25583 NeedFreeze = false;
25584 } else {
25585 std::swap(LHS, RHS);
25586 return;
25587 }
25588 }
25589 if (NeedFreeze)
25590 LHS = Builder.CreateFreeze(LHS);
25591 };
25592 // Finish the reduction.
25593 // Need to add extra arguments and not vectorized possible reduction values.
25594 // Try to avoid dependencies between the scalar remainders after reductions.
25595 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
25596 bool InitStep) {
25597 unsigned Sz = InstVals.size();
25598 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
25599 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
25600 Instruction *RedOp = InstVals[I + 1].first;
25601 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
25602 Value *RdxVal1 = InstVals[I].second;
25603 Value *StableRdxVal1 = RdxVal1;
25604 auto It1 = TrackedVals.find(RdxVal1);
25605 if (It1 != TrackedVals.end())
25606 StableRdxVal1 = It1->second;
25607 Value *RdxVal2 = InstVals[I + 1].second;
25608 Value *StableRdxVal2 = RdxVal2;
25609 auto It2 = TrackedVals.find(RdxVal2);
25610 if (It2 != TrackedVals.end())
25611 StableRdxVal2 = It2->second;
25612 // To prevent poison from leaking across what used to be sequential,
25613 // safe, scalar boolean logic operations, the reduction operand must be
25614 // frozen.
25615 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
25616 RedOp, InitStep);
25617 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
25618 StableRdxVal2, "op.rdx", ReductionOps);
25619 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
25620 }
25621 if (Sz % 2 == 1)
25622 ExtraReds[Sz / 2] = InstVals.back();
25623 return ExtraReds;
25624 };
25625 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
25626 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
25627 VectorizedTree);
25628 SmallPtrSet<Value *, 8> Visited;
25629 for (ArrayRef<Value *> Candidates : ReducedVals) {
25630 for (Value *RdxVal : Candidates) {
25631 if (!Visited.insert(RdxVal).second)
25632 continue;
25633 unsigned NumOps = VectorizedVals.lookup(RdxVal);
25634 for (Instruction *RedOp :
25635 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
25636 ExtraReductions.emplace_back(RedOp, RdxVal);
25637 }
25638 }
25639 // Iterate through all not-vectorized reduction values/extra arguments.
25640 bool InitStep = true;
25641 while (ExtraReductions.size() > 1) {
25642 SmallVector<std::pair<Instruction *, Value *>> NewReds =
25643 FinalGen(ExtraReductions, InitStep);
25644 ExtraReductions.swap(NewReds);
25645 InitStep = false;
25646 }
25647 VectorizedTree = ExtraReductions.front().second;
25648
25649 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25650
25651 // The original scalar reduction is expected to have no remaining
25652 // uses outside the reduction tree itself. Assert that we got this
25653 // correct, replace internal uses with undef, and mark for eventual
25654 // deletion.
25655#ifndef NDEBUG
25656 SmallPtrSet<Value *, 4> IgnoreSet;
25657 for (ArrayRef<Value *> RdxOps : ReductionOps)
25658 IgnoreSet.insert_range(RdxOps);
25659#endif
25660 for (ArrayRef<Value *> RdxOps : ReductionOps) {
25661 for (Value *Ignore : RdxOps) {
25662 if (!Ignore)
25663 continue;
25664#ifndef NDEBUG
25665 for (auto *U : Ignore->users()) {
25666 assert(IgnoreSet.count(U) &&
25667 "All users must be in the reduction ops list.");
25668 }
25669#endif
25670 if (!Ignore->use_empty()) {
25671 Value *P = PoisonValue::get(Ignore->getType());
25672 Ignore->replaceAllUsesWith(P);
25673 }
25674 }
25675 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25676 }
25677 return VectorizedTree;
25678 }
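The FinalGen helper used near the end of tryToReduce above folds the not-vectorized remainders pairwise, halving the list on every round so the leftover scalar reduction forms a balanced tree rather than a long chain. A minimal standalone sketch of that shape (plain integers stand in for IR values; the names are invented for this illustration):

// Illustrative sketch only: fold the remainders pairwise, carrying an odd
// element over to the next round, until a single result is left.
#include <cstdio>
#include <vector>

static std::vector<int> foldOnce(const std::vector<int> &Vals) {
  std::vector<int> Out;
  for (size_t I = 0; I + 1 < Vals.size(); I += 2)
    Out.push_back(Vals[I] + Vals[I + 1]); // one "op.rdx" per pair
  if (Vals.size() % 2 == 1)
    Out.push_back(Vals.back()); // odd element carried to the next round
  return Out;
}

int main() {
  std::vector<int> Remainders = {1, 2, 3, 4, 5};
  while (Remainders.size() > 1)
    Remainders = foldOnce(Remainders);
  std::printf("%d\n", Remainders.front()); // prints 15
}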
25679
25680private:
25681 /// Creates the reduction from the given \p Vec vector value with the given
25682 /// scale \p Scale and signedness \p IsSigned.
25683 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25684 Value *Vec, unsigned Scale, bool IsSigned,
25685 Type *DestTy) {
25686 Value *Rdx;
25687 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
25688 unsigned DestTyNumElements = getNumElements(VecTy);
25689 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
25690 Rdx = PoisonValue::get(
25691 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
25692 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
25693 // Do reduction for each lane.
25694 // e.g., do reduce add for
25695 // VL[0] = <4 x Ty> <a, b, c, d>
25696 // VL[1] = <4 x Ty> <e, f, g, h>
25697 // Lane[0] = <2 x Ty> <a, e>
25698 // Lane[1] = <2 x Ty> <b, f>
25699 // Lane[2] = <2 x Ty> <c, g>
25700 // Lane[3] = <2 x Ty> <d, h>
25701 // result[0] = reduce add Lane[0]
25702 // result[1] = reduce add Lane[1]
25703 // result[2] = reduce add Lane[2]
25704 // result[3] = reduce add Lane[3]
25705 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
25706 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
25707 Rdx = Builder.CreateInsertElement(
25708 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
25709 }
25710 } else {
25711 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
25712 }
25713 if (Rdx->getType() != DestTy)
25714 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
25715 // Improved analysis for add/fadd/xor reductions with same scale
25716 // factor for all operands of reductions. We can emit scalar ops for
25717 // them instead.
25718 if (Scale > 1)
25719 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25720 return Rdx;
25721 }
25722
25723 /// Calculate the cost of a reduction.
25724 InstructionCost getReductionCost(TargetTransformInfo *TTI,
25725 ArrayRef<Value *> ReducedVals,
25726 bool IsCmpSelMinMax, FastMathFlags FMF,
25727 const BoUpSLP &R, DominatorTree &DT,
25728 const DataLayout &DL,
25729 const TargetLibraryInfo &TLI) {
25730 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25731 Type *ScalarTy = ReducedVals.front()->getType();
25732 unsigned ReduxWidth = ReducedVals.size();
25733 FixedVectorType *VectorTy = R.getReductionType();
25734 InstructionCost VectorCost = 0, ScalarCost;
25735 // If all of the reduced values are constant, the vector cost is 0, since
25736 // the reduction value can be calculated at the compile time.
25737 bool AllConsts = allConstant(ReducedVals);
25738 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
25739 InstructionCost Cost = 0;
25740 // Scalar cost is repeated for N-1 elements.
25741 int Cnt = ReducedVals.size();
25742 for (Value *RdxVal : ReducedVals) {
25743 if (!isa<Instruction>(RdxVal))
25744 continue;
25745 if (Cnt == 1)
25746 break;
25747 --Cnt;
25748 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
25749 Cost += GenCostFn();
25750 continue;
25751 }
25752 InstructionCost ScalarCost = 0;
25753 for (User *U : RdxVal->users()) {
25754 auto *RdxOp = cast<Instruction>(U);
25755 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25756 if (RdxKind == RecurKind::FAdd) {
25757 InstructionCost FMACost = canConvertToFMA(
25758 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
25759 if (FMACost.isValid()) {
25760 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
25761 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
25762 // Also, exclude scalar fmul cost.
25763 InstructionCost FMulCost =
25764 TTI->getInstructionCost(I, CostKind);
25765 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
25766 FMACost -= FMulCost;
25767 }
25768 ScalarCost += FMACost;
25769 continue;
25770 }
25771 }
25772 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
25773 continue;
25774 }
25775 ScalarCost = InstructionCost::getInvalid();
25776 break;
25777 }
25778 if (ScalarCost.isValid())
25779 Cost += ScalarCost;
25780 else
25781 Cost += GenCostFn();
25782 }
25783 return Cost;
25784 };
25785 // Require reduction cost if:
25786 // 1. This type is not a full register type and no other vectors with the
25787 // same type in the storage (first vector with small type).
25788 // 2. The storage does not have any vector with full vector use (first
25789 // vector with full register use).
25790 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
25791 switch (RdxKind) {
25792 case RecurKind::Add:
25793 case RecurKind::Mul:
25794 case RecurKind::Or:
25795 case RecurKind::And:
25796 case RecurKind::Xor:
25797 case RecurKind::FAdd:
25798 case RecurKind::FMul: {
25799 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
25800 if (!AllConsts) {
25801 if (DoesRequireReductionOp) {
25802 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
25803 assert(SLPReVec && "FixedVectorType is not expected.");
25804 unsigned ScalarTyNumElements = VecTy->getNumElements();
25805 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
25806 VectorCost += TTI->getShuffleCost(
25807 TTI::SK_PermuteSingleSrc,
25808 getWidenedType(VecTy->getScalarType(),
25809 ReducedVals.size()),
25810 VectorTy,
25811 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
25812 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
25813 FMF, CostKind);
25814 }
25815 VectorCost += TTI->getScalarizationOverhead(
25816 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
25817 /*Extract*/ false, TTI::TCK_RecipThroughput);
25818 } else {
25819 Type *RedTy = VectorTy->getElementType();
25820 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25821 std::make_pair(RedTy, true));
25822 if (RType == RedTy) {
25823 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
25824 FMF, CostKind);
25825 } else {
25826 VectorCost = TTI->getExtendedReductionCost(
25827 RdxOpcode, !IsSigned, RedTy,
25828 getWidenedType(RType, ReduxWidth), FMF, CostKind);
25829 }
25830 }
25831 } else {
25832 Type *RedTy = VectorTy->getElementType();
25833 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25834 std::make_pair(RedTy, true));
25835 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25836 InstructionCost FMACost = InstructionCost::getInvalid();
25837 if (RdxKind == RecurKind::FAdd) {
25838 // Check if the reduction operands can be converted to FMA.
25839 SmallVector<Value *> Ops;
25840 FastMathFlags FMF;
25841 FMF.set();
25842 for (Value *RdxVal : ReducedVals) {
25843 if (!RdxVal->hasOneUse()) {
25844 Ops.clear();
25845 break;
25846 }
25847 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
25848 FMF &= FPCI->getFastMathFlags();
25849 Ops.push_back(RdxVal->user_back());
25850 }
25851 if (!Ops.empty()) {
25852 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
25853 *TTI, TLI);
25854 if (FMACost.isValid()) {
25855 // Calculate actual FMAD cost.
25856 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25857 {RVecTy, RVecTy, RVecTy}, FMF);
25858 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
25859
25860 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
25861 // Also, exclude vector fmul cost.
25862 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
25863 Instruction::FMul, RVecTy, CostKind);
25864 LLVM_DEBUG(dbgs()
25865 << "Minus vector FMul cost: " << FMulCost << "\n");
25866 FMACost -= FMulCost;
25867 }
25868 }
25869 }
25870 if (FMACost.isValid())
25871 VectorCost += FMACost;
25872 else
25873 VectorCost +=
25874 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
25875 if (RType != RedTy) {
25876 unsigned Opcode = Instruction::Trunc;
25877 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25878 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25879 VectorCost += TTI->getCastInstrCost(
25880 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25881 }
25882 }
25883 }
25884 ScalarCost = EvaluateScalarCost([&]() {
25885 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
25886 });
25887 break;
25888 }
25889 case RecurKind::FMax:
25890 case RecurKind::FMin:
25891 case RecurKind::FMaximum:
25892 case RecurKind::FMinimum:
25893 case RecurKind::SMax:
25894 case RecurKind::SMin:
25895 case RecurKind::UMax:
25896 case RecurKind::UMin: {
25897 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
25898 if (!AllConsts) {
25899 if (DoesRequireReductionOp) {
25900 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
25901 } else {
25902 // Check if the previous reduction already exists and account it as
25903 // series of operations + single reduction.
25904 Type *RedTy = VectorTy->getElementType();
25905 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25906 std::make_pair(RedTy, true));
25907 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25908 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25909 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
25910 if (RType != RedTy) {
25911 unsigned Opcode = Instruction::Trunc;
25912 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25913 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25914 VectorCost += TTI->getCastInstrCost(
25915 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25916 }
25917 }
25918 }
25919 ScalarCost = EvaluateScalarCost([&]() {
25920 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25921 return TTI->getIntrinsicInstrCost(ICA, CostKind);
25922 });
25923 break;
25924 }
25925 default:
25926 llvm_unreachable("Expected arithmetic or min/max reduction operation");
25927 }
25928
25929 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25930 << " for reduction of " << shortBundleName(ReducedVals)
25931 << " (It is a splitting reduction)\n");
25932 return VectorCost - ScalarCost;
25933 }
25934
25935 /// Splits the values, stored in VectorValuesAndScales, into registers/free
25936 /// sub-registers, combines them with the given reduction operation as a
25937 /// vector operation and then performs single (small enough) reduction.
25938 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25939 Type *DestTy) {
25940 Value *ReducedSubTree = nullptr;
25941 // Creates reduction and combines with the previous reduction.
25942 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25943 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25944 if (ReducedSubTree)
25945 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25946 "op.rdx", ReductionOps);
25947 else
25948 ReducedSubTree = Rdx;
25949 };
25950 if (VectorValuesAndScales.size() == 1) {
25951 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25952 CreateSingleOp(Vec, Scale, IsSigned);
25953 return ReducedSubTree;
25954 }
25955 // Scales Vec using the given Cnt scale factor and then combines the result
25956 // with the previously accumulated value in VecRes.
25957 Value *VecRes = nullptr;
25958 bool VecResSignedness = false;
25959 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25960 Type *ScalarTy = Vec->getType()->getScalarType();
25961 // Scale Vec using given Cnt scale factor.
25962 if (Cnt > 1) {
25963 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25964 switch (RdxKind) {
25965 case RecurKind::Add: {
25966 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25967 unsigned VF = getNumElements(Vec->getType());
25968 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
25969 << ". (HorRdx)\n");
25970 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25971 for (unsigned I : seq<unsigned>(Cnt))
25972 std::iota(std::next(Mask.begin(), VF * I),
25973 std::next(Mask.begin(), VF * (I + 1)), 0);
25974 ++NumVectorInstructions;
25975 Vec = Builder.CreateShuffleVector(Vec, Mask);
25976 break;
25977 }
25978 // res = mul vv, n
25979 if (ScalarTy != DestTy->getScalarType())
25980 Vec = Builder.CreateIntCast(
25981 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25982 IsSigned);
25983 Value *Scale = ConstantVector::getSplat(
25984 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
25985 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25986 << ". (HorRdx)\n");
25987 ++NumVectorInstructions;
25988 Vec = Builder.CreateMul(Vec, Scale);
25989 break;
25990 }
25991 case RecurKind::Xor: {
25992 // res = n % 2 ? 0 : vv
25993 LLVM_DEBUG(dbgs()
25994 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25995 if (Cnt % 2 == 0)
25996 Vec = Constant::getNullValue(Vec->getType());
25997 break;
25998 }
25999 case RecurKind::FAdd: {
26000 // res = fmul v, n
26001 Value *Scale =
26002 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
26003 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
26004 << ". (HorRdx)\n");
26005 ++NumVectorInstructions;
26006 Vec = Builder.CreateFMul(Vec, Scale);
26007 break;
26008 }
26009 case RecurKind::And:
26010 case RecurKind::Or:
26011 case RecurKind::SMax:
26012 case RecurKind::SMin:
26013 case RecurKind::UMax:
26014 case RecurKind::UMin:
26015 case RecurKind::FMax:
26016 case RecurKind::FMin:
26017 case RecurKind::FMaximum:
26018 case RecurKind::FMinimum:
26019 // res = vv
26020 break;
26021 case RecurKind::Sub:
26022 case RecurKind::AddChainWithSubs:
26023 case RecurKind::Mul:
26024 case RecurKind::FMul:
26025 case RecurKind::FMulAdd:
26026 case RecurKind::AnyOf:
26027 case RecurKind::FindFirstIVSMin:
26028 case RecurKind::FindFirstIVUMin:
26029 case RecurKind::FindLastIVSMax:
26030 case RecurKind::FindLastIVUMax:
26031 case RecurKind::FindLast:
26032 case RecurKind::FMaxNum:
26033 case RecurKind::FMinNum:
26034 case RecurKind::FMaximumNum:
26035 case RecurKind::FMinimumNum:
26036 case RecurKind::None:
26037 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26038 }
26039 }
26040 // Combine Vec with the previous VecOp.
26041 if (!VecRes) {
26042 VecRes = Vec;
26043 VecResSignedness = IsSigned;
26044 } else {
26045 ++NumVectorInstructions;
26046 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
26047 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
26048 // Handle ctpop.
26049 unsigned VecResVF = getNumElements(VecRes->getType());
26050 unsigned VecVF = getNumElements(Vec->getType());
26051 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
26052 std::iota(Mask.begin(), Mask.end(), 0);
26053 // Ensure that VecRes is always larger than Vec
26054 if (VecResVF < VecVF) {
26055 std::swap(VecRes, Vec);
26056 std::swap(VecResVF, VecVF);
26057 }
26058 if (VecResVF != VecVF) {
26059 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
26060 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
26061 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
26062 }
26063 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
26064 return;
26065 }
26066 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
26067 VecRes = Builder.CreateIntCast(
26068 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
26069 VecResSignedness);
26070 if (ScalarTy != DestTy->getScalarType())
26071 Vec = Builder.CreateIntCast(
26072 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
26073 IsSigned);
26074 unsigned VecResVF = getNumElements(VecRes->getType());
26075 unsigned VecVF = getNumElements(Vec->getType());
26076 // Ensure that VecRes is always larger than Vec
26077 if (VecResVF < VecVF) {
26078 std::swap(VecRes, Vec);
26079 std::swap(VecResVF, VecVF);
26080 }
26081 // extract + op + insert
26082 Value *Op = VecRes;
26083 if (VecResVF != VecVF)
26084 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
26085 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
26086 if (VecResVF != VecVF)
26087 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
26088 VecRes = Op;
26089 }
26090 };
26091 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26092 CreateVecOp(Vec, Scale, IsSigned);
26093 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
26094
26095 return ReducedSubTree;
26096 }
26097
26098 /// Emit a horizontal reduction of the vectorized value.
26099 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26100 const TargetTransformInfo *TTI, Type *DestTy) {
26101 assert(VectorizedValue && "Need to have a vectorized tree node");
26102 assert(RdxKind != RecurKind::FMulAdd &&
26103 "A call to the llvm.fmuladd intrinsic is not handled yet");
26104
26105 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
26106 if (FTy->getScalarType() == Builder.getInt1Ty() &&
26107 RdxKind == RecurKind::Add &&
26108 DestTy->getScalarType() != FTy->getScalarType()) {
26109 // Convert vector_reduce_add(ZExt(<n x i1>)) to
26110 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
26111 Value *V = Builder.CreateBitCast(
26112 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
26113 ++NumVectorInstructions;
26114 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
26115 }
26116 ++NumVectorInstructions;
26117 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
26118 }
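The i1 special case above relies on the identity that an add-reduction over a zero-extended all-0/1 vector equals the population count of the corresponding bitmask, which is what makes the bitcast-plus-ctpop form legal. A small standalone check of that identity (ordinary C++, not LLVM IR):

// Sum of zero-extended i1 lanes == popcount of the packed mask bits.
#include <bit>
#include <cstdio>

int main() {
  bool Lanes[8] = {true, false, true, true, false, false, true, false};
  unsigned Sum = 0;
  unsigned Mask = 0;
  for (unsigned I = 0; I < 8; ++I) {
    Sum += Lanes[I] ? 1 : 0;          // vector_reduce_add(zext <8 x i1>)
    Mask |= Lanes[I] ? (1u << I) : 0; // bitcast <8 x i1> to i8
  }
  std::printf("%u %d\n", Sum, std::popcount(Mask)); // both print 4
}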
26119
26120 /// Emits optimized code for unique scalar value reused \p Cnt times.
26121 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
26122 unsigned Cnt) {
26123 assert(IsSupportedHorRdxIdentityOp &&
26124 "The optimization of matched scalar identity horizontal reductions "
26125 "must be supported.");
26126 if (Cnt == 1)
26127 return VectorizedValue;
26128 switch (RdxKind) {
26129 case RecurKind::Add: {
26130 // res = mul vv, n
26131 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
26132 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
26133 << VectorizedValue << ". (HorRdx)\n");
26134 return Builder.CreateMul(VectorizedValue, Scale);
26135 }
26136 case RecurKind::Xor: {
26137 // res = n % 2 ? 0 : vv
26138 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
26139 << ". (HorRdx)\n");
26140 if (Cnt % 2 == 0)
26141 return Constant::getNullValue(VectorizedValue->getType());
26142 return VectorizedValue;
26143 }
26144 case RecurKind::FAdd: {
26145 // res = fmul v, n
26146 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
26147 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
26148 << VectorizedValue << ". (HorRdx)\n");
26149 return Builder.CreateFMul(VectorizedValue, Scale);
26150 }
26151 case RecurKind::And:
26152 case RecurKind::Or:
26153 case RecurKind::SMax:
26154 case RecurKind::SMin:
26155 case RecurKind::UMax:
26156 case RecurKind::UMin:
26157 case RecurKind::FMax:
26158 case RecurKind::FMin:
26159 case RecurKind::FMaximum:
26160 case RecurKind::FMinimum:
26161 // res = vv
26162 return VectorizedValue;
26163 case RecurKind::Sub:
26164 case RecurKind::AddChainWithSubs:
26165 case RecurKind::Mul:
26166 case RecurKind::FMul:
26167 case RecurKind::FMulAdd:
26168 case RecurKind::AnyOf:
26169 case RecurKind::FindFirstIVSMin:
26170 case RecurKind::FindFirstIVUMin:
26171 case RecurKind::FindLastIVSMax:
26172 case RecurKind::FindLastIVUMax:
26173 case RecurKind::FindLast:
26174 case RecurKind::FMaxNum:
26175 case RecurKind::FMinNum:
26176 case RecurKind::FMaximumNum:
26177 case RecurKind::FMinimumNum:
26178 case RecurKind::None:
26179 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26180 }
26181 return nullptr;
26182 }
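emitScaleForReusedOps above relies on simple algebraic identities for a scalar that occurs Cnt times in the reduction: a repeated add collapses to a multiply, and a repeated xor collapses to zero or the value depending on parity. A standalone sketch checking the integer cases (the fadd-to-fmul case has the same shape but additionally depends on the fast-math flags gathered earlier):

// Verify: x added Cnt times == x * Cnt; x xor'ed Cnt times == 0 for even Cnt.
#include <cassert>

int main() {
  int X = 7;
  unsigned Cnt = 6;

  int AddChain = 0, XorChain = 0;
  for (unsigned I = 0; I < Cnt; ++I) {
    AddChain += X;
    XorChain ^= X;
  }
  assert(AddChain == X * static_cast<int>(Cnt)); // add -> mul by Cnt
  assert(XorChain == ((Cnt % 2 == 0) ? 0 : X));  // xor -> parity of Cnt
  return 0;
}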
26183
26184 /// Emits actual operation for the scalar identity values, found during
26185 /// horizontal reduction analysis.
26186 Value *
26187 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26188 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26189 const DenseMap<Value *, Value *> &TrackedToOrig) {
26190 assert(IsSupportedHorRdxIdentityOp &&
26191 "The optimization of matched scalar identity horizontal reductions "
26192 "must be supported.");
26193 ArrayRef<Value *> VL = R.getRootNodeScalars();
26194 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
26195 if (VTy->getElementType() != VL.front()->getType()) {
26196 VectorizedValue = Builder.CreateIntCast(
26197 VectorizedValue,
26198 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
26199 R.isSignedMinBitwidthRootNode());
26200 }
26201 switch (RdxKind) {
26202 case RecurKind::Add: {
26203 // root = mul prev_root, <1, 1, n, 1>
26204 SmallVector<Constant *> Vals;
26205 for (Value *V : VL) {
26206 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26207 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
26208 }
26209 auto *Scale = ConstantVector::get(Vals);
26210 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
26211 << VectorizedValue << ". (HorRdx)\n");
26212 return Builder.CreateMul(VectorizedValue, Scale);
26213 }
26214 case RecurKind::And:
26215 case RecurKind::Or:
26216 // No need for multiple or/and(s).
26217 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
26218 << ". (HorRdx)\n");
26219 return VectorizedValue;
26220 case RecurKind::SMax:
26221 case RecurKind::SMin:
26222 case RecurKind::UMax:
26223 case RecurKind::UMin:
26224 case RecurKind::FMax:
26225 case RecurKind::FMin:
26226 case RecurKind::FMaximum:
26227 case RecurKind::FMinimum:
26228 // No need for multiple min/max(s) of the same value.
26229 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
26230 << ". (HorRdx)\n");
26231 return VectorizedValue;
26232 case RecurKind::Xor: {
26233 // Replace values with even number of repeats with 0, since
26234 // x xor x = 0.
26235 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
26236 // 7>, if the 4th and 6th elements have an even number of repeats.
26237 SmallVector<int> Mask(
26238 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
26239 PoisonMaskElem);
26240 std::iota(Mask.begin(), Mask.end(), 0);
26241 bool NeedShuffle = false;
26242 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
26243 Value *V = VL[I];
26244 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26245 if (Cnt % 2 == 0) {
26246 Mask[I] = VF;
26247 NeedShuffle = true;
26248 }
26249 }
26250 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
26251 : Mask) dbgs()
26252 << I << " ";
26253 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
26254 if (NeedShuffle)
26255 VectorizedValue = Builder.CreateShuffleVector(
26256 VectorizedValue,
26257 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
26258 return VectorizedValue;
26259 }
26260 case RecurKind::FAdd: {
26261 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
26262 SmallVector<Constant *> Vals;
26263 for (Value *V : VL) {
26264 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26265 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
26266 }
26267 auto *Scale = ConstantVector::get(Vals);
26268 return Builder.CreateFMul(VectorizedValue, Scale);
26269 }
26270 case RecurKind::Sub:
26271 case RecurKind::AddChainWithSubs:
26272 case RecurKind::Mul:
26273 case RecurKind::FMul:
26274 case RecurKind::FMulAdd:
26275 case RecurKind::AnyOf:
26276 case RecurKind::FindFirstIVSMin:
26277 case RecurKind::FindFirstIVUMin:
26278 case RecurKind::FindLastIVSMax:
26279 case RecurKind::FindLastIVUMax:
26280 case RecurKind::FindLast:
26281 case RecurKind::FMaxNum:
26282 case RecurKind::FMinNum:
26283 case RecurKind::FMaximumNum:
26284 case RecurKind::FMinimumNum:
26285 case RecurKind::None:
26286 llvm_unreachable("Unexpected reduction kind for reused scalars.");
26287 }
26288 return nullptr;
26289 }
26290};
26291} // end anonymous namespace
26292
26293/// Gets recurrence kind from the specified value.
26294static RecurKind getRdxKind(Value *V) {
26295 return HorizontalReduction::getRdxKind(V);
26296}
26297static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
26298 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
26299 return cast<FixedVectorType>(IE->getType())->getNumElements();
26300
26301 unsigned AggregateSize = 1;
26302 auto *IV = cast<InsertValueInst>(InsertInst);
26303 Type *CurrentType = IV->getType();
26304 do {
26305 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
26306 for (auto *Elt : ST->elements())
26307 if (Elt != ST->getElementType(0)) // check homogeneity
26308 return std::nullopt;
26309 AggregateSize *= ST->getNumElements();
26310 CurrentType = ST->getElementType(0);
26311 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
26312 AggregateSize *= AT->getNumElements();
26313 CurrentType = AT->getElementType();
26314 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
26315 AggregateSize *= VT->getNumElements();
26316 return AggregateSize;
26317 } else if (CurrentType->isSingleValueType()) {
26318 return AggregateSize;
26319 } else {
26320 return std::nullopt;
26321 }
26322 } while (true);
26323}
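getAggregateSize above walks a homogeneous nested aggregate and multiplies the element counts of each level, giving up as soon as a struct is not homogeneous. A toy model of that walk (ToyType and the sample shapes are invented stand-ins, not LLVM types):

// A homogeneous nested aggregate contributes the product of the element
// counts at each level; a non-homogeneous struct makes the size unknown.
#include <cstdio>
#include <optional>

struct ToyType {
  unsigned Count = 1;               // elements at this level (1 = scalar)
  bool Homogeneous = true;          // structs must be homogeneous
  const ToyType *Element = nullptr; // nested element type, if any
};

static std::optional<unsigned> aggregateSize(const ToyType &T) {
  if (!T.Homogeneous)
    return std::nullopt;
  if (!T.Element)
    return T.Count;
  auto Inner = aggregateSize(*T.Element);
  if (!Inner)
    return std::nullopt;
  return T.Count * *Inner;
}

int main() {
  ToyType Vec{2, true, nullptr};  // <2 x float>
  ToyType Struct{2, true, &Vec};  // {<2 x float>, <2 x float>}
  ToyType Arr{2, true, &Struct};  // [2 x {<2 x float>, <2 x float>}]
  std::printf("%u\n", aggregateSize(Arr).value_or(0)); // prints 8
}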
26324
26325static void findBuildAggregateRec(Instruction *LastInsertInst,
26326 TargetTransformInfo *TTI,
26327 SmallVectorImpl<Value *> &BuildVectorOpds,
26328 SmallVectorImpl<Value *> &InsertElts,
26329 unsigned OperandOffset, const BoUpSLP &R) {
26330 do {
26331 Value *InsertedOperand = LastInsertInst->getOperand(1);
26332 std::optional<unsigned> OperandIndex =
26333 getElementIndex(LastInsertInst, OperandOffset);
26334 if (!OperandIndex || R.isDeleted(LastInsertInst))
26335 return;
26336 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
26337 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
26338 BuildVectorOpds, InsertElts, *OperandIndex, R);
26339
26340 } else {
26341 BuildVectorOpds[*OperandIndex] = InsertedOperand;
26342 InsertElts[*OperandIndex] = LastInsertInst;
26343 }
26344 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
26345 } while (LastInsertInst != nullptr &&
26346 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
26347 LastInsertInst->hasOneUse());
26348}
26349
26350/// Recognize construction of vectors like
26351/// %ra = insertelement <4 x float> poison, float %s0, i32 0
26352/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
26353/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
26354/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
26355/// starting from the last insertelement or insertvalue instruction.
26356///
26357/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
26358/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
26359/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
26360///
26361/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
26362///
26363/// \return true if it matches.
26364static bool findBuildAggregate(Instruction *LastInsertInst,
26365 TargetTransformInfo *TTI,
26366 SmallVectorImpl<Value *> &BuildVectorOpds,
26367 SmallVectorImpl<Value *> &InsertElts,
26368 const BoUpSLP &R) {
26369
26370 assert((isa<InsertElementInst>(LastInsertInst) ||
26371 isa<InsertValueInst>(LastInsertInst)) &&
26372 "Expected insertelement or insertvalue instruction!");
26373
26374 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
26375 "Expected empty result vectors!");
26376
26377 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
26378 if (!AggregateSize)
26379 return false;
26380 BuildVectorOpds.resize(*AggregateSize);
26381 InsertElts.resize(*AggregateSize);
26382
26383 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
26384 llvm::erase(BuildVectorOpds, nullptr);
26385 llvm::erase(InsertElts, nullptr);
26386 if (BuildVectorOpds.size() >= 2)
26387 return true;
26388
26389 return false;
26390}
26391
26392/// Try and get a reduction instruction from a phi node.
26393///
26394/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
26395/// if they come from either \p ParentBB or a containing loop latch.
26396///
26397/// \returns A candidate reduction value if possible, or \code nullptr \endcode
26398/// if not possible.
26399static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
26400 BasicBlock *ParentBB, LoopInfo *LI) {
26401 // There are situations where the reduction value is not dominated by the
26402 // reduction phi. Vectorizing such cases has been reported to cause
26403 // miscompiles. See PR25787.
26404 auto DominatedReduxValue = [&](Value *R) {
26405 return isa<Instruction>(R) &&
26406 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
26407 };
26408
26409 Instruction *Rdx = nullptr;
26410
26411 // Return the incoming value if it comes from the same BB as the phi node.
26412 if (P->getIncomingBlock(0) == ParentBB) {
26413 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26414 } else if (P->getIncomingBlock(1) == ParentBB) {
26415 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26416 }
26417
26418 if (Rdx && DominatedReduxValue(Rdx))
26419 return Rdx;
26420
26421 // Otherwise, check whether we have a loop latch to look at.
26422 Loop *BBL = LI->getLoopFor(ParentBB);
26423 if (!BBL)
26424 return nullptr;
26425 BasicBlock *BBLatch = BBL->getLoopLatch();
26426 if (!BBLatch)
26427 return nullptr;
26428
26429 // There is a loop latch, return the incoming value if it comes from
26430 // that. This reduction pattern occasionally turns up.
26431 if (P->getIncomingBlock(0) == BBLatch) {
26432 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26433 } else if (P->getIncomingBlock(1) == BBLatch) {
26434 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26435 }
26436
26437 if (Rdx && DominatedReduxValue(Rdx))
26438 return Rdx;
26439
26440 return nullptr;
26441}
26442
26443static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
26444 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
26445 return true;
26446 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
26447 return true;
26448 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
26449 return true;
26450 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
26451 return true;
26452 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
26453 return true;
26454 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
26455 return true;
26456 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
26457 return true;
26458 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
26459 return true;
26460 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
26461 return true;
26462 return false;
26463}
26464
26465/// We could have an initial reduction that is not an add.
26466/// r *= v1 + v2 + v3 + v4
26467/// In such a case start looking for a tree rooted in the first '+'.
26468/// \Returns the new root if found, which may be nullptr if not an instruction.
26469static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
26470 Instruction *Root) {
26471 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
26472 isa<IntrinsicInst>(Root)) &&
26473 "Expected binop, select, or intrinsic for reduction matching");
26474 Value *LHS =
26475 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
26476 Value *RHS =
26477 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
26478 if (LHS == Phi)
26479 return dyn_cast<Instruction>(RHS);
26480 if (RHS == Phi)
26481 return dyn_cast<Instruction>(LHS);
26482 return nullptr;
26483}
26484
26485/// \p Returns the first operand of \p I that does not match \p Phi. If
26486/// operand is not an instruction it returns nullptr.
26487static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
26488 Value *Op0 = nullptr;
26489 Value *Op1 = nullptr;
26490 if (!matchRdxBop(I, Op0, Op1))
26491 return nullptr;
26492 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
26493}
26494
26495/// \Returns true if \p I is a candidate instruction for reduction vectorization.
26496static bool isReductionCandidate(Instruction *I) {
26497 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
26498 Value *B0 = nullptr, *B1 = nullptr;
26499 bool IsBinop = matchRdxBop(I, B0, B1);
26500 return IsBinop || IsSelect;
26501}
26502
26503bool SLPVectorizerPass::vectorizeHorReduction(
26504 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
26505 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26506 if (!ShouldVectorizeHor)
26507 return false;
26508 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
26509
26510 if (Root->getParent() != BB || isa<PHINode>(Root))
26511 return false;
26512
26513 // If we can find a secondary reduction root, use that instead.
26514 auto SelectRoot = [&]() {
26515 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
26516 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
26517 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
26518 return NewRoot;
26519 return Root;
26520 };
26521
26522 // Start analysis starting from Root instruction. If horizontal reduction is
26523 // found, try to vectorize it. If it is not a horizontal reduction or
26524 // vectorization is not possible or not effective, and currently analyzed
26525 // instruction is a binary operation, try to vectorize the operands, using
26526 // pre-order DFS traversal order. If the operands were not vectorized, repeat
26527 // the same procedure considering each operand as a possible root of the
26528 // horizontal reduction.
26529 // Interrupt the process if the Root instruction itself was vectorized or all
26530 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
26531 // If a horizontal reduction was not matched or vectorized, we collect
26532 // instructions for possible later vectorization attempts.
26533 std::queue<std::pair<Instruction *, unsigned>> Stack;
26534 Stack.emplace(SelectRoot(), 0);
26535 SmallPtrSet<Value *, 8> VisitedInstrs;
26536 bool Res = false;
26537 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
26538 if (R.isAnalyzedReductionRoot(Inst))
26539 return nullptr;
26540 if (!isReductionCandidate(Inst))
26541 return nullptr;
26542 HorizontalReduction HorRdx;
26543 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
26544 return nullptr;
26545 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
26546 };
26547 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
26548 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26549 FutureSeed = getNonPhiOperand(Root, P);
26550 if (!FutureSeed)
26551 return false;
26552 }
26553 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
26554 // analysis is done separately.
26555 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
26556 PostponedInsts.push_back(FutureSeed);
26557 return true;
26558 };
26559
26560 while (!Stack.empty()) {
26561 Instruction *Inst;
26562 unsigned Level;
26563 std::tie(Inst, Level) = Stack.front();
26564 Stack.pop();
26565 // Do not try to analyze instruction that has already been vectorized.
26566 // This may happen when we vectorize instruction operands on a previous
26567 // iteration while stack was populated before that happened.
26568 if (R.isDeleted(Inst))
26569 continue;
26570 if (Value *VectorizedV = TryToReduce(Inst)) {
26571 Res = true;
26572 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
26573 // Try to find another reduction.
26574 Stack.emplace(I, Level);
26575 continue;
26576 }
26577 if (R.isDeleted(Inst))
26578 continue;
26579 } else {
26580 // We could not vectorize `Inst` so try to use it as a future seed.
26581 if (!TryAppendToPostponedInsts(Inst)) {
26582 assert(Stack.empty() && "Expected empty stack");
26583 break;
26584 }
26585 }
26586
26587 // Try to vectorize operands.
26588 // Continue analysis for the instruction from the same basic block only to
26589 // save compile time.
26590 if (++Level < RecursionMaxDepth)
26591 for (auto *Op : Inst->operand_values())
26592 if (VisitedInstrs.insert(Op).second)
26593 if (auto *I = dyn_cast<Instruction>(Op))
26594 // Do not try to vectorize CmpInst operands, this is done
26595 // separately.
26596 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
26597 !R.isDeleted(I) && I->getParent() == BB)
26598 Stack.emplace(I, Level);
26599 }
26600 return Res;
26601}
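vectorizeHorReduction above drives a level-bounded worklist: it tries to reduce the current instruction and, when that is not possible, enqueues the instruction's operands as new candidate roots, but only while the level stays below the recursion limit so compile time stays bounded. A loose standalone sketch of that control flow (the toy node type and trivial tryReduce predicate are invented for illustration; the real pass also re-queues vectorized roots and collects postponed seeds):

// Level-bounded worklist walk: reduce if possible, otherwise descend into
// operands until the depth limit is hit.
#include <cstdio>
#include <queue>
#include <utility>
#include <vector>

struct ToyNode {
  int Id;
  std::vector<ToyNode *> Operands;
};

static bool tryReduce(const ToyNode &N) { return N.Operands.empty(); }

static unsigned walk(ToyNode *Root, unsigned MaxDepth) {
  unsigned Reduced = 0;
  std::queue<std::pair<ToyNode *, unsigned>> Work;
  Work.emplace(Root, 0u);
  while (!Work.empty()) {
    auto [Node, Level] = Work.front();
    Work.pop();
    if (tryReduce(*Node)) {
      ++Reduced;
      continue;
    }
    if (Level + 1 < MaxDepth)
      for (ToyNode *Op : Node->Operands)
        Work.emplace(Op, Level + 1);
  }
  return Reduced;
}

int main() {
  ToyNode A{0, {}}, B{1, {}}, C{2, {&A, &B}}, Root{3, {&C}};
  std::printf("%u\n", walk(&Root, /*MaxDepth=*/3)); // 2 leaves reduced
}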
26602
26603bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
26604 if (!I)
26605 return false;
26606
26607 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
26608 return false;
26609 // Skip potential FMA candidates.
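// For illustration (simplified, hypothetical IR): a pattern such as
//   %m = fmul fast float %x, %y
//   %s = fadd fast float %m, %z
// is likely better served by FMA formation later in the pipeline, so it is
// skipped here rather than being SLP-vectorized.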
26610 if ((I->getOpcode() == Instruction::FAdd ||
26611 I->getOpcode() == Instruction::FSub) &&
26612 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
26613 .isValid())
26614 return false;
26615
26616 Value *P = I->getParent();
26617
26618 // Vectorize in current basic block only.
26619 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
26620 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
26621 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
26622 R.isDeleted(Op0) || R.isDeleted(Op1))
26623 return false;
26624
26625 // First collect all possible candidates
26626 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
26627 Candidates.emplace_back(Op0, Op1);
26628
26629 auto *A = dyn_cast<BinaryOperator>(Op0);
26630 auto *B = dyn_cast<BinaryOperator>(Op1);
26631 // Try to skip B.
26632 if (A && B && B->hasOneUse()) {
26633 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
26634 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
26635 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
26636 Candidates.emplace_back(A, B0);
26637 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
26638 Candidates.emplace_back(A, B1);
26639 }
26640 // Try to skip A.
26641 if (B && A && A->hasOneUse()) {
26642 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
26643 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
26644 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
26645 Candidates.emplace_back(A0, B);
26646 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
26647 Candidates.emplace_back(A1, B);
26648 }
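// For illustration (hypothetical operands): given I = A + B with A = A0 + A1
// used only by I, the candidate pairs are (A, B), (A0, B) and (A1, B);
// re-pairing across the associative operation may expose a more profitable
// root pair than the original operands.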
26649
26650 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
26651 ArrayRef<Value *> Ops) {
26652 if (!isReductionCandidate(Inst))
26653 return false;
26654 Type *Ty = Inst->getType();
26655 if (!isValidElementType(Ty) || Ty->isPointerTy())
26656 return false;
26657 HorizontalReduction HorRdx(Inst, Ops);
26658 if (!HorRdx.matchReductionForOperands())
26659 return false;
26660 // Check the cost of operations.
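// Roughly: ScalarCost below models extracting the operands from a vector and
// performing the existing scalar operation, while RedCost models a single
// vector reduction over the same operands; the reduction is emitted only if
// it is strictly cheaper (see the comparison after the switch).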
26661 VectorType *VecTy = getWidenedType(Ty, Ops.size());
26662 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
26663 InstructionCost ScalarCost =
26664 TTI.getScalarizationOverhead(
26665 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
26666 /*Extract=*/true, CostKind) +
26667 TTI.getInstructionCost(Inst, CostKind);
26668 InstructionCost RedCost;
26669 switch (::getRdxKind(Inst)) {
26670 case RecurKind::Add:
26671 case RecurKind::Mul:
26672 case RecurKind::Or:
26673 case RecurKind::And:
26674 case RecurKind::Xor:
26675 case RecurKind::FAdd:
26676 case RecurKind::FMul: {
26677 FastMathFlags FMF;
26678 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
26679 FMF = FPCI->getFastMathFlags();
26680 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
26681 CostKind);
26682 break;
26683 }
26684 default:
26685 return false;
26686 }
26687 if (RedCost >= ScalarCost)
26688 return false;
26689
26690 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
26691 };
26692 if (Candidates.size() == 1)
26693 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
26694
26695 // We have multiple options. Try to pick the single best.
26696 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
26697 if (!BestCandidate)
26698 return false;
26699 return (*BestCandidate == 0 &&
26700 TryToReduce(I, {Candidates[*BestCandidate].first,
26701 Candidates[*BestCandidate].second})) ||
26702 tryToVectorizeList({Candidates[*BestCandidate].first,
26703 Candidates[*BestCandidate].second},
26704 R);
26705}
26706
26707bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
26708 BasicBlock *BB, BoUpSLP &R) {
26709 SmallVector<WeakTrackingVH> PostponedInsts;
26710 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
26711 Res |= tryToVectorize(PostponedInsts, R);
26712 return Res;
26713}
26714
26715bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
26716 BoUpSLP &R) {
26717 bool Res = false;
26718 for (Value *V : Insts)
26719 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
26720 Res |= tryToVectorize(Inst, R);
26721 return Res;
26722}
26723
26724bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
26725 BasicBlock *BB, BoUpSLP &R,
26726 bool MaxVFOnly) {
26727 if (!R.canMapToVector(IVI->getType()))
26728 return false;
26729
26730 SmallVector<Value *, 16> BuildVectorOpds;
26731 SmallVector<Value *, 16> BuildVectorInsts;
26732 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
26733 return false;
26734
26735 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
26736 R.getORE()->emit([&]() {
26737 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
26738 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
26739 "trying reduction first.";
26740 });
26741 return false;
26742 }
26743 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
26744 // An aggregate value is unlikely to be processed in a vector register.
26745 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
26746}
26747
26748bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
26749 BasicBlock *BB, BoUpSLP &R,
26750 bool MaxVFOnly) {
26751 SmallVector<Value *, 16> BuildVectorInsts;
26752 SmallVector<Value *, 16> BuildVectorOpds;
26753 SmallVector<int> Mask;
26754 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
26755 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
26756 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
26757 return false;
26758
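// For illustration (simplified, hypothetical IR), the buildvector sequence
// matched above has the shape:
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i32 1
//   ...
// The list collected from it is handed to tryToVectorizeList below.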
26759 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
26760 R.getORE()->emit([&]() {
26761 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
26762 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
26763 "trying reduction first.";
26764 });
26765 return false;
26766 }
26767 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
26768 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
26769}
26770
26771template <typename T>
26772 static bool tryToVectorizeSequence(
26773 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
26774 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
26775 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
26776 bool MaxVFOnly, BoUpSLP &R) {
26777 bool Changed = false;
26778 // Sort by type, parent, operands.
26779 stable_sort(Incoming, Comparator);
26780
26781 // Try to vectorize elements based on their type.
26782 SmallVector<T *> Candidates;
26783 SmallVector<T *> VL;
26784 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
26785 VL.clear()) {
26786 // Look for the next elements with the same type, parent and operand
26787 // kinds.
26788 auto *I = dyn_cast<Instruction>(*IncIt);
26789 if (!I || R.isDeleted(I)) {
26790 ++IncIt;
26791 continue;
26792 }
26793 auto *SameTypeIt = IncIt;
26794 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
26795 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26796 AreCompatible(VL, *SameTypeIt))) {
26797 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26798 ++SameTypeIt;
26799 if (I && !R.isDeleted(I))
26800 VL.push_back(cast<T>(I));
26801 }
26802
26803 // Try to vectorize them.
26804 unsigned NumElts = VL.size();
26805 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
26806 << NumElts << ")\n");
26807 // The vectorization is a 3-stage attempt:
26808 // 1. Try to vectorize instructions with the same/alternate opcodes, using
26809 // the size of the maximal register first.
26810 // 2. Try to vectorize the remaining instructions with the same type, if
26811 // possible. This may give better vectorization results than trying only
26812 // to vectorize instructions with the same/alternate opcodes.
26813 // 3. Finally, try to vectorize all instructions with the same/alternate
26814 // ops only; this may result in some extra final
26815 // vectorization.
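// For illustration (hypothetical sizes): with 6 compatible scalars and a
// register that fits 4 lanes, stage 1 may vectorize 4 of them at the maximal
// VF, stage 2 collects the leftovers of the same type into Candidates, and
// stage 3 retries those leftovers, possibly with a smaller VF.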
26816 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
26817 // Success; start over because instructions might have been changed.
26818 Changed = true;
26819 VL.swap(Candidates);
26820 Candidates.clear();
26821 for (T *V : VL) {
26822 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26823 Candidates.push_back(V);
26824 }
26825 } else {
26826 /// \Returns the minimum number of elements that we will attempt to
26827 /// vectorize.
26828 auto GetMinNumElements = [&R](Value *V) {
26829 unsigned EltSize = R.getVectorElementSize(V);
26830 return std::max(2U, R.getMaxVecRegSize() / EltSize);
26831 };
26832 if (NumElts < GetMinNumElements(*IncIt) &&
26833 (Candidates.empty() ||
26834 Candidates.front()->getType() == (*IncIt)->getType())) {
26835 for (T *V : VL) {
26836 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26837 Candidates.push_back(V);
26838 }
26839 }
26840 }
26841 // Final attempt to vectorize instructions with the same types.
26842 if (Candidates.size() > 1 &&
26843 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
26844 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
26845 // Success; start over because instructions might have been changed.
26846 Changed = true;
26847 } else if (MaxVFOnly) {
26848 // Try to vectorize using small vectors.
26849 SmallVector<T *> VL;
26850 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
26851 VL.clear()) {
26852 auto *I = dyn_cast<Instruction>(*It);
26853 if (!I || R.isDeleted(I)) {
26854 ++It;
26855 continue;
26856 }
26857 auto *SameTypeIt = It;
26858 while (SameTypeIt != End &&
26859 (!isa<Instruction>(*SameTypeIt) ||
26860 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26861 AreCompatible(*SameTypeIt, *It))) {
26862 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26863 ++SameTypeIt;
26864 if (I && !R.isDeleted(I))
26865 VL.push_back(cast<T>(I));
26866 }
26867 unsigned NumElts = VL.size();
26868 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26869 /*MaxVFOnly=*/false))
26870 Changed = true;
26871 It = SameTypeIt;
26872 }
26873 }
26874 Candidates.clear();
26875 }
26876
26877 // Start over at the next instruction of a different type (or the end).
26878 IncIt = SameTypeIt;
26879 }
26880 return Changed;
26881}
26882
26883/// Compare two cmp instructions. If IsCompatibility is true, the function
26884/// returns true if the two cmps have the same/swapped predicates and the most
26885/// compatible corresponding operands. If IsCompatibility is false, the function
26886/// implements a strict weak ordering between the two, returning true if the first
26887/// instruction is "less" than the second, i.e. its predicate is less than the
26888/// predicate of the second, or its operand IDs are less than the operand IDs
26889/// of the second cmp instruction.
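/// For illustration (hypothetical values): "icmp slt i32 %a, %b" and
/// "icmp sgt i32 %b, %a" have swapped predicates over the same operands, so
/// the compatibility mode treats them as a match, and the ordering mode
/// treats them as equivalent.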
26890template <bool IsCompatibility>
26891static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
26892 const DominatorTree &DT) {
26893 assert(isValidElementType(V->getType()) &&
26894 isValidElementType(V2->getType()) &&
26895 "Expected valid element types only.");
26896 if (V == V2)
26897 return IsCompatibility;
26898 auto *CI1 = cast<CmpInst>(V);
26899 auto *CI2 = cast<CmpInst>(V2);
26900 if (CI1->getOperand(0)->getType()->getTypeID() <
26901 CI2->getOperand(0)->getType()->getTypeID())
26902 return !IsCompatibility;
26903 if (CI1->getOperand(0)->getType()->getTypeID() >
26904 CI2->getOperand(0)->getType()->getTypeID())
26905 return false;
26906 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26907 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26908 return !IsCompatibility;
26909 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26910 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26911 return false;
26912 CmpInst::Predicate Pred1 = CI1->getPredicate();
26913 CmpInst::Predicate Pred2 = CI2->getPredicate();
26914 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
26915 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
26916 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
26917 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
26918 if (BasePred1 < BasePred2)
26919 return !IsCompatibility;
26920 if (BasePred1 > BasePred2)
26921 return false;
26922 // Compare operands.
26923 bool CI1Preds = Pred1 == BasePred1;
26924 bool CI2Preds = Pred2 == BasePred1;
26925 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26926 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26927 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
26928 if (Op1 == Op2)
26929 continue;
26930 if (Op1->getValueID() < Op2->getValueID())
26931 return !IsCompatibility;
26932 if (Op1->getValueID() > Op2->getValueID())
26933 return false;
26934 if (auto *I1 = dyn_cast<Instruction>(Op1))
26935 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
26936 if (IsCompatibility) {
26937 if (I1->getParent() != I2->getParent())
26938 return false;
26939 } else {
26940 // Try to compare nodes with same parent.
26941 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
26942 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
26943 if (!NodeI1)
26944 return NodeI2 != nullptr;
26945 if (!NodeI2)
26946 return false;
26947 assert((NodeI1 == NodeI2) ==
26948 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26949 "Different nodes should have different DFS numbers");
26950 if (NodeI1 != NodeI2)
26951 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26952 }
26953 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26954 if (S && (IsCompatibility || !S.isAltShuffle()))
26955 continue;
26956 if (IsCompatibility)
26957 return false;
26958 if (I1->getOpcode() != I2->getOpcode())
26959 return I1->getOpcode() < I2->getOpcode();
26960 }
26961 }
26962 return IsCompatibility;
26963}
26964
26965template <typename ItT>
26966bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26967 BasicBlock *BB, BoUpSLP &R) {
26968 bool Changed = false;
26969 // Try to find reductions first.
26970 for (CmpInst *I : CmpInsts) {
26971 if (R.isDeleted(I))
26972 continue;
26973 for (Value *Op : I->operands())
26974 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26975 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26976 if (R.isDeleted(I))
26977 break;
26978 }
26979 }
26980 // Try to vectorize operands as vector bundles.
26981 for (CmpInst *I : CmpInsts) {
26982 if (R.isDeleted(I))
26983 continue;
26984 Changed |= tryToVectorize(I, R);
26985 }
26986 // Try to vectorize list of compares.
26987 // Sort by type, compare predicate, etc.
26988 auto CompareSorter = [&](Value *V, Value *V2) {
26989 if (V == V2)
26990 return false;
26991 return compareCmp<false>(V, V2, *TLI, *DT);
26992 };
26993
26994 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
26995 if (VL.empty() || VL.back() == V1)
26996 return true;
26997 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
26998 };
26999
27001 for (Instruction *V : CmpInsts)
27002 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
27003 Vals.push_back(V);
27004 if (Vals.size() <= 1)
27005 return Changed;
27006 Changed |= tryToVectorizeSequence<Value>(
27007 Vals, CompareSorter, AreCompatibleCompares,
27008 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27009 // Exclude possible reductions from other blocks.
27010 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
27011 return any_of(V->users(), [V](User *U) {
27012 auto *Select = dyn_cast<SelectInst>(U);
27013 return Select &&
27014 Select->getParent() != cast<Instruction>(V)->getParent();
27015 });
27016 });
27017 if (ArePossiblyReducedInOtherBlock)
27018 return false;
27019 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27020 },
27021 /*MaxVFOnly=*/true, R);
27022 return Changed;
27023}
27024
27025bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
27026 BasicBlock *BB, BoUpSLP &R) {
27028 "This function only accepts Insert instructions");
27029 bool OpsChanged = false;
27030 SmallVector<WeakTrackingVH> PostponedInsts;
27031 for (auto *I : reverse(Instructions)) {
27032 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
27033 if (R.isDeleted(I) || isa<CmpInst>(I))
27034 continue;
27035 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27036 OpsChanged |=
27037 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
27038 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27039 OpsChanged |=
27040 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
27041 }
27042 // pass2 - try to vectorize reductions only
27043 if (R.isDeleted(I))
27044 continue;
27045 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
27046 if (R.isDeleted(I) || isa<CmpInst>(I))
27047 continue;
27048 // pass3 - try to match and vectorize a buildvector sequence.
27049 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27050 OpsChanged |=
27051 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
27052 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27053 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
27054 /*MaxVFOnly=*/false);
27055 }
27056 }
27057 // Now try to vectorize postponed instructions.
27058 OpsChanged |= tryToVectorize(PostponedInsts, R);
27059
27060 Instructions.clear();
27061 return OpsChanged;
27062}
27063
27064bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
27065 bool Changed = false;
27066 SmallVector<Value *, 4> Incoming;
27067 SmallPtrSet<Value *, 16> VisitedInstrs;
27068 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
27069 // node. This makes it easier to identify the chains that can be vectorized
27070 // in a better way.
27071 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
27072 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
27073 assert(isValidElementType(V1->getType()) &&
27074 isValidElementType(V2->getType()) &&
27075 "Expected vectorizable types only.");
27076 if (V1 == V2)
27077 return false;
27078 // It is fine to compare type IDs here, since we expect only vectorizable
27079 // types, like ints, floats and pointers; we don't care about other types.
27080 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
27081 return true;
27082 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
27083 return false;
27084 if (V1->getType()->getScalarSizeInBits() <
27085 V2->getType()->getScalarSizeInBits())
27086 return true;
27087 if (V1->getType()->getScalarSizeInBits() >
27088 V2->getType()->getScalarSizeInBits())
27089 return false;
27090 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27091 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27092 if (Opcodes1.size() < Opcodes2.size())
27093 return true;
27094 if (Opcodes1.size() > Opcodes2.size())
27095 return false;
27096 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27097 {
27098 // Instructions come first.
27099 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
27100 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
27101 if (I1 && I2) {
27102 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27103 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27104 if (!NodeI1)
27105 return NodeI2 != nullptr;
27106 if (!NodeI2)
27107 return false;
27108 assert((NodeI1 == NodeI2) ==
27109 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27110 "Different nodes should have different DFS numbers");
27111 if (NodeI1 != NodeI2)
27112 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27113 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
27114 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27115 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
27116 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
27117 if (!E1 || !E2)
27118 continue;
27119
27120 // Sort on ExtractElementInsts primarily by vector operands. Prefer
27121 // program order of the vector operands.
27122 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
27123 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
27124 if (V1 != V2) {
27125 if (V1 && !V2)
27126 return true;
27127 if (!V1 && V2)
27128 return false;
27129 DomTreeNodeBase<BasicBlock> *NodeI1 =
27130 DT->getNode(V1->getParent());
27131 DomTreeNodeBase<BasicBlock> *NodeI2 =
27132 DT->getNode(V2->getParent());
27133 if (!NodeI1)
27134 return NodeI2 != nullptr;
27135 if (!NodeI2)
27136 return false;
27137 assert((NodeI1 == NodeI2) ==
27138 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27139 "Different nodes should have different DFS numbers");
27140 if (NodeI1 != NodeI2)
27141 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27142 return V1->comesBefore(V2);
27143 }
27144 // If we have the same vector operand, try to sort by constant
27145 // index.
27146 std::optional<unsigned> Id1 = getExtractIndex(E1);
27147 std::optional<unsigned> Id2 = getExtractIndex(E2);
27148 // Bring constants to the top
27149 if (Id1 && !Id2)
27150 return true;
27151 if (!Id1 && Id2)
27152 return false;
27153 // First elements come first.
27154 if (Id1 && Id2)
27155 return *Id1 < *Id2;
27156
27157 continue;
27158 }
27159 if (I1->getOpcode() == I2->getOpcode())
27160 continue;
27161 return I1->getOpcode() < I2->getOpcode();
27162 }
27163 if (I1)
27164 return true;
27165 if (I2)
27166 return false;
27167 }
27168 {
27169 // Non-undef constants come next.
27170 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
27171 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
27172 if (C1 && C2)
27173 continue;
27174 if (C1)
27175 return true;
27176 if (C2)
27177 return false;
27178 }
27179 bool U1 = isa<UndefValue>(Opcodes1[I]);
27180 bool U2 = isa<UndefValue>(Opcodes2[I]);
27181 {
27182 // Non-constant non-instructions come next.
27183 if (!U1 && !U2) {
27184 auto ValID1 = Opcodes1[I]->getValueID();
27185 auto ValID2 = Opcodes2[I]->getValueID();
27186 if (ValID1 == ValID2)
27187 continue;
27188 if (ValID1 < ValID2)
27189 return true;
27190 if (ValID1 > ValID2)
27191 return false;
27192 }
27193 if (!U1)
27194 return true;
27195 if (!U2)
27196 return false;
27197 }
27198 // Undefs come last.
27199 assert(U1 && U2 && "The only thing left should be undef & undef.");
27200 }
27201 return false;
27202 };
27203 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
27204 Value *V1) {
27205 if (VL.empty() || V1 == VL.back())
27206 return true;
27207 Value *V2 = VL.back();
27208 if (V1->getType() != V2->getType())
27209 return false;
27210 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27211 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27212 if (Opcodes1.size() != Opcodes2.size())
27213 return false;
27214 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27215 // Undefs are compatible with any other value.
27216 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
27217 continue;
27218 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
27219 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
27220 if (R.isDeleted(I1) || R.isDeleted(I2))
27221 return false;
27222 if (I1->getParent() != I2->getParent())
27223 return false;
27224 if (getSameOpcode({I1, I2}, *TLI))
27225 continue;
27226 return false;
27227 }
27228 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
27229 continue;
27230 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
27231 return false;
27232 }
27233 return true;
27234 };
27235
27236 bool HaveVectorizedPhiNodes = false;
27237 do {
27238 // Collect the incoming values from the PHIs.
27239 Incoming.clear();
27240 for (Instruction &I : *BB) {
27241 auto *P = dyn_cast<PHINode>(&I);
27242 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
27243 break;
27244
27245 // No need to analyze deleted, vectorized and non-vectorizable
27246 // instructions.
27247 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
27248 isValidElementType(P->getType()))
27249 Incoming.push_back(P);
27250 }
27251
27252 if (Incoming.size() <= 1)
27253 break;
27254
27255 // Find the corresponding non-phi nodes for better matching when trying to
27256 // build the tree.
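// For illustration (hypothetical IR): for
//   %p = phi i32 [ %add, %bb1 ], [ %sub, %bb2 ]
// the collected operands are %add and %sub; chains of phis are walked
// through, so only non-phi values end up in the per-phi list.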
27257 for (Value *V : Incoming) {
27258 SmallVectorImpl<Value *> &Opcodes =
27259 PHIToOpcodes.try_emplace(V).first->getSecond();
27260 if (!Opcodes.empty())
27261 continue;
27262 SmallVector<Value *, 4> Nodes(1, V);
27263 SmallPtrSet<Value *, 4> Visited;
27264 while (!Nodes.empty()) {
27265 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
27266 if (!Visited.insert(PHI).second)
27267 continue;
27268 for (Value *V : PHI->incoming_values()) {
27269 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
27270 Nodes.push_back(PHI1);
27271 continue;
27272 }
27273 Opcodes.emplace_back(V);
27274 }
27275 }
27276 }
27277
27278 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
27279 Incoming, PHICompare, AreCompatiblePHIs,
27280 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27281 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27282 },
27283 /*MaxVFOnly=*/true, R);
27284 Changed |= HaveVectorizedPhiNodes;
27285 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
27286 auto *PHI = dyn_cast<PHINode>(P.first);
27287 return !PHI || R.isDeleted(PHI);
27288 }))
27289 PHIToOpcodes.clear();
27290 VisitedInstrs.insert_range(Incoming);
27291 } while (HaveVectorizedPhiNodes);
27292
27293 VisitedInstrs.clear();
27294
27295 InstSetVector PostProcessInserts;
27296 SmallSetVector<CmpInst *, 8> PostProcessCmps;
27297 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
27298 // also vectorizes `PostProcessCmps`.
27299 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
27300 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
27301 if (VectorizeCmps) {
27302 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
27303 PostProcessCmps.clear();
27304 }
27305 PostProcessInserts.clear();
27306 return Changed;
27307 };
27308 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
27309 auto IsInPostProcessInstrs = [&](Instruction *I) {
27310 if (auto *Cmp = dyn_cast<CmpInst>(I))
27311 return PostProcessCmps.contains(Cmp);
27312 return isa<InsertElementInst, InsertValueInst>(I) &&
27313 PostProcessInserts.contains(I);
27314 };
27315 // Returns true if `I` is an instruction without users, like a terminator, a
27316 // store, or a function call with an ignored return value. Unused instructions
27317 // are detected by instruction type, except for CallInst and InvokeInst.
27318 auto HasNoUsers = [](Instruction *I) {
27319 return I->use_empty() &&
27320 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
27321 };
27322 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
27323 // Skip instructions with scalable type. The num of elements is unknown at
27324 // compile-time for scalable type.
27325 if (isa<ScalableVectorType>(It->getType()))
27326 continue;
27327
27328 // Skip instructions marked for the deletion.
27329 if (R.isDeleted(&*It))
27330 continue;
27331 // We may go through BB multiple times, so skip the ones we have already checked.
27332 if (!VisitedInstrs.insert(&*It).second) {
27333 if (HasNoUsers(&*It) &&
27334 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
27335 // We would like to start over since some instructions are deleted
27336 // and the iterator may become invalid value.
27337 Changed = true;
27338 It = BB->begin();
27339 E = BB->end();
27340 }
27341 continue;
27342 }
27343
27344 // Try to vectorize reductions that use PHINodes.
27345 if (PHINode *P = dyn_cast<PHINode>(It)) {
27346 // Check that the PHI is a reduction PHI.
27347 if (P->getNumIncomingValues() == 2) {
27348 // Try to match and vectorize a horizontal reduction.
27349 Instruction *Root = getReductionInstr(DT, P, BB, LI);
27350 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
27351 Changed = true;
27352 It = BB->begin();
27353 E = BB->end();
27354 continue;
27355 }
27356 }
27357 // Try to vectorize the incoming values of the PHI, to catch reductions
27358 // that feed into PHIs.
27359 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
27360 // Skip if the incoming block is the current BB for now. Also, bypass
27361 // unreachable IR for efficiency and to avoid crashing.
27362 // TODO: Collect the skipped incoming values and try to vectorize them
27363 // after processing BB.
27364 if (BB == P->getIncomingBlock(I) ||
27365 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
27366 continue;
27367
27368 // Postponed instructions should not be vectorized here, delay their
27369 // vectorization.
27370 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
27371 PI && !IsInPostProcessInstrs(PI)) {
27372 bool Res =
27373 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
27374 Changed |= Res;
27375 if (Res && R.isDeleted(P)) {
27376 It = BB->begin();
27377 E = BB->end();
27378 break;
27379 }
27380 }
27381 }
27382 continue;
27383 }
27384
27385 if (HasNoUsers(&*It)) {
27386 bool OpsChanged = false;
27387 auto *SI = dyn_cast<StoreInst>(It);
27388 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
27389 if (SI) {
27390 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
27391 // Try to vectorize the chain feeding the store, if this is the only store to the
27392 // address in the block.
27393 // TODO: This is just a temporary solution to save compile time. Need
27394 // to investigate if we can safely turn on slp-vectorize-hor-store
27395 // instead to allow lookup for reduction chains in all non-vectorized
27396 // stores (need to check side effects and compile time).
27397 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
27398 SI->getValueOperand()->hasOneUse();
27399 }
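// For illustration (simplified, hypothetical IR): this covers the common case
//   store float %sum, ptr %p
// where %sum is the root of a reduction chain; the value operand is handed to
// vectorizeRootInstruction below as a candidate root.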
27400 if (TryToVectorizeRoot) {
27401 for (auto *V : It->operand_values()) {
27402 // Postponed instructions should not be vectorized here, delay their
27403 // vectorization.
27404 if (auto *VI = dyn_cast<Instruction>(V);
27405 VI && !IsInPostProcessInstrs(VI))
27406 // Try to match and vectorize a horizontal reduction.
27407 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
27408 }
27409 }
27410 // Start vectorization of post-process list of instructions from the
27411 // top-tree instructions to try to vectorize as many instructions as
27412 // possible.
27413 OpsChanged |=
27414 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
27415 if (OpsChanged) {
27416 // We would like to start over since some instructions are deleted
27417 // and the iterator may become invalid value.
27418 Changed = true;
27419 It = BB->begin();
27420 E = BB->end();
27421 continue;
27422 }
27423 }
27424
27426 PostProcessInserts.insert(&*It);
27427 else if (isa<CmpInst>(It))
27428 PostProcessCmps.insert(cast<CmpInst>(&*It));
27429 }
27430
27431 return Changed;
27432}
27433
27434bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
27435 auto Changed = false;
27436 for (auto &Entry : GEPs) {
27437 // If the getelementptr list has fewer than two elements, there's nothing
27438 // to do.
27439 if (Entry.second.size() < 2)
27440 continue;
27441
27442 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
27443 << Entry.second.size() << ".\n");
27444
27445 // Process the GEP list in chunks suitable for the target's supported
27446 // vector size. If a vector register can't hold 1 element, we are done. We
27447 // are trying to vectorize the index computations, so the maximum number of
27448 // elements is based on the size of the index expression, rather than the
27449 // size of the GEP itself (the target's pointer size).
27450 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
27451 return !R.isDeleted(GEP);
27452 });
27453 if (It == Entry.second.end())
27454 continue;
27455 unsigned MaxVecRegSize = R.getMaxVecRegSize();
27456 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
27457 if (MaxVecRegSize < EltSize)
27458 continue;
27459
27460 unsigned MaxElts = MaxVecRegSize / EltSize;
27461 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
27462 auto Len = std::min<unsigned>(BE - BI, MaxElts);
27463 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
27464
27465 // Initialize a set of candidate getelementptrs. Note that we use a
27466 // SetVector here to preserve program order. If the index computations
27467 // are vectorizable and begin with loads, we want to minimize the chance
27468 // of having to reorder them later.
27469 SetVector<Value *> Candidates(llvm::from_range, GEPList);
27470
27471 // Some of the candidates may have already been vectorized after we
27472 // initially collected them, or their index was optimized to a constant value.
27473 // If so, they are marked as deleted, so remove them from the set of
27474 // candidates.
27475 Candidates.remove_if([&R](Value *I) {
27476 return R.isDeleted(cast<Instruction>(I)) ||
27477 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
27478 });
27479
27480 // Remove from the set of candidates all pairs of getelementptrs with
27481 // constant differences. Such getelementptrs are likely not good
27482 // candidates for vectorization in a bottom-up phase since one can be
27483 // computed from the other. We also ensure all candidate getelementptr
27484 // indices are unique.
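// For illustration (hypothetical IR): "getelementptr i32, ptr %p, i64 %i" and
// "getelementptr i32, ptr %p, i64 %j" with %j = add i64 %i, 1 have a SCEV
// difference that folds to a constant (4 bytes), so the pair is dropped: one
// address is trivially derivable from the other.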
27485 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
27486 auto *GEPI = GEPList[I];
27487 if (!Candidates.count(GEPI))
27488 continue;
27489 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
27490 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
27491 auto *GEPJ = GEPList[J];
27492 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
27493 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
27494 Candidates.remove(GEPI);
27495 Candidates.remove(GEPJ);
27496 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
27497 Candidates.remove(GEPJ);
27498 }
27499 }
27500 }
27501
27502 // We break out of the above computation as soon as we know there are
27503 // fewer than two candidates remaining.
27504 if (Candidates.size() < 2)
27505 continue;
27506
27507 // Add the single, non-constant index of each candidate to the bundle. We
27508 // ensured the indices met these constraints when we originally collected
27509 // the getelementptrs.
27510 SmallVector<Value *, 16> Bundle(Candidates.size());
27511 auto BundleIndex = 0u;
27512 for (auto *V : Candidates) {
27513 auto *GEP = cast<GetElementPtrInst>(V);
27514 auto *GEPIdx = GEP->idx_begin()->get();
27515 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
27516 Bundle[BundleIndex++] = GEPIdx;
27517 }
27518
27519 // Try and vectorize the indices. We are currently only interested in
27520 // gather-like cases of the form:
27521 //
27522 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
27523 //
27524 // where the loads of "a", the loads of "b", and the subtractions can be
27525 // performed in parallel. It's likely that detecting this pattern in a
27526 // bottom-up phase will be simpler and less costly than building a
27527 // full-blown top-down phase beginning at the consecutive loads.
27528 Changed |= tryToVectorizeList(Bundle, R);
27529 }
27530 }
27531 return Changed;
27532}
27533
27534bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
27535 bool Changed = false;
27536 // Sort by type, base pointers and value operands. Value operands must be
27537 // compatible (have the same opcode, same parent), otherwise it is
27538 // definitely not profitable to try to vectorize them.
27539 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
27540 if (V->getValueOperand()->getType()->getTypeID() <
27541 V2->getValueOperand()->getType()->getTypeID())
27542 return true;
27543 if (V->getValueOperand()->getType()->getTypeID() >
27544 V2->getValueOperand()->getType()->getTypeID())
27545 return false;
27546 if (V->getPointerOperandType()->getTypeID() <
27547 V2->getPointerOperandType()->getTypeID())
27548 return true;
27549 if (V->getPointerOperandType()->getTypeID() >
27550 V2->getPointerOperandType()->getTypeID())
27551 return false;
27552 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
27553 V2->getValueOperand()->getType()->getScalarSizeInBits())
27554 return true;
27555 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
27556 V2->getValueOperand()->getType()->getScalarSizeInBits())
27557 return false;
27558 // UndefValues are compatible with all other values.
27559 auto *I1 = dyn_cast<Instruction>(V->getValueOperand());
27560 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
27561 if (I1 && I2) {
27562 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27563 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27564 assert(NodeI1 && "Should only process reachable instructions");
27565 assert(NodeI2 && "Should only process reachable instructions");
27566 assert((NodeI1 == NodeI2) ==
27567 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27568 "Different nodes should have different DFS numbers");
27569 if (NodeI1 != NodeI2)
27570 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27571 return I1->getOpcode() < I2->getOpcode();
27572 }
27573 if (I1 && !I2)
27574 return true;
27575 if (!I1 && I2)
27576 return false;
27577 return V->getValueOperand()->getValueID() <
27578 V2->getValueOperand()->getValueID();
27579 };
27580
27581 bool SameParent = true;
27582 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
27583 if (VL.empty()) {
27584 SameParent = true;
27585 return true;
27586 }
27587 StoreInst *V2 = VL.back();
27588 if (V1 == V2)
27589 return true;
27590 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
27591 return false;
27592 if (V1->getPointerOperandType() != V2->getPointerOperandType())
27593 return false;
27594 // Undefs are compatible with any other value.
27595 if (isa<UndefValue>(V1->getValueOperand()) ||
27597 return true;
27598 if (isa<Constant>(V1->getValueOperand()) &&
27600 return true;
27601 // Check if the operands of the stores can be vectorized. They can be
27602 // vectorized if they have compatible operands, or if their operands can
27603 // be vectorized as copyables.
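// For illustration (hypothetical values, relying on the slp-copyable-elements
// modelling): storing {%a + %b, %c} may still form a vectorizable pair if %c
// can be modelled as the idempotent %c + 0, matching the add in the other
// lane.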
27604 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
27605 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
27606 if (I1 || I2) {
27607 // Accept only tail-following non-compatible values for now.
27608 // TODO: investigate if it is possible to vectorize incompatible values,
27609 // if the copyables are first in the list.
27610 if (I1 && !I2)
27611 return false;
27612 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
27613 SmallVector<Value *> NewVL(VL.size() + 1);
27614 for (auto [SI, V] : zip(VL, NewVL))
27615 V = SI->getValueOperand();
27616 NewVL.back() = V1->getValueOperand();
27617 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
27618 InstructionsState S = Analysis.buildInstructionsState(
27619 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
27620 /*SkipSameCodeCheck=*/!SameParent);
27621 if (S)
27622 return true;
27623 if (!SameParent)
27624 return false;
27625 }
27626 return V1->getValueOperand()->getValueID() ==
27627 V2->getValueOperand()->getValueID();
27628 };
27629
27630 // Attempt to sort and vectorize each of the store-groups.
27631 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
27632 for (auto &Pair : Stores) {
27633 if (Pair.second.size() < 2)
27634 continue;
27635
27636 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
27637 << Pair.second.size() << ".\n");
27638
27639 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
27640 continue;
27641
27642 // Reverse stores to do bottom-to-top analysis. This is important if the
27643 // same addresses are stored to several times; in this case we need to
27644 // follow the store order (reversed to meet the memory dependencies).
27645 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
27646 Pair.second.rend());
27647 Changed |= tryToVectorizeSequence<StoreInst>(
27648 ReversedStores, StoreSorter, AreCompatibleStores,
27649 [&](ArrayRef<StoreInst *> Candidates, bool) {
27650 return vectorizeStores(Candidates, R, Attempted);
27651 },
27652 /*MaxVFOnly=*/false, R);
27653 }
27654 return Changed;
27655}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
static Type * getIndexType(Value *In)
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers PointerOps represents strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector containing only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
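As an illustration of how an element order and a shuffle mask relate, the following is a hypothetical sketch (not this file's implementation) of what an inverse-permutation helper of this shape typically computes: for Indices = {2, 0, 1} it produces Mask = {1, 2, 0}, so that Mask[Indices[I]] == I.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
// Hypothetical sketch: build the mask that undoes the given element order.
static void inversePermutationSketch(llvm::ArrayRef<unsigned> Indices,
                                     llvm::SmallVectorImpl<int> &Mask) {
  Mask.assign(Indices.size(), -1); // -1 marks an unset (poison) lane
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = static_cast<int>(I);
}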
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1415
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1549
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1339
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1677
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1405
void negate()
Negate this APInt in place.
Definition APInt.h:1477
unsigned logBase2() const
Definition APInt.h:1770
void setAllBits()
Set every bit to 1.
Definition APInt.h:1328
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1376
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
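Illustrative sketch (hypothetical 8-lane demanded-elements mask; not code from this file) showing a few of the APInt operations listed above.
#include "llvm/ADT/APInt.h"
static bool demandedLanesDemo() {
  llvm::APInt Demanded = llvm::APInt::getAllOnes(/*numBits=*/8); // all lanes demanded
  Demanded.clearBit(3);                                          // lane 3 no longer demanded
  llvm::APInt One = llvm::APInt::getOneBitSet(/*numBits=*/8, /*BitNo=*/0);
  return !Demanded.isAllOnes() && One.isPowerOf2();              // true
}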
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:178
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:219
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:195
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:157
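Illustrative sketch (hypothetical helper) of the ArrayRef slicing operations listed above.
#include "llvm/ADT/ArrayRef.h"
static int sumOfHeadAndNextTwo(llvm::ArrayRef<int> VL) {
  if (VL.empty())
    return 0;
  int Sum = VL.front();                       // first element
  llvm::ArrayRef<int> Rest = VL.drop_front(); // everything after it
  for (int V : Rest.take_front(2))            // at most two more elements
    Sum += V;
  return Sum;
}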
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:488
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:491
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:718
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
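Illustrative sketch (hypothetical helper, not code from this file) of the predicate queries listed above.
#include "llvm/IR/InstrTypes.h"
static void predicateDemo() {
  llvm::CmpInst::Predicate P = llvm::CmpInst::ICMP_SLT;
  // Swapping exchanges the operands' roles: slt becomes sgt.
  llvm::CmpInst::Predicate Swapped = llvm::CmpInst::getSwappedPredicate(P);
  // Inverting negates the comparison: slt becomes sge.
  llvm::CmpInst::Predicate Inverse = llvm::CmpInst::getInversePredicate(P);
  (void)Swapped;
  (void)Inverse;
}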
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static bool shouldExecute(CounterInfo &Counter)
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
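Illustrative sketch (hypothetical map and keys) of the DenseMap operations listed above.
#include "llvm/ADT/DenseMap.h"
static unsigned rememberFirstIndex(llvm::DenseMap<int, unsigned> &FirstIndex,
                                   int Key, unsigned Idx) {
  // Inserts only if Key is not present yet; returns the index that won.
  auto Res = FirstIndex.try_emplace(Key, Idx);
  return Res.first->second;
}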
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2553
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2619
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2175
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2575
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2248
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2410
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
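Illustrative sketch (hypothetical helper; assumes A and B are scalars of the same element type) that packs two scalars into a 2-lane vector and swaps the lanes, using the IRBuilderBase calls listed above.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
static llvm::Value *buildSwappedPair(llvm::IRBuilderBase &Builder,
                                     llvm::Value *A, llvm::Value *B) {
  auto *VecTy = llvm::FixedVectorType::get(A->getType(), /*NumElts=*/2);
  llvm::Value *Vec = llvm::PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, A, Builder.getInt32(0));
  Vec = Builder.CreateInsertElement(Vec, B, Builder.getInt32(1));
  // Shuffle with mask {1, 0} to swap the two lanes.
  return Builder.CreateShuffleVector(Vec, llvm::PoisonValue::get(VecTy),
                                     llvm::ArrayRef<int>{1, 0});
}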
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:154
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
T & front() const
front - Get the first element.
Definition ArrayRef.h:349
iterator end() const
Definition ArrayRef.h:343
iterator begin() const
Definition ArrayRef.h:342
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
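Illustrative sketch (hypothetical helper) of the SCEV calls listed above, in the spirit of how consecutive-pointer checks compute the distance between two addresses.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"
static const llvm::SCEV *pointerDistance(llvm::ScalarEvolution &SE,
                                         llvm::Value *PtrA, llvm::Value *PtrB) {
  const llvm::SCEV *A = SE.getSCEV(PtrA);
  const llvm::SCEV *B = SE.getSCEV(PtrB);
  // B - A; if this folds to a constant, the two pointers have a fixed gap.
  return SE.getMinusSCEV(B, A);
}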
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:91
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:132
void insert_range(Range &&R)
Definition SetVector.h:176
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:94
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
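Illustrative sketch (hypothetical helper) of the static shuffle-mask classifiers listed above.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
static bool isIdentityOrReverse(llvm::ArrayRef<int> Mask, int NumSrcElts) {
  // e.g. Mask = {0, 1, 2, 3} is an identity; {3, 2, 1, 0} is a reverse.
  return llvm::ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts) ||
         llvm::ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
}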
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
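Illustrative sketch (hypothetical lane set) of the SmallBitVector queries listed above.
#include "llvm/ADT/SmallBitVector.h"
static unsigned countMarkedLanes(unsigned NumLanes) {
  llvm::SmallBitVector Marked(NumLanes); // all bits start cleared
  if (NumLanes > 2) {
    Marked.set(0);                       // hypothetically mark lanes 0 and 2
    Marked.set(2);
  }
  unsigned Visited = 0;
  // Walk only the set bits.
  for (int I = Marked.find_first(); I != -1; I = Marked.find_next(I))
    ++Visited;                           // equals Marked.count()
  return Visited;
}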
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
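Illustrative sketch (hypothetical function; the <4 x i32> type is an assumption, and the calls follow the signatures shown above) of a couple of the TTI cost queries.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
static llvm::InstructionCost
costOfReversedVectorAdd(const llvm::TargetTransformInfo &TTI,
                        llvm::LLVMContext &Ctx) {
  auto *VecTy =
      llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), /*NumElts=*/4);
  constexpr auto CostKind = llvm::TargetTransformInfo::TCK_RecipThroughput;
  // Cost of a <4 x i32> add plus a reverse shuffle of its result.
  llvm::InstructionCost Cost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy, CostKind);
  Cost += TTI.getShuffleCost(llvm::TargetTransformInfo::SK_Reverse, VecTy,
                             VecTy, /*Mask=*/{}, CostKind);
  return Cost;
}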
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:180
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:285
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:233
unsigned getNumOperands() const
Definition User.h:255
iterator_range< value_op_iterator > operand_values()
Definition User.h:317
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
bool hasUseList() const
Check if this Value has a use-list.
Definition Value.h:344
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1106
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
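Illustrative sketch (hypothetical predicate) of the use-list queries listed above.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
static bool hasSingleStoreUser(llvm::Value *V) {
  // Exactly one use, and that user is a store instruction.
  return V->hasOneUse() && llvm::isa<llvm::StoreInst>(V->user_back());
}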
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
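For orientation, the following is a schematic sketch of how a driver might exercise this interface, based only on the member signatures listed above; the real pass interleaves additional reordering, transformation, and threshold logic. It is written as if inside SLPVectorizer.cpp (where BoUpSLP is visible, so no separate includes are shown), and the comparison against zero is a stand-in for the real cost threshold.
// Schematic only; not the pass's actual control flow.
static bool tryVectorizeSketch(BoUpSLP &R, ArrayRef<Value *> Roots,
                               const SmallDenseSet<Value *> &Ignored) {
  R.buildTree(Roots, Ignored);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
  InstructionCost Cost = R.getTreeCost(TreeCost);
  if (!Cost.isValid() || Cost >= 0) // stand-in for the real cost threshold
    return false;
  R.vectorizeTree();
  return true;
}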
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
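The PatternMatch helpers above all compose through match(). A hedged, hypothetical example (not code from this pass) of how such patterns are typically written, given an arbitrary Value *V:
  using namespace llvm::PatternMatch;
  Value *Ptr = nullptr;
  const APInt *C = nullptr;
  // Matches "add (load %Ptr), <constant>", where the load has a single use,
  // capturing the pointer operand and the constant.
  bool Matched = match(V, m_Add(m_OneUse(m_Load(m_Value(Ptr))), m_APInt(C)));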
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
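As a brief illustration (assuming an IRBuilder Builder positioned at the insertion point and a vector value Src), a horizontal integer-add reduction is emitted with a single call:
  Value *Rdx = createSimpleReduction(Builder, Src, RecurKind::Add);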
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
Zip iterator for two or more iterable types.
Definition STLExtras.h:829
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2170
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
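Illustrative use of the range-based predicates over a hypothetical scalar bundle VL (all_of here; any_of and none_of, listed further below, follow the same shape):
  bool AllLoads = all_of(VL, IsaPred<LoadInst>);
  bool AnyConstant = any_of(VL, [](Value *V) { return isa<Constant>(V); });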
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
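A minimal sketch, assuming a CallInst *CI and the TargetLibraryInfo *TLI from the pass state, of deciding whether a call can be widened as an intrinsic:
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  bool Widenable = ID != Intrinsic::not_intrinsic && isTriviallyVectorizable(ID);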
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
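Both zip and enumerate support structured bindings; a hypothetical sketch over a bundle VL and two equally sized operand lists:
  for (auto [Idx, V] : enumerate(VL))
    LLVM_DEBUG(dbgs() << "Lane " << Idx << ": " << *V << "\n");
  for (auto [L, R] : zip(LeftOperands, RightOperands))
    assert(L->getType() == R->getType() && "operands must match per lane");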
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
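The casting helpers above follow the usual LLVM idiom; a small hedged sketch over an arbitrary Value *V:
  if (auto *LI = dyn_cast<LoadInst>(V))            // null if V is not a LoadInst
    (void)LI->getPointerOperand();
  if (isa<StoreInst>(V))                           // type test only
    (void)cast<StoreInst>(V)->getValueOperand();   // cast asserts on mismatch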
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1731
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2303
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:2029
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
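A small worked example of the power-of-two helpers as they are typically used when picking a vectorization factor (NumScalars is a hypothetical lane count):
  unsigned VF = PowerOf2Ceil(NumScalars);        // NumScalars == 6  ->  VF == 8
  bool AlreadyPow2 = has_single_bit(NumScalars); // 6 -> false, 8 -> true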
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2016
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
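For example, a stride-2 mask over 4 lanes starting at element 1 selects elements {1, 3, 5, 7} of the source vector:
  SmallVector<int, 16> Mask = createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);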
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1775
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:361
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
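A hedged sketch of the common consecutiveness check, assuming two loads L0 and L1 of the same element type and the DL/SE analyses from the pass state:
  std::optional<int64_t> Diff =
      getPointersDiff(L0->getType(), L0->getPointerOperand(), L1->getType(),
                      L1->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  bool Consecutive = Diff && *Diff == 1;           // exactly one element apart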
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1968
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
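A minimal sketch, assuming a pointer bundle PointerOps of element type ScalarTy:
  SmallVector<unsigned> SortedIndices;
  if (sortPtrAccesses(PointerOps, ScalarTy, DL, SE, SortedIndices)) {
    // On success, SortedIndices describes the memory order of the pointers.
  }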
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
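A brief sketch of how these two helpers are typically paired after a vector instruction VecI has been emitted for a scalar bundle VL:
  propagateIRFlags(VecI, VL);                      // intersect nsw/nuw/fast-math flags
  if (auto *I = dyn_cast<Instruction>(VecI))
    propagateMetadata(I, VL);                      // intersect tbaa, noalias, fpmath, etc.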
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Add
Sum of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
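A hedged sketch of the usual signed-narrowing test built on this helper (OrigBits, NewBits, and the context arguments are assumptions):
  unsigned SignBits = ComputeNumSignBits(V, DL, AC, CtxI, DT);
  // V fits in NewBits signed bits when all dropped top bits are sign copies.
  bool FitsSigned = SignBits >= OrigBits - NewBits + 1;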
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1883
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
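hash_combine is the usual building block for DenseMapInfo specializations such as the BoUpSLP::EdgeInfo one listed further below; an illustrative (not necessarily the actual) implementation:
  static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
    return hash_combine(Val.UserTE, Val.EdgeIdx);
  }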
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
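With the GraphTraits and DOTGraphTraits specializations above in place, the SLP graph of a BoUpSLP instance R can be rendered through the generic GraphWriter machinery (illustrative debugging call, not code from this pass):
  ViewGraph(&R, "slp-tree", /*ShortNames=*/false, "SLP vectorizable tree");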
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1437
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1446
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)