1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115
116DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
117 "Controls which SLP graphs should be vectorized.");
118
119static cl::opt<bool>
120 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
121 cl::desc("Run the SLP vectorization passes"));
122
123static cl::opt<bool>
124 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
125 cl::desc("Enable vectorization for wider vector utilization"));
126
127static cl::opt<int>
128 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
129 cl::desc("Only vectorize if you gain more than this "
130 "number "));
131
132static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
133 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
134 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
136
137static cl::opt<bool>
138ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
139 cl::desc("Attempt to vectorize horizontal reductions"));
140
141static cl::opt<bool> ShouldStartVectorizeHorAtStore(
142 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
143 cl::desc(
144 "Attempt to vectorize horizontal reductions feeding into a store"));
145
146static cl::opt<bool> SplitAlternateInstructions(
147 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
148 cl::desc("Improve the code quality by splitting alternate instructions"));
149
150static cl::opt<int>
151 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
152 cl::desc("Attempt to vectorize for this register size in bits"));
153
154static cl::opt<unsigned>
155 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
156 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
157
158/// Limits the size of scheduling regions in a block.
159/// It avoids long compile times for _very_ large blocks where vector
160/// instructions are spread over a wide range.
161/// This limit is way higher than needed by real-world functions.
162static cl::opt<int>
163ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
164 cl::desc("Limit the size of the SLP scheduling region per block"));
165
166static cl::opt<int> MinVectorRegSizeOption(
167 "slp-min-reg-size", cl::init(128), cl::Hidden,
168 cl::desc("Attempt to vectorize for this register size in bits"));
169
170static cl::opt<unsigned> RecursionMaxDepth(
171 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
172 cl::desc("Limit the recursion depth when building a vectorizable tree"));
173
174static cl::opt<unsigned> MinTreeSize(
175 "slp-min-tree-size", cl::init(3), cl::Hidden,
176 cl::desc("Only vectorize small trees if they are fully vectorizable"));
177
178// The maximum depth that the look-ahead score heuristic will explore.
179// The higher this value, the higher the compilation time overhead.
180static cl::opt<int> LookAheadMaxDepth(
181 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
182 cl::desc("The maximum look-ahead depth for operand reordering scores"));
183
184// The maximum depth that the look-ahead score heuristic will explore
185// when probing among candidates for vectorization tree roots.
186// The higher this value, the higher the compilation time overhead, but unlike
187// the similar limit for operand reordering this is less frequently used, hence
188// the impact of a higher value is less noticeable.
189static cl::opt<int> RootLookAheadMaxDepth(
190 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
191 cl::desc("The maximum look-ahead depth for searching best rooting option"));
192
193static cl::opt<unsigned> MinProfitableStridedLoads(
194 "slp-min-strided-loads", cl::init(2), cl::Hidden,
195 cl::desc("The minimum number of loads, which should be considered strided, "
196 "if the stride is > 1 or is runtime value"));
197
198static cl::opt<unsigned> MaxProfitableLoadStride(
199 "slp-max-stride", cl::init(8), cl::Hidden,
200 cl::desc("The maximum stride, considered to be profitable."));
201
202static cl::opt<bool>
203 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
204 cl::desc("Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
206
207static cl::opt<bool>
208 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
209 cl::desc("Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
211
212static cl::opt<bool>
213 ViewSLPTree("view-slp-tree", cl::Hidden,
214 cl::desc("Display the SLP trees with Graphviz"));
215
216static cl::opt<bool> VectorizeNonPowerOf2(
217 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
218 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
219
220/// Enables vectorization of copyable elements.
221static cl::opt<bool> VectorizeCopyableElements(
222 "slp-copyable-elements", cl::init(true), cl::Hidden,
223 cl::desc("Try to replace values with the idempotent instructions for "
224 "better vectorization."));
225
226// Limit the number of alias checks. The limit is chosen so that
227// it has no negative effect on the llvm benchmarks.
228static const unsigned AliasedCheckLimit = 10;
229
230// Limit of the number of uses for potentially transformed instructions/values,
231// used in checks to avoid compile-time explosion.
232static constexpr int UsesLimit = 64;
233
234// Another limit for the alias checks: The maximum distance between load/store
235// instructions where alias checks are done.
236// This limit is useful for very large basic blocks.
237static const unsigned MaxMemDepDistance = 160;
238
239/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
240/// regions to be handled.
241static const int MinScheduleRegionSize = 16;
242
243/// Maximum allowed number of operands in the PHI nodes.
244static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important things to filter here are types which are invalid in LLVM
249/// vectors. We also filter target specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
255 if (SLPReVec && isa<FixedVectorType>(Ty))
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
260
261/// Returns the type of the given value/instruction \p V. If it is a store,
262/// returns the type of its value operand; for Cmp - the type of the compare
263/// operands; and for insertelement - the type of the inserted operand.
264/// Otherwise, just the type of the value is returned.
265static Type *getValueType(Value *V) {
266 if (auto *SI = dyn_cast<StoreInst>(V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(V))
269 return CI->getOperand(0)->getType();
270 if (auto *IE = dyn_cast<InsertElementInst>(V))
271 return IE->getOperand(1)->getType();
272 return V->getType();
273}
274
275/// \returns the number of elements for Ty.
276static unsigned getNumElements(Type *Ty) {
277 assert(!isa<ScalableVectorType>(Ty) &&
278 "ScalableVectorType is not supported.");
279 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
280 return VecTy->getNumElements();
281 return 1;
282}
283
284/// \returns the vector type of ScalarTy based on vectorization factor.
285static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
286 return FixedVectorType::get(ScalarTy->getScalarType(),
287 VF * getNumElements(ScalarTy));
288}
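// Worked example of getWidenedType: for ScalarTy = i32 and VF = 4 the result
// is <4 x i32>. Under REVEC, where ScalarTy may itself be a vector, e.g.
// ScalarTy = <4 x i8> and VF = 2, getNumElements(ScalarTy) = 4 and the
// widened type becomes <8 x i8>.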
289
290/// Returns the number of elements of the given type \p Ty, not less than \p
291/// Sz, which forms a type that \p TTI splits into whole vector types during
292/// legalization.
293static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
294 Type *Ty, unsigned Sz) {
295 if (!isValidElementType(Ty))
296 return bit_ceil(Sz);
297 // Find the number of elements, which forms full vectors.
298 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
299 if (NumParts == 0 || NumParts >= Sz)
300 return bit_ceil(Sz);
301 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
302}
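// Illustrative example (assuming a target whose TTI splits <11 x i32> into 3
// parts): for Ty = i32 and Sz = 11, NumParts = 3, divideCeil(11, 3) = 4 and
// bit_ceil(4) = 4, so the result is 4 * 3 = 12 elements, i.e. the smallest
// count >= Sz that still fills whole registers.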
303
304/// Returns the number of elements of the given type \p Ty, not greater than
305/// \p Sz, which forms a type that \p TTI splits into whole vector types during
306/// legalization.
307static unsigned
308getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
309 unsigned Sz) {
310 if (!isValidElementType(Ty))
311 return bit_floor(Sz);
312 // Find the number of elements, which forms full vectors.
313 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
314 if (NumParts == 0 || NumParts >= Sz)
315 return bit_floor(Sz);
316 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
317 if (RegVF > Sz)
318 return bit_floor(Sz);
319 return (Sz / RegVF) * RegVF;
320}
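// Illustrative example (same assumption of 3 parts for <11 x i32>): for Sz =
// 11, RegVF = bit_ceil(divideCeil(11, 3)) = 4, so the result is (11 / 4) * 4 =
// 8 elements, i.e. the largest count <= Sz made of whole registers.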
321
322static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
323 SmallVectorImpl<int> &Mask) {
324 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
325 // But the element has a different meaning for SLP (scalar) and REVEC
326 // (vector). We need to expand Mask into masks which shufflevector can use
327 // directly.
328 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
329 for (unsigned I : seq<unsigned>(Mask.size()))
330 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
331 I * VecTyNumElements, VecTyNumElements)))
332 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
333 : Mask[I] * VecTyNumElements + J;
334 Mask.swap(NewMask);
335}
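// Worked example: with VecTyNumElements = 2 and Mask = [1, 0], each scalar
// mask element is expanded to a whole sub-vector, giving NewMask =
// [2, 3, 0, 1]; a PoisonMaskElem entry expands to PoisonMaskElem in every
// slot of its sub-vector.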
336
337/// \returns the number of groups of shufflevectors.
338/// A group has the following features:
339/// 1. All of the values in a group are shufflevectors.
340/// 2. The mask of every shufflevector is an isExtractSubvectorMask.
341/// 3. Together, the masks of all shufflevectors use all elements of the source.
342/// e.g., it is 1 group (%0)
343/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
344/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
345/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
346/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
347/// it is 2 groups (%3 and %4)
348/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
351/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
352/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
353/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
354/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
355/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
356/// it is 0 groups
357/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
358/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
359/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
360/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
362 if (VL.empty())
363 return 0;
364 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
365 return 0;
366 auto *SV = cast<ShuffleVectorInst>(VL.front());
367 unsigned SVNumElements =
368 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
369 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
370 if (SVNumElements % ShuffleMaskSize != 0)
371 return 0;
372 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
373 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
374 return 0;
375 unsigned NumGroup = 0;
376 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
377 auto *SV = cast<ShuffleVectorInst>(VL[I]);
378 Value *Src = SV->getOperand(0);
379 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
380 SmallBitVector ExpectedIndex(GroupSize);
381 if (!all_of(Group, [&](Value *V) {
382 auto *SV = cast<ShuffleVectorInst>(V);
383 // From the same source.
384 if (SV->getOperand(0) != Src)
385 return false;
386 int Index;
387 if (!SV->isExtractSubvectorMask(Index))
388 return false;
389 ExpectedIndex.set(Index / ShuffleMaskSize);
390 return true;
391 }))
392 return 0;
393 if (!ExpectedIndex.all())
394 return 0;
395 ++NumGroup;
396 }
397 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
398 return NumGroup;
399}
400
401/// \returns a shufflevector mask which is used to vectorize shufflevectors
402/// e.g.,
403/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
408/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
409/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
410/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
411/// the result is
412/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
414 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
415 auto *SV = cast<ShuffleVectorInst>(VL.front());
416 unsigned SVNumElements =
417 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
418 SmallVector<int> Mask;
419 unsigned AccumulateLength = 0;
420 for (Value *V : VL) {
421 auto *SV = cast<ShuffleVectorInst>(V);
422 for (int M : SV->getShuffleMask())
423 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
424 : AccumulateLength + M);
425 AccumulateLength += SVNumElements;
426 }
427 return Mask;
428}
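// For the example in the comment above, AccumulateLength advances by 8 (the
// width of each source vector) per input shuffle, so the four masks [0..3],
// [4..7], [0..3] and [4..7] become [0..3], [12..15], [16..19] and [28..31] in
// the combined mask.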
429
430/// \returns True if the value is a constant (but not globals/constant
431/// expressions).
432static bool isConstant(Value *V) {
433 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
434}
435
436/// Checks if \p V is one of vector-like instructions, i.e. undef,
437/// insertelement/extractelement with constant indices for fixed vector type or
438/// extractvalue instruction.
439static bool isVectorLikeInstWithConstOps(Value *V) {
440 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
441 !isa<ExtractValueInst, UndefValue>(V))
442 return false;
443 auto *I = dyn_cast<Instruction>(V);
444 if (!I || isa<ExtractValueInst>(I))
445 return true;
446 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
447 return false;
448 if (isa<ExtractElementInst>(I))
449 return isConstant(I->getOperand(1));
450 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
451 return isConstant(I->getOperand(2));
452}
453
454/// Returns power-of-2 number of elements in a single register (part), given the
455/// total number of elements \p Size and number of registers (parts) \p
456/// NumParts.
457static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
458 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
459}
460
461/// Returns correct remaining number of elements, considering total amount \p
462/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
463/// and current register (part) \p Part.
464static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
465 unsigned Part) {
466 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
467}
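// Worked example: for Size = 10 and NumParts = 3, getPartNumElems returns
// min(10, bit_ceil(divideCeil(10, 3))) = 4, and getNumElems then yields 4, 4
// and 2 elements for parts 0, 1 and 2 respectively.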
468
469#if !defined(NDEBUG)
470/// Print a short descriptor of the instruction bundle suitable for debug output.
471static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
472 std::string Result;
473 raw_string_ostream OS(Result);
474 if (Idx >= 0)
475 OS << "Idx: " << Idx << ", ";
476 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
477 return Result;
478}
479#endif
480
481/// \returns true if all of the instructions in \p VL are in the same block or
482/// false otherwise.
483static bool allSameBlock(ArrayRef<Value *> VL) {
484 auto *It = find_if(VL, IsaPred<Instruction>);
485 if (It == VL.end())
486 return false;
487 Instruction *I0 = cast<Instruction>(*It);
488 if (all_of(VL, isVectorLikeInstWithConstOps))
489 return true;
490
491 BasicBlock *BB = I0->getParent();
492 for (Value *V : iterator_range(It, VL.end())) {
493 if (isa<PoisonValue>(V))
494 continue;
495 auto *II = dyn_cast<Instruction>(V);
496 if (!II)
497 return false;
498
499 if (BB != II->getParent())
500 return false;
501 }
502 return true;
503}
504
505/// \returns True if all of the values in \p VL are constants (but not
506/// globals/constant expressions).
507static bool allConstant(ArrayRef<Value *> VL) {
508 // Constant expressions and globals can't be vectorized like normal integer/FP
509 // constants.
510 return all_of(VL, isConstant);
511}
512
513/// \returns True if all of the values in \p VL are identical or some of them
514/// are UndefValue.
515static bool isSplat(ArrayRef<Value *> VL) {
516 Value *FirstNonUndef = nullptr;
517 for (Value *V : VL) {
518 if (isa<UndefValue>(V))
519 continue;
520 if (!FirstNonUndef) {
521 FirstNonUndef = V;
522 continue;
523 }
524 if (V != FirstNonUndef)
525 return false;
526 }
527 return FirstNonUndef != nullptr;
528}
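// For example, {%x, undef, %x, %x} is a splat of %x, while {undef, undef} is
// not a splat because it contains no non-undef value.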
529
530/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
531/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
532/// patterns that make it effectively commutative (like equality comparisons
533/// with zero).
534/// In most cases, users should not call this function directly (since \p I and
535/// \p ValWithUses are the same). However, when analyzing interchangeable
536/// instructions, we need to use the converted opcode along with the original
537/// uses.
538/// \param I The instruction to check for commutativity
539/// \param ValWithUses The value whose uses are analyzed for special
540/// patterns
541static bool isCommutative(Instruction *I, Value *ValWithUses,
542 bool IsCopyable = false) {
543 if (auto *Cmp = dyn_cast<CmpInst>(I))
544 return Cmp->isCommutative();
545 if (auto *BO = dyn_cast<BinaryOperator>(I))
546 return BO->isCommutative() ||
547 (BO->getOpcode() == Instruction::Sub &&
548 ValWithUses->hasUseList() &&
549 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
550 all_of(
551 ValWithUses->uses(),
552 [&](const Use &U) {
553 // Commutative, if icmp eq/ne sub, 0
554 CmpPredicate Pred;
555 if (match(U.getUser(),
556 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
557 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
558 return true;
559 // Commutative, if abs(sub nsw, true) or abs(sub, false).
560 ConstantInt *Flag;
561 auto *I = dyn_cast<BinaryOperator>(U.get());
562 return match(U.getUser(),
563 m_Intrinsic<Intrinsic::abs>(
564 m_Specific(U.get()), m_ConstantInt(Flag))) &&
565 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
566 Flag->isOne());
567 })) ||
568 (BO->getOpcode() == Instruction::FSub &&
569 ValWithUses->hasUseList() &&
570 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
571 all_of(ValWithUses->uses(), [](const Use &U) {
572 return match(U.getUser(),
573 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
574 }));
575 return I->isCommutative();
576}
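// For example, %d = sub i32 %a, %b is treated as commutative here when every
// use of %d is an icmp eq/ne against 0 (or a matching llvm.abs call, per the
// checks above): swapping %a and %b only negates %d, which such users cannot
// observe.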
577
578/// Checks if the operand is commutative. In commutative operations, not all
579/// operands might be commutable, e.g. for fmuladd only the first 2 operands
580/// are commutable.
581static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
582 bool IsCopyable = false) {
583 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
584 "The instruction is not commutative.");
585 if (isa<CmpInst>(I))
586 return true;
587 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
588 switch (BO->getOpcode()) {
589 case Instruction::Sub:
590 case Instruction::FSub:
591 return true;
592 default:
593 break;
594 }
595 }
596 return I->isCommutableOperand(Op);
597}
598
599/// This is a helper function to check whether \p I is commutative.
600/// This is a convenience wrapper that calls the two-parameter version of
601/// isCommutative with the same instruction for both parameters. This is
602/// the common case where the instruction being checked for commutativity
603/// is the same as the instruction whose uses are analyzed for special
604/// patterns (see the two-parameter version above for details).
605/// \param I The instruction to check for commutativity
606/// \returns true if the instruction is commutative, false otherwise
607static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
608
609/// \returns number of operands of \p I, considering commutativity. Returns 2
610/// for commutative intrinsics.
611/// \param I The instruction to check for commutativity
614 // IntrinsicInst::isCommutative returns true if swapping the first "two"
615 // arguments to the intrinsic produces the same result.
616 constexpr unsigned IntrinsicNumOperands = 2;
617 return IntrinsicNumOperands;
618 }
619 return I->getNumOperands();
620}
621
622template <typename T>
623static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
624 unsigned Offset) {
625 static_assert(std::is_same_v<T, InsertElementInst> ||
626 std::is_same_v<T, ExtractElementInst>,
627 "unsupported T");
628 int Index = Offset;
629 if (const auto *IE = dyn_cast<T>(Inst)) {
630 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
631 if (!VT)
632 return std::nullopt;
633 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
634 if (!CI)
635 return std::nullopt;
636 if (CI->getValue().uge(VT->getNumElements()))
637 return std::nullopt;
638 Index *= VT->getNumElements();
639 Index += CI->getZExtValue();
640 return Index;
641 }
642 return std::nullopt;
643}
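// For example, for Inst = insertelement <4 x i32> %v, i32 %x, i32 2 and
// Offset = 1, the returned index is 1 * 4 + 2 = 6.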
644
645/// \returns inserting or extracting index of InsertElement, ExtractElement or
646/// InsertValue instruction, using Offset as base offset for index.
647/// \returns std::nullopt if the index is not an immediate.
648static std::optional<unsigned> getElementIndex(const Value *Inst,
649 unsigned Offset = 0) {
650 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
651 return Index;
652 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
653 return Index;
654
655 int Index = Offset;
656
657 const auto *IV = dyn_cast<InsertValueInst>(Inst);
658 if (!IV)
659 return std::nullopt;
660
661 Type *CurrentType = IV->getType();
662 for (unsigned I : IV->indices()) {
663 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
664 Index *= ST->getNumElements();
665 CurrentType = ST->getElementType(I);
666 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
667 Index *= AT->getNumElements();
668 CurrentType = AT->getElementType();
669 } else {
670 return std::nullopt;
671 }
672 Index += I;
673 }
674 return Index;
675}
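// Worked example for the InsertValue path: for
// insertvalue {[2 x i32], i32} %agg, i32 %x, 0, 1
// the index is computed as (0 * 2 + 0) * 2 + 1 = 1, i.e. the position of the
// inserted element in the flattened aggregate {a[0], a[1], i32}.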
676
677/// \returns true if all of the values in \p VL use the same opcode.
678/// For comparison instructions, also checks if predicates match.
679/// PoisonValues are considered matching.
680/// Interchangeable instructions are not considered.
682 auto *It = find_if(VL, IsaPred<Instruction>);
683 if (It == VL.end())
684 return true;
685 Instruction *MainOp = cast<Instruction>(*It);
686 unsigned Opcode = MainOp->getOpcode();
687 bool IsCmpOp = isa<CmpInst>(MainOp);
688 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
689 : CmpInst::BAD_ICMP_PREDICATE;
690 return std::all_of(It, VL.end(), [&](Value *V) {
691 if (auto *CI = dyn_cast<CmpInst>(V))
692 return BasePred == CI->getPredicate();
693 if (auto *I = dyn_cast<Instruction>(V))
694 return I->getOpcode() == Opcode;
695 return isa<PoisonValue>(V);
696 });
697}
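// For example, {add, add, poison, add} satisfies this check, while
// {icmp slt, icmp sgt} does not, since the predicates differ.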
698
699namespace {
700/// Specifies the way the mask should be analyzed for undefs/poisonous elements
701/// in the shuffle mask.
702enum class UseMask {
703 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
704 ///< check for the mask elements for the first argument (mask
705 ///< indices are in range [0:VF)).
706 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
707 ///< for the mask elements for the second argument (mask indices
708 ///< are in range [VF:2*VF))
709 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
710 ///< future shuffle elements and mark them as ones as being used
711 ///< in future. Non-undef elements are considered as unused since
712 ///< they're already marked as used in the mask.
713};
714} // namespace
715
716/// Prepares a use bitset for the given mask either for the first argument or
717/// for the second.
718static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
719 UseMask MaskArg) {
720 SmallBitVector UseMask(VF, true);
721 for (auto [Idx, Value] : enumerate(Mask)) {
722 if (Value == PoisonMaskElem) {
723 if (MaskArg == UseMask::UndefsAsMask)
724 UseMask.reset(Idx);
725 continue;
726 }
727 if (MaskArg == UseMask::FirstArg && Value < VF)
728 UseMask.reset(Value);
729 else if (MaskArg == UseMask::SecondArg && Value >= VF)
730 UseMask.reset(Value - VF);
731 }
732 return UseMask;
733}
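// Worked example: for VF = 4 and Mask = [0, 5, poison, 2], the FirstArg use
// mask clears bits 0 and 2 (lanes of the first operand consumed by the mask),
// while the SecondArg use mask clears only bit 5 - 4 = 1.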
734
735/// Checks if the given value is actually an undefined constant vector.
736/// Also, if the \p UseMask is not empty, tries to check if the non-masked
737/// elements actually mask the insertelement buildvector, if any.
738template <bool IsPoisonOnly = false>
739static SmallBitVector isUndefVector(const Value *V,
740 const SmallBitVector &UseMask = {}) {
741 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
742 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
743 if (isa<T>(V))
744 return Res;
745 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
746 if (!VecTy)
747 return Res.reset();
748 auto *C = dyn_cast<Constant>(V);
749 if (!C) {
750 if (!UseMask.empty()) {
751 const Value *Base = V;
752 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
753 Base = II->getOperand(0);
754 if (isa<T>(II->getOperand(1)))
755 continue;
756 std::optional<unsigned> Idx = getElementIndex(II);
757 if (!Idx) {
758 Res.reset();
759 return Res;
760 }
761 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
762 Res.reset(*Idx);
763 }
764 // TODO: Add analysis for shuffles here too.
765 if (V == Base) {
766 Res.reset();
767 } else {
768 SmallBitVector SubMask(UseMask.size(), false);
769 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
770 }
771 } else {
772 Res.reset();
773 }
774 return Res;
775 }
776 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
777 if (Constant *Elem = C->getAggregateElement(I))
778 if (!isa<T>(Elem) &&
779 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
780 Res.reset(I);
781 }
782 return Res;
783}
784
785/// Checks if the vector of instructions can be represented as a shuffle, like:
786/// %x0 = extractelement <4 x i8> %x, i32 0
787/// %x3 = extractelement <4 x i8> %x, i32 3
788/// %y1 = extractelement <4 x i8> %y, i32 1
789/// %y2 = extractelement <4 x i8> %y, i32 2
790/// %x0x0 = mul i8 %x0, %x0
791/// %x3x3 = mul i8 %x3, %x3
792/// %y1y1 = mul i8 %y1, %y1
793/// %y2y2 = mul i8 %y2, %y2
794/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
795/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
796/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
797/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
798/// ret <4 x i8> %ins4
799/// can be transformed into:
800/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
801/// i32 6>
802/// %2 = mul <4 x i8> %1, %1
803/// ret <4 x i8> %2
804/// Mask will return the Shuffle Mask equivalent to the extracted elements.
805/// TODO: Can we split off and reuse the shuffle mask detection from
806/// ShuffleVectorInst/getShuffleCost?
807static std::optional<TargetTransformInfo::ShuffleKind>
808isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
809 AssumptionCache *AC) {
810 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
811 if (It == VL.end())
812 return std::nullopt;
813 unsigned Size =
814 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
815 auto *EI = dyn_cast<ExtractElementInst>(V);
816 if (!EI)
817 return S;
818 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
819 if (!VTy)
820 return S;
821 return std::max(S, VTy->getNumElements());
822 });
823
824 Value *Vec1 = nullptr;
825 Value *Vec2 = nullptr;
826 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
827 auto *EE = dyn_cast<ExtractElementInst>(V);
828 if (!EE)
829 return false;
830 Value *Vec = EE->getVectorOperand();
831 if (isa<UndefValue>(Vec))
832 return false;
833 return isGuaranteedNotToBePoison(Vec, AC);
834 });
835 enum ShuffleMode { Unknown, Select, Permute };
836 ShuffleMode CommonShuffleMode = Unknown;
837 Mask.assign(VL.size(), PoisonMaskElem);
838 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
839 // Undef can be represented as an undef element in a vector.
840 if (isa<UndefValue>(VL[I]))
841 continue;
842 auto *EI = cast<ExtractElementInst>(VL[I]);
843 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
844 return std::nullopt;
845 auto *Vec = EI->getVectorOperand();
846 // We can extractelement from undef or poison vector.
847 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
848 continue;
849 // All vector operands must have the same number of vector elements.
850 if (isa<UndefValue>(Vec)) {
851 Mask[I] = I;
852 } else {
853 if (isa<UndefValue>(EI->getIndexOperand()))
854 continue;
855 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
856 if (!Idx)
857 return std::nullopt;
858 // Undefined behavior if Idx is negative or >= Size.
859 if (Idx->getValue().uge(Size))
860 continue;
861 unsigned IntIdx = Idx->getValue().getZExtValue();
862 Mask[I] = IntIdx;
863 }
864 if (isUndefVector(Vec).all() && HasNonUndefVec)
865 continue;
866 // For correct shuffling we have to have at most 2 different vector operands
867 // in all extractelement instructions.
868 if (!Vec1 || Vec1 == Vec) {
869 Vec1 = Vec;
870 } else if (!Vec2 || Vec2 == Vec) {
871 Vec2 = Vec;
872 Mask[I] += Size;
873 } else {
874 return std::nullopt;
875 }
876 if (CommonShuffleMode == Permute)
877 continue;
878 // If the extract index is not the same as the operation number, it is a
879 // permutation.
880 if (Mask[I] % Size != I) {
881 CommonShuffleMode = Permute;
882 continue;
883 }
884 CommonShuffleMode = Select;
885 }
886 // If we're not crossing lanes in different vectors, consider it as blending.
887 if (CommonShuffleMode == Select && Vec2)
888 return TargetTransformInfo::SK_Select;
889 // If Vec2 was never used, we have a permutation of a single vector; otherwise
890 // we have a permutation of 2 vectors.
891 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
892 : TargetTransformInfo::SK_PermuteSingleSrc;
893}
894
895/// \returns True if Extract{Value,Element} instruction extracts element Idx.
896static std::optional<unsigned> getExtractIndex(const Instruction *E) {
897 unsigned Opcode = E->getOpcode();
898 assert((Opcode == Instruction::ExtractElement ||
899 Opcode == Instruction::ExtractValue) &&
900 "Expected extractelement or extractvalue instruction.");
901 if (Opcode == Instruction::ExtractElement) {
902 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
903 if (!CI)
904 return std::nullopt;
905 return CI->getZExtValue();
906 }
907 auto *EI = cast<ExtractValueInst>(E);
908 if (EI->getNumIndices() != 1)
909 return std::nullopt;
910 return *EI->idx_begin();
911}
912
913/// Checks if the provided value does not require scheduling. It does not
914/// require scheduling if this is not an instruction or it is an instruction
915/// that does not read/write memory and all operands are either non-instructions,
916/// phi nodes, or instructions from different blocks.
917static bool areAllOperandsNonInsts(Value *V);
918/// Checks if the provided value does not require scheduling. It does not
919/// require scheduling if this is not an instruction or it is an instruction
920/// that does not read/write memory and all users are phi nodes or instructions
921/// from different blocks.
922static bool isUsedOutsideBlock(Value *V);
923/// Checks if the specified value does not require scheduling. It does not
924/// require scheduling if all operands and all users do not need to be scheduled
925/// in the current basic block.
926static bool doesNotNeedToBeScheduled(Value *V);
927
928/// \returns true if \p Opcode is allowed as part of the main/alternate
929/// instruction for SLP vectorization.
930///
931/// Example of unsupported opcode is SDIV that can potentially cause UB if the
932/// "shuffled out" lane would result in division by zero.
933static bool isValidForAlternation(unsigned Opcode) {
934 return !Instruction::isIntDivRem(Opcode);
935}
936
937namespace {
938
939/// Helper class that determines VL can use the same opcode.
940/// Alternate instruction is supported. In addition, it supports interchangeable
941/// instruction. An interchangeable instruction is an instruction that can be
942/// converted to another instruction with same semantics. For example, x << 1 is
943/// equal to x * 2. x * 1 is equal to x | 0.
944class BinOpSameOpcodeHelper {
945 using MaskType = std::uint_fast16_t;
946 /// Sort SupportedOp because it is used by binary_search.
947 constexpr static std::initializer_list<unsigned> SupportedOp = {
948 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
949 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
950 enum : MaskType {
951 ShlBIT = 0b1,
952 AShrBIT = 0b10,
953 MulBIT = 0b100,
954 AddBIT = 0b1000,
955 SubBIT = 0b10000,
956 AndBIT = 0b100000,
957 OrBIT = 0b1000000,
958 XorBIT = 0b10000000,
959 MainOpBIT = 0b100000000,
961 };
962 /// Return a non-nullptr if either operand of I is a ConstantInt.
963 /// The second return value represents the operand position. We check the
964 /// right-hand side first (1). If the right hand side is not a ConstantInt and
965 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
966 /// side (0).
967 static std::pair<ConstantInt *, unsigned>
968 isBinOpWithConstantInt(const Instruction *I) {
969 unsigned Opcode = I->getOpcode();
970 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
971 (void)SupportedOp;
972 auto *BinOp = cast<BinaryOperator>(I);
973 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
974 return {CI, 1};
975 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
976 Opcode == Instruction::AShr)
977 return {nullptr, 0};
978 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
979 return {CI, 0};
980 return {nullptr, 0};
981 }
982 struct InterchangeableInfo {
983 const Instruction *I = nullptr;
984 /// Each set bit represents an opcode that MainOp can be converted to.
985 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
986 MulBIT | AShrBIT | ShlBIT;
987 /// We cannot create an interchangeable instruction that does not exist in
988 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
989 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
990 /// 1]. SeenBefore is used to know what operations have been seen before.
991 MaskType SeenBefore = 0;
992 InterchangeableInfo(const Instruction *I) : I(I) {}
993 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
994 /// instruction. Directly setting the mask will destroy the mask state,
995 /// preventing us from determining which instruction it should convert to.
996 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
997 if (Mask & InterchangeableMask) {
998 SeenBefore |= OpcodeInMaskForm;
999 Mask &= InterchangeableMask;
1000 return true;
1001 }
1002 return false;
1003 }
1004 bool equal(unsigned Opcode) {
1005 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1006 }
1007 unsigned getOpcode() const {
1008 MaskType Candidate = Mask & SeenBefore;
1009 if (Candidate & MainOpBIT)
1010 return I->getOpcode();
1011 if (Candidate & ShlBIT)
1012 return Instruction::Shl;
1013 if (Candidate & AShrBIT)
1014 return Instruction::AShr;
1015 if (Candidate & MulBIT)
1016 return Instruction::Mul;
1017 if (Candidate & AddBIT)
1018 return Instruction::Add;
1019 if (Candidate & SubBIT)
1020 return Instruction::Sub;
1021 if (Candidate & AndBIT)
1022 return Instruction::And;
1023 if (Candidate & OrBIT)
1024 return Instruction::Or;
1025 if (Candidate & XorBIT)
1026 return Instruction::Xor;
1027 llvm_unreachable("Cannot find interchangeable instruction.");
1028 }
1029
1030 /// Return true if the instruction can be converted to \p Opcode.
1031 bool hasCandidateOpcode(unsigned Opcode) const {
1032 MaskType Candidate = Mask & SeenBefore;
1033 switch (Opcode) {
1034 case Instruction::Shl:
1035 return Candidate & ShlBIT;
1036 case Instruction::AShr:
1037 return Candidate & AShrBIT;
1038 case Instruction::Mul:
1039 return Candidate & MulBIT;
1040 case Instruction::Add:
1041 return Candidate & AddBIT;
1042 case Instruction::Sub:
1043 return Candidate & SubBIT;
1044 case Instruction::And:
1045 return Candidate & AndBIT;
1046 case Instruction::Or:
1047 return Candidate & OrBIT;
1048 case Instruction::Xor:
1049 return Candidate & XorBIT;
1050 case Instruction::LShr:
1051 case Instruction::FAdd:
1052 case Instruction::FSub:
1053 case Instruction::FMul:
1054 case Instruction::SDiv:
1055 case Instruction::UDiv:
1056 case Instruction::FDiv:
1057 case Instruction::SRem:
1058 case Instruction::URem:
1059 case Instruction::FRem:
1060 return false;
1061 default:
1062 break;
1063 }
1064 llvm_unreachable("Cannot find interchangeable instruction.");
1065 }
1066
1067 SmallVector<Value *> getOperand(const Instruction *To) const {
1068 unsigned ToOpcode = To->getOpcode();
1069 unsigned FromOpcode = I->getOpcode();
1070 if (FromOpcode == ToOpcode)
1071 return SmallVector<Value *>(I->operands());
1072 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1073 auto [CI, Pos] = isBinOpWithConstantInt(I);
1074 const APInt &FromCIValue = CI->getValue();
1075 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1076 APInt ToCIValue;
1077 switch (FromOpcode) {
1078 case Instruction::Shl:
1079 if (ToOpcode == Instruction::Mul) {
1080 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1081 FromCIValue.getZExtValue());
1082 } else {
1083 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1084 ToCIValue = ToOpcode == Instruction::And
1085 ? APInt::getAllOnes(FromCIValueBitWidth)
1086 : APInt::getZero(FromCIValueBitWidth);
1087 }
1088 break;
1089 case Instruction::Mul:
1090 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1091 if (ToOpcode == Instruction::Shl) {
1092 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1093 } else {
1094 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1095 ToCIValue = ToOpcode == Instruction::And
1096 ? APInt::getAllOnes(FromCIValueBitWidth)
1097 : APInt::getZero(FromCIValueBitWidth);
1098 }
1099 break;
1100 case Instruction::Add:
1101 case Instruction::Sub:
1102 if (FromCIValue.isZero()) {
1103 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1104 } else {
1105 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1106 "Cannot convert the instruction.");
1107 ToCIValue = FromCIValue;
1108 ToCIValue.negate();
1109 }
1110 break;
1111 case Instruction::And:
1112 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1113 ToCIValue = ToOpcode == Instruction::Mul
1114 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1115 : APInt::getZero(FromCIValueBitWidth);
1116 break;
1117 default:
1118 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1119 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1120 break;
1121 }
1122 Value *LHS = I->getOperand(1 - Pos);
1123 Constant *RHS =
1124 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1125 // constant + x cannot be -constant - x
1126 // instead, it should be x - -constant
1127 if (Pos == 1 ||
1128 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1129 FromOpcode == Instruction::Xor) &&
1130 ToOpcode == Instruction::Sub))
1131 return SmallVector<Value *>({LHS, RHS});
1132 return SmallVector<Value *>({RHS, LHS});
1133 }
1134 };
1135 InterchangeableInfo MainOp;
1136 InterchangeableInfo AltOp;
1137 bool isValidForAlternation(const Instruction *I) const {
1138 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1139 ::isValidForAlternation(I->getOpcode());
1140 }
1141 bool initializeAltOp(const Instruction *I) {
1142 if (AltOp.I)
1143 return true;
1144 if (!isValidForAlternation(I))
1145 return false;
1146 AltOp.I = I;
1147 return true;
1148 }
1149
1150public:
1151 BinOpSameOpcodeHelper(const Instruction *MainOp,
1152 const Instruction *AltOp = nullptr)
1153 : MainOp(MainOp), AltOp(AltOp) {
1154 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1155 }
1156 bool add(const Instruction *I) {
1157 assert(isa<BinaryOperator>(I) &&
1158 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1159 unsigned Opcode = I->getOpcode();
1160 MaskType OpcodeInMaskForm;
1161 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1162 switch (Opcode) {
1163 case Instruction::Shl:
1164 OpcodeInMaskForm = ShlBIT;
1165 break;
1166 case Instruction::AShr:
1167 OpcodeInMaskForm = AShrBIT;
1168 break;
1169 case Instruction::Mul:
1170 OpcodeInMaskForm = MulBIT;
1171 break;
1172 case Instruction::Add:
1173 OpcodeInMaskForm = AddBIT;
1174 break;
1175 case Instruction::Sub:
1176 OpcodeInMaskForm = SubBIT;
1177 break;
1178 case Instruction::And:
1179 OpcodeInMaskForm = AndBIT;
1180 break;
1181 case Instruction::Or:
1182 OpcodeInMaskForm = OrBIT;
1183 break;
1184 case Instruction::Xor:
1185 OpcodeInMaskForm = XorBIT;
1186 break;
1187 default:
1188 return MainOp.equal(Opcode) ||
1189 (initializeAltOp(I) && AltOp.equal(Opcode));
1190 }
1191 MaskType InterchangeableMask = OpcodeInMaskForm;
1192 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1193 if (CI) {
1194 constexpr MaskType CanBeAll =
1195 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1196 const APInt &CIValue = CI->getValue();
1197 switch (Opcode) {
1198 case Instruction::Shl:
1199 if (CIValue.ult(CIValue.getBitWidth()))
1200 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1201 break;
1202 case Instruction::Mul:
1203 if (CIValue.isOne()) {
1204 InterchangeableMask = CanBeAll;
1205 break;
1206 }
1207 if (CIValue.isPowerOf2())
1208 InterchangeableMask = MulBIT | ShlBIT;
1209 break;
1210 case Instruction::Add:
1211 case Instruction::Sub:
1212 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1213 break;
1214 case Instruction::And:
1215 if (CIValue.isAllOnes())
1216 InterchangeableMask = CanBeAll;
1217 break;
1218 case Instruction::Xor:
1219 if (CIValue.isZero())
1220 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1221 break;
1222 default:
1223 if (CIValue.isZero())
1224 InterchangeableMask = CanBeAll;
1225 break;
1226 }
1227 }
1228 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1229 (initializeAltOp(I) &&
1230 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1231 }
1232 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1233 /// Checks if the list of potential opcodes includes \p Opcode.
1234 bool hasCandidateOpcode(unsigned Opcode) const {
1235 return MainOp.hasCandidateOpcode(Opcode);
1236 }
1237 bool hasAltOp() const { return AltOp.I; }
1238 unsigned getAltOpcode() const {
1239 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1240 }
1241 SmallVector<Value *> getOperand(const Instruction *I) const {
1242 return MainOp.getOperand(I);
1243 }
1244};
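// For example, getOperand converting %s = shl i32 %x, 3 to a Mul instruction
// returns {%x, 8}, since x << 3 == x * 8; this is how the class rewrites the
// operands of interchangeable instructions to the unified opcode.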
1245
1246/// Main data required for vectorization of instructions.
1247class InstructionsState {
1248 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1249 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1250 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1251 /// isAltShuffle).
1252 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1253 /// from getMainAltOpsNoStateVL.
1254 /// For those InstructionsState that use alternate instructions, the resulting
1255 /// vectorized output ultimately comes from a shufflevector. For example,
1256 /// given a vector list (VL):
1257 /// VL[0] = add i32 a, e
1258 /// VL[1] = sub i32 b, f
1259 /// VL[2] = add i32 c, g
1260 /// VL[3] = sub i32 d, h
1261 /// The vectorized result would be:
1262 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1263 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1264 /// result = shufflevector <4 x i32> intermediated_0,
1265 /// <4 x i32> intermediated_1,
1266 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1267 /// Since shufflevector is used in the final result, when calculating the cost
1268 /// (getEntryCost), we must account for the usage of shufflevector in
1269 /// GetVectorCost.
1270 Instruction *MainOp = nullptr;
1271 Instruction *AltOp = nullptr;
1272 /// Whether the instruction state represents copyable instructions.
1273 bool HasCopyables = false;
1274
1275public:
1276 Instruction *getMainOp() const {
1277 assert(valid() && "InstructionsState is invalid.");
1278 return MainOp;
1279 }
1280
1281 Instruction *getAltOp() const {
1282 assert(valid() && "InstructionsState is invalid.");
1283 return AltOp;
1284 }
1285
1286 /// The main/alternate opcodes for the list of instructions.
1287 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1288
1289 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1290
1291 /// Some of the instructions in the list have alternate opcodes.
1292 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1293
1294 /// Checks if the instruction matches either the main or alternate opcode.
1295 /// \returns
1296 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1297 /// to it
1298 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1299 /// it
1300 /// - nullptr if \param I cannot be matched or converted to either opcode
1301 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1302 assert(MainOp && "MainOp cannot be nullptr.");
1303 if (I->getOpcode() == MainOp->getOpcode())
1304 return MainOp;
1305 // Prefer AltOp instead of interchangeable instruction of MainOp.
1306 assert(AltOp && "AltOp cannot be nullptr.");
1307 if (I->getOpcode() == AltOp->getOpcode())
1308 return AltOp;
1309 if (!I->isBinaryOp())
1310 return nullptr;
1311 BinOpSameOpcodeHelper Converter(MainOp);
1312 if (!Converter.add(I) || !Converter.add(MainOp))
1313 return nullptr;
1314 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1315 BinOpSameOpcodeHelper AltConverter(AltOp);
1316 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1317 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1318 return AltOp;
1319 }
1320 if (Converter.hasAltOp() && !isAltShuffle())
1321 return nullptr;
1322 return Converter.hasAltOp() ? AltOp : MainOp;
1323 }
1324
1325 /// Checks if main/alt instructions are shift operations.
1326 bool isShiftOp() const {
1327 return getMainOp()->isShift() && getAltOp()->isShift();
1328 }
1329
1330 /// Checks if main/alt instructions are bitwise logic operations.
1331 bool isBitwiseLogicOp() const {
1332 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1333 }
1334
1335 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1336 bool isMulDivLikeOp() const {
1337 constexpr std::array<unsigned, 8> MulDiv = {
1338 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1339 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1340 Instruction::URem, Instruction::FRem};
1341 return is_contained(MulDiv, getOpcode()) &&
1342 is_contained(MulDiv, getAltOpcode());
1343 }
1344
1345 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1346 bool isAddSubLikeOp() const {
1347 constexpr std::array<unsigned, 4> AddSub = {
1348 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1349 Instruction::FSub};
1350 return is_contained(AddSub, getOpcode()) &&
1351 is_contained(AddSub, getAltOpcode());
1352 }
1353
1354 /// Checks if main/alt instructions are cmp operations.
1355 bool isCmpOp() const {
1356 return (getOpcode() == Instruction::ICmp ||
1357 getOpcode() == Instruction::FCmp) &&
1358 getAltOpcode() == getOpcode();
1359 }
1360
1361 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1362 bool valid() const { return MainOp && AltOp; }
1363
1364 explicit operator bool() const { return valid(); }
1365
1366 InstructionsState() = delete;
1367 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1368 bool HasCopyables = false)
1369 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1370 static InstructionsState invalid() { return {nullptr, nullptr}; }
1371
1372 /// Checks if the value is a copyable element.
1373 bool isCopyableElement(Value *V) const {
1374 assert(valid() && "InstructionsState is invalid.");
1375 if (!HasCopyables)
1376 return false;
1377 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1378 return false;
1379 auto *I = dyn_cast<Instruction>(V);
1380 if (!I)
1381 return !isa<PoisonValue>(V);
1382 if (I->getParent() != MainOp->getParent() &&
1385 return true;
1386 if (I->getOpcode() == MainOp->getOpcode())
1387 return false;
1388 if (!I->isBinaryOp())
1389 return true;
1390 BinOpSameOpcodeHelper Converter(MainOp);
1391 return !Converter.add(I) || !Converter.add(MainOp) ||
1392 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1393 }
1394
1395 /// Checks if the value is non-schedulable.
1396 bool isNonSchedulable(Value *V) const {
1397 assert(valid() && "InstructionsState is invalid.");
1398 auto *I = dyn_cast<Instruction>(V);
1399 if (!HasCopyables)
1400 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1401 doesNotNeedToBeScheduled(V);
1402 // MainOp for copyables is always schedulable, to correctly identify
1403 // non-schedulable copyables.
1404 if (getMainOp() == V)
1405 return false;
1406 if (isCopyableElement(V)) {
1407 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1408 auto *I = dyn_cast<Instruction>(V);
1409 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1411 // If the copyable instruction comes after MainOp
1412 // (non-schedulable, but used in the block) - cannot vectorize
1413 // it, will possibly generate use before def.
1414 !MainOp->comesBefore(I));
1415 };
1416
1417 return IsNonSchedulableCopyableElement(V);
1418 }
1419 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1420 doesNotNeedToBeScheduled(V);
1421 }
1422
1423 /// Checks if the state represents copyable instructions.
1424 bool areInstructionsWithCopyableElements() const {
1425 assert(valid() && "InstructionsState is invalid.");
1426 return HasCopyables;
1427 }
1428};
1429
1430std::pair<Instruction *, SmallVector<Value *>>
1431convertTo(Instruction *I, const InstructionsState &S) {
1432 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1433 assert(SelectedOp && "Cannot convert the instruction.");
1434 if (I->isBinaryOp()) {
1435 BinOpSameOpcodeHelper Converter(I);
1436 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1437 }
1438 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1439}
1440
1441} // end anonymous namespace
1442
1443static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1444 const TargetLibraryInfo &TLI);
1445
1446/// Find an instruction with a specific opcode in VL.
1447/// \param VL Array of values to search through. Must contain only Instructions
1448/// and PoisonValues.
1449/// \param Opcode The instruction opcode to search for
1450/// \returns
1451/// - The first instruction found with matching opcode
1452/// - nullptr if no matching instruction is found
1454 unsigned Opcode) {
1455 for (Value *V : VL) {
1456 if (isa<PoisonValue>(V))
1457 continue;
1458 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1459 auto *Inst = cast<Instruction>(V);
1460 if (Inst->getOpcode() == Opcode)
1461 return Inst;
1462 }
1463 return nullptr;
1464}
1465
1466/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1467/// compatible instructions or constants, or just some other regular values.
1468static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1469 Value *Op1, const TargetLibraryInfo &TLI) {
1470 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1471 (isConstant(BaseOp1) && isConstant(Op1)) ||
1472 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1473 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1474 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1475 getSameOpcode({BaseOp0, Op0}, TLI) ||
1476 getSameOpcode({BaseOp1, Op1}, TLI);
1477}
1478
1479/// \returns true if a compare instruction \p CI has similar "look" and
1480/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1481/// swapped, false otherwise.
1482static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1483 const TargetLibraryInfo &TLI) {
1484 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1485 "Assessing comparisons of different types?");
1486 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1487 CmpInst::Predicate Pred = CI->getPredicate();
1488 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1489
1490 Value *BaseOp0 = BaseCI->getOperand(0);
1491 Value *BaseOp1 = BaseCI->getOperand(1);
1492 Value *Op0 = CI->getOperand(0);
1493 Value *Op1 = CI->getOperand(1);
1494
1495 return (BasePred == Pred &&
1496 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1497 (BasePred == SwappedPred &&
1498 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1499}
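// For example, icmp sgt %a, %b matches icmp slt %b, %a: the predicates are
// swapped versions of each other and the operands are swapped accordingly.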
1500
1501/// \returns an analysis of the instructions in \p VL described in
1502/// InstructionsState: the opcode with which we suppose the whole list
1503/// could be vectorized, even if its structure is diverse.
1504static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1505 const TargetLibraryInfo &TLI) {
1506 // Make sure these are all Instructions.
1507 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1508 return InstructionsState::invalid();
1509
1510 auto *It = find_if(VL, IsaPred<Instruction>);
1511 if (It == VL.end())
1512 return InstructionsState::invalid();
1513
1514 Instruction *MainOp = cast<Instruction>(*It);
1515 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1516 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1517 (VL.size() == 2 && InstCnt < 2))
1518 return InstructionsState::invalid();
1519
1520 bool IsCastOp = isa<CastInst>(MainOp);
1521 bool IsBinOp = isa<BinaryOperator>(MainOp);
1522 bool IsCmpOp = isa<CmpInst>(MainOp);
1523 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1524 : CmpInst::BAD_ICMP_PREDICATE;
1525 Instruction *AltOp = MainOp;
1526 unsigned Opcode = MainOp->getOpcode();
1527 unsigned AltOpcode = Opcode;
1528
1529 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1530 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1531 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1532 UniquePreds.insert(BasePred);
1533 UniqueNonSwappedPreds.insert(BasePred);
1534 for (Value *V : VL) {
1535 auto *I = dyn_cast<CmpInst>(V);
1536 if (!I)
1537 return false;
1538 CmpInst::Predicate CurrentPred = I->getPredicate();
1539 CmpInst::Predicate SwappedCurrentPred =
1540 CmpInst::getSwappedPredicate(CurrentPred);
1541 UniqueNonSwappedPreds.insert(CurrentPred);
1542 if (!UniquePreds.contains(CurrentPred) &&
1543 !UniquePreds.contains(SwappedCurrentPred))
1544 UniquePreds.insert(CurrentPred);
1545 }
1546 // The total number of predicates is > 2, but if swapped predicates are
1547 // considered compatible only 2 remain; treat such swappable predicates as
1548 // compatible opcodes, not alternate.
1549 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1550 }();
1551 // Check for one alternate opcode from another BinaryOperator.
1552 // TODO - generalize to support all operators (types, calls etc.).
1553 Intrinsic::ID BaseID = 0;
1554 SmallVector<VFInfo> BaseMappings;
1555 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1556 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1557 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1558 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1559 return InstructionsState::invalid();
1560 }
1561 bool AnyPoison = InstCnt != VL.size();
1562 // Check MainOp too to be sure that it matches the requirements for the
1563 // instructions.
1564 for (Value *V : iterator_range(It, VL.end())) {
1565 auto *I = dyn_cast<Instruction>(V);
1566 if (!I)
1567 continue;
1568
1569 // Cannot combine poison and divisions.
1570 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1571 // intrinsics/functions only.
1572 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1573 return InstructionsState::invalid();
1574 unsigned InstOpcode = I->getOpcode();
1575 if (IsBinOp && isa<BinaryOperator>(I)) {
1576 if (BinOpHelper.add(I))
1577 continue;
1578 } else if (IsCastOp && isa<CastInst>(I)) {
1579 Value *Op0 = MainOp->getOperand(0);
1580 Type *Ty0 = Op0->getType();
1581 Value *Op1 = I->getOperand(0);
1582 Type *Ty1 = Op1->getType();
1583 if (Ty0 == Ty1) {
1584 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1585 continue;
1586 if (Opcode == AltOpcode) {
1587 assert(isValidForAlternation(Opcode) &&
1588 isValidForAlternation(InstOpcode) &&
1589 "Cast isn't safe for alternation, logic needs to be updated!");
1590 AltOpcode = InstOpcode;
1591 AltOp = I;
1592 continue;
1593 }
1594 }
1595 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1596 auto *BaseInst = cast<CmpInst>(MainOp);
1597 Type *Ty0 = BaseInst->getOperand(0)->getType();
1598 Type *Ty1 = Inst->getOperand(0)->getType();
1599 if (Ty0 == Ty1) {
1600 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1601 assert(InstOpcode == AltOpcode &&
1602 "Alternate instructions are only supported by BinaryOperator "
1603 "and CastInst.");
1604 // Check for compatible operands. If the corresponding operands are not
1605 // compatible - need to perform alternate vectorization.
1606 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1607 CmpInst::Predicate SwappedCurrentPred =
1608 CmpInst::getSwappedPredicate(CurrentPred);
1609
1610 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1611 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1612 continue;
1613
1614 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1615 continue;
1616 auto *AltInst = cast<CmpInst>(AltOp);
1617 if (MainOp != AltOp) {
1618 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1619 continue;
1620 } else if (BasePred != CurrentPred) {
1621 assert(
1622 isValidForAlternation(InstOpcode) &&
1623 "CmpInst isn't safe for alternation, logic needs to be updated!");
1624 AltOp = I;
1625 continue;
1626 }
1627 CmpInst::Predicate AltPred = AltInst->getPredicate();
1628 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1629 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1630 continue;
1631 }
1632 } else if (InstOpcode == Opcode) {
1633 assert(InstOpcode == AltOpcode &&
1634 "Alternate instructions are only supported by BinaryOperator and "
1635 "CastInst.");
1636 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1637 if (Gep->getNumOperands() != 2 ||
1638 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1639 return InstructionsState::invalid();
1640 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1641       if (!isVectorLikeInstWithConstOps(EI))
1642         return InstructionsState::invalid();
1643 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1644 auto *BaseLI = cast<LoadInst>(MainOp);
1645 if (!LI->isSimple() || !BaseLI->isSimple())
1646 return InstructionsState::invalid();
1647 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1648 auto *CallBase = cast<CallInst>(MainOp);
1649 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1650 return InstructionsState::invalid();
1651       if (Call->hasOperandBundles() &&
1652           (!CallBase->hasOperandBundles() ||
1653            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1654                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
1655                        CallBase->op_begin() +
1656                            CallBase->getBundleOperandsStartIndex())))
1657         return InstructionsState::invalid();
1658       Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1659 if (ID != BaseID)
1660 return InstructionsState::invalid();
1661 if (!ID) {
1662 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1663 if (Mappings.size() != BaseMappings.size() ||
1664 Mappings.front().ISA != BaseMappings.front().ISA ||
1665 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1666 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1667 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1668 Mappings.front().Shape.Parameters !=
1669 BaseMappings.front().Shape.Parameters)
1670 return InstructionsState::invalid();
1671 }
1672 }
1673 continue;
1674 }
1675 return InstructionsState::invalid();
1676 }
1677
1678 if (IsBinOp) {
1679 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1680 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1681 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1682     assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1683 }
1684 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1685 "Incorrect implementation of allSameOpcode.");
1686 InstructionsState S(MainOp, AltOp);
1687 assert(all_of(VL,
1688 [&](Value *V) {
1689 return isa<PoisonValue>(V) ||
1690 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1691 }) &&
1692 "Invalid InstructionsState.");
1693 return S;
1694}
1695
1696/// \returns true if all of the values in \p VL have the same type or false
1697/// otherwise.
1698 static bool allSameType(ArrayRef<Value *> VL) {
1699   Type *Ty = VL.consume_front()->getType();
1700 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1701}
1702
1703 /// \returns True if an in-tree use also needs an extract. This refers to a
1704 /// possible scalar operand in a vectorized instruction.
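/// For example, if a store is vectorized but its pointer operand is itself one
/// of the vectorized scalars, that scalar still has to be extracted from the
/// vector so it can be used as the (scalar) address.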
1705static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1706 TargetLibraryInfo *TLI,
1707 const TargetTransformInfo *TTI) {
1708 if (!UserInst)
1709 return false;
1710 unsigned Opcode = UserInst->getOpcode();
1711 switch (Opcode) {
1712 case Instruction::Load: {
1713 LoadInst *LI = cast<LoadInst>(UserInst);
1714 return (LI->getPointerOperand() == Scalar);
1715 }
1716 case Instruction::Store: {
1717 StoreInst *SI = cast<StoreInst>(UserInst);
1718 return (SI->getPointerOperand() == Scalar);
1719 }
1720 case Instruction::Call: {
1721 CallInst *CI = cast<CallInst>(UserInst);
1722     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1723     return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1724 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1725 Arg.value().get() == Scalar;
1726 });
1727 }
1728 default:
1729 return false;
1730 }
1731}
1732
1733 /// \returns the AA location that is being accessed by the instruction.
1734 static MemoryLocation getLocation(Instruction *I) {
1735   if (StoreInst *SI = dyn_cast<StoreInst>(I))
1736     return MemoryLocation::get(SI);
1737 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1738 return MemoryLocation::get(LI);
1739 return MemoryLocation();
1740}
1741
1742/// \returns True if the instruction is not a volatile or atomic load/store.
1743static bool isSimple(Instruction *I) {
1744 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1745 return LI->isSimple();
1746   if (StoreInst *SI = dyn_cast<StoreInst>(I))
1747     return SI->isSimple();
1748   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1749     return !MI->isVolatile();
1750 return true;
1751}
1752
1753/// Shuffles \p Mask in accordance with the given \p SubMask.
1754/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1755/// one but two input vectors.
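/// For illustration: composing Mask = {1, 0, 3, 2} with SubMask = {2, 3, 0, 1}
/// yields {3, 2, 1, 0}, i.e. the resulting element at position I is
/// Mask[SubMask[I]] for every defined index.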
1756static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1757 bool ExtendingManyInputs = false) {
1758 if (SubMask.empty())
1759 return;
1760 assert(
1761 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1762 // Check if input scalars were extended to match the size of other node.
1763 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1764 "SubMask with many inputs support must be larger than the mask.");
1765 if (Mask.empty()) {
1766 Mask.append(SubMask.begin(), SubMask.end());
1767 return;
1768 }
1769 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1770 int TermValue = std::min(Mask.size(), SubMask.size());
1771 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1772 if (SubMask[I] == PoisonMaskElem ||
1773 (!ExtendingManyInputs &&
1774 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1775 continue;
1776 NewMask[I] = Mask[SubMask[I]];
1777 }
1778 Mask.swap(NewMask);
1779}
1780
1781 /// Order may have elements assigned a special value (the size) which is out of
1782 /// bounds. Such indices only appear in places which correspond to undef values
1783 /// (see canReuseExtract for details) and are used to avoid letting undef values
1784 /// affect the operand ordering.
1785 /// The first loop below simply finds all unused indices and then the next loop
1786 /// assigns these indices to the undef value positions.
1787/// As an example below Order has two undef positions and they have assigned
1788/// values 3 and 7 respectively:
1789/// before: 6 9 5 4 9 2 1 0
1790/// after: 6 3 5 4 7 2 1 0
1791 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1792   const size_t Sz = Order.size();
1793 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1794 SmallBitVector MaskedIndices(Sz);
1795 for (unsigned I = 0; I < Sz; ++I) {
1796 if (Order[I] < Sz)
1797 UnusedIndices.reset(Order[I]);
1798 else
1799 MaskedIndices.set(I);
1800 }
1801 if (MaskedIndices.none())
1802 return;
1803 assert(UnusedIndices.count() == MaskedIndices.count() &&
1804 "Non-synced masked/available indices.");
1805 int Idx = UnusedIndices.find_first();
1806 int MIdx = MaskedIndices.find_first();
1807 while (MIdx >= 0) {
1808 assert(Idx >= 0 && "Indices must be synced.");
1809 Order[MIdx] = Idx;
1810 Idx = UnusedIndices.find_next(Idx);
1811 MIdx = MaskedIndices.find_next(MIdx);
1812 }
1813}
1814
1815/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1816/// Opcode1.
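/// For example (scalar elements, illustrative): for VL = {add, sub, add, sub}
/// with Opcode1 being the Sub opcode, the resulting mask is {0, 1, 0, 1}.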
1817 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1818                                       unsigned Opcode0, unsigned Opcode1) {
1819 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1820 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1821 for (unsigned Lane : seq<unsigned>(VL.size())) {
1822 if (isa<PoisonValue>(VL[Lane]))
1823 continue;
1824 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1825 OpcodeMask.set(Lane * ScalarTyNumElements,
1826 Lane * ScalarTyNumElements + ScalarTyNumElements);
1827 }
1828 return OpcodeMask;
1829}
1830
1831/// Replicates the given \p Val \p VF times.
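/// For example (illustrative): Val = {C0, C1} with VF = 2 produces
/// {C0, C0, C1, C1}.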
1833 unsigned VF) {
1834 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1835 "Expected scalar constants.");
1836 SmallVector<Constant *> NewVal(Val.size() * VF);
1837 for (auto [I, V] : enumerate(Val))
1838 std::fill_n(NewVal.begin() + I * VF, VF, V);
1839 return NewVal;
1840}
1841
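/// Computes the inverse of the permutation \p Indices into \p Mask, so that
/// Mask[Indices[I]] == I. For example (illustrative), Indices = {2, 0, 1}
/// produces Mask = {1, 2, 0}.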
1842 static void inversePermutation(ArrayRef<unsigned> Indices,
1843                                SmallVectorImpl<int> &Mask) {
1844 Mask.clear();
1845 const unsigned E = Indices.size();
1846 Mask.resize(E, PoisonMaskElem);
1847 for (unsigned I = 0; I < E; ++I)
1848 Mask[Indices[I]] = I;
1849}
1850
1851/// Reorders the list of scalars in accordance with the given \p Mask.
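/// For example (illustrative): Scalars = {a, b, c, d} with Mask = {3, 0, 1, 2}
/// moves a to position 3 and b, c, d to positions 0, 1, 2, giving {b, c, d, a}.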
1852 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1853                            ArrayRef<int> Mask) {
1854 assert(!Mask.empty() && "Expected non-empty mask.");
1855 SmallVector<Value *> Prev(Scalars.size(),
1856 PoisonValue::get(Scalars.front()->getType()));
1857 Prev.swap(Scalars);
1858 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1859 if (Mask[I] != PoisonMaskElem)
1860 Scalars[Mask[I]] = Prev[I];
1861}
1862
1863/// Checks if the provided value does not require scheduling. It does not
1864/// require scheduling if this is not an instruction or it is an instruction
1865 /// that does not read/write memory and all of whose operands are either not
1866 /// instructions, or phi nodes, or instructions from different blocks.
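/// For example, an add whose operands are both function arguments (or
/// constants) has no in-block def-use dependencies, so it does not need to be
/// scheduled.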
1867 static bool areAllOperandsNonInsts(Value *V) {
1868   auto *I = dyn_cast<Instruction>(V);
1869 if (!I)
1870 return true;
1871 return !mayHaveNonDefUseDependency(*I) &&
1872 all_of(I->operands(), [I](Value *V) {
1873 auto *IO = dyn_cast<Instruction>(V);
1874 if (!IO)
1875 return true;
1876 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1877 });
1878}
1879
1880/// Checks if the provided value does not require scheduling. It does not
1881/// require scheduling if this is not an instruction or it is an instruction
1882/// that does not read/write memory and all users are phi nodes or instructions
1883 /// from different blocks.
1884static bool isUsedOutsideBlock(Value *V) {
1885 auto *I = dyn_cast<Instruction>(V);
1886 if (!I)
1887 return true;
1888 // Limits the number of uses to save compile time.
1889 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1890 all_of(I->users(), [I](User *U) {
1891 auto *IU = dyn_cast<Instruction>(U);
1892 if (!IU)
1893 return true;
1894 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1895 });
1896}
1897
1898/// Checks if the specified value does not require scheduling. It does not
1899/// require scheduling if all operands and all users do not need to be scheduled
1900/// in the current basic block.
1901 static bool doesNotNeedToBeScheduled(Value *V) {
1902   return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1903 }
1904
1905/// Checks if the specified array of instructions does not require scheduling.
1906 /// It is so if either all instructions have operands that do not require
1907/// scheduling or their users do not require scheduling since they are phis or
1908/// in other basic blocks.
1909 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1910   return !VL.empty() &&
1911          all_of(VL, [](Value *V) { return doesNotNeedToBeScheduled(V); });
1912 }
1913
1914 /// Returns true if the widened type of \p Ty elements with size \p Sz
1915 /// represents a full vector type, i.e. adding an extra element results in
1916 /// extra parts upon type legalization.
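/// For example (illustrative, assuming 128-bit vector registers): Sz = 12 i32
/// elements form three full 4-element parts and are accepted, while Sz = 6 i32
/// elements are rejected because each legalized part would hold 3 elements,
/// which is not a power of two.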
1917 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1918                                      unsigned Sz) {
1919 if (Sz <= 1)
1920 return false;
1921   if (!isValidElementType(Ty))
1922     return false;
1923 if (has_single_bit(Sz))
1924 return true;
1925 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1926 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1927 Sz % NumParts == 0;
1928}
1929
1930 /// Returns the number of parts the type \p VecTy will be split into at the
1931 /// codegen phase. If the type is going to be scalarized or does not use whole
1932/// registers, returns 1.
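/// For example (illustrative, assuming 128-bit vector registers), a <16 x i32>
/// vector is split into 4 parts, so 4 is returned; a type that gets scalarized
/// reports 1.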
1933static unsigned
1934 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1935                  const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1936 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1937 if (NumParts == 0 || NumParts >= Limit)
1938 return 1;
1939 unsigned Sz = getNumElements(VecTy);
1940 if (NumParts >= Sz || Sz % NumParts != 0 ||
1941 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1942 return 1;
1943 return NumParts;
1944}
1945
1946/// Bottom Up SLP Vectorizer.
1947 class BoUpSLP {
1948   class TreeEntry;
1949 class ScheduleEntity;
1950 class ScheduleData;
1951 class ScheduleCopyableData;
1952 class ScheduleBundle;
1955
1956 /// If we decide to generate strided load / store, this struct contains all
1957   /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1958   /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1959   /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1960   /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1961   /// the size of an element of the FixedVectorType.
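  /// For example (illustrative): with Ty = <4 x i32> and a stride value of 8,
  /// the byte stride of the strided access is 8 * 4 = 32 bytes.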
1962 struct StridedPtrInfo {
1963 Value *StrideVal = nullptr;
1964 const SCEV *StrideSCEV = nullptr;
1965 FixedVectorType *Ty = nullptr;
1966 };
1967 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1968
1969public:
1970 /// Tracks the state we can represent the loads in the given sequence.
1978
1985
1986   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1987           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1988           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1989           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1990 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1991 AC(AC), DB(DB), DL(DL), ORE(ORE),
1992 Builder(Se->getContext(), TargetFolder(*DL)) {
1993 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1994 // Use the vector register size specified by the target unless overridden
1995 // by a command-line option.
1996 // TODO: It would be better to limit the vectorization factor based on
1997 // data type rather than just register size. For example, x86 AVX has
1998 // 256-bit registers, but it does not support integer operations
1999 // at that width (that requires AVX2).
2000 if (MaxVectorRegSizeOption.getNumOccurrences())
2001 MaxVecRegSize = MaxVectorRegSizeOption;
2002 else
2003 MaxVecRegSize =
2004 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
2005 .getFixedValue();
2006
2007 if (MinVectorRegSizeOption.getNumOccurrences())
2008 MinVecRegSize = MinVectorRegSizeOption;
2009 else
2010 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2011 }
2012
2013 /// Vectorize the tree that starts with the elements in \p VL.
2014 /// Returns the vectorized root.
2015   Value *vectorizeTree();
2016
2017 /// Vectorize the tree but with the list of externally used values \p
2018   /// ExternallyUsedValues. Values in this MapVector can be replaced by the
2019   /// generated extractvalue instructions.
2021 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2022 Instruction *ReductionRoot = nullptr,
2023 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2024
2025 /// \returns the cost incurred by unwanted spills and fills, caused by
2026 /// holding live values over call sites.
2028
2029 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2030 /// final cost.
2033
2034 /// \returns the vectorization cost of the subtree that starts at \p VL.
2035 /// A negative number means that this is profitable.
2037 ArrayRef<Value *> VectorizedVals = {},
2038 InstructionCost ReductionCost = TTI::TCC_Free);
2039
2040 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2041 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2042 void buildTree(ArrayRef<Value *> Roots,
2043 const SmallDenseSet<Value *> &UserIgnoreLst);
2044
2045 /// Construct a vectorizable tree that starts at \p Roots.
2046 void buildTree(ArrayRef<Value *> Roots);
2047
2048 /// Return the scalars of the root node.
2050 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2051 return VectorizableTree.front()->Scalars;
2052 }
2053
2054 /// Returns the type/is-signed info for the root node in the graph without
2055 /// casting.
2056 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2057 const TreeEntry &Root = *VectorizableTree.front();
2058 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2059 !Root.Scalars.front()->getType()->isIntegerTy())
2060 return std::nullopt;
2061 auto It = MinBWs.find(&Root);
2062 if (It != MinBWs.end())
2063 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2064 It->second.first),
2065 It->second.second);
2066 if (Root.getOpcode() == Instruction::ZExt ||
2067 Root.getOpcode() == Instruction::SExt)
2068 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2069 Root.getOpcode() == Instruction::SExt);
2070 return std::nullopt;
2071 }
2072
2073 /// Checks if the root graph node can be emitted with narrower bitwidth at
2074   /// codegen and returns its signedness, if so.
2076 return MinBWs.at(VectorizableTree.front().get()).second;
2077 }
2078
2079   /// Returns the reduction type after minbitwidth analysis.
2081 if (ReductionBitWidth == 0 ||
2082 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2083 ReductionBitWidth >=
2084 DL->getTypeSizeInBits(
2085 VectorizableTree.front()->Scalars.front()->getType()))
2086 return getWidenedType(
2087 VectorizableTree.front()->Scalars.front()->getType(),
2088 VectorizableTree.front()->getVectorFactor());
2089 return getWidenedType(
2090         IntegerType::get(
2091             VectorizableTree.front()->Scalars.front()->getContext(),
2092 ReductionBitWidth),
2093 VectorizableTree.front()->getVectorFactor());
2094 }
2095
2096 /// Builds external uses of the vectorized scalars, i.e. the list of
2097 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2098 /// ExternallyUsedValues contains additional list of external uses to handle
2099 /// vectorization of reductions.
2100 void
2101 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2102
2103 /// Transforms graph nodes to target specific representations, if profitable.
2104 void transformNodes();
2105
2106 /// Clear the internal data structures that are created by 'buildTree'.
2107 void deleteTree() {
2108 VectorizableTree.clear();
2109 ScalarToTreeEntries.clear();
2110 DeletedNodes.clear();
2111 TransformedToGatherNodes.clear();
2112 OperandsToTreeEntry.clear();
2113 ScalarsInSplitNodes.clear();
2114 MustGather.clear();
2115 NonScheduledFirst.clear();
2116 EntryToLastInstruction.clear();
2117 LastInstructionToPos.clear();
2118 LoadEntriesToVectorize.clear();
2119 IsGraphTransformMode = false;
2120 GatheredLoadsEntriesFirst.reset();
2121 CompressEntryToData.clear();
2122 ExternalUses.clear();
2123 ExternalUsesAsOriginalScalar.clear();
2124 ExternalUsesWithNonUsers.clear();
2125 for (auto &Iter : BlocksSchedules) {
2126 BlockScheduling *BS = Iter.second.get();
2127 BS->clear();
2128 }
2129 MinBWs.clear();
2130 ReductionBitWidth = 0;
2131 BaseGraphSize = 1;
2132 CastMaxMinBWSizes.reset();
2133 ExtraBitWidthNodes.clear();
2134 InstrElementSize.clear();
2135 UserIgnoreList = nullptr;
2136 PostponedGathers.clear();
2137 ValueToGatherNodes.clear();
2138 TreeEntryToStridedPtrInfoMap.clear();
2139 }
2140
2141 unsigned getTreeSize() const { return VectorizableTree.size(); }
2142
2143 /// Returns the base graph size, before any transformations.
2144 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2145
2146 /// Perform LICM and CSE on the newly generated gather sequences.
2147   void optimizeGatherSequence();
2148
2149 /// Does this non-empty order represent an identity order? Identity
2150 /// should be represented as an empty order, so this is used to
2151 /// decide if we can canonicalize a computed order. Undef elements
2152 /// (represented as size) are ignored.
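  /// For example, {0, 1, 2, 3} is an identity order, and so is {0, Sz, 2, 3},
  /// because undef positions (encoded as Sz) are ignored.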
2153   static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2154     assert(!Order.empty() && "expected non-empty order");
2155 const unsigned Sz = Order.size();
2156 return all_of(enumerate(Order), [&](const auto &P) {
2157 return P.value() == P.index() || P.value() == Sz;
2158 });
2159 }
2160
2161 /// Checks if the specified gather tree entry \p TE can be represented as a
2162 /// shuffled vector entry + (possibly) permutation with other gathers. It
2163 /// implements the checks only for possibly ordered scalars (Loads,
2164 /// ExtractElement, ExtractValue), which can be part of the graph.
2165 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2166 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2167 /// node might be ignored.
2168 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2169 bool TopToBottom,
2170 bool IgnoreReorder);
2171
2172   /// Sort loads into increasing pointer offsets to allow greater clustering.
2173 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2174
2175 /// Gets reordering data for the given tree entry. If the entry is vectorized
2176 /// - just return ReorderIndices, otherwise check if the scalars can be
2177 /// reordered and return the most optimal order.
2178 /// \return std::nullopt if ordering is not important, empty order, if
2179 /// identity order is important, or the actual order.
2180 /// \param TopToBottom If true, include the order of vectorized stores and
2181 /// insertelement nodes, otherwise skip them.
2182 /// \param IgnoreReorder true, if the root node order can be ignored.
2183 std::optional<OrdersType>
2184 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2185
2186 /// Checks if it is profitable to reorder the current tree.
2187   /// If the tree does not contain many profitable reorderable nodes, better to
2188 /// skip it to save compile time.
2189 bool isProfitableToReorder() const;
2190
2191 /// Reorders the current graph to the most profitable order starting from the
2192 /// root node to the leaf nodes. The best order is chosen only from the nodes
2193 /// of the same size (vectorization factor). Smaller nodes are considered
2194 /// parts of subgraph with smaller VF and they are reordered independently. We
2195 /// can make it because we still need to extend smaller nodes to the wider VF
2196 /// and we can merge reordering shuffles with the widening shuffles.
2197 void reorderTopToBottom();
2198
2199 /// Reorders the current graph to the most profitable order starting from
2200 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2201 /// number of reshuffles if the leaf nodes use the same order. In this case we
2202 /// can merge the orders and just shuffle user node instead of shuffling its
2203   /// can merge the orders and just shuffle the user node instead of shuffling
2204   /// its operands. Plus, even if the leaf nodes have different orders, it allows
2205   /// sinking reordering in the graph closer to the root node and merging it later
2206 void reorderBottomToTop(bool IgnoreReorder = false);
2207
2208 /// \return The vector element size in bits to use when vectorizing the
2209 /// expression tree ending at \p V. If V is a store, the size is the width of
2210 /// the stored value. Otherwise, the size is the width of the largest loaded
2211 /// value reaching V. This method is used by the vectorizer to calculate
2212 /// vectorization factors.
2213 unsigned getVectorElementSize(Value *V);
2214
2215 /// Compute the minimum type sizes required to represent the entries in a
2216 /// vectorizable tree.
2217   void computeMinimumValueSizes();
2218
2219 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2220 unsigned getMaxVecRegSize() const {
2221 return MaxVecRegSize;
2222 }
2223
2224 // \returns minimum vector register size as set by cl::opt.
2225 unsigned getMinVecRegSize() const {
2226 return MinVecRegSize;
2227 }
2228
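  /// Returns the minimum vectorization factor for scalars of bit width \p Sz,
  /// e.g. with a 128-bit minimum vector register size and 32-bit scalars this is
  /// 128 / 32 = 4 (and never less than 2).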
2229 unsigned getMinVF(unsigned Sz) const {
2230 return std::max(2U, getMinVecRegSize() / Sz);
2231 }
2232
2233 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2234 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2235 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2236 return MaxVF ? MaxVF : UINT_MAX;
2237 }
2238
2239 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2240 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2241 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2242 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2243 ///
2244 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2245 unsigned canMapToVector(Type *T) const;
2246
2247 /// \returns True if the VectorizableTree is both tiny and not fully
2248 /// vectorizable. We do not vectorize such trees.
2249 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2250
2251 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2252 /// It may happen, if all gather nodes are loads and they cannot be
2253 /// "clusterized". In this case even subgraphs cannot be vectorized more
2254 /// effectively than the base graph.
2255 bool isTreeNotExtendable() const;
2256
2257 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2258 /// can be load combined in the backend. Load combining may not be allowed in
2259 /// the IR optimizer, so we do not want to alter the pattern. For example,
2260 /// partially transforming a scalar bswap() pattern into vector code is
2261 /// effectively impossible for the backend to undo.
2262 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2263 /// may not be necessary.
2264 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2265
2266 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2267 /// can be load combined in the backend. Load combining may not be allowed in
2268 /// the IR optimizer, so we do not want to alter the pattern. For example,
2269 /// partially transforming a scalar bswap() pattern into vector code is
2270 /// effectively impossible for the backend to undo.
2271 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2272 /// may not be necessary.
2273 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2274 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2275 Align Alignment, const int64_t Diff,
2276 const size_t Sz) const;
2277
2278 /// Return true if an array of scalar loads can be replaced with a strided
2279 /// load (with constant stride).
2280 ///
2281 /// It is possible that the load gets "widened". Suppose that originally each
2282 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2283   /// constant):
  ///   %b + 0 * %s + 0
  ///   %b + 0 * %s + 1
  ///   %b + 0 * %s + 2
2284 /// ...
2285 /// %b + 0 * %s + (w - 1)
2286 ///
2287 /// %b + 1 * %s + 0
2288 /// %b + 1 * %s + 1
2289 /// %b + 1 * %s + 2
2290 /// ...
2291 /// %b + 1 * %s + (w - 1)
2292 /// ...
2293 ///
2294 /// %b + (n - 1) * %s + 0
2295 /// %b + (n - 1) * %s + 1
2296 /// %b + (n - 1) * %s + 2
2297 /// ...
2298 /// %b + (n - 1) * %s + (w - 1)
2299 ///
2300 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2301 ///
2302 /// \param PointerOps list of pointer arguments of loads.
2303 /// \param ElemTy original scalar type of loads.
2304 /// \param Alignment alignment of the first load.
2305 /// \param SortedIndices is the order of PointerOps as returned by
2306 /// `sortPtrAccesses`
2307   /// \param Diff Pointer difference between the lowest and the highest pointer
2308 /// in `PointerOps` as returned by `getPointersDiff`.
2309   /// \param Ptr0 first pointer in `PointerOps`.
2310   /// \param PtrN last pointer in `PointerOps`.
2311   /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2312 /// of `SPtrInfo` necessary to generate the strided load later.
2313   bool analyzeConstantStrideCandidate(
2314       const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2315 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2316 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2317
2318 /// Return true if an array of scalar loads can be replaced with a strided
2319 /// load (with run-time stride).
2320 /// \param PointerOps list of pointer arguments of loads.
2321 /// \param ScalarTy type of loads.
2322   /// \param CommonAlignment common alignment of loads as computed by
2323   /// `computeCommonAlignment<LoadInst>`.
2324   /// \param SortedIndices is a list of indices computed by this function such
2325   /// that the sequence `PointerOps[SortedIndices[0]],
2326   /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2327   /// ordered by the coefficient of the stride. For example, if PointerOps is
2328   /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2329   /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2330   /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2331   /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2332 /// of `SPtrInfo` necessary to generate the strided load later.
2333 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2334 Align CommonAlignment,
2335 SmallVectorImpl<unsigned> &SortedIndices,
2336 StridedPtrInfo &SPtrInfo) const;
2337
2338 /// Checks if the given array of loads can be represented as a vectorized,
2339 /// scatter or just simple gather.
2340 /// \param VL list of loads.
2341 /// \param VL0 main load value.
2342 /// \param Order returned order of load instructions.
2343 /// \param PointerOps returned list of pointer operands.
2344 /// \param BestVF return best vector factor, if recursive check found better
2345 /// vectorization sequences rather than masked gather.
2346 /// \param TryRecursiveCheck used to check if long masked gather can be
2347   /// represented as a series of loads/insert subvector, if profitable.
2350 SmallVectorImpl<Value *> &PointerOps,
2351 StridedPtrInfo &SPtrInfo,
2352 unsigned *BestVF = nullptr,
2353 bool TryRecursiveCheck = true) const;
2354
2355 /// Registers non-vectorizable sequence of loads
2356 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2357 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2358 }
2359
2360   /// Checks if the given sequence of loads is known to be not vectorizable.
2361 template <typename T>
2363 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2364 }
2365
2367
2368 /// This structure holds any data we need about the edges being traversed
2369 /// during buildTreeRec(). We keep track of:
2370 /// (i) the user TreeEntry index, and
2371 /// (ii) the index of the edge.
2372 struct EdgeInfo {
2373 EdgeInfo() = default;
2374 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2375         : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2376     /// The user TreeEntry.
2377 TreeEntry *UserTE = nullptr;
2378 /// The operand index of the use.
2379 unsigned EdgeIdx = UINT_MAX;
2380#ifndef NDEBUG
2381     friend inline raw_ostream &operator<<(raw_ostream &OS,
2382                                           const BoUpSLP::EdgeInfo &EI) {
2383 EI.dump(OS);
2384 return OS;
2385 }
2386 /// Debug print.
2387 void dump(raw_ostream &OS) const {
2388 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2389 << " EdgeIdx:" << EdgeIdx << "}";
2390 }
2391 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2392#endif
2393 bool operator == (const EdgeInfo &Other) const {
2394 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2395 }
2396
2397 operator bool() const { return UserTE != nullptr; }
2398 };
2399 friend struct DenseMapInfo<EdgeInfo>;
2400
2401 /// A helper class used for scoring candidates for two consecutive lanes.
2402   class LookAheadHeuristics {
2403     const TargetLibraryInfo &TLI;
2404 const DataLayout &DL;
2405 ScalarEvolution &SE;
2406 const BoUpSLP &R;
2407 int NumLanes; // Total number of lanes (aka vectorization factor).
2408 int MaxLevel; // The maximum recursion depth for accumulating score.
2409
2410 public:
2411     LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2412                         ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2413 int MaxLevel)
2414 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2415 MaxLevel(MaxLevel) {}
2416
2417 // The hard-coded scores listed here are not very important, though it shall
2418 // be higher for better matches to improve the resulting cost. When
2419 // computing the scores of matching one sub-tree with another, we are
2420 // basically counting the number of values that are matching. So even if all
2421 // scores are set to 1, we would still get a decent matching result.
2422 // However, sometimes we have to break ties. For example we may have to
2423 // choose between matching loads vs matching opcodes. This is what these
2424 // scores are helping us with: they provide the order of preference. Also,
2425 // this is important if the scalar is externally used or used in another
2426 // tree entry node in the different lane.
2427
2428 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2429 static const int ScoreConsecutiveLoads = 4;
2430 /// The same load multiple times. This should have a better score than
2431     /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2432     /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2433     /// for a vector load plus 1.0 for a broadcast.
2434 static const int ScoreSplatLoads = 3;
2435 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2436 static const int ScoreReversedLoads = 3;
2437 /// A load candidate for masked gather.
2438 static const int ScoreMaskedGatherCandidate = 1;
2439 /// ExtractElementInst from same vector and consecutive indexes.
2440 static const int ScoreConsecutiveExtracts = 4;
2441 /// ExtractElementInst from same vector and reversed indices.
2442 static const int ScoreReversedExtracts = 3;
2443 /// Constants.
2444 static const int ScoreConstants = 2;
2445 /// Instructions with the same opcode.
2446 static const int ScoreSameOpcode = 2;
2447 /// Instructions with alt opcodes (e.g, add + sub).
2448 static const int ScoreAltOpcodes = 1;
2449 /// Identical instructions (a.k.a. splat or broadcast).
2450 static const int ScoreSplat = 1;
2451 /// Matching with an undef is preferable to failing.
2452 static const int ScoreUndef = 1;
2453 /// Score for failing to find a decent match.
2454 static const int ScoreFail = 0;
2455 /// Score if all users are vectorized.
2456 static const int ScoreAllUserVectorized = 1;
2457
2458 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2459 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2460 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2461 /// MainAltOps.
2462     int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2463                         ArrayRef<Value *> MainAltOps) const {
2464 if (!isValidElementType(V1->getType()) ||
2465           !isValidElementType(V2->getType()))
2466         return LookAheadHeuristics::ScoreFail;
2467
2468 if (V1 == V2) {
2469 if (isa<LoadInst>(V1)) {
2470           // Returns true if the users of V1 and V2 won't need to be extracted.
2471 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2472 // Bail out if we have too many uses to save compilation time.
2473 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2474 return false;
2475
2476 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2477 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2478 return U == U1 || U == U2 || R.isVectorized(U);
2479 });
2480 };
2481 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2482 };
2483 // A broadcast of a load can be cheaper on some targets.
2484 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2485 ElementCount::getFixed(NumLanes)) &&
2486 ((int)V1->getNumUses() == NumLanes ||
2487 AllUsersAreInternal(V1, V2)))
2488             return LookAheadHeuristics::ScoreSplatLoads;
2489         }
2490         return LookAheadHeuristics::ScoreSplat;
2491       }
2492
2493 auto CheckSameEntryOrFail = [&]() {
2494 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2495           SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2496           if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2497 !TEs2.empty() &&
2498 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2499             return LookAheadHeuristics::ScoreSplatLoads;
2500         }
2501         return LookAheadHeuristics::ScoreFail;
2502       };
2503
2504 auto *LI1 = dyn_cast<LoadInst>(V1);
2505 auto *LI2 = dyn_cast<LoadInst>(V2);
2506 if (LI1 && LI2) {
2507 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2508 !LI2->isSimple())
2509 return CheckSameEntryOrFail();
2510
2511 std::optional<int64_t> Dist = getPointersDiff(
2512 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2513 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2514 if (!Dist || *Dist == 0) {
2515 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2516 getUnderlyingObject(LI2->getPointerOperand()) &&
2517 R.TTI->isLegalMaskedGather(
2518 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2519             return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2520           return CheckSameEntryOrFail();
2521 }
2522 // The distance is too large - still may be profitable to use masked
2523 // loads/gathers.
2524         if (std::abs(*Dist) > NumLanes / 2)
2525           return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2526 // This still will detect consecutive loads, but we might have "holes"
2527 // in some cases. It is ok for non-power-2 vectorization and may produce
2528 // better results. It should not affect current vectorization.
2529         return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2530                             : LookAheadHeuristics::ScoreReversedLoads;
2531       }
2532
2533 auto *C1 = dyn_cast<Constant>(V1);
2534 auto *C2 = dyn_cast<Constant>(V2);
2535 if (C1 && C2)
2536         return LookAheadHeuristics::ScoreConstants;
2537
2538 // Consider constants and buildvector compatible.
2539 if ((C1 && isa<InsertElementInst>(V2)) ||
2540 (C2 && isa<InsertElementInst>(V1)))
2541         return LookAheadHeuristics::ScoreConstants;
2542
2543 // Extracts from consecutive indexes of the same vector better score as
2544 // the extracts could be optimized away.
2545 Value *EV1;
2546 ConstantInt *Ex1Idx;
2547 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2548 // Undefs are always profitable for extractelements.
2549 // Compiler can easily combine poison and extractelement <non-poison> or
2550 // undef and extractelement <poison>. But combining undef +
2551 // extractelement <non-poison-but-may-produce-poison> requires some
2552 // extra operations.
2553 if (isa<UndefValue>(V2))
2554 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2555                      ? LookAheadHeuristics::ScoreConsecutiveExtracts
2556                      : LookAheadHeuristics::ScoreSameOpcode;
2557         Value *EV2 = nullptr;
2558 ConstantInt *Ex2Idx = nullptr;
2559 if (match(V2,
2561 m_Undef())))) {
2562 // Undefs are always profitable for extractelements.
2563 if (!Ex2Idx)
2564             return LookAheadHeuristics::ScoreConsecutiveExtracts;
2565           if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2566             return LookAheadHeuristics::ScoreConsecutiveExtracts;
2567 if (EV2 == EV1) {
2568 int Idx1 = Ex1Idx->getZExtValue();
2569 int Idx2 = Ex2Idx->getZExtValue();
2570 int Dist = Idx2 - Idx1;
2571 // The distance is too large - still may be profitable to use
2572 // shuffles.
2573 if (std::abs(Dist) == 0)
2574               return LookAheadHeuristics::ScoreSplat;
2575             if (std::abs(Dist) > NumLanes / 2)
2576               return LookAheadHeuristics::ScoreSameOpcode;
2577             return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2578                               : LookAheadHeuristics::ScoreReversedExtracts;
2579           }
2580           return LookAheadHeuristics::ScoreAltOpcodes;
2581         }
2582 return CheckSameEntryOrFail();
2583 }
2584
2585 auto *I1 = dyn_cast<Instruction>(V1);
2586 auto *I2 = dyn_cast<Instruction>(V2);
2587 if (I1 && I2) {
2588 if (I1->getParent() != I2->getParent())
2589 return CheckSameEntryOrFail();
2590 SmallVector<Value *, 4> Ops(MainAltOps);
2591 Ops.push_back(I1);
2592 Ops.push_back(I2);
2593 InstructionsState S = getSameOpcode(Ops, TLI);
2594 // Note: Only consider instructions with <= 2 operands to avoid
2595 // complexity explosion.
2596 if (S &&
2597 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2598 !S.isAltShuffle()) &&
2599 all_of(Ops, [&S](Value *V) {
2600 return isa<PoisonValue>(V) ||
2601 cast<Instruction>(V)->getNumOperands() ==
2602 S.getMainOp()->getNumOperands();
2603 }))
2604 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2605                                 : LookAheadHeuristics::ScoreSameOpcode;
2606       }
2607
2608 if (I1 && isa<PoisonValue>(V2))
2609         return LookAheadHeuristics::ScoreSameOpcode;
2610
2611 if (isa<UndefValue>(V2))
2612         return LookAheadHeuristics::ScoreUndef;
2613
2614 return CheckSameEntryOrFail();
2615 }
2616
2617 /// Go through the operands of \p LHS and \p RHS recursively until
2618     /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2619 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2620 /// of \p U1 and \p U2), except at the beginning of the recursion where
2621 /// these are set to nullptr.
2622 ///
2623 /// For example:
2624 /// \verbatim
2625 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2626 /// \ / \ / \ / \ /
2627 /// + + + +
2628 /// G1 G2 G3 G4
2629 /// \endverbatim
2630 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2631 /// each level recursively, accumulating the score. It starts from matching
2632 /// the additions at level 0, then moves on to the loads (level 1). The
2633 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2634 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2635 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2636 /// Please note that the order of the operands does not matter, as we
2637 /// evaluate the score of all profitable combinations of operands. In
2638 /// other words the score of G1 and G4 is the same as G1 and G2. This
2639 /// heuristic is based on ideas described in:
2640 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2641 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2642 /// Luís F. W. Góes
2643     int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2644                            Instruction *U2, int CurrLevel,
2645 ArrayRef<Value *> MainAltOps) const {
2646
2647 // Get the shallow score of V1 and V2.
2648 int ShallowScoreAtThisLevel =
2649 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2650
2651 // If reached MaxLevel,
2652 // or if V1 and V2 are not instructions,
2653 // or if they are SPLAT,
2654 // or if they are not consecutive,
2655 // or if profitable to vectorize loads or extractelements, early return
2656 // the current cost.
2657 auto *I1 = dyn_cast<Instruction>(LHS);
2658 auto *I2 = dyn_cast<Instruction>(RHS);
2659 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2660 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2661 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2662 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2663             (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2664            ShallowScoreAtThisLevel))
2665 return ShallowScoreAtThisLevel;
2666 assert(I1 && I2 && "Should have early exited.");
2667
2668 // Contains the I2 operand indexes that got matched with I1 operands.
2669 SmallSet<unsigned, 4> Op2Used;
2670
2671 // Recursion towards the operands of I1 and I2. We are trying all possible
2672 // operand pairs, and keeping track of the best score.
2673 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2674 OpIdx1 != NumOperands1; ++OpIdx1) {
2675         // Try to pair I1's operand at OpIdx1 with the best operand of I2.
2676 int MaxTmpScore = 0;
2677 unsigned MaxOpIdx2 = 0;
2678 bool FoundBest = false;
2679 // If I2 is commutative try all combinations.
2680 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2681 unsigned ToIdx = isCommutative(I2)
2682 ? I2->getNumOperands()
2683 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2684 assert(FromIdx <= ToIdx && "Bad index");
2685 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2686 // Skip operands already paired with OpIdx1.
2687 if (Op2Used.count(OpIdx2))
2688 continue;
2689 // Recursively calculate the cost at each level
2690 int TmpScore =
2691 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2692 I1, I2, CurrLevel + 1, {});
2693 // Look for the best score.
2694 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2695 TmpScore > MaxTmpScore) {
2696 MaxTmpScore = TmpScore;
2697 MaxOpIdx2 = OpIdx2;
2698 FoundBest = true;
2699 }
2700 }
2701 if (FoundBest) {
2702 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2703 Op2Used.insert(MaxOpIdx2);
2704 ShallowScoreAtThisLevel += MaxTmpScore;
2705 }
2706 }
2707 return ShallowScoreAtThisLevel;
2708 }
2709 };
2710 /// A helper data structure to hold the operands of a vector of instructions.
2711 /// This supports a fixed vector length for all operand vectors.
2712   class VLOperands {
2713     /// For each operand we need (i) the value, and (ii) the opcode that it
2714 /// would be attached to if the expression was in a left-linearized form.
2715 /// This is required to avoid illegal operand reordering.
2716 /// For example:
2717 /// \verbatim
2718 /// 0 Op1
2719 /// |/
2720 /// Op1 Op2 Linearized + Op2
2721 /// \ / ----------> |/
2722 /// - -
2723 ///
2724 /// Op1 - Op2 (0 + Op1) - Op2
2725 /// \endverbatim
2726 ///
2727 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2728 ///
2729 /// Another way to think of this is to track all the operations across the
2730 /// path from the operand all the way to the root of the tree and to
2731 /// calculate the operation that corresponds to this path. For example, the
2732 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2733 /// corresponding operation is a '-' (which matches the one in the
2734 /// linearized tree, as shown above).
2735 ///
2736 /// For lack of a better term, we refer to this operation as Accumulated
2737 /// Path Operation (APO).
2738 struct OperandData {
2739 OperandData() = default;
2740 OperandData(Value *V, bool APO, bool IsUsed)
2741 : V(V), APO(APO), IsUsed(IsUsed) {}
2742 /// The operand value.
2743 Value *V = nullptr;
2744 /// TreeEntries only allow a single opcode, or an alternate sequence of
2745     /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2746 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2747 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2748 /// (e.g., Add/Mul)
2749 bool APO = false;
2750 /// Helper data for the reordering function.
2751 bool IsUsed = false;
2752 };
2753
2754 /// During operand reordering, we are trying to select the operand at lane
2755 /// that matches best with the operand at the neighboring lane. Our
2756 /// selection is based on the type of value we are looking for. For example,
2757 /// if the neighboring lane has a load, we need to look for a load that is
2758 /// accessing a consecutive address. These strategies are summarized in the
2759 /// 'ReorderingMode' enumerator.
2760 enum class ReorderingMode {
2761 Load, ///< Matching loads to consecutive memory addresses
2762 Opcode, ///< Matching instructions based on opcode (same or alternate)
2763 Constant, ///< Matching constants
2764 Splat, ///< Matching the same instruction multiple times (broadcast)
2765 Failed, ///< We failed to create a vectorizable group
2766 };
2767
2768 using OperandDataVec = SmallVector<OperandData, 2>;
2769
2770 /// A vector of operand vectors.
2772 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2773 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2774 unsigned ArgSize = 0;
2775
2776 const TargetLibraryInfo &TLI;
2777 const DataLayout &DL;
2778 ScalarEvolution &SE;
2779 const BoUpSLP &R;
2780 const Loop *L = nullptr;
2781
2782 /// \returns the operand data at \p OpIdx and \p Lane.
2783 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2784 return OpsVec[OpIdx][Lane];
2785 }
2786
2787 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2788 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2789 return OpsVec[OpIdx][Lane];
2790 }
2791
2792 /// Clears the used flag for all entries.
2793 void clearUsed() {
2794 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2795 OpIdx != NumOperands; ++OpIdx)
2796 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2797 ++Lane)
2798 OpsVec[OpIdx][Lane].IsUsed = false;
2799 }
2800
2801 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2802 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2803 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2804 }
2805
2806 /// \param Lane lane of the operands under analysis.
2807     /// \param OpIdx operand index in lane \p Lane for which we're looking for
2808     /// the best candidate.
2809 /// \param Idx operand index of the current candidate value.
2810 /// \returns The additional score due to possible broadcasting of the
2811     /// elements in the lane. It is more profitable to have a power-of-2 number
2812     /// of unique elements in the lane, as it will be vectorized with higher
2813     /// probability after removing duplicates. Currently the SLP vectorizer
2814     /// supports only vectorization of a power-of-2 number of unique scalars.
2815 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2816 const SmallBitVector &UsedLanes) const {
2817 Value *IdxLaneV = getData(Idx, Lane).V;
2818 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2819 isa<ExtractElementInst>(IdxLaneV))
2820 return 0;
2822 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2823 if (Ln == Lane)
2824 continue;
2825 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2826 if (!isa<Instruction>(OpIdxLnV))
2827 return 0;
2828 Uniques.try_emplace(OpIdxLnV, Ln);
2829 }
2830 unsigned UniquesCount = Uniques.size();
2831 auto IdxIt = Uniques.find(IdxLaneV);
2832 unsigned UniquesCntWithIdxLaneV =
2833 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2834 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2835 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2836 unsigned UniquesCntWithOpIdxLaneV =
2837 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2838 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2839 return 0;
2840 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2841 UniquesCntWithOpIdxLaneV,
2842 UniquesCntWithOpIdxLaneV -
2843 bit_floor(UniquesCntWithOpIdxLaneV)) -
2844 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2845 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2846 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2847 }
2848
2849 /// \param Lane lane of the operands under analysis.
2850     /// \param OpIdx operand index in lane \p Lane for which we're looking for
2851     /// the best candidate.
2852 /// \param Idx operand index of the current candidate value.
2853 /// \returns The additional score for the scalar which users are all
2854 /// vectorized.
2855 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2856 Value *IdxLaneV = getData(Idx, Lane).V;
2857 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2858 // Do not care about number of uses for vector-like instructions
2859 // (extractelement/extractvalue with constant indices), they are extracts
2860 // themselves and already externally used. Vectorization of such
2861 // instructions does not add extra extractelement instruction, just may
2862 // remove it.
2863 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2864 isVectorLikeInstWithConstOps(OpIdxLaneV))
2865         return LookAheadHeuristics::ScoreAllUserVectorized;
2866       auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2867 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2868 return 0;
2869 return R.areAllUsersVectorized(IdxLaneI)
2870                  ? LookAheadHeuristics::ScoreAllUserVectorized
2871                  : 0;
2872 }
2873
2874 /// Score scaling factor for fully compatible instructions but with
2875 /// different number of external uses. Allows better selection of the
2876 /// instructions with less external uses.
2877 static const int ScoreScaleFactor = 10;
2878
2879 /// \Returns the look-ahead score, which tells us how much the sub-trees
2880 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2881 /// score. This helps break ties in an informed way when we cannot decide on
2882 /// the order of the operands by just considering the immediate
2883 /// predecessors.
2884 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2885 int Lane, unsigned OpIdx, unsigned Idx,
2886 bool &IsUsed, const SmallBitVector &UsedLanes) {
2887 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2889 // Keep track of the instruction stack as we recurse into the operands
2890 // during the look-ahead score exploration.
2891 int Score =
2892 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2893 /*CurrLevel=*/1, MainAltOps);
2894 if (Score) {
2895 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2896 if (Score <= -SplatScore) {
2897 // Failed score.
2898 Score = 0;
2899 } else {
2900 Score += SplatScore;
2901 // Scale score to see the difference between different operands
2902 // and similar operands but all vectorized/not all vectorized
2903 // uses. It does not affect actual selection of the best
2904 // compatible operand in general, just allows to select the
2905 // operand with all vectorized uses.
2906 Score *= ScoreScaleFactor;
2907 Score += getExternalUseScore(Lane, OpIdx, Idx);
2908 IsUsed = true;
2909 }
2910 }
2911 return Score;
2912 }
2913
2914 /// Best defined scores per lanes between the passes. Used to choose the
2915 /// best operand (with the highest score) between the passes.
2916 /// The key - {Operand Index, Lane}.
2917 /// The value - the best score between the passes for the lane and the
2918 /// operand.
2920 BestScoresPerLanes;
2921
2922 // Search all operands in Ops[*][Lane] for the one that matches best
2923     // Ops[OpIdx][LastLane] and return its operand index.
2924 // If no good match can be found, return std::nullopt.
2925 std::optional<unsigned>
2926 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2927 ArrayRef<ReorderingMode> ReorderingModes,
2928 ArrayRef<Value *> MainAltOps,
2929 const SmallBitVector &UsedLanes) {
2930 unsigned NumOperands = getNumOperands();
2931
2932 // The operand of the previous lane at OpIdx.
2933 Value *OpLastLane = getData(OpIdx, LastLane).V;
2934
2935 // Our strategy mode for OpIdx.
2936 ReorderingMode RMode = ReorderingModes[OpIdx];
2937 if (RMode == ReorderingMode::Failed)
2938 return std::nullopt;
2939
2940 // The linearized opcode of the operand at OpIdx, Lane.
2941 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2942
2943 // The best operand index and its score.
2944 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2945 // are using the score to differentiate between the two.
2946 struct BestOpData {
2947 std::optional<unsigned> Idx;
2948 unsigned Score = 0;
2949 } BestOp;
2950 BestOp.Score =
2951 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2952 .first->second;
2953
2954 // Track if the operand must be marked as used. If the operand is set to
2955 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2956 // want to reestimate the operands again on the following iterations).
2957 bool IsUsed = RMode == ReorderingMode::Splat ||
2958 RMode == ReorderingMode::Constant ||
2959 RMode == ReorderingMode::Load;
2960 // Iterate through all unused operands and look for the best.
2961 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2962 // Get the operand at Idx and Lane.
2963 OperandData &OpData = getData(Idx, Lane);
2964 Value *Op = OpData.V;
2965 bool OpAPO = OpData.APO;
2966
2967 // Skip already selected operands.
2968 if (OpData.IsUsed)
2969 continue;
2970
2971 // Skip if we are trying to move the operand to a position with a
2972 // different opcode in the linearized tree form. This would break the
2973 // semantics.
2974 if (OpAPO != OpIdxAPO)
2975 continue;
2976
2977 // Look for an operand that matches the current mode.
2978 switch (RMode) {
2979 case ReorderingMode::Load:
2980 case ReorderingMode::Opcode: {
2981 bool LeftToRight = Lane > LastLane;
2982 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2983 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2984 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2985 OpIdx, Idx, IsUsed, UsedLanes);
2986 if (Score > static_cast<int>(BestOp.Score) ||
2987 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2988 Idx == OpIdx)) {
2989 BestOp.Idx = Idx;
2990 BestOp.Score = Score;
2991 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2992 }
2993 break;
2994 }
2995 case ReorderingMode::Constant:
2996 if (isa<Constant>(Op) ||
2997 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2998 BestOp.Idx = Idx;
2999 if (isa<Constant>(Op)) {
3000             BestOp.Score = LookAheadHeuristics::ScoreConstants;
3001             BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3002                 LookAheadHeuristics::ScoreConstants;
3003           }
3005 IsUsed = false;
3006 }
3007 break;
3008 case ReorderingMode::Splat:
3009 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
3010 IsUsed = Op == OpLastLane;
3011 if (Op == OpLastLane) {
3012 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3013 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3014                 LookAheadHeuristics::ScoreSplat;
3015           }
3016 BestOp.Idx = Idx;
3017 }
3018 break;
3019 case ReorderingMode::Failed:
3020 llvm_unreachable("Not expected Failed reordering mode.");
3021 }
3022 }
3023
3024 if (BestOp.Idx) {
3025 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3026 return BestOp.Idx;
3027 }
3028 // If we could not find a good match return std::nullopt.
3029 return std::nullopt;
3030 }
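// A small sketch of the effect of getBestOperand(): given the two lanes
//   Lane 0: A[0] = B[0] + C[0]
//   Lane 1: A[1] = C[1] + B[1]
// and assuming B[*]/C[*] are, e.g., consecutive loads, the look-ahead score
// for pairing B[0] with B[1] is higher than for pairing B[0] with C[1], so
// the operands of Lane 1 get swapped and each operand column becomes a
// sequence of consecutive loads.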
3031
3032 /// Helper for reorderOperandVecs.
3033 /// \returns the lane that we should start reordering from. This is the one
3034 /// which has the least number of operands that can freely move about, or
3035 /// is less profitable because it already has the most optimal set of operands.
3036 unsigned getBestLaneToStartReordering() const {
3037 unsigned Min = UINT_MAX;
3038 unsigned SameOpNumber = 0;
3039 // std::pair<unsigned, unsigned> is used to implement a simple voting
3040 // algorithm and choose the lane with the least number of operands that
3041 // can freely move about, or is less profitable because it already has the
3042 // most optimal set of operands. The first unsigned is a counter for
3043 // voting, the second unsigned is the counter of lanes with instructions
3044 // with same/alternate opcodes and same parent basic block.
3045 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
3046 // Try to be closer to the original results, if we have multiple lanes
3047 // with same cost. If 2 lanes have the same cost, use the one with the
3048 // highest index.
3049 for (int I = getNumLanes(); I > 0; --I) {
3050 unsigned Lane = I - 1;
3051 OperandsOrderData NumFreeOpsHash =
3052 getMaxNumOperandsThatCanBeReordered(Lane);
3053 // Compare the number of operands that can move and choose the one with
3054 // the least number.
3055 if (NumFreeOpsHash.NumOfAPOs < Min) {
3056 Min = NumFreeOpsHash.NumOfAPOs;
3057 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3058 HashMap.clear();
3059 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3060 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3061 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3062 // Select the most optimal lane in terms of number of operands that
3063 // should be moved around.
3064 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3065 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3066 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3067 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3068 auto [It, Inserted] =
3069 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3070 if (!Inserted)
3071 ++It->second.first;
3072 }
3073 }
3074 // Select the lane with the minimum counter.
3075 unsigned BestLane = 0;
3076 unsigned CntMin = UINT_MAX;
3077 for (const auto &Data : reverse(HashMap)) {
3078 if (Data.second.first < CntMin) {
3079 CntMin = Data.second.first;
3080 BestLane = Data.second.second;
3081 }
3082 }
3083 return BestLane;
3084 }
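// As a rough illustration of the voting above: for the lanes
//   Lane 0: A[0] = B[0] + C[0]
//   Lane 1: A[1] = C[1] - B[1]
// the subtraction in Lane 1 has fewer operands with the same APO that could
// be exchanged (its NumOfAPOs is smaller), so Lane 1 is preferred as the
// starting lane and the other lanes are then reordered to match it.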
3085
3086 /// Data structure that helps to reorder operands.
3087 struct OperandsOrderData {
3088 /// The best number of operands with the same APOs, which can be
3089 /// reordered.
3090 unsigned NumOfAPOs = UINT_MAX;
3091 /// Number of operands with the same/alternate instruction opcode and
3092 /// parent.
3093 unsigned NumOpsWithSameOpcodeParent = 0;
3094 /// Hash for the actual operands ordering.
3095 /// Used to count operands, actually their position id and opcode
3096 /// value. It is used in the voting mechanism to find the lane with the
3097 /// least number of operands that can freely move about or less profitable
3098 /// because it already has the most optimal set of operands. Can be
3099 /// replaced with SmallVector<unsigned> instead but hash code is faster
3100 /// and requires less memory.
3101 unsigned Hash = 0;
3102 };
3103 /// \returns the maximum number of operands that are allowed to be reordered
3104 /// for \p Lane and the number of compatible instructions (with the same
3105 /// parent/opcode). This is used as a heuristic for selecting the first lane
3106 /// to start operand reordering.
3107 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3108 unsigned CntTrue = 0;
3109 unsigned NumOperands = getNumOperands();
3110 // Operands with the same APO can be reordered. We therefore need to count
3111 // how many of them we have for each APO, like this: Cnt[APO] = x.
3112 // Since we only have two APOs, namely true and false, we can avoid using
3113 // a map. Instead we can simply count the number of operands that
3114 // correspond to one of them (in this case the 'true' APO), and calculate
3115 // the other by subtracting it from the total number of operands.
3116 // Operands with the same instruction opcode and parent are more
3117 // profitable since we don't need to move them in many cases, with a high
3118 // probability such lane already can be vectorized effectively.
3119 bool AllUndefs = true;
3120 unsigned NumOpsWithSameOpcodeParent = 0;
3121 Instruction *OpcodeI = nullptr;
3122 BasicBlock *Parent = nullptr;
3123 unsigned Hash = 0;
3124 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3125 const OperandData &OpData = getData(OpIdx, Lane);
3126 if (OpData.APO)
3127 ++CntTrue;
3128 // Use Boyer-Moore majority voting for finding the majority opcode and
3129 // the number of times it occurs.
3130 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3131 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3132 I->getParent() != Parent) {
3133 if (NumOpsWithSameOpcodeParent == 0) {
3134 NumOpsWithSameOpcodeParent = 1;
3135 OpcodeI = I;
3136 Parent = I->getParent();
3137 } else {
3138 --NumOpsWithSameOpcodeParent;
3139 }
3140 } else {
3141 ++NumOpsWithSameOpcodeParent;
3142 }
3143 }
3144 Hash = hash_combine(
3145 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3146 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3147 }
3148 if (AllUndefs)
3149 return {};
3150 OperandsOrderData Data;
3151 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3152 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3153 Data.Hash = Hash;
3154 return Data;
3155 }
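// A worked example of the counting above (sketch): for a lane with three
// operands whose APOs are {false, true, false}, CntTrue == 1 and
// NumOfAPOs == max(1, 3 - 1) == 2, i.e. at most two operands of that lane
// share an APO and are therefore interchangeable with each other.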
3156
3157 /// Go through the instructions in VL and append their operands.
3158 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3159 const InstructionsState &S) {
3160 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3161 assert((empty() || all_of(Operands,
3162 [this](const ValueList &VL) {
3163 return VL.size() == getNumLanes();
3164 })) &&
3165 "Expected same number of lanes");
3166 assert(S.valid() && "InstructionsState is invalid.");
3167 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3168 // arguments to the intrinsic produces the same result.
3169 Instruction *MainOp = S.getMainOp();
3170 unsigned NumOperands = MainOp->getNumOperands();
3171 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3172 OpsVec.resize(ArgSize);
3173 unsigned NumLanes = VL.size();
3174 for (OperandDataVec &Ops : OpsVec)
3175 Ops.resize(NumLanes);
3176 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3177 // Our tree has just 3 nodes: the root and two operands.
3178 // It is therefore trivial to get the APO. We only need to check the
3179 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3180 // operand. The LHS operand of both add and sub is never attached to an
3181 // inverse operation in the linearized form, therefore its APO is
3182 // false. The RHS is true only if V is an inverse operation.
3183
3184 // Since operand reordering is performed on groups of commutative
3185 // operations or alternating sequences (e.g., +, -), we can safely tell
3186 // the inverse operations by checking commutativity.
3187 auto *I = dyn_cast<Instruction>(VL[Lane]);
3188 if (!I && isa<PoisonValue>(VL[Lane])) {
3189 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3190 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3191 continue;
3192 }
3193 bool IsInverseOperation = false;
3194 if (S.isCopyableElement(VL[Lane])) {
3195 // The value is a copyable element.
3196 IsInverseOperation =
3197 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3198 } else {
3199 assert(I && "Expected instruction");
3200 auto [SelectedOp, Ops] = convertTo(I, S);
3201 // We cannot check commutativity by the converted instruction
3202 // (SelectedOp) because isCommutative also examines def-use
3203 // relationships.
3204 IsInverseOperation = !isCommutative(SelectedOp, I);
3205 }
3206 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3207 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3208 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3209 }
3210 }
3211 }
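// Sketch of the APO convention established above: for a lane a = b - c the
// linearized form is a = b + (-c), so operand 0 (b) gets APO == false and
// operand 1 (c) gets APO == true; for a commutative lane a = b + c both
// operands get APO == false. Only operands with matching APOs may later be
// exchanged between lanes (see getBestOperand()).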
3212
3213 /// \returns the number of operands.
3214 unsigned getNumOperands() const { return ArgSize; }
3215
3216 /// \returns the number of lanes.
3217 unsigned getNumLanes() const { return OpsVec[0].size(); }
3218
3219 /// \returns the operand value at \p OpIdx and \p Lane.
3220 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3221 return getData(OpIdx, Lane).V;
3222 }
3223
3224 /// \returns true if the data structure is empty.
3225 bool empty() const { return OpsVec.empty(); }
3226
3227 /// Clears the data.
3228 void clear() { OpsVec.clear(); }
3229
3230 /// \returns true if there are enough operands identical to \p Op to fill
3231 /// the whole vector (possibly mixed with constants or loop-invariant values).
3232 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3233 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3234 assert(Op == getValue(OpIdx, Lane) &&
3235 "Op is expected to be getValue(OpIdx, Lane).");
3236 // Small number of loads - try load matching.
3237 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3238 return false;
3239 bool OpAPO = getData(OpIdx, Lane).APO;
3240 bool IsInvariant = L && L->isLoopInvariant(Op);
3241 unsigned Cnt = 0;
3242 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3243 if (Ln == Lane)
3244 continue;
3245 // This is set to true if we found a candidate for broadcast at Lane.
3246 bool FoundCandidate = false;
3247 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3248 OperandData &Data = getData(OpI, Ln);
3249 if (Data.APO != OpAPO || Data.IsUsed)
3250 continue;
3251 Value *OpILane = getValue(OpI, Lane);
3252 bool IsConstantOp = isa<Constant>(OpILane);
3253 // Consider the broadcast candidate if:
3254 // 1. Same value is found in one of the operands.
3255 if (Data.V == Op ||
3256 // 2. The operand in the given lane is not constant but there is a
3257 // constant operand in another lane (which can be moved to the
3258 // given lane). In this case we can represent it as a simple
3259 // permutation of constant and broadcast.
3260 (!IsConstantOp &&
3261 ((Lns > 2 && isa<Constant>(Data.V)) ||
3262 // 2.1. If we have only 2 lanes, need to check that value in the
3263 // next lane does not build same opcode sequence.
3264 (Lns == 2 &&
3265 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3266 isa<Constant>(Data.V)))) ||
3267 // 3. The operand in the current lane is loop invariant (can be
3268 // hoisted out) and another operand is also a loop invariant
3269 // (though not a constant). In this case the whole vector can be
3270 // hoisted out.
3271 // FIXME: need to teach the cost model about this case for better
3272 // estimation.
3273 (IsInvariant && !isa<Constant>(Data.V) &&
3274 !getSameOpcode({Op, Data.V}, TLI) &&
3275 L->isLoopInvariant(Data.V))) {
3276 FoundCandidate = true;
3277 Data.IsUsed = Data.V == Op;
3278 if (Data.V == Op)
3279 ++Cnt;
3280 break;
3281 }
3282 }
3283 if (!FoundCandidate)
3284 return false;
3285 }
3286 return getNumLanes() == 2 || Cnt > 1;
3287 }
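// For illustration (assuming matching APOs and more than 2 lanes): if the
// operand columns look like
//   Lane 0: {X, C0}  Lane 1: {C1, Y}  Lane 2: {X, Z}  Lane 3: {W, X}
// then shouldBroadcast(X, /*OpIdx=*/0, /*Lane=*/0) can return true: X or a
// movable constant is found in every other lane, so the column can be built
// as a broadcast of X combined with a constant shuffle.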
3288
3289 /// Checks if there is at least a single operand in lanes other than
3290 /// \p Lane that is compatible with the operand \p Op.
3291 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3292 assert(Op == getValue(OpIdx, Lane) &&
3293 "Op is expected to be getValue(OpIdx, Lane).");
3294 bool OpAPO = getData(OpIdx, Lane).APO;
3295 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3296 if (Ln == Lane)
3297 continue;
3298 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3299 const OperandData &Data = getData(OpI, Ln);
3300 if (Data.APO != OpAPO || Data.IsUsed)
3301 return true;
3302 Value *OpILn = getValue(OpI, Ln);
3303 return (L && L->isLoopInvariant(OpILn)) ||
3304 (getSameOpcode({Op, OpILn}, TLI) &&
3305 allSameBlock({Op, OpILn}));
3306 }))
3307 return true;
3308 }
3309 return false;
3310 }
3311
3312 public:
3313 /// Initialize with all the operands of the instruction vector \p RootVL.
3314 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3315 const InstructionsState &S, const BoUpSLP &R)
3316 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3317 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3318 // Append all the operands of RootVL.
3319 appendOperands(RootVL, Operands, S);
3320 }
3321
3322 /// \returns a value vector with the operands across all lanes for the
3323 /// operand at \p OpIdx.
3324 ValueList getVL(unsigned OpIdx) const {
3325 ValueList OpVL(OpsVec[OpIdx].size());
3326 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3327 "Expected same num of lanes across all operands");
3328 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3329 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3330 return OpVL;
3331 }
3332
3333 // Performs operand reordering for 2 or more operands.
3334 // The original operands are in OrigOps[OpIdx][Lane].
3335 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3336 void reorder() {
3337 unsigned NumOperands = getNumOperands();
3338 unsigned NumLanes = getNumLanes();
3339 // Each operand has its own mode. We are using this mode to help us select
3340 // the instructions for each lane, so that they match best with the ones
3341 // we have selected so far.
3342 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3343
3344 // This is a greedy single-pass algorithm. We are going over each lane
3345 // once and deciding on the best order right away with no back-tracking.
3346 // However, in order to increase its effectiveness, we start with the lane
3347 // that has operands that can move the least. For example, given the
3348 // following lanes:
3349 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3350 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3351 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3352 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3353 // we will start at Lane 1, since the operands of the subtraction cannot
3354 // be reordered. Then we will visit the rest of the lanes in a circular
3355 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3356
3357 // Find the first lane that we will start our search from.
3358 unsigned FirstLane = getBestLaneToStartReordering();
3359
3360 // Initialize the modes.
3361 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3362 Value *OpLane0 = getValue(OpIdx, FirstLane);
3363 // Keep track if we have instructions with all the same opcode on one
3364 // side.
3365 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3366 // Check if OpLane0 should be broadcast.
3367 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3368 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3369 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3370 else if (isa<LoadInst>(OpILane0))
3371 ReorderingModes[OpIdx] = ReorderingMode::Load;
3372 else
3373 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3374 } else if (isa<Constant>(OpLane0)) {
3375 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3376 } else if (isa<Argument>(OpLane0)) {
3377 // Our best hope is a Splat. It may save some cost in some cases.
3378 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3379 } else {
3380 llvm_unreachable("Unexpected value kind.");
3381 }
3382 }
3383
3384 // Check that we don't have same operands. No need to reorder if operands
3385 // are just perfect diamond or shuffled diamond match. Do not do it only
3386 // for possible broadcasts or non-power of 2 number of scalars (just for
3387 // now).
3388 auto &&SkipReordering = [this]() {
3389 SmallPtrSet<Value *, 4> UniqueValues;
3390 ArrayRef<OperandData> Op0 = OpsVec.front();
3391 for (const OperandData &Data : Op0)
3392 UniqueValues.insert(Data.V);
3393 for (ArrayRef<OperandData> Op :
3394 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3395 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3396 return !UniqueValues.contains(Data.V);
3397 }))
3398 return false;
3399 }
3400 // TODO: Check if we can remove a check for non-power-2 number of
3401 // scalars after full support of non-power-2 vectorization.
3402 return UniqueValues.size() != 2 &&
3403 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3404 UniqueValues.size());
3405 };
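// A minimal example of the diamond match detected above: with operand
// columns
//   Operand 0: {A, B, C, D}
//   Operand 1: {B, A, D, C}
// every value of operand 1 already occurs in operand 0, there are more than
// two unique scalars and they form a power-of-2 vector, so reordering is
// skipped; any order is just a shuffle of the same future vector.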
3406
3407 // If the initial strategy fails for any of the operand indexes, then we
3408 // perform reordering again in a second pass. This helps avoid assigning
3409 // high priority to the failed strategy, and should improve reordering for
3410 // the non-failed operand indexes.
3411 for (int Pass = 0; Pass != 2; ++Pass) {
3412 // Check if there is no need to reorder operands since they are a perfect or
3413 // shuffled diamond match.
3414 // Need to do it to avoid extra external use cost counting for
3415 // shuffled matches, which may cause regressions.
3416 if (SkipReordering())
3417 break;
3418 // Skip the second pass if the first pass did not fail.
3419 bool StrategyFailed = false;
3420 // Mark all operand data as free to use.
3421 clearUsed();
3422 // We keep the original operand order for the FirstLane, so reorder the
3423 // rest of the lanes. We are visiting the nodes in a circular fashion,
3424 // using FirstLane as the center point and increasing the radius
3425 // distance.
3426 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3427 for (unsigned I = 0; I < NumOperands; ++I)
3428 MainAltOps[I].push_back(getData(I, FirstLane).V);
3429
3430 SmallBitVector UsedLanes(NumLanes);
3431 UsedLanes.set(FirstLane);
3432 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3433 // Visit the lane on the right and then the lane on the left.
3434 for (int Direction : {+1, -1}) {
3435 int Lane = FirstLane + Direction * Distance;
3436 if (Lane < 0 || Lane >= (int)NumLanes)
3437 continue;
3438 UsedLanes.set(Lane);
3439 int LastLane = Lane - Direction;
3440 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3441 "Out of bounds");
3442 // Look for a good match for each operand.
3443 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3444 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3445 std::optional<unsigned> BestIdx =
3446 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3447 MainAltOps[OpIdx], UsedLanes);
3448 // By not selecting a value, we allow the operands that follow to
3449 // select a better matching value. We will get a non-null value in
3450 // the next run of getBestOperand().
3451 if (BestIdx) {
3452 // Swap the current operand with the one returned by
3453 // getBestOperand().
3454 swap(OpIdx, *BestIdx, Lane);
3455 } else {
3456 // Enable the second pass.
3457 StrategyFailed = true;
3458 }
3459 // Try to get the alternate opcode and follow it during analysis.
3460 if (MainAltOps[OpIdx].size() != 2) {
3461 OperandData &AltOp = getData(OpIdx, Lane);
3462 InstructionsState OpS =
3463 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3464 if (OpS && OpS.isAltShuffle())
3465 MainAltOps[OpIdx].push_back(AltOp.V);
3466 }
3467 }
3468 }
3469 }
3470 // Skip second pass if the strategy did not fail.
3471 if (!StrategyFailed)
3472 break;
3473 }
3474 }
3475
3476#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3477 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3478 switch (RMode) {
3479 case ReorderingMode::Load:
3480 return "Load";
3481 case ReorderingMode::Opcode:
3482 return "Opcode";
3483 case ReorderingMode::Constant:
3484 return "Constant";
3485 case ReorderingMode::Splat:
3486 return "Splat";
3487 case ReorderingMode::Failed:
3488 return "Failed";
3489 }
3490 llvm_unreachable("Unimplemented Reordering Type");
3491 }
3492
3493 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3494 raw_ostream &OS) {
3495 return OS << getModeStr(RMode);
3496 }
3497
3498 /// Debug print.
3499 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3500 printMode(RMode, dbgs());
3501 }
3502
3503 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3504 return printMode(RMode, OS);
3505 }
3506
3507 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3508 const unsigned Indent = 2;
3509 unsigned Cnt = 0;
3510 for (const OperandDataVec &OpDataVec : OpsVec) {
3511 OS << "Operand " << Cnt++ << "\n";
3512 for (const OperandData &OpData : OpDataVec) {
3513 OS.indent(Indent) << "{";
3514 if (Value *V = OpData.V)
3515 OS << *V;
3516 else
3517 OS << "null";
3518 OS << ", APO:" << OpData.APO << "}\n";
3519 }
3520 OS << "\n";
3521 }
3522 return OS;
3523 }
3524
3525 /// Debug print.
3526 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3527#endif
3528 };
3529
3530 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3531 /// for a pair which have highest score deemed to have best chance to form
3532 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3533 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
3534 /// of the cost, considered to be good enough score.
3535 std::optional<int>
3536 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3537 int Limit = LookAheadHeuristics::ScoreFail) const {
3538 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3539 RootLookAheadMaxDepth);
3540 int BestScore = Limit;
3541 std::optional<int> Index;
3542 for (int I : seq<int>(0, Candidates.size())) {
3543 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3544 Candidates[I].second,
3545 /*U1=*/nullptr, /*U2=*/nullptr,
3546 /*CurrLevel=*/1, {});
3547 if (Score > BestScore) {
3548 BestScore = Score;
3549 Index = I;
3550 }
3551 }
3552 return Index;
3553 }
3554
3555 /// Checks if the instruction is marked for deletion.
3556 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3557
3558 /// Removes an instruction from its block and eventually deletes it.
3559 /// It's like Instruction::eraseFromParent() except that the actual deletion
3560 /// is delayed until BoUpSLP is destructed.
3561 void eraseInstruction(Instruction *I) {
3562 DeletedInstructions.insert(I);
3563 }
3564
3565 /// Remove instructions from the parent function and clear the operands of \p
3566 /// DeadVals instructions, marking for deletion trivially dead operands.
3567 template <typename T>
3568 void removeInstructionsAndOperands(
3569 ArrayRef<T *> DeadVals,
3570 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3571 SmallVector<WeakTrackingVH> DeadInsts;
3572 for (T *V : DeadVals) {
3573 auto *I = cast<Instruction>(V);
3574 DeletedInstructions.insert(I);
3575 }
3576 DenseSet<Value *> Processed;
3577 for (T *V : DeadVals) {
3578 if (!V || !Processed.insert(V).second)
3579 continue;
3580 auto *I = cast<Instruction>(V);
3581 salvageDebugInfo(*I);
3582 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3583 for (Use &U : I->operands()) {
3584 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3585 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3586 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3587 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3588 return Entry->VectorizedValue == OpI;
3589 })))
3590 DeadInsts.push_back(OpI);
3591 }
3592 I->dropAllReferences();
3593 }
3594 for (T *V : DeadVals) {
3595 auto *I = cast<Instruction>(V);
3596 if (!I->getParent())
3597 continue;
3598 assert((I->use_empty() || all_of(I->uses(),
3599 [&](Use &U) {
3600 return isDeleted(
3601 cast<Instruction>(U.getUser()));
3602 })) &&
3603 "trying to erase instruction with users.");
3604 I->removeFromParent();
3605 SE->forgetValue(I);
3606 }
3607 // Process the dead instruction list until empty.
3608 while (!DeadInsts.empty()) {
3609 Value *V = DeadInsts.pop_back_val();
3610 Instruction *VI = cast_or_null<Instruction>(V);
3611 if (!VI || !VI->getParent())
3612 continue;
3614 "Live instruction found in dead worklist!");
3615 assert(VI->use_empty() && "Instructions with uses are not dead.");
3616
3617 // Don't lose the debug info while deleting the instructions.
3618 salvageDebugInfo(*VI);
3619
3620 // Null out all of the instruction's operands to see if any operand
3621 // becomes dead as we go.
3622 for (Use &OpU : VI->operands()) {
3623 Value *OpV = OpU.get();
3624 if (!OpV)
3625 continue;
3626 OpU.set(nullptr);
3627
3628 if (!OpV->use_empty())
3629 continue;
3630
3631 // If the operand is an instruction that became dead as we nulled out
3632 // the operand, and if it is 'trivially' dead, delete it in a future
3633 // loop iteration.
3634 if (auto *OpI = dyn_cast<Instruction>(OpV))
3635 if (!DeletedInstructions.contains(OpI) &&
3636 (!OpI->getType()->isVectorTy() ||
3637 none_of(VectorValuesAndScales,
3638 [&](const std::tuple<Value *, unsigned, bool> &V) {
3639 return std::get<0>(V) == OpI;
3640 })) &&
3641 wouldInstructionBeTriviallyDead(OpI, TLI))
3642 DeadInsts.push_back(OpI);
3643 }
3644
3645 VI->removeFromParent();
3646 eraseInstruction(VI);
3647 SE->forgetValue(VI);
3648 }
3649 }
3650
3651 /// Checks if the instruction was already analyzed for being possible
3652 /// reduction root.
3653 bool isAnalyzedReductionRoot(Instruction *I) const {
3654 return AnalyzedReductionsRoots.count(I);
3655 }
3656 /// Register given instruction as already analyzed for being possible
3657 /// reduction root.
3658 void analyzedReductionRoot(Instruction *I) {
3659 AnalyzedReductionsRoots.insert(I);
3660 }
3661 /// Checks if the provided list of reduced values was checked already for
3662 /// vectorization.
3663 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3664 return AnalyzedReductionVals.contains(hash_value(VL));
3665 }
3666 /// Adds the list of reduced values to list of already checked values for the
3667 /// vectorization.
3668 void analyzedReductionVals(ArrayRef<Value *> VL) {
3669 AnalyzedReductionVals.insert(hash_value(VL));
3670 }
3671 /// Clear the list of the analyzed reduction root instructions.
3672 void clearReductionData() {
3673 AnalyzedReductionsRoots.clear();
3674 AnalyzedReductionVals.clear();
3675 AnalyzedMinBWVals.clear();
3676 }
3677 /// Checks if the given value is gathered in one of the nodes.
3678 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3679 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3680 }
3681 /// Checks if the given value is gathered in one of the nodes.
3682 bool isGathered(const Value *V) const {
3683 return MustGather.contains(V);
3684 }
3685 /// Checks if the specified value was not scheduled.
3686 bool isNotScheduled(const Value *V) const {
3687 return NonScheduledFirst.contains(V);
3688 }
3689
3690 /// Check if the value is vectorized in the tree.
3691 bool isVectorized(const Value *V) const {
3692 assert(V && "V cannot be nullptr.");
3693 return ScalarToTreeEntries.contains(V);
3694 }
3695
3696 ~BoUpSLP();
3697
3698private:
3699 /// Determine if a node \p E can be demoted to a smaller type with a
3700 /// truncation. We collect the entries that will be demoted in ToDemote.
3701 /// \param E Node for analysis
3702 /// \param ToDemote indices of the nodes to be demoted.
3703 bool collectValuesToDemote(
3704 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3705 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3706 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3707 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3708
3709 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3710 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3711 /// they have only one user and are reorderable).
3712 /// \param ReorderableGathers List of all gather nodes that require reordering
3713 /// (e.g., gather of extractelements or partially vectorizable loads).
3714 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3715 /// reordering, subset of \p NonVectorized.
3716 void buildReorderableOperands(
3717 TreeEntry *UserTE,
3718 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3719 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3720 SmallVectorImpl<TreeEntry *> &GatherOps);
3721
3722 /// Checks if the given \p TE is a gather node with clustered reused scalars
3723 /// and reorders it per given \p Mask.
3724 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3725
3726 /// Checks if all users of \p I are the part of the vectorization tree.
3727 bool areAllUsersVectorized(
3728 Instruction *I,
3729 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3730
3731 /// Return information about the vector formed for the specified index
3732 /// of a vector of (the same) instruction.
3733 TargetTransformInfo::OperandValueInfo
3734 getOperandInfo(ArrayRef<Value *> Ops);
3735 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3736 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3737 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3738 return const_cast<TreeEntry *>(
3739 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3740 }
3741
3742 /// Gets the root instruction for the given node. If the node is a strided
3743 /// load/store node with the reverse order, the root instruction is the last
3744 /// one.
3745 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3746
3747 /// \returns Cast context for the given graph node.
3748 TargetTransformInfo::CastContextHint
3749 getCastContextHint(const TreeEntry &TE) const;
3750
3751 /// \returns the cost of the vectorizable entry.
3752 InstructionCost getEntryCost(const TreeEntry *E,
3753 ArrayRef<Value *> VectorizedVals,
3754 SmallPtrSetImpl<Value *> &CheckedExtracts);
3755
3756 /// Checks if it is legal and profitable to build SplitVectorize node for the
3757 /// given \p VL.
3758 /// \param Op1 first homogeneous scalars.
3759 /// \param Op2 second homogeneous scalars.
3760 /// \param ReorderIndices indices to reorder the scalars.
3761 /// \returns true if the node was successfully built.
3762 bool canBuildSplitNode(ArrayRef<Value *> VL,
3763 const InstructionsState &LocalState,
3764 SmallVectorImpl<Value *> &Op1,
3765 SmallVectorImpl<Value *> &Op2,
3766 OrdersType &ReorderIndices) const;
3767
3768 /// This is the recursive part of buildTree.
3769 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3770 unsigned InterleaveFactor = 0);
3771
3772 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3773 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3774 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3775 /// returns false, setting \p CurrentOrder to either an empty vector or a
3776 /// non-identity permutation that allows to reuse extract instructions.
3777 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3778 /// extract order.
3779 bool canReuseExtract(ArrayRef<Value *> VL,
3780 SmallVectorImpl<unsigned> &CurrentOrder,
3781 bool ResizeAllowed = false) const;
3782
3783 /// Vectorize a single entry in the tree.
3784 Value *vectorizeTree(TreeEntry *E);
3785
3786 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3787 /// \p E.
3788 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3789
3790 /// Create a new vector from a list of scalar values. Produces a sequence
3791 /// which exploits values reused across lanes, and arranges the inserts
3792 /// for ease of later optimization.
3793 template <typename BVTy, typename ResTy, typename... Args>
3794 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3795
3796 /// Create a new vector from a list of scalar values. Produces a sequence
3797 /// which exploits values reused across lanes, and arranges the inserts
3798 /// for ease of later optimization.
3799 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3800
3801 /// Returns the instruction in the bundle, which can be used as a base point
3802 /// for scheduling. Usually it is the last instruction in the bundle, except
3803 /// for the case when all operands are external (in this case, it is the first
3804 /// instruction in the list).
3805 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3806
3807 /// Tries to find extractelement instructions with constant indices from fixed
3808 /// vector type and gather such instructions into a bunch, which highly likely
3809 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3810 /// was successful, the matched scalars are replaced by poison values in \p VL
3811 /// for future analysis.
3812 std::optional<TargetTransformInfo::ShuffleKind>
3813 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3814 SmallVectorImpl<int> &Mask) const;
3815
3816 /// Tries to find extractelement instructions with constant indices from fixed
3817 /// vector type and gather such instructions into a bunch, which highly likely
3818 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3819 /// was successful, the matched scalars are replaced by poison values in \p VL
3820 /// for future analysis.
3821 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3822 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3823 SmallVectorImpl<int> &Mask,
3824 unsigned NumParts) const;
3825
3826 /// Checks if the gathered \p VL can be represented as a single register
3827 /// shuffle(s) of previous tree entries.
3828 /// \param TE Tree entry checked for permutation.
3829 /// \param VL List of scalars (a subset of the TE scalar), checked for
3830 /// permutations. Must form single-register vector.
3831 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3832 /// commands to build the mask using the original vector value, without
3833 /// relying on the potential reordering.
3834 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3835 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3836 std::optional<TargetTransformInfo::ShuffleKind>
3837 isGatherShuffledSingleRegisterEntry(
3838 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3839 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3840 bool ForOrder);
3841
3842 /// Checks if the gathered \p VL can be represented as multi-register
3843 /// shuffle(s) of previous tree entries.
3844 /// \param TE Tree entry checked for permutation.
3845 /// \param VL List of scalars (a subset of the TE scalar), checked for
3846 /// permutations.
3847 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3848 /// commands to build the mask using the original vector value, without
3849 /// relying on the potential reordering.
3850 /// \returns per-register series of ShuffleKind, if gathered values can be
3851 /// represented as shuffles of previous tree entries. \p Mask is filled with
3852 /// the shuffle mask (also on per-register base).
3853 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3854 isGatherShuffledEntry(
3855 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3856 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3857 unsigned NumParts, bool ForOrder = false);
3858
3859 /// \returns the cost of gathering (inserting) the values in \p VL into a
3860 /// vector.
3861 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3862 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3863 Type *ScalarTy) const;
3864
3865 /// Set the Builder insert point to one after the last instruction in
3866 /// the bundle
3867 void setInsertPointAfterBundle(const TreeEntry *E);
3868
3869 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3870 /// specified, the starting vector value is poison.
3871 Value *
3872 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3873 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3874
3875 /// \returns whether the VectorizableTree is fully vectorizable and will
3876 /// be beneficial even the tree height is tiny.
3877 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3878
3879 /// Run through the list of all gathered loads in the graph and try to find
3880 /// vector loads/masked gathers instead of regular gathers. Later these loads
3882 /// are reshuffled to build final gathered nodes.
3882 void tryToVectorizeGatheredLoads(
3883 const SmallMapVector<
3884 std::tuple<BasicBlock *, Value *, Type *>,
3885 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3886 &GatheredLoads);
3887
3888 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3889 /// users of \p TE and collects the stores. It returns the map from the store
3890 /// pointers to the collected stores.
3891 DenseMap<Value *, SmallVector<StoreInst *>>
3892 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3893
3894 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3895 /// stores in \p StoresVec can form a vector instruction. If so it returns
3896 /// true and populates \p ReorderIndices with the shuffle indices of the
3897 /// stores when compared to the sorted vector.
3898 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3899 OrdersType &ReorderIndices) const;
3900
3901 /// Iterates through the users of \p TE, looking for scalar stores that can be
3902 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3903 /// their order and builds an order index vector for each store bundle. It
3904 /// returns all these order vectors found.
3905 /// We run this after the tree has formed, otherwise we may come across user
3906 /// instructions that are not yet in the tree.
3907 SmallVector<OrdersType, 1>
3908 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3909
3910 /// Tries to reorder the gathering node for better vectorization
3911 /// opportunities.
3912 void reorderGatherNode(TreeEntry &TE);
3913
3914 class TreeEntry {
3915 public:
3916 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3917 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3918
3919 /// \returns Common mask for reorder indices and reused scalars.
3920 SmallVector<int> getCommonMask() const {
3921 if (State == TreeEntry::SplitVectorize)
3922 return {};
3923 SmallVector<int> Mask;
3924 inversePermutation(ReorderIndices, Mask);
3925 ::addMask(Mask, ReuseShuffleIndices);
3926 return Mask;
3927 }
3928
3929 /// \returns The mask for split nodes.
3930 SmallVector<int> getSplitMask() const {
3931 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3932 "Expected only split vectorize node.");
3933 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3934 unsigned CommonVF = std::max<unsigned>(
3935 CombinedEntriesWithIndices.back().second,
3936 Scalars.size() - CombinedEntriesWithIndices.back().second);
3937 for (auto [Idx, I] : enumerate(ReorderIndices))
3938 Mask[I] =
3939 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3940 ? CommonVF - CombinedEntriesWithIndices.back().second
3941 : 0);
3942 return Mask;
3943 }
3944
3945 /// Updates (reorders) SplitVectorize node according to the given mask \p
3946 /// Mask and order \p MaskOrder.
3947 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3948 ArrayRef<int> MaskOrder);
3949
3950 /// \returns true if the scalars in VL are equal to this entry.
3951 bool isSame(ArrayRef<Value *> VL) const {
3952 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3953 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3954 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3955 return VL.size() == Mask.size() &&
3956 std::equal(VL.begin(), VL.end(), Mask.begin(),
3957 [Scalars](Value *V, int Idx) {
3958 return (isa<UndefValue>(V) &&
3959 Idx == PoisonMaskElem) ||
3960 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3961 });
3962 };
3963 if (!ReorderIndices.empty()) {
3964 // TODO: implement matching if the nodes are just reordered, still can
3965 // treat the vector as the same if the list of scalars matches VL
3966 // directly, without reordering.
3967 SmallVector<int> Mask;
3968 inversePermutation(ReorderIndices, Mask);
3969 if (VL.size() == Scalars.size())
3970 return IsSame(Scalars, Mask);
3971 if (VL.size() == ReuseShuffleIndices.size()) {
3972 ::addMask(Mask, ReuseShuffleIndices);
3973 return IsSame(Scalars, Mask);
3974 }
3975 return false;
3976 }
3977 return IsSame(Scalars, ReuseShuffleIndices);
3978 }
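// E.g. (a sketch of the reuse case handled above): with Scalars == {a, b}
// and ReuseShuffleIndices == {0, 1, 0, 1}, isSame({a, b, a, b}) returns
// true, since every VL element maps back onto the corresponding reused
// scalar.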
3979
3980 /// \returns true if current entry has same operands as \p TE.
3981 bool hasEqualOperands(const TreeEntry &TE) const {
3982 if (TE.getNumOperands() != getNumOperands())
3983 return false;
3984 SmallBitVector Used(getNumOperands());
3985 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3986 unsigned PrevCount = Used.count();
3987 for (unsigned K = 0; K < E; ++K) {
3988 if (Used.test(K))
3989 continue;
3990 if (getOperand(K) == TE.getOperand(I)) {
3991 Used.set(K);
3992 break;
3993 }
3994 }
3995 // Check if we actually found the matching operand.
3996 if (PrevCount == Used.count())
3997 return false;
3998 }
3999 return true;
4000 }
4001
4002 /// \return Final vectorization factor for the node. Defined by the total
4003 /// number of vectorized scalars, including those, used several times in the
4004 /// entry and counted in the \a ReuseShuffleIndices, if any.
4005 unsigned getVectorFactor() const {
4006 if (!ReuseShuffleIndices.empty())
4007 return ReuseShuffleIndices.size();
4008 return Scalars.size();
4009 };
4010
4011 /// Checks if the current node is a gather node.
4012 bool isGather() const { return State == NeedToGather; }
4013
4014 /// A vector of scalars.
4015 ValueList Scalars;
4016
4017 /// The Scalars are vectorized into this value. It is initialized to Null.
4018 WeakTrackingVH VectorizedValue = nullptr;
4019
4020 /// Do we need to gather this sequence or vectorize it
4021 /// (either with vector instruction or with scatter/gather
4022 /// intrinsics for store/load)?
4023 enum EntryState {
4024 Vectorize, ///< The node is regularly vectorized.
4025 ScatterVectorize, ///< Masked scatter/gather node.
4026 StridedVectorize, ///< Strided loads (and stores)
4027 CompressVectorize, ///< (Masked) load with compress.
4028 NeedToGather, ///< Gather/buildvector node.
4029 CombinedVectorize, ///< Vectorized node, combined with its user into more
4030 ///< complex node like select/cmp to minmax, mul/add to
4031 ///< fma, etc. Must be used for the following nodes in
4032 ///< the pattern, not the very first one.
4033 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4034 ///< independently and then combines back.
4035 };
4036 EntryState State;
4037
4038 /// List of combined opcodes supported by the vectorizer.
4039 enum CombinedOpcode {
4040 NotCombinedOp = -1,
4041 MinMax = Instruction::OtherOpsEnd + 1,
4042 FMulAdd,
4043 };
4044 CombinedOpcode CombinedOp = NotCombinedOp;
4045
4046 /// Does this sequence require some shuffling?
4047 SmallVector<int, 4> ReuseShuffleIndices;
4048
4049 /// Does this entry require reordering?
4050 SmallVector<unsigned, 4> ReorderIndices;
4051
4052 /// Points back to the VectorizableTree.
4053 ///
4054 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4055 /// to be a pointer and needs to be able to initialize the child iterator.
4056 /// Thus we need a reference back to the container to translate the indices
4057 /// to entries.
4058 VecTreeTy &Container;
4059
4060 /// The TreeEntry index containing the user of this entry.
4061 EdgeInfo UserTreeIndex;
4062
4063 /// The index of this treeEntry in VectorizableTree.
4064 unsigned Idx = 0;
4065
4066 /// For gather/buildvector/alt opcode nodes, which are combined from
4067 /// other nodes as a series of insertvector instructions.
4068 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4069
4070 private:
4071 /// The operands of each instruction in each lane Operands[op_index][lane].
4072 /// Note: This helps avoid the replication of the code that performs the
4073 /// reordering of operands during buildTreeRec() and vectorizeTree().
4074 SmallVector<ValueList, 2> Operands;
4075
4076 /// Copyable elements of the entry node.
4077 SmallPtrSet<const Value *, 4> CopyableElements;
4078
4079 /// MainOp and AltOp are recorded inside. S should be obtained from
4080 /// newTreeEntry.
4081 InstructionsState S = InstructionsState::invalid();
4082
4083 /// Interleaving factor for interleaved loads Vectorize nodes.
4084 unsigned InterleaveFactor = 0;
4085
4086 /// True if the node does not require scheduling.
4087 bool DoesNotNeedToSchedule = false;
4088
4089 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4090 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4091 if (Operands.size() < OpIdx + 1)
4092 Operands.resize(OpIdx + 1);
4093 assert(Operands[OpIdx].empty() && "Already resized?");
4094 assert(OpVL.size() <= Scalars.size() &&
4095 "Number of operands is greater than the number of scalars.");
4096 Operands[OpIdx].resize(OpVL.size());
4097 copy(OpVL, Operands[OpIdx].begin());
4098 }
4099
4100 public:
4101 /// Returns interleave factor for interleave nodes.
4102 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4103 /// Sets interleaving factor for the interleaving nodes.
4104 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4105
4106 /// Marks the node as one that does not require scheduling.
4107 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4108 /// Returns true if the node is marked as one that does not require
4109 /// scheduling.
4110 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4111
4112 /// Set this bundle's operands from \p Operands.
4113 void setOperands(ArrayRef<ValueList> Operands) {
4114 for (unsigned I : seq<unsigned>(Operands.size()))
4115 setOperand(I, Operands[I]);
4116 }
4117
4118 /// Reorders operands of the node to the given mask \p Mask.
4119 void reorderOperands(ArrayRef<int> Mask) {
4120 for (ValueList &Operand : Operands)
4121 reorderScalars(Operand, Mask);
4122 }
4123
4124 /// \returns the \p OpIdx operand of this TreeEntry.
4125 ValueList &getOperand(unsigned OpIdx) {
4126 assert(OpIdx < Operands.size() && "Off bounds");
4127 return Operands[OpIdx];
4128 }
4129
4130 /// \returns the \p OpIdx operand of this TreeEntry.
4131 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4132 assert(OpIdx < Operands.size() && "Off bounds");
4133 return Operands[OpIdx];
4134 }
4135
4136 /// \returns the number of operands.
4137 unsigned getNumOperands() const { return Operands.size(); }
4138
4139 /// \return the single \p OpIdx operand.
4140 Value *getSingleOperand(unsigned OpIdx) const {
4141 assert(OpIdx < Operands.size() && "Off bounds");
4142 assert(!Operands[OpIdx].empty() && "No operand available");
4143 return Operands[OpIdx][0];
4144 }
4145
4146 /// Some of the instructions in the list have alternate opcodes.
4147 bool isAltShuffle() const { return S.isAltShuffle(); }
4148
4149 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4150 return S.getMatchingMainOpOrAltOp(I);
4151 }
4152
4153 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4154 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
4155 /// \p OpValue.
4156 Value *isOneOf(Value *Op) const {
4157 auto *I = dyn_cast<Instruction>(Op);
4158 if (I && getMatchingMainOpOrAltOp(I))
4159 return Op;
4160 return S.getMainOp();
4161 }
4162
4163 void setOperations(const InstructionsState &S) {
4164 assert(S && "InstructionsState is invalid.");
4165 this->S = S;
4166 }
4167
4168 Instruction *getMainOp() const { return S.getMainOp(); }
4169
4170 Instruction *getAltOp() const { return S.getAltOp(); }
4171
4172 /// The main/alternate opcodes for the list of instructions.
4173 unsigned getOpcode() const { return S.getOpcode(); }
4174
4175 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4176
4177 bool hasState() const { return S.valid(); }
4178
4179 /// Add \p V to the list of copyable elements.
4180 void addCopyableElement(Value *V) {
4181 assert(S.isCopyableElement(V) && "Not a copyable element.");
4182 CopyableElements.insert(V);
4183 }
4184
4185 /// Returns true if \p V is a copyable element.
4186 bool isCopyableElement(Value *V) const {
4187 return CopyableElements.contains(V);
4188 }
4189
4190 /// Returns true if any scalar in the list is a copyable element.
4191 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4192
4193 /// Returns the state of the operations.
4194 const InstructionsState &getOperations() const { return S; }
4195
4196 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
4197 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
4198 unsigned findLaneForValue(Value *V) const {
4199 unsigned FoundLane = getVectorFactor();
4200 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4201 std::advance(It, 1)) {
4202 if (*It != V)
4203 continue;
4204 FoundLane = std::distance(Scalars.begin(), It);
4205 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4206 if (!ReorderIndices.empty())
4207 FoundLane = ReorderIndices[FoundLane];
4208 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4209 if (ReuseShuffleIndices.empty())
4210 break;
4211 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4212 RIt != ReuseShuffleIndices.end()) {
4213 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4214 break;
4215 }
4216 }
4217 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4218 return FoundLane;
4219 }
4220
4221 /// Build a shuffle mask for graph entry which represents a merge of main
4222 /// and alternate operations.
4223 void
4224 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4225 SmallVectorImpl<int> &Mask,
4226 SmallVectorImpl<Value *> *OpScalars = nullptr,
4227 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4228
4229 /// Return true if this is a non-power-of-2 node.
4230 bool isNonPowOf2Vec() const {
4231 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4232 return IsNonPowerOf2;
4233 }
4234
4235 /// Return true if this is a node, which tries to vectorize number of
4236 /// elements, forming whole vectors.
4237 bool
4238 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4239 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4240 TTI, getValueType(Scalars.front()), Scalars.size());
4241 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4242 "Reshuffling not supported with non-power-of-2 vectors yet.");
4243 return IsNonPowerOf2;
4244 }
4245
4246 Value *getOrdered(unsigned Idx) const {
4247 if (ReorderIndices.empty())
4248 return Scalars[Idx];
4249 SmallVector<int> Mask;
4250 inversePermutation(ReorderIndices, Mask);
4251 return Scalars[Mask[Idx]];
4252 }
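// Note (sketch): when a node is reordered, Scalars[I] == VL[ReorderIndices[I]]
// (see newTreeEntry() below), so getOrdered(Idx) applies the inverse
// permutation and returns the scalar that occupied position Idx in the
// original scalar list.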
4253
4254#ifndef NDEBUG
4255 /// Debug printer.
4256 LLVM_DUMP_METHOD void dump() const {
4257 dbgs() << Idx << ".\n";
4258 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4259 dbgs() << "Operand " << OpI << ":\n";
4260 for (const Value *V : Operands[OpI])
4261 dbgs().indent(2) << *V << "\n";
4262 }
4263 dbgs() << "Scalars: \n";
4264 for (Value *V : Scalars)
4265 dbgs().indent(2) << *V << "\n";
4266 dbgs() << "State: ";
4267 if (S && hasCopyableElements())
4268 dbgs() << "[[Copyable]] ";
4269 switch (State) {
4270 case Vectorize:
4271 if (InterleaveFactor > 0) {
4272 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4273 << "\n";
4274 } else {
4275 dbgs() << "Vectorize\n";
4276 }
4277 break;
4278 case ScatterVectorize:
4279 dbgs() << "ScatterVectorize\n";
4280 break;
4281 case StridedVectorize:
4282 dbgs() << "StridedVectorize\n";
4283 break;
4284 case CompressVectorize:
4285 dbgs() << "CompressVectorize\n";
4286 break;
4287 case NeedToGather:
4288 dbgs() << "NeedToGather\n";
4289 break;
4290 case CombinedVectorize:
4291 dbgs() << "CombinedVectorize\n";
4292 break;
4293 case SplitVectorize:
4294 dbgs() << "SplitVectorize\n";
4295 break;
4296 }
4297 if (S) {
4298 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4299 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4300 } else {
4301 dbgs() << "MainOp: NULL\n";
4302 dbgs() << "AltOp: NULL\n";
4303 }
4304 dbgs() << "VectorizedValue: ";
4305 if (VectorizedValue)
4306 dbgs() << *VectorizedValue << "\n";
4307 else
4308 dbgs() << "NULL\n";
4309 dbgs() << "ReuseShuffleIndices: ";
4310 if (ReuseShuffleIndices.empty())
4311 dbgs() << "Empty";
4312 else
4313 for (int ReuseIdx : ReuseShuffleIndices)
4314 dbgs() << ReuseIdx << ", ";
4315 dbgs() << "\n";
4316 dbgs() << "ReorderIndices: ";
4317 for (unsigned ReorderIdx : ReorderIndices)
4318 dbgs() << ReorderIdx << ", ";
4319 dbgs() << "\n";
4320 dbgs() << "UserTreeIndex: ";
4321 if (UserTreeIndex)
4322 dbgs() << UserTreeIndex;
4323 else
4324 dbgs() << "<invalid>";
4325 dbgs() << "\n";
4326 if (!CombinedEntriesWithIndices.empty()) {
4327 dbgs() << "Combined entries: ";
4328 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4329 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4330 });
4331 dbgs() << "\n";
4332 }
4333 }
4334#endif
4335 };
4336
4337#ifndef NDEBUG
4338 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4339 InstructionCost VecCost, InstructionCost ScalarCost,
4340 StringRef Banner) const {
4341 dbgs() << "SLP: " << Banner << ":\n";
4342 E->dump();
4343 dbgs() << "SLP: Costs:\n";
4344 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4345 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4346 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4347 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4348 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4349 }
4350#endif
4351
4352 /// Create a new gather TreeEntry
4353 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4354 const InstructionsState &S,
4355 const EdgeInfo &UserTreeIdx,
4356 ArrayRef<int> ReuseShuffleIndices = {}) {
4357 auto Invalid = ScheduleBundle::invalid();
4358 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4359 }
4360
4361 /// Create a new VectorizableTree entry.
4362 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4363 const InstructionsState &S,
4364 const EdgeInfo &UserTreeIdx,
4365 ArrayRef<int> ReuseShuffleIndices = {},
4366 ArrayRef<unsigned> ReorderIndices = {},
4367 unsigned InterleaveFactor = 0) {
4368 TreeEntry::EntryState EntryState =
4369 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4370 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4371 ReuseShuffleIndices, ReorderIndices);
4372 if (E && InterleaveFactor > 0)
4373 E->setInterleave(InterleaveFactor);
4374 return E;
4375 }
4376
4377 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4378 TreeEntry::EntryState EntryState,
4379 ScheduleBundle &Bundle, const InstructionsState &S,
4380 const EdgeInfo &UserTreeIdx,
4381 ArrayRef<int> ReuseShuffleIndices = {},
4382 ArrayRef<unsigned> ReorderIndices = {}) {
4383 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4384 EntryState == TreeEntry::SplitVectorize)) ||
4385 (Bundle && EntryState != TreeEntry::NeedToGather &&
4386 EntryState != TreeEntry::SplitVectorize)) &&
4387 "Need to vectorize gather entry?");
4388 // Gathered loads still gathered? Do not create entry, use the original one.
4389 if (GatheredLoadsEntriesFirst.has_value() &&
4390 EntryState == TreeEntry::NeedToGather && S &&
4391 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4392 !UserTreeIdx.UserTE)
4393 return nullptr;
4394 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4395 TreeEntry *Last = VectorizableTree.back().get();
4396 Last->Idx = VectorizableTree.size() - 1;
4397 Last->State = EntryState;
4398 if (UserTreeIdx.UserTE)
4399 OperandsToTreeEntry.try_emplace(
4400 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4401 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4402 // for non-power-of-two vectors.
4403 assert(
4404 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4405 ReuseShuffleIndices.empty()) &&
4406 "Reshuffling scalars not yet supported for nodes with padding");
4407 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4408 ReuseShuffleIndices.end());
4409 if (ReorderIndices.empty()) {
4410 Last->Scalars.assign(VL.begin(), VL.end());
4411 if (S)
4412 Last->setOperations(S);
4413 } else {
4414 // Reorder scalars and build final mask.
4415 Last->Scalars.assign(VL.size(), nullptr);
4416 transform(ReorderIndices, Last->Scalars.begin(),
4417 [VL](unsigned Idx) -> Value * {
4418 if (Idx >= VL.size())
4419 return UndefValue::get(VL.front()->getType());
4420 return VL[Idx];
4421 });
4422 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4423 if (S)
4424 Last->setOperations(S);
4425 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4426 }
4427 if (EntryState == TreeEntry::SplitVectorize) {
4428 assert(S && "Split nodes must have operations.");
4429 Last->setOperations(S);
4430 SmallPtrSet<Value *, 4> Processed;
4431 for (Value *V : VL) {
4432 auto *I = dyn_cast<Instruction>(V);
4433 if (!I)
4434 continue;
4435 auto It = ScalarsInSplitNodes.find(V);
4436 if (It == ScalarsInSplitNodes.end()) {
4437 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4438 (void)Processed.insert(V);
4439 } else if (Processed.insert(V).second) {
4440 assert(!is_contained(It->getSecond(), Last) &&
4441 "Value already associated with the node.");
4442 It->getSecond().push_back(Last);
4443 }
4444 }
4445 } else if (!Last->isGather()) {
4446 if (isa<PHINode>(S.getMainOp()) ||
4447 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4448 (!S.areInstructionsWithCopyableElements() &&
4449 doesNotNeedToSchedule(VL)) ||
4450 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4451 Last->setDoesNotNeedToSchedule();
4452 SmallPtrSet<Value *, 4> Processed;
4453 for (Value *V : VL) {
4454 if (isa<PoisonValue>(V))
4455 continue;
4456 if (S.isCopyableElement(V)) {
4457 Last->addCopyableElement(V);
4458 continue;
4459 }
4460 auto It = ScalarToTreeEntries.find(V);
4461 if (It == ScalarToTreeEntries.end()) {
4462 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4463 (void)Processed.insert(V);
4464 } else if (Processed.insert(V).second) {
4465 assert(!is_contained(It->getSecond(), Last) &&
4466 "Value already associated with the node.");
4467 It->getSecond().push_back(Last);
4468 }
4469 }
4470 // Update the scheduler bundle to point to this TreeEntry.
4471 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4472 "Bundle and VL out of sync");
4473 if (!Bundle.getBundle().empty()) {
4474#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4475 auto *BundleMember = Bundle.getBundle().begin();
4476 SmallPtrSet<Value *, 4> Processed;
4477 for (Value *V : VL) {
4478 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4479 continue;
4480 ++BundleMember;
4481 }
4482 assert(BundleMember == Bundle.getBundle().end() &&
4483 "Bundle and VL out of sync");
4484#endif
4485 Bundle.setTreeEntry(Last);
4486 }
4487 } else {
4488 // Build a map for gathered scalars to the nodes where they are used.
4489 bool AllConstsOrCasts = true;
4490 for (Value *V : VL) {
4491 if (S && S.areInstructionsWithCopyableElements() &&
4492 S.isCopyableElement(V))
4493 Last->addCopyableElement(V);
4494 if (!isConstant(V)) {
4495 auto *I = dyn_cast<CastInst>(V);
4496 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4497 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4498 !UserTreeIdx.UserTE->isGather())
4499 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4500 }
4501 }
4502 if (AllConstsOrCasts)
4503 CastMaxMinBWSizes =
4504 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4505 MustGather.insert_range(VL);
4506 }
4507
4508 if (UserTreeIdx.UserTE)
4509 Last->UserTreeIndex = UserTreeIdx;
4510 return Last;
4511 }
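  // For illustration, a hypothetical example of the bookkeeping above: for a
  // bundle VL = {%a = load, %b = load} recorded as a Vectorize entry, each
  // load is appended to its ScalarToTreeEntries list and the scheduler bundle
  // is pointed back at the new entry; if the same VL instead became a
  // NeedToGather entry, the non-constant scalars would be recorded in
  // ValueToGatherNodes and MustGather rather than ScalarToTreeEntries.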
4512
4513 /// -- Vectorization State --
4514 /// Holds all of the tree entries.
4515 TreeEntry::VecTreeTy VectorizableTree;
4516
4517#ifndef NDEBUG
4518 /// Debug printer.
4519 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4520 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4521 VectorizableTree[Id]->dump();
4522 dbgs() << "\n";
4523 }
4524 }
4525#endif
4526
4527 /// Get list of vector entries, associated with the value \p V.
4528 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4529 assert(V && "V cannot be nullptr.");
4530 auto It = ScalarToTreeEntries.find(V);
4531 if (It == ScalarToTreeEntries.end())
4532 return {};
4533 return It->getSecond();
4534 }
4535
4536 /// Get list of split vector entries, associated with the value \p V.
4537 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4538 assert(V && "V cannot be nullptr.");
4539 auto It = ScalarsInSplitNodes.find(V);
4540 if (It == ScalarsInSplitNodes.end())
4541 return {};
4542 return It->getSecond();
4543 }
4544
4545 /// Returns first vector node for value \p V, matching values \p VL.
4546 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4547 bool SameVF = false) const {
4548 assert(V && "V cannot be nullptr.");
4549 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4550 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4551 return TE;
4552 return nullptr;
4553 }
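  // For illustration: a scalar %x that participates in two different vector
  // nodes has both entries returned by getTreeEntries(%x), while
  // getSameValuesTreeEntry(%x, VL) additionally filters for an entry whose
  // scalars match VL (and, with SameVF set, whose vector factor equals
  // VL.size()).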
4554
4555 /// Check that the operand node of an alternate node does not generate a
4556 /// buildvector sequence. If it does, it is probably not worth building an
4557 /// alternate shuffle when the number of buildvector operands plus the
4558 /// alternate instruction exceeds the number of buildvector instructions.
4559 /// \param S the instructions state of the analyzed values.
4560 /// \param VL list of the instructions with alternate opcodes.
4561 bool areAltOperandsProfitable(const InstructionsState &S,
4562 ArrayRef<Value *> VL) const;
4563
4564 /// Contains all the outputs of legality analysis for a list of values to
4565 /// vectorize.
4566 class ScalarsVectorizationLegality {
4567 InstructionsState S;
4568 bool IsLegal;
4569 bool TryToFindDuplicates;
4570 bool TrySplitVectorize;
4571
4572 public:
4573 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4574 bool TryToFindDuplicates = true,
4575 bool TrySplitVectorize = false)
4576 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4577 TrySplitVectorize(TrySplitVectorize) {
4578 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4579 "Inconsistent state");
4580 }
4581 const InstructionsState &getInstructionsState() const { return S; };
4582 bool isLegal() const { return IsLegal; }
4583 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4584 bool trySplitVectorize() const { return TrySplitVectorize; }
4585 };
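  // A minimal usage sketch (hypothetical caller code): the legality result is
  // queried first and the construction strategy is chosen from its flags, e.g.
  //   ScalarsVectorizationLegality SVL =
  //       getScalarsVectorizationLegality(VL, Depth, UserTreeIdx, false);
  //   if (!SVL.isLegal()) {
  //     if (SVL.trySplitVectorize())
  //       ; // attempt a split node
  //     else if (SVL.tryToFindDuplicates())
  //       ; // reshuffle duplicated scalars
  //     else
  //       ; // fall back to a gather node
  //   }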
4586
4587 /// Checks if the specified list of the instructions/values can be vectorized
4588 /// in general.
4589 ScalarsVectorizationLegality
4590 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4591 const EdgeInfo &UserTreeIdx,
4592 bool TryCopyableElementsVectorization) const;
4593
4594 /// Checks if the specified list of the instructions/values can be vectorized
4595 /// and fills required data before actual scheduling of the instructions.
4596 TreeEntry::EntryState getScalarsVectorizationState(
4597 const InstructionsState &S, ArrayRef<Value *> VL,
4598 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4599 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4600
4601 /// Maps a specific scalar to its tree entry(ies).
4602 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4603
4604 /// List of deleted non-profitable nodes.
4605 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4606
4607 /// List of nodes transformed to gather nodes, with their conservative
4608 /// gather/buildvector cost estimation.
4609 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4610
4611 /// Maps the operand index and entry to the corresponding tree entry.
4612 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4613 OperandsToTreeEntry;
4614
4615 /// Scalars, used in split vectorize nodes.
4616 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4617
4618 /// Maps a value to the proposed vectorizable size.
4619 SmallDenseMap<Value *, unsigned> InstrElementSize;
4620
4621 /// A list of scalars that we found that we need to keep as scalars.
4622 ValueSet MustGather;
4623
4624 /// A set of first non-schedulable values.
4625 ValueSet NonScheduledFirst;
4626
4627 /// A map between the vectorized entries and the last instructions in the
4628 /// bundles. The bundles are built in use order, not in the def order of the
4629 /// instructions. Therefore, we cannot rely on the last instruction in the
4630 /// bundle also being the last instruction in program order during the
4631 /// vectorization process, since the basic blocks are modified; the last
4632 /// instructions need to be pre-gathered beforehand.
4633 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4634
4635 /// Keeps the mapping between the last instructions and their insertion
4636 /// points, i.e. the instruction immediately after the last instruction.
4637 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4638
4639 /// List of gather nodes that depend on other gather/vector nodes and should
4640 /// be emitted after the vector instruction emission process to correctly
4641 /// handle the order of the vector instructions and shuffles.
4642 SetVector<const TreeEntry *> PostponedGathers;
4643
4644 using ValueToGatherNodesMap =
4645 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4646 ValueToGatherNodesMap ValueToGatherNodes;
4647
4648 /// A list of the load entries (node indices) which can be vectorized using
4649 /// a strided or masked gather approach, but are first attempted to be
4650 /// represented as contiguous loads.
4651 SetVector<unsigned> LoadEntriesToVectorize;
4652
4653 /// true if graph nodes transforming mode is on.
4654 bool IsGraphTransformMode = false;
4655
4656 /// The index of the first gathered load entry in the VectorizeTree.
4657 std::optional<unsigned> GatheredLoadsEntriesFirst;
4658
4659 /// Maps compress entries to their mask data for the final codegen.
4660 SmallDenseMap<const TreeEntry *,
4661 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4662 CompressEntryToData;
4663
4664 /// This POD struct describes one external user in the vectorized tree.
4665 struct ExternalUser {
4666 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4667 : Scalar(S), User(U), E(E), Lane(L) {}
4668
4669 /// Which scalar in our function.
4670 Value *Scalar = nullptr;
4671
4672 /// Which user that uses the scalar.
4673 llvm::User *User = nullptr;
4674
4675 /// Vector node, the value is part of.
4676 const TreeEntry &E;
4677
4678 /// Which lane does the scalar belong to.
4679 unsigned Lane;
4680 };
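  // For illustration: if %x is vectorized as lane 2 of a tree entry but is
  // also used by a call that is not part of the tree, an
  // ExternalUser(%x, TheCall, Entry, 2) record is added so that an
  // extractelement for lane 2 can be emitted for that use after
  // vectorization; a nullptr User means the scalar is used later, after
  // vectorization, without a concrete user instruction.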
4681 using UserList = SmallVector<ExternalUser, 16>;
4682
4683 /// Checks if two instructions may access the same memory.
4684 ///
4685 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4686 /// is invariant in the calling loop.
4687 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4688 Instruction *Inst2) {
4689 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4690 // First check if the result is already in the cache.
4691 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4692 auto Res = AliasCache.try_emplace(Key);
4693 if (!Res.second)
4694 return Res.first->second;
4695 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4696 // Store the result in the cache.
4697 Res.first->getSecond() = Aliased;
4698 return Aliased;
4699 }
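  // For illustration: repeated queries isAliased(StoreLoc, Store, Load) hit
  // the cached (Store, Load) entry after the first BatchAA ModRef query, so a
  // pairwise sweep over the scheduling region performs each aliasing check at
  // most once per instruction pair.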
4700
4701 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4702
4703 /// Cache for alias results.
4704 /// TODO: consider moving this to the AliasAnalysis itself.
4705 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4706
4707 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4708 // globally through SLP because we don't perform any action which
4709 // invalidates capture results.
4710 BatchAAResults BatchAA;
4711
4712 /// Temporary store for deleted instructions. Instructions will be deleted
4713 /// eventually when the BoUpSLP is destructed. The deferral is required to
4714 /// ensure that there are no incorrect collisions in the AliasCache, which
4715 /// can happen if a new instruction is allocated at the same address as a
4716 /// previously deleted instruction.
4717 DenseSet<Instruction *> DeletedInstructions;
4718
4719 /// Set of instructions already analyzed for reductions.
4720 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4721
4722 /// Set of hashes for the list of reduction values already being analyzed.
4723 DenseSet<size_t> AnalyzedReductionVals;
4724
4725 /// Values already analyzed for minimal bitwidth and found to be
4726 /// non-profitable.
4727 DenseSet<Value *> AnalyzedMinBWVals;
4728
4729 /// A list of values that need to be extracted out of the tree.
4730 /// This list holds pairs of (Internal Scalar : External User). External User
4731 /// can be nullptr, which means that this Internal Scalar will be used later,
4732 /// after vectorization.
4733 UserList ExternalUses;
4734
4735 /// A list of GEPs which can be replaced by scalar GEPs instead of
4736 /// extractelement instructions.
4737 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4738
4739 /// A list of scalars to be extracted without a specific user because of too
4740 /// many uses.
4741 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4742
4743 /// Values used only by @llvm.assume calls.
4744 SmallPtrSet<const Value *, 32> EphValues;
4745
4746 /// Holds all of the instructions that we gathered, shuffle instructions and
4747 /// extractelements.
4748 SetVector<Instruction *> GatherShuffleExtractSeq;
4749
4750 /// A list of blocks that we are going to CSE.
4751 DenseSet<BasicBlock *> CSEBlocks;
4752
4753 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4754 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4755
4756 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4757 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4758 /// single instruction, while ScheduleBundle represents a batch of
4759 /// instructions that are going to be grouped together. ScheduleCopyableData
4760 /// models an extra user for "copyable" instructions.
4761 class ScheduleEntity {
4762 friend class ScheduleBundle;
4763 friend class ScheduleData;
4764 friend class ScheduleCopyableData;
4765
4766 protected:
4767 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4768 Kind getKind() const { return K; }
4769 ScheduleEntity(Kind K) : K(K) {}
4770
4771 private:
4772 /// Used for getting a "good" final ordering of instructions.
4773 int SchedulingPriority = 0;
4774 /// True if this instruction (or bundle) is scheduled (or considered as
4775 /// scheduled in the dry-run).
4776 bool IsScheduled = false;
4777 /// The kind of the ScheduleEntity.
4778 const Kind K = Kind::ScheduleData;
4779
4780 public:
4781 ScheduleEntity() = delete;
4782 /// Gets/sets the scheduling priority.
4783 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4784 int getSchedulingPriority() const { return SchedulingPriority; }
4785 bool isReady() const {
4786 if (const auto *SD = dyn_cast<ScheduleData>(this))
4787 return SD->isReady();
4788 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4789 return CD->isReady();
4790 return cast<ScheduleBundle>(this)->isReady();
4791 }
4792 /// Returns true if the dependency information has been calculated.
4793 /// Note that dependency validity can vary between instructions within
4794 /// a single bundle.
4795 bool hasValidDependencies() const {
4796 if (const auto *SD = dyn_cast<ScheduleData>(this))
4797 return SD->hasValidDependencies();
4798 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4799 return CD->hasValidDependencies();
4800 return cast<ScheduleBundle>(this)->hasValidDependencies();
4801 }
4802 /// Gets the number of unscheduled dependencies.
4803 int getUnscheduledDeps() const {
4804 if (const auto *SD = dyn_cast<ScheduleData>(this))
4805 return SD->getUnscheduledDeps();
4806 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4807 return CD->getUnscheduledDeps();
4808 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4809 }
4810 /// Increments the number of unscheduled dependencies.
4811 int incrementUnscheduledDeps(int Incr) {
4812 if (auto *SD = dyn_cast<ScheduleData>(this))
4813 return SD->incrementUnscheduledDeps(Incr);
4814 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4815 }
4816 /// Gets the number of dependencies.
4817 int getDependencies() const {
4818 if (const auto *SD = dyn_cast<ScheduleData>(this))
4819 return SD->getDependencies();
4820 return cast<ScheduleCopyableData>(this)->getDependencies();
4821 }
4822 /// Gets the instruction.
4823 Instruction *getInst() const {
4824 if (const auto *SD = dyn_cast<ScheduleData>(this))
4825 return SD->getInst();
4826 return cast<ScheduleCopyableData>(this)->getInst();
4827 }
4828
4829 /// Gets/sets if the bundle is scheduled.
4830 bool isScheduled() const { return IsScheduled; }
4831 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4832
4833 static bool classof(const ScheduleEntity *) { return true; }
4834
4835#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4836 void dump(raw_ostream &OS) const {
4837 if (const auto *SD = dyn_cast<ScheduleData>(this))
4838 return SD->dump(OS);
4839 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4840 return CD->dump(OS);
4841 return cast<ScheduleBundle>(this)->dump(OS);
4842 }
4843
4844 LLVM_DUMP_METHOD void dump() const {
4845 dump(dbgs());
4846 dbgs() << '\n';
4847 }
4848#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4849 };
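  // The Kind tag above enables the usual LLVM isa<>/dyn_cast<> dispatch, e.g.
  // a hypothetical visitor could be written as:
  //   void visit(const ScheduleEntity &SE) {
  //     if (const auto *SD = dyn_cast<ScheduleData>(&SE)) {
  //       // a single scheduled instruction
  //     } else if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SE)) {
  //       // a virtual copy of an instruction
  //     } else {
  //       // cast<ScheduleBundle>(&SE): a whole bundle
  //     }
  //   }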
4850
4851#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4852 friend inline raw_ostream &operator<<(raw_ostream &OS,
4853 const BoUpSLP::ScheduleEntity &SE) {
4854 SE.dump(OS);
4855 return OS;
4856 }
4857#endif
4858
4859 /// Contains all scheduling relevant data for an instruction.
4860 /// A ScheduleData either represents a single instruction or a member of an
4861 /// instruction bundle (= a group of instructions which is combined into a
4862 /// vector instruction).
4863 class ScheduleData final : public ScheduleEntity {
4864 public:
4865 // The initial value for the dependency counters. It means that the
4866 // dependencies are not calculated yet.
4867 enum { InvalidDeps = -1 };
4868
4869 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4870 static bool classof(const ScheduleEntity *Entity) {
4871 return Entity->getKind() == Kind::ScheduleData;
4872 }
4873
4874 void init(int BlockSchedulingRegionID, Instruction *I) {
4875 NextLoadStore = nullptr;
4876 IsScheduled = false;
4877 SchedulingRegionID = BlockSchedulingRegionID;
4878 clearDependencies();
4879 Inst = I;
4880 }
4881
4882 /// Verify basic self consistency properties
4883 void verify() {
4884 if (hasValidDependencies()) {
4885 assert(UnscheduledDeps <= Dependencies && "invariant");
4886 } else {
4887 assert(UnscheduledDeps == Dependencies && "invariant");
4888 }
4889
4890 if (IsScheduled) {
4891 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4892 "unexpected scheduled state");
4893 }
4894 }
4895
4896 /// Returns true if the dependency information has been calculated.
4897 /// Note that dependency validity can vary between instructions within
4898 /// a single bundle.
4899 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4900
4901 /// Returns true if it is ready for scheduling, i.e. it has no more
4902 /// unscheduled depending instructions/bundles.
4903 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4904
4905 /// Modifies the number of unscheduled dependencies for this instruction,
4906 /// and returns the number of remaining dependencies for the containing
4907 /// bundle.
4908 int incrementUnscheduledDeps(int Incr) {
4909 assert(hasValidDependencies() &&
4910 "increment of unscheduled deps would be meaningless");
4911 UnscheduledDeps += Incr;
4912 assert(UnscheduledDeps >= 0 &&
4913 "Expected valid number of unscheduled deps");
4914 return UnscheduledDeps;
4915 }
4916
4917 /// Sets the number of unscheduled dependencies to the number of
4918 /// dependencies.
4919 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4920
4921 /// Clears all dependency information.
4922 void clearDependencies() {
4923 clearDirectDependencies();
4924 MemoryDependencies.clear();
4925 ControlDependencies.clear();
4926 }
4927
4928 /// Clears all direct dependencies only, except for control and memory
4929 /// dependencies.
4930 /// Required for copyable elements to correctly handle control/memory deps
4931 /// and avoid extra recalculation of such deps.
4932 void clearDirectDependencies() {
4933 Dependencies = InvalidDeps;
4934 resetUnscheduledDeps();
4935 IsScheduled = false;
4936 }
4937
4938 /// Gets the number of unscheduled dependencies.
4939 int getUnscheduledDeps() const { return UnscheduledDeps; }
4940 /// Gets the number of dependencies.
4941 int getDependencies() const { return Dependencies; }
4942 /// Initializes the number of dependencies.
4943 void initDependencies() { Dependencies = 0; }
4944 /// Increments the number of dependencies.
4945 void incDependencies() { Dependencies++; }
4946
4947 /// Gets scheduling region ID.
4948 int getSchedulingRegionID() const { return SchedulingRegionID; }
4949
4950 /// Gets the instruction.
4951 Instruction *getInst() const { return Inst; }
4952
4953 /// Gets the list of memory dependencies.
4954 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4955 return MemoryDependencies;
4956 }
4957 /// Adds a memory dependency.
4958 void addMemoryDependency(ScheduleData *Dep) {
4959 MemoryDependencies.push_back(Dep);
4960 }
4961 /// Gets the list of control dependencies.
4962 ArrayRef<ScheduleData *> getControlDependencies() const {
4963 return ControlDependencies;
4964 }
4965 /// Adds a control dependency.
4966 void addControlDependency(ScheduleData *Dep) {
4967 ControlDependencies.push_back(Dep);
4968 }
4969 /// Gets/sets the next load/store instruction in the block.
4970 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4971 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4972
4973 void dump(raw_ostream &OS) const { OS << *Inst; }
4974
4975 LLVM_DUMP_METHOD void dump() const {
4976 dump(dbgs());
4977 dbgs() << '\n';
4978 }
4979
4980 private:
4981 Instruction *Inst = nullptr;
4982
4983 /// Singly-linked list of all memory instructions (e.g. load, store, call)
4984 /// in the block - until the end of the scheduling region.
4985 ScheduleData *NextLoadStore = nullptr;
4986
4987 /// The dependent memory instructions.
4988 /// This list is derived on demand in calculateDependencies().
4989 SmallVector<ScheduleData *> MemoryDependencies;
4990
4991 /// List of instructions which this instruction could be control dependent
4992 /// on. Allowing such nodes to be scheduled below this one could introduce
4993 /// a runtime fault which didn't exist in the original program.
4994 /// e.g., this is a load or udiv following a readonly call which loops forever.
4995 SmallVector<ScheduleData *> ControlDependencies;
4996
4997 /// This ScheduleData is in the current scheduling region if this matches
4998 /// the current SchedulingRegionID of BlockScheduling.
4999 int SchedulingRegionID = 0;
5000
5001 /// The number of dependencies. Consists of the number of users of the
5002 /// instruction plus the number of dependent memory instructions (if any).
5003 /// This value is calculated on demand.
5004 /// If InvalidDeps, the number of dependencies is not calculated yet.
5005 int Dependencies = InvalidDeps;
5006
5007 /// The number of dependencies minus the number of dependencies of scheduled
5008 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5009 /// for scheduling.
5010 /// Note that this is negative as long as Dependencies is not calculated.
5011 int UnscheduledDeps = InvalidDeps;
5012 };
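  // For illustration of the counters above: after dependency calculation a
  // node with three users and one dependent memory instruction has
  // Dependencies == UnscheduledDeps == 4; each time one of those dependent
  // nodes is scheduled, incrementUnscheduledDeps(-1) is invoked, and the node
  // becomes ready exactly when UnscheduledDeps reaches 0.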
5013
5014#ifndef NDEBUG
5015 friend inline raw_ostream &operator<<(raw_ostream &OS,
5016 const BoUpSLP::ScheduleData &SD) {
5017 SD.dump(OS);
5018 return OS;
5019 }
5020#endif
5021
5022 class ScheduleBundle final : public ScheduleEntity {
5023 /// The schedule data for the instructions in the bundle.
5024 SmallVector<ScheduleEntity *> Bundle;
5025 /// True if this bundle is valid.
5026 bool IsValid = true;
5027 /// The TreeEntry that this instruction corresponds to.
5028 TreeEntry *TE = nullptr;
5029 ScheduleBundle(bool IsValid)
5030 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5031
5032 public:
5033 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5034 static bool classof(const ScheduleEntity *Entity) {
5035 return Entity->getKind() == Kind::ScheduleBundle;
5036 }
5037
5038 /// Verify basic self consistency properties
5039 void verify() const {
5040 for (const ScheduleEntity *SD : Bundle) {
5041 if (SD->hasValidDependencies()) {
5042 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5043 "invariant");
5044 } else {
5045 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5046 "invariant");
5047 }
5048
5049 if (isScheduled()) {
5050 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5051 "unexpected scheduled state");
5052 }
5053 }
5054 }
5055
5056 /// Returns the number of unscheduled dependencies in the bundle.
5057 int unscheduledDepsInBundle() const {
5058 assert(*this && "bundle must not be empty");
5059 int Sum = 0;
5060 for (const ScheduleEntity *BundleMember : Bundle) {
5061 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5062 return ScheduleData::InvalidDeps;
5063 Sum += BundleMember->getUnscheduledDeps();
5064 }
5065 return Sum;
5066 }
5067
5068 /// Returns true if the dependency information has been calculated.
5069 /// Note that dependency validity can vary between instructions within
5070 /// a single bundle.
5071 bool hasValidDependencies() const {
5072 return all_of(Bundle, [](const ScheduleEntity *SD) {
5073 return SD->hasValidDependencies();
5074 });
5075 }
5076
5077 /// Returns true if it is ready for scheduling, i.e. it has no more
5078 /// unscheduled depending instructions/bundles.
5079 bool isReady() const {
5080 assert(*this && "bundle must not be empty");
5081 return unscheduledDepsInBundle() == 0 && !isScheduled();
5082 }
5083
5084 /// Returns the bundle of scheduling data, associated with the current
5085 /// instruction.
5086 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5087 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5088 /// Adds an instruction to the bundle.
5089 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5090
5091 /// Gets/sets the associated tree entry.
5092 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5093 TreeEntry *getTreeEntry() const { return TE; }
5094
5095 static ScheduleBundle invalid() { return {false}; }
5096
5097 operator bool() const { return IsValid; }
5098
5099#ifndef NDEBUG
5100 void dump(raw_ostream &OS) const {
5101 if (!*this) {
5102 OS << "[]";
5103 return;
5104 }
5105 OS << '[';
5106 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5108 OS << "<Copyable>";
5109 OS << *SD->getInst();
5110 });
5111 OS << ']';
5112 }
5113
5114 LLVM_DUMP_METHOD void dump() const {
5115 dump(dbgs());
5116 dbgs() << '\n';
5117 }
5118#endif // NDEBUG
5119 };
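  // For illustration: a bundle [%a, %b] whose members still have 2 and 1
  // unscheduled dependencies reports unscheduledDepsInBundle() == 3 and only
  // becomes ready once every member reaches 0; if any member still has
  // InvalidDeps, the whole bundle reports InvalidDeps.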
5120
5121#ifndef NDEBUG
5122 friend inline raw_ostream &operator<<(raw_ostream &OS,
5123 const BoUpSLP::ScheduleBundle &Bundle) {
5124 Bundle.dump(OS);
5125 return OS;
5126 }
5127#endif
5128
5129 /// Contains all scheduling relevant data for the copyable instruction.
5130 /// It models the virtual instructions, supposed to replace the original
5131 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5132 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5133 /// instruction %virt = add %0, 0.
5134 class ScheduleCopyableData final : public ScheduleEntity {
5135 /// The source schedule data for the instruction.
5136 Instruction *Inst = nullptr;
5137 /// The edge information for the instruction.
5138 const EdgeInfo EI;
5139 /// This ScheduleData is in the current scheduling region if this matches
5140 /// the current SchedulingRegionID of BlockScheduling.
5141 int SchedulingRegionID = 0;
5142 /// Bundle, this data is part of.
5143 ScheduleBundle &Bundle;
5144
5145 public:
5146 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5147 const EdgeInfo &EI, ScheduleBundle &Bundle)
5148 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5149 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5150 static bool classof(const ScheduleEntity *Entity) {
5151 return Entity->getKind() == Kind::ScheduleCopyableData;
5152 }
5153
5154 /// Verify basic self consistency properties
5155 void verify() {
5156 if (hasValidDependencies()) {
5157 assert(UnscheduledDeps <= Dependencies && "invariant");
5158 } else {
5159 assert(UnscheduledDeps == Dependencies && "invariant");
5160 }
5161
5162 if (IsScheduled) {
5163 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5164 "unexpected scheduled state");
5165 }
5166 }
5167
5168 /// Returns true if the dependency information has been calculated.
5169 /// Note that dependency validity can vary between instructions within
5170 /// a single bundle.
5171 bool hasValidDependencies() const {
5172 return Dependencies != ScheduleData::InvalidDeps;
5173 }
5174
5175 /// Returns true if it is ready for scheduling, i.e. it has no more
5176 /// unscheduled depending instructions/bundles.
5177 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5178
5179 /// Modifies the number of unscheduled dependencies for this instruction,
5180 /// and returns the number of remaining dependencies for the containing
5181 /// bundle.
5182 int incrementUnscheduledDeps(int Incr) {
5183 assert(hasValidDependencies() &&
5184 "increment of unscheduled deps would be meaningless");
5185 UnscheduledDeps += Incr;
5186 assert(UnscheduledDeps >= 0 && "invariant");
5187 return UnscheduledDeps;
5188 }
5189
5190 /// Sets the number of unscheduled dependencies to the number of
5191 /// dependencies.
5192 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5193
5194 /// Gets the number of unscheduled dependencies.
5195 int getUnscheduledDeps() const { return UnscheduledDeps; }
5196 /// Gets the number of dependencies.
5197 int getDependencies() const { return Dependencies; }
5198 /// Initializes the number of dependencies.
5199 void initDependencies() { Dependencies = 0; }
5200 /// Increments the number of dependencies.
5201 void incDependencies() { Dependencies++; }
5202
5203 /// Gets scheduling region ID.
5204 int getSchedulingRegionID() const { return SchedulingRegionID; }
5205
5206 /// Gets the instruction.
5207 Instruction *getInst() const { return Inst; }
5208
5209 /// Clears all dependency information.
5210 void clearDependencies() {
5211 Dependencies = ScheduleData::InvalidDeps;
5212 UnscheduledDeps = ScheduleData::InvalidDeps;
5213 IsScheduled = false;
5214 }
5215
5216 /// Gets the edge information.
5217 const EdgeInfo &getEdgeInfo() const { return EI; }
5218
5219 /// Gets the bundle.
5220 ScheduleBundle &getBundle() { return Bundle; }
5221 const ScheduleBundle &getBundle() const { return Bundle; }
5222
5223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5224 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5225
5226 LLVM_DUMP_METHOD void dump() const {
5227 dump(dbgs());
5228 dbgs() << '\n';
5229 }
5230#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5231
5232 private:
5233 /// The number of dependencies. These nodes always have
5234 /// only a single dependency.
5235 int Dependencies = ScheduleData::InvalidDeps;
5236
5237 /// The number of dependencies minus the number of dependencies of scheduled
5238 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5239 /// for scheduling.
5240 /// Note that this is negative as long as Dependencies is not calculated.
5241 int UnscheduledDeps = ScheduleData::InvalidDeps;
5242 };
5243
5244#ifndef NDEBUG
5245 friend inline raw_ostream &
5246 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5247 SD.dump(OS);
5248 return OS;
5249 }
5250#endif
5251
5252 friend struct GraphTraits<BoUpSLP *>;
5253 friend struct DOTGraphTraits<BoUpSLP *>;
5254
5255 /// Contains all scheduling data for a basic block.
5256 /// It does not schedule instructions which are not memory read/write
5257 /// instructions and whose operands are either constants, arguments, phis, or
5258 /// instructions from other blocks, or whose users are phis or come from
5259 /// other blocks. The resulting vector instructions can be placed at the
5260 /// beginning of the basic block without scheduling (if the operands do not
5261 /// need to be scheduled) or at the end of the block (if the users are outside
5262 /// of the block). This saves some compile time and memory used by the
5263 /// compiler.
5264 /// ScheduleData is assigned to each instruction between the boundaries of
5265 /// the tree entry, even to those which are not part of the graph. It is
5266 /// required to correctly follow the dependencies between the instructions and
5267 /// to schedule them correctly. ScheduleData is not allocated for
5268 /// instructions which do not require scheduling, like phis, nodes with only
5269 /// extractelements/insertelements, or nodes whose instructions have
5270 /// uses/operands outside of the block.
5271 struct BlockScheduling {
5272 BlockScheduling(BasicBlock *BB)
5273 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5274
5275 void clear() {
5276 ScheduledBundles.clear();
5277 ScheduledBundlesList.clear();
5278 ScheduleCopyableDataMap.clear();
5279 ScheduleCopyableDataMapByInst.clear();
5280 ScheduleCopyableDataMapByInstUser.clear();
5281 ScheduleCopyableDataMapByUsers.clear();
5282 ReadyInsts.clear();
5283 ScheduleStart = nullptr;
5284 ScheduleEnd = nullptr;
5285 FirstLoadStoreInRegion = nullptr;
5286 LastLoadStoreInRegion = nullptr;
5287 RegionHasStackSave = false;
5288
5289 // Reduce the maximum schedule region size by the size of the
5290 // previous scheduling run.
5291 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5292 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5293 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5294 ScheduleRegionSize = 0;
5295
5296 // Make a new scheduling region, i.e. all existing ScheduleData is not
5297 // in the new region yet.
5298 ++SchedulingRegionID;
5299 }
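    // For illustration with hypothetical numbers: if the previous scheduling
    // region spanned 120 instructions, the size budget for later regions
    // shrinks by 120, but it is never reduced below MinScheduleRegionSize.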
5300
5301 ScheduleData *getScheduleData(Instruction *I) {
5302 if (!I)
5303 return nullptr;
5304 if (BB != I->getParent())
5305 // Avoid lookup if can't possibly be in map.
5306 return nullptr;
5307 ScheduleData *SD = ScheduleDataMap.lookup(I);
5308 if (SD && isInSchedulingRegion(*SD))
5309 return SD;
5310 return nullptr;
5311 }
5312
5313 ScheduleData *getScheduleData(Value *V) {
5314 return getScheduleData(dyn_cast<Instruction>(V));
5315 }
5316
5317 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5318 /// operand number) and value.
5319 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5320 const Value *V) const {
5321 if (ScheduleCopyableDataMap.empty())
5322 return nullptr;
5323 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5324 if (It == ScheduleCopyableDataMap.end())
5325 return nullptr;
5326 ScheduleCopyableData *SD = It->getSecond().get();
5327 if (!isInSchedulingRegion(*SD))
5328 return nullptr;
5329 return SD;
5330 }
5331
5332 /// Returns the ScheduleCopyableData for the given user \p User, operand
5333 /// number and operand \p V.
5334 SmallVector<ScheduleCopyableData *>
5335 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5336 const Value *V) {
5337 if (ScheduleCopyableDataMapByInstUser.empty())
5338 return {};
5339 const auto It = ScheduleCopyableDataMapByInstUser.find(
5340 std::make_pair(std::make_pair(User, OperandIdx), V));
5341 if (It == ScheduleCopyableDataMapByInstUser.end())
5342 return {};
5343 SmallVector<ScheduleCopyableData *> Res;
5344 for (ScheduleCopyableData *SD : It->getSecond()) {
5345 if (isInSchedulingRegion(*SD))
5346 Res.push_back(SD);
5347 }
5348 return Res;
5349 }
5350
5351 /// Returns true if all operands of the given instruction \p User are
5352 /// replaced by copyable data.
5353 /// \param User The user instruction.
5354 /// \param Op The operand, which might be replaced by the copyable data.
5355 /// \param SLP The SLP tree.
5356 /// \param NumOps The number of operands used. If the instruction uses the
5357 /// same operand several times, check for the first use, then the second,
5358 /// etc.
5359 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5360 Instruction *Op, BoUpSLP &SLP,
5361 unsigned NumOps) const {
5362 assert(NumOps > 0 && "No operands");
5363 if (ScheduleCopyableDataMap.empty())
5364 return false;
5365 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5366 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5367 if (Entries.empty())
5368 return false;
5369 unsigned CurNumOps = 0;
5370 for (const Use &U : User->operands()) {
5371 if (U.get() != Op)
5372 continue;
5373 ++CurNumOps;
5374 // Check all tree entries, if they have operands replaced by copyable
5375 // data.
5376 for (TreeEntry *TE : Entries) {
5377 unsigned Inc = 0;
5378 bool IsNonSchedulableWithParentPhiNode =
5379 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5380 TE->UserTreeIndex.UserTE->hasState() &&
5381 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5382 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5383 // Count the number of unique phi nodes in the parent entry, and
5384 // exit once all the unique phis are processed.
5385 if (IsNonSchedulableWithParentPhiNode) {
5386 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5387 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5388 for (Value *V : ParentTE->Scalars) {
5389 auto *PHI = dyn_cast<PHINode>(V);
5390 if (!PHI)
5391 continue;
5392 if (ParentsUniqueUsers.insert(PHI).second &&
5393 is_contained(PHI->incoming_values(), User))
5394 ++Inc;
5395 }
5396 } else {
5397 Inc = count(TE->Scalars, User);
5398 }
5399
5400 // Check if the user is commutative.
5401 // The commutatives are handled later, as their operands can be
5402 // reordered.
5403 // The same applies even to non-commutative cmps, because we can
5404 // potentially invert their predicate and, thus, reorder the operands.
5405 bool IsCommutativeUser =
5406 ::isCommutative(User) &&
5407 ::isCommutableOperand(User, User, U.getOperandNo());
5408 if (!IsCommutativeUser) {
5409 Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
5410 IsCommutativeUser =
5411 ::isCommutative(MainOp, User) &&
5412 ::isCommutableOperand(MainOp, User, U.getOperandNo());
5413 }
5414 // A commutative user with identical operands can be safely
5415 // considered as non-commutative, since reordering the operands does
5416 // not change the semantics.
5417 assert(
5418 (!IsCommutativeUser ||
5419 (((::isCommutative(User) &&
5420 ::isCommutableOperand(User, User, 0) &&
5421 ::isCommutableOperand(User, User, 1)) ||
5422 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
5423 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5424 User, 0) &&
5425 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5426 User, 1))))) &&
5427 "Expected commutative user with 2 first commutable operands");
5428 bool IsCommutativeWithSameOps =
5429 IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
5430 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5431 !isa<CmpInst>(User)) {
5432 EdgeInfo EI(TE, U.getOperandNo());
5433 if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
5434 continue;
5435 return false;
5436 }
5437 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5438 .first->getSecond() += Inc;
5439 }
5440 }
5441 if (PotentiallyReorderedEntriesCount.empty())
5442 return true;
5443 // Check the commutative/cmp entries.
5444 for (auto &P : PotentiallyReorderedEntriesCount) {
5445 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5446 bool IsNonSchedulableWithParentPhiNode =
5447 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5448 P.first->UserTreeIndex.UserTE->hasState() &&
5449 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5450 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5451 auto *It = find(P.first->Scalars, User);
5452 do {
5453 assert(It != P.first->Scalars.end() &&
5454 "User is not in the tree entry");
5455 int Lane = std::distance(P.first->Scalars.begin(), It);
5456 assert(Lane >= 0 && "Lane is not found");
5457 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5458 Lane = P.first->ReorderIndices[Lane];
5459 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5460 "Couldn't find extract lane");
5461 // Count the number of unique phi nodes in the parent entry, and
5462 // exit once all the unique phis are processed.
5463 if (IsNonSchedulableWithParentPhiNode) {
5464 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5465 Value *User = ParentTE->Scalars[Lane];
5466 if (!ParentsUniqueUsers.insert(User).second) {
5467 It =
5468 find(make_range(std::next(It), P.first->Scalars.end()), User);
5469 continue;
5470 }
5471 }
5472 for (unsigned OpIdx :
5474 P.first->getMainOp()))) {
5475 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5476 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5477 --P.getSecond();
5478 }
5479 // If parent node is schedulable, it will be handled correctly.
5480 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5481 } while (It != P.first->Scalars.end());
5482 }
5483 return all_of(PotentiallyReorderedEntriesCount,
5484 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5485 return P.second == NumOps - 1;
5486 });
5487 }
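    // For illustration, a hypothetical case: if %u = add %x, %x appears in
    // two tree entries and every use of %x in those entries is modelled by a
    // ScheduleCopyableData, the query above returns true; a single use of %x
    // that is not covered by copyable data makes it return false.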
5488
5489 SmallVector<ScheduleCopyableData *>
5490 getScheduleCopyableData(const Instruction *I) const {
5491 if (ScheduleCopyableDataMapByInst.empty())
5492 return {};
5493 const auto It = ScheduleCopyableDataMapByInst.find(I);
5494 if (It == ScheduleCopyableDataMapByInst.end())
5495 return {};
5496 SmallVector<ScheduleCopyableData *> Res;
5497 for (ScheduleCopyableData *SD : It->getSecond()) {
5498 if (isInSchedulingRegion(*SD))
5499 Res.push_back(SD);
5500 }
5501 return Res;
5502 }
5503
5504 SmallVector<ScheduleCopyableData *>
5505 getScheduleCopyableDataUsers(const Instruction *User) const {
5506 if (ScheduleCopyableDataMapByUsers.empty())
5507 return {};
5508 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5509 if (It == ScheduleCopyableDataMapByUsers.end())
5510 return {};
5511 SmallVector<ScheduleCopyableData *> Res;
5512 for (ScheduleCopyableData *SD : It->getSecond()) {
5513 if (isInSchedulingRegion(*SD))
5514 Res.push_back(SD);
5515 }
5516 return Res;
5517 }
5518
5519 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5520 Instruction *I,
5521 int SchedulingRegionID,
5522 ScheduleBundle &Bundle) {
5523 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5524 ScheduleCopyableData *CD =
5525 ScheduleCopyableDataMap
5526 .try_emplace(std::make_pair(EI, I),
5527 std::make_unique<ScheduleCopyableData>(
5528 SchedulingRegionID, I, EI, Bundle))
5529 .first->getSecond()
5530 .get();
5531 ScheduleCopyableDataMapByInst[I].push_back(CD);
5532 if (EI.UserTE) {
5533 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5534 const auto *It = find(Op, I);
5535 assert(It != Op.end() && "Lane not set");
5536 SmallPtrSet<Instruction *, 4> Visited;
5537 do {
5538 int Lane = std::distance(Op.begin(), It);
5539 assert(Lane >= 0 && "Lane not set");
5540 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5541 !EI.UserTE->ReorderIndices.empty())
5542 Lane = EI.UserTE->ReorderIndices[Lane];
5543 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5544 "Couldn't find extract lane");
5545 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5546 if (!Visited.insert(In).second) {
5547 It = find(make_range(std::next(It), Op.end()), I);
5548 continue;
5549 }
5550 ScheduleCopyableDataMapByInstUser
5551 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5552 .first->getSecond()
5553 .push_back(CD);
5554 ScheduleCopyableDataMapByUsers.try_emplace(I)
5555 .first->getSecond()
5556 .insert(CD);
5557 // Remove extra deps for users that become non-immediate users of the
5558 // instruction. This may happen if a chain of the same copyable elements
5559 // appears in the tree.
5560 if (In == I) {
5561 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5562 if (ScheduleCopyableData *UserCD =
5563 getScheduleCopyableData(UserEI, In))
5564 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5565 }
5566 It = find(make_range(std::next(It), Op.end()), I);
5567 } while (It != Op.end());
5568 } else {
5569 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5570 CD);
5571 }
5572 return *CD;
5573 }
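    // For illustration: registering a copyable %0 for the edge
    // (UserTE, EdgeIdx) makes it discoverable by edge via
    // ScheduleCopyableDataMap, by instruction via
    // ScheduleCopyableDataMapByInst, and by (user, operand number) via
    // ScheduleCopyableDataMapByInstUser, so later dependency bookkeeping can
    // be redirected from the real instruction to its virtual copy.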
5574
5575 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5576 auto *I = dyn_cast<Instruction>(V);
5577 if (!I)
5578 return {};
5579 auto It = ScheduledBundles.find(I);
5580 if (It == ScheduledBundles.end())
5581 return {};
5582 return It->getSecond();
5583 }
5584
5585 /// Returns true if the entity is in the scheduling region.
5586 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5587 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5588 return Data->getSchedulingRegionID() == SchedulingRegionID;
5589 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5590 return CD->getSchedulingRegionID() == SchedulingRegionID;
5591 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5592 [&](const ScheduleEntity *BundleMember) {
5593 return isInSchedulingRegion(*BundleMember);
5594 });
5595 }
5596
5597 /// Marks an instruction as scheduled and puts all dependent ready
5598 /// instructions into the ready-list.
5599 template <typename ReadyListType>
5600 void schedule(const BoUpSLP &R, const InstructionsState &S,
5601 const EdgeInfo &EI, ScheduleEntity *Data,
5602 ReadyListType &ReadyList) {
5603 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5604 ArrayRef<ScheduleBundle *> Bundles) {
5605 // Handle the def-use chain dependencies.
5606
5607 // Decrement the unscheduled counter and insert to ready list if ready.
5608 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5609 if ((IsControl || Data->hasValidDependencies()) &&
5610 Data->incrementUnscheduledDeps(-1) == 0) {
5611 // There are no more unscheduled dependencies after
5612 // decrementing, so we can put the dependent instruction
5613 // into the ready list.
5614 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5615 ArrayRef<ScheduleBundle *> Bundles;
5616 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5617 CopyableBundle.push_back(&CD->getBundle());
5618 Bundles = CopyableBundle;
5619 } else {
5620 Bundles = getScheduleBundles(Data->getInst());
5621 }
5622 if (!Bundles.empty()) {
5623 for (ScheduleBundle *Bundle : Bundles) {
5624 if (Bundle->unscheduledDepsInBundle() == 0) {
5625 assert(!Bundle->isScheduled() &&
5626 "already scheduled bundle gets ready");
5627 ReadyList.insert(Bundle);
5629 << "SLP: gets ready: " << *Bundle << "\n");
5630 }
5631 }
5632 return;
5633 }
5634 assert(!Data->isScheduled() &&
5635 "already scheduled bundle gets ready");
5637 "Expected non-copyable data");
5638 ReadyList.insert(Data);
5639 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5640 }
5641 };
5642
5643 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5644 Instruction *I) {
5645 if (!ScheduleCopyableDataMap.empty()) {
5646 SmallVector<ScheduleCopyableData *> CopyableData =
5647 getScheduleCopyableData(User, OpIdx, I);
5648 for (ScheduleCopyableData *CD : CopyableData)
5649 DecrUnsched(CD, /*IsControl=*/false);
5650 if (!CopyableData.empty())
5651 return;
5652 }
5653 if (ScheduleData *OpSD = getScheduleData(I))
5654 DecrUnsched(OpSD, /*IsControl=*/false);
5655 };
5656
5657 // If BundleMember is a vector bundle, its operands may have been
5658 // reordered during buildTree(). We therefore need to get its operands
5659 // through the TreeEntry.
5660 if (!Bundles.empty()) {
5661 auto *In = BundleMember->getInst();
5662 // Count uses of each instruction operand.
5663 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5664 unsigned TotalOpCount = 0;
5665 if (isa<ScheduleCopyableData>(BundleMember)) {
5666 // Copyable data is used only once (uses itself).
5667 TotalOpCount = OperandsUses[In] = 1;
5668 } else {
5669 for (const Use &U : In->operands()) {
5670 if (auto *I = dyn_cast<Instruction>(U.get())) {
5671 auto Res = OperandsUses.try_emplace(I, 0);
5672 ++Res.first->getSecond();
5673 ++TotalOpCount;
5674 }
5675 }
5676 }
5677 // Decrement the unscheduled counter and insert to ready list if
5678 // ready.
5679 auto DecrUnschedForInst =
5680 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5681 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5682 &Checked) {
5683 if (!ScheduleCopyableDataMap.empty()) {
5684 const EdgeInfo EI = {UserTE, OpIdx};
5685 if (ScheduleCopyableData *CD =
5686 getScheduleCopyableData(EI, I)) {
5687 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5688 return;
5689 DecrUnsched(CD, /*IsControl=*/false);
5690 return;
5691 }
5692 }
5693 auto It = OperandsUses.find(I);
5694 assert(It != OperandsUses.end() && "Operand not found");
5695 if (It->second > 0) {
5696 if (ScheduleData *OpSD = getScheduleData(I)) {
5697 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5698 return;
5699 --It->getSecond();
5700 assert(TotalOpCount > 0 && "No more operands to decrement");
5701 --TotalOpCount;
5702 DecrUnsched(OpSD, /*IsControl=*/false);
5703 } else {
5704 --It->getSecond();
5705 assert(TotalOpCount > 0 && "No more operands to decrement");
5706 --TotalOpCount;
5707 }
5708 }
5709 };
5710
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5712 for (ScheduleBundle *Bundle : Bundles) {
5713 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5714 break;
5715 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5716 // Need to search for the lane since the tree entry can be
5717 // reordered.
5718 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5719 bool IsNonSchedulableWithParentPhiNode =
5720 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5721 Bundle->getTreeEntry()->UserTreeIndex &&
5722 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5723 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5724 TreeEntry::SplitVectorize &&
5725 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5726 Instruction::PHI;
5727 do {
5728 int Lane =
5729 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5730 assert(Lane >= 0 && "Lane not set");
5731 if (isa<StoreInst>(In) &&
5732 !Bundle->getTreeEntry()->ReorderIndices.empty())
5733 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5734 assert(Lane < static_cast<int>(
5735 Bundle->getTreeEntry()->Scalars.size()) &&
5736 "Couldn't find extract lane");
5737
5738 // Since the vectorization tree is built recursively, this
5739 // assertion ensures that the tree entry has all operands set
5740 // before reaching this code. A couple of exceptions known at the
5741 // moment are extracts, whose second (immediate) operand is
5742 // not added. Since immediates do not affect scheduler behavior
5743 // this is considered okay.
5744 assert(In &&
5746 In->getNumOperands() ==
5747 Bundle->getTreeEntry()->getNumOperands() ||
5748 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5749 "Missed TreeEntry operands?");
5750
5751 // Count the number of unique phi nodes in the parent entry,
5752 // and exit once all the unique phis are processed.
5753 if (IsNonSchedulableWithParentPhiNode) {
5754 const TreeEntry *ParentTE =
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5756 Value *User = ParentTE->Scalars[Lane];
5757 if (!ParentsUniqueUsers.insert(User).second) {
5758 It = std::find(std::next(It),
5759 Bundle->getTreeEntry()->Scalars.end(), In);
5760 continue;
5761 }
5762 }
5763
5764 for (unsigned OpIdx :
5765 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5766 if (auto *I = dyn_cast<Instruction>(
5767 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5768 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5769 << *I << "\n");
5770 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5771 }
5772 // If parent node is schedulable, it will be handled correctly.
5773 if (Bundle->getTreeEntry()->isCopyableElement(In))
5774 break;
5775 It = std::find(std::next(It),
5776 Bundle->getTreeEntry()->Scalars.end(), In);
5777 } while (It != Bundle->getTreeEntry()->Scalars.end());
5778 }
5779 } else {
5780 // If BundleMember is a stand-alone instruction, no operand reordering
5781 // has taken place, so we directly access its operands.
5782 for (Use &U : BundleMember->getInst()->operands()) {
5783 if (auto *I = dyn_cast<Instruction>(U.get())) {
5785 << "SLP: check for readiness (def): " << *I << "\n");
5786 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5787 }
5788 }
5789 }
5790 // Handle the memory dependencies.
5791 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5792 if (!SD)
5793 return;
5794 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5795 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5796 if (!VisitedMemory.insert(MemoryDep).second)
5797 continue;
5798 // There are no more unscheduled dependencies after decrementing,
5799 // so we can put the dependent instruction into the ready list.
5800 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5801 << *MemoryDep << "\n");
5802 DecrUnsched(MemoryDep);
5803 }
5804 // Handle the control dependencies.
5805 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5806 for (ScheduleData *Dep : SD->getControlDependencies()) {
5807 if (!VisitedControl.insert(Dep).second)
5808 continue;
5809 // There are no more unscheduled dependencies after decrementing,
5810 // so we can put the dependent instruction into the ready list.
5812 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5813 DecrUnsched(Dep, /*IsControl=*/true);
5814 }
5815 };
5816 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5817 SD->setScheduled(/*Scheduled=*/true);
5818 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5819 SmallVector<ScheduleBundle *> Bundles;
5820 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5821 Instruction *In = SD->getInst();
5822 if (R.isVectorized(In)) {
5823 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5824 for (TreeEntry *TE : Entries) {
5826 In->getNumOperands() != TE->getNumOperands())
5827 continue;
5828 auto &BundlePtr =
5829 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5830 BundlePtr->setTreeEntry(TE);
5831 BundlePtr->add(SD);
5832 Bundles.push_back(BundlePtr.get());
5833 }
5834 }
5835 ProcessBundleMember(SD, Bundles);
5836 } else {
5837 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5838 Bundle.setScheduled(/*Scheduled=*/true);
5839 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5840 auto AreAllBundlesScheduled =
5841 [&](const ScheduleEntity *SD,
5842 ArrayRef<ScheduleBundle *> SDBundles) {
5843 if (isa<ScheduleCopyableData>(SD))
5844 return true;
5845 return !SDBundles.empty() &&
5846 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5847 return SDBundle->isScheduled();
5848 });
5849 };
5850 for (ScheduleEntity *SD : Bundle.getBundle()) {
5853 SDBundles = getScheduleBundles(SD->getInst());
5854 if (AreAllBundlesScheduled(SD, SDBundles)) {
5855 SD->setScheduled(/*Scheduled=*/true);
5856 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5857 : SDBundles);
5858 }
5859 }
5860 }
5861 }
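    // For illustration, the overall flow of schedule() for a vector bundle:
    // the bundle is marked as scheduled, then for every member instruction
    // the operands are looked up through the TreeEntry (to honor any
    // reordering), their unscheduled-dependency counters are decremented,
    // and any operand bundle or instruction that reaches zero unscheduled
    // dependencies is inserted into the ready list, together with ready
    // memory- and control-dependent instructions.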
5862
5863 /// Verify basic self consistency properties of the data structure.
5864 void verify() {
5865 if (!ScheduleStart)
5866 return;
5867
5868 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5869 ScheduleStart->comesBefore(ScheduleEnd) &&
5870 "Not a valid scheduling region?");
5871
5872 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5873 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5874 if (!Bundles.empty()) {
5875 for (ScheduleBundle *Bundle : Bundles) {
5876 assert(isInSchedulingRegion(*Bundle) &&
5877 "primary schedule data not in window?");
5878 Bundle->verify();
5879 }
5880 continue;
5881 }
5882 auto *SD = getScheduleData(I);
5883 if (!SD)
5884 continue;
5885 assert(isInSchedulingRegion(*SD) &&
5886 "primary schedule data not in window?");
5887 SD->verify();
5888 }
5889
5890 assert(all_of(ReadyInsts,
5891 [](const ScheduleEntity *Bundle) {
5892 return Bundle->isReady();
5893 }) &&
5894 "item in ready list not ready?");
5895 }
5896
5897 /// Put all instructions into the ReadyList which are ready for scheduling.
5898 template <typename ReadyListType>
5899 void initialFillReadyList(ReadyListType &ReadyList) {
5900 SmallPtrSet<ScheduleBundle *, 16> Visited;
5901 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5902 ScheduleData *SD = getScheduleData(I);
5903 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5904 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5905 !Bundles.empty()) {
5906 for (ScheduleBundle *Bundle : Bundles) {
5907 if (!Visited.insert(Bundle).second)
5908 continue;
5909 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5910 ReadyList.insert(Bundle);
5911 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5912 << *Bundle << "\n");
5913 }
5914 }
5915 continue;
5916 }
5917 ReadyList.insert(SD);
5919 << "SLP: initially in ready list: " << *SD << "\n");
5920 }
5921 }
5922 }
5923
5924 /// Build a bundle from the ScheduleData nodes corresponding to the
5925 /// scalar instruction for each lane.
5926 /// \param VL The list of scalar instructions.
5927 /// \param S The state of the instructions.
5928 /// \param EI The edge in the SLP graph or the user node/operand number.
5929 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5930 const InstructionsState &S, const EdgeInfo &EI);
5931
5932 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5933 /// cyclic dependencies. This is only a dry-run, no instructions are
5934 /// actually moved at this stage.
5935 /// \returns the scheduling bundle. The returned Optional value is not
5936 /// std::nullopt if \p VL is allowed to be scheduled.
5937 std::optional<ScheduleBundle *>
5938 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5939 const InstructionsState &S, const EdgeInfo &EI);
5940
5941 /// Allocates schedule data chunk.
5942 ScheduleData *allocateScheduleDataChunks();
5943
5944 /// Extends the scheduling region so that V is inside the region.
5945 /// \returns true if the region size is within the limit.
5946 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5947
5948 /// Initialize the ScheduleData structures for new instructions in the
5949 /// scheduling region.
5950 void initScheduleData(Instruction *FromI, Instruction *ToI,
5951 ScheduleData *PrevLoadStore,
5952 ScheduleData *NextLoadStore);
5953
5954 /// Updates the dependency information of a bundle and of all instructions/
5955 /// bundles which depend on the original bundle.
5956 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5957 BoUpSLP *SLP,
5958 ArrayRef<ScheduleData *> ControlDeps = {});
5959
5960 /// Sets all instructions in the scheduling region to un-scheduled.
5961 void resetSchedule();
5962
5963 BasicBlock *BB;
5964
5965 /// Simple memory allocation for ScheduleData.
5967
5968 /// The size of a ScheduleData array in ScheduleDataChunks.
5969 int ChunkSize;
5970
5971 /// The allocator position in the current chunk, which is the last entry
5972 /// of ScheduleDataChunks.
5973 int ChunkPos;
5974
5975 /// Attaches ScheduleData to Instruction.
5976 /// Note that the mapping survives during all vectorization iterations, i.e.
5977 /// ScheduleData structures are recycled.
5978 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5979
5980 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5981 /// number) and the operand instruction, represented as copyable element.
5982 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5983 std::unique_ptr<ScheduleCopyableData>>
5984 ScheduleCopyableDataMap;
5985
5986 /// Represents mapping between instruction and all related
5987 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5988 /// element). The SLP tree may contain several representations of the same
5989 /// instruction.
5990 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5991 ScheduleCopyableDataMapByInst;
5992
5993 /// Represents mapping between user value and operand number, the operand
5994 /// value and all related ScheduleCopyableData. The relation is 1:n, because
5995 /// the same user may reference the same operand in different tree entries
5996 /// and the operand may be modeled by different copyable data elements.
5997 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5999 ScheduleCopyableDataMapByInstUser;
6000
6001 /// Represents mapping between instruction and all related
6002 /// ScheduleCopyableData. It represents the mapping between the actual
6003 /// instruction and the last copyable data element in the chain. E.g., if
6004 /// the graph models the following instructions:
6005 /// %0 = non-add instruction ...
6006 /// ...
6007 /// %4 = add %3, 1
6008 /// %5 = add %4, 1
6009 /// %6 = insertelement poison, %0, 0
6010 /// %7 = insertelement %6, %5, 1
6011 /// And the graph is modeled as:
6012 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6013 /// -> [1, 0] -> [%1, 0]
6014 ///
6015 /// this map will map %0 only to the copyable element <1>, which is the last
6016 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
6017 /// keep the map to <0>, not the %0.
6018 SmallDenseMap<const Instruction *,
6019 SmallSetVector<ScheduleCopyableData *, 4>>
6020 ScheduleCopyableDataMapByUsers;
6021
6022 /// Attaches ScheduleBundle to Instruction.
6023 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6024 ScheduledBundles;
6025 /// The list of ScheduleBundles.
6026 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6027
6028 /// The ready-list for scheduling (only used for the dry-run).
6029 SetVector<ScheduleEntity *> ReadyInsts;
6030
6031 /// The first instruction of the scheduling region.
6032 Instruction *ScheduleStart = nullptr;
6033
6034 /// The first instruction _after_ the scheduling region.
6035 Instruction *ScheduleEnd = nullptr;
6036
6037 /// The first memory accessing instruction in the scheduling region
6038 /// (can be null).
6039 ScheduleData *FirstLoadStoreInRegion = nullptr;
6040
6041 /// The last memory accessing instruction in the scheduling region
6042 /// (can be null).
6043 ScheduleData *LastLoadStoreInRegion = nullptr;
6044
6045 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6046 /// region? Used to optimize the dependence calculation for the
6047 /// common case where there isn't.
6048 bool RegionHasStackSave = false;
6049
6050 /// The current size of the scheduling region.
6051 int ScheduleRegionSize = 0;
6052
6053 /// The maximum size allowed for the scheduling region.
6054 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6055
6056 /// The ID of the scheduling region. For a new vectorization iteration this
6057 /// is incremented which "removes" all ScheduleData from the region.
6058 /// Make sure that the initial SchedulingRegionID is greater than the
6059 /// initial SchedulingRegionID in ScheduleData (which is 0).
6060 int SchedulingRegionID = 1;
6061 };
6062
6063 /// Attaches the BlockScheduling structures to basic blocks.
6064 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6065
6066 /// Performs the "real" scheduling. Done before vectorization is actually
6067 /// performed in a basic block.
6068 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6069
6070 /// List of users to ignore during scheduling and that don't need extracting.
6071 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6072
6073 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6074 /// sorted SmallVectors of unsigned.
6075 struct OrdersTypeDenseMapInfo {
6076 static OrdersType getEmptyKey() {
6077 OrdersType V;
6078 V.push_back(~1U);
6079 return V;
6080 }
6081
6082 static OrdersType getTombstoneKey() {
6083 OrdersType V;
6084 V.push_back(~2U);
6085 return V;
6086 }
6087
6088 static unsigned getHashValue(const OrdersType &V) {
6089 return static_cast<unsigned>(hash_combine_range(V));
6090 }
6091
6092 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6093 return LHS == RHS;
6094 }
6095 };
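// Illustrative usage sketch (hypothetical names, not part of the pass): the
// traits above let OrdersType be used as a DenseMap/DenseSet key, e.g.
//   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrderVotes;
//   ++OrderVotes[OrdersType{1, 0, 3, 2}];
// The empty/tombstone keys are {~1U} and {~2U}, so real orders must never
// contain those reserved values.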
6096
6097 // Analysis and block reference.
6098 Function *F;
6099 ScalarEvolution *SE;
6100 TargetTransformInfo *TTI;
6101 TargetLibraryInfo *TLI;
6102 LoopInfo *LI;
6103 DominatorTree *DT;
6104 AssumptionCache *AC;
6105 DemandedBits *DB;
6106 const DataLayout *DL;
6107 OptimizationRemarkEmitter *ORE;
6108
6109 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6110 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6111
6112 /// Instruction builder to construct the vectorized tree.
6113 IRBuilder<TargetFolder> Builder;
6114
6115 /// A map of tree entries with scalar integer values to the smallest bit width with which they
6116 /// can legally be represented. The values map to (width, signed) pairs,
6117 /// where "width" indicates the minimum bit width and "signed" is True if the
6118 /// value must be signed-extended, rather than zero-extended, back to its
6119 /// original width.
6120 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6121
6122 /// Final size of the reduced vector, if the current graph represents the
6123 /// input for the reduction and it was possible to narrow the size of the
6124 /// reduction.
6125 unsigned ReductionBitWidth = 0;
6126
6127 /// Canonical graph size before the transformations.
6128 unsigned BaseGraphSize = 1;
6129
6130 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6131 /// type sizes, used in the tree.
6132 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6133
6134 /// Indices of the vectorized nodes, which are supposed to be the roots of the
6135 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
6136 DenseSet<unsigned> ExtraBitWidthNodes;
6137};
6138
6139template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6143 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6144 SecondInfo::getEmptyKey());
6145 }
6146
6148 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6149 SecondInfo::getTombstoneKey());
6150 }
6151
6152 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6153 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6154 SecondInfo::getHashValue(Val.EdgeIdx));
6155 }
6156
6157 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6158 const BoUpSLP::EdgeInfo &RHS) {
6159 return LHS == RHS;
6160 }
6161};
6162
6163template <> struct llvm::GraphTraits<BoUpSLP *> {
6164 using TreeEntry = BoUpSLP::TreeEntry;
6165
6166 /// NodeRef has to be a pointer per the GraphWriter.
6168
6169 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6170
6171 /// Add the VectorizableTree to the index iterator to be able to return
6172 /// TreeEntry pointers.
6174 : public iterator_adaptor_base<
6175 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6177
6181
6182 NodeRef operator*() { return I->UserTE; }
6183 };
6184
6186 return R.VectorizableTree[0].get();
6187 }
6188
6190 return {&N->UserTreeIndex, N->Container};
6191 }
6192
6194 return {&N->UserTreeIndex + 1, N->Container};
6195 }
6196
6197 /// For the node iterator we just need to turn the TreeEntry iterator into a
6198 /// TreeEntry* iterator so that it dereferences to NodeRef.
6200 using ItTy = ContainerTy::iterator;
6201 ItTy It;
6202
6203 public:
6204 nodes_iterator(const ItTy &It2) : It(It2) {}
6205 NodeRef operator*() { return It->get(); }
6207 ++It;
6208 return *this;
6209 }
6210 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6211 };
6212
6214 return nodes_iterator(R->VectorizableTree.begin());
6215 }
6216
6218 return nodes_iterator(R->VectorizableTree.end());
6219 }
6220
6221 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6222};
6223
6224template <>
6225struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6226 using TreeEntry = BoUpSLP::TreeEntry;
6227
6228 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6229
6230 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6231 std::string Str;
6232 raw_string_ostream OS(Str);
6233 OS << Entry->Idx << ".\n";
6234 if (isSplat(Entry->Scalars))
6235 OS << "<splat> ";
6236 for (auto *V : Entry->Scalars) {
6237 OS << *V;
6238 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6239 return EU.Scalar == V;
6240 }))
6241 OS << " <extract>";
6242 OS << "\n";
6243 }
6244 return Str;
6245 }
6246
6247 static std::string getNodeAttributes(const TreeEntry *Entry,
6248 const BoUpSLP *) {
6249 if (Entry->isGather())
6250 return "color=red";
6251 if (Entry->State == TreeEntry::ScatterVectorize ||
6252 Entry->State == TreeEntry::StridedVectorize ||
6253 Entry->State == TreeEntry::CompressVectorize)
6254 return "color=blue";
6255 return "";
6256 }
6257};
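// Illustrative note (assumes llvm/Support/GraphWriter.h is available): with
// the GraphTraits and DOTGraphTraits specializations above, the generic graph
// machinery can render the SLP tree, e.g. from a debug build one could do:
//   ViewGraph(&R, "slp-tree");   // R is a BoUpSLP instance
// Gather nodes are drawn in red and scatter/strided/compress nodes in blue,
// per getNodeAttributes above.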
6258
6259BoUpSLP::~BoUpSLP() {
6260 SmallVector<WeakTrackingVH> DeadInsts;
6261 for (auto *I : DeletedInstructions) {
6262 if (!I->getParent()) {
6263 // Temporarily insert instructions back to erase them from the parent and
6264 // from memory later.
6265 if (isa<PHINode>(I))
6266 // Phi nodes must be the very first instructions in the block.
6267 I->insertBefore(F->getEntryBlock(),
6268 F->getEntryBlock().getFirstNonPHIIt());
6269 else
6270 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6271 continue;
6272 }
6273 for (Use &U : I->operands()) {
6274 auto *Op = dyn_cast<Instruction>(U.get());
6275 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6276 wouldInstructionBeTriviallyDead(Op, TLI))
6277 DeadInsts.emplace_back(Op);
6278 }
6279 I->dropAllReferences();
6280 }
6281 for (auto *I : DeletedInstructions) {
6282 assert(I->use_empty() &&
6283 "trying to erase instruction with users.");
6284 I->eraseFromParent();
6285 }
6286
6287 // Cleanup any dead scalar code feeding the vectorized instructions.
6288 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6289
6290#ifdef EXPENSIVE_CHECKS
6291 // If we could guarantee that this call is not extremely slow, we could
6292 // remove the ifdef limitation (see PR47712).
6293 assert(!verifyFunction(*F, &dbgs()));
6294#endif
6295}
6296
6297/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6298/// contains the original mask for the scalars reused in the node. The
6299/// procedure transforms this mask in accordance with the given \p Mask.
6300static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6301 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6302 "Expected non-empty mask.");
6303 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6304 Prev.swap(Reuses);
6305 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6306 if (Mask[I] != PoisonMaskElem)
6307 Reuses[Mask[I]] = Prev[I];
6308}
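// Worked example for reorderReuses (illustrative): with Reuses = {0, 2, 1, 3}
// and Mask = {3, 2, 1, 0}, each Prev[I] is moved to position Mask[I], giving
// Reuses = {3, 1, 2, 0}. Positions of Reuses that are never written (because
// no Mask entry maps to them, e.g. for PoisonMaskElem entries) keep their
// previous values.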
6309
6310/// Reorders the given \p Order according to the given \p Mask. \p Order is
6311/// the original order of the scalars. The procedure transforms the provided order
6312/// in accordance with the given \p Mask. If the resulting \p Order is just an
6313/// identity order, \p Order is cleared.
6314static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6315 bool BottomOrder = false) {
6316 assert(!Mask.empty() && "Expected non-empty mask.");
6317 unsigned Sz = Mask.size();
6318 if (BottomOrder) {
6319 SmallVector<unsigned> PrevOrder;
6320 if (Order.empty()) {
6321 PrevOrder.resize(Sz);
6322 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6323 } else {
6324 PrevOrder.swap(Order);
6325 }
6326 Order.assign(Sz, Sz);
6327 for (unsigned I = 0; I < Sz; ++I)
6328 if (Mask[I] != PoisonMaskElem)
6329 Order[I] = PrevOrder[Mask[I]];
6330 if (all_of(enumerate(Order), [&](const auto &Data) {
6331 return Data.value() == Sz || Data.index() == Data.value();
6332 })) {
6333 Order.clear();
6334 return;
6335 }
6336 fixupOrderingIndices(Order);
6337 return;
6338 }
6339 SmallVector<int> MaskOrder;
6340 if (Order.empty()) {
6341 MaskOrder.resize(Sz);
6342 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6343 } else {
6344 inversePermutation(Order, MaskOrder);
6345 }
6346 reorderReuses(MaskOrder, Mask);
6347 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6348 Order.clear();
6349 return;
6350 }
6351 Order.assign(Sz, Sz);
6352 for (unsigned I = 0; I < Sz; ++I)
6353 if (MaskOrder[I] != PoisonMaskElem)
6354 Order[MaskOrder[I]] = I;
6355 fixupOrderingIndices(Order);
6356}
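// Worked example for reorderOrder (illustrative, BottomOrder == false): with
// an empty Order (identity) of size 4 and Mask = {1, 0, 3, 2}, MaskOrder
// becomes {1, 0, 3, 2} after reorderReuses, which is not an identity mask,
// so the resulting Order is {1, 0, 3, 2}. Had the reordered mask been an
// identity, Order would have been cleared instead.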
6357
6358std::optional<BoUpSLP::OrdersType>
6359BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6360 bool TopToBottom, bool IgnoreReorder) {
6361 assert(TE.isGather() && "Expected gather node only.");
6362 // Try to find subvector extract/insert patterns and reorder only such
6363 // patterns.
6364 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6365 Type *ScalarTy = GatheredScalars.front()->getType();
6366 size_t NumScalars = GatheredScalars.size();
6367 if (!isValidElementType(ScalarTy))
6368 return std::nullopt;
6369 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6370 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6371 SmallVector<int> ExtractMask;
6372 SmallVector<int> Mask;
6375 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6377 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6378 /*ForOrder=*/true);
6379 // No shuffled operands - ignore.
6380 if (GatherShuffles.empty() && ExtractShuffles.empty())
6381 return std::nullopt;
6382 OrdersType CurrentOrder(NumScalars, NumScalars);
6383 if (GatherShuffles.size() == 1 &&
6384 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6385 Entries.front().front()->isSame(TE.Scalars)) {
6386 // If the node is fully matched while the whole tree is being rotated, there
6387 // is no need to consider the matching order - the whole tree is rotated.
6388 if (TopToBottom)
6389 return std::nullopt;
6390 // No need to keep the order for the same user node.
6391 if (Entries.front().front()->UserTreeIndex.UserTE ==
6392 TE.UserTreeIndex.UserTE)
6393 return std::nullopt;
6394 // No need to keep the order for the matched root node, if it can be freely
6395 // reordered.
6396 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6397 return std::nullopt;
6398 // If only 2 elements are shuffled and the matching node has reversed
6399 // reuses, there is no need to track the order - both orders work fine.
6400 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6401 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6402 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6403 [](const auto &P) {
6404 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6405 }))
6406 return std::nullopt;
6407
6408 // Perfect match in the graph, will reuse the previously vectorized
6409 // node. Cost is 0.
6410 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6411 return CurrentOrder;
6412 }
6413 auto IsSplatMask = [](ArrayRef<int> Mask) {
6414 int SingleElt = PoisonMaskElem;
6415 return all_of(Mask, [&](int I) {
6416 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6417 SingleElt = I;
6418 return I == PoisonMaskElem || I == SingleElt;
6419 });
6420 };
6421 // Exclusive broadcast mask - ignore.
6422 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6423 (Entries.size() != 1 ||
6424 Entries.front().front()->ReorderIndices.empty())) ||
6425 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6426 return std::nullopt;
6427 SmallBitVector ShuffledSubMasks(NumParts);
6428 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6429 ArrayRef<int> Mask, int PartSz, int NumParts,
6430 function_ref<unsigned(unsigned)> GetVF) {
6431 for (int I : seq<int>(0, NumParts)) {
6432 if (ShuffledSubMasks.test(I))
6433 continue;
6434 const int VF = GetVF(I);
6435 if (VF == 0)
6436 continue;
6437 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6438 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6439 // Shuffle of at least 2 vectors - ignore.
6440 if (any_of(Slice, not_equal_to(NumScalars))) {
6441 llvm::fill(Slice, NumScalars);
6442 ShuffledSubMasks.set(I);
6443 continue;
6444 }
6445 // Try to include as many elements from the mask as possible.
6446 int FirstMin = INT_MAX;
6447 bool SecondVecFound = false;
6448 for (int K : seq<int>(Limit)) {
6449 int Idx = Mask[I * PartSz + K];
6450 if (Idx == PoisonMaskElem) {
6451 Value *V = GatheredScalars[I * PartSz + K];
6452 if (isConstant(V) && !isa<PoisonValue>(V)) {
6453 SecondVecFound = true;
6454 break;
6455 }
6456 continue;
6457 }
6458 if (Idx < VF) {
6459 if (FirstMin > Idx)
6460 FirstMin = Idx;
6461 } else {
6462 SecondVecFound = true;
6463 break;
6464 }
6465 }
6466 FirstMin = (FirstMin / PartSz) * PartSz;
6467 // Shuffle of at least 2 vectors - ignore.
6468 if (SecondVecFound) {
6469 llvm::fill(Slice, NumScalars);
6470 ShuffledSubMasks.set(I);
6471 continue;
6472 }
6473 for (int K : seq<int>(Limit)) {
6474 int Idx = Mask[I * PartSz + K];
6475 if (Idx == PoisonMaskElem)
6476 continue;
6477 Idx -= FirstMin;
6478 if (Idx >= PartSz) {
6479 SecondVecFound = true;
6480 break;
6481 }
6482 if (CurrentOrder[I * PartSz + Idx] >
6483 static_cast<unsigned>(I * PartSz + K) &&
6484 CurrentOrder[I * PartSz + Idx] !=
6485 static_cast<unsigned>(I * PartSz + Idx))
6486 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6487 }
6488 // Shuffle of at least 2 vectors - ignore.
6489 if (SecondVecFound) {
6490 llvm::fill(Slice, NumScalars);
6491 ShuffledSubMasks.set(I);
6492 continue;
6493 }
6494 }
6495 };
6496 int PartSz = getPartNumElems(NumScalars, NumParts);
6497 if (!ExtractShuffles.empty())
6498 TransformMaskToOrder(
6499 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6500 if (!ExtractShuffles[I])
6501 return 0U;
6502 unsigned VF = 0;
6503 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6504 for (unsigned Idx : seq<unsigned>(Sz)) {
6505 int K = I * PartSz + Idx;
6506 if (ExtractMask[K] == PoisonMaskElem)
6507 continue;
6508 if (!TE.ReuseShuffleIndices.empty())
6509 K = TE.ReuseShuffleIndices[K];
6510 if (K == PoisonMaskElem)
6511 continue;
6512 if (!TE.ReorderIndices.empty())
6513 K = std::distance(TE.ReorderIndices.begin(),
6514 find(TE.ReorderIndices, K));
6515 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6516 if (!EI)
6517 continue;
6518 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6519 ->getElementCount()
6520 .getKnownMinValue());
6521 }
6522 return VF;
6523 });
6524 // Check special corner case - single shuffle of the same entry.
6525 if (GatherShuffles.size() == 1 && NumParts != 1) {
6526 if (ShuffledSubMasks.any())
6527 return std::nullopt;
6528 PartSz = NumScalars;
6529 NumParts = 1;
6530 }
6531 if (!Entries.empty())
6532 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6533 if (!GatherShuffles[I])
6534 return 0U;
6535 return std::max(Entries[I].front()->getVectorFactor(),
6536 Entries[I].back()->getVectorFactor());
6537 });
6538 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6539 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6540 return std::nullopt;
6541 return std::move(CurrentOrder);
6542}
6543
6544static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6545 const TargetLibraryInfo &TLI,
6546 bool CompareOpcodes = true) {
6549 return false;
6550 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6551 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6552 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6553 (!GEP2 || GEP2->getNumOperands() == 2) &&
6554 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6555 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6556 !CompareOpcodes ||
6557 (GEP1 && GEP2 &&
6558 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6559}
6560
6561/// Calculates minimal alignment as a common alignment.
6562template <typename T>
6563static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6564 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6565 for (Value *V : VL)
6566 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6567 return CommonAlignment;
6568}
6569
6570/// Check if \p Order represents reverse order.
6571static bool isReverseOrder(ArrayRef<unsigned> Order) {
6572 assert(!Order.empty() &&
6573 "Order is empty. Please check it before using isReverseOrder.");
6574 unsigned Sz = Order.size();
6575 return all_of(enumerate(Order), [&](const auto &Pair) {
6576 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6577 });
6578}
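// Illustrative examples for isReverseOrder: {3, 2, 1, 0} -> true;
// {3, 2, 4, 0} with Sz == 4 -> true as well, because entries equal to Sz are
// treated as "don't care" positions; {0, 1, 2, 3} -> false.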
6579
6580/// Checks if the provided list of pointers \p Pointers represents the strided
6581/// pointers for type ElemTy. If they are not, nullptr is returned.
6582/// Otherwise, SCEV* of the stride value is returned.
6583/// If `PointerOps` can be rearranged into the following sequence:
6584/// ```
6585/// %x + c_0 * stride,
6586/// %x + c_1 * stride,
6587/// %x + c_2 * stride
6588/// ...
6589/// ```
6590/// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
6591/// and the SCEV of the `stride` will be returned.
6592static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6593 const DataLayout &DL, ScalarEvolution &SE,
6594 SmallVectorImpl<unsigned> &SortedIndices,
6595 SmallVectorImpl<int64_t> &Coeffs) {
6596 assert(Coeffs.size() == PointerOps.size() &&
6597 "Coeffs vector needs to be of correct size");
6598 SmallVector<const SCEV *> SCEVs;
6599 const SCEV *PtrSCEVLowest = nullptr;
6600 const SCEV *PtrSCEVHighest = nullptr;
6601 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6602 // addresses).
6603 for (Value *Ptr : PointerOps) {
6604 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6605 if (!PtrSCEV)
6606 return nullptr;
6607 SCEVs.push_back(PtrSCEV);
6608 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6609 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6610 continue;
6611 }
6612 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6613 if (isa<SCEVCouldNotCompute>(Diff))
6614 return nullptr;
6615 if (Diff->isNonConstantNegative()) {
6616 PtrSCEVLowest = PtrSCEV;
6617 continue;
6618 }
6619 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6620 if (isa<SCEVCouldNotCompute>(Diff1))
6621 return nullptr;
6622 if (Diff1->isNonConstantNegative()) {
6623 PtrSCEVHighest = PtrSCEV;
6624 continue;
6625 }
6626 }
6627 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6628 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6629 if (isa<SCEVCouldNotCompute>(Dist))
6630 return nullptr;
6631 int Size = DL.getTypeStoreSize(ElemTy);
6632 auto TryGetStride = [&](const SCEV *Dist,
6633 const SCEV *Multiplier) -> const SCEV * {
6634 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6635 if (M->getOperand(0) == Multiplier)
6636 return M->getOperand(1);
6637 if (M->getOperand(1) == Multiplier)
6638 return M->getOperand(0);
6639 return nullptr;
6640 }
6641 if (Multiplier == Dist)
6642 return SE.getConstant(Dist->getType(), 1);
6643 return SE.getUDivExactExpr(Dist, Multiplier);
6644 };
6645 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6646 const SCEV *Stride = nullptr;
6647 if (Size != 1 || SCEVs.size() > 2) {
6648 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6649 Stride = TryGetStride(Dist, Sz);
6650 if (!Stride)
6651 return nullptr;
6652 }
6653 if (!Stride || isa<SCEVConstant>(Stride))
6654 return nullptr;
6655 // Iterate through all pointers and check if all distances are
6656 // unique multiple of Stride.
6657 using DistOrdPair = std::pair<int64_t, int>;
6658 auto Compare = llvm::less_first();
6659 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6660 int Cnt = 0;
6661 bool IsConsecutive = true;
6662 for (const auto [Idx, PtrSCEV] : enumerate(SCEVs)) {
6663 unsigned Dist = 0;
6664 if (PtrSCEV != PtrSCEVLowest) {
6665 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6666 const SCEV *Coeff = TryGetStride(Diff, Stride);
6667 if (!Coeff)
6668 return nullptr;
6669 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6670 if (!SC || isa<SCEVCouldNotCompute>(SC))
6671 return nullptr;
6672 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6673 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6674 SE.getMulExpr(Stride, SC)))
6675 ->isZero())
6676 return nullptr;
6677 Dist = SC->getAPInt().getZExtValue();
6678 } else {
6679 Coeffs[Idx] = 0;
6680 }
6681 // If the strides are not the same or repeated, we can't vectorize.
6682 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6683 return nullptr;
6684 auto Res = Offsets.emplace(Dist, Cnt);
6685 if (!Res.second)
6686 return nullptr;
6687 // Consecutive order if the inserted element is the last one.
6688 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6689 ++Cnt;
6690 }
6691 if (Offsets.size() != SCEVs.size())
6692 return nullptr;
6693 SortedIndices.clear();
6694 if (!IsConsecutive) {
6695 // Fill SortedIndices array only if it is non-consecutive.
6696 SortedIndices.resize(PointerOps.size());
6697 Cnt = 0;
6698 for (const std::pair<int64_t, int> &Pair : Offsets) {
6699 SortedIndices[Cnt] = Pair.second;
6700 ++Cnt;
6701 }
6702 }
6703 return Stride;
6704}
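// Worked example for calculateRtStride (illustrative, i8 elements so the
// element size is 1): for PointerOps {%x, %x + %s, %x + 2 * %s} the distance
// between the lowest and highest pointer is 2 * %s, the returned stride SCEV
// is %s, Coeffs becomes {0, 1, 2} and SortedIndices stays empty because the
// pointers are already in offset order. A SCEVConstant stride is rejected on
// purpose (constant strides are handled elsewhere).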
6705
6706static std::pair<InstructionCost, InstructionCost>
6708 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6709 Type *ScalarTy, VectorType *VecTy);
6710
6711/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6712/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6713/// subvector pattern.
6714static InstructionCost
6716 VectorType *Tp, ArrayRef<int> Mask = {},
6718 int Index = 0, VectorType *SubTp = nullptr,
6720 VectorType *DstTy = Tp;
6721 if (!Mask.empty())
6722 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6723
6724 if (Kind != TTI::SK_PermuteTwoSrc)
6725 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6726 Args);
6727 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6728 int NumSubElts;
6730 Mask, NumSrcElts, NumSubElts, Index)) {
6731 if (Index + NumSubElts > NumSrcElts &&
6732 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6733 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6734 TTI::TCK_RecipThroughput, Index, Tp);
6735 }
6736 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6737 Args);
6738}
6739
6740/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6741/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6742/// instead of a scalar.
6743static InstructionCost
6745 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6746 bool Extract, TTI::TargetCostKind CostKind,
6747 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6749 "ScalableVectorType is not supported.");
6750 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6751 getNumElements(Ty) &&
6752 "Incorrect usage.");
6753 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6754 assert(SLPReVec && "Only supported by REVEC.");
6755 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6756 // of CreateInsertElement.
6757 unsigned ScalarTyNumElements = VecTy->getNumElements();
6758 InstructionCost Cost = 0;
6759 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6760 if (!DemandedElts[I])
6761 continue;
6762 if (Insert)
6763 Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
6764 I * ScalarTyNumElements, VecTy);
6765 if (Extract)
6766 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
6767 I * ScalarTyNumElements, VecTy);
6768 }
6769 return Cost;
6770 }
6771 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6772 CostKind, ForPoisonSrc, VL);
6773}
6774
6775/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6776/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6778 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6779 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6780 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6781 if (Opcode == Instruction::ExtractElement) {
6782 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6783 assert(SLPReVec && "Only supported by REVEC.");
6784 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6786 cast<VectorType>(Val), {}, CostKind,
6787 Index * VecTy->getNumElements(), VecTy);
6788 }
6789 }
6790 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6791 ScalarUserAndIdx);
6792}
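// Illustrative note for the REVEC wrappers above: when ScalarTy is itself a
// fixed vector, e.g. <2 x i32>, "lane" I of the SLP node corresponds to the
// subvector starting at element I * 2 of the wide vector, so an extract of
// lane 3 is costed as extracting a <2 x i32> subvector at offset 6 rather
// than as a single extractelement.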
6793
6794/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6795/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6797 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6798 VectorType *VecTy, unsigned Index,
6800 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6801 assert(SLPReVec && "Only supported by REVEC.");
6802 auto *SubTp =
6803 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6805 Index * ScalarTy->getNumElements(), SubTp) +
6806 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6807 CostKind);
6808 }
6809 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6810}
6811
6812/// Creates subvector insert. Generates shuffle using \p Generator or
6813/// using default shuffle.
6815 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6816 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6817 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6818 return Vec;
6819 const unsigned SubVecVF = getNumElements(V->getType());
6820 // Create a shuffle, since insertvector requires that the index is a multiple of
6821 // the subvector length.
6822 const unsigned VecVF = getNumElements(Vec->getType());
6824 if (isa<PoisonValue>(Vec)) {
6825 auto *Begin = std::next(Mask.begin(), Index);
6826 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6827 Vec = Builder.CreateShuffleVector(V, Mask);
6828 return Vec;
6829 }
6830 std::iota(Mask.begin(), Mask.end(), 0);
6831 std::iota(std::next(Mask.begin(), Index),
6832 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6833 if (Generator)
6834 return Generator(Vec, V, Mask);
6835 // 1. Resize V to the size of Vec.
6836 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6837 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6838 V = Builder.CreateShuffleVector(V, ResizeMask);
6839 // 2. Insert V into Vec.
6840 return Builder.CreateShuffleVector(Vec, V, Mask);
6841}
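// Worked example for createInsertVector (illustrative): inserting a 4-wide
// vector V into an 8-wide vector Vec at Index 4 without a custom Generator
// first widens V with the mask {0, 1, 2, 3, poison, poison, poison, poison}
// and then blends it with Vec using the mask {0, 1, 2, 3, 8, 9, 10, 11},
// i.e. the last four lanes are taken from the (widened) V.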
6842
6843/// Generates subvector extract using \p Generator or using default shuffle.
6845 unsigned SubVecVF, unsigned Index) {
6846 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6847 std::iota(Mask.begin(), Mask.end(), Index);
6848 return Builder.CreateShuffleVector(Vec, Mask);
6849}
6850
6851/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6852/// with \p Order.
6853/// \return true if the mask represents strided access, false - otherwise.
6855 ArrayRef<unsigned> Order, Type *ScalarTy,
6856 const DataLayout &DL, ScalarEvolution &SE,
6857 SmallVectorImpl<int> &CompressMask) {
6858 const unsigned Sz = PointerOps.size();
6859 CompressMask.assign(Sz, PoisonMaskElem);
6860 // The first element is always set.
6861 CompressMask[0] = 0;
6862 // Check if the mask represents strided access.
6863 std::optional<unsigned> Stride = 0;
6864 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6865 for (unsigned I : seq<unsigned>(1, Sz)) {
6866 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6867 std::optional<int64_t> OptPos =
6868 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6869 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6870 return false;
6871 unsigned Pos = static_cast<unsigned>(*OptPos);
6872 CompressMask[I] = Pos;
6873 if (!Stride)
6874 continue;
6875 if (*Stride == 0) {
6876 *Stride = Pos;
6877 continue;
6878 }
6879 if (Pos != *Stride * I)
6880 Stride.reset();
6881 }
6882 return Stride.has_value();
6883}
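// Worked example for buildCompressMask (illustrative): for element offsets
// {0, 2, 4, 6} relative to the first pointer the function produces
// CompressMask = {0, 2, 4, 6} and returns true (stride 2); for offsets
// {0, 1, 3, 4} it produces CompressMask = {0, 1, 3, 4} and returns false,
// because the gaps are irregular.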
6884
6885/// Checks if the \p VL can be transformed to a (masked) load + compress or
6886/// (masked) interleaved load.
6888 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6891 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6892 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6893 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6894 VectorType *&LoadVecTy) {
6895 InterleaveFactor = 0;
6896 Type *ScalarTy = VL.front()->getType();
6897 const size_t Sz = VL.size();
6898 auto *VecTy = getWidenedType(ScalarTy, Sz);
6900 SmallVector<int> Mask;
6901 if (!Order.empty())
6902 inversePermutation(Order, Mask);
6903 // Check external uses.
6904 for (const auto [I, V] : enumerate(VL)) {
6905 if (AreAllUsersVectorized(V))
6906 continue;
6907 InstructionCost ExtractCost =
6908 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6909 Mask.empty() ? I : Mask[I]);
6910 InstructionCost ScalarCost =
6911 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6912 if (ExtractCost <= ScalarCost)
6913 return false;
6914 }
6915 Value *Ptr0;
6916 Value *PtrN;
6917 if (Order.empty()) {
6918 Ptr0 = PointerOps.front();
6919 PtrN = PointerOps.back();
6920 } else {
6921 Ptr0 = PointerOps[Order.front()];
6922 PtrN = PointerOps[Order.back()];
6923 }
6924 std::optional<int64_t> Diff =
6925 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6926 if (!Diff)
6927 return false;
6928 const size_t MaxRegSize =
6929 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6930 .getFixedValue();
6931 // Check for very large distances between elements.
6932 if (*Diff / Sz >= MaxRegSize / 8)
6933 return false;
6934 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6935 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6936 Align CommonAlignment = LI->getAlign();
6937 IsMasked = !isSafeToLoadUnconditionally(
6938 Ptr0, LoadVecTy, CommonAlignment, DL,
6939 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6940 &TLI);
6941 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6942 LI->getPointerAddressSpace()))
6943 return false;
6944 // TODO: perform the analysis of each scalar load for better
6945 // safe-load-unconditionally analysis.
6946 bool IsStrided =
6947 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6948 assert(CompressMask.size() >= 2 && "At least two elements are required");
6949 SmallVector<Value *> OrderedPointerOps(PointerOps);
6950 if (!Order.empty())
6951 reorderScalars(OrderedPointerOps, Mask);
6952 auto [ScalarGEPCost, VectorGEPCost] =
6953 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6954 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6955 // The cost of scalar loads.
6956 InstructionCost ScalarLoadsCost =
6957 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6958 [&](InstructionCost C, Value *V) {
6959 return C + TTI.getInstructionCost(cast<Instruction>(V),
6960 CostKind);
6961 }) +
6962 ScalarGEPCost;
6963 APInt DemandedElts = APInt::getAllOnes(Sz);
6964 InstructionCost GatherCost =
6965 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6966 /*Insert=*/true,
6967 /*Extract=*/false, CostKind) +
6968 ScalarLoadsCost;
6969 InstructionCost LoadCost = 0;
6970 if (IsMasked) {
6971 LoadCost = TTI.getMemIntrinsicInstrCost(
6972 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
6973 CommonAlignment,
6974 LI->getPointerAddressSpace()),
6975 CostKind);
6976 } else {
6977 LoadCost =
6978 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6979 LI->getPointerAddressSpace(), CostKind);
6980 }
6981 if (IsStrided && !IsMasked && Order.empty()) {
6982 // Check for potential segmented (interleaved) loads.
6983 VectorType *AlignedLoadVecTy = getWidenedType(
6984 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6985 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6986 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6987 &TLI))
6988 AlignedLoadVecTy = LoadVecTy;
6989 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6990 CommonAlignment,
6991 LI->getPointerAddressSpace())) {
6992 InstructionCost InterleavedCost =
6993 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6994 Instruction::Load, AlignedLoadVecTy,
6995 CompressMask[1], {}, CommonAlignment,
6996 LI->getPointerAddressSpace(), CostKind, IsMasked);
6997 if (InterleavedCost < GatherCost) {
6998 InterleaveFactor = CompressMask[1];
6999 LoadVecTy = AlignedLoadVecTy;
7000 return true;
7001 }
7002 }
7003 }
7004 InstructionCost CompressCost = ::getShuffleCost(
7005 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
7006 if (!Order.empty()) {
7007 SmallVector<int> NewMask(Sz, PoisonMaskElem);
7008 for (unsigned I : seq<unsigned>(Sz)) {
7009 NewMask[I] = CompressMask[Mask[I]];
7010 }
7011 CompressMask.swap(NewMask);
7012 }
7013 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7014 return TotalVecCost < GatherCost;
7015}
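// Illustrative sketch of the transformation evaluated above (assuming i32
// loads at element offsets {0, 2, 3} from a common base): instead of three
// scalar loads, emit one wide (possibly masked) load of <4 x i32> covering
// offsets 0..3 plus a shufflevector with mask <0, 2, 3> to compress the
// lanes. The function only returns true when the cost model above finds this
// cheaper than gathering the scalar loads.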
7016
7017/// Checks if the \p VL can be transformed to a (masked) load + compress or
7018/// (masked) interleaved load.
7019static bool
7022 const DataLayout &DL, ScalarEvolution &SE,
7023 AssumptionCache &AC, const DominatorTree &DT,
7024 const TargetLibraryInfo &TLI,
7025 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7026 bool IsMasked;
7027 unsigned InterleaveFactor;
7028 SmallVector<int> CompressMask;
7029 VectorType *LoadVecTy;
7030 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7031 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7032 CompressMask, LoadVecTy);
7033}
7034
7035/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7036/// PointerOps:
7037/// 1. Target with strided load support is detected.
7038/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7039/// potential stride <= MaxProfitableLoadStride and the potential stride is
7040/// power-of-2 (to avoid perf regressions for the very small number of loads)
7041/// and max distance > number of loads, or potential stride is -1.
7042/// 3. The loads are ordered, or number of unordered loads <=
7043/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7044/// to avoid extra costs for very expensive shuffles).
7045/// 4. Any pointer operand is an instruction with the users outside of the
7046/// current graph (for masked gathers extra extractelement instructions
7047/// might be required).
7049 Align Alignment, const int64_t Diff,
7050 const size_t Sz) const {
7051 if (Diff % (Sz - 1) != 0)
7052 return false;
7053
7054 // Try to generate strided load node.
7055 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
7056 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
7057 return !isVectorized(U) && !MustGather.contains(U);
7058 });
7059 });
7060
7061 const uint64_t AbsoluteDiff = std::abs(Diff);
7062 auto *VecTy = getWidenedType(ScalarTy, Sz);
7063 if (IsAnyPointerUsedOutGraph ||
7064 (AbsoluteDiff > Sz &&
7066 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7067 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7068 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7069 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7070 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7071 return false;
7072 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7073 return false;
7074 return true;
7075 }
7076 return false;
7077}
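// Illustrative example for isStridedLoad: 4 loads whose pointer distance from
// first to last is Diff == -3 elements (i.e. consecutive but in reverse
// order) yield a stride of -1 and are accepted, provided the target reports
// isLegalStridedLoadStore for the widened type.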
7078
7080 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7081 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7082 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7083 const size_t Sz = PointerOps.size();
7084 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7085 // Go through `PointerOps` in sorted order and record offsets from `Ptr0`.
7086 for (unsigned I : seq<unsigned>(Sz)) {
7087 Value *Ptr =
7088 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7089 SortedOffsetsFromBase[I] =
7090 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
7091 }
7092
7093 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7094 // ```
7095 // [
7096 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7097 // (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7098 // ...
7099 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7100 // GroupSize - 1}), // last group
7101 // ]
7102 // ```
7103 // The distance between consecutive elements within each group should all be
7104 // the same `StrideWithinGroup`. The distance between the first elements of
7105 // consecutive groups should all be the same `StrideBetweenGroups`.
7106
7107 int64_t StrideWithinGroup =
7108 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7109 // Determine size of the first group. Later we will check that all other
7110 // groups have the same size.
7111 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7112 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7113 StrideWithinGroup;
7114 };
7115 auto Indices = seq<unsigned>(1, Sz);
7116 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7117 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7118
7119 unsigned VecSz = Sz;
7120 Type *NewScalarTy = ScalarTy;
7121
7122 // Quick detour: at this point we can say what the type of strided load would
7123 // be if all the checks pass. Check if this type is legal for the target.
7124 bool NeedsWidening = Sz != GroupSize;
7125 if (NeedsWidening) {
7126 if (Sz % GroupSize != 0)
7127 return false;
7128
7129 if (StrideWithinGroup != 1)
7130 return false;
7131 VecSz = Sz / GroupSize;
7132 NewScalarTy = Type::getIntNTy(
7133 SE->getContext(),
7134 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7135 }
7136
7137 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7138 return false;
7139
7140 int64_t StrideIntVal = StrideWithinGroup;
7141 if (NeedsWidening) {
7142 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7143 // Check that the strides between groups are all the same.
7144 unsigned CurrentGroupStartIdx = GroupSize;
7145 int64_t StrideBetweenGroups =
7146 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7147 StrideIntVal = StrideBetweenGroups;
7148 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7149 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7150 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7151 StrideBetweenGroups)
7152 return false;
7153 }
7154
7155 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7156 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7157 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7158 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7159 return GroupEndIdx - StartIdx == GroupSize;
7160 };
7161 for (unsigned I = 0; I < Sz; I += GroupSize) {
7162 if (!CheckGroup(I))
7163 return false;
7164 }
7165 }
7166
7167 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7168 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
7169 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7170 return true;
7171}
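// Illustrative example of the grouping step above: for 8 i32 loads whose
// sorted offsets from Ptr0 are {0, 1, 8, 9, 16, 17, 24, 25}, StrideWithinGroup
// is 1 and GroupSize is 2, so the candidate strided load would have 4 lanes of
// i64 with a between-group stride of 8 elements. Whether SPtrInfo is actually
// filled still depends on the isStridedLoad and equal-stride checks above,
// which may reject this shape.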
7172
7174 Type *ScalarTy, Align CommonAlignment,
7175 SmallVectorImpl<unsigned> &SortedIndices,
7176 StridedPtrInfo &SPtrInfo) const {
7177 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
7178 // is constant, we partition the `PointerOps` sequence into subsequences of
7179 // pointers with the same offset. For each offset we record values from
7180 // `PointerOps` and their indices in `PointerOps`.
7182 OffsetToPointerOpIdxMap;
7183 for (auto [Idx, Ptr] : enumerate(PointerOps)) {
7184 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7185 if (!PtrSCEV)
7186 return false;
7187
7188 const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
7189 int64_t Offset = 0;
7190 if (Add) {
7191 // `Offset` is non-zero.
7192 for (int I : seq<int>(Add->getNumOperands())) {
7193 const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
7194 if (!SC)
7195 continue;
7196 Offset = SC->getAPInt().getSExtValue();
7197 break;
7198 }
7199 }
7200 OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
7201 OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
7202 }
7203 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7204
7205 // Quick detour: at this point we can say what the type of strided load would
7206 // be if all the checks pass. Check if this type is legal for the target.
7207 const unsigned Sz = PointerOps.size();
7208 unsigned VecSz = Sz;
7209 Type *NewScalarTy = ScalarTy;
7210 if (NumOffsets > 1) {
7211 if (Sz % NumOffsets != 0)
7212 return false;
7213 VecSz = Sz / NumOffsets;
7214 NewScalarTy = Type::getIntNTy(
7215 SE->getContext(),
7216 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7217 }
7218 FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
7219 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7220 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7221 return false;
7222
7223 // Check if the offsets are contiguous and that each group has the required
7224 // size.
7225 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7226 for (auto [Idx, MapPair] : enumerate(OffsetToPointerOpIdxMap)) {
7227 if (MapPair.second.first.size() != VecSz)
7228 return false;
7229 SortedOffsetsV[Idx] = MapPair.first;
7230 }
7231 sort(SortedOffsetsV);
7232
7233 if (NumOffsets > 1) {
7234 for (int I : seq<int>(1, SortedOffsetsV.size())) {
7235 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
7236 return false;
7237 }
7238 }
7239
7240 // Introduce some notation for the explanations below. Let `PointerOps_j`
7241 // denote the subsequence of `PointerOps` with offsets equal to
7242 // `SortedOffsetsV[j]`. Let `SortedIndices_j` be such that the sequence
7243 // ```
7244 // PointerOps_j[SortedIndices_j[0]],
7245 // PointerOps_j[SortedIndices_j[1]],
7246 // PointerOps_j[SortedIndices_j[2]],
7247 // ...
7248 // ```
7249 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7250 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7251 // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7252 // The entire sorted `PointerOps` looks like this:
7253 // ```
7254 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7255 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7256 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7257 // ...
7258 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7259 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7260 //
7261 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7262 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7263 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7264 // ...
7265 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7266 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7267 //
7268 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7269 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7270 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7271 // ...
7272 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7273 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7274 // ...
7275 // ...
7276 // ...
7277 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7278 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7279 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7280 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7281 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7282 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7283 // ...
7284 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7285 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7286 // ```
7287 // In order to be able to generate a strided load, we need the following
7288 // checks to pass:
7289 //
7290 // (1) for each `PointerOps_j` check that the distance
7291 // between adjacent pointers are all equal to the same value (stride).
7292 // (2) for each `PointerOps_j` check that coefficients calculated by
7293 // `calculateRtStride` are all the same.
7294 //
7295 // As we do that, also calculate SortedIndices. Since we should not modify
7296 // `SortedIndices` unless we know that all the checks succeed, record the
7297 // indices into `SortedIndicesDraft`.
7298 SmallVector<unsigned> SortedIndicesDraft(Sz);
7299
7300 // Given sorted indices for a particular offset (as calculated by
7301 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7302 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7303 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7304 // \param `IndicesInAllPointerOps` vector of indices of the
7305 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7306 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7307 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7308 auto UpdateSortedIndices =
7309 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7310 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
7311 if (SortedIndicesForOffset.empty()) {
7312 SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
7313 std::iota(SortedIndicesForOffset.begin(),
7314 SortedIndicesForOffset.end(), 0);
7315 }
7316 for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
7317 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7318 IndicesInAllPointerOps[Idx];
7319 }
7320 };
7321
7322 int64_t LowestOffset = SortedOffsetsV[0];
7323 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7324
7325 SmallVector<int64_t> Coeffs0(VecSz);
7326 SmallVector<unsigned> SortedIndicesForOffset0;
7327 const SCEV *Stride0 = calculateRtStride(PointerOps0, ScalarTy, *DL, *SE,
7328 SortedIndicesForOffset0, Coeffs0);
7329 if (!Stride0)
7330 return false;
7331 unsigned NumCoeffs0 = Coeffs0.size();
7332 if (NumCoeffs0 * NumOffsets != Sz)
7333 return false;
7334 sort(Coeffs0);
7335
7336 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7337 OffsetToPointerOpIdxMap[LowestOffset].second;
7338 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7339
7340 // Now that we know what the common stride and coefficients have to be,
7341 // check the remaining `PointerOps_j`.
7342 SmallVector<int64_t> Coeffs;
7343 SmallVector<unsigned> SortedIndicesForOffset;
7344 for (int J : seq<int>(1, NumOffsets)) {
7345 Coeffs.clear();
7346 Coeffs.resize(VecSz);
7347 SortedIndicesForOffset.clear();
7348
7349 int64_t Offset = SortedOffsetsV[J];
7350 ArrayRef<Value *> PointerOpsForOffset =
7351 OffsetToPointerOpIdxMap[Offset].first;
7352 ArrayRef<unsigned> IndicesInAllPointerOps =
7353 OffsetToPointerOpIdxMap[Offset].second;
7354 const SCEV *StrideWithinGroup =
7355 calculateRtStride(PointerOpsForOffset, ScalarTy, *DL, *SE,
7356 SortedIndicesForOffset, Coeffs);
7357
7358 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7359 return false;
7360 if (Coeffs.size() != NumCoeffs0)
7361 return false;
7362 sort(Coeffs);
7363 if (Coeffs != Coeffs0)
7364 return false;
7365
7366 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7367 }
7368
7369 SortedIndices.clear();
7370 SortedIndices = SortedIndicesDraft;
7371 SPtrInfo.StrideSCEV = Stride0;
7372 SPtrInfo.Ty = StridedLoadTy;
7373 return true;
7374}
7375
7377 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7378 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7379 unsigned *BestVF, bool TryRecursiveCheck) const {
7380 // Check that a vectorized load would load the same memory as a scalar
7381 // load. For example, we don't want to vectorize loads that are smaller
7382 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7383 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7384 // from such a struct, we read/write packed bits disagreeing with the
7385 // unvectorized version.
7386 if (BestVF)
7387 *BestVF = 0;
7389 return LoadsState::Gather;
7390 Type *ScalarTy = VL0->getType();
7391
7392 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7393 return LoadsState::Gather;
7394
7395 // Make sure all loads in the bundle are simple - we can't vectorize
7396 // atomic or volatile loads.
7397 PointerOps.clear();
7398 const size_t Sz = VL.size();
7399 PointerOps.resize(Sz);
7400 auto *POIter = PointerOps.begin();
7401 for (Value *V : VL) {
7402 auto *L = dyn_cast<LoadInst>(V);
7403 if (!L || !L->isSimple())
7404 return LoadsState::Gather;
7405 *POIter = L->getPointerOperand();
7406 ++POIter;
7407 }
7408
7409 Order.clear();
7410 // Check the order of pointer operands or that all pointers are the same.
7411 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7412
7413 auto *VecTy = getWidenedType(ScalarTy, Sz);
7414 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7415 if (!IsSorted) {
7416 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7417 SPtrInfo))
7418 return LoadsState::StridedVectorize;
7419
7420 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7421 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7422 return LoadsState::Gather;
7423
7424 if (!all_of(PointerOps, [&](Value *P) {
7425 return arePointersCompatible(P, PointerOps.front(), *TLI);
7426 }))
7427 return LoadsState::Gather;
7428
7429 } else {
7430 Value *Ptr0;
7431 Value *PtrN;
7432 if (Order.empty()) {
7433 Ptr0 = PointerOps.front();
7434 PtrN = PointerOps.back();
7435 } else {
7436 Ptr0 = PointerOps[Order.front()];
7437 PtrN = PointerOps[Order.back()];
7438 }
7439 std::optional<int64_t> Diff =
7440 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7441 // Check that the sorted loads are consecutive.
7442 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7443 return LoadsState::Vectorize;
7444 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7445 *TLI, [&](Value *V) {
7446 return areAllUsersVectorized(
7447 cast<Instruction>(V), UserIgnoreList);
7448 }))
7449 return LoadsState::CompressVectorize;
7450 Align Alignment =
7451 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7452 ->getAlign();
7453 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7454 *Diff, Ptr0, PtrN, SPtrInfo))
7455 return LoadsState::StridedVectorize;
7456 }
7457 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7458 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7459 return LoadsState::Gather;
7460 // Compare the cost of loads + shuffles against strided/masked gather
7461 // loads. Returns true if the vectorized + shuffles representation is
7462 // better than just gather.
7463 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7464 unsigned *BestVF,
7465 bool ProfitableGatherPointers) {
7466 if (BestVF)
7467 *BestVF = 0;
7468 // Compare masked gather cost and loads + insert subvector costs.
7470 auto [ScalarGEPCost, VectorGEPCost] =
7471 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7472 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7473 // Estimate the cost of masked gather GEP. If not a splat, roughly
7474 // estimate as a buildvector, otherwise estimate as splat.
7475 APInt DemandedElts = APInt::getAllOnes(Sz);
7476 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7477 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7478 if (static_cast<unsigned>(count_if(
7479 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7480 any_of(PointerOps, [&](Value *V) {
7481 return getUnderlyingObject(V) !=
7482 getUnderlyingObject(PointerOps.front());
7483 }))
7484 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7485 DemandedElts, /*Insert=*/true,
7486 /*Extract=*/false, CostKind);
7487 else
7488 VectorGEPCost +=
7489 getScalarizationOverhead(
7490 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7491 /*Insert=*/true, /*Extract=*/false, CostKind) +
7492 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7493 // The cost of scalar loads.
7494 InstructionCost ScalarLoadsCost =
7495 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7496 [&](InstructionCost C, Value *V) {
7497 return C + TTI.getInstructionCost(
7498 cast<Instruction>(V), CostKind);
7499 }) +
7500 ScalarGEPCost;
7501 // The cost of masked gather.
7502 InstructionCost MaskedGatherCost =
7503 TTI.getMemIntrinsicInstrCost(
7504 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7506 /*VariableMask=*/false, CommonAlignment),
7507 CostKind) +
7508 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7509 InstructionCost GatherCost =
7510 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7511 /*Insert=*/true,
7512 /*Extract=*/false, CostKind) +
7513 ScalarLoadsCost;
7514 // The list of loads is small, or the partial check was already performed -
7515 // directly compare the masked gather cost and the gather cost.
7516 constexpr unsigned ListLimit = 4;
7517 if (!TryRecursiveCheck || VL.size() < ListLimit)
7518 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
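// Illustrative numbers (assumed, not measured): with MaskedGatherCost == 12,
// GatherCost == 8 and an assumed default -slp-threshold of 0, the early exit
// above returns true (12 - 8 >= -0), i.e. the masked gather is not profitable
// and the caller keeps the plain gather-of-scalars representation for this
// bundle.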
7519
7520 // FIXME: The following code has not been updated for non-power-of-2
7521 // vectors (and not whole registers). The splitting logic here does not
7522 // cover the original vector if the vector factor is not a power of two.
7523 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7524 return false;
7525
7526 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7527 unsigned MinVF = getMinVF(2 * Sz);
7528 DemandedElts.clearAllBits();
7529 // Iterate through possible vectorization factors and check if vectorized +
7530 // shuffles is better than just gather.
7531 for (unsigned VF =
7532 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7533 VF >= MinVF;
7534 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7535 SmallVector<LoadsState> States;
7536 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7537 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7538 SmallVector<unsigned> Order;
7539 SmallVector<Value *> PointerOps;
7540 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7541 PointerOps, SPtrInfo, BestVF,
7542 /*TryRecursiveCheck=*/false);
7543 // Check that the sorted loads are consecutive.
7544 if (LS == LoadsState::Gather) {
7545 if (BestVF) {
7546 DemandedElts.setAllBits();
7547 break;
7548 }
7549 DemandedElts.setBits(Cnt, Cnt + VF);
7550 continue;
7551 }
7552 // If reordering is needed - treat as a high-cost masked gather for now.
7553 if ((LS == LoadsState::Vectorize ||
7554 LS == LoadsState::StridedVectorize ||
7555 LS == LoadsState::CompressVectorize) &&
7556 !Order.empty() && !isReverseOrder(Order))
7557 LS = LoadsState::ScatterVectorize;
7558 States.push_back(LS);
7559 }
7560 if (DemandedElts.isAllOnes())
7561 // All loads gathered - try smaller VF.
7562 continue;
7563 // Can be vectorized later as a series of loads/insertelements.
7564 InstructionCost VecLdCost = 0;
7565 if (!DemandedElts.isZero()) {
7566 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7567 /*Insert=*/true,
7568 /*Extract=*/false, CostKind) +
7569 ScalarGEPCost;
7570 for (unsigned Idx : seq<unsigned>(VL.size()))
7571 if (DemandedElts[Idx])
7572 VecLdCost +=
7573 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7574 }
7575 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7576 for (auto [I, LS] : enumerate(States)) {
7577 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7578 InstructionCost VectorGEPCost =
7579 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7580 ? 0
7581 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7582 LI0->getPointerOperand(),
7583 Instruction::GetElementPtr, CostKind, ScalarTy,
7584 SubVecTy)
7585 .second;
7586 if (LS == LoadsState::ScatterVectorize) {
7587 if (static_cast<unsigned>(
7588 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7589 PointerOps.size() - 1 ||
7590 any_of(PointerOps, [&](Value *V) {
7591 return getUnderlyingObject(V) !=
7592 getUnderlyingObject(PointerOps.front());
7593 }))
7594 VectorGEPCost += getScalarizationOverhead(
7595 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7596 /*Insert=*/true, /*Extract=*/false, CostKind);
7597 else
7598 VectorGEPCost +=
7600 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7601 /*Insert=*/true, /*Extract=*/false, CostKind) +
7602 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7603 CostKind);
7604 }
7605 switch (LS) {
7606 case LoadsState::Vectorize:
7607 VecLdCost +=
7608 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7609 LI0->getPointerAddressSpace(), CostKind,
7610 TTI::OperandValueInfo()) +
7611 VectorGEPCost;
7612 break;
7613 case LoadsState::StridedVectorize:
7614 VecLdCost += TTI.getMemIntrinsicInstrCost(
7615 MemIntrinsicCostAttributes(
7616 Intrinsic::experimental_vp_strided_load,
7617 SubVecTy, LI0->getPointerOperand(),
7618 /*VariableMask=*/false, CommonAlignment),
7619 CostKind) +
7620 VectorGEPCost;
7621 break;
7622 case LoadsState::CompressVectorize:
7623 VecLdCost += TTI.getMemIntrinsicInstrCost(
7624 MemIntrinsicCostAttributes(
7625 Intrinsic::masked_load, SubVecTy,
7626 CommonAlignment, LI0->getPointerAddressSpace()),
7627 CostKind) +
7628 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7629 {}, CostKind);
7630 break;
7631 case LoadsState::ScatterVectorize:
7632 VecLdCost += TTI.getMemIntrinsicInstrCost(
7633 MemIntrinsicCostAttributes(
7634 Intrinsic::masked_gather, SubVecTy,
7635 LI0->getPointerOperand(),
7636 /*VariableMask=*/false, CommonAlignment),
7637 CostKind) +
7638 VectorGEPCost;
7639 break;
7640 case LoadsState::Gather:
7641 // Gathers are already calculated - ignore.
7642 continue;
7643 }
7644 SmallVector<int> ShuffleMask(VL.size());
7645 for (int Idx : seq<int>(0, VL.size()))
7646 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7647 if (I > 0)
7648 VecLdCost +=
7649 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7650 CostKind, I * VF, SubVecTy);
7651 }
7652 // If masked gather cost is higher - better to vectorize, so
7653 // consider it as a gather node. It will be better estimated
7654 // later.
7655 if (MaskedGatherCost >= VecLdCost &&
7656 VecLdCost - GatherCost < -SLPCostThreshold) {
7657 if (BestVF)
7658 *BestVF = VF;
7659 return true;
7660 }
7661 }
7662 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7663 };
7664 // TODO: need to improve analysis of the pointers, if not all of them are
7665 // GEPs or have > 2 operands, we end up with a gather node, which just
7666 // increases the cost.
7667 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7668 bool ProfitableGatherPointers =
7669 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7670 return L->isLoopInvariant(V);
7671 })) <= Sz / 2;
7672 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7673 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7674 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7675 (GEP && GEP->getNumOperands() == 2 &&
7676 isa<Constant, Instruction>(GEP->getOperand(1)));
7677 })) {
7678 // Check if potential masked gather can be represented as series
7679 // of loads + insertsubvectors.
7680 // If masked gather cost is higher - better to vectorize, so
7681 // consider it as a gather node. It will be better estimated
7682 // later.
7683 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7684 ProfitableGatherPointers))
7685 return LoadsState::ScatterVectorize;
7686 }
7687
7688 return LoadsState::Gather;
7689}
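// Illustrative IR for canVectorizeLoads above (assumed shape, not taken from
// a test case):
//   %l0 = load i32, ptr %p
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   %l1 = load i32, ptr %p1
//   %p2 = getelementptr inbounds i32, ptr %p, i64 2
//   %l2 = load i32, ptr %p2
//   %p3 = getelementptr inbounds i32, ptr %p, i64 3
//   %l3 = load i32, ptr %p3
// Here Sz == 4, the pointers sort into the identity order and
// getPointersDiff(Ptr0, PtrN) == 3 == Sz - 1, so the bundle is classified as
// LoadsState::Vectorize. A bundle with a hole (offsets 0, 1, 2, 5) instead
// falls through to the compress/strided checks and, failing those, to the
// masked-gather vs. gather cost comparison above.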
7690
7691 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7692 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7693 const DataLayout &DL, ScalarEvolution &SE,
7694 SmallVectorImpl<unsigned> &SortedIndices) {
7695 assert(
7696 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7697 "Expected list of pointer operands.");
7698 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7699 // Ptr into, sort and return the sorted indices with values next to one
7700 // another.
7701 SmallMapVector<
7702 std::pair<BasicBlock *, Value *>,
7703 SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
7704 Bases;
7705 Bases
7706 .try_emplace(std::make_pair(
7707 BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
7708 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7709
7710 SortedIndices.clear();
7711 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7712 auto Key = std::make_pair(BBs[Cnt + 1],
7713 getUnderlyingObject(Ptr, RecursionMaxDepth));
7714 bool Found = any_of(Bases.try_emplace(Key).first->second,
7715 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7716 std::optional<int64_t> Diff =
7717 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7718 ElemTy, Ptr, DL, SE,
7719 /*StrictCheck=*/true);
7720 if (!Diff)
7721 return false;
7722
7723 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7724 return true;
7725 });
7726
7727 if (!Found) {
7728 // If we haven't found enough to usefully cluster, return early.
7729 if (Bases.size() > VL.size() / 2 - 1)
7730 return false;
7731
7732 // Not found already - add a new Base
7733 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7734 }
7735 }
7736
7737 if (Bases.size() == VL.size())
7738 return false;
7739
7740 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7741 Bases.front().second.size() == VL.size()))
7742 return false;
7743
7744 // For each of the bases sort the pointers by Offset and check if any of the
7745 // bases become consecutively allocated.
7746 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7747 SmallPtrSet<Value *, 13> FirstPointers;
7748 SmallPtrSet<Value *, 13> SecondPointers;
7749 Value *P1 = Ptr1;
7750 Value *P2 = Ptr2;
7751 unsigned Depth = 0;
7752 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7753 if (P1 == P2 || Depth > RecursionMaxDepth)
7754 return false;
7755 FirstPointers.insert(P1);
7756 SecondPointers.insert(P2);
7757 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7758 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7759 ++Depth;
7760 }
7761 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7762 "Unable to find matching root.");
7763 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7764 };
7765 for (auto &Base : Bases) {
7766 for (auto &Vec : Base.second) {
7767 if (Vec.size() > 1) {
7768 stable_sort(Vec, [](const auto &X, const auto &Y) { return std::get<1>(X) < std::get<1>(Y); });
7769 int64_t InitialOffset = std::get<1>(Vec[0]);
7770 bool AnyConsecutive =
7771 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7772 return std::get<1>(P.value()) ==
7773 int64_t(P.index()) + InitialOffset;
7774 });
7775 // Fill the SortedIndices array only if it looks worthwhile to sort the
7776 // ptrs.
7777 if (!AnyConsecutive)
7778 return false;
7779 }
7780 }
7781 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7782 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7783 });
7784 }
7785
7786 for (auto &T : Bases)
7787 for (const auto &Vec : T.second)
7788 for (const auto &P : Vec)
7789 SortedIndices.push_back(std::get<2>(P));
7790
7791 assert(SortedIndices.size() == VL.size() &&
7792 "Expected SortedIndices to be the size of VL");
7793 return true;
7794}
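// Worked example for clusterSortPtrAccesses above (hypothetical pointers):
// for VL = {A, A+1, B, B+1, A+2, A+3, B+2, B+3} with two underlying objects
// A and B in the same block, the map ends up with two bases whose offsets are
// {0, 1, 2, 3} each. Both runs are consecutive, so the function succeeds and
// SortedIndices becomes {0, 1, 4, 5, 2, 3, 6, 7}, grouping the A-based
// pointers before the B-based ones.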
7795
7796std::optional<BoUpSLP::OrdersType>
7797BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7798 assert(TE.isGather() && "Expected gather node only.");
7799 Type *ScalarTy = TE.Scalars[0]->getType();
7800
7801 SmallVector<Value *> Ptrs;
7802 Ptrs.reserve(TE.Scalars.size());
7803 SmallVector<BasicBlock *> BBs;
7804 BBs.reserve(TE.Scalars.size());
7805 for (Value *V : TE.Scalars) {
7806 auto *L = dyn_cast<LoadInst>(V);
7807 if (!L || !L->isSimple())
7808 return std::nullopt;
7809 Ptrs.push_back(L->getPointerOperand());
7810 BBs.push_back(L->getParent());
7811 }
7812
7813 BoUpSLP::OrdersType Order;
7814 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7815 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7816 return std::move(Order);
7817 return std::nullopt;
7818}
7819
7820/// Check if two insertelement instructions are from the same buildvector.
7821 static bool areTwoInsertFromSameBuildVector(
7822 InsertElementInst *VU, InsertElementInst *V,
7823 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7824 // Instructions must be from the same basic blocks.
7825 if (VU->getParent() != V->getParent())
7826 return false;
7827 // Checks if 2 insertelements are from the same buildvector.
7828 if (VU->getType() != V->getType())
7829 return false;
7830 // Multiple used inserts are separate nodes.
7831 if (!VU->hasOneUse() && !V->hasOneUse())
7832 return false;
7833 auto *IE1 = VU;
7834 auto *IE2 = V;
7835 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7836 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7837 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7838 return false;
7839 // Go through the vector operand of insertelement instructions trying to find
7840 // either VU as the original vector for IE2 or V as the original vector for
7841 // IE1.
7842 SmallBitVector ReusedIdx(
7843 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7844 bool IsReusedIdx = false;
7845 do {
7846 if (IE2 == VU && !IE1)
7847 return VU->hasOneUse();
7848 if (IE1 == V && !IE2)
7849 return V->hasOneUse();
7850 if (IE1 && IE1 != V) {
7851 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7852 IsReusedIdx |= ReusedIdx.test(Idx1);
7853 ReusedIdx.set(Idx1);
7854 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7855 IE1 = nullptr;
7856 else
7857 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7858 }
7859 if (IE2 && IE2 != VU) {
7860 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7861 IsReusedIdx |= ReusedIdx.test(Idx2);
7862 ReusedIdx.set(Idx2);
7863 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7864 IE2 = nullptr;
7865 else
7866 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7867 }
7868 } while (!IsReusedIdx && (IE1 || IE2));
7869 return false;
7870}
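// Illustrative IR for the check above (assumed two-element buildvector):
//   %v0 = insertelement <2 x float> poison, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0,    float %b, i32 1
// Walking the vector operand of %v1 via GetBaseOperand reaches %v0 without
// reusing a lane index, so both inserts are recognised as parts of the same
// buildvector. If both wrote lane 0, ReusedIdx would flag the repeated index
// and the query would fail.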
7871
7872/// Checks if the specified instruction \p I is an alternate operation for
7873/// the given \p MainOp and \p AltOp instructions.
7874static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7875 Instruction *AltOp,
7876 const TargetLibraryInfo &TLI);
7877
7878std::optional<BoUpSLP::OrdersType>
7879BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7880 bool IgnoreReorder) {
7881 // No need to reorder if need to shuffle reuses, still need to shuffle the
7882 // node.
7883 if (!TE.ReuseShuffleIndices.empty()) {
7884 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7885 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7886 "Reshuffling scalars not yet supported for nodes with padding");
7887
7888 if (isSplat(TE.Scalars))
7889 return std::nullopt;
7890 // Check if reuse shuffle indices can be improved by reordering.
7891 // For this, check that the reuse mask is "clustered", i.e. each scalar value
7892 // is used once in each submask of size <number_of_scalars>.
7893 // Example: 4 scalar values.
7894 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7895 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7896 // element 3 is used twice in the second submask.
7897 unsigned Sz = TE.Scalars.size();
7898 if (TE.isGather()) {
7899 if (std::optional<OrdersType> CurrentOrder =
7900 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7901 SmallVector<int> Mask;
7902 fixupOrderingIndices(*CurrentOrder);
7903 inversePermutation(*CurrentOrder, Mask);
7904 ::addMask(Mask, TE.ReuseShuffleIndices);
7905 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7906 unsigned Sz = TE.Scalars.size();
7907 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7908 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7909 if (Idx != PoisonMaskElem)
7910 Res[Idx + K * Sz] = I + K * Sz;
7911 }
7912 return std::move(Res);
7913 }
7914 }
7915 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7916 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7917 2 * TE.getVectorFactor())) == 1)
7918 return std::nullopt;
7919 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7920 return std::nullopt;
7921 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7922 Sz)) {
7923 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7924 if (TE.ReorderIndices.empty())
7925 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7926 else
7927 inversePermutation(TE.ReorderIndices, ReorderMask);
7928 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7929 unsigned VF = ReorderMask.size();
7930 OrdersType ResOrder(VF, VF);
7931 unsigned NumParts = divideCeil(VF, Sz);
7932 SmallBitVector UsedVals(NumParts);
7933 for (unsigned I = 0; I < VF; I += Sz) {
7934 int Val = PoisonMaskElem;
7935 unsigned UndefCnt = 0;
7936 unsigned Limit = std::min(Sz, VF - I);
7937 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7938 [&](int Idx) {
7939 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7940 Val = Idx;
7941 if (Idx == PoisonMaskElem)
7942 ++UndefCnt;
7943 return Idx != PoisonMaskElem && Idx != Val;
7944 }) ||
7945 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7946 UndefCnt > Sz / 2)
7947 return std::nullopt;
7948 UsedVals.set(Val);
7949 for (unsigned K = 0; K < NumParts; ++K) {
7950 unsigned Idx = Val + Sz * K;
7951 if (Idx < VF && I + K < VF)
7952 ResOrder[Idx] = I + K;
7953 }
7954 }
7955 return std::move(ResOrder);
7956 }
7957 unsigned VF = TE.getVectorFactor();
7958 // Try to build the correct order for extractelement instructions.
7959 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7960 TE.ReuseShuffleIndices.end());
7961 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7962 all_of(TE.Scalars, [Sz](Value *V) {
7963 if (isa<PoisonValue>(V))
7964 return true;
7965 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7966 return Idx && *Idx < Sz;
7967 })) {
7968 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7969 "by BinaryOperator and CastInst.");
7970 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7971 if (TE.ReorderIndices.empty())
7972 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7973 else
7974 inversePermutation(TE.ReorderIndices, ReorderMask);
7975 for (unsigned I = 0; I < VF; ++I) {
7976 int &Idx = ReusedMask[I];
7977 if (Idx == PoisonMaskElem)
7978 continue;
7979 Value *V = TE.Scalars[ReorderMask[Idx]];
7980 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7981 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7982 }
7983 }
7984 // Build the order of VF size; the reuses shuffles need to be reordered, as
7985 // they are always of VF size.
7986 OrdersType ResOrder(VF);
7987 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7988 auto *It = ResOrder.begin();
7989 for (unsigned K = 0; K < VF; K += Sz) {
7990 OrdersType CurrentOrder(TE.ReorderIndices);
7991 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7992 if (SubMask.front() == PoisonMaskElem)
7993 std::iota(SubMask.begin(), SubMask.end(), 0);
7994 reorderOrder(CurrentOrder, SubMask);
7995 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7996 std::advance(It, Sz);
7997 }
7998 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7999 return Data.index() == Data.value();
8000 }))
8001 return std::nullopt; // No need to reorder.
8002 return std::move(ResOrder);
8003 }
8004 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8005 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8006 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
8007 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
8008 return std::nullopt;
8009 if (TE.State == TreeEntry::SplitVectorize ||
8010 ((TE.State == TreeEntry::Vectorize ||
8011 TE.State == TreeEntry::StridedVectorize ||
8012 TE.State == TreeEntry::CompressVectorize) &&
8013 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
8014 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
8015 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8016 "Alternate instructions are only supported by "
8017 "BinaryOperator and CastInst.");
8018 return TE.ReorderIndices;
8019 }
8020 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8021 TE.isAltShuffle()) {
8022 assert(TE.ReuseShuffleIndices.empty() &&
8023 "ReuseShuffleIndices should be "
8024 "empty for alternate instructions.");
8025 SmallVector<int> Mask;
8026 TE.buildAltOpShuffleMask(
8027 [&](Instruction *I) {
8028 assert(TE.getMatchingMainOpOrAltOp(I) &&
8029 "Unexpected main/alternate opcode");
8030 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
8031 },
8032 Mask);
8033 const int VF = TE.getVectorFactor();
8034 OrdersType ResOrder(VF, VF);
8035 for (unsigned I : seq<unsigned>(VF)) {
8036 if (Mask[I] == PoisonMaskElem)
8037 continue;
8038 ResOrder[Mask[I] % VF] = I;
8039 }
8040 return std::move(ResOrder);
8041 }
8042 if (!TE.ReorderIndices.empty())
8043 return TE.ReorderIndices;
8044 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8045 if (!TE.ReorderIndices.empty())
8046 return TE.ReorderIndices;
8047
8048 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8049 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
8050 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
8051 continue;
8052 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
8053 if (!II)
8054 continue;
8055 Instruction *BVHead = nullptr;
8056 BasicBlock *BB = II->getParent();
8057 while (II && II->hasOneUse() && II->getParent() == BB) {
8058 BVHead = II;
8059 II = dyn_cast<InsertElementInst>(II->getOperand(0));
8060 }
8061 I = BVHead;
8062 }
8063
8064 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8065 assert(BB1 != BB2 && "Expected different basic blocks.");
8066 if (!DT->isReachableFromEntry(BB1))
8067 return false;
8068 if (!DT->isReachableFromEntry(BB2))
8069 return true;
8070 auto *NodeA = DT->getNode(BB1);
8071 auto *NodeB = DT->getNode(BB2);
8072 assert(NodeA && "Should only process reachable instructions");
8073 assert(NodeB && "Should only process reachable instructions");
8074 assert((NodeA == NodeB) ==
8075 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8076 "Different nodes should have different DFS numbers");
8077 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8078 };
8079 auto PHICompare = [&](unsigned I1, unsigned I2) {
8080 Value *V1 = TE.Scalars[I1];
8081 Value *V2 = TE.Scalars[I2];
8082 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8083 return false;
8084 if (isa<PoisonValue>(V1))
8085 return true;
8086 if (isa<PoisonValue>(V2))
8087 return false;
8088 if (V1->getNumUses() < V2->getNumUses())
8089 return true;
8090 if (V1->getNumUses() > V2->getNumUses())
8091 return false;
8092 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
8093 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
8094 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8095 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8096 FirstUserOfPhi2->getParent());
8097 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
8098 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
8099 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
8100 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
8101 if (IE1 && !IE2)
8102 return true;
8103 if (!IE1 && IE2)
8104 return false;
8105 if (IE1 && IE2) {
8106 if (UserBVHead[I1] && !UserBVHead[I2])
8107 return true;
8108 if (!UserBVHead[I1])
8109 return false;
8110 if (UserBVHead[I1] == UserBVHead[I2])
8111 return getElementIndex(IE1) < getElementIndex(IE2);
8112 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8113 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8114 UserBVHead[I2]->getParent());
8115 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8116 }
8117 if (EE1 && !EE2)
8118 return true;
8119 if (!EE1 && EE2)
8120 return false;
8121 if (EE1 && EE2) {
8122 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
8123 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
8124 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
8125 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
8126 if (!Inst2 && !P2)
8127 return Inst1 || P1;
8128 if (EE1->getOperand(0) == EE2->getOperand(0))
8129 return getElementIndex(EE1) < getElementIndex(EE2);
8130 if (!Inst1 && Inst2)
8131 return false;
8132 if (Inst1 && Inst2) {
8133 if (Inst1->getParent() != Inst2->getParent())
8134 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8135 return Inst1->comesBefore(Inst2);
8136 }
8137 if (!P1 && P2)
8138 return false;
8139 assert(P1 && P2 &&
8140 "Expected either instructions or arguments vector operands.");
8141 return P1->getArgNo() < P2->getArgNo();
8142 }
8143 return false;
8144 };
8145 OrdersType Phis(TE.Scalars.size());
8146 std::iota(Phis.begin(), Phis.end(), 0);
8147 stable_sort(Phis, PHICompare);
8148 if (isIdentityOrder(Phis))
8149 return std::nullopt; // No need to reorder.
8150 return std::move(Phis);
8151 }
8152 if (TE.isGather() &&
8153 (!TE.hasState() || !TE.isAltShuffle() ||
8154 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8155 allSameType(TE.Scalars)) {
8156 // TODO: add analysis of other gather nodes with extractelement
8157 // instructions and other values/instructions, not only undefs.
8158 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8159 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
8160 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
8161 all_of(TE.Scalars, [](Value *V) {
8162 auto *EE = dyn_cast<ExtractElementInst>(V);
8163 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8164 })) {
8165 // Check that gather of extractelements can be represented as
8166 // just a shuffle of a single vector.
8167 OrdersType CurrentOrder;
8168 bool Reuse =
8169 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8170 if (Reuse || !CurrentOrder.empty())
8171 return std::move(CurrentOrder);
8172 }
8173 // If the gather node is <undef, v, .., poison> and
8174 // insertelement poison, v, 0 [+ permute]
8175 // is cheaper than
8176 // insertelement poison, v, n - try to reorder.
8177 // If rotating the whole graph, exclude the permute cost, the whole graph
8178 // might be transformed.
8179 int Sz = TE.Scalars.size();
8180 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
8181 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
8182 const auto *It = find_if_not(TE.Scalars, isConstant);
8183 if (It == TE.Scalars.begin())
8184 return OrdersType();
8185 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
8186 if (It != TE.Scalars.end()) {
8187 OrdersType Order(Sz, Sz);
8188 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8189 Order[Idx] = 0;
8190 fixupOrderingIndices(Order);
8191 SmallVector<int> Mask;
8192 inversePermutation(Order, Mask);
8193 InstructionCost PermuteCost =
8194 TopToBottom
8195 ? 0
8196 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
8197 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8198 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
8199 PoisonValue::get(Ty), *It);
8200 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8201 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
8202 PoisonValue::get(Ty), *It);
8203 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8204 OrdersType Order(Sz, Sz);
8205 Order[Idx] = 0;
8206 return std::move(Order);
8207 }
8208 }
8209 }
8210 if (isSplat(TE.Scalars))
8211 return std::nullopt;
8212 if (TE.Scalars.size() >= 3)
8213 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8214 return Order;
8215 // Check if the order of vectorized loads can be included. For masked gathers,
8216 // do extra analysis later, so add such nodes to a special list.
8217 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8218 SmallVector<Value *> PointerOps;
8219 StridedPtrInfo SPtrInfo;
8220 OrdersType CurrentOrder;
8221 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
8222 CurrentOrder, PointerOps, SPtrInfo);
8223 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
8224 Res == LoadsState::CompressVectorize)
8225 return std::move(CurrentOrder);
8226 }
8227 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
8228 // has been audited for correctness with non-power-of-two vectors.
8229 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
8230 if (std::optional<OrdersType> CurrentOrder =
8231 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8232 return CurrentOrder;
8233 }
8234 return std::nullopt;
8235}
8236
8237/// Checks if the given mask is a "clustered" mask with the same clusters of
8238/// size \p Sz, which are not identity submasks.
8239 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
8240 unsigned Sz) {
8241 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
8242 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
8243 return false;
8244 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8245 ArrayRef<int> Cluster = Mask.slice(I, Sz);
8246 if (Cluster != FirstCluster)
8247 return false;
8248 }
8249 return true;
8250}
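// Examples for isRepeatedNonIdentityClusteredMask above (Sz == 4,
// illustrative only):
//   <1,0,3,2, 1,0,3,2> -> true  (clusters match, first one is not identity)
//   <0,1,2,3, 0,1,2,3> -> false (first cluster is an identity mask)
//   <1,0,3,2, 3,2,1,0> -> false (clusters differ)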
8251
8252void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8253 // Reorder reuses mask.
8254 reorderReuses(TE.ReuseShuffleIndices, Mask);
8255 const unsigned Sz = TE.Scalars.size();
8256 // For vectorized nodes and non-clustered reuses there is nothing else to do.
8257 if (!TE.isGather() ||
8258 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8259 Sz) ||
8260 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8261 return;
8262 SmallVector<int> NewMask;
8263 inversePermutation(TE.ReorderIndices, NewMask);
8264 addMask(NewMask, TE.ReuseShuffleIndices);
8265 // Clear reorder since it is going to be applied to the new mask.
8266 TE.ReorderIndices.clear();
8267 // Try to improve gathered nodes with clustered reuses, if possible.
8268 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8269 SmallVector<unsigned> NewOrder(Slice);
8270 inversePermutation(NewOrder, NewMask);
8271 reorderScalars(TE.Scalars, NewMask);
8272 // Fill the reuses mask with the identity submasks.
8273 for (auto *It = TE.ReuseShuffleIndices.begin(),
8274 *End = TE.ReuseShuffleIndices.end();
8275 It != End; std::advance(It, Sz))
8276 std::iota(It, std::next(It, Sz), 0);
8277}
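// Illustrative effect of reorderNodeWithReuses above (assumed gather node and
// an identity Mask): for Scalars = {a, b} with ReuseShuffleIndices =
// {1, 0, 1, 0}, the reuse mask is clustered and non-identity, so the scalars
// are swapped to {b, a} and the reuse mask is rewritten to the identity
// submasks {0, 1, 0, 1}.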
8278
8279 static void combineOrders(MutableArrayRef<unsigned> Order,
8280 ArrayRef<unsigned> SecondaryOrder) {
8281 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8282 "Expected same size of orders");
8283 size_t Sz = Order.size();
8284 SmallBitVector UsedIndices(Sz);
8285 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8286 if (Order[Idx] != Sz)
8287 UsedIndices.set(Order[Idx]);
8288 }
8289 if (SecondaryOrder.empty()) {
8290 for (unsigned Idx : seq<unsigned>(0, Sz))
8291 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8292 Order[Idx] = Idx;
8293 } else {
8294 for (unsigned Idx : seq<unsigned>(0, Sz))
8295 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8296 !UsedIndices.test(SecondaryOrder[Idx]))
8297 Order[Idx] = SecondaryOrder[Idx];
8298 }
8299}
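// Worked example for combineOrders above (illustrative; Sz == 4 and the value
// 4 marks an unset slot): for Order = {0, 4, 2, 4} the used indices are
// {0, 2}. With an empty SecondaryOrder the unset slots fall back to their own
// positions, giving {0, 1, 2, 3}; with SecondaryOrder = {0, 3, 2, 1} they take
// the still-unused suggestions 3 and 1 instead, giving {0, 3, 2, 1}.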
8300
8303 return false;
8304
8305 constexpr unsigned TinyVF = 2;
8306 constexpr unsigned TinyTree = 10;
8307 constexpr unsigned PhiOpsLimit = 12;
8308 constexpr unsigned GatherLoadsLimit = 2;
8309 if (VectorizableTree.size() <= TinyTree)
8310 return true;
8311 if (VectorizableTree.front()->hasState() &&
8312 !VectorizableTree.front()->isGather() &&
8313 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8314 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8315 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8316 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8317 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8318 VectorizableTree.front()->ReorderIndices.empty()) {
8319 // Check if the tree has only a single store and a single (unordered) load
8320 // node, while the other nodes are phis or geps/binops combined with phis,
8321 // and/or a single gather load node.
8322 if (VectorizableTree.front()->hasState() &&
8323 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8324 VectorizableTree.front()->Scalars.size() == TinyVF &&
8325 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8326 return false;
8327 // Single node which requires reordering - skip.
8328 if (VectorizableTree.front()->hasState() &&
8329 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8330 VectorizableTree.front()->ReorderIndices.empty()) {
8331 const unsigned ReorderedSplitsCnt =
8332 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8333 return TE->State == TreeEntry::SplitVectorize &&
8334 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8335 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8336 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8337 });
8338 if (ReorderedSplitsCnt <= 1 &&
8339 static_cast<unsigned>(count_if(
8340 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8341 return ((!TE->isGather() &&
8342 (TE->ReorderIndices.empty() ||
8343 (TE->UserTreeIndex.UserTE &&
8344 TE->UserTreeIndex.UserTE->State ==
8345 TreeEntry::Vectorize &&
8346 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8347 .empty()))) ||
8348 (TE->isGather() && TE->ReorderIndices.empty() &&
8349 (!TE->hasState() || TE->isAltShuffle() ||
8350 TE->getOpcode() == Instruction::Load ||
8351 TE->getOpcode() == Instruction::ZExt ||
8352 TE->getOpcode() == Instruction::SExt))) &&
8353 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8354 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8355 return !isConstant(V) && isVectorized(V);
8356 }));
8357 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8358 return false;
8359 }
8360 bool HasPhis = false;
8361 bool HasLoad = true;
8362 unsigned GatherLoads = 0;
8363 for (const std::unique_ptr<TreeEntry> &TE :
8364 ArrayRef(VectorizableTree).drop_front()) {
8365 if (TE->State == TreeEntry::SplitVectorize)
8366 continue;
8367 if (!TE->hasState()) {
8368 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8370 continue;
8371 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8373 continue;
8374 return true;
8375 }
8376 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8377 if (!TE->isGather()) {
8378 HasLoad = false;
8379 continue;
8380 }
8381 if (HasLoad)
8382 return true;
8383 ++GatherLoads;
8384 if (GatherLoads >= GatherLoadsLimit)
8385 return true;
8386 }
8387 if (TE->getOpcode() == Instruction::GetElementPtr ||
8388 Instruction::isBinaryOp(TE->getOpcode()))
8389 continue;
8390 if (TE->getOpcode() != Instruction::PHI &&
8391 (!TE->hasCopyableElements() ||
8392 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8393 TE->Scalars.size() / 2))
8394 return true;
8395 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8396 TE->getNumOperands() > PhiOpsLimit)
8397 return false;
8398 HasPhis = true;
8399 }
8400 return !HasPhis;
8401 }
8402 return true;
8403}
8404
8405void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8406 ArrayRef<int> MaskOrder) {
8407 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8408 SmallVector<int> NewMask(getVectorFactor());
8409 SmallVector<int> NewMaskOrder(getVectorFactor());
8410 std::iota(NewMask.begin(), NewMask.end(), 0);
8411 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8412 if (Idx == 0) {
8413 copy(Mask, NewMask.begin());
8414 copy(MaskOrder, NewMaskOrder.begin());
8415 } else {
8416 assert(Idx == 1 && "Expected either 0 or 1 index.");
8417 unsigned Offset = CombinedEntriesWithIndices.back().second;
8418 for (unsigned I : seq<unsigned>(Mask.size())) {
8419 NewMask[I + Offset] = Mask[I] + Offset;
8420 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8421 }
8422 }
8423 reorderScalars(Scalars, NewMask);
8424 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8425 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8426 ReorderIndices.clear();
8427}
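// Illustrative example for reorderSplitNode above (assumed split layout): for
// a split node with getVectorFactor() == 8 whose second operand starts at
// offset 4, reordering operand Idx == 1 with Mask = {1, 0, 3, 2} only rewrites
// lanes 4..7: NewMask becomes {0, 1, 2, 3, 5, 4, 7, 6} while the lanes of the
// first operand keep the identity mapping.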
8428
8429 void BoUpSLP::reorderTopToBottom() {
8430 // Maps VF to the graph nodes.
8431 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8432 // ExtractElement gather nodes which can be vectorized and need to handle
8433 // their ordering.
8434 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8435
8436 // Phi nodes can have preferred ordering based on their result users
8437 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8438
8439 // AltShuffles can also have a preferred ordering that leads to fewer
8440 // instructions, e.g., the addsub instruction in x86.
8441 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8442
8443 // Maps a TreeEntry to the reorder indices of external users.
8444 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8445 ExternalUserReorderMap;
8446 // Find all reorderable nodes with the given VF.
8447 // Currently these are vectorized stores, loads, extracts + some gathering
8448 // of extracts.
8449 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8450 const std::unique_ptr<TreeEntry> &TE) {
8451 // Look for external users that will probably be vectorized.
8452 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8453 findExternalStoreUsersReorderIndices(TE.get());
8454 if (!ExternalUserReorderIndices.empty()) {
8455 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8456 ExternalUserReorderMap.try_emplace(TE.get(),
8457 std::move(ExternalUserReorderIndices));
8458 }
8459
8460 // Patterns like [fadd,fsub] can be combined into a single instruction in
8461 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8462 // to take into account their order when looking for the most used order.
8463 if (TE->hasState() && TE->isAltShuffle() &&
8464 TE->State != TreeEntry::SplitVectorize) {
8465 Type *ScalarTy = TE->Scalars[0]->getType();
8466 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8467 unsigned Opcode0 = TE->getOpcode();
8468 unsigned Opcode1 = TE->getAltOpcode();
8469 SmallBitVector OpcodeMask(
8470 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8471 // If this pattern is supported by the target then we consider the order.
8472 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8473 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8474 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8475 }
8476 // TODO: Check the reverse order too.
8477 }
8478
8479 bool IgnoreReorder =
8480 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8481 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8482 VectorizableTree.front()->getOpcode() == Instruction::Store);
8483 if (std::optional<OrdersType> CurrentOrder =
8484 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8485 // Do not include ordering for nodes used in the alt opcode vectorization;
8486 // it is better to reorder them during the bottom-to-top stage. If we follow
8487 // the order here, it causes reordering of the whole graph, though actually
8488 // it is profitable just to reorder the subgraph that starts from the
8489 // alternate opcode vectorization node. Such nodes already end up with a
8490 // shuffle instruction and it is enough to change this shuffle rather than
8491 // rotate the scalars for the whole graph.
8492 unsigned Cnt = 0;
8493 const TreeEntry *UserTE = TE.get();
8494 while (UserTE && Cnt < RecursionMaxDepth) {
8495 if (!UserTE->UserTreeIndex)
8496 break;
8497 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8498 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8499 UserTE->UserTreeIndex.UserTE->Idx != 0)
8500 return;
8501 UserTE = UserTE->UserTreeIndex.UserTE;
8502 ++Cnt;
8503 }
8504 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8505 if (!(TE->State == TreeEntry::Vectorize ||
8506 TE->State == TreeEntry::StridedVectorize ||
8507 TE->State == TreeEntry::SplitVectorize ||
8508 TE->State == TreeEntry::CompressVectorize) ||
8509 !TE->ReuseShuffleIndices.empty())
8510 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8511 if (TE->State == TreeEntry::Vectorize &&
8512 TE->getOpcode() == Instruction::PHI)
8513 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8514 }
8515 });
8516
8517 // Reorder the graph nodes according to their vectorization factor.
8518 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8519 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8520 auto It = VFToOrderedEntries.find(VF);
8521 if (It == VFToOrderedEntries.end())
8522 continue;
8523 // Try to find the most profitable order. We are just looking for the most
8524 // used order and reorder the scalar elements in the nodes according to this
8525 // most used order.
8526 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8527 // Delete VF entry upon exit.
8528 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(It); });
8529
8530 // All operands are reordered and used only in this node - propagate the
8531 // most used order to the user node.
8532 MapVector<OrdersType, unsigned,
8533 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8534 OrdersUses;
8535 for (const TreeEntry *OpTE : OrderedEntries) {
8536 // No need to reorder these nodes; we still need to extend and use a shuffle,
8537 // just merging the reordering shuffle and the reuse shuffle.
8538 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8539 OpTE->State != TreeEntry::SplitVectorize)
8540 continue;
8541 // Count number of orders uses.
8542 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8543 &PhisToOrders]() -> const OrdersType & {
8544 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8545 auto It = GathersToOrders.find(OpTE);
8546 if (It != GathersToOrders.end())
8547 return It->second;
8548 }
8549 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8550 auto It = AltShufflesToOrders.find(OpTE);
8551 if (It != AltShufflesToOrders.end())
8552 return It->second;
8553 }
8554 if (OpTE->State == TreeEntry::Vectorize &&
8555 OpTE->getOpcode() == Instruction::PHI) {
8556 auto It = PhisToOrders.find(OpTE);
8557 if (It != PhisToOrders.end())
8558 return It->second;
8559 }
8560 return OpTE->ReorderIndices;
8561 }();
8562 // First consider the order of the external scalar users.
8563 auto It = ExternalUserReorderMap.find(OpTE);
8564 if (It != ExternalUserReorderMap.end()) {
8565 const auto &ExternalUserReorderIndices = It->second;
8566 // If the OpTE vector factor != number of scalars - use the natural order;
8567 // this is an attempt to reorder a node with reused scalars but with
8568 // external uses.
8569 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8570 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8571 ExternalUserReorderIndices.size();
8572 } else {
8573 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8574 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8575 }
8576 // No other useful reorder data in this entry.
8577 if (Order.empty())
8578 continue;
8579 }
8580 // Stores actually store the mask, not the order, need to invert.
8581 if (OpTE->State == TreeEntry::Vectorize &&
8582 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8583 assert(!OpTE->isAltShuffle() &&
8584 "Alternate instructions are only supported by BinaryOperator "
8585 "and CastInst.");
8586 SmallVector<int> Mask;
8587 inversePermutation(Order, Mask);
8588 unsigned E = Order.size();
8589 OrdersType CurrentOrder(E, E);
8590 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8591 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8592 });
8593 fixupOrderingIndices(CurrentOrder);
8594 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8595 } else {
8596 ++OrdersUses.try_emplace(Order, 0).first->second;
8597 }
8598 }
8599 if (OrdersUses.empty())
8600 continue;
8601 // Choose the most used order.
8602 unsigned IdentityCnt = 0;
8603 unsigned FilledIdentityCnt = 0;
8604 OrdersType IdentityOrder(VF, VF);
8605 for (auto &Pair : OrdersUses) {
8606 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8607 if (!Pair.first.empty())
8608 FilledIdentityCnt += Pair.second;
8609 IdentityCnt += Pair.second;
8610 combineOrders(IdentityOrder, Pair.first);
8611 }
8612 }
8613 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8614 unsigned Cnt = IdentityCnt;
8615 for (auto &Pair : OrdersUses) {
8616 // Prefer the identity order. But if a filled identity (non-empty order) was
8617 // found with the same number of uses as the new candidate order, we can
8618 // choose this candidate order.
8619 if (Cnt < Pair.second ||
8620 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8621 Cnt == Pair.second && !BestOrder.empty() &&
8622 isIdentityOrder(BestOrder))) {
8623 combineOrders(Pair.first, BestOrder);
8624 BestOrder = Pair.first;
8625 Cnt = Pair.second;
8626 } else {
8627 combineOrders(BestOrder, Pair.first);
8628 }
8629 }
8630 // Set order of the user node.
8631 if (isIdentityOrder(BestOrder))
8632 continue;
8633 fixupOrderingIndices(BestOrder);
8634 SmallVector<int> Mask;
8635 inversePermutation(BestOrder, Mask);
8636 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8637 unsigned E = BestOrder.size();
8638 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8639 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8640 });
8641 // Do an actual reordering, if profitable.
8642 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8643 // Just do the reordering for the nodes with the given VF.
8644 if (TE->Scalars.size() != VF) {
8645 if (TE->ReuseShuffleIndices.size() == VF) {
8646 assert(TE->State != TreeEntry::SplitVectorize &&
8647 "Split vectorized not expected.");
8648 // Need to reorder the reuses masks of the operands with smaller VF to
8649 // be able to find the match between the graph nodes and scalar
8650 // operands of the given node during vectorization/cost estimation.
8651 assert(
8652 (!TE->UserTreeIndex ||
8653 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8654 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8655 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8656 "All users must be of VF size.");
8657 if (SLPReVec) {
8658 assert(SLPReVec && "Only supported by REVEC.");
8659 // ShuffleVectorInst does not do reorderOperands (and it should not
8660 // because ShuffleVectorInst supports only a limited set of
8661 // patterns). Only do reorderNodeWithReuses if the user is not
8662 // ShuffleVectorInst.
8663 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8664 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8665 continue;
8666 }
8667 // Update ordering of the operands with the smaller VF than the given
8668 // one.
8669 reorderNodeWithReuses(*TE, Mask);
8670 // Update orders in user split vectorize nodes.
8671 if (TE->UserTreeIndex &&
8672 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8673 TE->UserTreeIndex.UserTE->reorderSplitNode(
8674 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8675 }
8676 continue;
8677 }
8678 if ((TE->State == TreeEntry::SplitVectorize &&
8679 TE->ReuseShuffleIndices.empty()) ||
8680 ((TE->State == TreeEntry::Vectorize ||
8681 TE->State == TreeEntry::StridedVectorize ||
8682 TE->State == TreeEntry::CompressVectorize) &&
8683 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8684 InsertElementInst>(TE->getMainOp()) ||
8685 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8686 assert(
8687 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8688 TE->ReuseShuffleIndices.empty())) &&
8689 "Alternate instructions are only supported by BinaryOperator "
8690 "and CastInst.");
8691 // Build correct orders for extract{element,value}, loads,
8692 // stores and alternate (split) nodes.
8693 reorderOrder(TE->ReorderIndices, Mask);
8694 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8695 TE->reorderOperands(Mask);
8696 } else {
8697 // Reorder the node and its operands.
8698 TE->reorderOperands(Mask);
8699 assert(TE->ReorderIndices.empty() &&
8700 "Expected empty reorder sequence.");
8701 reorderScalars(TE->Scalars, Mask);
8702 }
8703 if (!TE->ReuseShuffleIndices.empty()) {
8704 // Apply reversed order to keep the original ordering of the reused
8705 // elements to avoid extra reorder indices shuffling.
8706 OrdersType CurrentOrder;
8707 reorderOrder(CurrentOrder, MaskOrder);
8708 SmallVector<int> NewReuses;
8709 inversePermutation(CurrentOrder, NewReuses);
8710 addMask(NewReuses, TE->ReuseShuffleIndices);
8711 TE->ReuseShuffleIndices.swap(NewReuses);
8712 } else if (TE->UserTreeIndex &&
8713 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8714 // Update orders in user split vectorize nodes.
8715 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8716 Mask, MaskOrder);
8717 }
8718 }
8719}
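// Illustrative vote for reorderTopToBottom above (assumed counts): if, for
// VF == 4, two entries request the order {1, 0, 3, 2} and one external store
// user requests the identity order, OrdersUses ends up as
// {identity: 1, {1,0,3,2}: 2}. The non-identity order wins, the mask
// <1,0,3,2> is built via inversePermutation, and every node with this VF is
// reordered (or has its reorder/reuse indices updated) accordingly.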
8720
8721void BoUpSLP::buildReorderableOperands(
8722 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8723 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8724 SmallVectorImpl<TreeEntry *> &GatherOps) {
8725 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8726 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8727 return OpData.first == I &&
8728 (OpData.second->State == TreeEntry::Vectorize ||
8729 OpData.second->State == TreeEntry::StridedVectorize ||
8730 OpData.second->State == TreeEntry::CompressVectorize ||
8731 OpData.second->State == TreeEntry::SplitVectorize);
8732 }))
8733 continue;
8734 // Do not request operands, if they do not exist.
8735 if (UserTE->hasState()) {
8736 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8737 UserTE->getOpcode() == Instruction::ExtractValue)
8738 continue;
8739 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8740 continue;
8741 if (UserTE->getOpcode() == Instruction::Store &&
8742 UserTE->State == TreeEntry::Vectorize && I == 1)
8743 continue;
8744 if (UserTE->getOpcode() == Instruction::Load &&
8745 (UserTE->State == TreeEntry::Vectorize ||
8746 UserTE->State == TreeEntry::StridedVectorize ||
8747 UserTE->State == TreeEntry::CompressVectorize))
8748 continue;
8749 }
8750 TreeEntry *TE = getOperandEntry(UserTE, I);
8751 assert(TE && "Expected operand entry.");
8752 if (!TE->isGather()) {
8753 // Add the node to the list of the ordered nodes with the identity
8754 // order.
8755 Edges.emplace_back(I, TE);
8756 // Add ScatterVectorize nodes to the list of operands, where just
8757 // reordering of the scalars is required. Similar to the gathers, so
8758 // simply add to the list of gathered ops.
8759 // If there are reused scalars, process this node as a regular vectorize
8760 // node, just reorder reuses mask.
8761 if (TE->State == TreeEntry::ScatterVectorize &&
8762 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8763 GatherOps.push_back(TE);
8764 continue;
8765 }
8766 if (ReorderableGathers.contains(TE))
8767 GatherOps.push_back(TE);
8768 }
8769}
8770
8771void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8772 struct TreeEntryCompare {
8773 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8774 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8775 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8776 return LHS->Idx < RHS->Idx;
8777 }
8778 };
8779 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8780 DenseSet<const TreeEntry *> GathersToOrders;
8781 // Find all reorderable leaf nodes with the given VF.
8782 // Currently these are vectorized loads, extracts without alternate operands
8783 // + some gathering of extracts.
8784 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8785 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8786 if (TE->State != TreeEntry::Vectorize &&
8787 TE->State != TreeEntry::StridedVectorize &&
8788 TE->State != TreeEntry::CompressVectorize &&
8789 TE->State != TreeEntry::SplitVectorize)
8790 NonVectorized.insert(TE.get());
8791 if (std::optional<OrdersType> CurrentOrder =
8792 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8793 Queue.push(TE.get());
8794 if (!(TE->State == TreeEntry::Vectorize ||
8795 TE->State == TreeEntry::StridedVectorize ||
8796 TE->State == TreeEntry::CompressVectorize ||
8797 TE->State == TreeEntry::SplitVectorize) ||
8798 !TE->ReuseShuffleIndices.empty())
8799 GathersToOrders.insert(TE.get());
8800 }
8801 }
8802
8803 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8804 // I.e., if the node has operands, that are reordered, try to make at least
8805 // one operand order in the natural order and reorder others + reorder the
8806 // user node itself.
8807 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8808 while (!Queue.empty()) {
8809 // 1. Filter out only reordered nodes.
8810 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8811 TreeEntry *TE = Queue.top();
8812 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8813 Queue.pop();
8814 SmallVector<TreeEntry *> OrderedOps(1, TE);
8815 while (!Queue.empty()) {
8816 TE = Queue.top();
8817 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8818 break;
8819 Queue.pop();
8820 OrderedOps.push_back(TE);
8821 }
8822 for (TreeEntry *TE : OrderedOps) {
8823 if (!(TE->State == TreeEntry::Vectorize ||
8824 TE->State == TreeEntry::StridedVectorize ||
8825 TE->State == TreeEntry::CompressVectorize ||
8826 TE->State == TreeEntry::SplitVectorize ||
8827 (TE->isGather() && GathersToOrders.contains(TE))) ||
8828 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8829 !Visited.insert(TE).second)
8830 continue;
8831 // Build a map between the user nodes and the order of their operands to speed
8832 // up the search. The graph currently does not provide this dependency directly.
8833 Users.first = TE->UserTreeIndex.UserTE;
8834 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8835 }
8836 if (Users.first) {
8837 auto &Data = Users;
8838 if (Data.first->State == TreeEntry::SplitVectorize) {
8839 assert(
8840 Data.second.size() <= 2 &&
8841 "Expected not greater than 2 operands for split vectorize node.");
8842 if (any_of(Data.second,
8843 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8844 continue;
8845 // Update orders in user split vectorize nodes.
8846 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8847 "Expected exactly 2 entries.");
8848 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8849 TreeEntry &OpTE = *VectorizableTree[P.first];
8850 OrdersType Order = OpTE.ReorderIndices;
8851 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8852 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8853 continue;
8854 const auto BestOrder =
8855 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8856 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8857 continue;
8858 Order = *BestOrder;
8859 }
8860 fixupOrderingIndices(Order);
8861 SmallVector<int> Mask;
8862 inversePermutation(Order, Mask);
8863 const unsigned E = Order.size();
8864 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8865 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8866 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8867 });
8868 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8869 // Clear ordering of the operand.
8870 if (!OpTE.ReorderIndices.empty()) {
8871 OpTE.ReorderIndices.clear();
8872 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8873 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8874 } else {
8875 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8876 reorderScalars(OpTE.Scalars, Mask);
8877 }
8878 }
8879 if (Data.first->ReuseShuffleIndices.empty() &&
8880 !Data.first->ReorderIndices.empty()) {
8881 // Insert user node to the list to try to sink reordering deeper in
8882 // the graph.
8883 Queue.push(Data.first);
8884 }
8885 continue;
8886 }
8887 // Check that operands are used only in the User node.
8888 SmallVector<TreeEntry *> GatherOps;
8889 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8890 GatherOps);
8891 // All operands are reordered and used only in this node - propagate the
8892 // most used order to the user node.
8893 MapVector<OrdersType, unsigned,
8894 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8895 OrdersUses;
8896 // Do the analysis for each tree entry only once, otherwise the order of
8897 // the same node may be considered several times, though it might not be
8898 // profitable.
8899 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8900 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8901 for (const auto &Op : Data.second) {
8902 TreeEntry *OpTE = Op.second;
8903 if (!VisitedOps.insert(OpTE).second)
8904 continue;
8905 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8906 continue;
8907 const auto Order = [&]() -> const OrdersType {
8908 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8909 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8910 IgnoreReorder)
8911 .value_or(OrdersType(1));
8912 return OpTE->ReorderIndices;
8913 }();
8914 // The order is partially ordered, skip it in favor of fully non-ordered
8915 // orders.
8916 if (Order.size() == 1)
8917 continue;
8918
8919 // Check that the reordering does not increase the number of shuffles, i.e.
8920 // same-values nodes have the same parents or their parents have the same parents.
8921 if (!Order.empty() && !isIdentityOrder(Order)) {
8922 Value *Root = OpTE->hasState()
8923 ? OpTE->getMainOp()
8924 : *find_if_not(OpTE->Scalars, isConstant);
8925 auto GetSameNodesUsers = [&](Value *Root) {
8926 SmallSetVector<TreeEntry *, 4> Res;
8927 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8928 if (TE != OpTE && TE->UserTreeIndex &&
8929 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8930 TE->Scalars.size() == OpTE->Scalars.size() &&
8931 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8932 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8933 Res.insert(TE->UserTreeIndex.UserTE);
8934 }
8935 for (const TreeEntry *TE : getTreeEntries(Root)) {
8936 if (TE != OpTE && TE->UserTreeIndex &&
8937 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8938 TE->Scalars.size() == OpTE->Scalars.size() &&
8939 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8940 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8941 Res.insert(TE->UserTreeIndex.UserTE);
8942 }
8943 return Res.takeVector();
8944 };
8945 auto GetNumOperands = [](const TreeEntry *TE) {
8946 if (TE->State == TreeEntry::SplitVectorize)
8947 return TE->getNumOperands();
8948 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8949 return CI->arg_size();
8950 return TE->getNumOperands();
8951 };
8952 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8953 const TreeEntry *TE) {
8955 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8957 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8960 continue;
8961 const TreeEntry *Op = getOperandEntry(TE, Idx);
8962 if (Op->isGather() && Op->hasState()) {
8963 const TreeEntry *VecOp =
8964 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8965 if (VecOp)
8966 Op = VecOp;
8967 }
8968 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8969 return false;
8970 }
8971 return true;
8972 };
8973 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8974 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8975 if (!RevisitedOps.insert(UTE).second)
8976 return false;
8977 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8978 !UTE->ReuseShuffleIndices.empty() ||
8979 (UTE->UserTreeIndex &&
8980 UTE->UserTreeIndex.UserTE == Data.first) ||
8981 (Data.first->UserTreeIndex &&
8982 Data.first->UserTreeIndex.UserTE == UTE) ||
8983 (IgnoreReorder && UTE->UserTreeIndex &&
8984 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8985 NodeShouldBeReorderedWithOperands(UTE);
8986 }))
8987 continue;
8988 for (TreeEntry *UTE : Users) {
8990 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8992 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8995 continue;
8996 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8997 Visited.erase(Op);
8998 Queue.push(const_cast<TreeEntry *>(Op));
8999 }
9000 }
9001 }
9002 unsigned NumOps = count_if(
9003 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9004 return P.second == OpTE;
9005 });
9006 // Stores actually store the mask, not the order, need to invert.
9007 if (OpTE->State == TreeEntry::Vectorize &&
9008 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9009 assert(!OpTE->isAltShuffle() &&
9010 "Alternate instructions are only supported by BinaryOperator "
9011 "and CastInst.");
9012 SmallVector<int> Mask;
9013 inversePermutation(Order, Mask);
9014 unsigned E = Order.size();
9015 OrdersType CurrentOrder(E, E);
9016 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9017 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9018 });
9019 fixupOrderingIndices(CurrentOrder);
9020 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9021 } else {
9022 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9023 }
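// Illustrative example (editor's sketch, assuming the usual
// inversePermutation() semantics Mask[Order[I]] = I, which is not shown here):
// for a store node with Order = {2, 0, 1} the inverted Mask is {1, 2, 0}, and
// the CurrentOrder counted in OrdersUses becomes {1, 2, 0}, i.e. the inverse
// of the node's order, matching the comment above.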
9024 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9025 const auto AllowsReordering = [&](const TreeEntry *TE) {
9026 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9027 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9028 (IgnoreReorder && TE->Idx == 0))
9029 return true;
9030 if (TE->isGather()) {
9031 if (GathersToOrders.contains(TE))
9032 return !getReorderingData(*TE, /*TopToBottom=*/false,
9033 IgnoreReorder)
9034 .value_or(OrdersType(1))
9035 .empty();
9036 return true;
9037 }
9038 return false;
9039 };
9040 if (OpTE->UserTreeIndex) {
9041 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9042 if (!VisitedUsers.insert(UserTE).second)
9043 continue;
9044 // May reorder user node if it requires reordering, has reused
9045 // scalars, is an alternate op vectorize node or its op nodes require
9046 // reordering.
9047 if (AllowsReordering(UserTE))
9048 continue;
9049 // Check if users allow reordering.
9050 // Currently look up just 1 level of operands to avoid an increase in
9051 // compile time.
9052 // It is profitable to reorder if definitely more operands allow
9053 // reordering than operands with the natural order.
9055 if (static_cast<unsigned>(count_if(
9056 Ops, [UserTE, &AllowsReordering](
9057 const std::pair<unsigned, TreeEntry *> &Op) {
9058 return AllowsReordering(Op.second) &&
9059 Op.second->UserTreeIndex.UserTE == UserTE;
9060 })) <= Ops.size() / 2)
9061 ++Res.first->second;
9062 }
9063 }
9064 if (OrdersUses.empty()) {
9065 Visited.insert_range(llvm::make_second_range(Data.second));
9066 continue;
9067 }
9068 // Choose the most used order.
9069 unsigned IdentityCnt = 0;
9070 unsigned VF = Data.second.front().second->getVectorFactor();
9071 OrdersType IdentityOrder(VF, VF);
9072 for (auto &Pair : OrdersUses) {
9073 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9074 IdentityCnt += Pair.second;
9075 combineOrders(IdentityOrder, Pair.first);
9076 }
9077 }
9078 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9079 unsigned Cnt = IdentityCnt;
9080 for (auto &Pair : OrdersUses) {
9081 // Prefer the identity order. But if a filled identity (non-empty
9082 // order) is found with the same number of uses as the new candidate
9083 // order, we can choose this candidate order.
9084 if (Cnt < Pair.second) {
9085 combineOrders(Pair.first, BestOrder);
9086 BestOrder = Pair.first;
9087 Cnt = Pair.second;
9088 } else {
9089 combineOrders(BestOrder, Pair.first);
9090 }
9091 }
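// Illustrative example (editor's sketch): if OrdersUses holds
// { identity : 2 uses, {1, 0, 3, 2} : 3 uses }, the loop above selects
// {1, 0, 3, 2} because 2 < 3; on an exact tie the identity order is kept,
// since BestOrder is only replaced for a strictly greater use count.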
9092 // Set order of the user node.
9093 if (isIdentityOrder(BestOrder)) {
9094 Visited.insert_range(llvm::make_second_range(Data.second));
9095 continue;
9096 }
9097 fixupOrderingIndices(BestOrder);
9098 // Erase operands from OrderedEntries list and adjust their orders.
9099 VisitedOps.clear();
9100 SmallVector<int> Mask;
9101 inversePermutation(BestOrder, Mask);
9102 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9103 unsigned E = BestOrder.size();
9104 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9105 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9106 });
9107 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9108 TreeEntry *TE = Op.second;
9109 if (!VisitedOps.insert(TE).second)
9110 continue;
9111 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9112 reorderNodeWithReuses(*TE, Mask);
9113 continue;
9114 }
9115 // Gathers are processed separately.
9116 if (TE->State != TreeEntry::Vectorize &&
9117 TE->State != TreeEntry::StridedVectorize &&
9118 TE->State != TreeEntry::CompressVectorize &&
9119 TE->State != TreeEntry::SplitVectorize &&
9120 (TE->State != TreeEntry::ScatterVectorize ||
9121 TE->ReorderIndices.empty()))
9122 continue;
9123 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9124 TE->ReorderIndices.empty()) &&
9125 "Non-matching sizes of user/operand entries.");
9126 reorderOrder(TE->ReorderIndices, Mask);
9127 if (IgnoreReorder && TE == VectorizableTree.front().get())
9128 IgnoreReorder = false;
9129 }
9130 // For gathers we just need to reorder their scalars.
9131 for (TreeEntry *Gather : GatherOps) {
9132 assert(Gather->ReorderIndices.empty() &&
9133 "Unexpected reordering of gathers.");
9134 if (!Gather->ReuseShuffleIndices.empty()) {
9135 // Just reorder reuses indices.
9136 reorderReuses(Gather->ReuseShuffleIndices, Mask);
9137 continue;
9138 }
9139 reorderScalars(Gather->Scalars, Mask);
9140 Visited.insert(Gather);
9141 }
9142 // Reorder operands of the user node and set the ordering for the user
9143 // node itself.
9144 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9145 return TE.isAltShuffle() &&
9146 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9147 TE.ReorderIndices.empty());
9148 };
9149 if (Data.first->State != TreeEntry::Vectorize ||
9151 Data.first->getMainOp()) ||
9152 IsNotProfitableAltCodeNode(*Data.first))
9153 Data.first->reorderOperands(Mask);
9154 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
9155 IsNotProfitableAltCodeNode(*Data.first) ||
9156 Data.first->State == TreeEntry::StridedVectorize ||
9157 Data.first->State == TreeEntry::CompressVectorize) {
9158 reorderScalars(Data.first->Scalars, Mask);
9159 reorderOrder(Data.first->ReorderIndices, MaskOrder,
9160 /*BottomOrder=*/true);
9161 if (Data.first->ReuseShuffleIndices.empty() &&
9162 !Data.first->ReorderIndices.empty() &&
9163 !IsNotProfitableAltCodeNode(*Data.first)) {
9164 // Insert user node to the list to try to sink reordering deeper in
9165 // the graph.
9166 Queue.push(Data.first);
9167 }
9168 } else {
9169 reorderOrder(Data.first->ReorderIndices, Mask);
9170 }
9171 }
9172 }
9173 // If the reordering is unnecessary, just remove the reorder.
9174 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9175 VectorizableTree.front()->ReuseShuffleIndices.empty())
9176 VectorizableTree.front()->ReorderIndices.clear();
9177}
9178
9179Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9180 if (Entry.hasState() &&
9181 (Entry.getOpcode() == Instruction::Store ||
9182 Entry.getOpcode() == Instruction::Load) &&
9183 Entry.State == TreeEntry::StridedVectorize &&
9184 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9185 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
9186 return dyn_cast<Instruction>(Entry.Scalars.front());
9187}
9188
9189void BoUpSLP::buildExternalUses(
9190 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
9191 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9192 DenseMap<Value *, unsigned> ScalarToExtUses;
9193 // Collect the values that we need to extract from the tree.
9194 for (auto &TEPtr : VectorizableTree) {
9195 TreeEntry *Entry = TEPtr.get();
9196
9197 // No need to handle users of gathered values.
9198 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9199 DeletedNodes.contains(Entry) ||
9200 TransformedToGatherNodes.contains(Entry))
9201 continue;
9202
9203 // For each lane:
9204 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9205 Value *Scalar = Entry->Scalars[Lane];
9206 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
9207 continue;
9208
9209 // All uses must be replaced already? No need to do it again.
9210 auto It = ScalarToExtUses.find(Scalar);
9211 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9212 continue;
9213
9214 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9215 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9216 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9217 << " from " << *Scalar << "for many users.\n");
9218 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9219 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9220 ExternalUsesWithNonUsers.insert(Scalar);
9221 continue;
9222 }
9223
9224 // Check if the scalar is externally used as an extra arg.
9225 const auto ExtI = ExternallyUsedValues.find(Scalar);
9226 if (ExtI != ExternallyUsedValues.end()) {
9227 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9228 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9229 << FoundLane << " from " << *Scalar << ".\n");
9230 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9231 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9232 continue;
9233 }
9234 for (User *U : Scalar->users()) {
9235 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9236
9237 Instruction *UserInst = dyn_cast<Instruction>(U);
9238 if (!UserInst || isDeleted(UserInst))
9239 continue;
9240
9241 // Ignore users in the user ignore list.
9242 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9243 continue;
9244
9245 // Skip in-tree scalars that become vectors
9246 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
9247 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9248 return !DeletedNodes.contains(UseEntry) &&
9249 !TransformedToGatherNodes.contains(UseEntry);
9250 })) {
9251 // Some in-tree scalars will remain as scalar in vectorized
9252 // instructions. If that is the case, the one in FoundLane will
9253 // be used.
9254 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9255 isa<LoadInst, StoreInst>(UserInst)) ||
9256 isa<CallInst>(UserInst)) ||
9257 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9258 if (DeletedNodes.contains(UseEntry) ||
9259 TransformedToGatherNodes.contains(UseEntry))
9260 return true;
9261 return UseEntry->State == TreeEntry::ScatterVectorize ||
9263 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9264 TTI);
9265 })) {
9266 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9267 << ".\n");
9268 assert(none_of(UseEntries,
9269 [](TreeEntry *UseEntry) {
9270 return UseEntry->isGather();
9271 }) &&
9272 "Bad state");
9273 continue;
9274 }
9275 U = nullptr;
9276 if (It != ScalarToExtUses.end()) {
9277 ExternalUses[It->second].User = nullptr;
9278 break;
9279 }
9280 }
9281
9282 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9283 U = nullptr;
9284 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9285 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9286 << " from lane " << FoundLane << " from " << *Scalar
9287 << ".\n");
9288 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9289 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9290 ExternalUsesWithNonUsers.insert(Scalar);
9291 if (!U)
9292 break;
9293 }
9294 }
9295 }
9296}
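// Illustrative example (editor's sketch, names are hypothetical): if a
// vectorized scalar %a sits in lane 1 of its TreeEntry and has a single user
// %u outside the tree, the loop above records {Scalar=%a, User=%u, Entry,
// FoundLane=1} in ExternalUses. When a scalar has UsesLimit or more users,
// User is recorded as nullptr, meaning "extract unconditionally without
// tracking individual users".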
9297
9298SmallVector<SmallVector<StoreInst *>>
9299BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9302 PtrToStoresMap;
9303 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9304 Value *V = TE->Scalars[Lane];
9305 // Don't iterate over the users of constant data.
9306 if (!isa<Instruction>(V))
9307 continue;
9308 // To save compilation time we don't visit if we have too many users.
9309 if (V->hasNUsesOrMore(UsesLimit))
9310 break;
9311
9312 // Collect stores per pointer object.
9313 for (User *U : V->users()) {
9314 auto *SI = dyn_cast<StoreInst>(U);
9315 // Test whether we can handle the store. V might be a global, which could
9316 // be used in a different function.
9317 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9318 !isValidElementType(SI->getValueOperand()->getType()))
9319 continue;
9320 // Skip entry if already vectorized.
9321 if (isVectorized(U))
9322 continue;
9323
9324 Value *Ptr =
9325 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9326 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9327 SI->getValueOperand()->getType(), Ptr}];
9328 // For now just keep one store per pointer object per lane.
9329 // TODO: Extend this to support multiple stores per pointer per lane
9330 if (StoresVec.size() > Lane)
9331 continue;
9332 if (!StoresVec.empty()) {
9333 std::optional<int64_t> Diff = getPointersDiff(
9334 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9335 SI->getValueOperand()->getType(),
9336 StoresVec.front()->getPointerOperand(), *DL, *SE,
9337 /*StrictCheck=*/true);
9338 // We failed to compare the pointers so just abandon this store.
9339 if (!Diff)
9340 continue;
9341 }
9342 StoresVec.push_back(SI);
9343 }
9344 }
9345 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9346 unsigned I = 0;
9347 for (auto &P : PtrToStoresMap) {
9348 Res[I].swap(P.second);
9349 ++I;
9350 }
9351 return Res;
9352}
9353
9354bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9355 OrdersType &ReorderIndices) const {
9356 // We check whether the stores in StoresVec can form a vector by sorting them
9357 // and checking whether they are consecutive.
9358
9359 // To avoid calling getPointersDiff() while sorting we create a vector of
9360 // pairs {store, offset from first} and sort this instead.
9361 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9362 StoreInst *S0 = StoresVec[0];
9363 StoreOffsetVec.emplace_back(0, 0);
9364 Type *S0Ty = S0->getValueOperand()->getType();
9365 Value *S0Ptr = S0->getPointerOperand();
9366 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9367 StoreInst *SI = StoresVec[Idx];
9368 std::optional<int64_t> Diff =
9369 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9370 SI->getPointerOperand(), *DL, *SE,
9371 /*StrictCheck=*/true);
9372 StoreOffsetVec.emplace_back(*Diff, Idx);
9373 }
9374
9375 // Check if the stores are consecutive by checking if their difference is 1.
9376 if (StoreOffsetVec.size() != StoresVec.size())
9377 return false;
9378 sort(StoreOffsetVec, llvm::less_first());
9379 unsigned Idx = 0;
9380 int64_t PrevDist = 0;
9381 for (const auto &P : StoreOffsetVec) {
9382 if (Idx > 0 && P.first != PrevDist + 1)
9383 return false;
9384 PrevDist = P.first;
9385 ++Idx;
9386 }
9387
9388 // Calculate the shuffle indices according to their offset against the sorted
9389 // StoreOffsetVec.
9390 ReorderIndices.assign(StoresVec.size(), 0);
9391 bool IsIdentity = true;
9392 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9393 ReorderIndices[P.second] = I;
9394 IsIdentity &= P.second == I;
9395 }
9396 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9397 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9398 // same convention here.
9399 if (IsIdentity)
9400 ReorderIndices.clear();
9401
9402 return true;
9403}
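// Illustrative example (editor's sketch): for four stores whose offsets from
// S0 are {0, 2, 1, 3}, the sorted StoreOffsetVec is
// {(0,0), (1,2), (2,1), (3,3)}; the offsets are consecutive, and
// ReorderIndices[P.second] = I produces ReorderIndices = {0, 2, 1, 3}, which
// is kept because it is not an identity order.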
9404
9405#ifndef NDEBUG
9407 for (unsigned Idx : Order)
9408 dbgs() << Idx << ", ";
9409 dbgs() << "\n";
9410}
9411#endif
9412
9413SmallVector<BoUpSLP::OrdersType, 1>
9414BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9415 unsigned NumLanes = TE->Scalars.size();
9416
9417 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9418
9419 // Holds the reorder indices for each candidate store vector that is a user of
9420 // the current TreeEntry.
9421 SmallVector<OrdersType, 1> ExternalReorderIndices;
9422
9423 // Now inspect the stores collected per pointer and look for vectorization
9424 // candidates. For each candidate calculate the reorder index vector and push
9425 // it into `ExternalReorderIndices`
9426 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9427 // If we have fewer than NumLanes stores, then we can't form a vector.
9428 if (StoresVec.size() != NumLanes)
9429 continue;
9430
9431 // If the stores are not consecutive then abandon this StoresVec.
9432 OrdersType ReorderIndices;
9433 if (!canFormVector(StoresVec, ReorderIndices))
9434 continue;
9435
9436 // We now know that the scalars in StoresVec can form a vector instruction,
9437 // so set the reorder indices.
9438 ExternalReorderIndices.push_back(ReorderIndices);
9439 }
9440 return ExternalReorderIndices;
9441}
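// Illustrative example (editor's sketch): for a 4-lane TreeEntry whose scalars
// feed exactly four stores to consecutive addresses, one OrdersType is
// collected; pointer groups with fewer than NumLanes stores, or with
// non-consecutive offsets, contribute no entry.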
9442
9443void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9444 const SmallDenseSet<Value *> &UserIgnoreLst) {
9445 deleteTree();
9446 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9447 "TreeEntryToStridedPtrInfoMap is not cleared");
9448 UserIgnoreList = &UserIgnoreLst;
9449 if (!allSameType(Roots))
9450 return;
9451 buildTreeRec(Roots, 0, EdgeInfo());
9452}
9453
9455 deleteTree();
9456 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9457 "TreeEntryToStridedPtrInfoMap is not cleared");
9458 if (!allSameType(Roots))
9459 return;
9460 buildTreeRec(Roots, 0, EdgeInfo());
9461}
9462
9463/// Tries to find a subvector of loads and builds a new vector of only loads if
9464/// it can be profitable.
9466 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9468 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9469 bool AddNew = true) {
9470 if (VL.empty())
9471 return;
9472 Type *ScalarTy = getValueType(VL.front());
9473 if (!isValidElementType(ScalarTy))
9474 return;
9475 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9476 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9477 for (Value *V : VL) {
9478 auto *LI = dyn_cast<LoadInst>(V);
9479 if (!LI)
9480 continue;
9481 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9482 continue;
9483 bool IsFound = false;
9484 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9485 assert(LI->getParent() == Data.front().first->getParent() &&
9486 LI->getType() == Data.front().first->getType() &&
9487 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9488 getUnderlyingObject(Data.front().first->getPointerOperand(),
9490 "Expected loads with the same type, same parent and same "
9491 "underlying pointer.");
9492 std::optional<int64_t> Dist = getPointersDiff(
9493 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9494 Data.front().first->getPointerOperand(), DL, SE,
9495 /*StrictCheck=*/true);
9496 if (!Dist)
9497 continue;
9498 auto It = Map.find(*Dist);
9499 if (It != Map.end() && It->second != LI)
9500 continue;
9501 if (It == Map.end()) {
9502 Data.emplace_back(LI, *Dist);
9503 Map.try_emplace(*Dist, LI);
9504 }
9505 IsFound = true;
9506 break;
9507 }
9508 if (!IsFound) {
9509 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9510 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9511 }
9512 }
9513 auto FindMatchingLoads =
9516 &GatheredLoads,
9517 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9518 int64_t &Offset, unsigned &Start) {
9519 if (Loads.empty())
9520 return GatheredLoads.end();
9521 LoadInst *LI = Loads.front().first;
9522 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9523 if (Idx < Start)
9524 continue;
9525 ToAdd.clear();
9526 if (LI->getParent() != Data.front().first->getParent() ||
9527 LI->getType() != Data.front().first->getType())
9528 continue;
9529 std::optional<int64_t> Dist =
9531 Data.front().first->getType(),
9532 Data.front().first->getPointerOperand(), DL, SE,
9533 /*StrictCheck=*/true);
9534 if (!Dist)
9535 continue;
9536 SmallSet<int64_t, 4> DataDists;
9538 for (std::pair<LoadInst *, int64_t> P : Data) {
9539 DataDists.insert(P.second);
9540 DataLoads.insert(P.first);
9541 }
9542 // Found matching gathered loads - check if all loads are unique or
9543 // can be effectively vectorized.
9544 unsigned NumUniques = 0;
9545 for (auto [Cnt, Pair] : enumerate(Loads)) {
9546 bool Used = DataLoads.contains(Pair.first);
9547 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9548 ++NumUniques;
9549 ToAdd.insert(Cnt);
9550 } else if (Used) {
9551 Repeated.insert(Cnt);
9552 }
9553 }
9554 if (NumUniques > 0 &&
9555 (Loads.size() == NumUniques ||
9556 (Loads.size() - NumUniques >= 2 &&
9557 Loads.size() - NumUniques >= Loads.size() / 2 &&
9558 (has_single_bit(Data.size() + NumUniques) ||
9559 bit_ceil(Data.size()) <
9560 bit_ceil(Data.size() + NumUniques))))) {
9561 Offset = *Dist;
9562 Start = Idx + 1;
9563 return std::next(GatheredLoads.begin(), Idx);
9564 }
9565 }
9566 ToAdd.clear();
9567 return GatheredLoads.end();
9568 };
9569 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9570 unsigned Start = 0;
9571 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9572 int64_t Offset = 0;
9573 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9574 Offset, Start);
9575 while (It != GatheredLoads.end()) {
9576 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9577 for (unsigned Idx : LocalToAdd)
9578 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9579 ToAdd.insert_range(LocalToAdd);
9580 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9581 Start);
9582 }
9583 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9584 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9585 })) {
9586 auto AddNewLoads =
9588 for (unsigned Idx : seq<unsigned>(Data.size())) {
9589 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9590 continue;
9591 Loads.push_back(Data[Idx]);
9592 }
9593 };
9594 if (!AddNew) {
9595 LoadInst *LI = Data.front().first;
9596 It = find_if(
9597 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9598 return PD.front().first->getParent() == LI->getParent() &&
9599 PD.front().first->getType() == LI->getType();
9600 });
9601 while (It != GatheredLoads.end()) {
9602 AddNewLoads(*It);
9603 It = std::find_if(
9604 std::next(It), GatheredLoads.end(),
9605 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9606 return PD.front().first->getParent() == LI->getParent() &&
9607 PD.front().first->getType() == LI->getType();
9608 });
9609 }
9610 }
9611 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9612 AddNewLoads(GatheredLoads.emplace_back());
9613 }
9614 }
9615}
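// Illustrative example (editor's sketch, p is a hypothetical base pointer):
// three loads of the same type from p+0, p+1 and p+3 form one cluster with
// distances {0, 1, 3}; a second, distinct load at p+1 cannot reuse the already
// occupied distance slot and starts a new cluster. FindMatchingLoads then
// merges clusters into GatheredLoads entries, rebasing distances by the
// computed Offset.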
9616
9617void BoUpSLP::tryToVectorizeGatheredLoads(
9618 const SmallMapVector<
9619 std::tuple<BasicBlock *, Value *, Type *>,
9620 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9621 &GatheredLoads) {
9622 GatheredLoadsEntriesFirst = VectorizableTree.size();
9623
9624 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9625 LoadEntriesToVectorize.size());
9626 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9627 Set.insert_range(VectorizableTree[Idx]->Scalars);
9628
9629 // Sort loads by distance.
9630 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9631 const std::pair<LoadInst *, int64_t> &L2) {
9632 return L1.second > L2.second;
9633 };
9634
9635 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9636 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9637 Loads.size());
9638 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9639 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9640 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9641 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9642 };
9643
9644 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9645 BoUpSLP::ValueSet &VectorizedLoads,
9646 SmallVectorImpl<LoadInst *> &NonVectorized,
9647 bool Final, unsigned MaxVF) {
9649 unsigned StartIdx = 0;
9650 SmallVector<int> CandidateVFs;
9651 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9652 CandidateVFs.push_back(MaxVF);
9653 for (int NumElts = getFloorFullVectorNumberOfElements(
9654 *TTI, Loads.front()->getType(), MaxVF);
9655 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9656 *TTI, Loads.front()->getType(), NumElts - 1)) {
9657 CandidateVFs.push_back(NumElts);
9658 if (VectorizeNonPowerOf2 && NumElts > 2)
9659 CandidateVFs.push_back(NumElts - 1);
9660 }
9661
9662 if (Final && CandidateVFs.empty())
9663 return Results;
9664
9665 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9666 for (unsigned NumElts : CandidateVFs) {
9667 if (Final && NumElts > BestVF)
9668 continue;
9669 SmallVector<unsigned> MaskedGatherVectorized;
9670 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9671 ++Cnt) {
9672 ArrayRef<LoadInst *> Slice =
9673 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9674 if (VectorizedLoads.count(Slice.front()) ||
9675 VectorizedLoads.count(Slice.back()) ||
9677 continue;
9678 // Check if it is profitable to try vectorizing gathered loads. It is
9679 // profitable if we have more than 3 consecutive loads or if we have
9680 // fewer but all users are vectorized or deleted.
9681 bool AllowToVectorize = false;
9682 // Check if it is profitable to vectorize 2-element loads.
9683 if (NumElts == 2) {
9684 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9685 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9686 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9687 for (LoadInst *LI : Slice) {
9688 // If single use/user - allow to vectorize.
9689 if (LI->hasOneUse())
9690 continue;
9691 // 1. Check if number of uses equals number of users.
9692 // 2. All users are deleted.
9693 // 3. The load broadcasts are not allowed or the load is not
9694 // broadcasted.
9695 if (static_cast<unsigned int>(std::distance(
9696 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9697 return false;
9698 if (!IsLegalBroadcastLoad)
9699 continue;
9700 if (LI->hasNUsesOrMore(UsesLimit))
9701 return false;
9702 for (User *U : LI->users()) {
9703 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9704 continue;
9705 for (const TreeEntry *UTE : getTreeEntries(U)) {
9706 for (int I : seq<int>(UTE->getNumOperands())) {
9707 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9708 return V == LI || isa<PoisonValue>(V);
9709 }))
9710 // Found legal broadcast - do not vectorize.
9711 return false;
9712 }
9713 }
9714 }
9715 }
9716 return true;
9717 };
9718 AllowToVectorize = CheckIfAllowed(Slice);
9719 } else {
9720 AllowToVectorize =
9721 (NumElts >= 3 ||
9722 any_of(ValueToGatherNodes.at(Slice.front()),
9723 [=](const TreeEntry *TE) {
9724 return TE->Scalars.size() == 2 &&
9725 ((TE->Scalars.front() == Slice.front() &&
9726 TE->Scalars.back() == Slice.back()) ||
9727 (TE->Scalars.front() == Slice.back() &&
9728 TE->Scalars.back() == Slice.front()));
9729 })) &&
9730 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9731 Slice.size());
9732 }
9733 if (AllowToVectorize) {
9734 SmallVector<Value *> PointerOps;
9735 OrdersType CurrentOrder;
9736 // Try to build vector load.
9737 ArrayRef<Value *> Values(
9738 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9739 StridedPtrInfo SPtrInfo;
9740 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9741 PointerOps, SPtrInfo, &BestVF);
9742 if (LS != LoadsState::Gather ||
9743 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9744 if (LS == LoadsState::ScatterVectorize) {
9745 if (MaskedGatherVectorized.empty() ||
9746 Cnt >= MaskedGatherVectorized.back() + NumElts)
9747 MaskedGatherVectorized.push_back(Cnt);
9748 continue;
9749 }
9750 if (LS != LoadsState::Gather) {
9751 Results.emplace_back(Values, LS);
9752 VectorizedLoads.insert_range(Slice);
9753 // If we vectorized initial block, no need to try to vectorize it
9754 // again.
9755 if (Cnt == StartIdx)
9756 StartIdx += NumElts;
9757 }
9758 // Check if the whole array was vectorized already - exit.
9759 if (StartIdx >= Loads.size())
9760 break;
9761 // Erase last masked gather candidate, if another candidate within
9762 // the range is found to be better.
9763 if (!MaskedGatherVectorized.empty() &&
9764 Cnt < MaskedGatherVectorized.back() + NumElts)
9765 MaskedGatherVectorized.pop_back();
9766 Cnt += NumElts - 1;
9767 continue;
9768 }
9769 }
9770 if (!AllowToVectorize || BestVF == 0)
9772 }
9773 // Mark masked gather candidates as vectorized, if any.
9774 for (unsigned Cnt : MaskedGatherVectorized) {
9775 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9776 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9777 ArrayRef<Value *> Values(
9778 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9779 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9780 VectorizedLoads.insert_range(Slice);
9781 // If we vectorized initial block, no need to try to vectorize it again.
9782 if (Cnt == StartIdx)
9783 StartIdx += NumElts;
9784 }
9785 }
9786 for (LoadInst *LI : Loads) {
9787 if (!VectorizedLoads.contains(LI))
9788 NonVectorized.push_back(LI);
9789 }
9790 return Results;
9791 };
9792 auto ProcessGatheredLoads =
9793 [&, &TTI = *TTI](
9795 bool Final = false) {
9796 SmallVector<LoadInst *> NonVectorized;
9797 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9798 GatheredLoads) {
9799 if (LoadsDists.size() <= 1) {
9800 NonVectorized.push_back(LoadsDists.back().first);
9801 continue;
9802 }
9804 LoadsDists);
9805 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9806 stable_sort(LocalLoadsDists, LoadSorter);
9807 SmallVector<LoadInst *> Loads;
9808 unsigned MaxConsecutiveDistance = 0;
9809 unsigned CurrentConsecutiveDist = 1;
9810 int64_t LastDist = LocalLoadsDists.front().second;
9811 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9812 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9813 if (isVectorized(L.first))
9814 continue;
9815 assert(LastDist >= L.second &&
9816 "Expected first distance always not less than second");
9817 if (static_cast<uint64_t>(LastDist - L.second) ==
9818 CurrentConsecutiveDist) {
9819 ++CurrentConsecutiveDist;
9820 MaxConsecutiveDistance =
9821 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9822 Loads.push_back(L.first);
9823 continue;
9824 }
9825 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9826 !Loads.empty())
9827 Loads.pop_back();
9828 CurrentConsecutiveDist = 1;
9829 LastDist = L.second;
9830 Loads.push_back(L.first);
9831 }
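// Illustrative example (editor's sketch): with distances sorted in decreasing
// order as {5, 4, 3, 1, 0}, the run {5, 4, 3} yields
// MaxConsecutiveDistance = 3; the gap down to 1 resets CurrentConsecutiveDist,
// and the trailing {1, 0} run only reaches 2.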
9832 if (Loads.size() <= 1)
9833 continue;
9834 if (AllowMaskedGather)
9835 MaxConsecutiveDistance = Loads.size();
9836 else if (MaxConsecutiveDistance < 2)
9837 continue;
9838 BoUpSLP::ValueSet VectorizedLoads;
9839 SmallVector<LoadInst *> SortedNonVectorized;
9841 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9842 Final, MaxConsecutiveDistance);
9843 if (!Results.empty() && !SortedNonVectorized.empty() &&
9844 OriginalLoads.size() == Loads.size() &&
9845 MaxConsecutiveDistance == Loads.size() &&
9847 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9848 return P.second == LoadsState::ScatterVectorize;
9849 })) {
9850 VectorizedLoads.clear();
9851 SmallVector<LoadInst *> UnsortedNonVectorized;
9853 UnsortedResults =
9854 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9855 UnsortedNonVectorized, Final,
9856 OriginalLoads.size());
9857 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9858 SortedNonVectorized.swap(UnsortedNonVectorized);
9859 Results.swap(UnsortedResults);
9860 }
9861 }
9862 for (auto [Slice, _] : Results) {
9863 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9864 << Slice.size() << ")\n");
9865 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9866 for (Value *L : Slice)
9867 if (!isVectorized(L))
9868 SortedNonVectorized.push_back(cast<LoadInst>(L));
9869 continue;
9870 }
9871
9872 // Select the maximum VF as the maximum of the user gather node sizes and
9873 // the distance between scalar loads in these nodes.
9874 unsigned MaxVF = Slice.size();
9875 unsigned UserMaxVF = 0;
9876 unsigned InterleaveFactor = 0;
9877 if (MaxVF == 2) {
9878 UserMaxVF = MaxVF;
9879 } else {
9880 // Found distance between segments of the interleaved loads.
9881 std::optional<unsigned> InterleavedLoadsDistance = 0;
9882 unsigned Order = 0;
9883 std::optional<unsigned> CommonVF = 0;
9884 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9885 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9886 for (auto [Idx, V] : enumerate(Slice)) {
9887 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9888 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9889 unsigned Pos =
9890 EntryToPosition.try_emplace(E, Idx).first->second;
9891 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9892 if (CommonVF) {
9893 if (*CommonVF == 0) {
9894 CommonVF = E->Scalars.size();
9895 continue;
9896 }
9897 if (*CommonVF != E->Scalars.size())
9898 CommonVF.reset();
9899 }
9900 // Check if the load is part of an interleaved load.
9901 if (Pos != Idx && InterleavedLoadsDistance) {
9902 if (!DeinterleavedNodes.contains(E) &&
9903 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9904 if (isa<Constant>(V))
9905 return false;
9906 if (isVectorized(V))
9907 return true;
9908 const auto &Nodes = ValueToGatherNodes.at(V);
9909 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9910 !is_contained(Slice, V);
9911 })) {
9912 InterleavedLoadsDistance.reset();
9913 continue;
9914 }
9915 DeinterleavedNodes.insert(E);
9916 if (*InterleavedLoadsDistance == 0) {
9917 InterleavedLoadsDistance = Idx - Pos;
9918 continue;
9919 }
9920 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9921 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9922 InterleavedLoadsDistance.reset();
9923 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9924 }
9925 }
9926 }
9927 DeinterleavedNodes.clear();
9928 // Check if the large load represents an interleaved load operation.
9929 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9930 CommonVF.value_or(0) != 0) {
9931 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9932 unsigned VF = *CommonVF;
9933 OrdersType Order;
9934 SmallVector<Value *> PointerOps;
9935 StridedPtrInfo SPtrInfo;
9936 // Segmented load detected - vectorize at maximum vector factor.
9937 if (InterleaveFactor <= Slice.size() &&
9938 TTI.isLegalInterleavedAccessType(
9939 getWidenedType(Slice.front()->getType(), VF),
9940 InterleaveFactor,
9941 cast<LoadInst>(Slice.front())->getAlign(),
9942 cast<LoadInst>(Slice.front())
9944 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9945 SPtrInfo) == LoadsState::Vectorize) {
9946 UserMaxVF = InterleaveFactor * VF;
9947 } else {
9948 InterleaveFactor = 0;
9949 }
9950 }
9951 // Cannot represent the loads as consecutive vectorizable nodes -
9952 // just exit.
9953 unsigned ConsecutiveNodesSize = 0;
9954 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9955 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9956 [&, Slice = Slice](const auto &P) {
9957 const auto *It = find_if(Slice, [&](Value *V) {
9958 return std::get<1>(P).contains(V);
9959 });
9960 if (It == Slice.end())
9961 return false;
9962 const TreeEntry &TE =
9963 *VectorizableTree[std::get<0>(P)];
9964 ArrayRef<Value *> VL = TE.Scalars;
9965 OrdersType Order;
9966 SmallVector<Value *> PointerOps;
9967 StridedPtrInfo SPtrInfo;
9969 VL, VL.front(), Order, PointerOps, SPtrInfo);
9970 if (State == LoadsState::ScatterVectorize ||
9972 return false;
9973 ConsecutiveNodesSize += VL.size();
9974 size_t Start = std::distance(Slice.begin(), It);
9975 size_t Sz = Slice.size() - Start;
9976 return Sz < VL.size() ||
9977 Slice.slice(Start, VL.size()) != VL;
9978 }))
9979 continue;
9980 // Try to build long masked gather loads.
9981 UserMaxVF = bit_ceil(UserMaxVF);
9982 if (InterleaveFactor == 0 &&
9983 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9984 [&, Slice = Slice](unsigned Idx) {
9985 OrdersType Order;
9986 SmallVector<Value *> PointerOps;
9987 StridedPtrInfo SPtrInfo;
9988 return canVectorizeLoads(
9989 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9990 Slice[Idx * UserMaxVF], Order, PointerOps,
9991 SPtrInfo) == LoadsState::ScatterVectorize;
9992 }))
9993 UserMaxVF = MaxVF;
9994 if (Slice.size() != ConsecutiveNodesSize)
9995 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9996 }
9997 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9998 bool IsVectorized = true;
9999 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10000 ArrayRef<Value *> SubSlice =
10001 Slice.slice(I, std::min(VF, E - I));
10002 if (isVectorized(SubSlice.front()))
10003 continue;
10004 // Check if the subslice is a to-be-vectorized entry that is not
10005 // equal to this entry.
10006 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10007 [&](const auto &P) {
10008 return !SubSlice.equals(
10009 VectorizableTree[std::get<0>(P)]
10010 ->Scalars) &&
10011 set_is_subset(SubSlice, std::get<1>(P));
10012 }))
10013 continue;
10014 unsigned Sz = VectorizableTree.size();
10015 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10016 if (Sz == VectorizableTree.size()) {
10017 IsVectorized = false;
10018 // Try non-interleaved vectorization with smaller vector
10019 // factor.
10020 if (InterleaveFactor > 0) {
10021 VF = 2 * (MaxVF / InterleaveFactor);
10022 InterleaveFactor = 0;
10023 }
10024 continue;
10025 }
10026 }
10027 if (IsVectorized)
10028 break;
10029 }
10030 }
10031 NonVectorized.append(SortedNonVectorized);
10032 }
10033 return NonVectorized;
10034 };
10035 for (const auto &GLs : GatheredLoads) {
10036 const auto &Ref = GLs.second;
10037 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10038 if (!Ref.empty() && !NonVectorized.empty() &&
10039 std::accumulate(
10040 Ref.begin(), Ref.end(), 0u,
10041 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10042 -> unsigned { return S + LoadsDists.size(); }) !=
10043 NonVectorized.size() &&
10044 IsMaskedGatherSupported(NonVectorized)) {
10046 FinalGatheredLoads;
10047 for (LoadInst *LI : NonVectorized) {
10048 // Reinsert non-vectorized loads into another list of loads with the
10049 // same base pointers.
10050 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
10051 FinalGatheredLoads,
10052 /*AddNew=*/false);
10053 }
10054 // Final attempt to vectorize non-vectorized loads.
10055 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10056 }
10057 }
10058 // Try to vectorize postponed load entries, previously marked as gathered.
10059 for (unsigned Idx : LoadEntriesToVectorize) {
10060 const TreeEntry &E = *VectorizableTree[Idx];
10061 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10062 // Avoid reordering, if possible.
10063 if (!E.ReorderIndices.empty()) {
10064 // Build a mask out of the reorder indices and reorder scalars per this
10065 // mask.
10066 SmallVector<int> ReorderMask;
10067 inversePermutation(E.ReorderIndices, ReorderMask);
10068 reorderScalars(GatheredScalars, ReorderMask);
10069 }
10070 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10071 }
10072 // If no new entries were created, there are no gathered-loads entries to be
10073 // handled.
10074 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10075 VectorizableTree.size())
10076 GatheredLoadsEntriesFirst.reset();
10077}
10078
10079/// Generates a key/subkey pair for the given value to provide effective sorting
10080/// of the values and better detection of vectorizable value sequences. The keys
10081/// can be used for better sorting of the values themselves, and the subkeys for
10082/// sorting within value subgroups.
10083static std::pair<size_t, size_t> generateKeySubkey(
10084 Value *V, const TargetLibraryInfo *TLI,
10085 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10086 bool AllowAlternate) {
10087 hash_code Key = hash_value(V->getValueID() + 2);
10088 hash_code SubKey = hash_value(0);
10089 // Sort the loads by the distance between the pointers.
10090 if (auto *LI = dyn_cast<LoadInst>(V)) {
10091 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
10092 if (LI->isSimple())
10093 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
10094 else
10095 Key = SubKey = hash_value(LI);
10096 } else if (isVectorLikeInstWithConstOps(V)) {
10097 // Sort extracts by the vector operands.
10099 Key = hash_value(Value::UndefValueVal + 1);
10100 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
10101 if (!isUndefVector(EI->getVectorOperand()).all() &&
10102 !isa<UndefValue>(EI->getIndexOperand()))
10103 SubKey = hash_value(EI->getVectorOperand());
10104 }
10105 } else if (auto *I = dyn_cast<Instruction>(V)) {
10106 // Sort other instructions just by the opcodes except for CMPInst.
10107 // For CMP also sort by the predicate kind.
10109 isValidForAlternation(I->getOpcode())) {
10110 if (AllowAlternate)
10111 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
10112 else
10113 Key = hash_combine(hash_value(I->getOpcode()), Key);
10114 SubKey = hash_combine(
10115 hash_value(I->getOpcode()), hash_value(I->getType()),
10117 ? I->getType()
10118 : cast<CastInst>(I)->getOperand(0)->getType()));
10119 // For casts, look through the only operand to improve compile time.
10120 if (isa<CastInst>(I)) {
10121 std::pair<size_t, size_t> OpVals =
10122 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
10123 /*AllowAlternate=*/true);
10124 Key = hash_combine(OpVals.first, Key);
10125 SubKey = hash_combine(OpVals.first, SubKey);
10126 }
10127 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
10128 CmpInst::Predicate Pred = CI->getPredicate();
10129 if (CI->isCommutative())
10130 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
10132 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
10133 hash_value(SwapPred),
10134 hash_value(CI->getOperand(0)->getType()));
10135 } else if (auto *Call = dyn_cast<CallInst>(I)) {
10138 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
10139 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
10140 SubKey = hash_combine(hash_value(I->getOpcode()),
10141 hash_value(Call->getCalledFunction()));
10142 } else {
10144 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
10145 }
10146 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10147 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
10148 hash_value(Op.Tag), SubKey);
10149 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
10150 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
10151 SubKey = hash_value(Gep->getPointerOperand());
10152 else
10153 SubKey = hash_value(Gep);
10154 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
10155 !isa<ConstantInt>(I->getOperand(1))) {
10156 // Do not try to vectorize instructions with potentially high cost.
10157 SubKey = hash_value(I);
10158 } else {
10159 SubKey = hash_value(I->getOpcode());
10160 }
10161 Key = hash_combine(hash_value(I->getParent()), Key);
10162 }
10163 return std::make_pair(Key, SubKey);
10164}
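// Illustrative example (editor's sketch): two `add i32` instructions in the
// same basic block produce identical keys and subkeys (the parent block,
// opcode and type all hash the same), so they fall into one candidate group; a
// simple load instead derives its subkey from LoadsSubkeyGenerator, which lets
// the caller group loads by the distance between their pointers, as noted in
// the comment above.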
10165
10166/// Checks if the specified instruction \p I is a main operation for the given
10167/// \p MainOp and \p AltOp instructions.
10168static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10169 Instruction *AltOp, const TargetLibraryInfo &TLI);
10170
10171bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
10172 ArrayRef<Value *> VL) const {
10173 Type *ScalarTy = S.getMainOp()->getType();
10174 unsigned Opcode0 = S.getOpcode();
10175 unsigned Opcode1 = S.getAltOpcode();
10176 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10177 // If this pattern is supported by the target then consider it profitable.
10178 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
10179 Opcode1, OpcodeMask))
10180 return true;
10181 SmallVector<ValueList> Operands;
10182 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
10183 Operands.emplace_back();
10184 // Prepare the operand vector.
10185 for (Value *V : VL) {
10186 if (isa<PoisonValue>(V)) {
10187 Operands.back().push_back(
10188 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
10189 continue;
10190 }
10191 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
10192 }
10193 }
10194 if (Operands.size() == 2) {
10195 // Try to find the best operand candidates.
10196 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
10198 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
10199 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
10200 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
10201 std::optional<int> Res = findBestRootPair(Candidates);
10202 switch (Res.value_or(0)) {
10203 case 0:
10204 break;
10205 case 1:
10206 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
10207 break;
10208 case 2:
10209 std::swap(Operands[0][I], Operands[1][I]);
10210 break;
10211 default:
10212 llvm_unreachable("Unexpected index.");
10213 }
10214 }
10215 }
10216 DenseSet<unsigned> UniqueOpcodes;
10217 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
10218 unsigned NonInstCnt = 0;
10219 // Estimate the number of instructions required for the vectorized node and
10220 // for the buildvector node.
10221 unsigned UndefCnt = 0;
10222 // Count the number of extra shuffles required for vector nodes.
10223 unsigned ExtraShuffleInsts = 0;
10224 // Check that operands do not contain the same values and create either a
10225 // perfect diamond match or a shuffled match.
10226 if (Operands.size() == 2) {
10227 // Do not count same operands twice.
10228 if (Operands.front() == Operands.back()) {
10229 Operands.erase(Operands.begin());
10230 } else if (!allConstant(Operands.front()) &&
10231 all_of(Operands.front(), [&](Value *V) {
10232 return is_contained(Operands.back(), V);
10233 })) {
10234 Operands.erase(Operands.begin());
10235 ++ExtraShuffleInsts;
10236 }
10237 }
10238 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
10239 // Vectorize the node if:
10240 // 1. At least a single operand is constant or splat.
10241 // 2. Operands have many loop invariants (the instructions are not loop
10242 // invariants).
10243 // 3. At least a single unique operand is supposed to be vectorized.
10244 return none_of(Operands,
10245 [&](ArrayRef<Value *> Op) {
10246 if (allConstant(Op) ||
10247 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
10248 getSameOpcode(Op, *TLI)))
10249 return false;
10250 DenseMap<Value *, unsigned> Uniques;
10251 for (Value *V : Op) {
10253 isVectorized(V) || (L && L->isLoopInvariant(V))) {
10254 if (isa<UndefValue>(V))
10255 ++UndefCnt;
10256 continue;
10257 }
10258 auto Res = Uniques.try_emplace(V, 0);
10259 // Found first duplicate - need to add shuffle.
10260 if (!Res.second && Res.first->second == 1)
10261 ++ExtraShuffleInsts;
10262 ++Res.first->getSecond();
10263 if (auto *I = dyn_cast<Instruction>(V))
10264 UniqueOpcodes.insert(I->getOpcode());
10265 else if (Res.second)
10266 ++NonInstCnt;
10267 }
10268 return none_of(Uniques, [&](const auto &P) {
10269 return P.first->hasNUsesOrMore(P.second + 1) &&
10270 none_of(P.first->users(), [&](User *U) {
10271 return isVectorized(U) || Uniques.contains(U);
10272 });
10273 });
10274 }) ||
10275 // Do not vectorize node, if estimated number of vector instructions is
10276 // more than estimated number of buildvector instructions. Number of
10277 // vector operands is number of vector instructions + number of vector
10278 // instructions for operands (buildvectors). Number of buildvector
10279 // instructions is just number_of_operands * number_of_scalars.
10280 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10281 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10282 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10283}
10284
10285/// Builds the argument types vector for the given call instruction with the
10286/// given \p ID for the specified vector factor.
10289 const unsigned VF, unsigned MinBW,
10290 const TargetTransformInfo *TTI) {
10291 SmallVector<Type *> ArgTys;
10292 for (auto [Idx, Arg] : enumerate(CI->args())) {
10295 ArgTys.push_back(Arg->getType());
10296 continue;
10297 }
10298 if (MinBW > 0) {
10299 ArgTys.push_back(
10300 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10301 continue;
10302 }
10303 }
10304 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10305 }
10306 return ArgTys;
10307}
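// Illustrative example (editor's sketch): when the call maps to a recognized
// intrinsic, an i32 argument is recorded as <4 x i16> for VF = 4 and
// MinBW = 16, or as <4 x i32> when MinBW == 0; arguments the intrinsic
// requires to stay scalar (the early 'continue' above) keep their original
// scalar type.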
10308
10309/// Calculates the costs of a vectorized intrinsic call (if possible) and a
10310/// vectorized library function call (if possible). Returns an invalid cost for
10311/// the corresponding calls if they cannot be vectorized / will be scalarized.
10312static std::pair<InstructionCost, InstructionCost>
10315 ArrayRef<Type *> ArgTys) {
10316 auto Shape = VFShape::get(CI->getFunctionType(),
10318 false /*HasGlobalPred*/);
10319 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10320 auto LibCost = InstructionCost::getInvalid();
10321 if (!CI->isNoBuiltin() && VecFunc) {
10322 // Calculate the cost of the vector library call.
10323 // If the corresponding vector call is cheaper, return its cost.
10324 LibCost =
10325 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10326 }
10328
10329 // Calculate the cost of the vector intrinsic call.
10330 FastMathFlags FMF;
10331 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10332 FMF = FPCI->getFastMathFlags();
10333 const InstructionCost ScalarLimit = 10000;
10334 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10335 LibCost.isValid() ? LibCost : ScalarLimit);
10336 auto IntrinsicCost =
10337 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10338 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10339 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10341
10342 return {IntrinsicCost, LibCost};
10343}
10344
10345BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10346 const InstructionsState &S, ArrayRef<Value *> VL,
10347 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10348 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10349 assert(S.getMainOp() &&
10350 "Expected instructions with same/alternate opcodes only.");
10351
10352 unsigned ShuffleOrOp =
10353 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10354 Instruction *VL0 = S.getMainOp();
10355 switch (ShuffleOrOp) {
10356 case Instruction::PHI: {
10357 // Too many operands - gather, most probably won't be vectorized.
10358 if (VL0->getNumOperands() > MaxPHINumOperands)
10359 return TreeEntry::NeedToGather;
10360 // Check for terminator values (e.g. invoke).
10361 for (Value *V : VL) {
10362 auto *PHI = dyn_cast<PHINode>(V);
10363 if (!PHI)
10364 continue;
10365 for (Value *Incoming : PHI->incoming_values()) {
10367 if (Term && Term->isTerminator()) {
10369 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10370 return TreeEntry::NeedToGather;
10371 }
10372 }
10373 }
10374
10375 return TreeEntry::Vectorize;
10376 }
10377 case Instruction::ExtractElement:
10378 if (any_of(VL, [&](Value *V) {
10379 auto *EI = dyn_cast<ExtractElementInst>(V);
10380 if (!EI)
10381 return true;
10382 return isVectorized(EI->getOperand(0));
10383 }))
10384 return TreeEntry::NeedToGather;
10385 [[fallthrough]];
10386 case Instruction::ExtractValue: {
10387 bool Reuse = canReuseExtract(VL, CurrentOrder);
10388 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10389 // non-full registers).
10390 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10391 return TreeEntry::NeedToGather;
10392 if (Reuse || !CurrentOrder.empty())
10393 return TreeEntry::Vectorize;
10394 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10395 return TreeEntry::NeedToGather;
10396 }
10397 case Instruction::InsertElement: {
10398 // Check that we have a buildvector and not a shuffle of 2 or more
10399 // different vectors.
10400 ValueSet SourceVectors;
10401 for (Value *V : VL) {
10402 if (isa<PoisonValue>(V)) {
10403 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10404 return TreeEntry::NeedToGather;
10405 }
10406 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10407 assert(getElementIndex(V) != std::nullopt &&
10408 "Non-constant or undef index?");
10409 }
10410
10411 if (count_if(VL, [&SourceVectors](Value *V) {
10412 return !SourceVectors.contains(V);
10413 }) >= 2) {
10414 // Found 2nd source vector - cancel.
10415 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10416 "different source vectors.\n");
10417 return TreeEntry::NeedToGather;
10418 }
10419
10420 if (any_of(VL, [&SourceVectors](Value *V) {
10421 // The last InsertElement can have multiple uses.
10422 return SourceVectors.contains(V) && !V->hasOneUse();
10423 })) {
10424 assert(SLPReVec && "Only supported by REVEC.");
10425 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10426 "multiple uses.\n");
10427 return TreeEntry::NeedToGather;
10428 }
10429
10430 return TreeEntry::Vectorize;
10431 }
10432 case Instruction::Load: {
10433 // Check that a vectorized load would load the same memory as a scalar
10434 // load. For example, we don't want to vectorize loads that are smaller
10435 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10436 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10437 // from such a struct, we read/write packed bits disagreeing with the
10438 // unvectorized version.
10439 auto IsGatheredNode = [&]() {
10440 if (!GatheredLoadsEntriesFirst)
10441 return false;
10442 return all_of(VL, [&](Value *V) {
10443 if (isa<PoisonValue>(V))
10444 return true;
10445 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10446 return TE->Idx >= *GatheredLoadsEntriesFirst;
10447 });
10448 });
10449 };
10450 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10452 return TreeEntry::Vectorize;
10454 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10455 // Delay slow vectorized nodes for better vectorization attempts.
10456 LoadEntriesToVectorize.insert(VectorizableTree.size());
10457 return TreeEntry::NeedToGather;
10458 }
10459 return IsGatheredNode() ? TreeEntry::NeedToGather
10460 : TreeEntry::CompressVectorize;
10462 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10463 // Delay slow vectorized nodes for better vectorization attempts.
10464 LoadEntriesToVectorize.insert(VectorizableTree.size());
10465 return TreeEntry::NeedToGather;
10466 }
10467 return IsGatheredNode() ? TreeEntry::NeedToGather
10468 : TreeEntry::ScatterVectorize;
10470 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10471 // Delay slow vectorized nodes for better vectorization attempts.
10472 LoadEntriesToVectorize.insert(VectorizableTree.size());
10473 return TreeEntry::NeedToGather;
10474 }
10475 return IsGatheredNode() ? TreeEntry::NeedToGather
10476 : TreeEntry::StridedVectorize;
10477 case LoadsState::Gather:
10478#ifndef NDEBUG
10479 Type *ScalarTy = VL0->getType();
10480 if (DL->getTypeSizeInBits(ScalarTy) !=
10481 DL->getTypeAllocSizeInBits(ScalarTy))
10482 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10483 else if (any_of(VL, [](Value *V) {
10484 auto *LI = dyn_cast<LoadInst>(V);
10485 return !LI || !LI->isSimple();
10486 }))
10487 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10488 else
10489 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10490#endif // NDEBUG
10491 registerNonVectorizableLoads(VL);
10492 return TreeEntry::NeedToGather;
10493 }
10494 llvm_unreachable("Unexpected state of loads");
10495 }
10496 case Instruction::ZExt:
10497 case Instruction::SExt:
10498 case Instruction::FPToUI:
10499 case Instruction::FPToSI:
10500 case Instruction::FPExt:
10501 case Instruction::PtrToInt:
10502 case Instruction::IntToPtr:
10503 case Instruction::SIToFP:
10504 case Instruction::UIToFP:
10505 case Instruction::Trunc:
10506 case Instruction::FPTrunc:
10507 case Instruction::BitCast: {
10508 Type *SrcTy = VL0->getOperand(0)->getType();
10509 for (Value *V : VL) {
10510 if (isa<PoisonValue>(V))
10511 continue;
10512 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10513 if (Ty != SrcTy || !isValidElementType(Ty)) {
10514 LLVM_DEBUG(
10515 dbgs() << "SLP: Gathering casts with different src types.\n");
10516 return TreeEntry::NeedToGather;
10517 }
10518 }
10519 return TreeEntry::Vectorize;
10520 }
10521 case Instruction::ICmp:
10522 case Instruction::FCmp: {
10523 // Check that all of the compares have the same predicate.
10524 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10525 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10526 Type *ComparedTy = VL0->getOperand(0)->getType();
10527 for (Value *V : VL) {
10528 if (isa<PoisonValue>(V))
10529 continue;
10530 auto *Cmp = cast<CmpInst>(V);
10531 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10532 Cmp->getOperand(0)->getType() != ComparedTy) {
10533 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10534 return TreeEntry::NeedToGather;
10535 }
10536 }
10537 return TreeEntry::Vectorize;
10538 }
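// Illustrative example, not from the original source: with SwapP0 being the
// swapped form of P0, a bundle such as
//   %c0 = icmp sgt i32 %a0, %b0
//   %c1 = icmp slt i32 %b1, %a1
// still passes the predicate check above, because slt is the swapped
// predicate of sgt and both compares use the same operand type.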
10539 case Instruction::Select:
10540 case Instruction::FNeg:
10541 case Instruction::Add:
10542 case Instruction::FAdd:
10543 case Instruction::Sub:
10544 case Instruction::FSub:
10545 case Instruction::Mul:
10546 case Instruction::FMul:
10547 case Instruction::UDiv:
10548 case Instruction::SDiv:
10549 case Instruction::FDiv:
10550 case Instruction::URem:
10551 case Instruction::SRem:
10552 case Instruction::FRem:
10553 case Instruction::Shl:
10554 case Instruction::LShr:
10555 case Instruction::AShr:
10556 case Instruction::And:
10557 case Instruction::Or:
10558 case Instruction::Xor:
10559 case Instruction::Freeze:
10560 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10561 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10562 auto *I = dyn_cast<Instruction>(V);
10563 return I && I->isBinaryOp() && !I->isFast();
10564 }))
10565 return TreeEntry::NeedToGather;
10566 return TreeEntry::Vectorize;
10567 case Instruction::GetElementPtr: {
10568 // We don't combine GEPs with complicated (nested) indexing.
10569 for (Value *V : VL) {
10570 auto *I = dyn_cast<GetElementPtrInst>(V);
10571 if (!I)
10572 continue;
10573 if (I->getNumOperands() != 2) {
10574 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10575 return TreeEntry::NeedToGather;
10576 }
10577 }
10578
10579 // We can't combine several GEPs into one vector if they operate on
10580 // different types.
10581 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10582 for (Value *V : VL) {
10583 auto *GEP = dyn_cast<GEPOperator>(V);
10584 if (!GEP)
10585 continue;
10586 Type *CurTy = GEP->getSourceElementType();
10587 if (Ty0 != CurTy) {
10588 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10589 return TreeEntry::NeedToGather;
10590 }
10591 }
10592
10593 // We don't combine GEPs with non-constant indexes.
10594 Type *Ty1 = VL0->getOperand(1)->getType();
10595 for (Value *V : VL) {
10596 auto *I = dyn_cast<GetElementPtrInst>(V);
10597 if (!I)
10598 continue;
10599 auto *Op = I->getOperand(1);
10600 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10601 (Op->getType() != Ty1 &&
10602 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10603 Op->getType()->getScalarSizeInBits() >
10604 DL->getIndexSizeInBits(
10605 V->getType()->getPointerAddressSpace())))) {
10606 LLVM_DEBUG(
10607 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10608 return TreeEntry::NeedToGather;
10609 }
10610 }
10611
10612 return TreeEntry::Vectorize;
10613 }
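// Illustrative example, not from the original source: a bundle of GEPs like
//   %g0 = getelementptr i32, ptr %base, i64 4
//   %g1 = getelementptr i32, ptr %base, i64 5
// passes the checks above (two operands each, same source element type,
// constant indices), while a multi-index GEP such as
//   %g2 = getelementptr [8 x i32], ptr %p, i64 0, i64 3
// is rejected as having nested indexes because it has more than two operands.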
10614 case Instruction::Store: {
10615 // Check if the stores are consecutive or if we need to swizzle them.
10616 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10617 // Avoid types that are padded when being allocated as scalars, while
10618 // being packed together in a vector (such as i1).
10619 if (DL->getTypeSizeInBits(ScalarTy) !=
10620 DL->getTypeAllocSizeInBits(ScalarTy)) {
10621 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10622 return TreeEntry::NeedToGather;
10623 }
10624 // Make sure all stores in the bundle are simple - we can't vectorize
10625 // atomic or volatile stores.
10626 for (Value *V : VL) {
10627 auto *SI = cast<StoreInst>(V);
10628 if (!SI->isSimple()) {
10629 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10630 return TreeEntry::NeedToGather;
10631 }
10632 PointerOps.push_back(SI->getPointerOperand());
10633 }
10634
10635 // Check the order of pointer operands.
10636 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10637 Value *Ptr0;
10638 Value *PtrN;
10639 if (CurrentOrder.empty()) {
10640 Ptr0 = PointerOps.front();
10641 PtrN = PointerOps.back();
10642 } else {
10643 Ptr0 = PointerOps[CurrentOrder.front()];
10644 PtrN = PointerOps[CurrentOrder.back()];
10645 }
10646 std::optional<int64_t> Dist =
10647 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10648 // Check that the sorted pointer operands are consecutive.
10649 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10650 return TreeEntry::Vectorize;
10651 }
10652
10653 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10654 return TreeEntry::NeedToGather;
10655 }
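// Illustrative example, not from the original source: for four stores to
// %p, %p+1, %p+2 and %p+3 (in any program order), sortPtrAccesses() recovers
// the sorted order and getPointersDiff() between the first and last sorted
// pointers returns 3 == VL.size() - 1, so the bundle is treated as
// consecutive and vectorized.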
10656 case Instruction::Call: {
10657 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10658 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10659 auto *I = dyn_cast<Instruction>(V);
10660 return I && !I->isFast();
10661 }))
10662 return TreeEntry::NeedToGather;
10663 // Check if the calls are all to the same vectorizable intrinsic or
10664 // library function.
10665 CallInst *CI = cast<CallInst>(VL0);
10666 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10667
10668 VFShape Shape = VFShape::get(
10669 CI->getFunctionType(),
10670 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10671 false /*HasGlobalPred*/);
10672 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10673
10674 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10675 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10676 return TreeEntry::NeedToGather;
10677 }
10678 Function *F = CI->getCalledFunction();
10679 unsigned NumArgs = CI->arg_size();
10680 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10681 for (unsigned J = 0; J != NumArgs; ++J)
10682 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10683 ScalarArgs[J] = CI->getArgOperand(J);
10684 for (Value *V : VL) {
10685 CallInst *CI2 = dyn_cast<CallInst>(V);
10686 if (!CI2 || CI2->getCalledFunction() != F ||
10687 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10688 (VecFunc &&
10689 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10690 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10691 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10692 << "\n");
10693 return TreeEntry::NeedToGather;
10694 }
10695 // Some intrinsics have scalar arguments, which must be the same in order
10696 // for the calls to be vectorized.
10697 for (unsigned J = 0; J != NumArgs; ++J) {
10698 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10699 Value *A1J = CI2->getArgOperand(J);
10700 if (ScalarArgs[J] != A1J) {
10701 LLVM_DEBUG(dbgs()
10702 << "SLP: mismatched arguments in call:" << *CI
10703 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10704 return TreeEntry::NeedToGather;
10705 }
10706 }
10707 }
10708 // Verify that the bundle operands are identical between the two calls.
10709 if (CI->hasOperandBundles() &&
10710 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10711 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10712 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10713 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10714 << "!=" << *V << '\n');
10715 return TreeEntry::NeedToGather;
10716 }
10717 }
10718 SmallVector<Type *> ArgTys =
10719 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10720 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10721 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10722 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10723 return TreeEntry::NeedToGather;
10724
10725 return TreeEntry::Vectorize;
10726 }
10727 case Instruction::ShuffleVector: {
10728 if (!S.isAltShuffle()) {
10729 // REVEC can support non alternate shuffle.
10730 if (SLPReVec && getShufflevectorNumGroups(VL))
10731 return TreeEntry::Vectorize;
10732 // If this is not an alternate sequence of opcode like add-sub
10733 // then do not vectorize this instruction.
10734 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10735 return TreeEntry::NeedToGather;
10736 }
10737 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10738 LLVM_DEBUG(
10739 dbgs()
10740 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10741 "the whole alt sequence is not profitable.\n");
10742 return TreeEntry::NeedToGather;
10743 }
10744
10745 return TreeEntry::Vectorize;
10746 }
10747 default:
10748 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10749 return TreeEntry::NeedToGather;
10750 }
10751}
10752
10753namespace {
10754 /// Allows correct handling of the operands of phi nodes based on the \p Main
10755 /// PHINode's order of incoming basic blocks/values.
10756class PHIHandler {
10757 DominatorTree &DT;
10758 PHINode *Main = nullptr;
10759 SmallVector<Value *> Phis;
10760 SmallVector<SmallVector<Value *>> Operands;
10761
10762public:
10763 PHIHandler() = delete;
10764 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10765 : DT(DT), Main(Main), Phis(Phis),
10766 Operands(Main->getNumIncomingValues(),
10767 SmallVector<Value *>(Phis.size(), nullptr)) {}
10768 void buildOperands() {
10769 constexpr unsigned FastLimit = 4;
10770 if (Main->getNumIncomingValues() <= FastLimit) {
10771 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10772 BasicBlock *InBB = Main->getIncomingBlock(I);
10773 if (!DT.isReachableFromEntry(InBB)) {
10774 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10775 continue;
10776 }
10777 // Prepare the operand vector.
10778 for (auto [Idx, V] : enumerate(Phis)) {
10779 auto *P = dyn_cast<PHINode>(V);
10780 if (!P) {
10781 assert(isa<PoisonValue>(V) &&
10782 "Expected isa instruction or poison value.");
10783 Operands[I][Idx] = V;
10784 continue;
10785 }
10786 if (P->getIncomingBlock(I) == InBB)
10787 Operands[I][Idx] = P->getIncomingValue(I);
10788 else
10789 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10790 }
10791 }
10792 return;
10793 }
10794 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10795 Blocks;
10796 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10797 BasicBlock *InBB = Main->getIncomingBlock(I);
10798 if (!DT.isReachableFromEntry(InBB)) {
10799 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10800 continue;
10801 }
10802 Blocks.try_emplace(InBB).first->second.push_back(I);
10803 }
10804 for (auto [Idx, V] : enumerate(Phis)) {
10805 if (isa<PoisonValue>(V)) {
10806 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10807 Operands[I][Idx] = V;
10808 continue;
10809 }
10810 auto *P = cast<PHINode>(V);
10811 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10812 BasicBlock *InBB = P->getIncomingBlock(I);
10813 if (InBB == Main->getIncomingBlock(I)) {
10814 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10815 continue;
10816 Operands[I][Idx] = P->getIncomingValue(I);
10817 continue;
10818 }
10819 auto *It = Blocks.find(InBB);
10820 if (It == Blocks.end())
10821 continue;
10822 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10823 }
10824 }
10825 for (const auto &P : Blocks) {
10826 ArrayRef<unsigned> IncomingValues = P.second;
10827 if (IncomingValues.size() <= 1)
10828 continue;
10829 unsigned BasicI = IncomingValues.consume_front();
10830 for (unsigned I : IncomingValues) {
10831 assert(all_of(enumerate(Operands[I]),
10832 [&](const auto &Data) {
10833 return !Data.value() ||
10834 Data.value() == Operands[BasicI][Data.index()];
10835 }) &&
10836 "Expected empty operands list.");
10837 Operands[I] = Operands[BasicI];
10838 }
10839 }
10840 }
10841 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10842};
10843} // namespace
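// Usage sketch for PHIHandler, with hypothetical values that are not part of
// the original source: for two phis that both merge %bb0 and %bb1,
//   PHIHandler Handler(DT, MainPhi, {Phi0, Phi1});
//   Handler.buildOperands();
//   ArrayRef<Value *> FromBB0 = Handler.getOperands(0);
// getOperands(I) then returns, lane by lane, the incoming values that
// correspond to the I-th incoming block of the main phi.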
10844
10845 /// Returns the main/alternate instructions for the given \p VL. Unlike
10846 /// getSameOpcode, it supports non-compatible instructions for better
10847 /// SplitVectorize node support.
10848 /// \returns the first main/alt instructions if \p VL contains only poisons and
10849 /// instructions with exactly 2 opcodes; returns a pair of nullptrs otherwise.
10850static std::pair<Instruction *, Instruction *>
10851 getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10852 Instruction *MainOp = nullptr;
10853 Instruction *AltOp = nullptr;
10854 for (Value *V : VL) {
10855 if (isa<PoisonValue>(V))
10856 continue;
10857 auto *I = dyn_cast<Instruction>(V);
10858 if (!I)
10859 return {};
10860 if (!MainOp) {
10861 MainOp = I;
10862 continue;
10863 }
10864 if (MainOp->getOpcode() == I->getOpcode()) {
10865 if (I->getParent() != MainOp->getParent())
10866 return {};
10867 continue;
10868 }
10869 if (!AltOp) {
10870 AltOp = I;
10871 continue;
10872 }
10873 if (AltOp->getOpcode() == I->getOpcode()) {
10874 if (I->getParent() != AltOp->getParent())
10875 return {};
10876 continue;
10877 }
10878 return {};
10879 }
10880 if (!AltOp)
10881 return {};
10882 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10883 "Expected different main and alt instructions.");
10884 return std::make_pair(MainOp, AltOp);
10885}
10886
10887/// Checks that every instruction appears once in the list and if not, packs
10888/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10889/// unique scalars is extended by poison values to the whole register size.
10890///
10891/// \returns false if \p VL could not be uniquified, in which case \p VL is
10892/// unchanged and \p ReuseShuffleIndices is empty.
10893 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10894 SmallVectorImpl<int> &ReuseShuffleIndices,
10895 const TargetTransformInfo &TTI,
10896 const TargetLibraryInfo &TLI,
10897 const InstructionsState &S,
10898 const BoUpSLP::EdgeInfo &UserTreeIdx,
10899 bool TryPad = false) {
10900 // Check that every instruction appears once in this bundle.
10901 SmallVector<Value *> UniqueValues;
10902 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10903 for (Value *V : VL) {
10904 if (isConstant(V)) {
10905 // Constants are always considered distinct, even if the same constant
10906 // appears multiple times in VL.
10907 ReuseShuffleIndices.emplace_back(
10908 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10909 UniqueValues.emplace_back(V);
10910 continue;
10911 }
10912 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10913 ReuseShuffleIndices.emplace_back(Res.first->second);
10914 if (Res.second)
10915 UniqueValues.emplace_back(V);
10916 }
10917
10918 // Easy case: VL has unique values and a "natural" size
10919 size_t NumUniqueScalarValues = UniqueValues.size();
10920 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10921 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10922 if (NumUniqueScalarValues == VL.size() &&
10923 (VectorizeNonPowerOf2 || IsFullVectors)) {
10924 ReuseShuffleIndices.clear();
10925 return true;
10926 }
10927
10928 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10929 if ((UserTreeIdx.UserTE &&
10930 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10931 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10932 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10933 "for nodes with padding.\n");
10934 ReuseShuffleIndices.clear();
10935 return false;
10936 }
10937
10938 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10939 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10940 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10941 return isa<UndefValue>(V) || !isConstant(V);
10942 }))) {
10943 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10944 S.getMainOp()->isSafeToRemove() &&
10945 (S.areInstructionsWithCopyableElements() ||
10946 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10947 // Find the number of elements, which forms full vectors.
10948 unsigned PWSz = getFullVectorNumberOfElements(
10949 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10950 PWSz = std::min<unsigned>(PWSz, VL.size());
10951 if (PWSz == VL.size()) {
10952 // We ended up with the same size after removing duplicates and
10953 // upgrading the resulting vector size to a "nice size". Just keep
10954 // the initial VL then.
10955 ReuseShuffleIndices.clear();
10956 } else {
10957 // Pad unique values with poison to grow the vector to a "nice" size
10958 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10959 UniqueValues.end());
10960 PaddedUniqueValues.append(
10961 PWSz - UniqueValues.size(),
10962 PoisonValue::get(UniqueValues.front()->getType()));
10963 // Check that the list, extended with poisons/copyable operations, is still
10964 // valid for vectorization (div/rem are not allowed).
10965 if ((!S.areInstructionsWithCopyableElements() &&
10966 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10967 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10968 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10969 isa<CallInst>(S.getMainOp())))) {
10970 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10971 ReuseShuffleIndices.clear();
10972 return false;
10973 }
10974 VL = std::move(PaddedUniqueValues);
10975 }
10976 return true;
10977 }
10978 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10979 ReuseShuffleIndices.clear();
10980 return false;
10981 }
10982 VL = std::move(UniqueValues);
10983 return true;
10984}
10985
10986bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10987 const InstructionsState &LocalState,
10988 SmallVectorImpl<Value *> &Op1,
10989 SmallVectorImpl<Value *> &Op2,
10990 OrdersType &ReorderIndices) const {
10991 constexpr unsigned SmallNodeSize = 4;
10992 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10993 !SplitAlternateInstructions)
10994 return false;
10995
10996 // Check if this is a duplicate of another split entry.
10997 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10998 << ".\n");
10999 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11000 if (E->isSame(VL)) {
11001 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11002 << *LocalState.getMainOp() << ".\n");
11003 return false;
11004 }
11005 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11006 if (all_of(VL, [&](Value *V) {
11007 return isa<PoisonValue>(V) || Values.contains(V);
11008 })) {
11009 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11010 return false;
11011 }
11012 }
11013
11014 ReorderIndices.assign(VL.size(), VL.size());
11015 SmallBitVector Op1Indices(VL.size());
11016 for (auto [Idx, V] : enumerate(VL)) {
11017 auto *I = dyn_cast<Instruction>(V);
11018 if (!I) {
11019 Op1.push_back(V);
11020 Op1Indices.set(Idx);
11021 continue;
11022 }
11023 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11024 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
11025 *TLI)) ||
11026 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11027 !isAlternateInstruction(I, LocalState.getMainOp(),
11028 LocalState.getAltOp(), *TLI))) {
11029 Op1.push_back(V);
11030 Op1Indices.set(Idx);
11031 continue;
11032 }
11033 Op2.push_back(V);
11034 }
11035 Type *ScalarTy = getValueType(VL.front());
11036 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
11037 unsigned Opcode0 = LocalState.getOpcode();
11038 unsigned Opcode1 = LocalState.getAltOpcode();
11039 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11040 // Enable split node, only if all nodes do not form legal alternate
11041 // instruction (like X86 addsub).
11042 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
11043 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
11044 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11045 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11046 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
11047 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
11048 return false;
11049 // Enable split node, only if all nodes are power-of-2/full registers.
11050 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11051 for (unsigned Idx : seq<unsigned>(VL.size())) {
11052 if (Op1Indices.test(Idx)) {
11053 ReorderIndices[Op1Cnt] = Idx;
11054 ++Op1Cnt;
11055 } else {
11056 ReorderIndices[Op2Cnt] = Idx;
11057 ++Op2Cnt;
11058 }
11059 }
11060 if (isIdentityOrder(ReorderIndices))
11061 ReorderIndices.clear();
11062 SmallVector<int> Mask;
11063 if (!ReorderIndices.empty())
11064 inversePermutation(ReorderIndices, Mask);
11065 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11066 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
11067 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
11068 // Check for non-profitable single-register ops, which are better represented
11069 // as alternate ops.
11070 if (NumParts >= VL.size())
11071 return false;
11072 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11073 InstructionCost InsertCost = ::getShuffleCost(
11074 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
11075 FixedVectorType *SubVecTy =
11076 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
11077 InstructionCost NewShuffleCost =
11078 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
11079 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11080 (Mask.empty() || InsertCost >= NewShuffleCost))
11081 return false;
11082 if ((LocalState.getMainOp()->isBinaryOp() &&
11083 LocalState.getAltOp()->isBinaryOp() &&
11084 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11085 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11086 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11087 (LocalState.getMainOp()->isUnaryOp() &&
11088 LocalState.getAltOp()->isUnaryOp())) {
11089 InstructionCost OriginalVecOpsCost =
11090 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11091 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11092 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11093 for (unsigned Idx : seq<unsigned>(VL.size())) {
11094 if (isa<PoisonValue>(VL[Idx]))
11095 continue;
11096 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11097 }
11098 InstructionCost OriginalCost =
11099 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
11100 VecTy, OriginalMask, Kind);
11101 InstructionCost NewVecOpsCost =
11102 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11103 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11104 InstructionCost NewCost =
11105 NewVecOpsCost + InsertCost +
11106 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11107 VectorizableTree.front()->getOpcode() == Instruction::Store
11108 ? NewShuffleCost
11109 : 0);
11110 // If not profitable to split - exit.
11111 if (NewCost >= OriginalCost)
11112 return false;
11113 }
11114 return true;
11115}
11116
11117namespace {
11118 /// Class accepts an incoming list of values, checks if it is able to model
11119 /// "copyable" values as compatible operations, and generates the list of values
11120 /// for scheduling and the list of operands for the new nodes.
11121class InstructionsCompatibilityAnalysis {
11122 DominatorTree &DT;
11123 const DataLayout &DL;
11124 const TargetTransformInfo &TTI;
11125 const TargetLibraryInfo &TLI;
11126 unsigned MainOpcode = 0;
11127 Instruction *MainOp = nullptr;
11128
11129 /// Checks if the opcode is supported as the main opcode for copyable
11130 /// elements.
11131 static bool isSupportedOpcode(const unsigned Opcode) {
11132 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11133 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11134 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11135 Opcode == Instruction::And || Opcode == Instruction::Or ||
11136 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11137 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11138 Opcode == Instruction::FDiv;
11139 }
11140
11141 /// Identifies the best candidate value, which represents main opcode
11142 /// operation.
11143 /// Currently the best candidate is the Add instruction with the parent
11144 /// block with the highest DFS incoming number (block, that dominates other).
11145 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11146 BasicBlock *Parent = nullptr;
11147 // Checks if the instruction has supported opcode.
11148 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11149 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11150 return false;
11151 return I && isSupportedOpcode(I->getOpcode()) &&
11152 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
11153 };
11154 // Exclude operands instructions immediately to improve compile time, it
11155 // will be unable to schedule anyway.
11156 SmallDenseSet<Value *, 8> Operands;
11157 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11158 bool AnyUndef = false;
11159 for (Value *V : VL) {
11160 auto *I = dyn_cast<Instruction>(V);
11161 if (!I) {
11162 AnyUndef |= isa<UndefValue>(V);
11163 continue;
11164 }
11165 if (!DT.isReachableFromEntry(I->getParent()))
11166 continue;
11167 if (Candidates.empty()) {
11168 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11169 Parent = I->getParent();
11170 Operands.insert(I->op_begin(), I->op_end());
11171 continue;
11172 }
11173 if (Parent == I->getParent()) {
11174 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11175 Operands.insert(I->op_begin(), I->op_end());
11176 continue;
11177 }
11178 auto *NodeA = DT.getNode(Parent);
11179 auto *NodeB = DT.getNode(I->getParent());
11180 assert(NodeA && "Should only process reachable instructions");
11181 assert(NodeB && "Should only process reachable instructions");
11182 assert((NodeA == NodeB) ==
11183 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11184 "Different nodes should have different DFS numbers");
11185 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11186 Candidates.clear();
11187 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11188 Parent = I->getParent();
11189 Operands.clear();
11190 Operands.insert(I->op_begin(), I->op_end());
11191 }
11192 }
11193 unsigned BestOpcodeNum = 0;
11194 MainOp = nullptr;
11195 bool UsedOutside = false;
11196 for (const auto &P : Candidates) {
11197 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
11198 if (UsedOutside && !PUsedOutside)
11199 continue;
11200 if (!UsedOutside && PUsedOutside)
11201 BestOpcodeNum = 0;
11202 if (P.second.size() < BestOpcodeNum)
11203 continue;
11204 // If there are inner dependencies - skip.
11205 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11206 return Operands.contains(I);
11207 }))
11208 continue;
11209 UsedOutside = PUsedOutside;
11210 for (Instruction *I : P.second) {
11211 if (IsSupportedInstruction(I, AnyUndef)) {
11212 MainOp = I;
11213 BestOpcodeNum = P.second.size();
11214 break;
11215 }
11216 }
11217 }
11218 if (MainOp) {
11219 // Do not match, if any copyable is a terminator from the same block as
11220 // the main operation.
11221 if (any_of(VL, [&](Value *V) {
11222 auto *I = dyn_cast<Instruction>(V);
11223 return I && I->getParent() == MainOp->getParent() &&
11224 I->isTerminator();
11225 })) {
11226 MainOp = nullptr;
11227 return;
11228 }
11229 MainOpcode = MainOp->getOpcode();
11230 }
11231 }
11232
11233 /// Returns the idempotent value for the \p MainOp with the detected \p
11234 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11235 /// the operand itself, since V or V == V.
11236 Value *selectBestIdempotentValue() const {
11237 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11238 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
11239 !MainOp->isCommutative());
11240 }
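// For instance (illustrative, not from the original source),
// ConstantExpr::getBinOpIdentity() yields 0 for Add (x + 0 == x) and an
// all-ones value for And (x & ~0 == x); a copyable lane is then modeled as
// "x <op> identity" so that it still computes the original scalar value.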
11241
11242 /// Returns the value and operands for \p V, considering whether it is an
11243 /// original instruction whose actual operands should be returned, or a
11244 /// copyable element that should be represented as an idempotent instruction.
11245 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11246 if (isa<PoisonValue>(V))
11247 return {V, V};
11248 if (!S.isCopyableElement(V))
11249 return convertTo(cast<Instruction>(V), S).second;
11250 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11251 return {V, selectBestIdempotentValue()};
11252 }
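// Sketch of the effect, not from the original source: if the bundle is
// {add %a, %b; %c} with %c classified as copyable for an Add state, the
// operand lists become {%a, %c} and {%b, 0}, i.e. %c is rebuilt as
// "add %c, 0" alongside the real add.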
11253
11254 /// Builds operands for the original instructions.
11255 void
11256 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11257 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11258
11259 unsigned ShuffleOrOp =
11260 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11261 Instruction *VL0 = S.getMainOp();
11262
11263 switch (ShuffleOrOp) {
11264 case Instruction::PHI: {
11265 auto *PH = cast<PHINode>(VL0);
11266
11267 // Keeps the reordered operands to avoid code duplication.
11268 PHIHandler Handler(DT, PH, VL);
11269 Handler.buildOperands();
11270 Operands.assign(PH->getNumOperands(), {});
11271 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11272 Operands[I].assign(Handler.getOperands(I).begin(),
11273 Handler.getOperands(I).end());
11274 return;
11275 }
11276 case Instruction::ExtractValue:
11277 case Instruction::ExtractElement:
11278 // This is a special case, as it does not gather, but at the same time
11279 // we are not extending buildTree_rec() towards the operands.
11280 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11281 return;
11282 case Instruction::InsertElement:
11283 Operands.assign(2, {VL.size(), nullptr});
11284 for (auto [Idx, V] : enumerate(VL)) {
11285 auto *IE = cast<InsertElementInst>(V);
11286 for (auto [OpIdx, Ops] : enumerate(Operands))
11287 Ops[Idx] = IE->getOperand(OpIdx);
11288 }
11289 return;
11290 case Instruction::Load:
11291 Operands.assign(
11292 1, {VL.size(),
11293 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11294 for (auto [V, Op] : zip(VL, Operands.back())) {
11295 auto *LI = dyn_cast<LoadInst>(V);
11296 if (!LI)
11297 continue;
11298 Op = LI->getPointerOperand();
11299 }
11300 return;
11301 case Instruction::ZExt:
11302 case Instruction::SExt:
11303 case Instruction::FPToUI:
11304 case Instruction::FPToSI:
11305 case Instruction::FPExt:
11306 case Instruction::PtrToInt:
11307 case Instruction::IntToPtr:
11308 case Instruction::SIToFP:
11309 case Instruction::UIToFP:
11310 case Instruction::Trunc:
11311 case Instruction::FPTrunc:
11312 case Instruction::BitCast:
11313 case Instruction::ICmp:
11314 case Instruction::FCmp:
11315 case Instruction::Select:
11316 case Instruction::FNeg:
11317 case Instruction::Add:
11318 case Instruction::FAdd:
11319 case Instruction::Sub:
11320 case Instruction::FSub:
11321 case Instruction::Mul:
11322 case Instruction::FMul:
11323 case Instruction::UDiv:
11324 case Instruction::SDiv:
11325 case Instruction::FDiv:
11326 case Instruction::URem:
11327 case Instruction::SRem:
11328 case Instruction::FRem:
11329 case Instruction::Shl:
11330 case Instruction::LShr:
11331 case Instruction::AShr:
11332 case Instruction::And:
11333 case Instruction::Or:
11334 case Instruction::Xor:
11335 case Instruction::Freeze:
11336 case Instruction::Store:
11337 case Instruction::ShuffleVector:
11338 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11339 for (auto [Idx, V] : enumerate(VL)) {
11340 auto *I = dyn_cast<Instruction>(V);
11341 if (!I) {
11342 for (auto [OpIdx, Ops] : enumerate(Operands))
11343 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11344 continue;
11345 }
11346 auto [Op, ConvertedOps] = convertTo(I, S);
11347 for (auto [OpIdx, Ops] : enumerate(Operands))
11348 Ops[Idx] = ConvertedOps[OpIdx];
11349 }
11350 return;
11351 case Instruction::GetElementPtr: {
11352 Operands.assign(2, {VL.size(), nullptr});
11353 // Need to cast all indices to the same type before vectorization to
11354 // avoid crash.
11355 // Required to be able to find correct matches between different gather
11356 // nodes and reuse the vectorized values rather than trying to gather them
11357 // again.
11358 const unsigned IndexIdx = 1;
11359 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11360 Type *Ty =
11361 all_of(VL,
11362 [&](Value *V) {
11363 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11364 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11365 })
11366 ? VL0Ty
11367 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11368 ->getPointerOperandType()
11369 ->getScalarType());
11370 for (auto [Idx, V] : enumerate(VL)) {
11371 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11372 if (!GEP) {
11373 Operands[0][Idx] = V;
11374 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11375 continue;
11376 }
11377 Operands[0][Idx] = GEP->getPointerOperand();
11378 auto *Op = GEP->getOperand(IndexIdx);
11379 auto *CI = dyn_cast<ConstantInt>(Op);
11380 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11381 CI, Ty, CI->getValue().isSignBitSet(), DL)
11382 : Op;
11383 }
11384 return;
11385 }
11386 case Instruction::Call: {
11387 auto *CI = cast<CallInst>(VL0);
11388 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
11389 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11390 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
11391 continue;
11392 auto &Ops = Operands.emplace_back();
11393 for (Value *V : VL) {
11394 auto *I = dyn_cast<Instruction>(V);
11395 Ops.push_back(I ? I->getOperand(Idx)
11396 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11397 }
11398 }
11399 return;
11400 }
11401 default:
11402 break;
11403 }
11404 llvm_unreachable("Unexpected vectorization of the instructions.");
11405 }
11406
11407public:
11408 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11409 const TargetTransformInfo &TTI,
11410 const TargetLibraryInfo &TLI)
11411 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11412
11413 InstructionsState
11414 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11415 bool TryCopyableElementsVectorization,
11416 bool WithProfitabilityCheck = false,
11417 bool SkipSameCodeCheck = false) {
11418 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11419 ? InstructionsState::invalid()
11420 : getSameOpcode(VL, TLI);
11421 if (S)
11422 return S;
11423 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11424 return S;
11425 findAndSetMainInstruction(VL, R);
11426 if (!MainOp)
11427 return InstructionsState::invalid();
11428 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11429 if (!WithProfitabilityCheck)
11430 return S;
11431 // Check if it is profitable to vectorize the instruction.
11432 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11433 auto BuildCandidates =
11434 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11435 Value *V2) {
11436 if (V1 != V2 && isa<PHINode>(V1))
11437 return;
11438 auto *I1 = dyn_cast<Instruction>(V1);
11439 auto *I2 = dyn_cast<Instruction>(V2);
11440 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11441 I1->getParent() != I2->getParent())
11442 return;
11443 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11444 };
11445 if (VL.size() == 2) {
11446 // Check if the operands allow better vectorization.
11447 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11448 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11449 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11450 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11451 R.findBestRootPair(Candidates1) &&
11452 R.findBestRootPair(Candidates2);
11453 if (!Res && isCommutative(MainOp)) {
11454 Candidates1.clear();
11455 Candidates2.clear();
11456 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11457 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11458 Res = !Candidates1.empty() && !Candidates2.empty() &&
11459 R.findBestRootPair(Candidates1) &&
11460 R.findBestRootPair(Candidates2);
11461 }
11462 if (!Res)
11463 return InstructionsState::invalid();
11464 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11465 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11466 InstructionCost VectorCost;
11467 FixedVectorType *VecTy =
11468 getWidenedType(S.getMainOp()->getType(), VL.size());
11469 switch (MainOpcode) {
11470 case Instruction::Add:
11471 case Instruction::Sub:
11472 case Instruction::LShr:
11473 case Instruction::Shl:
11474 case Instruction::SDiv:
11475 case Instruction::UDiv:
11476 case Instruction::And:
11477 case Instruction::Or:
11478 case Instruction::Xor:
11479 case Instruction::FAdd:
11480 case Instruction::FMul:
11481 case Instruction::FSub:
11482 case Instruction::FDiv:
11483 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11484 break;
11485 default:
11486 llvm_unreachable("Unexpected instruction.");
11487 }
11488 if (VectorCost > ScalarCost)
11489 return InstructionsState::invalid();
11490 return S;
11491 }
11492 assert(Operands.size() == 2 && "Unexpected number of operands!");
11493 unsigned CopyableNum =
11494 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11495 if (CopyableNum < VL.size() / 2)
11496 return S;
11497 // Too many phi copyables - exit.
11498 const unsigned Limit = VL.size() / 24;
11499 if ((CopyableNum >= VL.size() - Limit ||
11500 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11501 CopyableNum >= MaxPHINumOperands) &&
11502 all_of(VL, [&](Value *V) {
11503 return isa<PHINode>(V) || !S.isCopyableElement(V);
11504 }))
11505 return InstructionsState::invalid();
11506 // Check profitability if number of copyables > VL.size() / 2.
11507 // 1. Reorder operands for better matching.
11508 if (isCommutative(MainOp)) {
11509 for (auto &Ops : Operands) {
11510 // Make instructions the first operands.
11511 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11512 std::swap(Ops.front(), Ops.back());
11513 continue;
11514 }
11515 // Make constants the second operands.
11516 if (isa<Constant>(Ops.front())) {
11517 std::swap(Ops.front(), Ops.back());
11518 continue;
11519 }
11520 }
11521 }
11522 // 2. Check if operands can be vectorized.
11523 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11524 return InstructionsState::invalid();
11525 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11526 if (allConstant(Ops) || isSplat(Ops))
11527 return true;
11528 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11529 // single one is different.
11530 constexpr unsigned Limit = 4;
11531 if (Operands.front().size() >= Limit) {
11532 SmallDenseMap<const Value *, unsigned> Counters;
11533 for (Value *V : Ops) {
11534 if (isa<UndefValue>(V))
11535 continue;
11536 ++Counters[V];
11537 }
11538 if (Counters.size() == 2 &&
11539 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11540 return C.second == 1;
11541 }))
11542 return true;
11543 }
11544 // First operand not a constant or splat? Last attempt - check for
11545 // potential vectorization.
11546 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11547 InstructionsState OpS = Analysis.buildInstructionsState(
11548 Ops, R, /*TryCopyableElementsVectorization=*/true);
11549 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11550 return false;
11551 unsigned CopyableNum =
11552 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11553 return CopyableNum <= VL.size() / 2;
11554 };
11555 if (!CheckOperand(Operands.front()))
11556 return InstructionsState::invalid();
11557
11558 return S;
11559 }
11560
11561 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11562 ArrayRef<Value *> VL) {
11563 assert(S && "Invalid state!");
11564 SmallVector<BoUpSLP::ValueList> Operands;
11565 if (S.areInstructionsWithCopyableElements()) {
11566 MainOp = S.getMainOp();
11567 MainOpcode = S.getOpcode();
11568 Operands.assign(MainOp->getNumOperands(),
11569 BoUpSLP::ValueList(VL.size(), nullptr));
11570 for (auto [Idx, V] : enumerate(VL)) {
11571 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11572 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11573 Operands[OperandIdx][Idx] = Operand;
11574 }
11575 } else {
11576 buildOriginalOperands(S, VL, Operands);
11577 }
11578 return Operands;
11579 }
11580};
11581} // namespace
11582
11583BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11584 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11585 bool TryCopyableElementsVectorization) const {
11586 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11587
11588 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11589 InstructionsState S = Analysis.buildInstructionsState(
11590 VL, *this, TryCopyableElementsVectorization,
11591 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11592
11593 bool AreScatterAllGEPSameBlock = false;
11594 if (!S) {
11595 SmallVector<unsigned> SortedIndices;
11596 BasicBlock *BB = nullptr;
11597 bool IsScatterVectorizeUserTE =
11598 UserTreeIdx.UserTE &&
11599 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11600 AreScatterAllGEPSameBlock =
11601 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11602 VL.size() > 2 &&
11603 all_of(VL,
11604 [&BB](Value *V) {
11605 auto *I = dyn_cast<GetElementPtrInst>(V);
11606 if (!I)
11607 return doesNotNeedToBeScheduled(V);
11608 if (!BB)
11609 BB = I->getParent();
11610 return BB == I->getParent() && I->getNumOperands() == 2;
11611 }) &&
11612 BB &&
11613 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
11614 *SE, SortedIndices));
11615 if (!AreScatterAllGEPSameBlock) {
11616 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11617 "C,S,B,O, small shuffle. \n";
11618 dbgs() << "[";
11619 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11620 dbgs() << "]\n");
11621 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11622 /*TryToFindDuplicates=*/true,
11623 /*TrySplitVectorize=*/true);
11624 }
11625 // Reset S to make it GetElementPtr kind of node.
11626 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11627 assert(It != VL.end() && "Expected at least one GEP.");
11628 S = getSameOpcode(*It, *TLI);
11629 }
11630 assert(S && "Must be valid.");
11631
11632 // Don't handle vectors.
11633 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11634 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11635 // Do not try to pack to avoid extra instructions here.
11636 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11637 /*TryToFindDuplicates=*/false);
11638 }
11639
11640 // Check that all of the users of the scalars that we want to vectorize are
11641 // schedulable.
11642 BasicBlock *BB = S.getMainOp()->getParent();
11643
11644 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11645 !DT->isReachableFromEntry(BB)) {
11646 // Don't go into unreachable blocks. They may contain instructions with
11647 // dependency cycles which confuse the final scheduling.
11648 // Do not vectorize EH and non-returning blocks, not profitable in most
11649 // cases.
11650 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11651 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11652 }
11653
11654 // Don't go into catchswitch blocks, which can happen with PHIs.
11655 // Such blocks can only have PHIs and the catchswitch. There is no
11656 // place to insert a shuffle if we need to, so just avoid that issue.
11657 if (isa<CatchSwitchInst>(BB->getTerminator())) {
11658 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11659 // Do not try to pack to avoid extra instructions here.
11660 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11661 /*TryToFindDuplicates=*/false);
11662 }
11663
11664 // Don't handle scalable vectors
11665 if (S.getOpcode() == Instruction::ExtractElement &&
11666 isa<ScalableVectorType>(
11667 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11668 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11669 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11670 }
11671
11672 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11673 // a load), in which case peek through to include it in the tree, without
11674 // ballooning over-budget.
11675 if (Depth >= RecursionMaxDepth &&
11676 (S.isAltShuffle() || VL.size() < 4 ||
11677 !(match(S.getMainOp(), m_Load(m_Value())) ||
11678 all_of(VL, [&S](const Value *I) {
11679 return match(I,
11680 m_ZExtOrSExtOrSelf(m_Load(m_Value()))) &&
11681 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11682 })))) {
11683 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11684 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11685 }
11686
11687 // Check if this is a duplicate of another entry.
11688 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11689 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11690 if (E->isSame(VL)) {
11691 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11692 << ".\n");
11693 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11694 }
11695 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11696 if (all_of(VL, [&](Value *V) {
11697 return isa<PoisonValue>(V) || Values.contains(V) ||
11698 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11699 LI->getLoopFor(S.getMainOp()->getParent()) &&
11700 isVectorized(V));
11701 })) {
11702 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11703 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11704 }
11705 }
11706
11707 // If all of the operands are identical or constant we have a simple solution.
11708 // If we deal with insert/extract instructions, they all must have constant
11709 // indices, otherwise we should gather them, not try to vectorize.
11710 // If alternate op node with 2 elements with gathered operands - do not
11711 // vectorize.
11712 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11713 if (!S || !S.isAltShuffle() || VL.size() > 2)
11714 return false;
11715 if (VectorizableTree.size() < MinTreeSize)
11716 return false;
11717 if (Depth >= RecursionMaxDepth - 1)
11718 return true;
11719 // Check if all operands are extracts, part of vector node or can build a
11720 // regular vectorize node.
11721 SmallVector<unsigned, 8> InstsCount;
11722 for (Value *V : VL) {
11723 auto *I = cast<Instruction>(V);
11724 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11725 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11726 }));
11727 }
11728 bool IsCommutative =
11729 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11730 if ((IsCommutative &&
11731 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11732 (!IsCommutative &&
11733 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11734 return true;
11735 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11736 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11737 auto *I1 = cast<Instruction>(VL.front());
11738 auto *I2 = cast<Instruction>(VL.back());
11739 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11740 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11741 I2->getOperand(Op));
11742 if (static_cast<unsigned>(count_if(
11743 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11744 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11745 })) >= S.getMainOp()->getNumOperands() / 2)
11746 return false;
11747 if (S.getMainOp()->getNumOperands() > 2)
11748 return true;
11749 if (IsCommutative) {
11750 // Check permuted operands.
11751 Candidates.clear();
11752 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11753 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11754 I2->getOperand((Op + 1) % E));
11755 if (any_of(
11756 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11757 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11758 }))
11759 return false;
11760 }
11761 return true;
11762 };
11763 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11764 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11765 if (!AreAllSameInsts || isSplat(VL) ||
11766 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11767 S.getMainOp()) &&
11768 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11769 NotProfitableForVectorization(VL)) {
11770 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11771 dbgs() << "[";
11772 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11773 dbgs() << "]\n");
11774 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11775 }
11776
11777 // Don't vectorize ephemeral values.
11778 if (!EphValues.empty()) {
11779 for (Value *V : VL) {
11780 if (EphValues.count(V)) {
11781 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11782 << ") is ephemeral.\n");
11783 // Do not try to pack to avoid extra instructions here.
11784 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11785 /*TryToFindDuplicates=*/false);
11786 }
11787 }
11788 }
11789
11790 // We now know that this is a vector of instructions of the same type from
11791 // the same block.
11792
11793 // Check that none of the instructions in the bundle are already in the tree
11794 // and the node may be not profitable for the vectorization as the small
11795 // alternate node.
11796 if (S.isAltShuffle()) {
11797 auto GetNumVectorizedExtracted = [&]() {
11798 APInt Extracted = APInt::getZero(VL.size());
11799 APInt Vectorized = APInt::getAllOnes(VL.size());
11800 for (auto [Idx, V] : enumerate(VL)) {
11801 auto *I = dyn_cast<Instruction>(V);
11802 if (!I || doesNotNeedToBeScheduled(I) ||
11803 all_of(I->operands(), [&](const Use &U) {
11804 return isa<ExtractElementInst>(U.get());
11805 }))
11806 continue;
11807 if (isVectorized(I))
11808 Vectorized.clearBit(Idx);
11809 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11810 Extracted.setBit(Idx);
11811 }
11812 return std::make_pair(Vectorized, Extracted);
11813 };
11814 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11815 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11816 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11817 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11818 // Rough cost estimation, if the vector code (+ potential extracts) is
11819 // more profitable than the scalar + buildvector.
11820 Type *ScalarTy = VL.front()->getType();
11821 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11822 InstructionCost VectorizeCostEstimate =
11823 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11824 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11825 /*Insert=*/false, /*Extract=*/true, Kind);
11826 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11827 *TTI, ScalarTy, VecTy, Vectorized,
11828 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11829 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11830 }
11831 if (PreferScalarize) {
11832 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11833 "node is not profitable.\n");
11834 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11835 }
11836 }
11837
11838 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11839 if (UserIgnoreList && !UserIgnoreList->empty()) {
11840 for (Value *V : VL) {
11841 if (UserIgnoreList->contains(V)) {
11842 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11843 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11844 }
11845 }
11846 }
11847
11848 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11849}
11850
11851void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11852 const EdgeInfo &UserTreeIdx,
11853 unsigned InterleaveFactor) {
11854 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11855
11856 SmallVector<int> ReuseShuffleIndices;
11857 SmallVector<Value *> VL(VLRef);
11858
11859 // Tries to build split node.
11860 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11861 SmallVector<Value *> Op1, Op2;
11862 OrdersType ReorderIndices;
11863 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11864 return false;
11865
11866 auto Invalid = ScheduleBundle::invalid();
11867 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11868 UserTreeIdx, {}, ReorderIndices);
11869 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11870 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11871 InstructionsState S = getSameOpcode(Op, *TLI);
11872 if (S && (isa<LoadInst>(S.getMainOp()) ||
11873 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11874 // Build gather node for loads, they will be gathered later.
11875 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11876 Idx == 0 ? 0 : Op1.size());
11877 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11878 } else {
11879 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11880 Idx == 0 ? 0 : Op1.size());
11881 buildTreeRec(Op, Depth, {TE, Idx});
11882 }
11883 };
11884 AddNode(Op1, 0);
11885 AddNode(Op2, 1);
11886 return true;
11887 };
11888
11889 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11890 bool AreConsts = false;
11891 for (Value *V : VL) {
11892 if (isa<PoisonValue>(V))
11893 continue;
11894 if (isa<Constant>(V)) {
11895 AreConsts = true;
11896 continue;
11897 }
11898 if (!isa<PHINode>(V))
11899 return false;
11900 }
11901 return AreConsts;
11902 };
11903 if (AreOnlyConstsWithPHIs(VL)) {
11904 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11905 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11906 return;
11907 }
11908
11909 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11910 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11911 InstructionsState S = Legality.getInstructionsState();
11912 if (!Legality.isLegal()) {
11913 if (Legality.trySplitVectorize()) {
11914 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11915 // Last chance to try to vectorize alternate node.
11916 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11917 return;
11918 }
11919 if (!S)
11920 Legality = getScalarsVectorizationLegality(
11921 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11922 if (!Legality.isLegal()) {
11923 if (Legality.tryToFindDuplicates())
11924 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11925 UserTreeIdx);
11926
11927 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11928 return;
11929 }
11930 S = Legality.getInstructionsState();
11931 }
11932
11933 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11934 if (S.isAltShuffle() && TrySplitNode(S))
11935 return;
11936
11937 // Check that every instruction appears once in this bundle.
11938 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11939 /*TryPad=*/true)) {
11940 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11941 return;
11942 }
11943
11944 // Perform specific checks for each particular instruction kind.
11945 bool IsScatterVectorizeUserTE =
11946 UserTreeIdx.UserTE &&
11947 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11948 OrdersType CurrentOrder;
11949 SmallVector<Value *> PointerOps;
11950 StridedPtrInfo SPtrInfo;
11951 TreeEntry::EntryState State = getScalarsVectorizationState(
11952 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11953 if (State == TreeEntry::NeedToGather) {
11954 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11955 return;
11956 }
11957
11958 Instruction *VL0 = S.getMainOp();
11959 BasicBlock *BB = VL0->getParent();
11960 auto &BSRef = BlocksSchedules[BB];
11961 if (!BSRef)
11962 BSRef = std::make_unique<BlockScheduling>(BB);
11963
11964 BlockScheduling &BS = *BSRef;
11965
11966 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11967 std::optional<ScheduleBundle *> BundlePtr =
11968 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11969#ifdef EXPENSIVE_CHECKS
11970 // Make sure we didn't break any internal invariants
11971 BS.verify();
11972#endif
11973 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11974 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11975 // Last chance to try to vectorize alternate node.
11976 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11977 return;
11978 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11979 NonScheduledFirst.insert(VL.front());
11980 if (S.getOpcode() == Instruction::Load &&
11981 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11982 registerNonVectorizableLoads(VL);
11982 registerNonVectorizableLoads(VL);
11983 return;
11984 }
11985 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11986 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11987 ScheduleBundle Empty;
11988 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11989 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11990
11991 unsigned ShuffleOrOp =
11992 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11993 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11994 // Postpone PHI nodes creation
11995 SmallVector<unsigned> PHIOps;
11996 for (unsigned I : seq<unsigned>(Operands.size())) {
11997 ArrayRef<Value *> Op = Operands[I];
11998 if (Op.empty())
11999 continue;
12000 InstructionsState S = getSameOpcode(Op, *TLI);
12001 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12002 buildTreeRec(Op, Depth + 1, {TE, I});
12003 else
12004 PHIOps.push_back(I);
12005 }
12006 for (unsigned I : PHIOps)
12007 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12008 };
12009 switch (ShuffleOrOp) {
12010 case Instruction::PHI: {
12011 TreeEntry *TE =
12012 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12013 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
12014 TE->dump());
12015
12016 TE->setOperands(Operands);
12017 CreateOperandNodes(TE, Operands);
12018 return;
12019 }
12020 case Instruction::ExtractValue:
12021 case Instruction::ExtractElement: {
12022 if (CurrentOrder.empty()) {
12023 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
12024 } else {
12025 LLVM_DEBUG({
12026 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
12027 "with order";
12028 for (unsigned Idx : CurrentOrder)
12029 dbgs() << " " << Idx;
12030 dbgs() << "\n";
12031 });
12032 fixupOrderingIndices(CurrentOrder);
12033 }
12034 // Insert new order with initial value 0, if it does not exist,
12035 // otherwise return the iterator to the existing one.
12036 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12037 ReuseShuffleIndices, CurrentOrder);
12038 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
12039 "(ExtractValueInst/ExtractElementInst).\n";
12040 TE->dump());
12041 // This is a special case, as it does not gather, but at the same time
12042 // we are not extending buildTreeRec() towards the operands.
12043 TE->setOperands(Operands);
12044 return;
12045 }
12046 case Instruction::InsertElement: {
12047 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
12048
12049 auto OrdCompare = [](const std::pair<int, int> &P1,
12050 const std::pair<int, int> &P2) {
12051 return P1.first > P2.first;
12052 };
12053 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
12054 decltype(OrdCompare)>
12055 Indices(OrdCompare);
12056 for (int I = 0, E = VL.size(); I < E; ++I) {
12057 unsigned Idx = *getElementIndex(VL[I]);
12058 Indices.emplace(Idx, I);
12059 }
12060 OrdersType CurrentOrder(VL.size(), VL.size());
12061 bool IsIdentity = true;
12062 for (int I = 0, E = VL.size(); I < E; ++I) {
12063 CurrentOrder[Indices.top().second] = I;
12064 IsIdentity &= Indices.top().second == I;
12065 Indices.pop();
12066 }
12067 if (IsIdentity)
12068 CurrentOrder.clear();
12069 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12070 {}, CurrentOrder);
12071 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
12072 TE->dump());
12073
12074 TE->setOperands(Operands);
12075 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
12076 return;
12077 }
12078 case Instruction::Load: {
12079 // Check that a vectorized load would load the same memory as a scalar
12080 // load. For example, we don't want to vectorize loads that are smaller
12081 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
12082 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
12083 // from such a struct, we would read/write packed bits, disagreeing with the
12084 // unvectorized version.
12085 TreeEntry *TE = nullptr;
12086 fixupOrderingIndices(CurrentOrder);
12087 switch (State) {
12088 case TreeEntry::Vectorize:
12089 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12090 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
12091 if (CurrentOrder.empty())
12092 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
12093 TE->dump());
12094 else
12095 LLVM_DEBUG(dbgs()
12096 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
12097 TE->dump());
12098 break;
12099 case TreeEntry::CompressVectorize:
12100 // Vectorizing non-consecutive loads with (masked)load + compress.
12101 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
12102 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12103 LLVM_DEBUG(
12104 dbgs()
12105 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12106 TE->dump());
12107 break;
12108 case TreeEntry::StridedVectorize:
12109 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12110 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
12111 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12112 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
12113 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
12114 TE->dump());
12115 break;
12116 case TreeEntry::ScatterVectorize:
12117 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12118 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
12119 UserTreeIdx, ReuseShuffleIndices);
12120 LLVM_DEBUG(
12121 dbgs()
12122 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12123 TE->dump());
12124 break;
12125 case TreeEntry::CombinedVectorize:
12126 case TreeEntry::SplitVectorize:
12127 case TreeEntry::NeedToGather:
12128 llvm_unreachable("Unexpected loads state.");
12129 }
12130 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12131 assert(Operands.size() == 1 && "Expected a single operand only");
12132 SmallVector<int> Mask;
12133 inversePermutation(CurrentOrder, Mask);
12134 reorderScalars(Operands.front(), Mask);
12135 }
12136 TE->setOperands(Operands);
12137 if (State == TreeEntry::ScatterVectorize)
12138 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
12139 return;
12140 }
12141 case Instruction::ZExt:
12142 case Instruction::SExt:
12143 case Instruction::FPToUI:
12144 case Instruction::FPToSI:
12145 case Instruction::FPExt:
12146 case Instruction::PtrToInt:
12147 case Instruction::IntToPtr:
12148 case Instruction::SIToFP:
12149 case Instruction::UIToFP:
12150 case Instruction::Trunc:
12151 case Instruction::FPTrunc:
12152 case Instruction::BitCast: {
12153 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12154 std::make_pair(std::numeric_limits<unsigned>::min(),
12155 std::numeric_limits<unsigned>::max()));
12156 if (ShuffleOrOp == Instruction::ZExt ||
12157 ShuffleOrOp == Instruction::SExt) {
12158 CastMaxMinBWSizes = std::make_pair(
12159 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12160 PrevMaxBW),
12161 std::min<unsigned>(
12162 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12163 PrevMinBW));
12164 } else if (ShuffleOrOp == Instruction::Trunc) {
12165 CastMaxMinBWSizes = std::make_pair(
12166 std::max<unsigned>(
12167 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12168 PrevMaxBW),
12169 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12170 PrevMinBW));
12171 }
12172 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12173 ReuseShuffleIndices);
12174 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
12175 TE->dump());
12176
12177 TE->setOperands(Operands);
12178 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12179 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12180 if (ShuffleOrOp == Instruction::Trunc) {
12181 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12182 } else if (ShuffleOrOp == Instruction::SIToFP ||
12183 ShuffleOrOp == Instruction::UIToFP) {
12184 unsigned NumSignBits =
12185 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12186 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
12187 APInt Mask = DB->getDemandedBits(OpI);
12188 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
12189 }
12190 if (NumSignBits * 2 >=
12191 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12192 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12193 }
12194 return;
12195 }
12196 case Instruction::ICmp:
12197 case Instruction::FCmp: {
12198 // Check that all of the compares have the same predicate.
12199 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12200 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12201 ReuseShuffleIndices);
12202 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
12203 TE->dump());
12204
12205 VLOperands Ops(VL, Operands, S, *this);
12206 if (cast<CmpInst>(VL0)->isCommutative()) {
12207 // Commutative predicate - collect + sort operands of the instructions
12208 // so that each side is more likely to have the same opcode.
12209 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
12210 "Commutative Predicate mismatch");
12211 Ops.reorder();
12212 Operands.front() = Ops.getVL(0);
12213 Operands.back() = Ops.getVL(1);
12214 } else {
12215 // Collect operands - commute if it uses the swapped predicate.
12216 for (auto [Idx, V] : enumerate(VL)) {
12217 if (isa<PoisonValue>(V))
12218 continue;
12219 auto *Cmp = cast<CmpInst>(V);
12220 if (Cmp->getPredicate() != P0)
12221 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12222 }
12223 }
12224 TE->setOperands(Operands);
12225 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12226 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12227 if (ShuffleOrOp == Instruction::ICmp) {
12228 unsigned NumSignBits0 =
12229 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12230 if (NumSignBits0 * 2 >=
12231 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12232 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12233 unsigned NumSignBits1 =
12234 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
12235 if (NumSignBits1 * 2 >=
12236 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
12237 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
12238 }
12239 return;
12240 }
12241 case Instruction::Select:
12242 case Instruction::FNeg:
12243 case Instruction::Add:
12244 case Instruction::FAdd:
12245 case Instruction::Sub:
12246 case Instruction::FSub:
12247 case Instruction::Mul:
12248 case Instruction::FMul:
12249 case Instruction::UDiv:
12250 case Instruction::SDiv:
12251 case Instruction::FDiv:
12252 case Instruction::URem:
12253 case Instruction::SRem:
12254 case Instruction::FRem:
12255 case Instruction::Shl:
12256 case Instruction::LShr:
12257 case Instruction::AShr:
12258 case Instruction::And:
12259 case Instruction::Or:
12260 case Instruction::Xor:
12261 case Instruction::Freeze: {
12262 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12263 ReuseShuffleIndices);
12264 LLVM_DEBUG(
12265 dbgs() << "SLP: added a new TreeEntry "
12266 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12267 TE->dump());
12268
12269 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
12270 VLOperands Ops(VL, Operands, S, *this);
12271 Ops.reorder();
12272 Operands[0] = Ops.getVL(0);
12273 Operands[1] = Ops.getVL(1);
12274 }
12275 TE->setOperands(Operands);
12276 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12277 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12278 return;
12279 }
12280 case Instruction::GetElementPtr: {
12281 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12282 ReuseShuffleIndices);
12283 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12284 TE->dump());
12285 TE->setOperands(Operands);
12286
12287 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12288 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12289 return;
12290 }
12291 case Instruction::Store: {
12292 bool Consecutive = CurrentOrder.empty();
12293 if (!Consecutive)
12294 fixupOrderingIndices(CurrentOrder);
12295 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12296 ReuseShuffleIndices, CurrentOrder);
12297 if (Consecutive)
12298 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12299 TE->dump());
12300 else
12301 LLVM_DEBUG(
12302 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12303 TE->dump());
12304 TE->setOperands(Operands);
12305 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
12306 return;
12307 }
12308 case Instruction::Call: {
12309 // Check if the calls are all to the same vectorizable intrinsic or
12310 // library function.
12311 CallInst *CI = cast<CallInst>(VL0);
12312 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12313
12314 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12315 ReuseShuffleIndices);
12316 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12317 TE->dump());
12318 if (isCommutative(VL0)) {
12319 VLOperands Ops(VL, Operands, S, *this);
12320 Ops.reorder();
12321 Operands[0] = Ops.getVL(0);
12322 Operands[1] = Ops.getVL(1);
12323 }
12324 TE->setOperands(Operands);
12325 for (unsigned I : seq<unsigned>(CI->arg_size())) {
12326 // For scalar operands there is no need to create an entry since we do
12327 // not need to vectorize them.
12328 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
12329 continue;
12330 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12331 }
12332 return;
12333 }
12334 case Instruction::ShuffleVector: {
12335 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12336 ReuseShuffleIndices);
12337 if (S.isAltShuffle()) {
12338 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12339 TE->dump());
12340 } else {
12341 assert(SLPReVec && "Only supported by REVEC.");
12342 LLVM_DEBUG(
12343 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12344 TE->dump());
12345 }
12346
12347 // Reorder operands if reordering would enable vectorization.
12348 auto *CI = dyn_cast<CmpInst>(VL0);
12349 if (CI && any_of(VL, [](Value *V) {
12350 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
12351 })) {
12352 auto *MainCI = cast<CmpInst>(S.getMainOp());
12353 auto *AltCI = cast<CmpInst>(S.getAltOp());
12354 CmpInst::Predicate MainP = MainCI->getPredicate();
12355 CmpInst::Predicate AltP = AltCI->getPredicate();
12356 assert(MainP != AltP &&
12357 "Expected different main/alternate predicates.");
12358 // Collect operands - commute if it uses the swapped predicate or
12359 // alternate operation.
12360 for (auto [Idx, V] : enumerate(VL)) {
12361 if (isa<PoisonValue>(V))
12362 continue;
12363 auto *Cmp = cast<CmpInst>(V);
12364
12365 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
12366 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12367 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12368 } else {
12369 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12370 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12371 }
12372 }
12373 TE->setOperands(Operands);
12374 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12375 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12376 return;
12377 }
12378
12379 if (isa<BinaryOperator>(VL0) || CI) {
12380 VLOperands Ops(VL, Operands, S, *this);
12381 Ops.reorder();
12382 Operands[0] = Ops.getVL(0);
12383 Operands[1] = Ops.getVL(1);
12384 }
12385 TE->setOperands(Operands);
12386 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12387 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12388 return;
12389 }
12390 default:
12391 break;
12392 }
12393 llvm_unreachable("Unexpected vectorization of the instructions.");
12394}
12395
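// A short note on the helper below (its contract, as inferred from the body): it
// returns the number of scalar elements the aggregate type T can be flattened
// into, or 0 if T cannot be mapped to a vector. E.g., a homogeneous struct
// { float, float, float, float } yields 4 with element type float, subject to
// the MinVecRegSize/MaxVecRegSize and store-size checks performed below.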
12396unsigned BoUpSLP::canMapToVector(Type *T) const {
12397 unsigned N = 1;
12398 Type *EltTy = T;
12399
12400 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12401 if (EltTy->isEmptyTy())
12402 return 0;
12403 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12404 // Check that struct is homogeneous.
12405 for (const auto *Ty : ST->elements())
12406 if (Ty != *ST->element_begin())
12407 return 0;
12408 N *= ST->getNumElements();
12409 EltTy = *ST->element_begin();
12410 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12411 N *= AT->getNumElements();
12412 EltTy = AT->getElementType();
12413 } else {
12414 auto *VT = cast<FixedVectorType>(EltTy);
12415 N *= VT->getNumElements();
12416 EltTy = VT->getElementType();
12417 }
12418 }
12419
12420 if (!isValidElementType(EltTy))
12421 return 0;
12422 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12423 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12424 VTSize != DL->getTypeStoreSizeInBits(T))
12425 return 0;
12426 return N;
12427}
12428
12429bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12430 SmallVectorImpl<unsigned> &CurrentOrder,
12431 bool ResizeAllowed) const {
12432 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12433 assert(It != VL.end() && "Expected at least one extract instruction.");
12434 auto *E0 = cast<Instruction>(*It);
12435 assert(
12436 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
12437 "Invalid opcode");
12438 // Check if all of the extracts come from the same vector and from the
12439 // correct offset.
12440 Value *Vec = E0->getOperand(0);
12441
12442 CurrentOrder.clear();
12443
12444 // We have to extract from a vector/aggregate with the same number of elements.
12445 unsigned NElts;
12446 if (E0->getOpcode() == Instruction::ExtractValue) {
12447 NElts = canMapToVector(Vec->getType());
12448 if (!NElts)
12449 return false;
12450 // Check if load can be rewritten as load of vector.
12451 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12452 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12453 return false;
12454 } else {
12455 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12456 }
12457
12458 unsigned E = VL.size();
12459 if (!ResizeAllowed && NElts != E)
12460 return false;
12461 SmallVector<int> Indices(E, PoisonMaskElem);
12462 unsigned MinIdx = NElts, MaxIdx = 0;
12463 for (auto [I, V] : enumerate(VL)) {
12464 auto *Inst = dyn_cast<Instruction>(V);
12465 if (!Inst)
12466 continue;
12467 if (Inst->getOperand(0) != Vec)
12468 return false;
12469 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12470 if (isa<UndefValue>(EE->getIndexOperand()))
12471 continue;
12472 std::optional<unsigned> Idx = getExtractIndex(Inst);
12473 if (!Idx)
12474 return false;
12475 const unsigned ExtIdx = *Idx;
12476 if (ExtIdx >= NElts)
12477 continue;
12478 Indices[I] = ExtIdx;
12479 if (MinIdx > ExtIdx)
12480 MinIdx = ExtIdx;
12481 if (MaxIdx < ExtIdx)
12482 MaxIdx = ExtIdx;
12483 }
12484 if (MaxIdx - MinIdx + 1 > E)
12485 return false;
12486 if (MaxIdx + 1 <= E)
12487 MinIdx = 0;
12488
12489 // Check that all of the indices extract from the correct offset.
12490 bool ShouldKeepOrder = true;
12491 // Assign to all items the initial value E so we can check if the extract
12492 // instruction index was used already.
12493 // Also, later we can check that all the indices are used and we have a
12494 // consecutive access in the extract instructions, by checking that no
12495 // element of CurrentOrder still has value E.
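// For example (hypothetical indices): extracts at indices {0, 1, 2, 3} from a
// 4-element vector form the identity order, so CurrentOrder is cleared and true
// is returned; extracts at {1, 0, 3, 2} fill CurrentOrder = {1, 0, 3, 2} and
// false is returned, leaving the non-identity order for the caller to handle.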
12496 CurrentOrder.assign(E, E);
12497 for (unsigned I = 0; I < E; ++I) {
12498 if (Indices[I] == PoisonMaskElem)
12499 continue;
12500 const unsigned ExtIdx = Indices[I] - MinIdx;
12501 if (CurrentOrder[ExtIdx] != E) {
12502 CurrentOrder.clear();
12503 return false;
12504 }
12505 ShouldKeepOrder &= ExtIdx == I;
12506 CurrentOrder[ExtIdx] = I;
12507 }
12508 if (ShouldKeepOrder)
12509 CurrentOrder.clear();
12510
12511 return ShouldKeepOrder;
12512}
12513
12514bool BoUpSLP::areAllUsersVectorized(
12515 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12516 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12517 all_of(I->users(), [this](User *U) {
12518 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12519 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12520 });
12521}
12522
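// A worked example for the routine below (hypothetical scalars): with
// Scalars = {add, sub, add, sub}, no reorder/reuse indices, and IsAltOp matching
// the subs, the produced Mask is {0, Sz + 1, 2, Sz + 3}, i.e. {0, 5, 2, 7} for
// Sz = 4: main-opcode lanes keep their index, alternate lanes are offset by Sz,
// as expected by the two-source shufflevector emitted later.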
12523void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12524 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12525 SmallVectorImpl<Value *> *OpScalars,
12526 SmallVectorImpl<Value *> *AltScalars) const {
12527 unsigned Sz = Scalars.size();
12528 Mask.assign(Sz, PoisonMaskElem);
12529 SmallVector<int> OrderMask;
12530 if (!ReorderIndices.empty())
12531 inversePermutation(ReorderIndices, OrderMask);
12532 for (unsigned I = 0; I < Sz; ++I) {
12533 unsigned Idx = I;
12534 if (!ReorderIndices.empty())
12535 Idx = OrderMask[I];
12536 if (isa<PoisonValue>(Scalars[Idx]))
12537 continue;
12538 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12539 if (IsAltOp(OpInst)) {
12540 Mask[I] = Sz + Idx;
12541 if (AltScalars)
12542 AltScalars->push_back(OpInst);
12543 } else {
12544 Mask[I] = Idx;
12545 if (OpScalars)
12546 OpScalars->push_back(OpInst);
12547 }
12548 }
12549 if (!ReuseShuffleIndices.empty()) {
12550 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12551 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12552 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12553 });
12554 Mask.swap(NewMask);
12555 }
12556}
12557
12559 Instruction *AltOp,
12560 const TargetLibraryInfo &TLI) {
12561 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12562}
12563
12564 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12565 Instruction *AltOp,
12566 const TargetLibraryInfo &TLI) {
12567 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12568 auto *AltCI = cast<CmpInst>(AltOp);
12569 CmpInst::Predicate MainP = MainCI->getPredicate();
12570 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12571 assert(MainP != AltP && "Expected different main/alternate predicates.");
12572 auto *CI = cast<CmpInst>(I);
12573 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12574 return false;
12575 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12576 return true;
12577 CmpInst::Predicate P = CI->getPredicate();
12578 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12579
12580 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12581 "CmpInst expected to match either main or alternate predicate or "
12582 "their swap.");
12583 return MainP != P && MainP != SwappedP;
12584 }
12585 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12586}
12587
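// A few illustrative inputs for the helper below (hypothetical): operands that
// are all the same non-constant value yield OK_UniformValue; all-equal constants
// yield OK_UniformConstantValue; distinct constants yield
// OK_NonUniformConstantValue; and when every constant is a power of two, the
// properties additionally report OP_PowerOf2.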
12588TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12589 assert(!Ops.empty());
12590 const auto *Op0 = Ops.front();
12591
12592 const bool IsConstant = all_of(Ops, [](Value *V) {
12593 // TODO: We should allow undef elements here
12594 return isConstant(V) && !isa<UndefValue>(V);
12595 });
12596 const bool IsUniform = all_of(Ops, [=](Value *V) {
12597 // TODO: We should allow undef elements here
12598 return V == Op0;
12599 });
12600 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12601 // TODO: We should allow undef elements here
12602 if (auto *CI = dyn_cast<ConstantInt>(V))
12603 return CI->getValue().isPowerOf2();
12604 return false;
12605 });
12606 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12607 // TODO: We should allow undef elements here
12608 if (auto *CI = dyn_cast<ConstantInt>(V))
12609 return CI->getValue().isNegatedPowerOf2();
12610 return false;
12611 });
12612
12613 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12614 if (IsConstant && IsUniform)
12615 VK = TTI::OK_UniformConstantValue;
12616 else if (IsConstant)
12617 VK = TTI::OK_NonUniformConstantValue;
12618 else if (IsUniform)
12619 VK = TTI::OK_UniformValue;
12620
12621 TTI::OperandValueProperties VP = TTI::OP_None;
12622 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12623 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12624
12625 return {VK, VP};
12626}
12627
12628namespace {
12629/// The base class for shuffle instruction emission and shuffle cost estimation.
12630class BaseShuffleAnalysis {
12631protected:
12632 Type *ScalarTy = nullptr;
12633
12634 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12635
12636 /// V is expected to be a vectorized value.
12637 /// When REVEC is disabled, there is no difference between VF and
12638 /// VNumElements.
12639 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12640 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12641 /// of 8.
12642 unsigned getVF(Value *V) const {
12643 assert(V && "V cannot be nullptr");
12644 assert(isa<FixedVectorType>(V->getType()) &&
12645 "V does not have FixedVectorType");
12646 assert(ScalarTy && "ScalarTy cannot be nullptr");
12647 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12648 unsigned VNumElements =
12649 cast<FixedVectorType>(V->getType())->getNumElements();
12650 assert(VNumElements > ScalarTyNumElements &&
12651 "the number of elements of V is not large enough");
12652 assert(VNumElements % ScalarTyNumElements == 0 &&
12653 "the number of elements of V is not a vectorized value");
12654 return VNumElements / ScalarTyNumElements;
12655 }
12656
12657 /// Checks if the mask is an identity mask.
12658 /// \param IsStrict if true, the function returns false if the mask size does
12659 /// not match the vector size.
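/// For instance (illustrative only), for a 4-element vector the mask
/// <0, 1, 2, 3> is an identity in both modes, while the extract-subvector mask
/// <0, 1> only qualifies when \p IsStrict is false.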
12660 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12661 bool IsStrict) {
12662 int Limit = Mask.size();
12663 int VF = VecTy->getNumElements();
12664 int Index = -1;
12665 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12666 return true;
12667 if (!IsStrict) {
12668 // Consider extract subvector starting from index 0.
12669 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12670 Index == 0)
12671 return true;
12672 // All VF-size submasks are identity (e.g.
12673 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12674 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12675 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12676 return all_of(Slice, equal_to(PoisonMaskElem)) ||
12677 ShuffleVectorInst::isIdentityMask(Slice, VF);
12678 }))
12679 return true;
12680 }
12681 return false;
12682 }
12683
12684 /// Tries to combine 2 different masks into single one.
12685 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12686 /// change the size of the vector, \p LocalVF is the original size of the
12687 /// shuffled vector.
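/// For example (illustrative only), with LocalVF = 2, Mask = <1, 0> and
/// ExtMask = <1, 0, 3, 2>, the combined mask becomes <0, 1, 0, 1>: each outer
/// index is first looked up in Mask (modulo its size) and then reduced modulo
/// LocalVF.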
12688 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12689 ArrayRef<int> ExtMask) {
12690 unsigned VF = Mask.size();
12691 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12692 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12693 if (ExtMask[I] == PoisonMaskElem)
12694 continue;
12695 int MaskedIdx = Mask[ExtMask[I] % VF];
12696 NewMask[I] =
12697 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12698 }
12699 Mask.swap(NewMask);
12700 }
12701
12702 /// Looks through shuffles trying to reduce final number of shuffles in the
12703 /// code. The function looks through the previously emitted shuffle
12704 /// instructions and properly marks indices in the mask as undef.
12705 /// For example, given the code
12706 /// \code
12707 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12708 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12709 /// \endcode
12710 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12711 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12712 /// <0, 1, 2, 3> for the shuffle.
12713 /// If 2 operands are of different size, the smallest one will be resized and
12714 /// the mask recalculated properly.
12715 /// For example, given the code
12716 /// \code
12717 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12718 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12719 /// \endcode
12720 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12721 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12722 /// <0, 1, 2, 3> for the shuffle.
12723 /// So, it tries to transform permutations to simple vector merge, if
12724 /// possible.
12725 /// \param V The input vector which must be shuffled using the given \p Mask.
12726 /// If the better candidate is found, \p V is set to this best candidate
12727 /// vector.
12728 /// \param Mask The input mask for the shuffle. If the best candidate is found
12729 /// during looking-through-shuffles attempt, it is updated accordingly.
12730 /// \param SinglePermute true if the shuffle operation is originally a
12731 /// single-value-permutation. In this case the look-through-shuffles procedure
12732 /// may look for resizing shuffles as the best candidates.
12733 /// \return true if the shuffle results in the non-resizing identity shuffle
12734 /// (and thus can be ignored), false - otherwise.
12735 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12736 bool SinglePermute) {
12737 Value *Op = V;
12738 ShuffleVectorInst *IdentityOp = nullptr;
12739 SmallVector<int> IdentityMask;
12740 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12741 // Exit if not a fixed vector type or changing size shuffle.
12742 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12743 if (!SVTy)
12744 break;
12745 // Remember the identity or broadcast mask, if it is not a resizing
12746 // shuffle. If no better candidates are found, this Op and Mask will be
12747 // used in the final shuffle.
12748 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12749 if (!IdentityOp || !SinglePermute ||
12750 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12751 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12752 IdentityMask.size()))) {
12753 IdentityOp = SV;
12754 // Store the current mask in IdentityMask so that later we do not lose
12755 // this info if IdentityOp is selected as the best candidate for the
12756 // permutation.
12757 IdentityMask.assign(Mask);
12758 }
12759 }
12760 // Remember the broadcast mask. If no better candidates are found, this Op
12761 // and Mask will be used in the final shuffle.
12762 // Zero splat can be used as identity too, since it might be used with
12763 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12764 // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
12765 // is expensive, and the analysis finds out that the source vector is just
12766 // a broadcast, the original mask can be transformed to the identity mask
12767 // <0, 1, 2, 3>.
12768 // \code
12769 // %0 = shuffle %v, poison, zeroinitalizer
12770 // %res = shuffle %0, poison, <3, 1, 2, 0>
12771 // \endcode
12772 // may be transformed to
12773 // \code
12774 // %0 = shuffle %v, poison, zeroinitalizer
12775 // %res = shuffle %0, poison, <0, 1, 2, 3>
12776 // \endcode
12777 if (SV->isZeroEltSplat()) {
12778 IdentityOp = SV;
12779 IdentityMask.assign(Mask);
12780 }
12781 int LocalVF = Mask.size();
12782 if (auto *SVOpTy =
12783 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12784 LocalVF = SVOpTy->getNumElements();
12785 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12786 for (auto [Idx, I] : enumerate(Mask)) {
12787 if (I == PoisonMaskElem ||
12788 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12789 continue;
12790 ExtMask[Idx] = SV->getMaskValue(I);
12791 }
12792 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12793 SV->getOperand(0),
12794 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12795 .all();
12796 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12797 SV->getOperand(1),
12798 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12799 .all();
12800 if (!IsOp1Undef && !IsOp2Undef) {
12801 // Update mask and mark undef elems.
12802 for (int &I : Mask) {
12803 if (I == PoisonMaskElem)
12804 continue;
12805 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12806 PoisonMaskElem)
12807 I = PoisonMaskElem;
12808 }
12809 break;
12810 }
12811 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12812 combineMasks(LocalVF, ShuffleMask, Mask);
12813 Mask.swap(ShuffleMask);
12814 if (IsOp2Undef)
12815 Op = SV->getOperand(0);
12816 else
12817 Op = SV->getOperand(1);
12818 }
12819 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12820 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12821 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12822 if (IdentityOp) {
12823 V = IdentityOp;
12824 assert(Mask.size() == IdentityMask.size() &&
12825 "Expected masks of same sizes.");
12826 // Clear known poison elements.
12827 for (auto [I, Idx] : enumerate(Mask))
12828 if (Idx == PoisonMaskElem)
12829 IdentityMask[I] = PoisonMaskElem;
12830 Mask.swap(IdentityMask);
12831 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12832 return SinglePermute &&
12833 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12834 /*IsStrict=*/true) ||
12835 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12836 Shuffle->isZeroEltSplat() &&
12838 all_of(enumerate(Mask), [&](const auto &P) {
12839 return P.value() == PoisonMaskElem ||
12840 Shuffle->getShuffleMask()[P.index()] == 0;
12841 })));
12842 }
12843 V = Op;
12844 return false;
12845 }
12846 V = Op;
12847 return true;
12848 }
12849
12850 /// Smart shuffle instruction emission, walks through shuffles trees and
12851 /// tries to find the best matching vector for the actual shuffle
12852 /// instruction.
12853 template <typename T, typename ShuffleBuilderTy>
12854 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12855 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12856 assert(V1 && "Expected at least one vector value.");
12857 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12858 SmallVector<int> NewMask(Mask);
12859 if (ScalarTyNumElements != 1) {
12860 assert(SLPReVec && "FixedVectorType is not expected.");
12861 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12862 Mask = NewMask;
12863 }
12864 if (V2)
12865 Builder.resizeToMatch(V1, V2);
12866 int VF = Mask.size();
12867 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12868 VF = FTy->getNumElements();
12869 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12870 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12871 .all()) {
12872 // Peek through shuffles.
12873 Value *Op1 = V1;
12874 Value *Op2 = V2;
12875 int VF =
12876 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12877 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12878 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12879 for (int I = 0, E = Mask.size(); I < E; ++I) {
12880 if (Mask[I] < VF)
12881 CombinedMask1[I] = Mask[I];
12882 else
12883 CombinedMask2[I] = Mask[I] - VF;
12884 }
12885 Value *PrevOp1;
12886 Value *PrevOp2;
12887 do {
12888 PrevOp1 = Op1;
12889 PrevOp2 = Op2;
12890 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12891 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12892 // Check if we have 2 resizing shuffles - need to peek through operands
12893 // again.
12894 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12895 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12896 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12897 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12898 if (I == PoisonMaskElem)
12899 continue;
12900 ExtMask1[Idx] = SV1->getMaskValue(I);
12901 }
12902 SmallBitVector UseMask1 = buildUseMask(
12903 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12904 ->getNumElements(),
12905 ExtMask1, UseMask::SecondArg);
12906 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12907 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12908 if (I == PoisonMaskElem)
12909 continue;
12910 ExtMask2[Idx] = SV2->getMaskValue(I);
12911 }
12912 SmallBitVector UseMask2 = buildUseMask(
12913 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12914 ->getNumElements(),
12915 ExtMask2, UseMask::SecondArg);
12916 if (SV1->getOperand(0)->getType() ==
12917 SV2->getOperand(0)->getType() &&
12918 SV1->getOperand(0)->getType() != SV1->getType() &&
12919 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12920 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12921 Op1 = SV1->getOperand(0);
12922 Op2 = SV2->getOperand(0);
12923 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12924 int LocalVF = ShuffleMask1.size();
12925 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12926 LocalVF = FTy->getNumElements();
12927 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12928 CombinedMask1.swap(ShuffleMask1);
12929 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12930 LocalVF = ShuffleMask2.size();
12931 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12932 LocalVF = FTy->getNumElements();
12933 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12934 CombinedMask2.swap(ShuffleMask2);
12935 }
12936 }
12937 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12938 Builder.resizeToMatch(Op1, Op2);
12939 VF = std::max(cast<VectorType>(Op1->getType())
12940 ->getElementCount()
12941 .getKnownMinValue(),
12942 cast<VectorType>(Op2->getType())
12943 ->getElementCount()
12944 .getKnownMinValue());
12945 for (int I = 0, E = Mask.size(); I < E; ++I) {
12946 if (CombinedMask2[I] != PoisonMaskElem) {
12947 assert(CombinedMask1[I] == PoisonMaskElem &&
12948 "Expected undefined mask element");
12949 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12950 }
12951 }
12952 if (Op1 == Op2 &&
12953 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12954 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12955 isa<ShuffleVectorInst>(Op1) &&
12956 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12957 ArrayRef(CombinedMask1))))
12958 return Builder.createIdentity(Op1);
12959 return Builder.createShuffleVector(
12960 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12961 CombinedMask1);
12962 }
12963 if (isa<PoisonValue>(V1))
12964 return Builder.createPoison(
12965 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12966 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12967 assert(V1 && "Expected non-null value after looking through shuffles.");
12968
12969 if (!IsIdentity)
12970 return Builder.createShuffleVector(V1, NewMask);
12971 return Builder.createIdentity(V1);
12972 }
12973
12974 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12975 /// shuffle emission.
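/// E.g. (illustrative only), with CommonMask = <0, 2, 1, 3> and
/// Mask = <poison, 0, poison, 1>, lanes 1 and 3 were defined by the emitted
/// shuffle, so CommonMask becomes <0, 1, 1, 3>.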
12976 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12977 ArrayRef<int> Mask) {
12978 for (unsigned I : seq<unsigned>(CommonMask.size()))
12979 if (Mask[I] != PoisonMaskElem)
12980 CommonMask[I] = I;
12981 }
12982};
12983} // namespace
12984
12985/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
12986static std::pair<InstructionCost, InstructionCost>
12987 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12988 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12989 Type *ScalarTy, VectorType *VecTy) {
12990 InstructionCost ScalarCost = 0;
12991 InstructionCost VecCost = 0;
12992 // Here we differentiate two cases: (1) when Ptrs represent a regular
12993 // vectorization tree node (as they are pointer arguments of scattered
12994 // loads) or (2) when Ptrs are the arguments of loads or stores being
12995 // vectorized as a plain wide unit-stride load/store since all the
12996 // loads/stores are known to be from/to adjacent locations.
12997 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12998 // Case 2: estimate costs for pointer related costs when vectorizing to
12999 // a wide load/store.
13000 // Scalar cost is estimated as a set of pointers with known relationship
13001 // between them.
13002 // For vector code we will use BasePtr as argument for the wide load/store
13003 // but we also need to account all the instructions which are going to
13004 // stay in vectorized code due to uses outside of these scalar
13005 // loads/stores.
13006 ScalarCost = TTI.getPointersChainCost(
13007 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13008 CostKind);
13009
13010 SmallVector<const Value *> PtrsRetainedInVecCode;
13011 for (Value *V : Ptrs) {
13012 if (V == BasePtr) {
13013 PtrsRetainedInVecCode.push_back(V);
13014 continue;
13015 }
13016 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13017 // For simplicity assume Ptr to stay in vectorized code if it's not a
13018 // GEP instruction. We don't care since its cost is considered free.
13019 // TODO: We should check for any uses outside of vectorizable tree
13020 // rather than just single use.
13021 if (!Ptr || !Ptr->hasOneUse())
13022 PtrsRetainedInVecCode.push_back(V);
13023 }
13024
13025 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
13026 // If all pointers stay in vectorized code then we don't have
13027 // any savings on that.
13028 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
13029 }
13030 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
13031 TTI::PointersChainInfo::getKnownStride(),
13032 VecTy, CostKind);
13033 } else {
13034 // Case 1: Ptrs are the arguments of loads that we are going to transform
13035 // into masked gather load intrinsic.
13036 // All the scalar GEPs will be removed as a result of vectorization.
13037 // For any external uses of some lanes extract element instructions will
13038 // be generated (which cost is estimated separately).
13039 TTI::PointersChainInfo PtrsInfo =
13040 all_of(Ptrs,
13041 [](const Value *V) {
13042 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13043 return Ptr && !Ptr->hasAllConstantIndices();
13044 })
13045 ? TTI::PointersChainInfo::getUnknownStride()
13046 : TTI::PointersChainInfo::getKnownStride();
13047
13048 ScalarCost =
13049 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
13050 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
13051 if (!BaseGEP) {
13052 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
13053 if (It != Ptrs.end())
13054 BaseGEP = cast<GEPOperator>(*It);
13055 }
13056 if (BaseGEP) {
13057 SmallVector<const Value *> Indices(BaseGEP->indices());
13058 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
13059 BaseGEP->getPointerOperand(), Indices, VecTy,
13060 CostKind);
13061 }
13062 }
13063
13064 return std::make_pair(ScalarCost, VecCost);
13065}
13066
13067void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
13068 assert(TE.isGather() && TE.ReorderIndices.empty() &&
13069 "Expected gather node without reordering.");
13070 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
13071 SmallSet<size_t, 2> LoadKeyUsed;
13072
13073 // Do not reorder the node if it is small (just 2 elements), all-constant, or
13074 // if all its instructions already have the same opcode.
13075 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
13076 all_of(TE.Scalars, isConstant))
13077 return;
13078
13079 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
13080 return VectorizableTree[Idx]->isSame(TE.Scalars);
13081 }))
13082 return;
13083
13084 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
13085 Key = hash_combine(hash_value(LI->getParent()), Key);
13086 Value *Ptr =
13087 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
13088 if (LoadKeyUsed.contains(Key)) {
13089 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
13090 if (LIt != LoadsMap.end()) {
13091 for (LoadInst *RLI : LIt->second) {
13092 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
13093 LI->getType(), LI->getPointerOperand(), *DL, *SE,
13094 /*StrictCheck=*/true))
13095 return hash_value(RLI->getPointerOperand());
13096 }
13097 for (LoadInst *RLI : LIt->second) {
13098 if (arePointersCompatible(RLI->getPointerOperand(),
13099 LI->getPointerOperand(), *TLI)) {
13100 hash_code SubKey = hash_value(RLI->getPointerOperand());
13101 return SubKey;
13102 }
13103 }
13104 if (LIt->second.size() > 2) {
13105 hash_code SubKey =
13106 hash_value(LIt->second.back()->getPointerOperand());
13107 return SubKey;
13108 }
13109 }
13110 }
13111 LoadKeyUsed.insert(Key);
13112 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
13113 return hash_value(LI->getPointerOperand());
13114 };
13115 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
13116 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
13117 bool IsOrdered = true;
13118 unsigned NumInstructions = 0;
13119 // Try to "cluster" scalar instructions, to be able to build extra vectorized
13120 // nodes.
13121 for (auto [I, V] : enumerate(TE.Scalars)) {
13122 size_t Key = 1, Idx = 1;
13123 if (auto *Inst = dyn_cast<Instruction>(V);
13125 !isDeleted(Inst) && !isVectorized(V)) {
13126 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
13127 /*AllowAlternate=*/false);
13128 ++NumInstructions;
13129 }
13130 auto &Container = SortedValues[Key];
13131 if (IsOrdered && !KeyToIndex.contains(V) &&
13134 ((Container.contains(Idx) &&
13135 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
13136 (!Container.empty() && !Container.contains(Idx) &&
13137 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
13138 IsOrdered = false;
13139 auto &KTI = KeyToIndex[V];
13140 if (KTI.empty())
13141 Container[Idx].push_back(V);
13142 KTI.push_back(I);
13143 }
13144 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
13145 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13146 if (!IsOrdered && NumInstructions > 1) {
13147 unsigned Cnt = 0;
13148 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
13149 for (const auto &D : SortedValues) {
13150 for (const auto &P : D.second) {
13151 unsigned Sz = 0;
13152 for (Value *V : P.second) {
13153 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
13154 for (auto [K, Idx] : enumerate(Indices)) {
13155 TE.ReorderIndices[Cnt + K] = Idx;
13156 TE.Scalars[Cnt + K] = V;
13157 }
13158 Sz += Indices.size();
13159 Cnt += Indices.size();
13160 }
13161 if (Sz > 1 && isa<Instruction>(P.second.front())) {
13162 const unsigned SubVF = getFloorFullVectorNumberOfElements(
13163 *TTI, TE.Scalars.front()->getType(), Sz);
13164 SubVectors.emplace_back(Cnt - Sz, SubVF);
13165 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
13166 DemandedElts.clearBit(I);
13167 } else if (!P.second.empty() && isConstant(P.second.front())) {
13168 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
13169 DemandedElts.clearBit(I);
13170 }
13171 }
13172 }
13173 }
13174 // Reuses always require shuffles, so consider it as profitable.
13175 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
13176 return;
13177 // Do simple cost estimation.
13178 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13179 InstructionCost Cost = 0;
13180 auto *ScalarTy = TE.Scalars.front()->getType();
13181 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
13182 for (auto [Idx, Sz] : SubVectors) {
13183 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
13184 Idx, getWidenedType(ScalarTy, Sz));
13185 }
13186 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13187 /*Insert=*/true,
13188 /*Extract=*/false, CostKind);
13189 int Sz = TE.Scalars.size();
13190 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
13191 TE.ReorderIndices.end());
13192 for (unsigned I : seq<unsigned>(Sz)) {
13193 Value *V = TE.getOrdered(I);
13194 if (isa<PoisonValue>(V)) {
13195 ReorderMask[I] = PoisonMaskElem;
13196 } else if (isConstant(V) || DemandedElts[I]) {
13197 ReorderMask[I] = I + TE.ReorderIndices.size();
13198 }
13199 }
13200 Cost += ::getShuffleCost(*TTI,
13201 any_of(ReorderMask, [&](int I) { return I >= Sz; })
13202 ? TTI::SK_PermuteTwoSrc
13203 : TTI::SK_PermuteSingleSrc,
13204 VecTy, ReorderMask);
13205 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13206 ReorderMask.assign(Sz, PoisonMaskElem);
13207 for (unsigned I : seq<unsigned>(Sz)) {
13208 Value *V = TE.getOrdered(I);
13209 if (isConstant(V)) {
13210 DemandedElts.clearBit(I);
13211 if (!isa<PoisonValue>(V))
13212 ReorderMask[I] = I;
13213 } else {
13214 ReorderMask[I] = I + Sz;
13215 }
13216 }
13217 InstructionCost BVCost =
13218 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13219 /*Insert=*/true, /*Extract=*/false, CostKind);
13220 if (!DemandedElts.isAllOnes())
13221 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
13222 if (Cost >= BVCost) {
13223 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
13224 reorderScalars(TE.Scalars, Mask);
13225 TE.ReorderIndices.clear();
13226 }
13227}
13228
13229 /// Check if we can convert a fadd/fsub sequence to FMA (llvm.fmuladd).
13230 /// \returns Cost of the FMA, if conversion is possible, invalid cost otherwise.
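/// For example (illustrative IR, not taken from this file):
/// \code
///   %m = fmul contract float %a, %b
///   %r = fadd contract float %m, %c
/// \endcode
/// may be turned into a call to llvm.fmuladd(%a, %b, %c) when the fast-math
/// flags of both instructions allow contraction and the fused form is cheaper
/// than the separate fmul + fadd.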
13231 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
13232 const InstructionsState &S,
13233 DominatorTree &DT, const DataLayout &DL,
13234 const TargetTransformInfo &TTI,
13235 const TargetLibraryInfo &TLI) {
13236 assert(all_of(VL,
13237 [](Value *V) {
13238 return V->getType()->getScalarType()->isFloatingPointTy();
13239 }) &&
13240 "Can only convert to FMA for floating point types");
13241 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
13242
13243 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
13244 FastMathFlags FMF;
13245 FMF.set();
13246 for (Value *V : VL) {
13247 auto *I = dyn_cast<Instruction>(V);
13248 if (!I)
13249 continue;
13250 if (S.isCopyableElement(I))
13251 continue;
13252 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
13253 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
13254 continue;
13255 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13256 FMF &= FPCI->getFastMathFlags();
13257 }
13258 return FMF.allowContract();
13259 };
13260 if (!CheckForContractable(VL))
13261 return InstructionCost::getInvalid();
13262 // The fmul also should be contractable.
13263 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
13264 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
13265
13266 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
13267 if (!OpS.valid())
13268 return InstructionCost::getInvalid();
13269
13270 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13271 return InstructionCost::getInvalid();
13272 if (!CheckForContractable(Operands.front()))
13273 return InstructionCost::getInvalid();
13274 // Compare the costs.
13275 InstructionCost FMulPlusFAddCost = 0;
13276 InstructionCost FMACost = 0;
13277 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13278 FastMathFlags FMF;
13279 FMF.set();
13280 for (Value *V : VL) {
13281 auto *I = dyn_cast<Instruction>(V);
13282 if (!I)
13283 continue;
13284 if (!S.isCopyableElement(I))
13285 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13286 FMF &= FPCI->getFastMathFlags();
13287 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13288 }
13289 unsigned NumOps = 0;
13290 for (auto [V, Op] : zip(VL, Operands.front())) {
13291 if (S.isCopyableElement(V))
13292 continue;
13293 auto *I = dyn_cast<Instruction>(Op);
13294 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13295 if (auto *OpI = dyn_cast<Instruction>(V))
13296 FMACost += TTI.getInstructionCost(OpI, CostKind);
13297 if (I)
13298 FMACost += TTI.getInstructionCost(I, CostKind);
13299 continue;
13300 }
13301 ++NumOps;
13302 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13303 FMF &= FPCI->getFastMathFlags();
13304 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13305 }
13306 Type *Ty = VL.front()->getType();
13307 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13308 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13309 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13310}
13311
13312 void BoUpSLP::transformNodes() {
13313 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13314 BaseGraphSize = VectorizableTree.size();
13315 // Turn graph transforming mode on and off, when done.
13316 class GraphTransformModeRAAI {
13317 bool &SavedIsGraphTransformMode;
13318
13319 public:
13320 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13321 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13322 IsGraphTransformMode = true;
13323 }
13324 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13325 } TransformContext(IsGraphTransformMode);
13326 // Operands are profitable if they are:
13327 // 1. At least one constant
13328 // or
13329 // 2. Splats
13330 // or
13331 // 3. Results in good vectorization opportunity, i.e. may generate vector
13332 // nodes and reduce cost of the graph.
13333 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13334 const InstructionsState &S) {
13335 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13336 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13337 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13338 I2->getOperand(Op));
13339 return all_of(
13340 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13341 return all_of(Cand,
13342 [](const std::pair<Value *, Value *> &P) {
13343 return isa<Constant>(P.first) ||
13344 isa<Constant>(P.second) || P.first == P.second;
13345 }) ||
13346 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
13347 });
13348 };
13349
13350 // Try to reorder gather nodes for better vectorization opportunities.
13351 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13352 TreeEntry &E = *VectorizableTree[Idx];
13353 if (E.isGather())
13354 reorderGatherNode(E);
13355 }
13356
13357 // Better to use the full gathered-loads analysis if there are only 2 gathered
13358 // load nodes, each having fewer than 16 elements.
13359 constexpr unsigned VFLimit = 16;
13360 bool ForceLoadGather =
13361 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13362 return TE->isGather() && TE->hasState() &&
13363 TE->getOpcode() == Instruction::Load &&
13364 TE->getVectorFactor() < VFLimit;
13365 }) == 2;
13366
13367 // Checks if the scalars are used in other node.
13368 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13369 function_ref<bool(Value *)> CheckContainer) {
13370 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13371 if (isa<PoisonValue>(V))
13372 return true;
13373 auto *I = dyn_cast<Instruction>(V);
13374 if (!I)
13375 return false;
13376 return is_contained(TE->Scalars, I) || CheckContainer(I);
13377 });
13378 };
13379 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13380 if (E.hasState()) {
13381 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13382 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13383 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13384 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13385 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13386 return is_contained(TEs, TE);
13387 });
13388 });
13389 }))
13390 return true;
13391 ;
13392 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13393 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13394 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13395 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13396 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13397 return is_contained(TEs, TE);
13398 });
13399 });
13400 }))
13401 return true;
13402 } else {
13403 // Check if the gather node is a full copy of a split node.
13404 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13405 if (It != E.Scalars.end()) {
13406 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13407 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13408 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13409 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13410 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13411 return is_contained(TEs, TE);
13412 });
13413 });
13414 }))
13415 return true;
13416 }
13417 }
13418 return false;
13419 };
13420 // The tree may grow here, so iterate over nodes, built before.
13421 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13422 TreeEntry &E = *VectorizableTree[Idx];
13423 if (E.isGather()) {
13424 ArrayRef<Value *> VL = E.Scalars;
13425 const unsigned Sz = getVectorElementSize(VL.front());
13426 unsigned MinVF = getMinVF(2 * Sz);
13427 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13428 // same opcode and same parent block or all constants.
13429 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13430 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13431 // We use allSameOpcode instead of isAltShuffle because we don't
13432 // want to use interchangeable instruction here.
13433 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13434 allConstant(VL) || isSplat(VL))
13435 continue;
13436 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13437 continue;
13438 // Check if the node is a copy of other vector nodes.
13439 if (CheckForSameVectorNodes(E))
13440 continue;
13441 // Try to find vectorizable sequences and transform them into a series of
13442 // insertvector instructions.
13443 unsigned StartIdx = 0;
13444 unsigned End = VL.size();
13445 SmallBitVector Processed(End);
13446 for (unsigned VF = getFloorFullVectorNumberOfElements(
13447 *TTI, VL.front()->getType(), VL.size() - 1);
13448 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13449 *TTI, VL.front()->getType(), VF - 1)) {
13450 if (StartIdx + VF > End)
13451 continue;
13452 SmallVector<std::pair<unsigned, unsigned>> Slices;
13453 bool AllStrided = true;
13454 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13455 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13456 // If any instruction is vectorized already - do not try again.
13457 // Reuse the existing node, if it fully matches the slice.
13458 if ((Processed.test(Cnt) || isVectorized(Slice.front())) &&
13459 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13460 continue;
13461 // Constant already handled effectively - skip.
13462 if (allConstant(Slice))
13463 continue;
13464 // Do not try to vectorize small splats (smaller than a vector register and
13465 // with only a single non-undef element).
13466 bool IsSplat = isSplat(Slice);
13467 bool IsTwoRegisterSplat = true;
13468 if (IsSplat && VF == 2) {
13469 unsigned NumRegs2VF = ::getNumberOfParts(
13470 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13471 IsTwoRegisterSplat = NumRegs2VF == 2;
13472 }
13473 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13474 count(Slice, Slice.front()) ==
13475 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13476 : 1)) {
13477 if (IsSplat)
13478 continue;
13479 InstructionsState S = getSameOpcode(Slice, *TLI);
13480 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13481 (S.getOpcode() == Instruction::Load &&
13482 areKnownNonVectorizableLoads(Slice)) ||
13483 (S.getOpcode() != Instruction::Load &&
13484 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13485 continue;
13486 if (VF == 2) {
13487 // Try to vectorize reduced values or if all users are vectorized.
13488 // For expensive instructions extra extracts might be profitable.
13489 if ((!UserIgnoreList || E.Idx != 0) &&
13490 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13491 TTI::TCC_Expensive &&
13492 !all_of(Slice, [&](Value *V) {
13493 if (isa<PoisonValue>(V))
13494 return true;
13495 return areAllUsersVectorized(cast<Instruction>(V),
13496 UserIgnoreList);
13497 }))
13498 continue;
13499 if (S.getOpcode() == Instruction::Load) {
13500 OrdersType Order;
13501 SmallVector<Value *> PointerOps;
13502 StridedPtrInfo SPtrInfo;
13503 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13504 PointerOps, SPtrInfo);
13505 AllStrided &= Res == LoadsState::StridedVectorize ||
13506 Res == LoadsState::ScatterVectorize ||
13507 Res == LoadsState::Gather;
13508 // Do not vectorize gathers.
13509 if (Res == LoadsState::ScatterVectorize ||
13510 Res == LoadsState::Gather) {
13511 if (Res == LoadsState::Gather) {
13512 registerNonVectorizableLoads(Slice);
13513 // If vectorizing a reduction and the scalars come from the root node,
13514 // remember them as analyzed, non-vectorizable reduction values.
13515 if (UserIgnoreList && E.Idx == 0)
13516 analyzedReductionVals(Slice);
13517 }
13518 continue;
13519 }
13520 } else if (S.getOpcode() == Instruction::ExtractElement ||
13521 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13523 !CheckOperandsProfitability(
13524 S.getMainOp(),
13527 S))) {
13528 // Do not vectorize extractelements (handled effectively
13529 // already). Do not vectorize non-profitable instructions (with
13530 // low cost and non-vectorizable operands).
13531 continue;
13532 }
13533 }
13534 }
13535 Slices.emplace_back(Cnt, Slice.size());
13536 }
13537 // Do not try to vectorize if all slices are strided or gathered with
13538 // vector factor 2 and there are more than 2 slices. It is better to handle
13539 // them in the gathered-loads analysis, which may result in better vectorization.
13540 if (VF == 2 && AllStrided && Slices.size() > 2)
13541 continue;
13542 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13543 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13544 Processed.set(Cnt, Cnt + Sz);
13545 if (StartIdx == Cnt)
13546 StartIdx = Cnt + Sz;
13547 if (End == Cnt + Sz)
13548 End = Cnt;
13549 };
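// Each accepted slice is built as its own subtree via buildTreeRec and then
// recorded in CombinedEntriesWithIndices together with its offset in the
// gather node, so codegen can materialize the gather as a series of vector
// inserts of the vectorized slices.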
13550 for (auto [Cnt, Sz] : Slices) {
13551 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13552 const TreeEntry *SameTE = nullptr;
13553 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13554 It != Slice.end()) {
13555 // If any instruction is vectorized already - do not try again.
13556 SameTE = getSameValuesTreeEntry(*It, Slice);
13557 }
13558 unsigned PrevSize = VectorizableTree.size();
13559 [[maybe_unused]] unsigned PrevEntriesSize =
13560 LoadEntriesToVectorize.size();
13561 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13562 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13563 VectorizableTree[PrevSize]->isGather() &&
13564 VectorizableTree[PrevSize]->hasState() &&
13565 VectorizableTree[PrevSize]->getOpcode() !=
13566 Instruction::ExtractElement &&
13567 !isSplat(Slice)) {
13568 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13569 analyzedReductionVals(Slice);
13570 VectorizableTree.pop_back();
13571 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13572 "LoadEntriesToVectorize expected to remain the same");
13573 continue;
13574 }
13575 AddCombinedNode(PrevSize, Cnt, Sz);
13576 }
13577 }
13578 // Restore ordering, if no extra vectorization happened.
13579 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13580 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13581 reorderScalars(E.Scalars, Mask);
13582 E.ReorderIndices.clear();
13583 }
13584 }
13585 if (!E.hasState())
13586 continue;
13587 switch (E.getOpcode()) {
13588 case Instruction::Load: {
13589 // No need to reorder masked gather loads, just reorder the scalar
13590 // operands.
13591 if (E.State != TreeEntry::Vectorize)
13592 break;
13593 Type *ScalarTy = E.getMainOp()->getType();
13594 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13595 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13596 // Check if it is profitable to represent a consecutive load + reverse as a
13597 // strided load with stride -1.
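// For example, scalars loaded from p[3], p[2], p[1], p[0] (a consecutive
// load whose reorder is a reverse) can be emitted as a single strided load
// (experimental_vp_strided_load) instead of a wide consecutive load followed
// by a reverse shuffle, when TTI reports the strided form as cheaper.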
13598 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13599 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13600 SmallVector<int> Mask;
13601 inversePermutation(E.ReorderIndices, Mask);
13602 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13603 InstructionCost OriginalVecCost =
13604 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13605 BaseLI->getPointerAddressSpace(), CostKind,
13606 TTI::OperandValueInfo()) +
13607 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13608 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13609 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13610 VecTy, BaseLI->getPointerOperand(),
13611 /*VariableMask=*/false, CommonAlignment,
13612 BaseLI),
13613 CostKind);
13614 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13615 // Strided load is more profitable than consecutive load + reverse -
13616 // transform the node to strided load.
13617 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13618 ->getPointerOperand()
13619 ->getType());
13620 StridedPtrInfo SPtrInfo;
13621 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13622 SPtrInfo.Ty = VecTy;
13623 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13624 E.State = TreeEntry::StridedVectorize;
13625 }
13626 }
13627 break;
13628 }
13629 case Instruction::Store: {
13630 Type *ScalarTy =
13631 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13632 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13633 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13634 // Check if it is profitable to represent a consecutive store + reverse as a
13635 // strided store with stride -1.
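// Same reasoning as for loads above: a consecutive store whose reorder is a
// reverse can be emitted as experimental_vp_strided_store instead of a
// reverse shuffle followed by a wide store, when that is cheaper.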
13636 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13637 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13638 SmallVector<int> Mask;
13639 inversePermutation(E.ReorderIndices, Mask);
13640 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13641 InstructionCost OriginalVecCost =
13642 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13643 BaseSI->getPointerAddressSpace(), CostKind,
13644 TTI::OperandValueInfo()) +
13645 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13646 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13647 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13648 VecTy, BaseSI->getPointerOperand(),
13649 /*VariableMask=*/false, CommonAlignment,
13650 BaseSI),
13651 CostKind);
13652 if (StridedCost < OriginalVecCost)
13653 // Strided store is more profitable than reverse + consecutive store -
13654 // transform the node to strided store.
13655 E.State = TreeEntry::StridedVectorize;
13656 } else if (!E.ReorderIndices.empty()) {
13657 // Check for interleaved stores.
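// If the reorder mask of the store group matches an interleave pattern for
// some factor F (2 <= F <= lanes / 2) and the target reports interleaved
// accesses of that factor as legal for this type and alignment, remember F
// so the stores can later be emitted as an interleaved access instead of a
// generic permute plus wide store.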
13658 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13659 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13660 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13661 if (Mask.size() < 4)
13662 return 0u;
13663 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13664 if (ShuffleVectorInst::isInterleaveMask(
13665 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13666 TTI.isLegalInterleavedAccessType(
13667 VecTy, Factor, BaseSI->getAlign(),
13668 BaseSI->getPointerAddressSpace()))
13669 return Factor;
13670 }
13671
13672 return 0u;
13673 };
13674 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13675 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13676 if (InterleaveFactor != 0)
13677 E.setInterleave(InterleaveFactor);
13678 }
13679 break;
13680 }
13681 case Instruction::Select: {
13682 if (E.State != TreeEntry::Vectorize)
13683 break;
13684 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13685 if (MinMaxID == Intrinsic::not_intrinsic)
13686 break;
13687 // This node is a minmax node.
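// E.g. a group of selects of the form select(icmp slt a, b), a, b is costed
// and emitted as the corresponding min/max intrinsic; when the compares are
// only used by these selects (SelectOnly), the condition node is folded into
// the combined node by marking it CombinedVectorize below.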
13688 E.CombinedOp = TreeEntry::MinMax;
13689 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13690 if (SelectOnly && CondEntry->UserTreeIndex &&
13691 CondEntry->State == TreeEntry::Vectorize) {
13692 // The condition node is part of the combined minmax node.
13693 CondEntry->State = TreeEntry::CombinedVectorize;
13694 }
13695 break;
13696 }
13697 case Instruction::FSub:
13698 case Instruction::FAdd: {
13699 // Check if possible to convert (a*b)+c to fma.
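// E.g. fadd (fmul a, b), c is treated as a single fused multiply-add
// (TreeEntry::FMulAdd) when canConvertToFMA returns a valid cost; the fmul
// node then becomes part of the combined node and is not costed separately.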
13700 if (E.State != TreeEntry::Vectorize ||
13701 !E.getOperations().isAddSubLikeOp())
13702 break;
13703 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13704 .isValid())
13705 break;
13706 // This node is a fmuladd node.
13707 E.CombinedOp = TreeEntry::FMulAdd;
13708 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13709 if (FMulEntry->UserTreeIndex &&
13710 FMulEntry->State == TreeEntry::Vectorize) {
13711 // The FMul node is part of the combined fmuladd node.
13712 FMulEntry->State = TreeEntry::CombinedVectorize;
13713 }
13714 break;
13715 }
13716 default:
13717 break;
13718 }
13719 }
13720
13721 if (LoadEntriesToVectorize.empty()) {
13722 // Single load node - exit.
13723 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13724 VectorizableTree.front()->getOpcode() == Instruction::Load)
13725 return;
13726 // Small graph with small VF - exit.
13727 constexpr unsigned SmallTree = 3;
13728 constexpr unsigned SmallVF = 2;
13729 if ((VectorizableTree.size() <= SmallTree &&
13730 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13731 (VectorizableTree.size() <= 2 && UserIgnoreList))
13732 return;
13733
13734 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13735 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13736 getCanonicalGraphSize() <= SmallTree &&
13737 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13738 [](const std::unique_ptr<TreeEntry> &TE) {
13739 return TE->isGather() && TE->hasState() &&
13740 TE->getOpcode() == Instruction::Load &&
13741 !allSameBlock(TE->Scalars);
13742 }) == 1)
13743 return;
13744 }
13745
13746 // A list of loads to be gathered during the vectorization process. We can
13747 // try to vectorize them at the end, if profitable.
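// Loads are grouped by (parent basic block, underlying pointer object, load
// type), so only loads that could plausibly form a single vector access are
// analyzed together.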
13748 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13750 GatheredLoads;
13751
13752 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13753 TreeEntry &E = *TE;
13754 if (E.isGather() &&
13755 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13756 (!E.hasState() && any_of(E.Scalars,
13757 [&](Value *V) {
13758 return isa<LoadInst>(V) &&
13759 !isVectorized(V) &&
13760 !isDeleted(cast<Instruction>(V));
13761 }))) &&
13762 !isSplat(E.Scalars)) {
13763 for (Value *V : E.Scalars) {
13764 auto *LI = dyn_cast<LoadInst>(V);
13765 if (!LI)
13766 continue;
13767 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13768 continue;
13769 gatherPossiblyVectorizableLoads(
13770 *this, V, *DL, *SE, *TTI,
13771 GatheredLoads[std::make_tuple(
13772 LI->getParent(),
13773 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13774 LI->getType())]);
13775 }
13776 }
13777 }
13778 // Try to vectorize gathered loads if this is not just a gather of loads.
13779 if (!GatheredLoads.empty())
13780 tryToVectorizeGatheredLoads(GatheredLoads);
13781}
13782
13783 /// Merges shuffle masks and emits the final shuffle instruction, if required. It
13784 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
13785 /// the actual shuffle instruction is generated only if it is really needed.
13786 /// Otherwise, emission of the shuffle instruction is delayed until the end of
13787 /// the process, to reduce the number of emitted instructions and to enable
13788 /// further analysis/transformations.
13789class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13790 bool IsFinalized = false;
13791 SmallVector<int> CommonMask;
13792 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13793 const TargetTransformInfo &TTI;
13794 InstructionCost Cost = 0;
13795 SmallDenseSet<Value *> VectorizedVals;
13796 BoUpSLP &R;
13797 SmallPtrSetImpl<Value *> &CheckedExtracts;
13798 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13799 /// While set, we are still trying to estimate the cost for the same nodes and
13800 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13801 /// This may help to estimate the cost better if the same nodes must be permuted
13802 /// and allows moving most of the long shuffle cost estimation to TTI.
13803 bool SameNodesEstimated = true;
13804
13805 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13806 if (Ty->getScalarType()->isPointerTy()) {
13807 Constant *Res = ConstantExpr::getIntToPtr(
13808 ConstantInt::getAllOnesValue(
13809 IntegerType::get(Ty->getContext(),
13810 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13811 Ty->getScalarType());
13812 if (auto *VTy = dyn_cast<VectorType>(Ty))
13813 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13814 return Res;
13815 }
13816 return Constant::getAllOnesValue(Ty);
13817 }
13818
13819 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13820 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13821 return TTI::TCC_Free;
13822 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13823 InstructionCost GatherCost = 0;
13824 SmallVector<Value *> Gathers(VL);
13825 if (!Root && isSplat(VL)) {
13826 // Found a broadcast of a single scalar - calculate the cost as a
13827 // broadcast.
13828 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13829 assert(It != VL.end() && "Expected at least one non-undef value.");
13830 // Add broadcast for non-identity shuffle only.
13831 bool NeedShuffle =
13832 count(VL, *It) > 1 &&
13833 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13834 if (!NeedShuffle) {
13835 if (isa<FixedVectorType>(ScalarTy)) {
13836 assert(SLPReVec && "FixedVectorType is not expected.");
13837 return TTI.getShuffleCost(
13838 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13839 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13840 cast<FixedVectorType>(ScalarTy));
13841 }
13842 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13843 CostKind, std::distance(VL.begin(), It),
13844 PoisonValue::get(VecTy), *It);
13845 }
13846
13847 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13848 transform(VL, ShuffleMask.begin(), [](Value *V) {
13849 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13850 });
13851 InstructionCost InsertCost =
13852 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13853 PoisonValue::get(VecTy), *It);
13854 return InsertCost + ::getShuffleCost(TTI,
13855 TTI::SK_Broadcast,
13856 VecTy, ShuffleMask, CostKind,
13857 /*Index=*/0, /*SubTp=*/nullptr,
13858 /*Args=*/*It);
13859 }
13860 return GatherCost +
13861 (all_of(Gathers, IsaPred<UndefValue>)
13862 ? TTI::TCC_Free
13863 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13864 ScalarTy));
13865 };
13866
13867 /// Compute the cost of creating a vector containing the extracted values from
13868 /// \p VL.
13869 InstructionCost
13870 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13871 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13872 unsigned NumParts) {
13873 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13874 unsigned NumElts =
13875 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13876 auto *EE = dyn_cast<ExtractElementInst>(V);
13877 if (!EE)
13878 return Sz;
13879 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13880 if (!VecTy)
13881 return Sz;
13882 return std::max(Sz, VecTy->getNumElements());
13883 });
13884 // FIXME: this must be moved to TTI for better estimation.
13885 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
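// The helper below splits the extract mask into per-register chunks of
// EltsPerVector elements and checks whether each chunk reads from at most
// two source registers; if so, the chunk can be modeled as a cheaper
// single-source or two-source per-register shuffle (plus subvector
// extracts) instead of one wide permute.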
13886 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13887 SmallVectorImpl<unsigned> &Indices,
13888 SmallVectorImpl<unsigned> &SubVecSizes)
13889 -> std::optional<TTI::ShuffleKind> {
13890 if (NumElts <= EltsPerVector)
13891 return std::nullopt;
13892 int OffsetReg0 =
13893 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13894 [](int S, int I) {
13895 if (I == PoisonMaskElem)
13896 return S;
13897 return std::min(S, I);
13898 }),
13899 EltsPerVector);
13900 int OffsetReg1 = OffsetReg0;
13901 DenseSet<int> RegIndices;
13902 // Check whether we are permuting the same single input vector or two input vectors.
13903 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13904 int FirstRegId = -1;
13905 Indices.assign(1, OffsetReg0);
13906 for (auto [Pos, I] : enumerate(Mask)) {
13907 if (I == PoisonMaskElem)
13908 continue;
13909 int Idx = I - OffsetReg0;
13910 int RegId =
13911 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13912 if (FirstRegId < 0)
13913 FirstRegId = RegId;
13914 RegIndices.insert(RegId);
13915 if (RegIndices.size() > 2)
13916 return std::nullopt;
13917 if (RegIndices.size() == 2) {
13918 ShuffleKind = TTI::SK_PermuteTwoSrc;
13919 if (Indices.size() == 1) {
13920 OffsetReg1 = alignDown(
13921 std::accumulate(
13922 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13923 [&](int S, int I) {
13924 if (I == PoisonMaskElem)
13925 return S;
13926 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13927 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13928 if (RegId == FirstRegId)
13929 return S;
13930 return std::min(S, I);
13931 }),
13932 EltsPerVector);
13933 unsigned Index = OffsetReg1 % NumElts;
13934 Indices.push_back(Index);
13935 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13936 }
13937 Idx = I - OffsetReg1;
13938 }
13939 I = (Idx % NumElts) % EltsPerVector +
13940 (RegId == FirstRegId ? 0 : EltsPerVector);
13941 }
13942 return ShuffleKind;
13943 };
13944 InstructionCost Cost = 0;
13945
13946 // Process extracts in blocks of EltsPerVector to check if the source vector
13947 // operand can be re-used directly. If not, add the cost of creating a
13948 // shuffle to extract the values into a vector register.
13949 for (unsigned Part : seq<unsigned>(NumParts)) {
13950 if (!ShuffleKinds[Part])
13951 continue;
13952 ArrayRef<int> MaskSlice = Mask.slice(
13953 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13954 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13955 copy(MaskSlice, SubMask.begin());
13956 SmallVector<unsigned, 2> Indices;
13957 SmallVector<unsigned, 2> SubVecSizes;
13958 std::optional<TTI::ShuffleKind> RegShuffleKind =
13959 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13960 if (!RegShuffleKind) {
13961 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13962 !ShuffleVectorInst::isIdentityMask(
13963 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13964 Cost +=
13965 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13966 getWidenedType(ScalarTy, NumElts), MaskSlice);
13967 continue;
13968 }
13969 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13970 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13971 Cost +=
13972 ::getShuffleCost(TTI, *RegShuffleKind,
13973 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13974 }
13975 const unsigned BaseVF = getFullVectorNumberOfElements(
13976 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13977 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13978 assert((Idx + SubVecSize) <= BaseVF &&
13979 "SK_ExtractSubvector index out of range");
13980 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13981 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13982 Idx, getWidenedType(ScalarTy, SubVecSize));
13983 }
13984 // Second check: see whether a plain permute is estimated as cheaper than
13985 // the subvector extracts.
13986 SubMask.assign(NumElts, PoisonMaskElem);
13987 copy(MaskSlice, SubMask.begin());
13988 InstructionCost OriginalCost = ::getShuffleCost(
13989 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13990 if (OriginalCost < Cost)
13991 Cost = OriginalCost;
13992 }
13993 return Cost;
13994 }
13995 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
13996 /// mask \p Mask, for register number \p Part, which includes \p SliceSize
13997 /// elements.
13998 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13999 ArrayRef<int> Mask, unsigned Part,
14000 unsigned SliceSize) {
14001 if (SameNodesEstimated) {
14002 // Delay the cost estimation if the same nodes are being reshuffled.
14003 // If we already requested the cost of reshuffling of E1 and E2 before, no
14004 // need to estimate another cost with the sub-Mask, instead include this
14005 // sub-Mask into the CommonMask to estimate it later and avoid double cost
14006 // estimation.
14007 if ((InVectors.size() == 2 &&
14008 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
14009 cast<const TreeEntry *>(InVectors.back()) == E2) ||
14010 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
14011 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
14012 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
14013 [](int Idx) { return Idx == PoisonMaskElem; }) &&
14014 "Expected all poisoned elements.");
14015 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
14016 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
14017 return;
14018 }
14019 // Found non-matching nodes - need to estimate the cost for the matched
14020 // and transform mask.
14021 Cost += createShuffle(InVectors.front(),
14022 InVectors.size() == 1 ? nullptr : InVectors.back(),
14023 CommonMask);
14024 transformMaskAfterShuffle(CommonMask, CommonMask);
14025 } else if (InVectors.size() == 2) {
14026 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14027 transformMaskAfterShuffle(CommonMask, CommonMask);
14028 }
14029 SameNodesEstimated = false;
14030 if (!E2 && InVectors.size() == 1) {
14031 unsigned VF = E1.getVectorFactor();
14032 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
14033 VF = std::max(VF, getVF(V1));
14034 } else {
14035 const auto *E = cast<const TreeEntry *>(InVectors.front());
14036 VF = std::max(VF, E->getVectorFactor());
14037 }
14038 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14039 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14040 CommonMask[Idx] = Mask[Idx] + VF;
14041 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
14042 transformMaskAfterShuffle(CommonMask, CommonMask);
14043 } else {
14044 auto P = InVectors.front();
14045 Cost += createShuffle(&E1, E2, Mask);
14046 unsigned VF = Mask.size();
14047 if (Value *V1 = dyn_cast<Value *>(P)) {
14048 VF = std::max(VF,
14049 getNumElements(V1->getType()));
14050 } else {
14051 const auto *E = cast<const TreeEntry *>(P);
14052 VF = std::max(VF, E->getVectorFactor());
14053 }
14054 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14055 if (Mask[Idx] != PoisonMaskElem)
14056 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
14057 Cost += createShuffle(P, InVectors.front(), CommonMask);
14058 transformMaskAfterShuffle(CommonMask, CommonMask);
14059 }
14060 }
14061
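/// Helper that plugs into BaseShuffleAnalysis::createShuffle to compute the
/// TTI cost of the shuffles instead of emitting IR: empty and identity masks
/// are treated as free, everything else is priced as a single- or two-source
/// permute.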
14062 class ShuffleCostBuilder {
14063 const TargetTransformInfo &TTI;
14064
14065 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14066 int Index = -1;
14067 return Mask.empty() ||
14068 (VF == Mask.size() &&
14069 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
14070 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
14071 Index == 0);
14072 }
14073
14074 public:
14075 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14076 ~ShuffleCostBuilder() = default;
14077 InstructionCost createShuffleVector(Value *V1, Value *,
14078 ArrayRef<int> Mask) const {
14079 // Empty mask or identity mask are free.
14080 unsigned VF =
14081 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14082 if (isEmptyOrIdentity(Mask, VF))
14083 return TTI::TCC_Free;
14084 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14085 cast<VectorType>(V1->getType()), Mask);
14086 }
14087 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14088 // Empty mask or identity mask are free.
14089 unsigned VF =
14090 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14091 if (isEmptyOrIdentity(Mask, VF))
14092 return TTI::TCC_Free;
14093 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
14094 cast<VectorType>(V1->getType()), Mask);
14095 }
14096 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14097 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14098 return TTI::TCC_Free;
14099 }
14100 void resizeToMatch(Value *&, Value *&) const {}
14101 };
14102
14103 /// Smart shuffle instruction emission, walks through shuffles trees and
14104 /// tries to find the best matching vector for the actual shuffle
14105 /// instruction.
14106 InstructionCost
14107 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14108 const PointerUnion<Value *, const TreeEntry *> &P2,
14109 ArrayRef<int> Mask) {
14110 ShuffleCostBuilder Builder(TTI);
14111 SmallVector<int> CommonMask(Mask);
14112 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14113 unsigned CommonVF = Mask.size();
14114 InstructionCost ExtraCost = 0;
14115 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14116 unsigned VF) -> InstructionCost {
14117 if (E.isGather() && allConstant(E.Scalars))
14118 return TTI::TCC_Free;
14119 Type *EScalarTy = E.Scalars.front()->getType();
14120 bool IsSigned = true;
14121 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
14122 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
14123 IsSigned = It->second.second;
14124 }
14125 if (EScalarTy != ScalarTy) {
14126 unsigned CastOpcode = Instruction::Trunc;
14127 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14128 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14129 if (DstSz > SrcSz)
14130 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14131 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
14132 getWidenedType(EScalarTy, VF),
14133 TTI::CastContextHint::None, CostKind);
14134 }
14135 return TTI::TCC_Free;
14136 };
14137 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14138 if (isa<Constant>(V))
14139 return TTI::TCC_Free;
14140 auto *VecTy = cast<VectorType>(V->getType());
14141 Type *EScalarTy = VecTy->getElementType();
14142 if (EScalarTy != ScalarTy) {
14143 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
14144 unsigned CastOpcode = Instruction::Trunc;
14145 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14146 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14147 if (DstSz > SrcSz)
14148 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14149 return TTI.getCastInstrCost(
14150 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
14151 VecTy, TTI::CastContextHint::None, CostKind);
14152 }
14153 return TTI::TCC_Free;
14154 };
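// The two helpers above account for the implicit trunc/sext/zext that is
// needed when a node was demoted to a narrower integer type (MinBWs) than
// the element type used for this shuffle; that cast cost is added as
// ExtraCost on top of the shuffle cost computed below.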
14155 if (!V1 && !V2 && !P2.isNull()) {
14156 // Shuffle 2 entry nodes.
14157 const TreeEntry *E = cast<const TreeEntry *>(P1);
14158 unsigned VF = E->getVectorFactor();
14159 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14160 CommonVF = std::max(VF, E2->getVectorFactor());
14161 assert(all_of(Mask,
14162 [=](int Idx) {
14163 return Idx < 2 * static_cast<int>(CommonVF);
14164 }) &&
14165 "All elements in mask must be less than 2 * CommonVF.");
14166 if (E->Scalars.size() == E2->Scalars.size()) {
14167 SmallVector<int> EMask = E->getCommonMask();
14168 SmallVector<int> E2Mask = E2->getCommonMask();
14169 if (!EMask.empty() || !E2Mask.empty()) {
14170 for (int &Idx : CommonMask) {
14171 if (Idx == PoisonMaskElem)
14172 continue;
14173 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14174 Idx = EMask[Idx];
14175 else if (Idx >= static_cast<int>(CommonVF))
14176 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14177 E->Scalars.size();
14178 }
14179 }
14180 CommonVF = E->Scalars.size();
14181 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14182 GetNodeMinBWAffectedCost(*E2, CommonVF);
14183 } else {
14184 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14185 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14186 }
14187 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14188 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14189 } else if (!V1 && P2.isNull()) {
14190 // Shuffle single entry node.
14191 const TreeEntry *E = cast<const TreeEntry *>(P1);
14192 unsigned VF = E->getVectorFactor();
14193 CommonVF = VF;
14194 assert(
14195 all_of(Mask,
14196 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14197 "All elements in mask must be less than CommonVF.");
14198 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14199 SmallVector<int> EMask = E->getCommonMask();
14200 assert(!EMask.empty() && "Expected non-empty common mask.");
14201 for (int &Idx : CommonMask) {
14202 if (Idx != PoisonMaskElem)
14203 Idx = EMask[Idx];
14204 }
14205 CommonVF = E->Scalars.size();
14206 } else if (unsigned Factor = E->getInterleaveFactor();
14207 Factor > 0 && E->Scalars.size() != Mask.size() &&
14208 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
14209 Factor)) {
14210 // Deinterleaved nodes are free.
14211 std::iota(CommonMask.begin(), CommonMask.end(), 0);
14212 }
14213 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14214 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14215 // Not identity/broadcast? Try to see if the original vector is better.
14216 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14217 CommonVF == CommonMask.size() &&
14218 any_of(enumerate(CommonMask),
14219 [](const auto &&P) {
14220 return P.value() != PoisonMaskElem &&
14221 static_cast<unsigned>(P.value()) != P.index();
14222 }) &&
14223 any_of(CommonMask,
14224 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14225 SmallVector<int> ReorderMask;
14226 inversePermutation(E->ReorderIndices, ReorderMask);
14227 ::addMask(CommonMask, ReorderMask);
14228 }
14229 } else if (V1 && P2.isNull()) {
14230 // Shuffle single vector.
14231 ExtraCost += GetValueMinBWAffectedCost(V1);
14232 CommonVF = getVF(V1);
14233 assert(
14234 all_of(Mask,
14235 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14236 "All elements in mask must be less than CommonVF.");
14237 } else if (V1 && !V2) {
14238 // Shuffle vector and tree node.
14239 unsigned VF = getVF(V1);
14240 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14241 CommonVF = std::max(VF, E2->getVectorFactor());
14242 assert(all_of(Mask,
14243 [=](int Idx) {
14244 return Idx < 2 * static_cast<int>(CommonVF);
14245 }) &&
14246 "All elements in mask must be less than 2 * CommonVF.");
14247 if (E2->Scalars.size() == VF && VF != CommonVF) {
14248 SmallVector<int> E2Mask = E2->getCommonMask();
14249 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14250 for (int &Idx : CommonMask) {
14251 if (Idx == PoisonMaskElem)
14252 continue;
14253 if (Idx >= static_cast<int>(CommonVF))
14254 Idx = E2Mask[Idx - CommonVF] + VF;
14255 }
14256 CommonVF = VF;
14257 }
14258 ExtraCost += GetValueMinBWAffectedCost(V1);
14259 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14260 ExtraCost += GetNodeMinBWAffectedCost(
14261 *E2, std::min(CommonVF, E2->getVectorFactor()));
14262 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14263 } else if (!V1 && V2) {
14264 // Shuffle vector and tree node.
14265 unsigned VF = getVF(V2);
14266 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
14267 CommonVF = std::max(VF, E1->getVectorFactor());
14268 assert(all_of(Mask,
14269 [=](int Idx) {
14270 return Idx < 2 * static_cast<int>(CommonVF);
14271 }) &&
14272 "All elements in mask must be less than 2 * CommonVF.");
14273 if (E1->Scalars.size() == VF && VF != CommonVF) {
14274 SmallVector<int> E1Mask = E1->getCommonMask();
14275 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14276 for (int &Idx : CommonMask) {
14277 if (Idx == PoisonMaskElem)
14278 continue;
14279 if (Idx >= static_cast<int>(CommonVF))
14280 Idx = E1Mask[Idx - CommonVF] + VF;
14281 else
14282 Idx = E1Mask[Idx];
14283 }
14284 CommonVF = VF;
14285 }
14286 ExtraCost += GetNodeMinBWAffectedCost(
14287 *E1, std::min(CommonVF, E1->getVectorFactor()));
14288 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14289 ExtraCost += GetValueMinBWAffectedCost(V2);
14290 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14291 } else {
14292 assert(V1 && V2 && "Expected both vectors.");
14293 unsigned VF = getVF(V1);
14294 CommonVF = std::max(VF, getVF(V2));
14295 assert(all_of(Mask,
14296 [=](int Idx) {
14297 return Idx < 2 * static_cast<int>(CommonVF);
14298 }) &&
14299 "All elements in mask must be less than 2 * CommonVF.");
14300 ExtraCost +=
14301 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14302 if (V1->getType() != V2->getType()) {
14303 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14304 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14305 } else {
14306 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14307 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14308 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14309 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14310 }
14311 }
14312 InVectors.front() =
14313 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14314 if (InVectors.size() == 2)
14315 InVectors.pop_back();
14316 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14317 V1, V2, CommonMask, Builder, ScalarTy);
14318 }
14319
14320 public:
14321 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
14322 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14323 SmallPtrSetImpl<Value *> &CheckedExtracts)
14324 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14325 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14326 CheckedExtracts(CheckedExtracts) {}
14327 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14328 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14329 unsigned NumParts, bool &UseVecBaseAsInput) {
14330 UseVecBaseAsInput = false;
14331 if (Mask.empty())
14332 return nullptr;
14333 Value *VecBase = nullptr;
14334 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14335 if (!E->ReorderIndices.empty()) {
14336 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14337 E->ReorderIndices.end());
14338 reorderScalars(VL, ReorderMask);
14339 }
14340 // Check if the extracts can be considered reused, i.e. the same
14341 // extractelements were vectorized in an earlier node already.
14342 bool PrevNodeFound = any_of(
14343 ArrayRef(R.VectorizableTree).take_front(E->Idx),
14344 [&](const std::unique_ptr<TreeEntry> &TE) {
14345 return ((TE->hasState() && !TE->isAltShuffle() &&
14346 TE->getOpcode() == Instruction::ExtractElement) ||
14347 TE->isGather()) &&
14348 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
14349 return VL.size() > Data.index() &&
14350 (Mask[Data.index()] == PoisonMaskElem ||
14351 isa<UndefValue>(VL[Data.index()]) ||
14352 Data.value() == VL[Data.index()]);
14353 });
14354 });
14355 SmallPtrSet<Value *, 4> UniqueBases;
14356 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14357 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14358 for (unsigned Part : seq<unsigned>(NumParts)) {
14359 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14360 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14361 for (auto [I, V] :
14362 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
14363 // Ignore non-extractelement scalars.
14364 if (isa<UndefValue>(V) ||
14365 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14366 continue;
14367 // If all users of instruction are going to be vectorized and this
14368 // instruction itself is not going to be vectorized, consider this
14369 // instruction as dead and remove its cost from the final cost of the
14370 // vectorized tree.
14371 // Also, avoid adjusting the cost for extractelements with multiple uses
14372 // in different graph entries.
14373 auto *EE = cast<ExtractElementInst>(V);
14374 VecBase = EE->getVectorOperand();
14375 UniqueBases.insert(VecBase);
14376 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14377 if (!CheckedExtracts.insert(V).second ||
14378 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
14379 any_of(VEs,
14380 [&](const TreeEntry *TE) {
14381 return R.DeletedNodes.contains(TE) ||
14382 R.TransformedToGatherNodes.contains(TE);
14383 }) ||
14384 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
14385 !R.isVectorized(EE) &&
14386 count_if(E->Scalars, [&](Value *V) { return V == EE; }) !=
14387 count_if(E->UserTreeIndex.UserTE->Scalars,
14388 [&](Value *V) { return V == EE; })) ||
14389 any_of(EE->users(),
14390 [&](User *U) {
14391 return isa<GetElementPtrInst>(U) &&
14392 !R.areAllUsersVectorized(cast<Instruction>(U),
14393 &VectorizedVals);
14394 }) ||
14395 (!VEs.empty() && !is_contained(VEs, E)))
14396 continue;
14397 std::optional<unsigned> EEIdx = getExtractIndex(EE);
14398 if (!EEIdx)
14399 continue;
14400 unsigned Idx = *EEIdx;
14401 // Take credit for instruction that will become dead.
14402 if (EE->hasOneUse() || !PrevNodeFound) {
14403 Instruction *Ext = EE->user_back();
14404 if (isa<SExtInst, ZExtInst>(Ext) &&
14405 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14406 // Use getExtractWithExtendCost() to calculate the cost of
14407 // extractelement/ext pair.
14408 Cost -= TTI.getExtractWithExtendCost(
14409 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
14410 Idx, CostKind);
14411 // Add back the cost of s|zext which is subtracted separately.
14412 Cost += TTI.getCastInstrCost(
14413 Ext->getOpcode(), Ext->getType(), EE->getType(),
14414 TTI::CastContextHint::None, CostKind);
14415 continue;
14416 }
14417 }
14418 APInt &DemandedElts =
14419 VectorOpsToExtracts
14420 .try_emplace(VecBase,
14421 APInt::getZero(getNumElements(VecBase->getType())))
14422 .first->getSecond();
14423 DemandedElts.setBit(Idx);
14424 }
14425 }
14426 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14428 DemandedElts, /*Insert=*/false,
14429 /*Extract=*/true, CostKind);
14430 // Check that the gather of extractelements can be represented as just a
14431 // shuffle of the single/two vectors the scalars are extracted from, i.e.
14432 // we found a bunch of extractelement instructions that must be gathered
14433 // into a vector and can be represented as a permutation of elements of
14434 // one or two input vectors.
14435 // Skipped if the same extractelements were vectorized already (reused).
14436 if (!PrevNodeFound)
14437 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14438 InVectors.assign(1, E);
14439 CommonMask.assign(Mask.begin(), Mask.end());
14440 transformMaskAfterShuffle(CommonMask, CommonMask);
14441 SameNodesEstimated = false;
14442 if (NumParts != 1 && UniqueBases.size() != 1) {
14443 UseVecBaseAsInput = true;
14444 VecBase =
14445 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14446 }
14447 return VecBase;
14448 }
14449 /// Checks if the specified entry \p E needs to be delayed because of its
14450 /// dependency nodes.
14451 std::optional<InstructionCost>
14452 needToDelay(const TreeEntry *,
14454 // No need to delay the cost estimation during analysis.
14455 return std::nullopt;
14456 }
14457 /// Reset the builder to handle perfect diamond match.
14459 IsFinalized = false;
14460 CommonMask.clear();
14461 InVectors.clear();
14462 Cost = 0;
14463 VectorizedVals.clear();
14464 SameNodesEstimated = true;
14465 }
14466 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14467 if (&E1 == &E2) {
14468 assert(all_of(Mask,
14469 [&](int Idx) {
14470 return Idx < static_cast<int>(E1.getVectorFactor());
14471 }) &&
14472 "Expected single vector shuffle mask.");
14473 add(E1, Mask);
14474 return;
14475 }
14476 if (InVectors.empty()) {
14477 CommonMask.assign(Mask.begin(), Mask.end());
14478 InVectors.assign({&E1, &E2});
14479 return;
14480 }
14481 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14482 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14483 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14484 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14485 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14486 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14487 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14488 }
14489 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14490 if (InVectors.empty()) {
14491 CommonMask.assign(Mask.begin(), Mask.end());
14492 InVectors.assign(1, &E1);
14493 return;
14494 }
14495 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14496 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14497 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14498 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14499 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14500 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14501 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14502 if (!SameNodesEstimated && InVectors.size() == 1)
14503 InVectors.emplace_back(&E1);
14504 }
14505 /// Adds 2 input vectors and the mask for their shuffling.
14506 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14507 // May come only for shuffling of 2 vectors with extractelements, already
14508 // handled in adjustExtracts.
14509 assert(InVectors.size() == 1 &&
14510 all_of(enumerate(CommonMask),
14511 [&](auto P) {
14512 if (P.value() == PoisonMaskElem)
14513 return Mask[P.index()] == PoisonMaskElem;
14514 auto *EI = cast<ExtractElementInst>(
14515 cast<const TreeEntry *>(InVectors.front())
14516 ->getOrdered(P.index()));
14517 return EI->getVectorOperand() == V1 ||
14518 EI->getVectorOperand() == V2;
14519 }) &&
14520 "Expected extractelement vectors.");
14521 }
14522 /// Adds another one input vector and the mask for the shuffling.
14523 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14524 if (InVectors.empty()) {
14525 assert(CommonMask.empty() && !ForExtracts &&
14526 "Expected empty input mask/vectors.");
14527 CommonMask.assign(Mask.begin(), Mask.end());
14528 InVectors.assign(1, V1);
14529 return;
14530 }
14531 if (ForExtracts) {
14532 // No need to add vectors here, already handled them in adjustExtracts.
14533 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14534 !CommonMask.empty() &&
14535 all_of(enumerate(CommonMask),
14536 [&](auto P) {
14537 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14538 ->getOrdered(P.index());
14539 if (P.value() == PoisonMaskElem)
14540 return P.value() == Mask[P.index()] ||
14541 isa<UndefValue>(Scalar);
14542 if (isa<Constant>(V1))
14543 return true;
14544 auto *EI = cast<ExtractElementInst>(Scalar);
14545 return EI->getVectorOperand() == V1;
14546 }) &&
14547 "Expected only tree entry for extractelement vectors.");
14548 return;
14549 }
14550 assert(!InVectors.empty() && !CommonMask.empty() &&
14551 "Expected only tree entries from extracts/reused buildvectors.");
14552 unsigned VF = getVF(V1);
14553 if (InVectors.size() == 2) {
14554 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14555 transformMaskAfterShuffle(CommonMask, CommonMask);
14556 VF = std::max<unsigned>(VF, CommonMask.size());
14557 } else if (const auto *InTE =
14558 InVectors.front().dyn_cast<const TreeEntry *>()) {
14559 VF = std::max(VF, InTE->getVectorFactor());
14560 } else {
14561 VF = std::max(
14562 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14563 ->getNumElements());
14564 }
14565 InVectors.push_back(V1);
14566 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14567 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14568 CommonMask[Idx] = Mask[Idx] + VF;
14569 }
14570 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14571 Value *Root = nullptr) {
14572 Cost += getBuildVectorCost(VL, Root);
14573 if (!Root) {
14574 // FIXME: Need to find a way to avoid use of getNullValue here.
14575 SmallVector<Constant *> Vals;
14576 unsigned VF = VL.size();
14577 if (MaskVF != 0)
14578 VF = std::min(VF, MaskVF);
14579 Type *VLScalarTy = VL.front()->getType();
14580 for (Value *V : VL.take_front(VF)) {
14581 Type *ScalarTy = VLScalarTy->getScalarType();
14582 if (isa<PoisonValue>(V)) {
14583 Vals.push_back(PoisonValue::get(ScalarTy));
14584 continue;
14585 }
14586 if (isa<UndefValue>(V)) {
14587 Vals.push_back(UndefValue::get(ScalarTy));
14588 continue;
14589 }
14590 Vals.push_back(Constant::getNullValue(ScalarTy));
14591 }
14592 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14593 assert(SLPReVec && "FixedVectorType is not expected.");
14594 // When REVEC is enabled, we need to expand vector types into scalar
14595 // types.
14596 Vals = replicateMask(Vals, VecTy->getNumElements());
14597 }
14598 return ConstantVector::get(Vals);
14599 }
14602 cast<FixedVectorType>(Root->getType())->getNumElements()),
14603 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14604 }
14606 /// Finalize emission of the shuffles.
14608 ArrayRef<int> ExtMask,
14609 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14610 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14613 Action = {}) {
14614 IsFinalized = true;
14615 if (Action) {
14616 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14617 if (InVectors.size() == 2)
14618 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14619 else
14620 Cost += createShuffle(Vec, nullptr, CommonMask);
14621 transformMaskAfterShuffle(CommonMask, CommonMask);
14622 assert(VF > 0 &&
14623 "Expected vector length for the final value before action.");
14624 Value *V = cast<Value *>(Vec);
14625 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14626 Cost += createShuffle(V1, V2, Mask);
14627 return V1;
14628 });
14629 InVectors.front() = V;
14630 }
14631 if (!SubVectors.empty()) {
14632 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14633 if (InVectors.size() == 2)
14634 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14635 else
14636 Cost += createShuffle(Vec, nullptr, CommonMask);
14637 transformMaskAfterShuffle(CommonMask, CommonMask);
14638 // Add subvectors permutation cost.
14639 if (!SubVectorsMask.empty()) {
14640 assert(SubVectorsMask.size() <= CommonMask.size() &&
14641 "Expected same size of masks for subvectors and common mask.");
14642 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14643 copy(SubVectorsMask, SVMask.begin());
14644 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14645 if (I2 != PoisonMaskElem) {
14646 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14647 I1 = I2 + CommonMask.size();
14648 }
14649 }
14650 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14651 getWidenedType(ScalarTy, CommonMask.size()),
14652 SVMask, CostKind);
14653 }
14654 for (auto [E, Idx] : SubVectors) {
14655 Type *EScalarTy = E->Scalars.front()->getType();
14656 bool IsSigned = true;
14657 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14658 EScalarTy =
14659 IntegerType::get(EScalarTy->getContext(), It->second.first);
14660 IsSigned = It->second.second;
14661 }
14662 if (ScalarTy != EScalarTy) {
14663 unsigned CastOpcode = Instruction::Trunc;
14664 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14665 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14666 if (DstSz > SrcSz)
14667 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14668 Cost += TTI.getCastInstrCost(
14669 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14670 getWidenedType(EScalarTy, E->getVectorFactor()),
14671 TTI::CastContextHint::None, CostKind);
14672 }
14673 Cost += ::getShuffleCost(
14674 TTI, TTI::SK_InsertSubvector,
14675 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14676 getWidenedType(ScalarTy, E->getVectorFactor()));
14677 if (!CommonMask.empty()) {
14678 std::iota(std::next(CommonMask.begin(), Idx),
14679 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14680 Idx);
14681 }
14682 }
14683 }
14684
14685 if (!ExtMask.empty()) {
14686 if (CommonMask.empty()) {
14687 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14688 } else {
14689 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14690 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14691 if (ExtMask[I] == PoisonMaskElem)
14692 continue;
14693 NewMask[I] = CommonMask[ExtMask[I]];
14694 }
14695 CommonMask.swap(NewMask);
14696 }
14697 }
14698 if (CommonMask.empty()) {
14699 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14700 return Cost;
14701 }
14702 return Cost +
14703 createShuffle(InVectors.front(),
14704 InVectors.size() == 2 ? InVectors.back() : nullptr,
14705 CommonMask);
14706 }
14707
14708 ~ShuffleCostEstimator() {
14709 assert((IsFinalized || CommonMask.empty()) &&
14710 "Shuffle construction must be finalized.");
14711 }
14712};
14713
14714const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14715 unsigned Idx) const {
14716 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14717 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14718 return Op;
14719}
14720
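/// The cast context hint describes how the memory access feeding a cast is
/// lowered (normal, masked, gather/scatter or reversed); targets can use it
/// to price the cast differently depending on that context.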
14721TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14722 if (TE.State == TreeEntry::ScatterVectorize ||
14723 TE.State == TreeEntry::StridedVectorize)
14724 return TTI::CastContextHint::GatherScatter;
14725 if (TE.State == TreeEntry::CompressVectorize)
14726 return TTI::CastContextHint::Masked;
14727 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14728 !TE.isAltShuffle()) {
14729 if (TE.ReorderIndices.empty())
14730 return TTI::CastContextHint::Normal;
14731 SmallVector<int> Mask;
14732 inversePermutation(TE.ReorderIndices, Mask);
14733 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14734 return TTI::CastContextHint::Reversed;
14735 }
14736 return TTI::CastContextHint::None;
14737}
14738
14739 InstructionCost
14740BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14741 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14742 ArrayRef<Value *> VL = E->Scalars;
14743
14744 Type *ScalarTy = getValueType(VL[0]);
14745 if (!isValidElementType(ScalarTy))
14746 return InstructionCost::getInvalid();
14748
14749 // If we have computed a smaller type for the expression, update VecTy so
14750 // that the costs will be accurate.
14751 auto It = MinBWs.find(E);
14752 Type *OrigScalarTy = ScalarTy;
14753 if (It != MinBWs.end()) {
14754 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14755 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14756 if (VecTy)
14757 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14758 }
14759 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14760 unsigned EntryVF = E->getVectorFactor();
14761 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14762
14763 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
14764 if (allConstant(VL))
14765 return 0;
14766 if (isa<InsertElementInst>(VL[0]))
14767 return InstructionCost::getInvalid();
14768 if (isa<CmpInst>(VL.front()))
14769 ScalarTy = VL.front()->getType();
14770 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14771 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14772 }
14773 if (E->State == TreeEntry::SplitVectorize) {
14774 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14775 "Expected exactly 2 combined entries.");
14776 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14777 InstructionCost VectorCost = 0;
14778 if (E->ReorderIndices.empty()) {
14779 VectorCost = ::getShuffleCost(
14780 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14781 E->CombinedEntriesWithIndices.back().second,
14782 getWidenedType(
14783 ScalarTy,
14784 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14785 ->getVectorFactor()));
14786 } else {
14787 unsigned CommonVF =
14788 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14789 ->getVectorFactor(),
14790 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14791 ->getVectorFactor());
14792 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14793 getWidenedType(ScalarTy, CommonVF),
14794 E->getSplitMask(), CostKind);
14795 }
14796 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14797 return VectorCost;
14798 }
14799 InstructionCost CommonCost = 0;
14800 SmallVector<int> Mask;
14801 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14802 (E->State != TreeEntry::StridedVectorize ||
14803 !isReverseOrder(E->ReorderIndices))) {
14804 SmallVector<int> NewMask;
14805 if (E->getOpcode() == Instruction::Store) {
14806 // For stores the order is actually a mask.
14807 NewMask.resize(E->ReorderIndices.size());
14808 copy(E->ReorderIndices, NewMask.begin());
14809 } else {
14810 inversePermutation(E->ReorderIndices, NewMask);
14811 }
14812 ::addMask(Mask, NewMask);
14813 }
14814 if (!E->ReuseShuffleIndices.empty())
14815 ::addMask(Mask, E->ReuseShuffleIndices);
14816 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14817 CommonCost =
14818 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14819 assert((E->State == TreeEntry::Vectorize ||
14820 E->State == TreeEntry::ScatterVectorize ||
14821 E->State == TreeEntry::StridedVectorize ||
14822 E->State == TreeEntry::CompressVectorize) &&
14823 "Unhandled state");
14824 assert(E->getOpcode() &&
14825 ((allSameType(VL) && allSameBlock(VL)) ||
14826 (E->getOpcode() == Instruction::GetElementPtr &&
14827 E->getMainOp()->getType()->isPointerTy()) ||
14828 E->hasCopyableElements()) &&
14829 "Invalid VL");
14830 Instruction *VL0 = E->getMainOp();
14831 unsigned ShuffleOrOp =
14832 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14833 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14834 ShuffleOrOp = E->CombinedOp;
14835 SmallSetVector<Value *, 16> UniqueValues;
14836 SmallVector<unsigned, 16> UniqueIndexes;
14837 for (auto [Idx, V] : enumerate(VL))
14838 if (UniqueValues.insert(V))
14839 UniqueIndexes.push_back(Idx);
14840 const unsigned Sz = UniqueValues.size();
14841 SmallBitVector UsedScalars(Sz, false);
14842 for (unsigned I = 0; I < Sz; ++I) {
14843 if (isa<Instruction>(UniqueValues[I]) &&
14844 !E->isCopyableElement(UniqueValues[I]) &&
14845 getTreeEntries(UniqueValues[I]).front() == E)
14846 continue;
14847 UsedScalars.set(I);
14848 }
14849 auto GetCastContextHint = [&](Value *V) {
14850 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14851 return getCastContextHint(*OpTEs.front());
14852 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14853 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14854 !SrcState.isAltShuffle())
14855 return TTI::CastContextHint::GatherScatter;
14856 return TTI::CastContextHint::None;
14857 };
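// GetCostDiff returns (vector cost - scalar cost) for this node, where the
// scalar cost only counts instructions that primarily belong to this entry
// (copyable and shared scalars are excluded); a negative result means the
// vectorized form is expected to be cheaper.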
14858 auto GetCostDiff =
14859 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14860 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14861 // Calculate the cost of this instruction.
14862 InstructionCost ScalarCost = 0;
14863 if (isa<CastInst, CallInst>(VL0)) {
14864 // For some of the instructions there is no need to calculate the cost of
14865 // each particular instruction; we can use the cost of a single
14866 // instruction multiplied by the total number of scalar instructions.
14867 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14868 } else {
14869 for (unsigned I = 0; I < Sz; ++I) {
14870 if (UsedScalars.test(I))
14871 continue;
14872 ScalarCost += ScalarEltCost(I);
14873 }
14874 }
14875
14876 InstructionCost VecCost = VectorCost(CommonCost);
14877 // Check if the current node must be resized, if the parent node is not
14878 // resized.
14879 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14880 E->Idx != 0 &&
14881 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14882 const EdgeInfo &EI = E->UserTreeIndex;
14883 if (!EI.UserTE->hasState() ||
14884 EI.UserTE->getOpcode() != Instruction::Select ||
14885 EI.EdgeIdx != 0) {
14886 auto UserBWIt = MinBWs.find(EI.UserTE);
14887 Type *UserScalarTy =
14888 (EI.UserTE->isGather() ||
14889 EI.UserTE->State == TreeEntry::SplitVectorize)
14890 ? EI.UserTE->Scalars.front()->getType()
14891 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14892 if (UserBWIt != MinBWs.end())
14893 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14894 UserBWIt->second.first);
14895 if (ScalarTy != UserScalarTy) {
14896 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14897 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14898 unsigned VecOpcode;
14899 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14900 if (BWSz > SrcBWSz)
14901 VecOpcode = Instruction::Trunc;
14902 else
14903 VecOpcode =
14904 It->second.second ? Instruction::SExt : Instruction::ZExt;
14905 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14906 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14907 CostKind);
14908 }
14909 }
14910 }
14911 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14912 ScalarCost, "Calculated costs for Tree"));
14913 return VecCost - ScalarCost;
14914 };
14915 // Calculate cost difference from vectorizing set of GEPs.
14916 // Negative value means vectorizing is profitable.
14917 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14918 assert((E->State == TreeEntry::Vectorize ||
14919 E->State == TreeEntry::StridedVectorize ||
14920 E->State == TreeEntry::CompressVectorize) &&
14921 "Entry state expected to be Vectorize, StridedVectorize or "
14922 "MaskedLoadCompressVectorize here.");
14923 InstructionCost ScalarCost = 0;
14924 InstructionCost VecCost = 0;
14925 std::tie(ScalarCost, VecCost) = getGEPCosts(
14926 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14927 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14928 "Calculated GEPs cost for Tree"));
14929
14930 return VecCost - ScalarCost;
14931 };
14932
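  // Cost of expressing a cmp + select pair as a single min/max intrinsic, or
  // an invalid cost if the pattern cannot be converted.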
14933 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14934 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14935 if (MinMaxID == Intrinsic::not_intrinsic)
14936 return InstructionCost::getInvalid();
14937 Type *CanonicalType = Ty;
14938 if (CanonicalType->isPtrOrPtrVectorTy())
14939 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14940 CanonicalType->getContext(),
14941 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14942
14943 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14944 {CanonicalType, CanonicalType});
14945 InstructionCost IntrinsicCost =
14946 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14947 // If the selects are the only uses of the compares, they will be
14948 // dead and we can adjust the cost by removing their cost.
14949 if (VI && SelectOnly) {
14950 assert((!Ty->isVectorTy() || SLPReVec) &&
14951 "Expected only for scalar type.");
14952 auto *CI = cast<CmpInst>(VI->getOperand(0));
14953 IntrinsicCost -= TTI->getCmpSelInstrCost(
14954 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14955 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14956 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14957 }
14958 return IntrinsicCost;
14959 };
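  // Cost of folding a scalar fmul + fadd/fsub pair into a single fmuladd, or
  // an invalid cost if the fold is not possible.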
14960 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14961 Instruction *VI) {
14962 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14963 return Cost;
14964 };
14965 switch (ShuffleOrOp) {
14966 case Instruction::PHI: {
14967 // Count reused scalars.
14968 InstructionCost ScalarCost = 0;
14969 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14970 for (Value *V : UniqueValues) {
14971 auto *PHI = dyn_cast<PHINode>(V);
14972 if (!PHI)
14973 continue;
14974
14975 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14976 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14977 Value *Op = PHI->getIncomingValue(I);
14978 Operands[I] = Op;
14979 }
14980 if (const TreeEntry *OpTE =
14981 getSameValuesTreeEntry(Operands.front(), Operands))
14982 if (CountedOps.insert(OpTE).second &&
14983 !OpTE->ReuseShuffleIndices.empty())
14984 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14985 OpTE->Scalars.size());
14986 }
14987
14988 return CommonCost - ScalarCost;
14989 }
14990 case Instruction::ExtractValue:
14991 case Instruction::ExtractElement: {
14992 APInt DemandedElts;
14993 VectorType *SrcVecTy = nullptr;
14994 auto GetScalarCost = [&](unsigned Idx) {
14995 if (isa<PoisonValue>(UniqueValues[Idx]))
14996 return InstructionCost(TTI::TCC_Free);
14997
14998 auto *I = cast<Instruction>(UniqueValues[Idx]);
14999 if (!SrcVecTy) {
15000 if (ShuffleOrOp == Instruction::ExtractElement) {
15001 auto *EE = cast<ExtractElementInst>(I);
15002 SrcVecTy = EE->getVectorOperandType();
15003 } else {
15004 auto *EV = cast<ExtractValueInst>(I);
15005 Type *AggregateTy = EV->getAggregateOperand()->getType();
15006 unsigned NumElts;
15007 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
15008 NumElts = ATy->getNumElements();
15009 else
15010 NumElts = AggregateTy->getStructNumElements();
15011 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
15012 }
15013 }
15014 if (I->hasOneUse()) {
15015 Instruction *Ext = I->user_back();
15016 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
15017 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
15018 // Use getExtractWithExtendCost() to calculate the cost of
15019 // extractelement/ext pair.
15020 InstructionCost Cost = TTI->getExtractWithExtendCost(
15021 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
15022 CostKind);
15023 // Subtract the cost of s|zext which is subtracted separately.
15024 Cost -= TTI->getCastInstrCost(
15025 Ext->getOpcode(), Ext->getType(), I->getType(),
15026 TTI::CastContextHint::None, CostKind);
15027 return Cost;
15028 }
15029 }
15030 if (DemandedElts.isZero())
15031 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
15032 DemandedElts.setBit(*getExtractIndex(I));
15033 return InstructionCost(TTI::TCC_Free);
15034 };
15035 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15036 return CommonCost - (DemandedElts.isZero()
15037 ? TTI::TCC_Free
15038 : TTI.getScalarizationOverhead(
15039 SrcVecTy, DemandedElts, /*Insert=*/false,
15040 /*Extract=*/true, CostKind));
15041 };
15042 return GetCostDiff(GetScalarCost, GetVectorCost);
15043 }
15044 case Instruction::InsertElement: {
15045 assert(E->ReuseShuffleIndices.empty() &&
15046 "Unique insertelements only are expected.");
15047 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
15048 unsigned const NumElts = SrcVecTy->getNumElements();
15049 unsigned const NumScalars = VL.size();
15050
15051 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
15052
15053 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15054 unsigned OffsetBeg = *getElementIndex(VL.front());
15055 unsigned OffsetEnd = OffsetBeg;
15056 InsertMask[OffsetBeg] = 0;
15057 for (auto [I, V] : enumerate(VL.drop_front())) {
15058 unsigned Idx = *getElementIndex(V);
15059 if (OffsetBeg > Idx)
15060 OffsetBeg = Idx;
15061 else if (OffsetEnd < Idx)
15062 OffsetEnd = Idx;
15063 InsertMask[Idx] = I + 1;
15064 }
15065 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
15066 if (NumOfParts > 0 && NumOfParts < NumElts)
15067 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
15068 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15069 VecScalarsSz;
15070 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15071 unsigned InsertVecSz = std::min<unsigned>(
15072 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
15073 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15074 bool IsWholeSubvector =
15075 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
15076 // Check if we can safely insert a subvector. If it is not possible, just
15077 // generate a whole-sized vector and shuffle the source vector and the new
15078 // subvector.
15079 if (OffsetBeg + InsertVecSz > VecSz) {
15080 // Align OffsetBeg to generate correct mask.
15081 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
15082 InsertVecSz = VecSz;
15083 }
15084
15085 APInt DemandedElts = APInt::getZero(NumElts);
15086 // TODO: Add support for Instruction::InsertValue.
15087 SmallVector<int> Mask;
15088 if (!E->ReorderIndices.empty()) {
15089 inversePermutation(E->ReorderIndices, Mask);
15090 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
15091 } else {
15092 Mask.assign(VecSz, PoisonMaskElem);
15093 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
15094 }
15095 bool IsIdentity = true;
15096 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
15097 Mask.swap(PrevMask);
15098 for (unsigned I = 0; I < NumScalars; ++I) {
15099 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
15100 DemandedElts.setBit(InsertIdx);
15101 IsIdentity &= InsertIdx - OffsetBeg == I;
15102 Mask[InsertIdx - OffsetBeg] = I;
15103 }
15104 assert(Offset < NumElts && "Failed to find vector index offset");
15105
15106 InstructionCost Cost = 0;
15107 Cost -=
15108 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
15109 /*Insert*/ true, /*Extract*/ false, CostKind);
15110
15111 // First cost - resize to actual vector size if not identity shuffle or
15112 // need to shift the vector.
15113 // Do not calculate the cost if the actual size is the register size and
15114 // we can merge this shuffle with the following SK_Select.
15115 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
15116 if (!IsIdentity)
15117 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15118 InsertVecTy, Mask);
15119 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15120 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15121 }));
15122 // Second cost - permutation with subvector, if some elements are from the
15123 // initial vector or inserting a subvector.
15124 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
15125 // subvector of ActualVecTy.
15126 SmallBitVector InMask =
15127 isUndefVector(FirstInsert->getOperand(0),
15128 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
15129 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
15130 if (InsertVecSz != VecSz) {
15131 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
15132 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
15133 CostKind, OffsetBeg - Offset, InsertVecTy);
15134 } else {
15135 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
15136 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
15137 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
15138 I <= End; ++I)
15139 if (Mask[I] != PoisonMaskElem)
15140 Mask[I] = I + VecSz;
15141 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
15142 Mask[I] =
15143 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
15144 Cost +=
15145 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
15146 }
15147 }
15148 return Cost;
15149 }
15150 case Instruction::ZExt:
15151 case Instruction::SExt:
15152 case Instruction::FPToUI:
15153 case Instruction::FPToSI:
15154 case Instruction::FPExt:
15155 case Instruction::PtrToInt:
15156 case Instruction::IntToPtr:
15157 case Instruction::SIToFP:
15158 case Instruction::UIToFP:
15159 case Instruction::Trunc:
15160 case Instruction::FPTrunc:
15161 case Instruction::BitCast: {
15162 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15163 Type *SrcScalarTy = VL0->getOperand(0)->getType();
15164 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
15165 unsigned Opcode = ShuffleOrOp;
15166 unsigned VecOpcode = Opcode;
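    // With minimum-bitwidth demotion the effective source/destination widths
    // may differ from the IR types, so the vector cast opcode may change:
    // equal widths become a bitcast, narrowing becomes a trunc, and widening
    // becomes a sext/zext depending on the recorded signedness.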
15167 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15168 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15169 // Check if the values are candidates to demote.
15170 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
15171 if (SrcIt != MinBWs.end()) {
15172 SrcBWSz = SrcIt->second.first;
15173 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
15174 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
15175 SrcVecTy =
15176 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
15177 }
15178 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15179 if (BWSz == SrcBWSz) {
15180 VecOpcode = Instruction::BitCast;
15181 } else if (BWSz < SrcBWSz) {
15182 VecOpcode = Instruction::Trunc;
15183 } else if (It != MinBWs.end()) {
15184 assert(BWSz > SrcBWSz && "Invalid cast!");
15185 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15186 } else if (SrcIt != MinBWs.end()) {
15187 assert(BWSz > SrcBWSz && "Invalid cast!");
15188 VecOpcode =
15189 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15190 }
15191 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15192 !SrcIt->second.second) {
15193 VecOpcode = Instruction::UIToFP;
15194 }
15195 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
15196 assert(Idx == 0 && "Expected 0 index only");
15197 return TTI->getCastInstrCost(Opcode, VL0->getType(),
15198 VL0->getOperand(0)->getType(),
15199 TTI::getCastContextHint(VL0), CostKind, VL0);
15200 };
15201 auto GetVectorCost = [=](InstructionCost CommonCost) {
15202 // Do not count cost here if minimum bitwidth is in effect and it is just
15203 // a bitcast (here it is just a noop).
15204 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15205 return CommonCost;
15206 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
15207 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
15208
15209 bool IsArithmeticExtendedReduction =
15210 E->Idx == 0 && UserIgnoreList &&
15211 all_of(*UserIgnoreList, [](Value *V) {
15212 auto *I = cast<Instruction>(V);
15213 return is_contained({Instruction::Add, Instruction::FAdd,
15214 Instruction::Mul, Instruction::FMul,
15215 Instruction::And, Instruction::Or,
15216 Instruction::Xor},
15217 I->getOpcode());
15218 });
15219 if (IsArithmeticExtendedReduction &&
15220 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15221 return CommonCost;
15222 return CommonCost +
15223 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
15224 VecOpcode == Opcode ? VI : nullptr);
15225 };
15226 return GetCostDiff(GetScalarCost, GetVectorCost);
15227 }
15228 case Instruction::FCmp:
15229 case Instruction::ICmp:
15230 case Instruction::Select: {
15231 CmpPredicate VecPred, SwappedVecPred;
15232 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
15233 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
15234 match(VL0, MatchCmp))
15235 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
15236 else
15237 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
15238 ? CmpInst::BAD_FCMP_PREDICATE
15239 : CmpInst::BAD_ICMP_PREDICATE;
15240 auto GetScalarCost = [&](unsigned Idx) {
15241 if (isa<PoisonValue>(UniqueValues[Idx]))
15242 return InstructionCost(TTI::TCC_Free);
15243
15244 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15245 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
15246 ? CmpInst::BAD_FCMP_PREDICATE
15247 : CmpInst::BAD_ICMP_PREDICATE;
15248 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
15249 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
15250 !match(VI, MatchCmp)) ||
15251 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
15252 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
15253 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
15254 ? CmpInst::BAD_FCMP_PREDICATE
15255 : CmpInst::BAD_ICMP_PREDICATE;
15256
15257 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
15258 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
15259 CostKind, getOperandInfo(VI->getOperand(0)),
15260 getOperandInfo(VI->getOperand(1)), VI);
15261 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
15262 if (IntrinsicCost.isValid())
15263 ScalarCost = IntrinsicCost;
15264
15265 return ScalarCost;
15266 };
15267 auto GetVectorCost = [&](InstructionCost CommonCost) {
15268 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15269
15270 InstructionCost VecCost =
15271 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
15272 CostKind, getOperandInfo(E->getOperand(0)),
15273 getOperandInfo(E->getOperand(1)), VL0);
15274 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
15275 auto *CondType =
15276 getWidenedType(SI->getCondition()->getType(), VL.size());
15277 unsigned CondNumElements = CondType->getNumElements();
15278 unsigned VecTyNumElements = getNumElements(VecTy);
15279 assert(VecTyNumElements >= CondNumElements &&
15280 VecTyNumElements % CondNumElements == 0 &&
15281 "Cannot vectorize Instruction::Select");
15282 if (CondNumElements != VecTyNumElements) {
15283 // When the return type is i1 but the source is fixed vector type, we
15284 // need to duplicate the condition value.
15285 VecCost += ::getShuffleCost(
15286 *TTI, TTI::SK_PermuteSingleSrc, CondType,
15287 createReplicatedMask(VecTyNumElements / CondNumElements,
15288 CondNumElements));
15289 }
15290 }
15291 return VecCost + CommonCost;
15292 };
15293 return GetCostDiff(GetScalarCost, GetVectorCost);
15294 }
15295 case TreeEntry::MinMax: {
15296 auto GetScalarCost = [&](unsigned Idx) {
15297 return GetMinMaxCost(OrigScalarTy);
15298 };
15299 auto GetVectorCost = [&](InstructionCost CommonCost) {
15300 InstructionCost VecCost = GetMinMaxCost(VecTy);
15301 return VecCost + CommonCost;
15302 };
15303 return GetCostDiff(GetScalarCost, GetVectorCost);
15304 }
15305 case TreeEntry::FMulAdd: {
15306 auto GetScalarCost = [&](unsigned Idx) {
15307 if (isa<PoisonValue>(UniqueValues[Idx]))
15308 return InstructionCost(TTI::TCC_Free);
15309 return GetFMulAddCost(E->getOperations(),
15310 cast<Instruction>(UniqueValues[Idx]));
15311 };
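    // The vector cost is modeled as a single llvm.fmuladd intrinsic over the
    // widened type, using the intersection of the fast-math flags of the
    // fused fmul and fadd/fsub instructions.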
15312 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15313 FastMathFlags FMF;
15314 FMF.set();
15315 for (Value *V : E->Scalars) {
15316 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
15317 FMF &= FPCI->getFastMathFlags();
15318 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
15319 FMF &= FPCIOp->getFastMathFlags();
15320 }
15321 }
15322 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15323 {VecTy, VecTy, VecTy}, FMF);
15324 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15325 return VecCost + CommonCost;
15326 };
15327 return GetCostDiff(GetScalarCost, GetVectorCost);
15328 }
15329 case Instruction::FNeg:
15330 case Instruction::Add:
15331 case Instruction::FAdd:
15332 case Instruction::Sub:
15333 case Instruction::FSub:
15334 case Instruction::Mul:
15335 case Instruction::FMul:
15336 case Instruction::UDiv:
15337 case Instruction::SDiv:
15338 case Instruction::FDiv:
15339 case Instruction::URem:
15340 case Instruction::SRem:
15341 case Instruction::FRem:
15342 case Instruction::Shl:
15343 case Instruction::LShr:
15344 case Instruction::AShr:
15345 case Instruction::And:
15346 case Instruction::Or:
15347 case Instruction::Xor: {
15348 auto GetScalarCost = [&](unsigned Idx) {
15349 if (isa<PoisonValue>(UniqueValues[Idx]))
15350 return InstructionCost(TTI::TCC_Free);
15351
15352 // We cannot retrieve the operand from UniqueValues[Idx] because an
15353 // interchangeable instruction may be used. The order and the actual
15354 // operand might differ from what is retrieved from UniqueValues[Idx].
15355 unsigned Lane = UniqueIndexes[Idx];
15356 Value *Op1 = E->getOperand(0)[Lane];
15357 Value *Op2;
15358 SmallVector<const Value *, 2> Operands(1, Op1);
15359 if (isa<UnaryOperator>(UniqueValues[Idx])) {
15360 Op2 = Op1;
15361 } else {
15362 Op2 = E->getOperand(1)[Lane];
15363 Operands.push_back(Op2);
15364 }
15365 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
15366 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
15367 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15368 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
15369 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
15370 I && (ShuffleOrOp == Instruction::FAdd ||
15371 ShuffleOrOp == Instruction::FSub)) {
15372 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15373 if (IntrinsicCost.isValid())
15374 ScalarCost = IntrinsicCost;
15375 }
15376 return ScalarCost;
15377 };
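    // Special case: an 'and' of a value demoted to a narrower bitwidth with
    // constants that have at least that many trailing ones is a no-op after
    // demotion, so only the common cost remains.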
15378 auto GetVectorCost = [=](InstructionCost CommonCost) {
15379 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15380 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15381 ArrayRef<Value *> Ops = E->getOperand(I);
15382 if (all_of(Ops, [&](Value *Op) {
15383 auto *CI = dyn_cast<ConstantInt>(Op);
15384 return CI && CI->getValue().countr_one() >= It->second.first;
15385 }))
15386 return CommonCost;
15387 }
15388 }
15389 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
15390 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
15391 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
15392 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
15393 Op2Info, {}, nullptr, TLI) +
15394 CommonCost;
15395 };
15396 return GetCostDiff(GetScalarCost, GetVectorCost);
15397 }
15398 case Instruction::GetElementPtr: {
15399 return CommonCost + GetGEPCostDiff(VL, VL0);
15400 }
15401 case Instruction::Load: {
15402 auto GetScalarCost = [&](unsigned Idx) {
15403 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
15404 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15405 VI->getAlign(), VI->getPointerAddressSpace(),
15406 CostKind, TTI::OperandValueInfo(), VI);
15407 };
15408 auto *LI0 = cast<LoadInst>(VL0);
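    // The vector cost depends on how the node is vectorized: a plain
    // (possibly interleaved) wide load, a strided VP load, a masked/compressed
    // load plus a compress shuffle, or a masked gather.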
15409 auto GetVectorCost = [&](InstructionCost CommonCost) {
15410 InstructionCost VecLdCost;
15411 switch (E->State) {
15412 case TreeEntry::Vectorize:
15413 if (unsigned Factor = E->getInterleaveFactor()) {
15414 VecLdCost = TTI->getInterleavedMemoryOpCost(
15415 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15416 LI0->getPointerAddressSpace(), CostKind);
15417
15418 } else {
15419 VecLdCost = TTI->getMemoryOpCost(
15420 Instruction::Load, VecTy, LI0->getAlign(),
15421 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15422 }
15423 break;
15424 case TreeEntry::StridedVectorize: {
15425 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
15426 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15427 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15428 Align CommonAlignment =
15429 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15430 VecLdCost = TTI->getMemIntrinsicInstrCost(
15431 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15432 StridedLoadTy, LI0->getPointerOperand(),
15433 /*VariableMask=*/false, CommonAlignment),
15434 CostKind);
15435 if (StridedLoadTy != VecTy)
15436 VecLdCost +=
15437 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15438 getCastContextHint(*E), CostKind);
15439
15440 break;
15441 }
15442 case TreeEntry::CompressVectorize: {
15443 bool IsMasked;
15444 unsigned InterleaveFactor;
15445 SmallVector<int> CompressMask;
15446 VectorType *LoadVecTy;
15447 SmallVector<Value *> Scalars(VL);
15448 if (!E->ReorderIndices.empty()) {
15449 SmallVector<int> Mask(E->ReorderIndices.begin(),
15450 E->ReorderIndices.end());
15451 reorderScalars(Scalars, Mask);
15452 }
15453 SmallVector<Value *> PointerOps(Scalars.size());
15454 for (auto [I, V] : enumerate(Scalars))
15455 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15456 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15457 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15458 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15459 CompressMask, LoadVecTy);
15460 assert(IsVectorized && "Failed to vectorize load");
15461 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15462 InterleaveFactor, IsMasked);
15463 Align CommonAlignment = LI0->getAlign();
15464 if (InterleaveFactor) {
15465 VecLdCost = TTI->getInterleavedMemoryOpCost(
15466 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15467 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15468 } else if (IsMasked) {
15469 VecLdCost = TTI->getMemIntrinsicInstrCost(
15470 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15471 CommonAlignment,
15472 LI0->getPointerAddressSpace()),
15473 CostKind);
15474 // TODO: include this cost into CommonCost.
15475 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15476 LoadVecTy, CompressMask, CostKind);
15477 } else {
15478 VecLdCost = TTI->getMemoryOpCost(
15479 Instruction::Load, LoadVecTy, CommonAlignment,
15480 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15481 // TODO: include this cost into CommonCost.
15482 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15483 LoadVecTy, CompressMask, CostKind);
15484 }
15485 break;
15486 }
15487 case TreeEntry::ScatterVectorize: {
15488 Align CommonAlignment =
15489 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15490 VecLdCost = TTI->getMemIntrinsicInstrCost(
15491 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15492 LI0->getPointerOperand(),
15493 /*VariableMask=*/false, CommonAlignment),
15494 CostKind);
15495 break;
15496 }
15497 case TreeEntry::CombinedVectorize:
15498 case TreeEntry::SplitVectorize:
15499 case TreeEntry::NeedToGather:
15500 llvm_unreachable("Unexpected vectorization state.");
15501 }
15502 return VecLdCost + CommonCost;
15503 };
15504
15505 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15506 // If this node generates a masked gather load, then it is not a terminal
15507 // node, hence the address operand cost is estimated separately.
15508 if (E->State == TreeEntry::ScatterVectorize)
15509 return Cost;
15510
15511 // Estimate cost of GEPs since this tree node is a terminator.
15512 SmallVector<Value *> PointerOps(VL.size());
15513 for (auto [I, V] : enumerate(VL))
15514 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15515 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15516 }
15517 case Instruction::Store: {
15518 bool IsReorder = !E->ReorderIndices.empty();
15519 auto GetScalarCost = [=](unsigned Idx) {
15520 auto *VI = cast<StoreInst>(VL[Idx]);
15521 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15522 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15523 VI->getAlign(), VI->getPointerAddressSpace(),
15524 CostKind, OpInfo, VI);
15525 };
15526 auto *BaseSI =
15527 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
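    // Stores are vectorized either as a strided VP store or as a consecutive
    // (possibly interleaved) wide store.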
15528 auto GetVectorCost = [=](InstructionCost CommonCost) {
15529 // We know that we can merge the stores. Calculate the cost.
15530 InstructionCost VecStCost;
15531 if (E->State == TreeEntry::StridedVectorize) {
15532 Align CommonAlignment =
15533 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15534 VecStCost = TTI->getMemIntrinsicInstrCost(
15535 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15536 VecTy, BaseSI->getPointerOperand(),
15537 /*VariableMask=*/false, CommonAlignment),
15538 CostKind);
15539 } else {
15540 assert(E->State == TreeEntry::Vectorize &&
15541 "Expected either strided or consecutive stores.");
15542 if (unsigned Factor = E->getInterleaveFactor()) {
15543 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15544 "No reused shuffles expected");
15545 CommonCost = 0;
15546 VecStCost = TTI->getInterleavedMemoryOpCost(
15547 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15548 BaseSI->getPointerAddressSpace(), CostKind);
15549 } else {
15550 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15551 VecStCost = TTI->getMemoryOpCost(
15552 Instruction::Store, VecTy, BaseSI->getAlign(),
15553 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15554 }
15555 }
15556 return VecStCost + CommonCost;
15557 };
15558 SmallVector<Value *> PointerOps(VL.size());
15559 for (auto [I, V] : enumerate(VL)) {
15560 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15561 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15562 }
15563
15564 return GetCostDiff(GetScalarCost, GetVectorCost) +
15565 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15566 }
15567 case Instruction::Call: {
15568 auto GetScalarCost = [&](unsigned Idx) {
15569 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15570 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15571 if (ID != Intrinsic::not_intrinsic) {
15572 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15573 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15574 }
15575 return TTI->getCallInstrCost(CI->getCalledFunction(),
15576 CI->getFunctionType()->getReturnType(),
15577 CI->getFunctionType()->params(), CostKind);
15578 };
15579 auto GetVectorCost = [=](InstructionCost CommonCost) {
15580 auto *CI = cast<CallInst>(VL0);
15581 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15582 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15583 CI, ID, VecTy->getNumElements(),
15584 It != MinBWs.end() ? It->second.first : 0, TTI);
15585 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15586 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15587 };
15588 return GetCostDiff(GetScalarCost, GetVectorCost);
15589 }
15590 case Instruction::ShuffleVector: {
15591 if (!SLPReVec || E->isAltShuffle())
15592 assert(E->isAltShuffle() &&
15593 ((Instruction::isBinaryOp(E->getOpcode()) &&
15594 Instruction::isBinaryOp(E->getAltOpcode())) ||
15595 (Instruction::isCast(E->getOpcode()) &&
15596 Instruction::isCast(E->getAltOpcode())) ||
15597 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15598 "Invalid Shuffle Vector Operand");
15599 // Try to find the previous shuffle node with the same operands and same
15600 // main/alternate ops.
15601 auto TryFindNodeWithEqualOperands = [=]() {
15602 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15603 if (TE.get() == E)
15604 break;
15605 if (TE->hasState() && TE->isAltShuffle() &&
15606 ((TE->getOpcode() == E->getOpcode() &&
15607 TE->getAltOpcode() == E->getAltOpcode()) ||
15608 (TE->getOpcode() == E->getAltOpcode() &&
15609 TE->getAltOpcode() == E->getOpcode())) &&
15610 TE->hasEqualOperands(*E))
15611 return true;
15612 }
15613 return false;
15614 };
15615 auto GetScalarCost = [&](unsigned Idx) {
15616 if (isa<PoisonValue>(UniqueValues[Idx]))
15617 return InstructionCost(TTI::TCC_Free);
15618
15619 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15620 assert(E->getMatchingMainOpOrAltOp(VI) &&
15621 "Unexpected main/alternate opcode");
15622 (void)E;
15623 return TTI->getInstructionCost(VI, CostKind);
15624 };
15625 // Need to clear CommonCost since the final shuffle cost is included into
15626 // vector cost.
15627 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15628 // VecCost is the sum of the cost of creating the 2 vectors and the cost
15629 // of creating the shuffle.
15630 InstructionCost VecCost = 0;
15631 if (TryFindNodeWithEqualOperands()) {
15632 LLVM_DEBUG({
15633 dbgs() << "SLP: diamond match for alternate node found.\n";
15634 E->dump();
15635 });
15636 // No need to add new vector costs here since we're going to reuse
15637 // same main/alternate vector ops, just do different shuffling.
15638 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15639 VecCost =
15640 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15641 VecCost +=
15642 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15643 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15644 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15645 VecCost = TTIRef.getCmpSelInstrCost(
15646 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15647 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15648 VL0);
15649 VecCost += TTIRef.getCmpSelInstrCost(
15650 E->getOpcode(), VecTy, MaskTy,
15651 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15652 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15653 E->getAltOp());
15654 } else {
15655 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15656 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15657 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15658 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15659 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15660 unsigned SrcBWSz =
15661 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15662 if (SrcIt != MinBWs.end()) {
15663 SrcBWSz = SrcIt->second.first;
15664 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15665 SrcTy = getWidenedType(SrcSclTy, VL.size());
15666 }
15667 if (BWSz <= SrcBWSz) {
15668 if (BWSz < SrcBWSz)
15669 VecCost =
15670 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15671 TTI::CastContextHint::None, CostKind);
15672 LLVM_DEBUG({
15673 dbgs()
15674 << "SLP: alternate extension, which should be truncated.\n";
15675 E->dump();
15676 });
15677 return VecCost;
15678 }
15679 }
15680 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15681 TTI::CastContextHint::None, CostKind);
15682 VecCost +=
15683 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15684 TTI::CastContextHint::None, CostKind);
15685 }
15686 SmallVector<int> Mask;
15687 E->buildAltOpShuffleMask(
15688 [&](Instruction *I) {
15689 assert(E->getMatchingMainOpOrAltOp(I) &&
15690 "Unexpected main/alternate opcode");
15691 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15692 *TLI);
15693 },
15694 Mask);
15695 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15696 FinalVecTy, Mask, CostKind);
15697 // Patterns like [fadd,fsub] can be combined into a single instruction
15698 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15699 // need to take into account their order when looking for the most used
15700 // order.
15701 unsigned Opcode0 = E->getOpcode();
15702 unsigned Opcode1 = E->getAltOpcode();
15703 SmallBitVector OpcodeMask(
15704 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15705 // If this pattern is supported by the target then we consider the
15706 // order.
15707 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15708 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15709 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15710 return AltVecCost < VecCost ? AltVecCost : VecCost;
15711 }
15712 // TODO: Check the reverse order too.
15713 return VecCost;
15714 };
15715 if (SLPReVec && !E->isAltShuffle())
15716 return GetCostDiff(
15717 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15718 // If the group uses the mask in order, the shufflevector can be
15719 // eliminated by instcombine; the cost is then 0.
15720 assert(isa<ShuffleVectorInst>(VL.front()) &&
15721 "Not supported shufflevector usage.");
15722 auto *SV = cast<ShuffleVectorInst>(VL.front());
15723 unsigned SVNumElements =
15724 cast<FixedVectorType>(SV->getOperand(0)->getType())
15725 ->getNumElements();
15726 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15727 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15728 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15729 int NextIndex = 0;
15730 if (!all_of(Group, [&](Value *V) {
15731 assert(isa<ShuffleVectorInst>(V) &&
15732 "Not supported shufflevector usage.");
15733 auto *SV = cast<ShuffleVectorInst>(V);
15734 int Index;
15735 [[maybe_unused]] bool IsExtractSubvectorMask =
15736 SV->isExtractSubvectorMask(Index);
15737 assert(IsExtractSubvectorMask &&
15738 "Not supported shufflevector usage.");
15739 if (NextIndex != Index)
15740 return false;
15741 NextIndex += SV->getShuffleMask().size();
15742 return true;
15743 }))
15744 return ::getShuffleCost(
15745 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15746 calculateShufflevectorMask(E->Scalars));
15747 }
15748 return TTI::TCC_Free;
15749 });
15750 return GetCostDiff(GetScalarCost, GetVectorCost);
15751 }
15752 case Instruction::Freeze:
15753 return CommonCost;
15754 default:
15755 llvm_unreachable("Unknown instruction");
15756 }
15757}
15758
15759bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15760 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15761 << VectorizableTree.size() << " is fully vectorizable.\n");
15762
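  // A gather node qualifies if it has no ephemeral values and is all-constant,
  // a splat, small enough, a shuffle of extractelements, or a (potentially
  // vectorizable) load bundle.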
15763 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15764 SmallVector<int> Mask;
15765 return TE->isGather() &&
15766 !any_of(TE->Scalars,
15767 [this](Value *V) { return EphValues.contains(V); }) &&
15768 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15769 TE->Scalars.size() < Limit ||
15770 (((TE->hasState() &&
15771 TE->getOpcode() == Instruction::ExtractElement) ||
15772 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15773 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15774 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15775 !TE->isAltShuffle()) ||
15776 any_of(TE->Scalars, IsaPred<LoadInst>));
15777 };
15778
15779 // We only handle trees of heights 1 and 2.
15780 if (VectorizableTree.size() == 1 &&
15781 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15782 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15783 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15784 (ForReduction &&
15785 AreVectorizableGathers(VectorizableTree[0].get(),
15786 VectorizableTree[0]->Scalars.size()) &&
15787 VectorizableTree[0]->getVectorFactor() > 2)))
15788 return true;
15789
15790 if (VectorizableTree.size() != 2)
15791 return false;
15792
15793 // Handle splat and all-constant stores. Also try to vectorize tiny trees
15794 // whose second node is a gather, if it has fewer scalar operands than the
15795 // initial tree entry (it may be profitable to shuffle the second gather),
15796 // or its scalars are extractelements that form a shuffle.
15797 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15798 AreVectorizableGathers(VectorizableTree[1].get(),
15799 VectorizableTree[0]->Scalars.size()))
15800 return true;
15801
15802 // Gathering cost would be too much for tiny trees.
15803 if (VectorizableTree[0]->isGather() ||
15804 (VectorizableTree[1]->isGather() &&
15805 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15806 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15807 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15808 return false;
15809
15810 return true;
15811}
15812
15813static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15814 TargetTransformInfo *TTI,
15815 bool MustMatchOrInst) {
15816 // Look past the root to find a source value. Arbitrarily follow the
15817 // path through operand 0 of any 'or'. Also, peek through optional
15818 // shift-left-by-multiple-of-8-bits.
15819 Value *ZextLoad = Root;
15820 const APInt *ShAmtC;
15821 bool FoundOr = false;
15822 while (!isa<ConstantExpr>(ZextLoad) &&
15823 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15824 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15825 ShAmtC->urem(8) == 0))) {
15826 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15827 ZextLoad = BinOp->getOperand(0);
15828 if (BinOp->getOpcode() == Instruction::Or)
15829 FoundOr = true;
15830 }
15831 // Check if the input is an extended load of the required or/shift expression.
15832 Value *Load;
15833 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15834 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15835 return false;
15836
15837 // Require that the total load bit width is a legal integer type.
15838 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15839 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15840 Type *SrcTy = Load->getType();
15841 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15842 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15843 return false;
15844
15845 // Everything matched - assume that we can fold the whole sequence using
15846 // load combining.
15847 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15848 << *(cast<Instruction>(Root)) << "\n");
15849
15850 return true;
15851}
15852
15853bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15854 if (RdxKind != RecurKind::Or)
15855 return false;
15856
15857 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15858 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15859 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15860 /* MatchOr */ false);
15861}
15862
15863bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15864 // Peek through a final sequence of stores and check if all operations are
15865 // likely to be load-combined.
15866 unsigned NumElts = Stores.size();
15867 for (Value *Scalar : Stores) {
15868 Value *X;
15869 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15870 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15871 return false;
15872 }
15873 return true;
15874}
15875
15876bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15877 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15878 return true;
15879
15880 // Graph is empty - do nothing.
15881 if (VectorizableTree.empty()) {
15882 assert(ExternalUses.empty() && "We shouldn't have any external users");
15883
15884 return true;
15885 }
15886
15887 // No need to vectorize inserts of gathered values.
15888 if (VectorizableTree.size() == 2 &&
15889 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15890 VectorizableTree[1]->isGather() &&
15891 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15892 !(isSplat(VectorizableTree[1]->Scalars) ||
15893 allConstant(VectorizableTree[1]->Scalars))))
15894 return true;
15895
15896 // If the graph includes only PHI nodes and gathers, it is definitely not
15897 // profitable to vectorize, and we can skip it if the cost threshold is the
15898 // default. The cost of vectorized PHI nodes is almost always 0 plus the
15899 // cost of the gathers/buildvectors.
15900 constexpr int Limit = 4;
15901 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15902 !VectorizableTree.empty() &&
15903 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15904 return (TE->isGather() &&
15905 (!TE->hasState() ||
15906 TE->getOpcode() != Instruction::ExtractElement) &&
15907 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15908 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15909 }))
15910 return true;
15911
15912 // Do not vectorize small tree of phis only, if all vector phis are also
15913 // gathered.
15914 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15915 VectorizableTree.size() <= Limit &&
15916 all_of(VectorizableTree,
15917 [&](const std::unique_ptr<TreeEntry> &TE) {
15918 return (TE->isGather() &&
15919 (!TE->hasState() ||
15920 TE->getOpcode() != Instruction::ExtractElement) &&
15921 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15922 Limit) ||
15923 (TE->hasState() &&
15924 (TE->getOpcode() == Instruction::InsertElement ||
15925 (TE->getOpcode() == Instruction::PHI &&
15926 all_of(TE->Scalars, [&](Value *V) {
15927 return isa<PoisonValue>(V) || MustGather.contains(V);
15928 }))));
15929 }) &&
15930 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15931 return TE->State == TreeEntry::Vectorize &&
15932 TE->getOpcode() == Instruction::PHI;
15933 }))
15934 return true;
15935
15936 // If the tree contains only phis, buildvectors, split nodes and
15937 // small nodes with reuses, we can skip it.
15938 SmallVector<const TreeEntry *> StoreLoadNodes;
15939 unsigned NumGathers = 0;
15940 constexpr int LimitTreeSize = 36;
15941 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15942 all_of(VectorizableTree,
15943 [&](const std::unique_ptr<TreeEntry> &TE) {
15944 if (!TE->isGather() && TE->hasState() &&
15945 (TE->getOpcode() == Instruction::Load ||
15946 TE->getOpcode() == Instruction::Store)) {
15947 StoreLoadNodes.push_back(TE.get());
15948 return true;
15949 }
15950 if (TE->isGather())
15951 ++NumGathers;
15952 return TE->State == TreeEntry::SplitVectorize ||
15953 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15954 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15955 VectorizableTree.size() > LimitTreeSize) ||
15956 (TE->isGather() &&
15957 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15958 (TE->hasState() &&
15959 (TE->getOpcode() == Instruction::PHI ||
15960 (TE->hasCopyableElements() &&
15961 static_cast<unsigned>(count_if(
15962 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15963 TE->Scalars.size() / 2) ||
15964 ((!TE->ReuseShuffleIndices.empty() ||
15965 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15966 TE->Scalars.size() == 2)));
15967 }) &&
15968 (StoreLoadNodes.empty() ||
15969 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15970 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15971 return TE->getOpcode() == Instruction::Store ||
15972 all_of(TE->Scalars, [&](Value *V) {
15973 return !isa<LoadInst>(V) ||
15974 areAllUsersVectorized(cast<Instruction>(V));
15975 });
15976 })))))
15977 return true;
15978
15979 // If the tree contains only a buildvector, 2 non-buildvector nodes (whose
15980 // user is the root tree node), and other buildvectors, we can skip it.
15981 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15982 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15983 VectorizableTree.size() >= Limit &&
15984 count_if(ArrayRef(VectorizableTree).drop_front(),
15985 [&](const std::unique_ptr<TreeEntry> &TE) {
15986 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15987 TE->UserTreeIndex.UserTE->Idx == 0;
15988 }) == 2)
15989 return true;
15990
15991 // If the tree only vectorizes a phi node that feeds a buildvector, skip
15992 // it.
15993 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15994 VectorizableTree.size() > 2 &&
15995 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15996 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15997 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15998 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15999 all_of(
16000 ArrayRef(VectorizableTree).drop_front(2),
16001 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
16002 return true;
16003
16004 // We can vectorize the tree if its size is greater than or equal to the
16005 // minimum size specified by the MinTreeSize command line option.
16006 if (VectorizableTree.size() >= MinTreeSize)
16007 return false;
16008
16009 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
16010 // can vectorize it if we can prove it fully vectorizable.
16011 if (isFullyVectorizableTinyTree(ForReduction))
16012 return false;
16013
16014 // Check if any of the gather nodes forms an insertelement buildvector
16015 // somewhere.
16016 bool IsAllowedSingleBVNode =
16017 VectorizableTree.size() > 1 ||
16018 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16019 !VectorizableTree.front()->isAltShuffle() &&
16020 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16021 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16022 allSameBlock(VectorizableTree.front()->Scalars));
16023 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16024 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
16025 return isa<ExtractElementInst, Constant>(V) ||
16026 (IsAllowedSingleBVNode &&
16027 !V->hasNUsesOrMore(UsesLimit) &&
16028 any_of(V->users(), IsaPred<InsertElementInst>));
16029 });
16030 }))
16031 return false;
16032
16033 if (VectorizableTree.back()->isGather() &&
16034 VectorizableTree.back()->hasState() &&
16035 VectorizableTree.back()->isAltShuffle() &&
16036 VectorizableTree.back()->getVectorFactor() > 2 &&
16037 allSameBlock(VectorizableTree.back()->Scalars) &&
16038 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16039 TTI->getScalarizationOverhead(
16040 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
16041 VectorizableTree.back()->getVectorFactor()),
16042 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
16043 /*Insert=*/true, /*Extract=*/false,
16044 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
16045 return false;
16046
16047 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
16048 // vectorizable.
16049 return true;
16050}
16051
16052bool BoUpSLP::isTreeNotExtendable() const {
16053 if (getCanonicalGraphSize() != getTreeSize()) {
16054 constexpr unsigned SmallTree = 3;
16055 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16056 getCanonicalGraphSize() <= SmallTree &&
16057 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
16058 [](const std::unique_ptr<TreeEntry> &TE) {
16059 return TE->isGather() && TE->hasState() &&
16060 TE->getOpcode() == Instruction::Load &&
16061 !allSameBlock(TE->Scalars);
16062 }) == 1)
16063 return true;
16064 return false;
16065 }
16066 bool Res = false;
16067 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
16068 TreeEntry &E = *VectorizableTree[Idx];
16069 if (E.State == TreeEntry::SplitVectorize)
16070 return false;
16071 if (!E.isGather())
16072 continue;
16073 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16074 (!E.hasState() &&
16076 (isa<ExtractElementInst>(E.Scalars.front()) &&
16077 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
16078 return false;
16079 if (isSplat(E.Scalars) || allConstant(E.Scalars))
16080 continue;
16081 Res = true;
16082 }
16083 return Res;
16084}
16085
16086InstructionCost BoUpSLP::getSpillCost() {
16087 // Walk from the bottom of the tree to the top, tracking which values are
16088 // live. When we see a call instruction that is not part of our tree,
16089 // query TTI to see if there is a cost to keeping values live over it
16090 // (for example, if spills and fills are required).
16091
16092 const TreeEntry *Root = VectorizableTree.front().get();
16093 if (Root->isGather())
16094 return 0;
16095
16096 InstructionCost Cost = 0;
16097 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
16098 EntriesToOperands;
16099 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
16100 SmallPtrSet<const Instruction *, 8> LastInstructions;
16101 for (const auto &TEPtr : VectorizableTree) {
16102 if (!TEPtr->isGather()) {
16103 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
16104 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
16105 LastInstructions.insert(LastInst);
16106 }
16107 if (TEPtr->UserTreeIndex)
16108 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
16109 }
16110
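  // Returns true if the intrinsic call does not force a spill: it is either
  // assume-like or cheaper to lower as an intrinsic than as a real call.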
16111 auto NoCallIntrinsic = [this](const Instruction *I) {
16112 const auto *II = dyn_cast<IntrinsicInst>(I);
16113 if (!II)
16114 return false;
16115 if (II->isAssumeLikeIntrinsic())
16116 return true;
16117 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
16118 InstructionCost IntrCost =
16119 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
16120 InstructionCost CallCost = TTI->getCallInstrCost(
16121 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
16122 return IntrCost < CallCost;
16123 };
16124
16125 // Maps last instruction in the entry to the last instruction for the one of
16126 // operand entries and the flag. If the flag is true, there are no calls in
16127 // between these instructions.
16128 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
16129 CheckedInstructions;
16130 unsigned Budget = 0;
16131 const unsigned BudgetLimit =
16132 ScheduleRegionSizeBudget / VectorizableTree.size();
16133 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
16134 const Instruction *Last) {
16135 assert(First->getParent() == Last->getParent() &&
16136 "Expected instructions in same block.");
16137 if (auto It = CheckedInstructions.find(Last);
16138 It != CheckedInstructions.end()) {
16139 const Instruction *Checked = It->second.getPointer();
16140 if (Checked == First || Checked->comesBefore(First))
16141 return It->second.getInt() != 0;
16142 Last = Checked;
16143 } else if (Last == First || Last->comesBefore(First)) {
16144 return true;
16145 }
16146 BasicBlock::reverse_iterator InstIt =
16147 ++First->getIterator().getReverse(),
16148 PrevInstIt =
16149 Last->getIterator().getReverse();
16150 SmallVector<const Instruction *> LastInstsInRange;
16151 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
16152 // Debug information does not impact spill cost.
16153 // Vectorized calls, represented as vector intrinsics, do not impact spill
16154 // cost.
16155 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
16156 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
16157 for (const Instruction *LastInst : LastInstsInRange)
16158 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
16159 return false;
16160 }
16161 if (LastInstructions.contains(&*PrevInstIt))
16162 LastInstsInRange.push_back(&*PrevInstIt);
16163
16164 ++PrevInstIt;
16165 ++Budget;
16166 }
16167 for (const Instruction *LastInst : LastInstsInRange)
16168 CheckedInstructions.try_emplace(
16169 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
16170 Budget <= BudgetLimit ? 1 : 0);
16171 return Budget <= BudgetLimit;
16172 };
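  // Account for spilling the vectorized value of Op across a call: add the
  // cost of keeping the widened vector live over the call and, for revec,
  // subtract the scalar vectors that no longer need to be kept live.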
16173 auto AddCosts = [&](const TreeEntry *Op) {
16174 Type *ScalarTy = Op->Scalars.front()->getType();
16175 auto It = MinBWs.find(Op);
16176 if (It != MinBWs.end())
16177 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
16178 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
16179 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
16180 if (ScalarTy->isVectorTy()) {
16181 // Handle revec dead vector instructions.
16182 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
16183 }
16184 };
16185 // Memoize the relationship between blocks, i.e. whether there is (at least
16186 // one) non-vectorized call between them. This allows us to skip analyzing
16187 // the same block paths multiple times.
16188 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
16189 ParentOpParentToPreds;
16190 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
16191 BasicBlock *OpParent) {
16192 auto Key = std::make_pair(Root, OpParent);
16193 if (auto It = ParentOpParentToPreds.find(Key);
16194 It != ParentOpParentToPreds.end())
16195 return It->second;
16196 SmallVector<BasicBlock *> Worklist;
16197 if (Pred)
16198 Worklist.push_back(Pred);
16199 else
16200 Worklist.append(pred_begin(Root), pred_end(Root));
16203 ParentsPairsToAdd;
16204 bool Res = false;
16206 for (const auto &KeyPair : ParentsPairsToAdd) {
16207 assert(!ParentOpParentToPreds.contains(KeyPair) &&
16208 "Should not have been added before.");
16209 ParentOpParentToPreds.try_emplace(KeyPair, Res);
16210 }
16211 });
16212 while (!Worklist.empty()) {
16213 BasicBlock *BB = Worklist.pop_back_val();
16214 if (BB == OpParent || !Visited.insert(BB).second)
16215 continue;
16216 auto Pair = std::make_pair(BB, OpParent);
16217 if (auto It = ParentOpParentToPreds.find(Pair);
16218 It != ParentOpParentToPreds.end()) {
16219 Res = It->second;
16220 return Res;
16221 }
16222 ParentsPairsToAdd.insert(Pair);
16223 unsigned BlockSize = BB->size();
16224 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
16225 return Res;
16226 Budget += BlockSize;
16227 if (Budget > BudgetLimit)
16228 return Res;
16229 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
16230 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
16231 BB->getTerminator()))
16232 return Res;
16233 Worklist.append(pred_begin(BB), pred_end(BB));
16234 }
16235 Res = true;
16236 return Res;
16237 };
16238 SmallVector<const TreeEntry *> LiveEntries(1, Root);
16239 while (!LiveEntries.empty()) {
16240 const TreeEntry *Entry = LiveEntries.pop_back_val();
16241 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
16242 if (Operands.empty())
16243 continue;
16244 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
16245 BasicBlock *Parent = LastInst->getParent();
16246 for (const TreeEntry *Op : Operands) {
16247 if (!Op->isGather())
16248 LiveEntries.push_back(Op);
16249 if (Entry->State == TreeEntry::SplitVectorize ||
16250 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
16251 (Op->isGather() && allConstant(Op->Scalars)))
16252 continue;
16253 Budget = 0;
16254 BasicBlock *Pred = nullptr;
16255 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
16256 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16257 BasicBlock *OpParent;
16258 Instruction *OpLastInst;
16259 if (Op->isGather()) {
16260 assert(Entry->getOpcode() == Instruction::PHI &&
16261 "Expected phi node only.");
16262 OpParent = cast<PHINode>(Entry->getMainOp())
16263 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16264 OpLastInst = OpParent->getTerminator();
16265 for (Value *V : Op->Scalars) {
16266 auto *Inst = dyn_cast<Instruction>(V);
16267 if (!Inst)
16268 continue;
16269 if (isVectorized(V)) {
16270 OpParent = Inst->getParent();
16271 OpLastInst = Inst;
16272 break;
16273 }
16274 }
16275 } else {
16276 OpLastInst = EntriesToLastInstruction.at(Op);
16277 OpParent = OpLastInst->getParent();
16278 }
16279 // Check the call instructions within the same basic blocks.
16280 if (OpParent == Parent) {
16281 if (Entry->getOpcode() == Instruction::PHI) {
16282 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16283 AddCosts(Op);
16284 continue;
16285 }
16286 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16287 AddCosts(Op);
16288 continue;
16289 }
16290 // Check for call instruction in between blocks.
16291 // 1. Check entry's block to the head.
16292 if (Entry->getOpcode() != Instruction::PHI &&
16293 !CheckForNonVecCallsInSameBlock(
16294 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
16295 LastInst)) {
16296 AddCosts(Op);
16297 continue;
16298 }
16299 // 2. Check op's block from the end.
16300 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16301 OpParent->getTerminator())) {
16302 AddCosts(Op);
16303 continue;
16304 }
16305 // 3. Check the predecessors of entry's block till op's block.
16306 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16307 AddCosts(Op);
16308 continue;
16309 }
16310 }
16311 }
16312
16313 return Cost;
16314}
16315
16316/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
16317/// the buildvector sequence.
16318static bool isFirstInsertElement(const InsertElementInst *IE1,
16319 const InsertElementInst *IE2) {
16320 if (IE1 == IE2)
16321 return false;
16322 const auto *I1 = IE1;
16323 const auto *I2 = IE2;
16324 const InsertElementInst *PrevI1;
16325 const InsertElementInst *PrevI2;
16326 unsigned Idx1 = *getElementIndex(IE1);
16327 unsigned Idx2 = *getElementIndex(IE2);
16328 do {
16329 if (I2 == IE1)
16330 return true;
16331 if (I1 == IE2)
16332 return false;
16333 PrevI1 = I1;
16334 PrevI2 = I2;
16335 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16336 getElementIndex(I1).value_or(Idx2) != Idx2)
16337 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
16338 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16339 getElementIndex(I2).value_or(Idx1) != Idx1)
16340 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
16341 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16342 llvm_unreachable("Two different buildvectors not expected.");
16343}
16344
16345namespace {
16346/// Returns incoming Value *, if the requested type is Value * too, or a default
16347/// value, otherwise.
16348struct ValueSelect {
16349 template <typename U>
16350 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16351 return V;
16352 }
16353 template <typename U>
16354 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16355 return U();
16356 }
16357};
16358} // namespace
16359
16360/// Does the analysis of the provided shuffle masks and performs the requested
16361/// actions on the vectors with the given shuffle masks. It tries to do it in
16362/// several steps.
16363/// 1. If the Base vector is not an undef vector, resize the very first mask to
16364/// have a common VF and perform the action for 2 input vectors (including the
16365/// non-undef Base). Other shuffle masks are combined with the result of the
16366/// first stage and processed as a shuffle of 2 elements.
16367/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
16368/// the action only for 1 vector with the given mask, if it is not the identity
16369/// mask.
16370/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
16371/// vectors, combining the masks properly between the steps.
16372template <typename T>
16373static T *performExtractsShuffleAction(
16374 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16375 function_ref<unsigned(T *)> GetVF,
16376 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16377 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16378 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16379 SmallVector<int> Mask(ShuffleMask.begin()->second);
16380 auto VMIt = std::next(ShuffleMask.begin());
16381 T *Prev = nullptr;
16382 SmallBitVector UseMask =
16383 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16384 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
16385 if (!IsBaseUndef.all()) {
16386 // Base is not undef, need to combine it with the next subvectors.
16387 std::pair<T *, bool> Res =
16388 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16389 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
16390 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16391 if (Mask[Idx] == PoisonMaskElem)
16392 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16393 else
16394 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16395 }
16396 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16397 assert((!V || GetVF(V) == Mask.size()) &&
16398 "Expected base vector of VF number of elements.");
16399 Prev = Action(Mask, {nullptr, Res.first});
16400 } else if (ShuffleMask.size() == 1) {
16401 // Base is undef and only 1 vector is shuffled - perform the action only for
16402 // single vector, if the mask is not the identity mask.
16403 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16404 /*ForSingleMask=*/true);
16405 if (Res.second)
16406 // Identity mask is found.
16407 Prev = Res.first;
16408 else
16409 Prev = Action(Mask, {ShuffleMask.begin()->first});
16410 } else {
16411 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
16412 // shuffles step by step, combining shuffle between the steps.
16413 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16414 unsigned Vec2VF = GetVF(VMIt->first);
16415 if (Vec1VF == Vec2VF) {
16416 // No need to resize the input vectors since they are of the same size, we
16417 // can shuffle them directly.
16418 ArrayRef<int> SecMask = VMIt->second;
16419 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16420 if (SecMask[I] != PoisonMaskElem) {
16421 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16422 Mask[I] = SecMask[I] + Vec1VF;
16423 }
16424 }
16425 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16426 } else {
16427 // Vectors of different sizes - resize and reshuffle.
16428 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16429 /*ForSingleMask=*/false);
16430 std::pair<T *, bool> Res2 =
16431 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16432 ArrayRef<int> SecMask = VMIt->second;
16433 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16434 if (Mask[I] != PoisonMaskElem) {
16435 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16436 if (Res1.second)
16437 Mask[I] = I;
16438 } else if (SecMask[I] != PoisonMaskElem) {
16439 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16440 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16441 }
16442 }
16443 Prev = Action(Mask, {Res1.first, Res2.first});
16444 }
16445 VMIt = std::next(VMIt);
16446 }
16447 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16448 // Perform requested actions for the remaining masks/vectors.
16449 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16450 // Shuffle other input vectors, if any.
16451 std::pair<T *, bool> Res =
16452 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16453 ArrayRef<int> SecMask = VMIt->second;
16454 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16455 if (SecMask[I] != PoisonMaskElem) {
16456 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16457 "Multiple uses of scalars.");
16458 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16459 } else if (Mask[I] != PoisonMaskElem) {
16460 Mask[I] = I;
16461 }
16462 }
16463 Prev = Action(Mask, {Prev, Res.first});
16464 }
16465 return Prev;
16466}
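// A small standalone sketch (assumed helper names, not the SLPVectorizer API)
// of the core mask-combining step used in the function above: when two inputs
// have the same VF, lanes taken from the second vector are re-indexed by
// adding VF so that a single two-source shuffle mask describes both inputs.
#include <cassert>
#include <vector>

namespace slpexample {
constexpr int kPoisonMaskElem = -1; // mirrors LLVM's PoisonMaskElem

// Merges SecMask (lanes from the second source) into Mask (lanes from the
// first source). Each lane must be defined by at most one of the two masks.
inline void combineTwoSourceMasks(std::vector<int> &Mask,
                                  const std::vector<int> &SecMask,
                                  unsigned Vec1VF) {
  for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
    if (SecMask[I] != kPoisonMaskElem) {
      assert(Mask[I] == kPoisonMaskElem && "Multiple uses of scalars.");
      Mask[I] = SecMask[I] + static_cast<int>(Vec1VF);
    }
  }
}

inline void demoCombine() {
  // First vector supplies lanes 0-1, second vector supplies lanes 2-3.
  std::vector<int> Mask = {0, 1, kPoisonMaskElem, kPoisonMaskElem};
  std::vector<int> Sec = {kPoisonMaskElem, kPoisonMaskElem, 2, 3};
  combineTwoSourceMasks(Mask, Sec, /*Vec1VF=*/4);
  assert((Mask == std::vector<int>{0, 1, 6, 7}));
}
} // namespace slpexample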
16467
16469 ArrayRef<Value *> VectorizedVals) {
16471 SmallPtrSet<Value *, 4> CheckedExtracts;
16472 SmallPtrSet<const TreeEntry *, 4> GatheredLoadsNodes;
16473 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16474 << VectorizableTree.size() << ".\n");
16475 InstructionCost Cost = 0;
16476 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16477 TreeEntry &TE = *Ptr;
16478 // No need to count the cost for combined entries, they are combined with
16479 // other nodes, so just skip their cost.
16480 if (TE.State == TreeEntry::CombinedVectorize) {
16481 LLVM_DEBUG(
16482 dbgs() << "SLP: Skipping cost for combined node that starts with "
16483 << *TE.Scalars[0] << ".\n";
16484 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16485 NodesCosts.try_emplace(&TE);
16486 continue;
16487 }
16488 if (TE.hasState() &&
16489 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16490 if (const TreeEntry *E =
16491 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16492 E && E->getVectorFactor() == TE.getVectorFactor()) {
16493 // Some gather nodes might be absolutely the same as some vectorizable
16494 // nodes after reordering, need to handle it.
16495 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16496 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16497 << "SLP: Current total cost = " << Cost << "\n");
16498 NodesCosts.try_emplace(&TE);
16499 continue;
16500 }
16501 }
16502
16503 // Exclude cost of gather loads nodes which are not used. These nodes were
16504 // built as part of the final attempt to vectorize gathered loads.
16505 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16506 "Expected gather nodes with users only.");
16507
16508 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16509 Cost += C;
16510 NodesCosts.try_emplace(&TE, C);
16511 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16512 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16513 << "SLP: Current total cost = " << Cost << "\n");
16514 // Add gathered loads nodes to the set for later processing.
16515 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
16516 TE.getOpcode() == Instruction::Load)
16517 GatheredLoadsNodes.insert(&TE);
16518 }
16519 // Bail out if the cost threshold is negative and cost already below it.
16520 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
16521 Cost < -SLPCostThreshold)
16522 return Cost;
16523 // Bail out, if gathered loads nodes are found.
16524 // TODO: add analysis for gathered load to include their cost correctly into
16525 // the related subtrees.
16526 if (!GatheredLoadsNodes.empty())
16527 return Cost;
16528 // A narrow, non-profitable tree in a loop? Skip it, it may cause regressions.
16529 constexpr unsigned PartLimit = 2;
16530 const unsigned Sz =
16531 getVectorElementSize(VectorizableTree.front()->Scalars.front());
16532 const unsigned MinVF = getMinVF(Sz);
16533 if (Cost >= -SLPCostThreshold &&
16534 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
16535 (!VectorizableTree.front()->hasState() ||
16536 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
16537 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
16538 return Cost;
16540 VectorizableTree.size());
16541 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16542 TreeEntry &TE = *Ptr;
16543 InstructionCost C = NodesCosts.at(&TE);
16544 SubtreeCosts[TE.Idx].first += C;
16545 const TreeEntry *UserTE = TE.UserTreeIndex.UserTE;
16546 while (UserTE) {
16547 SubtreeCosts[UserTE->Idx].first += C;
16548 SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx);
16549 UserTE = UserTE->UserTreeIndex.UserTE;
16550 }
16551 }
16552 using CostIndicesTy =
16553 std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
16554 struct FirstGreater {
16555 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
16556 return LHS.second.first < RHS.second.first ||
16557 (LHS.second.first == RHS.second.first &&
16558 LHS.first->Idx < RHS.first->Idx);
16559 }
16560 };
16562 Worklist;
16563 for (const auto [Idx, P] : enumerate(SubtreeCosts))
16564 Worklist.emplace(VectorizableTree[Idx].get(), P);
16565
16566 // Narrow store trees with non-profitable immediate values - exit.
16567 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
16568 VectorizableTree.front()->hasState() &&
16569 VectorizableTree.front()->getOpcode() == Instruction::Store &&
16570 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
16571 return Cost;
16572
16573 bool Changed = false;
16574 while (!Worklist.empty() && Worklist.top().second.first > 0) {
16575 TreeEntry *TE = Worklist.top().first;
16576 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE)) {
16577 Worklist.pop();
16578 continue;
16579 }
16580
16581 // Calculate the gather cost of the root node.
16582 InstructionCost SubtreeCost = Worklist.top().second.first;
16583 if (SubtreeCost < TE->Scalars.size()) {
16584 Worklist.pop();
16585 continue;
16586 }
16587 if (!TransformedToGatherNodes.empty()) {
16588 for (unsigned Idx : Worklist.top().second.second) {
16589 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
16590 if (It != TransformedToGatherNodes.end()) {
16591 SubtreeCost -= SubtreeCosts[Idx].first;
16592 SubtreeCost += It->second;
16593 }
16594 }
16595 }
16596 if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
16597 Worklist.pop();
16598 continue;
16599 }
16600 const unsigned Sz = TE->Scalars.size();
16601 APInt DemandedElts = APInt::getAllOnes(Sz);
16602 for (auto [Idx, V] : enumerate(TE->Scalars)) {
16603 if (isConstant(V))
16604 DemandedElts.clearBit(Idx);
16605 }
16607
16608 Type *ScalarTy = getValueType(TE->Scalars.front());
16609 auto *VecTy = getWidenedType(ScalarTy, Sz);
16610 const unsigned EntryVF = TE->getVectorFactor();
16611 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
16613 *TTI, ScalarTy, VecTy, DemandedElts,
16614 /*Insert=*/true, /*Extract=*/false, CostKind);
16615 SmallVector<int> Mask;
16616 if (!TE->ReorderIndices.empty() &&
16617 TE->State != TreeEntry::CompressVectorize &&
16618 (TE->State != TreeEntry::StridedVectorize ||
16619 !isReverseOrder(TE->ReorderIndices))) {
16620 SmallVector<int> NewMask;
16621 if (TE->getOpcode() == Instruction::Store) {
16622 // For stores the order is actually a mask.
16623 NewMask.resize(TE->ReorderIndices.size());
16624 copy(TE->ReorderIndices, NewMask.begin());
16625 } else {
16626 inversePermutation(TE->ReorderIndices, NewMask);
16627 }
16628 ::addMask(Mask, NewMask);
16629 }
16630 if (!TE->ReuseShuffleIndices.empty())
16631 ::addMask(Mask, TE->ReuseShuffleIndices);
16632 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
16633 GatherCost +=
16634 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
16635 // If all scalars are reused in gather node(s) or other vector nodes, there
16636 // might be extra cost for inserting them.
16637 if (all_of(TE->Scalars, [&](Value *V) {
16638 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
16639 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
16640 }))
16641 GatherCost *= 2;
16642 // Erase subtree if it is non-profitable.
16643 if (SubtreeCost > GatherCost) {
16644 // If the remaining tree is just a buildvector - exit, it will cause
16645 // endless attempts to vectorize.
16646 if (VectorizableTree.front()->hasState() &&
16647 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16648 TE->Idx == 1)
16650
16651 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
16652 << TE->Idx << " with cost "
16653 << Worklist.top().second.first << " and gather cost "
16654 << GatherCost << ".\n");
16655 if (TE->UserTreeIndex) {
16656 TransformedToGatherNodes.try_emplace(TE, GatherCost);
16657 NodesCosts.erase(TE);
16658 } else {
16659 DeletedNodes.insert(TE);
16660 TransformedToGatherNodes.erase(TE);
16661 NodesCosts.erase(TE);
16662 }
16663 for (unsigned Idx : Worklist.top().second.second) {
16664 TreeEntry &ChildTE = *VectorizableTree[Idx];
16665 DeletedNodes.insert(&ChildTE);
16666 TransformedToGatherNodes.erase(&ChildTE);
16667 NodesCosts.erase(&ChildTE);
16668 }
16669 Changed = true;
16670 }
16671 Worklist.pop();
16672 }
16673 if (!Changed)
16674 return SubtreeCosts.front().first;
16675
16676 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16677 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
16678 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
16679 continue;
16680 }
16681 if (DeletedNodes.contains(TE.get()))
16682 continue;
16683 if (!NodesCosts.contains(TE.get())) {
16685 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
16686 NodesCosts.try_emplace(TE.get(), C);
16687 }
16688 }
16689
16690 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
16691 InstructionCost NewCost = 0;
16692 for (const auto &P : NodesCosts) {
16693 NewCost += P.second;
16694 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
16695 << shortBundleName(P.first->Scalars, P.first->Idx)
16696 << ".\n"
16697 << "SLP: Current total cost = " << Cost << "\n");
16698 }
16699 if (NewCost >= Cost) {
16700 DeletedNodes.clear();
16701 TransformedToGatherNodes.clear();
16702 NewCost = Cost;
16703 }
16704 return NewCost;
16705}
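// A simplified sketch (invented node type, not the real TreeEntry) of the
// subtree-trimming idea implemented above: every node's cost is accumulated
// into each of its ancestors, and a subtree whose accumulated vectorization
// cost exceeds the estimated cost of just gathering its root's scalars is
// marked to be emitted as a gather instead.
#include <vector>

namespace slpexample {
struct CostNode {
  int Parent = -1;      // index of the user node, -1 for the tree root
  long Cost = 0;        // this node's own vectorization cost
  long GatherCost = 0;  // cost of gathering this node's scalars instead
  bool Trimmed = false; // set when the whole subtree should become a gather
};

inline void trimUnprofitableSubtrees(std::vector<CostNode> &Tree) {
  std::vector<long> SubtreeCost(Tree.size(), 0);
  // Accumulate each node's cost into itself and all of its ancestors.
  for (int Idx = 0, E = static_cast<int>(Tree.size()); Idx != E; ++Idx)
    for (int I = Idx; I != -1; I = Tree[I].Parent)
      SubtreeCost[I] += Tree[Idx].Cost;
  // Replace a non-root subtree by a gather when that is estimated cheaper.
  for (int Idx = 0, E = static_cast<int>(Tree.size()); Idx != E; ++Idx) {
    CostNode &N = Tree[Idx];
    if (N.Parent != -1 && SubtreeCost[Idx] > N.GatherCost)
      N.Trimmed = true; // in the real pass, descendants are dropped as well
  }
}
} // namespace slpexample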
16706
16707namespace {
16708/// Data type for handling buildvector sequences with the reused scalars from
16709/// other tree entries.
16710template <typename T> struct ShuffledInsertData {
16711 /// List of insertelements to be replaced by shuffles.
16712 SmallVector<InsertElementInst *> InsertElements;
16713 /// The parent vectors and shuffle mask for the given list of inserts.
16714 MapVector<T, SmallVector<int>> ValueMasks;
16715};
16716} // namespace
16717
16719 ArrayRef<Value *> VectorizedVals,
16720 InstructionCost ReductionCost) {
16721 InstructionCost Cost = TreeCost + ReductionCost;
16722
16723 if (Cost >= -SLPCostThreshold &&
16724 none_of(ExternalUses, [](const ExternalUser &EU) {
16725 return isa_and_nonnull<InsertElementInst>(EU.User);
16726 }))
16727 return Cost;
16728
16729 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16730 InstructionCost ExtractCost = 0;
16732 SmallVector<APInt> DemandedElts;
16733 SmallDenseSet<Value *, 4> UsedInserts;
16735 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16737 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16738 // Keep track of the {Scalar, Index, User} tuple.
16739 // On AArch64, this helps in fusing a mov instruction, associated with
16740 // extractelement, with fmul in the backend so that extractelement is free.
16742 for (ExternalUser &EU : ExternalUses) {
16743 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16744 }
16745 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16746 for (ExternalUser &EU : ExternalUses) {
16747 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16748 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16749 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16750 else dbgs() << " User: nullptr\n");
16751 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16752
16753 // Uses by ephemeral values are free (because the ephemeral value will be
16754 // removed prior to code generation, and so the extraction will be
16755 // removed as well).
16756 if (EphValues.count(EU.User))
16757 continue;
16758
16759 // Check if the scalar for the given user or all users is accounted already.
16760 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16761 (EU.User &&
16762 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16763 continue;
16764
16765 // Skip uses in unreachable blocks or in EH pads (rarely executed) or in
16766 // blocks terminated with an unreachable instruction.
16767 if (BasicBlock *UserParent =
16768 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16769 UserParent &&
16770 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16771 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16772 continue;
16773
16774 // We only add extract cost once for the same scalar.
16775 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16776 !ExtractCostCalculated.insert(EU.Scalar).second)
16777 continue;
16778
16779 // No extract cost for vector "scalar" if REVEC is disabled
16780 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16781 continue;
16782
16783 // If found user is an insertelement, do not calculate extract cost but try
16784 // to detect it as a final shuffled/identity match.
16785 // TODO: what if a user is insertvalue when REVEC is enabled?
16786 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16787 VU && VU->getOperand(1) == EU.Scalar) {
16788 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16789 if (!UsedInserts.insert(VU).second)
16790 continue;
16791 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16792 if (InsertIdx) {
16793 const TreeEntry *ScalarTE = &EU.E;
16794 auto *It = find_if(
16795 ShuffledInserts,
16796 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16797 // Checks if 2 insertelements are from the same buildvector.
16798 InsertElementInst *VecInsert = Data.InsertElements.front();
16800 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16801 Value *Op0 = II->getOperand(0);
16802 if (isVectorized(II) && !isVectorized(Op0))
16803 return nullptr;
16804 return Op0;
16805 });
16806 });
16807 int VecId = -1;
16808 if (It == ShuffledInserts.end()) {
16809 auto &Data = ShuffledInserts.emplace_back();
16810 Data.InsertElements.emplace_back(VU);
16811 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16812 VecId = ShuffledInserts.size() - 1;
16813 auto It = MinBWs.find(ScalarTE);
16814 if (It != MinBWs.end() &&
16815 VectorCasts
16816 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16817 .second) {
16818 unsigned BWSz = It->second.first;
16819 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16820 unsigned VecOpcode;
16821 if (DstBWSz < BWSz)
16822 VecOpcode = Instruction::Trunc;
16823 else
16824 VecOpcode =
16825 It->second.second ? Instruction::SExt : Instruction::ZExt;
16827 InstructionCost C = TTI->getCastInstrCost(
16828 VecOpcode, FTy,
16829 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16830 FTy->getNumElements()),
16832 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16833 << " for extending externally used vector with "
16834 "non-equal minimum bitwidth.\n");
16835 Cost += C;
16836 }
16837 } else {
16838 if (isFirstInsertElement(VU, It->InsertElements.front()))
16839 It->InsertElements.front() = VU;
16840 VecId = std::distance(ShuffledInserts.begin(), It);
16841 }
16842 int InIdx = *InsertIdx;
16843 SmallVectorImpl<int> &Mask =
16844 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16845 if (Mask.empty())
16846 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16847 Mask[InIdx] = EU.Lane;
16848 DemandedElts[VecId].setBit(InIdx);
16849 continue;
16850 }
16851 }
16852 }
16853
16855 // If we plan to rewrite the tree in a smaller type, we will need to sign
16856 // extend the extracted value back to the original type. Here, we account
16857 // for the extract and the added cost of the sign extend if needed.
16858 InstructionCost ExtraCost = TTI::TCC_Free;
16859 auto *ScalarTy = EU.Scalar->getType();
16860 const unsigned BundleWidth = EU.E.getVectorFactor();
16861 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16862 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16863 const TreeEntry *Entry = &EU.E;
16864 auto It = MinBWs.find(Entry);
16865 if (It != MinBWs.end()) {
16866 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16867 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16868 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16869 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16870 ? Instruction::ZExt
16871 : Instruction::SExt;
16872 VecTy = getWidenedType(MinTy, BundleWidth);
16873 ExtraCost =
16874 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16875 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16876 << ExtraCost << "\n");
16877 } else {
16878 ExtraCost =
16879 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16880 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16881 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16882 << *VecTy << ": " << ExtraCost << "\n");
16883 }
16884 // Leave the scalar instructions as is if they are cheaper than extracts.
16885 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16886 Entry->getOpcode() == Instruction::Load) {
16887 // Checks if the user of the external scalar is phi in loop body.
16888 auto IsPhiInLoop = [&](const ExternalUser &U) {
16889 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16890 auto *I = cast<Instruction>(U.Scalar);
16891 const Loop *L = LI->getLoopFor(Phi->getParent());
16892 return L && (Phi->getParent() == I->getParent() ||
16893 L == LI->getLoopFor(I->getParent()));
16894 }
16895 return false;
16896 };
16897 if (!ValueToExtUses) {
16898 ValueToExtUses.emplace();
16899 for (const auto &P : enumerate(ExternalUses)) {
16900 // Ignore phis in loops.
16901 if (IsPhiInLoop(P.value()))
16902 continue;
16903
16904 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16905 }
16906 }
16907 // Can use the original instruction if no operands are vectorized or they
16908 // are already marked as externally used.
16909 auto *Inst = cast<Instruction>(EU.Scalar);
16910 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16911 auto OperandIsScalar = [&](Value *V) {
16912 if (!isVectorized(V)) {
16913 // Some extractelements might not be vectorized, but
16914 // transformed into a shuffle and removed from the function;
16915 // consider it here.
16916 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16917 return !EE->hasOneUse() || !MustGather.contains(EE);
16918 return true;
16919 }
16920 return ValueToExtUses->contains(V);
16921 };
16922 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16923 bool CanBeUsedAsScalarCast = false;
16924 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16925 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16926 Op && all_of(Op->operands(), OperandIsScalar)) {
16927 InstructionCost OpCost =
16928 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16929 ? TTI->getInstructionCost(Op, CostKind)
16930 : 0;
16931 if (ScalarCost + OpCost <= ExtraCost) {
16932 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16933 ScalarCost += OpCost;
16934 }
16935 }
16936 }
16937 if (CanBeUsedAsScalar) {
16938 bool KeepScalar = ScalarCost <= ExtraCost;
16939 // Try to keep the original scalar if the user is a phi node from the same
16940 // block as the root phis currently being vectorized. It allows keeping
16941 // better ordering info for the PHIs being vectorized now.
16942 bool IsProfitablePHIUser =
16943 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16944 VectorizableTree.front()->Scalars.size() > 2)) &&
16945 VectorizableTree.front()->hasState() &&
16946 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16947 !Inst->hasNUsesOrMore(UsesLimit) &&
16948 none_of(Inst->users(),
16949 [&](User *U) {
16950 auto *PHIUser = dyn_cast<PHINode>(U);
16951 return (!PHIUser ||
16952 PHIUser->getParent() !=
16953 cast<Instruction>(
16954 VectorizableTree.front()->getMainOp())
16955 ->getParent()) &&
16956 !isVectorized(U);
16957 }) &&
16958 count_if(Entry->Scalars, [&](Value *V) {
16959 return ValueToExtUses->contains(V);
16960 }) <= 2;
16961 if (IsProfitablePHIUser) {
16962 KeepScalar = true;
16963 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16964 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16965 (!GatheredLoadsEntriesFirst.has_value() ||
16966 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16967 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16968 return ValueToExtUses->contains(V);
16969 });
16970 auto It = ExtractsCount.find(Entry);
16971 if (It != ExtractsCount.end()) {
16972 assert(ScalarUsesCount >= It->getSecond().size() &&
16973 "Expected total number of external uses not less than "
16974 "number of scalar uses.");
16975 ScalarUsesCount -= It->getSecond().size();
16976 }
16977 // Keep the original scalar if the number of externally used instructions
16978 // in the same entry is not a power of 2. It may help to do some extra
16979 // vectorization for now.
16980 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16981 }
16982 if (KeepScalar) {
16983 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16984 for (Value *V : Inst->operands()) {
16985 auto It = ValueToExtUses->find(V);
16986 if (It != ValueToExtUses->end()) {
16987 // Replace all uses to avoid compiler crash.
16988 ExternalUses[It->second].User = nullptr;
16989 }
16990 }
16991 ExtraCost = ScalarCost;
16992 if (!IsPhiInLoop(EU))
16993 ExtractsCount[Entry].insert(Inst);
16994 if (CanBeUsedAsScalarCast) {
16995 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16996 // Update the users of the operands of the cast operand to avoid
16997 // compiler crash.
16998 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16999 for (Value *V : IOp->operands()) {
17000 auto It = ValueToExtUses->find(V);
17001 if (It != ValueToExtUses->end()) {
17002 // Replace all uses to avoid compiler crash.
17003 ExternalUses[It->second].User = nullptr;
17004 }
17005 }
17006 }
17007 }
17008 }
17009 }
17010 }
17011
17012 ExtractCost += ExtraCost;
17013 }
17014 // Insert externals for extracts of operands of casts to be emitted as
17015 // scalars instead of extractelements.
17016 for (Value *V : ScalarOpsFromCasts) {
17017 ExternalUsesAsOriginalScalar.insert(V);
17018 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
17019 const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
17020 return TransformedToGatherNodes.contains(TE) ||
17021 DeletedNodes.contains(TE);
17022 });
17023 if (It != TEs.end()) {
17024 const TreeEntry *UserTE = *It;
17025 ExternalUses.emplace_back(V, nullptr, *UserTE,
17026 UserTE->findLaneForValue(V));
17027 }
17028 }
17029 }
17030 // Add reduced value cost, if resized.
17031 if (!VectorizedVals.empty()) {
17032 const TreeEntry &Root = *VectorizableTree.front();
17033 auto BWIt = MinBWs.find(&Root);
17034 if (BWIt != MinBWs.end()) {
17035 Type *DstTy = Root.Scalars.front()->getType();
17036 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
17037 unsigned SrcSz =
17038 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17039 if (OriginalSz != SrcSz) {
17040 unsigned Opcode = Instruction::Trunc;
17041 if (OriginalSz > SrcSz)
17042 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17043 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
17044 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
17045 assert(SLPReVec && "Only supported by REVEC.");
17046 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
17047 }
17048 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
17051 }
17052 }
17053 }
17054
17055 // A buildvector with externally used scalars, which should remain as
17056 // scalars, should not be vectorized, otherwise the compiler may hang.
17057 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
17058 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
17059 VectorizableTree[1]->hasState() &&
17060 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17061 all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
17062 return ExternalUsesAsOriginalScalar.contains(V);
17063 }))
17065
17066 Cost += ExtractCost;
17067 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
17068 bool ForSingleMask) {
17069 InstructionCost C = 0;
17070 unsigned VF = Mask.size();
17071 unsigned VecVF = TE->getVectorFactor();
17072 bool HasLargeIndex =
17073 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
17074 if ((VF != VecVF && HasLargeIndex) ||
17076
17077 if (HasLargeIndex) {
17078 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
17079 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
17080 OrigMask.begin());
17082 getWidenedType(TE->getMainOp()->getType(), VecVF),
17083 OrigMask);
17084 LLVM_DEBUG(
17085 dbgs() << "SLP: Adding cost " << C
17086 << " for final shuffle of insertelement external users.\n";
17087 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17088 Cost += C;
17089 return std::make_pair(TE, true);
17090 }
17091
17092 if (!ForSingleMask) {
17093 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
17094 for (unsigned I = 0; I < VF; ++I) {
17095 if (Mask[I] != PoisonMaskElem)
17096 ResizeMask[Mask[I]] = Mask[I];
17097 }
17098 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
17101 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
17102 LLVM_DEBUG(
17103 dbgs() << "SLP: Adding cost " << C
17104 << " for final shuffle of insertelement external users.\n";
17105 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17106
17107 Cost += C;
17108 }
17109 }
17110 return std::make_pair(TE, false);
17111 };
17112 // Calculate the cost of the reshuffled vectors, if any.
17113 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
17114 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
17115 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
17116 unsigned VF = 0;
17117 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
17119 assert((TEs.size() == 1 || TEs.size() == 2) &&
17120 "Expected exactly 1 or 2 tree entries.");
17121 if (TEs.size() == 1) {
17122 if (VF == 0)
17123 VF = TEs.front()->getVectorFactor();
17124 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17125 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
17126 !all_of(enumerate(Mask), [=](const auto &Data) {
17127 return Data.value() == PoisonMaskElem ||
17128 (Data.index() < VF &&
17129 static_cast<int>(Data.index()) == Data.value());
17130 })) {
17133 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17134 << " for final shuffle of insertelement "
17135 "external users.\n";
17136 TEs.front()->dump();
17137 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17138 Cost += C;
17139 }
17140 } else {
17141 if (VF == 0) {
17142 if (TEs.front() &&
17143 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17144 VF = TEs.front()->getVectorFactor();
17145 else
17146 VF = Mask.size();
17147 }
17148 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17150 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
17151 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17152 << " for final shuffle of vector node and external "
17153 "insertelement users.\n";
17154 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17155 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17156 Cost += C;
17157 }
17158 VF = Mask.size();
17159 return TEs.back();
17160 };
17162 MutableArrayRef(Vector.data(), Vector.size()), Base,
17163 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
17164 EstimateShufflesCost);
17165 InstructionCost InsertCost = TTI->getScalarizationOverhead(
17167 ShuffledInserts[I].InsertElements.front()->getType()),
17168 DemandedElts[I],
17169 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
17170 Cost -= InsertCost;
17171 }
17172
17173 // Add the cost for reduced value resize (if required).
17174 if (ReductionBitWidth != 0) {
17175 assert(UserIgnoreList && "Expected reduction tree.");
17176 const TreeEntry &E = *VectorizableTree.front();
17177 auto It = MinBWs.find(&E);
17178 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
17179 unsigned SrcSize = It->second.first;
17180 unsigned DstSize = ReductionBitWidth;
17181 unsigned Opcode = Instruction::Trunc;
17182 if (SrcSize < DstSize) {
17183 bool IsArithmeticExtendedReduction =
17184 all_of(*UserIgnoreList, [](Value *V) {
17185 auto *I = cast<Instruction>(V);
17186 return is_contained({Instruction::Add, Instruction::FAdd,
17187 Instruction::Mul, Instruction::FMul,
17188 Instruction::And, Instruction::Or,
17189 Instruction::Xor},
17190 I->getOpcode());
17191 });
17192 if (IsArithmeticExtendedReduction)
17193 Opcode =
17194 Instruction::BitCast; // Handle it by getExtendedReductionCost
17195 else
17196 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17197 }
17198 if (Opcode != Instruction::BitCast) {
17199 auto *SrcVecTy =
17200 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
17201 auto *DstVecTy =
17202 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
17203 TTI::CastContextHint CCH = getCastContextHint(E);
17204 InstructionCost CastCost;
17205 switch (E.getOpcode()) {
17206 case Instruction::SExt:
17207 case Instruction::ZExt:
17208 case Instruction::Trunc: {
17209 const TreeEntry *OpTE = getOperandEntry(&E, 0);
17210 CCH = getCastContextHint(*OpTE);
17211 break;
17212 }
17213 default:
17214 break;
17215 }
17216 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
17218 Cost += CastCost;
17219 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
17220 << " for final resize for reduction from " << SrcVecTy
17221 << " to " << DstVecTy << "\n";
17222 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17223 }
17224 }
17225 }
17226
17227 std::optional<InstructionCost> SpillCost;
17228 if (Cost < -SLPCostThreshold) {
17229 SpillCost = getSpillCost();
17230 Cost += *SpillCost;
17231 }
17232#ifndef NDEBUG
17233 SmallString<256> Str;
17234 {
17235 raw_svector_ostream OS(Str);
17236 OS << "SLP: Spill Cost = ";
17237 if (SpillCost)
17238 OS << *SpillCost;
17239 else
17240 OS << "<skipped>";
17241 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
17242 << "SLP: Total Cost = " << Cost << ".\n";
17243 }
17244 LLVM_DEBUG(dbgs() << Str);
17245 if (ViewSLPTree)
17246 ViewGraph(this, "SLP" + F->getName(), false, Str);
17247#endif
17248
17249 return Cost;
17250}
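// A minimal sketch (hypothetical cost values, not real TTI queries) of the
// per-external-use decision made in the function above: if recomputing the
// scalar outside the vectorized tree is no more expensive than extracting its
// lane from the vector (plus any required extend), the original scalar is kept
// and only its scalar cost is charged.
namespace slpexample {
struct ExternalUseCosts {
  long ScalarCost;  // cost of keeping the original scalar instruction
  long ExtractCost; // cost of the extractelement (and possible extend)
};

// Returns the cost actually added for one external use.
inline long chooseExternalUseCost(const ExternalUseCosts &C,
                                  bool OperandsStayScalar) {
  // The scalar can only be kept if all of its operands stay available as
  // scalars (not vectorized, or already extracted for other external users).
  if (OperandsStayScalar && C.ScalarCost <= C.ExtractCost)
    return C.ScalarCost;
  return C.ExtractCost;
}
} // namespace slpexample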
17251
17252/// Tries to find extractelement instructions with constant indices from a
17253/// fixed vector type and gathers such instructions into a bunch, which most
17254/// likely can be detected as a shuffle of 1 or 2 input vectors. If this attempt
17255/// was successful, the matched scalars are replaced by poison values in \p VL
17256/// for future analysis.
17257std::optional<TTI::ShuffleKind>
17258BoUpSLP::tryToGatherSingleRegisterExtractElements(
17260 // Scan list of gathered scalars for extractelements that can be represented
17261 // as shuffles.
17263 SmallVector<int> UndefVectorExtracts;
17264 for (int I = 0, E = VL.size(); I < E; ++I) {
17265 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17266 if (!EI) {
17267 if (isa<UndefValue>(VL[I]))
17268 UndefVectorExtracts.push_back(I);
17269 continue;
17270 }
17271 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
17272 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
17273 continue;
17274 std::optional<unsigned> Idx = getExtractIndex(EI);
17275 // Undefined index.
17276 if (!Idx) {
17277 UndefVectorExtracts.push_back(I);
17278 continue;
17279 }
17280 if (Idx >= VecTy->getNumElements()) {
17281 UndefVectorExtracts.push_back(I);
17282 continue;
17283 }
17284 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
17285 ExtractMask.reset(*Idx);
17286 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
17287 UndefVectorExtracts.push_back(I);
17288 continue;
17289 }
17290 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
17291 }
17292 // Sort the vector operands by the maximum number of uses in extractelements.
17294 VectorOpToIdx.takeVector();
17295 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
17296 return P1.second.size() > P2.second.size();
17297 });
17298 // Find the best pair of the vectors or a single vector.
17299 const int UndefSz = UndefVectorExtracts.size();
17300 unsigned SingleMax = 0;
17301 unsigned PairMax = 0;
17302 if (!Vectors.empty()) {
17303 SingleMax = Vectors.front().second.size() + UndefSz;
17304 if (Vectors.size() > 1) {
17305 auto *ItNext = std::next(Vectors.begin());
17306 PairMax = SingleMax + ItNext->second.size();
17307 }
17308 }
17309 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
17310 return std::nullopt;
17311 // Check if it is better to perform a shuffle of 2 vectors or just of a
17312 // single vector.
17313 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
17314 SmallVector<Value *> GatheredExtracts(
17315 VL.size(), PoisonValue::get(VL.front()->getType()));
17316 if (SingleMax >= PairMax && SingleMax) {
17317 for (int Idx : Vectors.front().second)
17318 std::swap(GatheredExtracts[Idx], VL[Idx]);
17319 } else if (!Vectors.empty()) {
17320 for (unsigned Idx : {0, 1})
17321 for (int Idx : Vectors[Idx].second)
17322 std::swap(GatheredExtracts[Idx], VL[Idx]);
17323 }
17324 // Add extracts from undefs too.
17325 for (int Idx : UndefVectorExtracts)
17326 std::swap(GatheredExtracts[Idx], VL[Idx]);
17327 // Check that the gather of extractelements can be represented as just a
17328 // shuffle of one or two vectors the scalars are extracted from.
17329 std::optional<TTI::ShuffleKind> Res =
17330 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
17331 if (!Res || all_of(Mask, equal_to(PoisonMaskElem))) {
17332 // TODO: try to check other subsets if possible.
17333 // Restore the original VL if attempt was not successful.
17334 copy(SavedVL, VL.begin());
17335 return std::nullopt;
17336 }
17337 // Restore unused scalars from mask, if some of the extractelements were not
17338 // selected for shuffle.
17339 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
17340 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
17341 isa<UndefValue>(GatheredExtracts[I])) {
17342 std::swap(VL[I], GatheredExtracts[I]);
17343 continue;
17344 }
17345 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17346 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
17347 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
17348 is_contained(UndefVectorExtracts, I))
17349 continue;
17350 }
17351 return Res;
17352}
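// A standalone sketch (assumed types, not the pass's helpers) of the pattern
// recognized above: a group of scalars that are all extractelements with
// constant indices from at most two fixed-width vectors can be rebuilt as one
// shuffle, turning per-element extract costs into a single shuffle cost.
#include <optional>
#include <vector>

namespace slpexample {
struct ExtractLane {
  int SourceVec; // 0 or 1: which of the (at most two) source vectors
  int Index;     // constant extract index, or -1 for an undef element
};

// Builds a two-source shuffle mask, or returns std::nullopt if a third source
// vector would be required.
inline std::optional<std::vector<int>>
buildShuffleMask(const std::vector<ExtractLane> &Group, int SourceVF) {
  std::vector<int> Mask(Group.size(), -1 /*poison lane*/);
  for (size_t I = 0; I < Group.size(); ++I) {
    const ExtractLane &E = Group[I];
    if (E.Index < 0)
      continue; // undef element stays a poison lane
    if (E.SourceVec < 0 || E.SourceVec > 1)
      return std::nullopt; // more than two distinct sources
    Mask[I] = E.Index + E.SourceVec * SourceVF;
  }
  return Mask;
}
} // namespace slpexample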
17353
17354/// Tries to find extractelement instructions with constant indices from a
17355/// fixed vector type and gathers such instructions into a bunch, which most
17356/// likely can be detected as a shuffle of 1 or 2 input vectors. If this attempt
17357/// was successful, the matched scalars are replaced by poison values in \p VL
17358/// for future analysis.
17360BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
17361 SmallVectorImpl<int> &Mask,
17362 unsigned NumParts) const {
17363 assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
17364 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
17365 Mask.assign(VL.size(), PoisonMaskElem);
17366 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17367 for (unsigned Part : seq<unsigned>(NumParts)) {
17368 // Scan list of gathered scalars for extractelements that can be represented
17369 // as shuffles.
17370 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
17371 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17372 SmallVector<int> SubMask;
17373 std::optional<TTI::ShuffleKind> Res =
17374 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
17375 ShufflesRes[Part] = Res;
17376 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
17377 }
17378 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
17379 return Res.has_value();
17380 }))
17381 ShufflesRes.clear();
17382 return ShufflesRes;
17383}
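// A simple sketch of the per-register slicing used above (plain ceiling
// division here; the pass's getPartNumElems/getNumElems helpers may round the
// slice size differently): the gathered list is split into NumParts contiguous
// slices, the single-register matcher runs on each slice, and each per-slice
// mask is copied back into the combined mask at the slice's offset.
#include <algorithm>
#include <utility>
#include <vector>

namespace slpexample {
// Returns {offset, length} pairs covering NumElems elements in NumParts (or
// fewer) contiguous slices.
inline std::vector<std::pair<unsigned, unsigned>>
makeSlices(unsigned NumElems, unsigned NumParts) {
  unsigned SliceSize = (NumElems + NumParts - 1) / NumParts; // ceiling division
  std::vector<std::pair<unsigned, unsigned>> Slices;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    unsigned Begin = Part * SliceSize;
    if (Begin >= NumElems)
      break;
    Slices.emplace_back(Begin, std::min(SliceSize, NumElems - Begin));
  }
  return Slices;
}
} // namespace slpexample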
17384
17385std::optional<TargetTransformInfo::ShuffleKind>
17386BoUpSLP::isGatherShuffledSingleRegisterEntry(
17387 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
17388 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
17389 Entries.clear();
17390 if (TE->Idx == 0)
17391 return std::nullopt;
17392 // TODO: currently checking only for Scalars in the tree entry, need to count
17393 // reused elements too for better cost estimation.
17394 auto GetUserEntry = [&](const TreeEntry *TE) {
17395 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17396 TE = TE->UserTreeIndex.UserTE;
17397 if (TE == VectorizableTree.front().get())
17398 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
17399 return TE->UserTreeIndex;
17400 };
17401 auto HasGatherUser = [&](const TreeEntry *TE) {
17402 while (TE->Idx != 0 && TE->UserTreeIndex) {
17403 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17404 return true;
17405 TE = TE->UserTreeIndex.UserTE;
17406 }
17407 return false;
17408 };
17409 const EdgeInfo TEUseEI = GetUserEntry(TE);
17410 if (!TEUseEI)
17411 return std::nullopt;
17412 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
17413 const BasicBlock *TEInsertBlock = nullptr;
17414 // Main node of PHI entries keeps the correct order of operands/incoming
17415 // blocks.
17416 if (auto *PHI = dyn_cast_or_null<PHINode>(
17417 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
17418 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
17419 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
17420 TEInsertPt = TEInsertBlock->getTerminator();
17421 } else {
17422 TEInsertBlock = TEInsertPt->getParent();
17423 }
17424 if (!DT->isReachableFromEntry(TEInsertBlock))
17425 return std::nullopt;
17426 auto *NodeUI = DT->getNode(TEInsertBlock);
17427 assert(NodeUI && "Should only process reachable instructions");
17428 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
17429 auto CheckOrdering = [&](const Instruction *InsertPt) {
17430 // Argument InsertPt is an instruction where vector code for some other
17431 // tree entry (one that shares one or more scalars with TE) is going to be
17432 // generated. This lambda returns true if insertion point of vector code
17433 // for the TE dominates that point (otherwise dependency is the other way
17434 // around). The other node is not limited to be of a gather kind. Gather
17435 // nodes are not scheduled and their vector code is inserted before their
17436 // first user. If user is PHI, that is supposed to be at the end of a
17437 // predecessor block. Otherwise it is the last instruction among scalars of
17438 // the user node. So, instead of checking dependency between instructions
17439 // themselves, we check dependency between their insertion points for vector
17440 // code (since each scalar instruction ends up as a lane of a vector
17441 // instruction).
17442 const BasicBlock *InsertBlock = InsertPt->getParent();
17443 auto *NodeEUI = DT->getNode(InsertBlock);
17444 if (!NodeEUI)
17445 return false;
17446 assert((NodeUI == NodeEUI) ==
17447 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
17448 "Different nodes should have different DFS numbers");
17449 // Check the order of the gather nodes users.
17450 if (TEInsertPt->getParent() != InsertBlock &&
17451 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
17452 return false;
17453 if (TEInsertPt->getParent() == InsertBlock &&
17454 TEInsertPt->comesBefore(InsertPt))
17455 return false;
17456 return true;
17457 };
17458 // Find all tree entries used by the gathered values. If no common entries
17459 // found - not a shuffle.
17460 // Here we build a set of tree nodes for each gathered value and try to
17461 // find the intersection between these sets. If we have at least one common
17462 // tree node for each gathered value - we have just a permutation of a
17463 // single vector. If we have 2 different sets, we're in a situation where we
17464 // have a permutation of 2 input vectors.
17466 SmallDenseMap<Value *, int> UsedValuesEntry;
17467 SmallPtrSet<const Value *, 16> VisitedValue;
17468 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
17469 // The node is reused - exit.
17470 if ((TEPtr->getVectorFactor() != VL.size() &&
17471 TEPtr->Scalars.size() != VL.size()) ||
17472 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
17473 return false;
17474 UsedTEs.clear();
17475 UsedTEs.emplace_back().insert(TEPtr);
17476 for (Value *V : VL) {
17477 if (isConstant(V))
17478 continue;
17479 UsedValuesEntry.try_emplace(V, 0);
17480 }
17481 return true;
17482 };
17483 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
17484 unsigned EdgeIdx) {
17485 const TreeEntry *Ptr1 = User1;
17486 const TreeEntry *Ptr2 = User2;
17487 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
17488 while (Ptr2) {
17489 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
17490 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
17491 Ptr2 = Ptr2->UserTreeIndex.UserTE;
17492 }
17493 while (Ptr1) {
17494 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
17495 Ptr1 = Ptr1->UserTreeIndex.UserTE;
17496 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
17497 return Idx < It->second;
17498 }
17499 return false;
17500 };
17501 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
17502 Instruction *InsertPt) {
17503 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
17504 !TEUseEI.UserTE->isCopyableElement(
17505 const_cast<Instruction *>(TEInsertPt)) &&
17506 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17507 InsertPt->getNextNode() == TEInsertPt &&
17508 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
17509 !isUsedOutsideBlock(InsertPt));
17510 };
17511 for (Value *V : VL) {
17512 if (isConstant(V) || !VisitedValue.insert(V).second)
17513 continue;
17514 // Build a list of tree entries where V is used.
17515 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17517 ValueToGatherNodes.lookup(V).takeVector());
17518 if (TransformedToGatherNodes.contains(TE)) {
17519 for (TreeEntry *E : getSplitTreeEntries(V)) {
17520 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17521 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17522 continue;
17523 GatherNodes.push_back(E);
17524 }
17525 for (TreeEntry *E : getTreeEntries(V)) {
17526 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17527 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17528 continue;
17529 GatherNodes.push_back(E);
17530 }
17531 }
17532 for (const TreeEntry *TEPtr : GatherNodes) {
17533 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
17534 continue;
17535 assert(any_of(TEPtr->Scalars,
17536 [&](Value *V) { return GatheredScalars.contains(V); }) &&
17537 "Must contain at least single gathered value.");
17538 assert(TEPtr->UserTreeIndex &&
17539 "Expected only single user of a gather node.");
17540 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17541
17542 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17543 UseEI.UserTE->hasState())
17544 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
17545 : nullptr;
17546 Instruction *InsertPt =
17547 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
17548 : &getLastInstructionInBundle(UseEI.UserTE);
17549 if (TEInsertPt == InsertPt) {
17550 // Check nodes, which might be emitted first.
17551 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17552 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17553 TEUseEI.UserTE->isAltShuffle()) &&
17554 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
17555 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17556 (UseEI.UserTE->hasState() &&
17557 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17558 !UseEI.UserTE->isAltShuffle()) ||
17559 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
17560 continue;
17561 }
17562
17563 // If the schedulable insertion point is used in multiple entries - just
17564 // exit, no known ordering at this point, available only after real
17565 // scheduling.
17566 if (!doesNotNeedToBeScheduled(InsertPt) &&
17567 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17568 continue;
17569 // If the users are the PHI nodes with the same incoming blocks - skip.
17570 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17571 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17572 UseEI.UserTE->State == TreeEntry::Vectorize &&
17573 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17574 TEUseEI.UserTE != UseEI.UserTE)
17575 continue;
17576 // If 2 gathers are operands of the same entry (regardless of whether
17577 // the user is a PHI or not), compare operand indices and use the earlier
17578 // one as the base.
17579 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17580 continue;
17581 // If the user instruction is used for some reason in different
17582 // vectorized nodes - make it depend on index.
17583 if (TEUseEI.UserTE != UseEI.UserTE &&
17584 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17585 HasGatherUser(TEUseEI.UserTE)))
17586 continue;
17587 // If the user node is the operand of the other user node - skip.
17588 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17589 continue;
17590 }
17591
17592 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17593 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17594 UseEI.UserTE->doesNotNeedToSchedule() &&
17595 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
17596 continue;
17597 // Check if the user node of the TE comes after user node of TEPtr,
17598 // otherwise TEPtr depends on TE.
17599 if ((TEInsertBlock != InsertPt->getParent() ||
17600 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17601 (!CheckOrdering(InsertPt) ||
17602 (UseEI.UserTE->hasCopyableElements() &&
17603 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17604 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
17605 continue;
17606 // The node is reused - exit.
17607 if (CheckAndUseSameNode(TEPtr))
17608 break;
17609 // Is the parent node copyable with its last instruction used outside the
17610 // block, and is that instruction the next instruction after the last
17611 // instruction of TEPtr? If so, exit to preserve the def-use chain.
17612 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17613 continue;
17614 VToTEs.insert(TEPtr);
17615 }
17616 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17617 const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
17618 return MTE != TE && MTE != TEUseEI.UserTE &&
17619 !DeletedNodes.contains(MTE) &&
17620 !TransformedToGatherNodes.contains(MTE);
17621 });
17622 if (It != VTEs.end()) {
17623 const TreeEntry *VTE = *It;
17624 if (none_of(TE->CombinedEntriesWithIndices,
17625 [&](const auto &P) { return P.first == VTE->Idx; })) {
17626 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17627 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17628 continue;
17629 }
17630 // The node is reused - exit.
17631 if (CheckAndUseSameNode(VTE))
17632 break;
17633 VToTEs.insert(VTE);
17634 }
17635 }
17636 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17637 const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
17638 return TE != MainTE && !DeletedNodes.contains(TE) &&
17639 !TransformedToGatherNodes.contains(TE);
17640 });
17641 if (It != VTEs.end()) {
17642 const TreeEntry *VTE = *It;
17643 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17644 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17645 VTEs = VTEs.drop_front();
17646 // Iterate through all vectorized nodes.
17647 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
17648 return MTE->State == TreeEntry::Vectorize;
17649 });
17650 if (MIt == VTEs.end())
17651 continue;
17652 VTE = *MIt;
17653 }
17654 if (none_of(TE->CombinedEntriesWithIndices,
17655 [&](const auto &P) { return P.first == VTE->Idx; })) {
17656 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17657 if (&LastBundleInst == TEInsertPt ||
17658 !CheckOrdering(&LastBundleInst) ||
17659 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17660 continue;
17661 }
17662 // The node is reused - exit.
17663 if (CheckAndUseSameNode(VTE))
17664 break;
17665 VToTEs.insert(VTE);
17666 }
17667 }
17668 if (VToTEs.empty())
17669 continue;
17670 if (UsedTEs.empty()) {
17671 // The first iteration, just insert the list of nodes to vector.
17672 UsedTEs.push_back(VToTEs);
17673 UsedValuesEntry.try_emplace(V, 0);
17674 } else {
17675 // Need to check if there are any previously used tree nodes which use V.
17676 // If there are no such nodes, consider that we have another input
17677 // vector.
17678 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17679 unsigned Idx = 0;
17680 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17681 // Do we have a non-empty intersection of previously listed tree entries
17682 // and tree entries using current V?
17683 set_intersect(VToTEs, Set);
17684 if (!VToTEs.empty()) {
17685 // Yes, write the new subset and continue analysis for the next
17686 // scalar.
17687 Set.swap(VToTEs);
17688 break;
17689 }
17690 VToTEs = SavedVToTEs;
17691 ++Idx;
17692 }
17693 // No non-empty intersection found - need to add a second set of possible
17694 // source vectors.
17695 if (Idx == UsedTEs.size()) {
17696 // If the number of input vectors is greater than 2 - not a permutation,
17697 // fallback to the regular gather.
17698 // TODO: support multiple reshuffled nodes.
17699 if (UsedTEs.size() == 2)
17700 continue;
17701 UsedTEs.push_back(SavedVToTEs);
17702 Idx = UsedTEs.size() - 1;
17703 }
17704 UsedValuesEntry.try_emplace(V, Idx);
17705 }
17706 }
17707
17708 if (UsedTEs.empty()) {
17709 Entries.clear();
17710 return std::nullopt;
17711 }
17712
17713 unsigned VF = 0;
17714 if (UsedTEs.size() == 1) {
17715 // Keep the order to avoid non-determinism.
17716 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17717 UsedTEs.front().end());
17718 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17719 return TE1->Idx < TE2->Idx;
17720 });
17721 // Try to find the perfect match in another gather node at first.
17722 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17723 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17724 });
17725 if (It != FirstEntries.end() &&
17726 ((*It)->getVectorFactor() == VL.size() ||
17727 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17728 TE->ReuseShuffleIndices.size() == VL.size() &&
17729 (*It)->isSame(TE->Scalars)))) {
17730 Entries.push_back(*It);
17731 if ((*It)->getVectorFactor() == VL.size()) {
17732 std::iota(std::next(Mask.begin(), Part * VL.size()),
17733 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17734 } else {
17735 SmallVector<int> CommonMask = TE->getCommonMask();
17736 copy(CommonMask, Mask.begin());
17737 }
17738 // Clear undef scalars.
17739 for (unsigned I : seq<unsigned>(VL.size()))
17740 if (isa<PoisonValue>(VL[I]))
17741 Mask[Part * VL.size() + I] = PoisonMaskElem;
17743 }
17744 // No perfect match, just shuffle, so choose the first tree node from the
17745 // tree.
17746 Entries.push_back(FirstEntries.front());
17747 // Update mapping between values and corresponding tree entries.
17748 for (auto &P : UsedValuesEntry)
17749 P.second = 0;
17750 VF = FirstEntries.front()->getVectorFactor();
17751 } else {
17752 // Try to find nodes with the same vector factor.
17753 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17754 // Keep the order of tree nodes to avoid non-determinism.
17755 DenseMap<int, const TreeEntry *> VFToTE;
17756 for (const TreeEntry *TE : UsedTEs.front()) {
17757 unsigned VF = TE->getVectorFactor();
17758 auto It = VFToTE.find(VF);
17759 if (It != VFToTE.end()) {
17760 if (It->second->Idx > TE->Idx)
17761 It->getSecond() = TE;
17762 continue;
17763 }
17764 VFToTE.try_emplace(VF, TE);
17765 }
17766 // Same, keep the order to avoid non-determinism.
17767 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17768 UsedTEs.back().end());
17769 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17770 return TE1->Idx < TE2->Idx;
17771 });
17772 for (const TreeEntry *TE : SecondEntries) {
17773 auto It = VFToTE.find(TE->getVectorFactor());
17774 if (It != VFToTE.end()) {
17775 VF = It->first;
17776 Entries.push_back(It->second);
17777 Entries.push_back(TE);
17778 break;
17779 }
17780 }
17781 // No 2 source vectors with the same vector factor - just choose 2 with max
17782 // index.
17783 if (Entries.empty()) {
17784 Entries.push_back(*llvm::max_element(
17785 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17786 return TE1->Idx < TE2->Idx;
17787 }));
17788 Entries.push_back(SecondEntries.front());
17789 VF = std::max(Entries.front()->getVectorFactor(),
17790 Entries.back()->getVectorFactor());
17791 } else {
17792 VF = Entries.front()->getVectorFactor();
17793 }
17794 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17795 for (const TreeEntry *E : Entries)
17796 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17797 E->Scalars.end());
17798 // Update mapping between values and corresponding tree entries.
17799 for (auto &P : UsedValuesEntry) {
17800 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17801 if (ValuesToEntries[Idx].contains(P.first)) {
17802 P.second = Idx;
17803 break;
17804 }
17805 }
17806 }
17807
17808 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17809 // Checks if the 2 PHIs are compatible in terms of high possibility to be
17810 // vectorized.
17811 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17812 auto *PHI = cast<PHINode>(V);
17813 auto *PHI1 = cast<PHINode>(V1);
17814 // Check that all incoming values are compatible/from same parent (if they
17815 // are instructions).
17816 // The incoming values are compatible if they all are constants, or
17817 // instruction with the same/alternate opcodes from the same basic block.
17818 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17819 Value *In = PHI->getIncomingValue(I);
17820 Value *In1 = PHI1->getIncomingValue(I);
17821 if (isConstant(In) && isConstant(In1))
17822 continue;
17823 if (!getSameOpcode({In, In1}, *TLI))
17824 return false;
17825 if (cast<Instruction>(In)->getParent() !=
17826 cast<Instruction>(In1)->getParent())
17827 return false;
17828 }
17829 return true;
17830 };
17831 // Check if the value can be ignored during analysis for shuffled gathers.
17832 // We suppose it is better to ignore instructions which do not form splats,
17833 // are not vectorized/not extractelements (these instructions will be handled
17834 // by extractelements processing) or may form a vector node in the future.
17835 auto MightBeIgnored = [=](Value *V) {
17836 auto *I = dyn_cast<Instruction>(V);
17837 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17838 !isVectorLikeInstWithConstOps(I) &&
17839 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17840 };
17841 // Check that the neighbor instruction may form a full vector node with the
17842 // current instruction V. It is possible, if they have same/alternate opcode
17843 // and same parent basic block.
17844 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17845 Value *V1 = VL[Idx];
17846 bool UsedInSameVTE = false;
17847 auto It = UsedValuesEntry.find(V1);
17848 if (It != UsedValuesEntry.end())
17849 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17850 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17851 getSameOpcode({V, V1}, *TLI) &&
17852 cast<Instruction>(V)->getParent() ==
17853 cast<Instruction>(V1)->getParent() &&
17854 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17855 };
17856 // Build a shuffle mask for better cost estimation and vector emission.
17857 SmallBitVector UsedIdxs(Entries.size());
17858 SmallVector<std::pair<unsigned, int>> EntryLanes;
17859 for (int I = 0, E = VL.size(); I < E; ++I) {
17860 Value *V = VL[I];
17861 auto It = UsedValuesEntry.find(V);
17862 if (It == UsedValuesEntry.end())
17863 continue;
17864 // Do not try to shuffle scalars, if they are constants, or instructions
17865 // that can be vectorized as a result of the following vector build
17866 // vectorization.
17867 if (isConstant(V) || (MightBeIgnored(V) &&
17868 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17869 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17870 continue;
17871 unsigned Idx = It->second;
17872 EntryLanes.emplace_back(Idx, I);
17873 UsedIdxs.set(Idx);
17874 }
17875 // Iterate through all shuffled scalars and select entries, which can be used
17876 // for final shuffle.
17877 SmallVector<const TreeEntry *> TempEntries;
17878 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17879 if (!UsedIdxs.test(I))
17880 continue;
17881 // Fix the entry number for the given scalar. If it is the first entry, set
17882 // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
17883 // These indices are used when calculating final shuffle mask as the vector
17884 // offset.
17885 for (std::pair<unsigned, int> &Pair : EntryLanes)
17886 if (Pair.first == I)
17887 Pair.first = TempEntries.size();
17888 TempEntries.push_back(Entries[I]);
17889 }
17890 Entries.swap(TempEntries);
17891 if (EntryLanes.size() == Entries.size() &&
17892 !VL.equals(ArrayRef(TE->Scalars)
17893 .slice(Part * VL.size(),
17894 std::min<int>(VL.size(), TE->Scalars.size())))) {
17895 // We may have here 1 or 2 entries only. If the number of scalars is equal
17896 // to the number of entries, no need to do the analysis, it is not very
17897 // profitable. Since VL is not the same as TE->Scalars, it means we already
17898 // have some shuffles before. Cut off the non-profitable case.
17899 Entries.clear();
17900 return std::nullopt;
17901 }
17902 // Build the final mask, check for the identity shuffle, if possible.
17903 bool IsIdentity = Entries.size() == 1;
17904 // Pair.first is the offset to the vector, while Pair.second is the index of
17905 // scalar in the list.
17906 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17907 unsigned Idx = Part * VL.size() + Pair.second;
17908 Mask[Idx] =
17909 Pair.first * VF +
17910 (ForOrder ? std::distance(
17911 Entries[Pair.first]->Scalars.begin(),
17912 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17913 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17914 IsIdentity &= Mask[Idx] == Pair.second;
17915 }
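// Each mask element encodes its source entry (Pair.first, scaled by VF) plus
// the lane within that entry; IsIdentity tracks whether the combined mask
// turned out to be a no-op permutation.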
17916 if (ForOrder || IsIdentity || Entries.empty()) {
17917 switch (Entries.size()) {
17918 case 1:
17919 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17920 return TargetTransformInfo::SK_PermuteSingleSrc;
17921 break;
17922 case 2:
17923 if (EntryLanes.size() > 2 || VL.size() <= 2)
17924 return TargetTransformInfo::SK_PermuteTwoSrc;
17925 break;
17926 default:
17927 break;
17928 }
17929 } else if (!isa<VectorType>(VL.front()->getType()) &&
17930 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17931 // Do the cost estimation if a shuffle is more beneficial than a buildvector.
17932 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17933 std::next(Mask.begin(), (Part + 1) * VL.size()));
17934 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17935 for (int Idx : SubMask) {
17936 if (Idx == PoisonMaskElem)
17937 continue;
17938 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17939 MinElement = Idx;
17940 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17941 MaxElement = Idx;
17942 }
17943 assert(MaxElement >= 0 && MinElement >= 0 &&
17944 MaxElement % VF >= MinElement % VF &&
17945 "Expected at least single element.");
17946 unsigned NewVF = std::max<unsigned>(
17947 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17948 (MaxElement % VF) -
17949 (MinElement % VF) + 1));
17950 if (NewVF < VF) {
17951 for (int &Idx : SubMask) {
17952 if (Idx == PoisonMaskElem)
17953 continue;
17954 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17955 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17956 }
17957 } else {
17958 NewVF = VF;
17959 }
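// If a narrower NewVF was chosen, SubMask has been rebased into the
// NewVF-aligned slice containing MinElement, with lanes taken from the second
// source keeping an extra offset of NewVF.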
17960
17961 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17962 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17963 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17964 auto GetShuffleCost = [&,
17965 &TTI = *TTI](ArrayRef<int> Mask,
17966 ArrayRef<const TreeEntry *> Entries,
17967 VectorType *VecTy) -> InstructionCost {
17968 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17969 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17970 Mask, Entries.front()->getInterleaveFactor()))
17971 return TTI::TCC_Free;
17972 return ::getShuffleCost(TTI,
17973 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17974 : TTI::SK_PermuteSingleSrc,
17975 VecTy, Mask, CostKind);
17976 };
17977 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17978 InstructionCost FirstShuffleCost = 0;
17979 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17980 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17981 FirstShuffleCost = ShuffleCost;
17982 } else {
17983 // Transform the mask to include only the first entry.
17984 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17985 bool IsIdentity = true;
17986 for (auto [I, Idx] : enumerate(FirstMask)) {
17987 if (Idx >= static_cast<int>(NewVF)) {
17988 Idx = PoisonMaskElem;
17989 } else {
17990 DemandedElts.clearBit(I);
17991 if (Idx != PoisonMaskElem)
17992 IsIdentity &= static_cast<int>(I) == Idx;
17993 }
17994 }
17995 if (!IsIdentity)
17996 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17997 FirstShuffleCost += getScalarizationOverhead(
17998 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17999 /*Extract=*/false, CostKind);
18000 }
18001 InstructionCost SecondShuffleCost = 0;
18002 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18003 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18004 SecondShuffleCost = ShuffleCost;
18005 } else {
18006 // Transform the mask to include only the second entry.
18007 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18008 bool IsIdentity = true;
18009 for (auto [I, Idx] : enumerate(SecondMask)) {
18010 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18011 Idx = PoisonMaskElem;
18012 } else {
18013 DemandedElts.clearBit(I);
18014 if (Idx != PoisonMaskElem) {
18015 Idx -= NewVF;
18016 IsIdentity &= static_cast<int>(I) == Idx;
18017 }
18018 }
18019 }
18020 if (!IsIdentity)
18021 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18022 SecondShuffleCost += getScalarizationOverhead(
18023 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18024 /*Extract=*/false, CostKind);
18025 }
18026 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18027 for (auto [I, Idx] : enumerate(SubMask))
18028 if (Idx == PoisonMaskElem)
18029 DemandedElts.clearBit(I);
18030 InstructionCost BuildVectorCost = getScalarizationOverhead(
18031 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18032 /*Extract=*/false, CostKind);
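// Pick the cheapest alternative: shuffle both sources, shuffle only one of
// them (masking the other lanes out and rebuilding them with inserts), or
// give up on shuffling and build the whole subvector from scalars.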
18033 const TreeEntry *BestEntry = nullptr;
18034 if (FirstShuffleCost < ShuffleCost) {
18035 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18036 std::next(Mask.begin(), (Part + 1) * VL.size()),
18037 [&](int &Idx) {
18038 if (Idx >= static_cast<int>(VF))
18039 Idx = PoisonMaskElem;
18040 });
18041 BestEntry = Entries.front();
18042 ShuffleCost = FirstShuffleCost;
18043 }
18044 if (SecondShuffleCost < ShuffleCost) {
18045 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18046 std::next(Mask.begin(), (Part + 1) * VL.size()),
18047 [&](int &Idx) {
18048 if (Idx < static_cast<int>(VF))
18049 Idx = PoisonMaskElem;
18050 else
18051 Idx -= VF;
18052 });
18053 BestEntry = Entries[1];
18054 ShuffleCost = SecondShuffleCost;
18055 }
18056 if (BuildVectorCost >= ShuffleCost) {
18057 if (BestEntry) {
18058 Entries.clear();
18059 Entries.push_back(BestEntry);
18060 }
18061 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
18062 : TargetTransformInfo::SK_PermuteSingleSrc;
18063 }
18064 }
18065 Entries.clear();
18066 // Clear the corresponding mask elements.
18067 std::fill(std::next(Mask.begin(), Part * VL.size()),
18068 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
18069 return std::nullopt;
18070}
18071
18072SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
18073BoUpSLP::isGatherShuffledEntry(
18074 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
18075 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
18076 bool ForOrder) {
18077 assert(NumParts > 0 && NumParts < VL.size() &&
18078 "Expected positive number of registers.");
18079 Entries.clear();
18080 // No need to check for the topmost gather node.
18081 if (TE == VectorizableTree.front().get() &&
18082 (!GatheredLoadsEntriesFirst.has_value() ||
18083 none_of(ArrayRef(VectorizableTree).drop_front(),
18084 [](const std::unique_ptr<TreeEntry> &TE) {
18085 return !TE->isGather();
18086 })))
18087 return {};
18088 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
18089 // implemented yet.
18090 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
18091 return {};
18092 Mask.assign(VL.size(), PoisonMaskElem);
18093 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
18094 "Expected only single user of the gather node.");
18095 assert(VL.size() % NumParts == 0 &&
18096 "Number of scalars must be divisible by NumParts.");
18097 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
18098 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18099 (TE->Idx == 0 ||
18100 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
18101 isSplat(TE->Scalars) ||
18102 (TE->hasState() &&
18103 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
18104 return {};
18105 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18106 SmallVector<std::optional<TTI::ShuffleKind>> Res;
18107 for (unsigned Part : seq<unsigned>(NumParts)) {
18108 ArrayRef<Value *> SubVL =
18109 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
18110 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18111 std::optional<TTI::ShuffleKind> SubRes =
18112 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
18113 ForOrder);
18114 if (!SubRes)
18115 SubEntries.clear();
18116 Res.push_back(SubRes);
18117 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
18118 SubEntries.front()->getVectorFactor() == VL.size() &&
18119 (SubEntries.front()->isSame(TE->Scalars) ||
18120 SubEntries.front()->isSame(VL))) {
18121 SmallVector<const TreeEntry *> LocalSubEntries;
18122 LocalSubEntries.swap(SubEntries);
18123 Entries.clear();
18124 Res.clear();
18125 std::iota(Mask.begin(), Mask.end(), 0);
18126 // Clear undef scalars.
18127 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
18128 if (isa<PoisonValue>(VL[I]))
18129 Mask[I] = PoisonMaskElem;
18130 Entries.emplace_back(1, LocalSubEntries.front());
18131 Res.assign(NumParts, TTI::SK_PermuteSingleSrc);
18132 return Res;
18133 }
18134 }
18135 if (all_of(Res,
18136 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
18137 Entries.clear();
18138 return {};
18139 }
18140 return Res;
18141}
18142
18143InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
18144 Type *ScalarTy) const {
18145 const unsigned VF = VL.size();
18146 auto *VecTy = getWidenedType(ScalarTy, VF);
18147 // Find the cost of inserting/extracting values from the vector.
18148 // Check if the same elements are inserted several times and count them as
18149 // shuffle candidates.
18150 APInt DemandedElements = APInt::getZero(VF);
18151 InstructionCost Cost;
18152 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18153 auto EstimateInsertCost = [&](unsigned I, Value *V) {
18154 DemandedElements.setBit(I);
18155 if (V->getType() != ScalarTy)
18156 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
18157 TTI::CastContextHint::None, CostKind);
18158 };
18159 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
18160 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
18161 for (auto [I, V] : enumerate(VL)) {
18162 // No need to shuffle duplicates for constants.
18163 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
18164 continue;
18165
18166 if (isConstant(V)) {
18167 ConstantShuffleMask[I] = I + VF;
18168 continue;
18169 }
18170 EstimateInsertCost(I, V);
18171 }
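// Lanes holding non-undef constants were redirected past VF in
// ConstantShuffleMask, so they can be blended in with a single two-source
// shuffle against a constant vector instead of individual inserts.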
18172 // FIXME: add a cost for constant vector materialization.
18173 bool IsAnyNonUndefConst =
18174 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
18175 // 1. Shuffle input source vector and constant vector.
18176 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18177 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18178 ConstantShuffleMask);
18179 }
18180
18181 // 2. Insert unique non-constants.
18182 if (!DemandedElements.isZero())
18183 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
18184 /*Insert=*/true,
18185 /*Extract=*/false, CostKind,
18186 ForPoisonSrc && !IsAnyNonUndefConst, VL);
18187 return Cost;
18188}
18189
18190Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18191 auto It = EntryToLastInstruction.find(E);
18192 if (It != EntryToLastInstruction.end())
18193 return *cast<Instruction>(It->second);
18194 Instruction *Res = nullptr;
18195 // Get the basic block this bundle is in. All instructions in the bundle
18196 // should be in this block (except for extractelement-like instructions with
18197 // constant indices or gathered loads or copyables).
18198 Instruction *Front;
18199 unsigned Opcode;
18200 if (E->hasState()) {
18201 Front = E->getMainOp();
18202 Opcode = E->getOpcode();
18203 } else {
18204 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
18205 Opcode = Front->getOpcode();
18206 }
18207 auto *BB = Front->getParent();
18208 assert(
18209 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18210 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18211 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18212 all_of(E->Scalars,
18213 [=](Value *V) -> bool {
18214 if (Opcode == Instruction::GetElementPtr &&
18215 !isa<GetElementPtrInst>(V))
18216 return true;
18217 auto *I = dyn_cast<Instruction>(V);
18218 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18219 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18220 })) &&
18221 "Expected gathered loads or GEPs or instructions from same basic "
18222 "block.");
18223
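// The helpers below scan the bundle's scalars; when instructions live in
// different blocks they are ordered via dominator-tree DFS-in numbers.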
18224 auto FindLastInst = [&]() {
18225 Instruction *LastInst = Front;
18226 for (Value *V : E->Scalars) {
18227 auto *I = dyn_cast<Instruction>(V);
18228 if (!I)
18229 continue;
18230 if (E->isCopyableElement(I))
18231 continue;
18232 if (LastInst->getParent() == I->getParent()) {
18233 if (LastInst->comesBefore(I))
18234 LastInst = I;
18235 continue;
18236 }
18237 assert(((Opcode == Instruction::GetElementPtr &&
18238 !isa<GetElementPtrInst>(I)) ||
18239 E->State == TreeEntry::SplitVectorize ||
18240 (isVectorLikeInstWithConstOps(LastInst) &&
18241 isVectorLikeInstWithConstOps(I)) ||
18242 (GatheredLoadsEntriesFirst.has_value() &&
18243 Opcode == Instruction::Load && E->isGather() &&
18244 E->Idx < *GatheredLoadsEntriesFirst)) &&
18245 "Expected vector-like or non-GEP in GEP node insts only.");
18246 if (!DT->isReachableFromEntry(LastInst->getParent())) {
18247 LastInst = I;
18248 continue;
18249 }
18250 if (!DT->isReachableFromEntry(I->getParent()))
18251 continue;
18252 auto *NodeA = DT->getNode(LastInst->getParent());
18253 auto *NodeB = DT->getNode(I->getParent());
18254 assert(NodeA && "Should only process reachable instructions");
18255 assert(NodeB && "Should only process reachable instructions");
18256 assert((NodeA == NodeB) ==
18257 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18258 "Different nodes should have different DFS numbers");
18259 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18260 LastInst = I;
18261 }
18262 BB = LastInst->getParent();
18263 return LastInst;
18264 };
18265
18266 auto FindFirstInst = [&]() {
18267 Instruction *FirstInst = Front;
18268 for (Value *V : E->Scalars) {
18269 auto *I = dyn_cast<Instruction>(V);
18270 if (!I)
18271 continue;
18272 if (E->isCopyableElement(I))
18273 continue;
18274 if (FirstInst->getParent() == I->getParent()) {
18275 if (I->comesBefore(FirstInst))
18276 FirstInst = I;
18277 continue;
18278 }
18279 assert(((Opcode == Instruction::GetElementPtr &&
18280 !isa<GetElementPtrInst>(I)) ||
18281 (isVectorLikeInstWithConstOps(FirstInst) &&
18282 isVectorLikeInstWithConstOps(I))) &&
18283 "Expected vector-like or non-GEP in GEP node insts only.");
18284 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
18285 FirstInst = I;
18286 continue;
18287 }
18288 if (!DT->isReachableFromEntry(I->getParent()))
18289 continue;
18290 auto *NodeA = DT->getNode(FirstInst->getParent());
18291 auto *NodeB = DT->getNode(I->getParent());
18292 assert(NodeA && "Should only process reachable instructions");
18293 assert(NodeB && "Should only process reachable instructions");
18294 assert((NodeA == NodeB) ==
18295 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18296 "Different nodes should have different DFS numbers");
18297 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18298 FirstInst = I;
18299 }
18300 return FirstInst;
18301 };
18302
18303 if (E->State == TreeEntry::SplitVectorize) {
18304 Res = FindLastInst();
18305 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
18306 for (auto *E : Entries) {
18307 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
18308 if (!I)
18309 I = &getLastInstructionInBundle(E);
18310 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
18311 Res = I;
18312 }
18313 }
18314 EntryToLastInstruction.try_emplace(E, Res);
18315 return *Res;
18316 }
18317
18318 // Set insertpoint for gathered loads to the very first load.
18319 if (GatheredLoadsEntriesFirst.has_value() &&
18320 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18321 Opcode == Instruction::Load) {
18322 Res = FindFirstInst();
18323 EntryToLastInstruction.try_emplace(E, Res);
18324 return *Res;
18325 }
18326
18327 // Set the insert point to the beginning of the basic block if the entry
18328 // should not be scheduled.
18329 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18330 if (E->isGather())
18331 return nullptr;
18332 // Found previously that the instructions do not need to be scheduled.
18333 const auto *It = BlocksSchedules.find(BB);
18334 if (It == BlocksSchedules.end())
18335 return nullptr;
18336 for (Value *V : E->Scalars) {
18337 auto *I = dyn_cast<Instruction>(V);
18338 if (!I || isa<PHINode>(I) ||
18339 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
18340 continue;
18341 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
18342 if (Bundles.empty())
18343 continue;
18344 const auto *It = find_if(
18345 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18346 if (It != Bundles.end())
18347 return *It;
18348 }
18349 return nullptr;
18350 };
18351 const ScheduleBundle *Bundle = FindScheduleBundle(E);
18352 if (!E->isGather() && !Bundle) {
18353 if ((Opcode == Instruction::GetElementPtr &&
18354 any_of(E->Scalars,
18355 [](Value *V) {
18356 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
18357 })) ||
18358 (all_of(E->Scalars,
18359 [&](Value *V) {
18360 return isa<PoisonValue>(V) ||
18361 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
18362 E->isCopyableElement(V) ||
18363 (!isVectorLikeInstWithConstOps(V) &&
18364 isUsedOutsideBlock(V));
18365 }) &&
18366 (!E->doesNotNeedToSchedule() ||
18367 any_of(E->Scalars,
18368 [&](Value *V) {
18369 if (!isa<Instruction>(V) ||
18370 (E->hasCopyableElements() && E->isCopyableElement(V)))
18371 return false;
18372 return !areAllOperandsNonInsts(V);
18373 }) ||
18374 none_of(E->Scalars, [&](Value *V) {
18375 if (!isa<Instruction>(V) ||
18376 (E->hasCopyableElements() && E->isCopyableElement(V)))
18377 return false;
18378 return MustGather.contains(V);
18379 }))))
18380 Res = FindLastInst();
18381 else
18382 Res = FindFirstInst();
18383 EntryToLastInstruction.try_emplace(E, Res);
18384 return *Res;
18385 }
18386
18387 // Find the last instruction. The common case should be that BB has been
18388 // scheduled, and the last instruction is VL.back(). So we start with
18389 // VL.back() and iterate over schedule data until we reach the end of the
18390 // bundle. The end of the bundle is marked by null ScheduleData.
18391 if (Bundle) {
18392 assert(!E->isGather() && "Gathered instructions should not be scheduled");
18393 Res = Bundle->getBundle().back()->getInst();
18394 EntryToLastInstruction.try_emplace(E, Res);
18395 return *Res;
18396 }
18397
18398 // LastInst can still be null at this point if there's either not an entry
18399 // for BB in BlocksSchedules or there's no ScheduleData available for
18400 // VL.back(). This can be the case if buildTreeRec aborts for various
18401 // reasons (e.g., the maximum recursion depth is reached, the maximum region
18402 // size is reached, etc.). ScheduleData is initialized in the scheduling
18403 // "dry-run".
18404 //
18405 // If this happens, we can still find the last instruction by brute force. We
18406 // iterate forwards from Front (inclusive) until we either see all
18407 // instructions in the bundle or reach the end of the block. If Front is the
18408 // last instruction in program order, LastInst will be set to Front, and we
18409 // will visit all the remaining instructions in the block.
18410 //
18411 // One of the reasons we exit early from buildTreeRec is to place an upper
18412 // bound on compile-time. Thus, taking an additional compile-time hit here is
18413 // not ideal. However, this should be exceedingly rare since it requires that
18414 // we both exit early from buildTreeRec and that the bundle be out-of-order
18415 // (causing us to iterate all the way to the end of the block).
18416 if (!Res)
18417 Res = FindLastInst();
18418 assert(Res && "Failed to find last instruction in bundle");
18419 EntryToLastInstruction.try_emplace(E, Res);
18420 return *Res;
18421}
18422
18423void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
18424 auto *Front = E->getMainOp();
18425 Instruction *LastInst = &getLastInstructionInBundle(E);
18426 assert(LastInst && "Failed to find last instruction in bundle");
18427 BasicBlock::iterator LastInstIt = LastInst->getIterator();
18428 // If the instruction is PHI, set the insert point after all the PHIs.
18429 bool IsPHI = isa<PHINode>(LastInst);
18430 if (IsPHI) {
18431 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
18432 if (LastInstIt != LastInst->getParent()->end() &&
18433 LastInstIt->getParent()->isLandingPad())
18434 LastInstIt = std::next(LastInstIt);
18435 }
18436 if (IsPHI ||
18437 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
18438 (E->doesNotNeedToSchedule() ||
18439 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
18440 isUsedOutsideBlock(LastInst)))) ||
18441 (GatheredLoadsEntriesFirst.has_value() &&
18442 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18443 E->getOpcode() == Instruction::Load)) {
18444 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
18445 } else {
18446 // Set the insertion point after the last instruction in the bundle. Set the
18447 // debug location to Front.
18448 Builder.SetInsertPoint(
18449 LastInst->getParent(),
18450 LastInst->getNextNode()->getIterator());
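// Cache a stable position marker right after LastInst: a throwaway load of a
// poison pointer is created once, immediately marked for deletion, and reused
// as the insertion point for later bundles that end at the same instruction.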
18451 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
18452 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18453 } else {
18454 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
18455 PoisonValue::get(Builder.getPtrTy()),
18456 MaybeAlign());
18457 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18458 eraseInstruction(Res);
18459 LastInstructionToPos.try_emplace(LastInst, Res);
18460 }
18461 }
18462 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
18463}
18464
18465Value *BoUpSLP::gather(
18466 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
18467 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
18468 // List of instructions/lanes from current block and/or the blocks which are
18469 // part of the current loop. These instructions will be inserted at the end to
18470 // make it possible to optimize loops and hoist invariant instructions out of
18471 // the loop's body with better chances for success.
18472 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
18473 SmallSet<int, 4> PostponedIndices;
18474 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
18475 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
18476 SmallPtrSet<BasicBlock *, 4> Visited;
18477 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
18478 InsertBB = InsertBB->getSinglePredecessor();
18479 return InsertBB && InsertBB == InstBB;
18480 };
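// CheckPredecessor returns true if InstBB is reached by walking up the chain
// of single predecessors from the insertion block, i.e. InstBB dominates the
// insertion block through a single-predecessor chain.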
18481 for (int I = 0, E = VL.size(); I < E; ++I) {
18482 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
18483 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
18484 isVectorized(Inst) ||
18485 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
18486 PostponedIndices.insert(I).second)
18487 PostponedInsts.emplace_back(Inst, I);
18488 }
18489
18490 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
18491 Type *Ty) {
18492 Value *Scalar = V;
18493 if (Scalar->getType() != Ty) {
18494 assert(Scalar->getType()->isIntOrIntVectorTy() &&
18495 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
18496 Value *V = Scalar;
18497 if (auto *CI = dyn_cast<CastInst>(Scalar);
18498 isa_and_present<SExtInst, ZExtInst>(CI)) {
18499 Value *Op = CI->getOperand(0);
18500 if (auto *IOp = dyn_cast<Instruction>(Op);
18501 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
18502 V = Op;
18503 }
18504 Scalar = Builder.CreateIntCast(
18505 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
18506 }
18507
18508 Instruction *InsElt;
18509 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
18510 assert(SLPReVec && "FixedVectorType is not expected.");
18511 Vec =
18512 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
18513 auto *II = dyn_cast<Instruction>(Vec);
18514 if (!II)
18515 return Vec;
18516 InsElt = II;
18517 } else {
18518 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
18519 InsElt = dyn_cast<InsertElementInst>(Vec);
18520 if (!InsElt)
18521 return Vec;
18522 }
18523 GatherShuffleExtractSeq.insert(InsElt);
18524 CSEBlocks.insert(InsElt->getParent());
18525 // Add to our 'need-to-extract' list.
18526 if (isa<Instruction>(V)) {
18527 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
18528 const auto *It = find_if(Entries, [&](const TreeEntry *E) {
18529 return !TransformedToGatherNodes.contains(E) &&
18530 !DeletedNodes.contains(E);
18531 });
18532 if (It != Entries.end()) {
18533 // Find which lane we need to extract.
18534 User *UserOp = nullptr;
18535 if (Scalar != V) {
18536 if (auto *SI = dyn_cast<Instruction>(Scalar))
18537 UserOp = SI;
18538 } else {
18539 if (V->getType()->isVectorTy()) {
18540 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
18541 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18542 // Find shufflevector, caused by resize.
18543 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18544 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
18545 if (SV->getOperand(0) == V)
18546 return SV;
18547 if (SV->getOperand(1) == V)
18548 return SV;
18549 }
18550 return nullptr;
18551 };
18552 InsElt = nullptr;
18553 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18554 InsElt = User;
18555 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18556 InsElt = User;
18557 assert(InsElt &&
18558 "Failed to find shufflevector, caused by resize.");
18559 }
18560 }
18561 UserOp = InsElt;
18562 }
18563 if (UserOp) {
18564 unsigned FoundLane = (*It)->findLaneForValue(V);
18565 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
18566 }
18567 }
18568 }
18569 return Vec;
18570 };
18571 auto *VecTy = getWidenedType(ScalarTy, VL.size());
18572 Value *Vec = PoisonValue::get(VecTy);
18573 SmallVector<int> NonConsts;
18574 SmallVector<int> Mask(VL.size());
18575 std::iota(Mask.begin(), Mask.end(), 0);
18576 Value *OriginalRoot = Root;
18577 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
18578 SV && isa<PoisonValue>(SV->getOperand(1)) &&
18579 SV->getOperand(0)->getType() == VecTy) {
18580 Root = SV->getOperand(0);
18581 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18582 }
18583 // Insert constant values at first.
18584 for (int I = 0, E = VL.size(); I < E; ++I) {
18585 if (PostponedIndices.contains(I))
18586 continue;
18587 if (!isConstant(VL[I])) {
18588 NonConsts.push_back(I);
18589 continue;
18590 }
18591 if (isa<PoisonValue>(VL[I]))
18592 continue;
18593 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18594 Mask[I] = I + E;
18595 }
18596 if (Root) {
18597 if (isa<PoisonValue>(Vec)) {
18598 Vec = OriginalRoot;
18599 } else {
18600 Vec = CreateShuffle(Root, Vec, Mask);
18601 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
18602 OI && OI->use_empty() &&
18603 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18604 return TE->VectorizedValue == OI;
18605 }))
18606 eraseInstruction(OI);
18607 }
18608 }
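// When a Root vector is provided, the constants inserted above are blended
// into it with one shuffle (constant lanes were remapped past the original
// vector length in Mask), and the original root is erased if it became dead
// and is not a vectorized value.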
18609 // Insert non-constant values.
18610 for (int I : NonConsts)
18611 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18612 // Append instructions, which are/may be part of the loop, in the end to make
18613 // it possible to hoist non-loop-based instructions.
18614 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18615 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18616
18617 return Vec;
18618}
18619
18620/// Merges shuffle masks and emits final shuffle instruction, if required. It
18621/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
18622/// when the actual shuffle instruction is generated only if this is actually
18623/// required. Otherwise, the shuffle instruction emission is delayed till the
18624/// end of the process, to reduce the number of emitted instructions and further
18625/// analysis/transformations.
18626 /// The class will also look through the previously emitted shuffle instructions
18627/// and properly mark indices in mask as undef.
18628/// For example, given the code
18629/// \code
18630/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
18631/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
18632/// \endcode
18633 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
18634/// look through %s1 and %s2 and emit
18635/// \code
18636/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18637/// \endcode
18638/// instead.
18639 /// If 2 operands are of different size, the smaller one will be resized and
18640/// the mask recalculated properly.
18641/// For example, given the code
18642/// \code
18643/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
18644/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
18645/// \endcode
18646 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
18647/// look through %s1 and %s2 and emit
18648/// \code
18649/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18650/// \endcode
18651/// instead.
18652class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
18653 bool IsFinalized = false;
18654 /// Combined mask for all applied operands and masks. It is built during
18655 /// analysis and actual emission of shuffle vector instructions.
18656 SmallVector<int> CommonMask;
18657 /// List of operands for the shuffle vector instruction. It holds at most 2
18658 /// operands; if a 3rd is going to be added, the first 2 are combined into a
18659 /// shuffle with the \p CommonMask mask, the first operand is set to be the
18660 /// resulting shuffle and the second operand is set to be the newly added
18661 /// operand. The \p CommonMask is transformed in the proper way after that.
18662 SmallVector<Value *, 2> InVectors;
18663 IRBuilderBase &Builder;
18664 BoUpSLP &R;
18665
18666 class ShuffleIRBuilder {
18667 IRBuilderBase &Builder;
18668 /// Holds all of the instructions that we gathered.
18669 SetVector<Instruction *> &GatherShuffleExtractSeq;
18670 /// A list of blocks that we are going to CSE.
18671 DenseSet<BasicBlock *> &CSEBlocks;
18672 /// Data layout.
18673 const DataLayout &DL;
18674
18675 public:
18676 ShuffleIRBuilder(IRBuilderBase &Builder,
18677 SetVector<Instruction *> &GatherShuffleExtractSeq,
18678 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
18679 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18680 CSEBlocks(CSEBlocks), DL(DL) {}
18681 ~ShuffleIRBuilder() = default;
18682 /// Creates shufflevector for the 2 operands with the given mask.
18683 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
18684 if (V1->getType() != V2->getType()) {
18685 assert(V2->getType()->isIntOrIntVectorTy() &&
18686 V1->getType()->isIntOrIntVectorTy() &&
18687 "Expected integer vector types only.");
18688 if (V1->getType() != V2->getType()) {
18689 if (cast<VectorType>(V2->getType())
18690 ->getElementType()
18691 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
18692 ->getElementType()
18693 ->getIntegerBitWidth())
18694 V2 = Builder.CreateIntCast(
18695 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
18696 else
18697 V1 = Builder.CreateIntCast(
18698 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
18699 }
18700 }
18701 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18702 if (auto *I = dyn_cast<Instruction>(Vec)) {
18703 GatherShuffleExtractSeq.insert(I);
18704 CSEBlocks.insert(I->getParent());
18705 }
18706 return Vec;
18707 }
18708 /// Creates permutation of the single vector operand with the given mask, if
18709 /// it is not identity mask.
18710 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18711 if (Mask.empty())
18712 return V1;
18713 unsigned VF = Mask.size();
18714 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18715 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18716 return V1;
18717 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18718 if (auto *I = dyn_cast<Instruction>(Vec)) {
18719 GatherShuffleExtractSeq.insert(I);
18720 CSEBlocks.insert(I->getParent());
18721 }
18722 return Vec;
18723 }
18724 Value *createIdentity(Value *V) { return V; }
18725 Value *createPoison(Type *Ty, unsigned VF) {
18726 return PoisonValue::get(getWidenedType(Ty, VF));
18727 }
18728 /// Resizes 2 input vectors to match their sizes, if they are not equal
18729 /// yet. The smaller vector is resized to the size of the larger vector.
18730 void resizeToMatch(Value *&V1, Value *&V2) {
18731 if (V1->getType() == V2->getType())
18732 return;
18733 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18734 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18735 int VF = std::max(V1VF, V2VF);
18736 int MinVF = std::min(V1VF, V2VF);
18737 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18738 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18739 0);
18740 Value *&Op = MinVF == V1VF ? V1 : V2;
18741 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18742 if (auto *I = dyn_cast<Instruction>(Op)) {
18743 GatherShuffleExtractSeq.insert(I);
18744 CSEBlocks.insert(I->getParent());
18745 }
18746 if (MinVF == V1VF)
18747 V1 = Op;
18748 else
18749 V2 = Op;
18750 }
18751 };
18752
18753 /// Smart shuffle instruction emission, walks through shuffles trees and
18754 /// tries to find the best matching vector for the actual shuffle
18755 /// instruction.
18756 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18757 assert(V1 && "Expected at least one vector value.");
18758 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18759 R.CSEBlocks, *R.DL);
18760 return BaseShuffleAnalysis::createShuffle<Value *>(
18761 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18762 }
18763
18764 /// Cast value \p V to the vector type with the same number of elements, but
18765 /// the base type \p ScalarTy.
18766 Value *castToScalarTyElem(Value *V,
18767 std::optional<bool> IsSigned = std::nullopt) {
18768 auto *VecTy = cast<VectorType>(V->getType());
18769 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18770 if (VecTy->getElementType() == ScalarTy->getScalarType())
18771 return V;
18772 return Builder.CreateIntCast(
18773 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18774 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18775 }
18776
18777 Value *getVectorizedValue(const TreeEntry &E) {
18778 Value *Vec = E.VectorizedValue;
18779 if (!Vec->getType()->isIntOrIntVectorTy())
18780 return Vec;
18781 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18782 return !isa<PoisonValue>(V) &&
18783 !isKnownNonNegative(
18784 V, SimplifyQuery(*R.DL));
18785 }));
18786 }
18787
18788public:
18789 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18790 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18791
18792 /// Adjusts extractelements after reusing them.
18793 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18794 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18795 unsigned NumParts, bool &UseVecBaseAsInput) {
18796 UseVecBaseAsInput = false;
18797 SmallPtrSet<Value *, 4> UniqueBases;
18798 Value *VecBase = nullptr;
18799 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18800 if (!E->ReorderIndices.empty()) {
18801 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18802 E->ReorderIndices.end());
18803 reorderScalars(VL, ReorderMask);
18804 }
18805 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18806 int Idx = Mask[I];
18807 if (Idx == PoisonMaskElem)
18808 continue;
18809 auto *EI = cast<ExtractElementInst>(VL[I]);
18810 VecBase = EI->getVectorOperand();
18811 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18812 VecBase = TEs.front()->VectorizedValue;
18813 assert(VecBase && "Expected vectorized value.");
18814 UniqueBases.insert(VecBase);
18815 // If the only one use is vectorized - can delete the extractelement
18816 // itself.
18817 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18818 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
18819 !R.isVectorized(EI) &&
18820 count_if(E->Scalars, [&](Value *V) { return V == EI; }) !=
18821 count_if(E->UserTreeIndex.UserTE->Scalars,
18822 [&](Value *V) { return V == EI; })) ||
18823 (NumParts != 1 && count(VL, EI) > 1) ||
18824 any_of(EI->users(), [&](User *U) {
18825 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18826 return UTEs.empty() || UTEs.size() > 1 ||
18827 any_of(UTEs,
18828 [&](const TreeEntry *TE) {
18829 return R.DeletedNodes.contains(TE) ||
18830 R.TransformedToGatherNodes.contains(TE);
18831 }) ||
18833 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18834 (!UTEs.empty() &&
18835 count_if(R.VectorizableTree,
18836 [&](const std::unique_ptr<TreeEntry> &TE) {
18837 return TE->UserTreeIndex.UserTE ==
18838 UTEs.front() &&
18839 is_contained(VL, EI);
18840 }) != 1);
18841 }))
18842 continue;
18843 R.eraseInstruction(EI);
18844 }
18845 if (NumParts == 1 || UniqueBases.size() == 1) {
18846 assert(VecBase && "Expected vectorized value.");
18847 return castToScalarTyElem(VecBase);
18848 }
18849 UseVecBaseAsInput = true;
18850 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18851 for (auto [I, Idx] : enumerate(Mask))
18852 if (Idx != PoisonMaskElem)
18853 Idx = I;
18854 };
18855 // Perform multi-register vector shuffle, joining them into a single virtual
18856 // long vector.
18857 // Need to shuffle each part independently and then insert all these parts
18858 // into a long virtual vector register, forming the original vector.
18859 Value *Vec = nullptr;
18860 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18861 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18862 for (unsigned Part : seq<unsigned>(NumParts)) {
18863 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18864 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18865 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18866 constexpr int MaxBases = 2;
18867 SmallVector<Value *, MaxBases> Bases(MaxBases);
18868 auto VLMask = zip(SubVL, SubMask);
18869 const unsigned VF = std::accumulate(
18870 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18871 if (std::get<1>(D) == PoisonMaskElem)
18872 return S;
18873 Value *VecOp =
18874 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18875 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18876 !TEs.empty())
18877 VecOp = TEs.front()->VectorizedValue;
18878 assert(VecOp && "Expected vectorized value.");
18879 const unsigned Size =
18880 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18881 return std::max(S, Size);
18882 });
18883 for (const auto [V, I] : VLMask) {
18884 if (I == PoisonMaskElem)
18885 continue;
18886 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18887 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18888 VecOp = TEs.front()->VectorizedValue;
18889 assert(VecOp && "Expected vectorized value.");
18890 VecOp = castToScalarTyElem(VecOp);
18891 Bases[I / VF] = VecOp;
18892 }
18893 if (!Bases.front())
18894 continue;
18895 Value *SubVec;
18896 if (Bases.back()) {
18897 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18898 TransformToIdentity(SubMask);
18899 } else {
18900 SubVec = Bases.front();
18901 }
18902 if (!Vec) {
18903 Vec = SubVec;
18904 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18905 [&](unsigned P) {
18906 ArrayRef<int> SubMask =
18907 Mask.slice(P * SliceSize,
18908 getNumElems(Mask.size(),
18909 SliceSize, P));
18910 return all_of(SubMask, [](int Idx) {
18911 return Idx == PoisonMaskElem;
18912 });
18913 })) &&
18914 "Expected first part or all previous parts masked.");
18915 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18916 } else {
18917 unsigned NewVF =
18918 cast<FixedVectorType>(Vec->getType())->getNumElements();
18919 if (Vec->getType() != SubVec->getType()) {
18920 unsigned SubVecVF =
18921 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18922 NewVF = std::max(NewVF, SubVecVF);
18923 }
18924 // Adjust SubMask.
18925 for (int &Idx : SubMask)
18926 if (Idx != PoisonMaskElem)
18927 Idx += NewVF;
18928 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18929 Vec = createShuffle(Vec, SubVec, VecMask);
18930 TransformToIdentity(VecMask);
18931 }
18932 }
18933 copy(VecMask, Mask.begin());
18934 return Vec;
18935 }
18936 /// Checks if the specified entry \p E needs to be delayed because of its
18937 /// dependency nodes.
18938 std::optional<Value *>
18939 needToDelay(const TreeEntry *E,
18940 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18941 // No need to delay emission if all deps are ready.
18942 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18943 return all_of(
18944 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18945 }))
18946 return std::nullopt;
18947 // Postpone gather emission, will be emitted after the end of the
18948 // process to keep correct order.
18949 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18950 return Builder.CreateAlignedLoad(
18951 ResVecTy,
18952 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18953 MaybeAlign());
18954 }
18955 /// Reset the builder to handle perfect diamond match.
18956 void resetForSameNode() {
18957 IsFinalized = false;
18958 CommonMask.clear();
18959 InVectors.clear();
18960 }
18961 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18962 /// shuffling.
18963 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18964 Value *V1 = getVectorizedValue(E1);
18965 Value *V2 = getVectorizedValue(E2);
18966 add(V1, V2, Mask);
18967 }
18968 /// Adds single input vector (in form of tree entry) and the mask for its
18969 /// shuffling.
18970 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18971 Value *V1 = getVectorizedValue(E1);
18972 add(V1, Mask);
18973 }
18974 /// Adds 2 input vectors and the mask for their shuffling.
18975 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18976 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18977 assert(isa<FixedVectorType>(V1->getType()) &&
18978 isa<FixedVectorType>(V2->getType()) &&
18979 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18980 V1 = castToScalarTyElem(V1);
18981 V2 = castToScalarTyElem(V2);
18982 if (InVectors.empty()) {
18983 InVectors.push_back(V1);
18984 InVectors.push_back(V2);
18985 CommonMask.assign(Mask.begin(), Mask.end());
18986 return;
18987 }
18988 Value *Vec = InVectors.front();
18989 if (InVectors.size() == 2) {
18990 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18991 transformMaskAfterShuffle(CommonMask, CommonMask);
18992 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18993 Mask.size()) {
18994 Vec = createShuffle(Vec, nullptr, CommonMask);
18995 transformMaskAfterShuffle(CommonMask, CommonMask);
18996 }
18997 V1 = createShuffle(V1, V2, Mask);
18998 unsigned VF = std::max(getVF(V1), getVF(Vec));
18999 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19000 if (Mask[Idx] != PoisonMaskElem)
19001 CommonMask[Idx] = Idx + VF;
19002 InVectors.front() = Vec;
19003 if (InVectors.size() == 2)
19004 InVectors.back() = V1;
19005 else
19006 InVectors.push_back(V1);
19007 }
19008 /// Adds another one input vector and the mask for the shuffling.
19009 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
19010 assert(isa<FixedVectorType>(V1->getType()) &&
19011 "castToScalarTyElem expects V1 to be FixedVectorType");
19012 V1 = castToScalarTyElem(V1);
19013 if (InVectors.empty()) {
19014 InVectors.push_back(V1);
19015 CommonMask.assign(Mask.begin(), Mask.end());
19016 return;
19017 }
19018 const auto *It = find(InVectors, V1);
19019 if (It == InVectors.end()) {
19020 if (InVectors.size() == 2 ||
19021 InVectors.front()->getType() != V1->getType()) {
19022 Value *V = InVectors.front();
19023 if (InVectors.size() == 2) {
19024 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19025 transformMaskAfterShuffle(CommonMask, CommonMask);
19026 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
19027 CommonMask.size()) {
19028 V = createShuffle(InVectors.front(), nullptr, CommonMask);
19029 transformMaskAfterShuffle(CommonMask, CommonMask);
19030 }
19031 unsigned VF = std::max(CommonMask.size(), Mask.size());
19032 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19033 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
19034 CommonMask[Idx] = V->getType() != V1->getType()
19035 ? Idx + VF
19036 : Mask[Idx] + getVF(V1);
19037 if (V->getType() != V1->getType())
19038 V1 = createShuffle(V1, nullptr, Mask);
19039 InVectors.front() = V;
19040 if (InVectors.size() == 2)
19041 InVectors.back() = V1;
19042 else
19043 InVectors.push_back(V1);
19044 return;
19045 }
19046 // Check if second vector is required if the used elements are already
19047 // used from the first one.
19048 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19049 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
19050 InVectors.push_back(V1);
19051 break;
19052 }
19053 }
19054 unsigned VF = 0;
19055 for (Value *V : InVectors)
19056 VF = std::max(VF, getVF(V));
19057 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19058 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
19059 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19060 }
19061 /// Adds another one input vector and the mask for the shuffling.
19062 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
19063 SmallVector<int> NewMask;
19064 inversePermutation(Order, NewMask);
19065 add(V1, NewMask);
19066 }
19067 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
19068 Value *Root = nullptr) {
19069 return R.gather(VL, Root, ScalarTy,
19070 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
19071 return createShuffle(V1, V2, Mask);
19072 });
19073 }
19074 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
19075 /// Finalize emission of the shuffles.
19076 /// \param Action the action (if any) to be performed before final applying of
19077 /// the \p ExtMask mask.
19078 Value *finalize(
19079 ArrayRef<int> ExtMask,
19080 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19081 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
19082 function_ref<void(Value *&, SmallVectorImpl<int> &,
19083 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
19084 Action = {}) {
19085 IsFinalized = true;
19086 if (Action) {
19087 Value *Vec = InVectors.front();
19088 if (InVectors.size() == 2) {
19089 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19090 InVectors.pop_back();
19091 } else {
19092 Vec = createShuffle(Vec, nullptr, CommonMask);
19093 }
19094 transformMaskAfterShuffle(CommonMask, CommonMask);
19095 assert(VF > 0 &&
19096 "Expected vector length for the final value before action.");
19097 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
19098 if (VecVF < VF) {
19099 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
19100 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
19101 Vec = createShuffle(Vec, nullptr, ResizeMask);
19102 }
19103 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
19104 return createShuffle(V1, V2, Mask);
19105 });
19106 InVectors.front() = Vec;
19107 }
19108 if (!SubVectors.empty()) {
19109 Value *Vec = InVectors.front();
19110 if (InVectors.size() == 2) {
19111 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19112 InVectors.pop_back();
19113 } else {
19114 Vec = createShuffle(Vec, nullptr, CommonMask);
19115 }
19116 transformMaskAfterShuffle(CommonMask, CommonMask);
19117 auto CreateSubVectors = [&](Value *Vec,
19118 SmallVectorImpl<int> &CommonMask) {
19119 for (auto [E, Idx] : SubVectors) {
19120 Value *V = getVectorizedValue(*E);
19121 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
19122 // Use the scalar version of ScalarTy to correctly handle shuffles
19123 // for revectorization. The revectorization mode operates on the
19124 // vectors, but here we need to operate on the scalars, because the
19125 // masks were already transformed for the vector elements and we don't
19126 // need to do this transformation again.
19127 Type *OrigScalarTy = ScalarTy;
19128 ScalarTy = ScalarTy->getScalarType();
19129 Vec = createInsertVector(
19130 Builder, Vec, V, InsertionIndex,
19131 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
19132 _3));
19133 ScalarTy = OrigScalarTy;
19134 if (!CommonMask.empty()) {
19135 std::iota(std::next(CommonMask.begin(), Idx),
19136 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
19137 Idx);
19138 }
19139 }
19140 return Vec;
19141 };
19142 if (SubVectorsMask.empty()) {
19143 Vec = CreateSubVectors(Vec, CommonMask);
19144 } else {
19145 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
19146 copy(SubVectorsMask, SVMask.begin());
19147 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
19148 if (I2 != PoisonMaskElem) {
19149 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
19150 I1 = I2 + CommonMask.size();
19151 }
19152 }
19153 Value *InsertVec =
19154 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
19155 Vec = createShuffle(InsertVec, Vec, SVMask);
19156 transformMaskAfterShuffle(CommonMask, SVMask);
19157 }
19158 InVectors.front() = Vec;
19159 }
19160
19161 if (!ExtMask.empty()) {
19162 if (CommonMask.empty()) {
19163 CommonMask.assign(ExtMask.begin(), ExtMask.end());
19164 } else {
19165 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
19166 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
19167 if (ExtMask[I] == PoisonMaskElem)
19168 continue;
19169 NewMask[I] = CommonMask[ExtMask[I]];
19170 }
19171 CommonMask.swap(NewMask);
19172 }
19173 }
19174 if (CommonMask.empty()) {
19175 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
19176 return InVectors.front();
19177 }
19178 if (InVectors.size() == 2)
19179 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19180 return createShuffle(InVectors.front(), nullptr, CommonMask);
19181 }
19182
19184 assert((IsFinalized || CommonMask.empty()) &&
19185 "Shuffle construction must be finalized.");
19186 }
19187};
19188
19189Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
19190 return vectorizeTree(getOperandEntry(E, NodeIdx));
19191}
19192
19193template <typename BVTy, typename ResTy, typename... Args>
19194ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
19195 Args &...Params) {
19196 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
19197 "Expected gather node.");
19198 unsigned VF = E->getVectorFactor();
19199
19200 bool NeedFreeze = false;
19201 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
19202 // Clear values, to be replaced by insertvector instructions.
19203 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19204 for_each(MutableArrayRef(GatheredScalars)
19205 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
19206 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
19207 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19208 E->CombinedEntriesWithIndices.size());
19209 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
19210 [&](const auto &P) {
19211 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19212 });
19213 // Build a mask out of the reorder indices and reorder scalars per this
19214 // mask.
19215 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
19216 E->ReorderIndices.end());
19217 if (!ReorderMask.empty())
19218 reorderScalars(GatheredScalars, ReorderMask);
19219 SmallVector<int> SubVectorsMask;
19220 inversePermutation(E->ReorderIndices, SubVectorsMask);
19221 // Transform non-clustered elements in the mask to poison (-1).
19222 // "Clustered" operations will be reordered using this mask later.
19223 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
19224 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
19225 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
19226 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
19227 } else {
19228 SubVectorsMask.clear();
19229 }
19230 SmallVector<Value *> StoredGS(GatheredScalars);
19231 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
19232 unsigned I, unsigned SliceSize,
19233 bool IsNotPoisonous) {
19234 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
19235 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19236 }))
19237 return false;
19238 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19239 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19240 if (UserTE->getNumOperands() != 2)
19241 return false;
19242 if (!IsNotPoisonous) {
19243 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
19244 [=](const std::unique_ptr<TreeEntry> &TE) {
19245 return TE->UserTreeIndex.UserTE == UserTE &&
19246 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19247 });
19248 if (It == VectorizableTree.end())
19249 return false;
19250 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
19251 if (!(*It)->ReorderIndices.empty()) {
19252 inversePermutation((*It)->ReorderIndices, ReorderMask);
19253 reorderScalars(GS, ReorderMask);
19254 }
19255 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
19256 Value *V0 = std::get<0>(P);
19257 Value *V1 = std::get<1>(P);
19258 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
19259 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
19260 is_contained(E->Scalars, V1));
19261 }))
19262 return false;
19263 }
19264 int Idx;
19265 if ((Mask.size() < InputVF &&
19266 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
19267 Idx == 0) ||
19268 (Mask.size() == InputVF &&
19269 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
19270 std::iota(
19271 std::next(Mask.begin(), I * SliceSize),
19272 std::next(Mask.begin(),
19273 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19274 0);
19275 } else {
19276 unsigned IVal =
19277 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
19278 std::fill(
19279 std::next(Mask.begin(), I * SliceSize),
19280 std::next(Mask.begin(),
19281 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19282 IVal);
19283 }
19284 return true;
19285 };
19286 BVTy ShuffleBuilder(ScalarTy, Params...);
19287 ResTy Res = ResTy();
19288 SmallVector<int> Mask;
19289 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
19290 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
19291 Value *ExtractVecBase = nullptr;
19292 bool UseVecBaseAsInput = false;
19293 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
19294 SmallVector<SmallVector<const TreeEntry *>> Entries;
19295 Type *OrigScalarTy = GatheredScalars.front()->getType();
19296 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
19297 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
19298 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
19299 // Check for gathered extracts.
19300 bool Resized = false;
19301 ExtractShuffles =
19302 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
19303 if (!ExtractShuffles.empty()) {
19304 SmallVector<const TreeEntry *> ExtractEntries;
19305 for (auto [Idx, I] : enumerate(ExtractMask)) {
19306 if (I == PoisonMaskElem)
19307 continue;
19308 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
19309 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
19310 !TEs.empty())
19311 ExtractEntries.append(TEs.begin(), TEs.end());
19312 }
19313 if (std::optional<ResTy> Delayed =
19314 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19315 // Delay emission of gathers which are not ready yet.
19316 PostponedGathers.insert(E);
19317 // Postpone gather emission, will be emitted after the end of the
19318 // process to keep correct order.
19319 return *Delayed;
19320 }
19321 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
19322 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19323 ExtractVecBase = VecBase;
19324 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
19325 if (VF == VecBaseTy->getNumElements() &&
19326 GatheredScalars.size() != VF) {
19327 Resized = true;
19328 GatheredScalars.append(VF - GatheredScalars.size(),
19329 PoisonValue::get(OrigScalarTy));
19330 NumParts =
19331 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
19332 }
19333 }
19334 }
19335 // Gather extracts after we check for full matched gathers only.
19336 if (!ExtractShuffles.empty() || !E->hasState() ||
19337 E->getOpcode() != Instruction::Load ||
19338 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19339 any_of(E->Scalars, IsaPred<LoadInst>)) &&
19340 any_of(E->Scalars,
19341 [this](Value *V) {
19342 return isa<LoadInst>(V) && isVectorized(V);
19343 })) ||
19344 (E->hasState() && E->isAltShuffle()) ||
19345 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
19346 isSplat(E->Scalars) ||
19347 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
19348 GatherShuffles =
19349 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19350 }
19351 if (!GatherShuffles.empty()) {
19352 if (std::optional<ResTy> Delayed =
19353 ShuffleBuilder.needToDelay(E, Entries)) {
19354 // Delay emission of gathers which are not ready yet.
19355 PostponedGathers.insert(E);
19356 // Postpone gather emission, will be emitted after the end of the
19357 // process to keep correct order.
19358 return *Delayed;
19359 }
19360 if (GatherShuffles.size() == 1 &&
19361 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
19362 Entries.front().front()->isSame(E->Scalars)) {
19363 // Perfect match in the graph, will reuse the previously vectorized
19364 // node. Cost is 0.
19365 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
19366 << shortBundleName(E->Scalars, E->Idx) << ".\n");
19367 // Restore the mask for previous partially matched values.
19368 Mask.resize(E->Scalars.size());
19369 const TreeEntry *FrontTE = Entries.front().front();
19370 if (FrontTE->ReorderIndices.empty() &&
19371 ((FrontTE->ReuseShuffleIndices.empty() &&
19372 E->Scalars.size() == FrontTE->Scalars.size()) ||
19373 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19374 std::iota(Mask.begin(), Mask.end(), 0);
19375 } else {
19376 for (auto [I, V] : enumerate(E->Scalars)) {
19377 if (isa<PoisonValue>(V)) {
19378 Mask[I] = PoisonMaskElem;
19379 continue;
19380 }
19381 Mask[I] = FrontTE->findLaneForValue(V);
19382 }
19383 }
19384 // Reset the builder(s) to correctly handle perfect diamond matched
19385 // nodes.
19386 ShuffleBuilder.resetForSameNode();
19387 ShuffleBuilder.add(*FrontTE, Mask);
19388 // Full matched entry found, no need to insert subvectors.
19389 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19390 return Res;
19391 }
19392 if (!Resized) {
19393 if (GatheredScalars.size() != VF &&
19394 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
19395 return any_of(TEs, [&](const TreeEntry *TE) {
19396 return TE->getVectorFactor() == VF;
19397 });
19398 }))
19399 GatheredScalars.append(VF - GatheredScalars.size(),
19400 PoisonValue::get(OrigScalarTy));
19401 }
19402 // Remove shuffled elements from list of gathers.
19403 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19404 if (Mask[I] != PoisonMaskElem)
19405 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19406 }
19407 }
19408 }
19409 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
19410 SmallVectorImpl<int> &ReuseMask,
19411 bool IsRootPoison) {
19412 // For splats we can emit broadcasts instead of gathers, so try to find
19413 // such sequences.
19414 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
19415 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
19416 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
19417 SmallVector<int> UndefPos;
19418 DenseMap<Value *, unsigned> UniquePositions;
19419 // Gather unique non-const values and all constant values.
19420 // For repeated values, just shuffle them.
19421 int NumNonConsts = 0;
19422 int SinglePos = 0;
19423 for (auto [I, V] : enumerate(Scalars)) {
19424 if (isa<UndefValue>(V)) {
19425 if (!isa<PoisonValue>(V)) {
19426 ReuseMask[I] = I;
19427 UndefPos.push_back(I);
19428 }
19429 continue;
19430 }
19431 if (isConstant(V)) {
19432 ReuseMask[I] = I;
19433 continue;
19434 }
19435 ++NumNonConsts;
19436 SinglePos = I;
19437 Value *OrigV = V;
19438 Scalars[I] = PoisonValue::get(OrigScalarTy);
19439 if (IsSplat) {
19440 Scalars.front() = OrigV;
19441 ReuseMask[I] = 0;
19442 } else {
19443 const auto Res = UniquePositions.try_emplace(OrigV, I);
19444 Scalars[Res.first->second] = OrigV;
19445 ReuseMask[I] = Res.first->second;
19446 }
19447 }
19448 if (NumNonConsts == 1) {
19449 // Restore single insert element.
19450 if (IsSplat) {
19451 ReuseMask.assign(VF, PoisonMaskElem);
19452 std::swap(Scalars.front(), Scalars[SinglePos]);
19453 if (!UndefPos.empty() && UndefPos.front() == 0)
19454 Scalars.front() = UndefValue::get(OrigScalarTy);
19455 }
19456 ReuseMask[SinglePos] = SinglePos;
19457 } else if (!UndefPos.empty() && IsSplat) {
19458 // For undef values, try to replace them with a simple broadcast.
19459 // We can do this if the broadcasted value is guaranteed to be
19460 // non-poisonous, or by freezing the incoming scalar value first.
19461 auto *It = find_if(Scalars, [this, E](Value *V) {
19462 return !isa<UndefValue>(V) &&
19463 (isGuaranteedNotToBePoison(V, AC) ||
19464 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
19465 // Check if the value is already used in the same operation in
19466 // one of the nodes.
19467 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19468 is_contained(E->UserTreeIndex.UserTE->Scalars,
19469 U.getUser());
19470 })));
19471 });
19472 if (It != Scalars.end()) {
19473 // Replace undefs by the non-poisoned scalars and emit broadcast.
19474 int Pos = std::distance(Scalars.begin(), It);
19475 for (int I : UndefPos) {
19476 // Set the undef position to the non-poisoned scalar.
19477 ReuseMask[I] = Pos;
19478 // Replace the undef by the poison, in the mask it is replaced by
19479 // non-poisoned scalar already.
19480 if (I != Pos)
19481 Scalars[I] = PoisonValue::get(OrigScalarTy);
19482 }
19483 } else {
19484 // Replace undefs by the poisons, emit broadcast and then emit
19485 // freeze.
19486 for (int I : UndefPos) {
19487 ReuseMask[I] = PoisonMaskElem;
19488 if (isa<UndefValue>(Scalars[I]))
19489 Scalars[I] = PoisonValue::get(OrigScalarTy);
19490 }
19491 NeedFreeze = true;
19492 }
19493 }
19494 };
19495 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
19496 bool IsNonPoisoned = true;
19497 bool IsUsedInExpr = true;
19498 Value *Vec1 = nullptr;
19499 if (!ExtractShuffles.empty()) {
19500 // Gather of extractelements can be represented as just a shuffle of
19501 // a single/two vectors the scalars are extracted from.
19502 // Find input vectors.
19503 Value *Vec2 = nullptr;
19504 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19505 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
19506 ExtractMask[I] = PoisonMaskElem;
19507 }
19508 if (UseVecBaseAsInput) {
19509 Vec1 = ExtractVecBase;
19510 } else {
19511 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19512 if (ExtractMask[I] == PoisonMaskElem)
19513 continue;
19514 if (isa<UndefValue>(StoredGS[I]))
19515 continue;
19516 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
19517 Value *VecOp = EI->getVectorOperand();
19518 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
19519 !TEs.empty() && TEs.front()->VectorizedValue)
19520 VecOp = TEs.front()->VectorizedValue;
19521 if (!Vec1) {
19522 Vec1 = VecOp;
19523 } else if (Vec1 != VecOp) {
19524 assert((!Vec2 || Vec2 == VecOp) &&
19525 "Expected only 1 or 2 vectors shuffle.");
19526 Vec2 = VecOp;
19527 }
19528 }
19529 }
19530 if (Vec2) {
19531 IsUsedInExpr = false;
19532 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
19533 isGuaranteedNotToBePoison(Vec2, AC);
19534 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19535 } else if (Vec1) {
19536 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
19537 IsUsedInExpr &= FindReusedSplat(
19538 ExtractMask,
19539 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
19540 ExtractMask.size(), IsNotPoisonedVec);
19541 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
19542 IsNonPoisoned &= IsNotPoisonedVec;
19543 } else {
19544 IsUsedInExpr = false;
19545 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
19546 /*ForExtracts=*/true);
19547 }
19548 }
19549 if (!GatherShuffles.empty()) {
19550 unsigned SliceSize =
19551 getPartNumElems(E->Scalars.size(),
19552 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
19553 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19554 for (const auto [I, TEs] : enumerate(Entries)) {
19555 if (TEs.empty()) {
19556 assert(!GatherShuffles[I] &&
19557 "No shuffles with empty entries list expected.");
19558 continue;
19559 }
19560 assert((TEs.size() == 1 || TEs.size() == 2) &&
19561 "Expected shuffle of 1 or 2 entries.");
19562 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
19563 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
19564 VecMask.assign(VecMask.size(), PoisonMaskElem);
19565 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19566 if (TEs.size() == 1) {
19567 bool IsNotPoisonedVec =
19568 TEs.front()->VectorizedValue
19569 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
19570 : true;
19571 IsUsedInExpr &=
19572 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19573 SliceSize, IsNotPoisonedVec);
19574 ShuffleBuilder.add(*TEs.front(), VecMask);
19575 IsNonPoisoned &= IsNotPoisonedVec;
19576 } else {
19577 IsUsedInExpr = false;
19578 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19579 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19580 IsNonPoisoned &=
19581 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
19582 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
19583 }
19584 }
19585 }
19586 // Try to figure out the best way to combine values: build a shuffle and
19587 // insert elements, or just build several shuffles.
19588 // Insert non-constant scalars.
19589 SmallVector<Value *> NonConstants(GatheredScalars);
19590 int EMSz = ExtractMask.size();
19591 int MSz = Mask.size();
19592 // Try to build a constant vector and shuffle with it only if we currently
19593 // have a single permutation and more than one scalar constant.
19594 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19595 bool IsIdentityShuffle =
19596 ((UseVecBaseAsInput ||
19597 all_of(ExtractShuffles,
19598 [](const std::optional<TTI::ShuffleKind> &SK) {
19599 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19600 TTI::SK_PermuteSingleSrc;
19601 })) &&
19602 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19603 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
19604 (!GatherShuffles.empty() &&
19605 all_of(GatherShuffles,
19606 [](const std::optional<TTI::ShuffleKind> &SK) {
19607 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19608 TTI::SK_PermuteSingleSrc;
19609 }) &&
19610 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19611 ShuffleVectorInst::isIdentityMask(Mask, MSz));
19612 bool EnoughConstsForShuffle =
19613 IsSingleShuffle &&
19614 (none_of(GatheredScalars,
19615 [](Value *V) {
19616 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19617 }) ||
19618 any_of(GatheredScalars,
19619 [](Value *V) {
19620 return isa<Constant>(V) && !isa<UndefValue>(V);
19621 })) &&
19622 (!IsIdentityShuffle ||
19623 (GatheredScalars.size() == 2 &&
19624 any_of(GatheredScalars,
19625 [](Value *V) { return !isa<UndefValue>(V); })) ||
19626 count_if(GatheredScalars, [](Value *V) {
19627 return isa<Constant>(V) && !isa<PoisonValue>(V);
19628 }) > 1);
19629 // The NonConstants array contains just the non-constant values; GatheredScalars
19630 // contains only constants, used to build the final vector that is then shuffled.
19631 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19632 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19633 NonConstants[I] = PoisonValue::get(OrigScalarTy);
19634 else
19635 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19636 }
19637 // Generate constants for final shuffle and build a mask for them.
19638 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
19639 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19640 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19641 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
19642 ShuffleBuilder.add(BV, BVMask);
19643 }
19644 if (all_of(NonConstants, [=](Value *V) {
19645 return isa<PoisonValue>(V) ||
19646 (IsSingleShuffle && ((IsIdentityShuffle &&
19647 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
19648 }))
19649 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19650 SubVectorsMask);
19651 else
19652 Res = ShuffleBuilder.finalize(
19653 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
19654 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
19655 bool IsSplat = isSplat(NonConstants);
19656 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19657 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
19658 auto CheckIfSplatIsProfitable = [&]() {
19659 // Estimate the cost of splatting + shuffle and compare with
19660 // insert + shuffle.
19661 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19662 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19663 if (isa<ExtractElementInst>(V) || isVectorized(V))
19664 return false;
19665 InstructionCost SplatCost = TTI->getVectorInstrCost(
19666 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
19667 PoisonValue::get(VecTy), V);
19668 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19669 for (auto [Idx, I] : enumerate(BVMask))
19670 if (I != PoisonMaskElem)
19671 NewMask[Idx] = Mask.size();
19672 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19673 NewMask, CostKind);
19674 InstructionCost BVCost = TTI->getVectorInstrCost(
19675 Instruction::InsertElement, VecTy, CostKind,
19676 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
19677 // Shuffle required?
19678 if (count(BVMask, PoisonMaskElem) <
19679 static_cast<int>(BVMask.size() - 1)) {
19680 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19681 for (auto [Idx, I] : enumerate(BVMask))
19682 if (I != PoisonMaskElem)
19683 NewMask[Idx] = I;
19684 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19685 VecTy, NewMask, CostKind);
19686 }
19687 return SplatCost <= BVCost;
19688 };
19689 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19690 for (auto [Idx, I] : enumerate(BVMask))
19691 if (I != PoisonMaskElem)
19692 Mask[Idx] = I;
19693 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19694 } else {
19695 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19696 SmallVector<Value *> Values(NonConstants.size(),
19697 PoisonValue::get(ScalarTy));
19698 Values[0] = V;
19699 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
19700 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
19701 transform(BVMask, SplatMask.begin(), [](int I) {
19702 return I == PoisonMaskElem ? PoisonMaskElem : 0;
19703 });
19704 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
19705 BV = CreateShuffle(BV, nullptr, SplatMask);
19706 for (auto [Idx, I] : enumerate(BVMask))
19707 if (I != PoisonMaskElem)
19708 Mask[Idx] = BVMask.size() + Idx;
19709 Vec = CreateShuffle(Vec, BV, Mask);
19710 for (auto [Idx, I] : enumerate(Mask))
19711 if (I != PoisonMaskElem)
19712 Mask[Idx] = Idx;
19713 }
19714 });
19715 } else if (!allConstant(GatheredScalars)) {
19716 // Gather unique scalars and all constants.
19717 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
19718 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
19719 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19720 ShuffleBuilder.add(BV, ReuseMask);
19721 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19722 SubVectorsMask);
19723 } else {
19724 // Gather all constants.
19725 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19726 for (auto [I, V] : enumerate(GatheredScalars)) {
19727 if (!isa<PoisonValue>(V))
19728 Mask[I] = I;
19729 }
19730 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19731 ShuffleBuilder.add(BV, Mask);
19732 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19733 SubVectorsMask);
19734 }
19735
19736 if (NeedFreeze)
19737 Res = ShuffleBuilder.createFreeze(Res);
19738 return Res;
19739}
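// In outline, processBuildVector combines up to three sources for a gather
// node: already-vectorized tree entries (GatherShuffles), the vectors feeding
// gathered extractelements (ExtractShuffles), and a build vector of the
// remaining scalars, packing constants and non-constants separately when that
// is profitable. A rough, illustrative shape of the emitted IR for a node
// that mixes reused lanes with two freshly inserted scalars (names invented):
//   %bv0 = insertelement <4 x i32> poison, i32 %a, i64 0
//   %bv1 = insertelement <4 x i32> %bv0, i32 %b, i64 1
//   %gather = shufflevector <4 x i32> %reused, <4 x i32> %bv1,
//                           <4 x i32> <i32 0, i32 1, i32 4, i32 5>
// The trailing freeze is only emitted when a possibly-poisonous scalar was
// broadcast to stand in for undef lanes (NeedFreeze).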
19740
19741Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19742 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19743 (void)vectorizeTree(VectorizableTree[EIdx].get());
19744 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19745 Builder, *this);
19746}
19747
19748/// \returns \p I after propagating metadata from \p VL only for instructions in
19749/// \p VL.
19750static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19751 SmallVector<Value *> Insts;
19752 for (Value *V : VL)
19753 if (isa<Instruction>(V))
19754 Insts.push_back(V);
19755 return llvm::propagateMetadata(Inst, Insts);
19756}
19757
19758static DebugLoc getDebugLocFromPHI(PHINode &PN) {
19759 if (DebugLoc DL = PN.getDebugLoc())
19760 return DL;
19761 return DebugLoc::getUnknown();
19762}
19763
19764Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19765 IRBuilderBase::InsertPointGuard Guard(Builder);
19766
19767 Value *V = E->Scalars.front();
19768 Type *ScalarTy = V->getType();
19769 if (!isa<CmpInst>(V))
19770 ScalarTy = getValueType(V);
19771 auto It = MinBWs.find(E);
19772 if (It != MinBWs.end()) {
19773 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19774 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19775 if (VecTy)
19776 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19777 }
19778 if (E->VectorizedValue)
19779 return E->VectorizedValue;
19780 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19781 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
19782 // Set insert point for non-reduction initial nodes.
19783 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19784 setInsertPointAfterBundle(E);
19785 Value *Vec = createBuildVector(E, ScalarTy);
19786 E->VectorizedValue = Vec;
19787 return Vec;
19788 }
19789 if (E->State == TreeEntry::SplitVectorize) {
19790 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19791 "Expected exactly 2 combined entries.");
19792 setInsertPointAfterBundle(E);
19793 TreeEntry &OpTE1 =
19794 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19795 assert(OpTE1.isSame(
19796 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19797 "Expected same first part of scalars.");
19798 Value *Op1 = vectorizeTree(&OpTE1);
19799 TreeEntry &OpTE2 =
19800 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19801 assert(
19802 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19803 "Expected same second part of scalars.");
19804 Value *Op2 = vectorizeTree(&OpTE2);
19805 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19806 bool IsSigned = false;
19807 auto It = MinBWs.find(OpE);
19808 if (It != MinBWs.end())
19809 IsSigned = It->second.second;
19810 else
19811 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19812 if (isa<PoisonValue>(R))
19813 return false;
19814 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19815 });
19816 return IsSigned;
19817 };
19818 if (cast<VectorType>(Op1->getType())->getElementType() !=
19819 ScalarTy->getScalarType()) {
19820 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19821 Op1 = Builder.CreateIntCast(
19822 Op1,
19823 getWidenedType(
19824 ScalarTy,
19825 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19826 GetOperandSignedness(&OpTE1));
19827 }
19828 if (cast<VectorType>(Op2->getType())->getElementType() !=
19829 ScalarTy->getScalarType()) {
19830 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19831 Op2 = Builder.CreateIntCast(
19832 Op2,
19833 getWidenedType(
19834 ScalarTy,
19835 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19836 GetOperandSignedness(&OpTE2));
19837 }
19838 if (E->ReorderIndices.empty()) {
19839 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19840 std::iota(
19841 Mask.begin(),
19842 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19843 0);
19844 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19845 if (ScalarTyNumElements != 1) {
19846 assert(SLPReVec && "Only supported by REVEC.");
19847 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19848 }
19849 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19850 Vec = createInsertVector(Builder, Vec, Op2,
19851 E->CombinedEntriesWithIndices.back().second *
19852 ScalarTyNumElements);
19853 E->VectorizedValue = Vec;
19854 return Vec;
19855 }
19856 unsigned CommonVF =
19857 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19858 if (getNumElements(Op1->getType()) != CommonVF) {
19859 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19860 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19861 0);
19862 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19863 }
19864 if (getNumElements(Op2->getType()) != CommonVF) {
19865 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19866 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19867 0);
19868 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19869 }
19870 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19871 E->VectorizedValue = Vec;
19872 return Vec;
19873 }
19874
19875 bool IsReverseOrder =
19876 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19877 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19878 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19879 if (E->getOpcode() == Instruction::Store &&
19880 E->State == TreeEntry::Vectorize) {
19881 ArrayRef<int> Mask =
19882 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19883 E->ReorderIndices.size());
19884 ShuffleBuilder.add(V, Mask);
19885 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19886 E->State == TreeEntry::CompressVectorize) {
19887 ShuffleBuilder.addOrdered(V, {});
19888 } else {
19889 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19890 }
19891 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19892 E->CombinedEntriesWithIndices.size());
19893 transform(
19894 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19895 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19896 });
19897 assert(
19898 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19899 "Expected either combined subnodes or reordering");
19900 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19901 };
19902
19903 assert(!E->isGather() && "Unhandled state");
19904 unsigned ShuffleOrOp =
19905 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19906 Instruction *VL0 = E->getMainOp();
19907 auto GetOperandSignedness = [&](unsigned Idx) {
19908 const TreeEntry *OpE = getOperandEntry(E, Idx);
19909 bool IsSigned = false;
19910 auto It = MinBWs.find(OpE);
19911 if (It != MinBWs.end())
19912 IsSigned = It->second.second;
19913 else
19914 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19915 if (isa<PoisonValue>(R))
19916 return false;
19917 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19918 });
19919 return IsSigned;
19920 };
19921 switch (ShuffleOrOp) {
19922 case Instruction::PHI: {
19923 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19924 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19925 "PHI reordering is free.");
19926 auto *PH = cast<PHINode>(VL0);
19927 Builder.SetInsertPoint(PH->getParent(),
19928 PH->getParent()->getFirstNonPHIIt());
19929 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19930 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19931 Value *V = NewPhi;
19932
19933 // Adjust insertion point once all PHI's have been generated.
19934 Builder.SetInsertPoint(PH->getParent(),
19935 PH->getParent()->getFirstInsertionPt());
19936 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19937
19938 V = FinalShuffle(V, E);
19939
19940 E->VectorizedValue = V;
19941 // If phi node is fully emitted - exit.
19942 if (NewPhi->getNumIncomingValues() != 0)
19943 return NewPhi;
19944
19945 // PHINodes may have multiple entries from the same block. We want to
19946 // visit every block once.
19947 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19948
19949 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19950 BasicBlock *IBB = PH->getIncomingBlock(I);
19951
19952 // Stop emission if all incoming values are generated.
19953 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19954 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19955 return NewPhi;
19956 }
19957
19958 if (!VisitedBBs.insert(IBB).second) {
19959 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19960 NewPhi->addIncoming(VecOp, IBB);
19961 TreeEntry *OpTE = getOperandEntry(E, I);
19962 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19963 OpTE->VectorizedValue = VecOp;
19964 continue;
19965 }
19966
19967 Builder.SetInsertPoint(IBB->getTerminator());
19968 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19969 Value *Vec = vectorizeOperand(E, I);
19970 if (VecTy != Vec->getType()) {
19971 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19972 MinBWs.contains(getOperandEntry(E, I))) &&
19973 "Expected item in MinBWs.");
19974 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19975 }
19976 NewPhi->addIncoming(Vec, IBB);
19977 }
19978
19979 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19980 "Invalid number of incoming values");
19981 assert(E->VectorizedValue && "Expected vectorized value.");
19982 return E->VectorizedValue;
19983 }
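// The vector PHI is created up front without incoming values; operands are
// then vectorized at each predecessor's terminator, and repeated predecessor
// blocks reuse the incoming vector already created for that block.
// Illustratively (invented names), the result has the form:
//   %vec.phi = phi <4 x i32> [ %vec.a, %pred.a ], [ %vec.b, %pred.b ]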
19984
19985 case Instruction::ExtractElement: {
19986 Value *V = E->getSingleOperand(0);
19987 setInsertPointAfterBundle(E);
19988 V = FinalShuffle(V, E);
19989 E->VectorizedValue = V;
19990 return V;
19991 }
19992 case Instruction::ExtractValue: {
19993 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19994 Builder.SetInsertPoint(LI);
19995 Value *Ptr = LI->getPointerOperand();
19996 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19997 Value *NewV = ::propagateMetadata(V, E->Scalars);
19998 NewV = FinalShuffle(NewV, E);
19999 E->VectorizedValue = NewV;
20000 return NewV;
20001 }
20002 case Instruction::InsertElement: {
20003 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20004 if (const TreeEntry *OpE = getOperandEntry(E, 1);
20005 OpE && !OpE->isGather() && OpE->hasState() &&
20006 !OpE->hasCopyableElements())
20007 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
20008 else
20009 setInsertPointAfterBundle(E);
20010 Value *V = vectorizeOperand(E, 1);
20011 ArrayRef<Value *> Op = E->getOperand(1);
20012 Type *ScalarTy = Op.front()->getType();
20013 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
20014 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20015 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
20016 assert(Res.first > 0 && "Expected item in MinBWs.");
20017 V = Builder.CreateIntCast(
20018 V,
20019 getWidenedType(
20020 ScalarTy,
20021 cast<FixedVectorType>(V->getType())->getNumElements()),
20022 Res.second);
20023 }
20024
20025 // Create InsertVector shuffle if necessary
20026 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
20027 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
20028 }));
20029 const unsigned NumElts =
20030 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
20031 const unsigned NumScalars = E->Scalars.size();
20032
20033 unsigned Offset = *getElementIndex(VL0);
20034 assert(Offset < NumElts && "Failed to find vector index offset");
20035
20036 // Create shuffle to resize vector
20037 SmallVector<int> Mask;
20038 if (!E->ReorderIndices.empty()) {
20039 inversePermutation(E->ReorderIndices, Mask);
20040 Mask.append(NumElts - NumScalars, PoisonMaskElem);
20041 } else {
20042 Mask.assign(NumElts, PoisonMaskElem);
20043 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
20044 }
20045 // Create InsertVector shuffle if necessary
20046 bool IsIdentity = true;
20047 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
20048 Mask.swap(PrevMask);
20049 for (unsigned I = 0; I < NumScalars; ++I) {
20050 Value *Scalar = E->Scalars[PrevMask[I]];
20051 unsigned InsertIdx = *getElementIndex(Scalar);
20052 IsIdentity &= InsertIdx - Offset == I;
20053 Mask[InsertIdx - Offset] = I;
20054 }
20055 if (!IsIdentity || NumElts != NumScalars) {
20056 Value *V2 = nullptr;
20057 bool IsVNonPoisonous =
20058 isGuaranteedNotToBePoison(V, AC);
20059 SmallVector<int> InsertMask(Mask);
20060 if (NumElts != NumScalars && Offset == 0) {
20061 // Follow all insert element instructions from the current buildvector
20062 // sequence.
20063 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
20064 do {
20065 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
20066 if (!InsertIdx)
20067 break;
20068 if (InsertMask[*InsertIdx] == PoisonMaskElem)
20069 InsertMask[*InsertIdx] = *InsertIdx;
20070 if (!Ins->hasOneUse())
20071 break;
20072 Ins = dyn_cast_or_null<InsertElementInst>(
20073 Ins->getUniqueUndroppedUser());
20074 } while (Ins);
20075 SmallBitVector UseMask =
20076 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20077 SmallBitVector IsFirstPoison =
20078 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20079 SmallBitVector IsFirstUndef =
20080 isUndefVector(FirstInsert->getOperand(0), UseMask);
20081 if (!IsFirstPoison.all()) {
20082 unsigned Idx = 0;
20083 for (unsigned I = 0; I < NumElts; I++) {
20084 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
20085 IsFirstUndef.test(I)) {
20086 if (IsVNonPoisonous) {
20087 InsertMask[I] = I < NumScalars ? I : 0;
20088 continue;
20089 }
20090 if (!V2)
20091 V2 = UndefValue::get(V->getType());
20092 if (Idx >= NumScalars)
20093 Idx = NumScalars - 1;
20094 InsertMask[I] = NumScalars + Idx;
20095 ++Idx;
20096 } else if (InsertMask[I] != PoisonMaskElem &&
20097 Mask[I] == PoisonMaskElem) {
20098 InsertMask[I] = PoisonMaskElem;
20099 }
20100 }
20101 } else {
20102 InsertMask = Mask;
20103 }
20104 }
20105 if (!V2)
20106 V2 = PoisonValue::get(V->getType());
20107 V = Builder.CreateShuffleVector(V, V2, InsertMask);
20108 if (auto *I = dyn_cast<Instruction>(V)) {
20109 GatherShuffleExtractSeq.insert(I);
20110 CSEBlocks.insert(I->getParent());
20111 }
20112 }
20113
20114 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
20115 for (unsigned I = 0; I < NumElts; I++) {
20116 if (Mask[I] != PoisonMaskElem)
20117 InsertMask[Offset + I] = I;
20118 }
20119 SmallBitVector UseMask =
20120 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20121 SmallBitVector IsFirstUndef =
20122 isUndefVector(FirstInsert->getOperand(0), UseMask);
20123 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20124 NumElts != NumScalars) {
20125 if (IsFirstUndef.all()) {
20126 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
20127 SmallBitVector IsFirstPoison =
20128 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20129 if (!IsFirstPoison.all()) {
20130 for (unsigned I = 0; I < NumElts; I++) {
20131 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
20132 InsertMask[I] = I + NumElts;
20133 }
20134 }
20135 V = Builder.CreateShuffleVector(
20136 V,
20137 IsFirstPoison.all() ? PoisonValue::get(V->getType())
20138 : FirstInsert->getOperand(0),
20139 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
20140 if (auto *I = dyn_cast<Instruction>(V)) {
20141 GatherShuffleExtractSeq.insert(I);
20142 CSEBlocks.insert(I->getParent());
20143 }
20144 }
20145 } else {
20146 SmallBitVector IsFirstPoison =
20147 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20148 for (unsigned I = 0; I < NumElts; I++) {
20149 if (InsertMask[I] == PoisonMaskElem)
20150 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
20151 else
20152 InsertMask[I] += NumElts;
20153 }
20154 V = Builder.CreateShuffleVector(
20155 FirstInsert->getOperand(0), V, InsertMask,
20156 cast<Instruction>(E->Scalars.back())->getName());
20157 if (auto *I = dyn_cast<Instruction>(V)) {
20158 GatherShuffleExtractSeq.insert(I);
20159 CSEBlocks.insert(I->getParent());
20160 }
20161 }
20162 }
20163
20164 ++NumVectorInstructions;
20165 E->VectorizedValue = V;
20166 return V;
20167 }
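// The insertelement chain of a buildvector is replaced by shuffles: the
// vectorized scalars are first resized/permuted into position and then
// blended into the first insert's original vector operand when that operand
// is not entirely undef/poison. A simplified, illustrative lowering of four
// inserts into an <8 x float> destination (names invented):
//   %resized = shufflevector <4 x float> %vals, <4 x float> poison,
//                            <8 x i32> <i32 0, i32 1, i32 2, i32 3,
//                                       i32 poison, i32 poison,
//                                       i32 poison, i32 poison>
//   %ins = shufflevector <8 x float> %dest, <8 x float> %resized,
//                        <8 x i32> <i32 8, i32 9, i32 10, i32 11,
//                                   i32 4, i32 5, i32 6, i32 7>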
20168 case Instruction::ZExt:
20169 case Instruction::SExt:
20170 case Instruction::FPToUI:
20171 case Instruction::FPToSI:
20172 case Instruction::FPExt:
20173 case Instruction::PtrToInt:
20174 case Instruction::IntToPtr:
20175 case Instruction::SIToFP:
20176 case Instruction::UIToFP:
20177 case Instruction::Trunc:
20178 case Instruction::FPTrunc:
20179 case Instruction::BitCast: {
20180 setInsertPointAfterBundle(E);
20181
20182 Value *InVec = vectorizeOperand(E, 0);
20183
20184 auto *CI = cast<CastInst>(VL0);
20185 Instruction::CastOps VecOpcode = CI->getOpcode();
20186 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
20187 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
20188 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
20189 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20190 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
20191 // Check if the values are candidates to demote.
20192 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
20193 if (SrcIt != MinBWs.end())
20194 SrcBWSz = SrcIt->second.first;
20195 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
20196 if (BWSz == SrcBWSz) {
20197 VecOpcode = Instruction::BitCast;
20198 } else if (BWSz < SrcBWSz) {
20199 VecOpcode = Instruction::Trunc;
20200 } else if (It != MinBWs.end()) {
20201 assert(BWSz > SrcBWSz && "Invalid cast!");
20202 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20203 } else if (SrcIt != MinBWs.end()) {
20204 assert(BWSz > SrcBWSz && "Invalid cast!");
20205 VecOpcode =
20206 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20207 }
20208 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20209 !SrcIt->second.second) {
20210 VecOpcode = Instruction::UIToFP;
20211 }
20212 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20213 ? InVec
20214 : Builder.CreateCast(VecOpcode, InVec, VecTy);
20215 V = FinalShuffle(V, E);
20216
20217 E->VectorizedValue = V;
20218 ++NumVectorInstructions;
20219 return V;
20220 }
20221 case Instruction::FCmp:
20222 case Instruction::ICmp: {
20223 setInsertPointAfterBundle(E);
20224
20225 Value *L = vectorizeOperand(E, 0);
20226 Value *R = vectorizeOperand(E, 1);
20227 if (L->getType() != R->getType()) {
20228 assert((getOperandEntry(E, 0)->isGather() ||
20229 getOperandEntry(E, 1)->isGather() ||
20230 MinBWs.contains(getOperandEntry(E, 0)) ||
20231 MinBWs.contains(getOperandEntry(E, 1))) &&
20232 "Expected item in MinBWs.");
20233 if (cast<VectorType>(L->getType())
20234 ->getElementType()
20235 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
20236 ->getElementType()
20237 ->getIntegerBitWidth()) {
20238 Type *CastTy = R->getType();
20239 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
20240 } else {
20241 Type *CastTy = L->getType();
20242 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
20243 }
20244 }
20245
20246 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
20247 Value *V = Builder.CreateCmp(P0, L, R);
20248 propagateIRFlags(V, E->Scalars, VL0);
20249 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
20250 ICmp->setSameSign(/*B=*/false);
20251 // Do not cast for cmps.
20252 VecTy = cast<FixedVectorType>(V->getType());
20253 V = FinalShuffle(V, E);
20254
20255 E->VectorizedValue = V;
20256 ++NumVectorInstructions;
20257 return V;
20258 }
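// If the two compare operands were demoted to different bit widths, the
// narrower one is extended to the wider integer vector type before emitting
// the compare; the compare always yields an <N x i1> vector, so VecTy is
// reset to the result type and no cast of the result is needed.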
20259 case Instruction::Select: {
20260 setInsertPointAfterBundle(E);
20261
20262 Value *Cond = vectorizeOperand(E, 0);
20263 Value *True = vectorizeOperand(E, 1);
20264 Value *False = vectorizeOperand(E, 2);
20265 if (True->getType() != VecTy || False->getType() != VecTy) {
20266 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
20267 getOperandEntry(E, 2)->isGather() ||
20268 MinBWs.contains(getOperandEntry(E, 1)) ||
20269 MinBWs.contains(getOperandEntry(E, 2))) &&
20270 "Expected item in MinBWs.");
20271 if (True->getType() != VecTy)
20272 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
20273 if (False->getType() != VecTy)
20274 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
20275 }
20276
20277 unsigned CondNumElements = getNumElements(Cond->getType());
20278 unsigned TrueNumElements = getNumElements(True->getType());
20279 assert(TrueNumElements >= CondNumElements &&
20280 TrueNumElements % CondNumElements == 0 &&
20281 "Cannot vectorize Instruction::Select");
20282 assert(TrueNumElements == getNumElements(False->getType()) &&
20283 "Cannot vectorize Instruction::Select");
20284 if (CondNumElements != TrueNumElements) {
20286 // When the condition has one element per scalar but the operands use a
20287 // fixed vector scalar type, the condition value must be replicated to match.
20287 Cond = Builder.CreateShuffleVector(
20288 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
20289 CondNumElements));
20290 }
20291 assert(getNumElements(Cond->getType()) == TrueNumElements &&
20292 "Cannot vectorize Instruction::Select");
20293 Value *V =
20294 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
20295 V = FinalShuffle(V, E);
20296
20297 E->VectorizedValue = V;
20298 ++NumVectorInstructions;
20299 return V;
20300 }
20301 case Instruction::FNeg: {
20302 setInsertPointAfterBundle(E);
20303
20304 Value *Op = vectorizeOperand(E, 0);
20305
20306 Value *V = Builder.CreateUnOp(
20307 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
20308 propagateIRFlags(V, E->Scalars, VL0);
20309 if (auto *I = dyn_cast<Instruction>(V))
20310 V = ::propagateMetadata(I, E->Scalars);
20311
20312 V = FinalShuffle(V, E);
20313
20314 E->VectorizedValue = V;
20315 ++NumVectorInstructions;
20316
20317 return V;
20318 }
20319 case Instruction::Freeze: {
20320 setInsertPointAfterBundle(E);
20321
20322 Value *Op = vectorizeOperand(E, 0);
20323
20324 if (Op->getType() != VecTy) {
20325 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20326 MinBWs.contains(getOperandEntry(E, 0))) &&
20327 "Expected item in MinBWs.");
20328 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
20329 }
20330 Value *V = Builder.CreateFreeze(Op);
20331 V = FinalShuffle(V, E);
20332
20333 E->VectorizedValue = V;
20334 ++NumVectorInstructions;
20335
20336 return V;
20337 }
20338 case Instruction::Add:
20339 case Instruction::FAdd:
20340 case Instruction::Sub:
20341 case Instruction::FSub:
20342 case Instruction::Mul:
20343 case Instruction::FMul:
20344 case Instruction::UDiv:
20345 case Instruction::SDiv:
20346 case Instruction::FDiv:
20347 case Instruction::URem:
20348 case Instruction::SRem:
20349 case Instruction::FRem:
20350 case Instruction::Shl:
20351 case Instruction::LShr:
20352 case Instruction::AShr:
20353 case Instruction::And:
20354 case Instruction::Or:
20355 case Instruction::Xor: {
20356 setInsertPointAfterBundle(E);
20357
20358 Value *LHS = vectorizeOperand(E, 0);
20359 Value *RHS = vectorizeOperand(E, 1);
20360 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20361 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
20362 ArrayRef<Value *> Ops = E->getOperand(I);
20363 if (all_of(Ops, [&](Value *Op) {
20364 auto *CI = dyn_cast<ConstantInt>(Op);
20365 return CI && CI->getValue().countr_one() >= It->second.first;
20366 })) {
20367 V = FinalShuffle(I == 0 ? RHS : LHS, E);
20368 E->VectorizedValue = V;
20369 ++NumVectorInstructions;
20370 return V;
20371 }
20372 }
20373 }
20374 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
20375 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20376 getOperandEntry(E, 1)->isGather() ||
20377 MinBWs.contains(getOperandEntry(E, 0)) ||
20378 MinBWs.contains(getOperandEntry(E, 1))) &&
20379 "Expected item in MinBWs.");
20380 if (LHS->getType() != VecTy)
20381 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
20382 if (RHS->getType() != VecTy)
20383 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
20384 }
20385
20386 Value *V = Builder.CreateBinOp(
20387 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
20388 RHS);
20389 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
20390 if (auto *I = dyn_cast<Instruction>(V)) {
20391 V = ::propagateMetadata(I, E->Scalars);
20392 // Drop nuw flags for abs(sub(commutative), true).
20393 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
20394 any_of(E->Scalars, [E](Value *V) {
20395 return isa<PoisonValue>(V) ||
20396 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20397 isCommutative(cast<Instruction>(V));
20398 }))
20399 I->setHasNoUnsignedWrap(/*b=*/false);
20400 }
20401
20402 V = FinalShuffle(V, E);
20403
20404 E->VectorizedValue = V;
20405 ++NumVectorInstructions;
20406
20407 return V;
20408 }
20409 case Instruction::Load: {
20410 // Loads are inserted at the head of the tree because we don't want to
20411 // sink them all the way down past store instructions.
20412 setInsertPointAfterBundle(E);
20413
20414 LoadInst *LI = cast<LoadInst>(VL0);
20415 Instruction *NewLI;
20416 FixedVectorType *StridedLoadTy = nullptr;
20417 Value *PO = LI->getPointerOperand();
20418 if (E->State == TreeEntry::Vectorize) {
20419 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
20420 } else if (E->State == TreeEntry::CompressVectorize) {
20421 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
20422 CompressEntryToData.at(E);
20423 Align CommonAlignment = LI->getAlign();
20424 if (IsMasked) {
20425 unsigned VF = getNumElements(LoadVecTy);
20426 SmallVector<Constant *> MaskValues(
20427 VF / getNumElements(LI->getType()),
20428 ConstantInt::getFalse(VecTy->getContext()));
20429 for (int I : CompressMask)
20430 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
20431 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20432 assert(SLPReVec && "Only supported by REVEC.");
20433 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
20434 }
20435 Constant *MaskValue = ConstantVector::get(MaskValues);
20436 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
20437 MaskValue);
20438 } else {
20439 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
20440 }
20441 NewLI = ::propagateMetadata(NewLI, E->Scalars);
20442 // TODO: include this cost into CommonCost.
20443 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20444 assert(SLPReVec && "FixedVectorType is not expected.");
20445 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
20446 CompressMask);
20447 }
20448 NewLI =
20449 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
20450 } else if (E->State == TreeEntry::StridedVectorize) {
20451 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
20452 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
20453 PO = IsReverseOrder ? PtrN : Ptr0;
20454 Type *StrideTy = DL->getIndexType(PO->getType());
20455 Value *StrideVal;
20456 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
20457 StridedLoadTy = SPtrInfo.Ty;
20458 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
20459 unsigned StridedLoadEC =
20460 StridedLoadTy->getElementCount().getKnownMinValue();
20461
20462 Value *Stride = SPtrInfo.StrideVal;
20463 if (!Stride) {
20464 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
20465 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
20466 SCEVExpander Expander(*SE, "strided-load-vec");
20467 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
20468 &*Builder.GetInsertPoint());
20469 }
20470 Value *NewStride =
20471 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
20472 StrideVal = Builder.CreateMul(
20473 NewStride, ConstantInt::getSigned(
20474 StrideTy, (IsReverseOrder ? -1 : 1) *
20475 static_cast<int>(
20476 DL->getTypeAllocSize(ScalarTy))));
20477 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20478 auto *Inst = Builder.CreateIntrinsic(
20479 Intrinsic::experimental_vp_strided_load,
20480 {StridedLoadTy, PO->getType(), StrideTy},
20481 {PO, StrideVal,
20482 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
20483 Builder.getInt32(StridedLoadEC)});
20484 Inst->addParamAttr(
20485 /*ArgNo=*/0,
20486 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20487 NewLI = Inst;
20488 } else {
20489 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
20490 Value *VecPtr = vectorizeOperand(E, 0);
20491 if (isa<FixedVectorType>(ScalarTy)) {
20492 assert(SLPReVec && "FixedVectorType is not expected.");
20493 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
20494 // to expand VecPtr if ScalarTy is a vector type.
20495 unsigned ScalarTyNumElements =
20496 cast<FixedVectorType>(ScalarTy)->getNumElements();
20497 unsigned VecTyNumElements =
20498 cast<FixedVectorType>(VecTy)->getNumElements();
20499 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
20500 "Cannot expand getelementptr.");
20501 unsigned VF = VecTyNumElements / ScalarTyNumElements;
20502 SmallVector<Constant *> Indices(VecTyNumElements);
20503 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
20504 return Builder.getInt64(I % ScalarTyNumElements);
20505 });
20506 VecPtr = Builder.CreateGEP(
20507 VecTy->getElementType(),
20508 Builder.CreateShuffleVector(
20509 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
20510 ConstantVector::get(Indices));
20511 }
20512 // Use the minimum alignment of the gathered loads.
20513 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20514 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
20515 }
20516 Value *V = E->State == TreeEntry::CompressVectorize
20517 ? NewLI
20518 : ::propagateMetadata(NewLI, E->Scalars);
20519
20520 if (StridedLoadTy != VecTy)
20521 V = Builder.CreateBitOrPointerCast(V, VecTy);
20522 V = FinalShuffle(V, E);
20523 E->VectorizedValue = V;
20524 ++NumVectorInstructions;
20525 return V;
20526 }
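// Depending on the entry state, the load is emitted as a plain wide load, a
// (possibly masked) load followed by a compressing shuffle, a strided VP
// load, or a masked gather. For the strided case the emitted call has roughly
// this shape (operands are illustrative only):
//   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr align 4 %base, i64 -16,
//            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
// The byte stride is negated when the scalars appear in reverse order, and
// the common alignment is attached as a parameter attribute.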
20527 case Instruction::Store: {
20528 auto *SI = cast<StoreInst>(VL0);
20529
20530 setInsertPointAfterBundle(E);
20531
20532 Value *VecValue = vectorizeOperand(E, 0);
20533 if (VecValue->getType() != VecTy)
20534 VecValue =
20535 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
20536 VecValue = FinalShuffle(VecValue, E);
20537
20538 Value *Ptr = SI->getPointerOperand();
20539 Instruction *ST;
20540 if (E->State == TreeEntry::Vectorize) {
20541 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
20542 } else {
20543 assert(E->State == TreeEntry::StridedVectorize &&
20544 "Expected either strided or consecutive stores.");
20545 if (!E->ReorderIndices.empty()) {
20546 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
20547 Ptr = SI->getPointerOperand();
20548 }
20549 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
20550 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
20551 auto *Inst = Builder.CreateIntrinsic(
20552 Intrinsic::experimental_vp_strided_store,
20553 {VecTy, Ptr->getType(), StrideTy},
20554 {VecValue, Ptr,
20555 ConstantInt::getSigned(
20556 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20557 Builder.getAllOnesMask(VecTy->getElementCount()),
20558 Builder.getInt32(E->Scalars.size())});
20559 Inst->addParamAttr(
20560 /*ArgNo=*/1,
20561 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20562 ST = Inst;
20563 }
20564
20565 Value *V = ::propagateMetadata(ST, E->Scalars);
20566
20567 E->VectorizedValue = V;
20568 ++NumVectorInstructions;
20569 return V;
20570 }
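// Consecutive stores become a single wide store; strided stores use the
// matching VP intrinsic with a negative per-element byte stride, roughly
// (illustrative operands only):
//   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
//       <4 x i32> %vec, ptr align 4 %base, i64 -4,
//       <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)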
20571 case Instruction::GetElementPtr: {
20572 auto *GEP0 = cast<GetElementPtrInst>(VL0);
20573 setInsertPointAfterBundle(E);
20574
20575 Value *Op0 = vectorizeOperand(E, 0);
20576
20577 SmallVector<Value *> OpVecs;
20578 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20579 Value *OpVec = vectorizeOperand(E, J);
20580 OpVecs.push_back(OpVec);
20581 }
20582
20583 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20584 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
20585 SmallVector<Value *> GEPs;
20586 for (Value *V : E->Scalars) {
20587 if (isa<GetElementPtrInst>(V))
20588 GEPs.push_back(V);
20589 }
20590 V = ::propagateMetadata(I, GEPs);
20591 }
20592
20593 V = FinalShuffle(V, E);
20594
20595 E->VectorizedValue = V;
20596 ++NumVectorInstructions;
20597
20598 return V;
20599 }
20600 case Instruction::Call: {
20601 CallInst *CI = cast<CallInst>(VL0);
20602 setInsertPointAfterBundle(E);
20603
20604 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20605
20606 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20607 CI, ID, VecTy->getNumElements(),
20608 It != MinBWs.end() ? It->second.first : 0, TTI);
20609 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
20610 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20611 VecCallCosts.first <= VecCallCosts.second;
20612
20613 Value *ScalarArg = nullptr;
20614 SmallVector<Value *> OpVecs;
20615 SmallVector<Type *, 2> TysForDecl;
20616 // Add return type if intrinsic is overloaded on it.
20617 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
20618 TysForDecl.push_back(VecTy);
20619 auto *CEI = cast<CallInst>(VL0);
20620 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
20621 // Some intrinsics have scalar arguments. This argument should not be
20622 // vectorized.
20623 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
20624 ScalarArg = CEI->getArgOperand(I);
20625 // If we decided to reduce the bitwidth of the abs intrinsic, its second
20626 // argument must be set to false (do not return poison if the value is the signed minimum).
20627 if (ID == Intrinsic::abs && It != MinBWs.end() &&
20628 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
20629 ScalarArg = Builder.getFalse();
20630 OpVecs.push_back(ScalarArg);
20631 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20632 TysForDecl.push_back(ScalarArg->getType());
20633 continue;
20634 }
20635
20636 Value *OpVec = vectorizeOperand(E, I);
20637 ScalarArg = CEI->getArgOperand(I);
20638 if (cast<VectorType>(OpVec->getType())->getElementType() !=
20639 ScalarArg->getType()->getScalarType() &&
20640 It == MinBWs.end()) {
20641 auto *CastTy =
20642 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
20643 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
20644 } else if (It != MinBWs.end()) {
20645 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
20646 }
20647 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
20648 OpVecs.push_back(OpVec);
20649 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20650 TysForDecl.push_back(OpVec->getType());
20651 }
20652
20653 Function *CF;
20654 if (!UseIntrinsic) {
20655 VFShape Shape =
20656 VFShape::get(CI->getFunctionType(),
20657 ElementCount::getFixed(VecTy->getNumElements()),
20658 false /*HasGlobalPred*/);
20659 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
20660 } else {
20661 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
20662 }
20663
20664 SmallVector<OperandBundleDef, 1> OpBundles;
20665 CI->getOperandBundlesAsDefs(OpBundles);
20666 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
20667
20668 propagateIRFlags(V, E->Scalars, VL0);
20669 V = FinalShuffle(V, E);
20670
20671 E->VectorizedValue = V;
20672 ++NumVectorInstructions;
20673 return V;
20674 }
20675 case Instruction::ShuffleVector: {
20676 Value *V;
20677 if (SLPReVec && !E->isAltShuffle()) {
20678 setInsertPointAfterBundle(E);
20679 Value *Src = vectorizeOperand(E, 0);
20680 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
20681 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
20682 SmallVector<int> NewMask(ThisMask.size());
20683 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
20684 return SVSrc->getShuffleMask()[Mask];
20685 });
20686 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
20687 SVSrc->getOperand(1), NewMask);
20688 } else {
20689 V = Builder.CreateShuffleVector(Src, ThisMask);
20690 }
20691 propagateIRFlags(V, E->Scalars, VL0);
20692 if (auto *I = dyn_cast<Instruction>(V))
20693 V = ::propagateMetadata(I, E->Scalars);
20694 V = FinalShuffle(V, E);
20695 } else {
20696 assert(E->isAltShuffle() &&
20697 ((Instruction::isBinaryOp(E->getOpcode()) &&
20698 Instruction::isBinaryOp(E->getAltOpcode())) ||
20699 (Instruction::isCast(E->getOpcode()) &&
20700 Instruction::isCast(E->getAltOpcode())) ||
20701 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
20702 "Invalid Shuffle Vector Operand");
20703
20704 Value *LHS = nullptr, *RHS = nullptr;
20705 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
20706 setInsertPointAfterBundle(E);
20707 LHS = vectorizeOperand(E, 0);
20708 RHS = vectorizeOperand(E, 1);
20709 } else {
20710 setInsertPointAfterBundle(E);
20711 LHS = vectorizeOperand(E, 0);
20712 }
20713 if (LHS && RHS &&
20714 ((Instruction::isBinaryOp(E->getOpcode()) &&
20715 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
20716 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
20717 assert((It != MinBWs.end() ||
20718 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
20719 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
20720 MinBWs.contains(getOperandEntry(E, 0)) ||
20721 MinBWs.contains(getOperandEntry(E, 1))) &&
20722 "Expected item in MinBWs.");
20723 Type *CastTy = VecTy;
20724 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20725 if (cast<VectorType>(LHS->getType())
20726 ->getElementType()
20727 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20728 ->getElementType()
20729 ->getIntegerBitWidth())
20730 CastTy = RHS->getType();
20731 else
20732 CastTy = LHS->getType();
20733 }
20734 if (LHS->getType() != CastTy)
20735 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20736 if (RHS->getType() != CastTy)
20737 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20738 }
20739
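// Emit both the main and the alternate operation over the whole vector; the
// per-lane choice between the two results is made by the shuffle created
// below from the alternate-op mask.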
20740 Value *V0, *V1;
20741 if (Instruction::isBinaryOp(E->getOpcode())) {
20742 V0 = Builder.CreateBinOp(
20743 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20744 V1 = Builder.CreateBinOp(
20745 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20746 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20747 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20748 auto *AltCI = cast<CmpInst>(E->getAltOp());
20749 CmpInst::Predicate AltPred = AltCI->getPredicate();
20750 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20751 } else {
20752 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20753 unsigned SrcBWSz = DL->getTypeSizeInBits(
20754 cast<VectorType>(LHS->getType())->getElementType());
20755 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20756 if (BWSz <= SrcBWSz) {
20757 if (BWSz < SrcBWSz)
20758 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20759 assert(LHS->getType() == VecTy &&
20760 "Expected same type as operand.");
20761 if (auto *I = dyn_cast<Instruction>(LHS))
20762 LHS = ::propagateMetadata(I, E->Scalars);
20763 LHS = FinalShuffle(LHS, E);
20764 E->VectorizedValue = LHS;
20765 ++NumVectorInstructions;
20766 return LHS;
20767 }
20768 }
20769 V0 = Builder.CreateCast(
20770 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20771 V1 = Builder.CreateCast(
20772 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20773 }
20774 // Add V0 and V1 to later analysis to try to find and remove matching
20775 // instruction, if any.
20776 for (Value *V : {V0, V1}) {
20777 if (auto *I = dyn_cast<Instruction>(V)) {
20778 GatherShuffleExtractSeq.insert(I);
20779 CSEBlocks.insert(I->getParent());
20780 }
20781 }
20782
20783 // Create shuffle to take alternate operations from the vector.
20784 // Also, gather up main and alt scalar ops to propagate IR flags to
20785 // each vector operation.
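// E.g. for scalars <add, sub, add, sub> the mask is <0, 5, 2, 7>: even lanes
// are taken from V0 (main opcode) and odd lanes from V1 (alternate opcode,
// indexed with an offset of the vector width).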
20786 ValueList OpScalars, AltScalars;
20787 SmallVector<int> Mask;
20788 E->buildAltOpShuffleMask(
20789 [E, this](Instruction *I) {
20790 assert(E->getMatchingMainOpOrAltOp(I) &&
20791 "Unexpected main/alternate opcode");
20792 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20793 *TLI);
20794 },
20795 Mask, &OpScalars, &AltScalars);
20796
20797 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20798 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20799 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20800 // Drop nuw flags for abs(sub(commutative), true).
20801 if (auto *I = dyn_cast<Instruction>(Vec);
20802 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20803 any_of(E->Scalars, [E](Value *V) {
20804 if (isa<PoisonValue>(V))
20805 return false;
20806 if (E->hasCopyableElements() && E->isCopyableElement(V))
20807 return false;
20808 auto *IV = cast<Instruction>(V);
20809 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20810 }))
20811 I->setHasNoUnsignedWrap(/*b=*/false);
20812 };
20813 DropNuwFlag(V0, E->getOpcode());
20814 DropNuwFlag(V1, E->getAltOpcode());
20815
20816 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20817 assert(SLPReVec && "FixedVectorType is not expected.");
20818 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20819 }
20820 V = Builder.CreateShuffleVector(V0, V1, Mask);
20821 if (auto *I = dyn_cast<Instruction>(V)) {
20822 V = ::propagateMetadata(I, E->Scalars);
20823 GatherShuffleExtractSeq.insert(I);
20824 CSEBlocks.insert(I->getParent());
20825 }
20826 }
20827
20828 E->VectorizedValue = V;
20829 ++NumVectorInstructions;
20830
20831 return V;
20832 }
20833 default:
20834 llvm_unreachable("unknown inst");
20835 }
20836 return nullptr;
20837}
20838 
20839 Value *BoUpSLP::vectorizeTree() {
20840 ExtraValueToDebugLocsMap ExternallyUsedValues;
20841 return vectorizeTree(ExternallyUsedValues);
20842}
20843 
20844 Value *BoUpSLP::vectorizeTree(
20845 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20846 Instruction *ReductionRoot,
20847 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20848 // Clear the Entry-to-LastInstruction table. It can be affected by scheduling,
20849 // so it needs to be rebuilt.
20850 EntryToLastInstruction.clear();
20851 // All blocks must be scheduled before any instructions are inserted.
20852 for (auto &BSIter : BlocksSchedules)
20853 scheduleBlock(*this, BSIter.second.get());
20854 // Cache last instructions for the nodes to avoid side effects, which may
20855 // appear during vectorization, like extra uses, etc.
20856 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20857 if (TE->isGather() || DeletedNodes.contains(TE.get()))
20858 continue;
20859 (void)getLastInstructionInBundle(TE.get());
20860 }
20861
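// Set the initial insertion point: right before the reduction root when
// vectorizing a reduction, otherwise at the start of the function entry block.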
20862 if (ReductionRoot)
20863 Builder.SetInsertPoint(ReductionRoot->getParent(),
20864 ReductionRoot->getIterator());
20865 else
20866 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20867
20868 // Vectorize gather operands of the nodes with the external uses only.
20869 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20870 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20871 if (DeletedNodes.contains(TE.get()))
20872 continue;
20873 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20874 TE->UserTreeIndex.UserTE->hasState() &&
20875 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20876 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20877 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20878 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20879 all_of(TE->UserTreeIndex.UserTE->Scalars,
20880 [](Value *V) { return isUsedOutsideBlock(V); })) {
20881 Instruction &LastInst =
20882 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20883 GatherEntries.emplace_back(TE.get(), &LastInst);
20884 }
20885 }
20886 for (auto &Entry : GatherEntries) {
20887 IRBuilderBase::InsertPointGuard Guard(Builder);
20888 Builder.SetInsertPoint(Entry.second);
20889 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20890 (void)vectorizeTree(Entry.first);
20891 }
20892 // Emit gathered loads first to emit better code for the users of those
20893 // gathered loads.
20894 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20895 if (DeletedNodes.contains(TE.get()))
20896 continue;
20897 if (GatheredLoadsEntriesFirst.has_value() &&
20898 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20899 (!TE->isGather() || TE->UserTreeIndex)) {
20900 assert((TE->UserTreeIndex ||
20901 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20902 "Expected gathered load node.");
20903 (void)vectorizeTree(TE.get());
20904 }
20905 }
20906 (void)vectorizeTree(VectorizableTree[0].get());
20907 // Run through the list of postponed gathers and emit them, replacing the temp
20908 // emitted allocas with actual vector instructions.
20909 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20910 SmallDenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20911 for (const TreeEntry *E : PostponedNodes) {
20912 auto *TE = const_cast<TreeEntry *>(E);
20913 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20914 TE->VectorizedValue = nullptr;
20915 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20916 // If the user is a PHI node, its vector code has to be inserted right before
20917 // the block terminator. Since the node was delayed, there were some unresolved
20918 // dependencies at the moment when the stub instruction was emitted. In case
20919 // any of these dependencies turn out to be an operand of another PHI coming
20920 // from this same block, the position of the stub instruction becomes invalid,
20921 // because the source vector that is supposed to feed this gather node was
20922 // inserted at the end of the block [after the stub instruction]. So we need
20923 // to adjust the insertion point again, to the end of the block.
20924 if (isa<PHINode>(UserI) ||
20925 (TE->UserTreeIndex.UserTE->hasState() &&
20926 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20927 // Insert before all users.
20928 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20929 for (User *U : PrevVec->users()) {
20930 if (U == UserI)
20931 continue;
20932 auto *UI = dyn_cast<Instruction>(U);
20933 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20934 continue;
20935 if (UI->comesBefore(InsertPt))
20936 InsertPt = UI;
20937 }
20938 Builder.SetInsertPoint(InsertPt);
20939 } else {
20940 Builder.SetInsertPoint(PrevVec);
20941 }
20942 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20943 Value *Vec = vectorizeTree(TE);
20944 if (auto *VecI = dyn_cast<Instruction>(Vec);
20945 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20946 Builder.GetInsertPoint()->comesBefore(VecI))
20947 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20948 Builder.GetInsertPoint());
20949 if (Vec->getType() != PrevVec->getType()) {
20950 assert(Vec->getType()->isIntOrIntVectorTy() &&
20951 PrevVec->getType()->isIntOrIntVectorTy() &&
20952 "Expected integer vector types only.");
20953 std::optional<bool> IsSigned;
20954 for (Value *V : TE->Scalars) {
20955 if (isVectorized(V)) {
20956 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20957 auto It = MinBWs.find(MNTE);
20958 if (It != MinBWs.end()) {
20959 IsSigned = IsSigned.value_or(false) || It->second.second;
20960 if (*IsSigned)
20961 break;
20962 }
20963 }
20964 if (IsSigned.value_or(false))
20965 break;
20966 // Scan through gather nodes.
20967 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20968 auto It = MinBWs.find(BVE);
20969 if (It != MinBWs.end()) {
20970 IsSigned = IsSigned.value_or(false) || It->second.second;
20971 if (*IsSigned)
20972 break;
20973 }
20974 }
20975 if (IsSigned.value_or(false))
20976 break;
20977 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20978 IsSigned =
20979 IsSigned.value_or(false) ||
20980 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20981 continue;
20982 }
20983 if (IsSigned.value_or(false))
20984 break;
20985 }
20986 }
20987 if (IsSigned.value_or(false)) {
20988 // Final attempt - check user node.
20989 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20990 if (It != MinBWs.end())
20991 IsSigned = It->second.second;
20992 }
20993 assert(IsSigned &&
20994 "Expected user node or perfect diamond match in MinBWs.");
20995 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20996 }
20997 PrevVec->replaceAllUsesWith(Vec);
20998 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20999 // Replace the stub vector node, if it was already used for one of the
21000 // buildvector nodes.
21001 auto It = PostponedValues.find(PrevVec);
21002 if (It != PostponedValues.end()) {
21003 for (TreeEntry *VTE : It->getSecond())
21004 VTE->VectorizedValue = Vec;
21005 }
21006 eraseInstruction(PrevVec);
21007 }
21008
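// Generate extractelement instructions (plus casts back to the original scalar
// type where the tree was narrowed) for all scalars still used outside the
// vectorized tree.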
21009 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
21010 << " values .\n");
21011 
21012 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
21013 // Maps vector instruction to original insertelement instruction
21014 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
21015 // Maps extract Scalar to the corresponding extractelement instruction in the
21016 // basic block. Only one extractelement per block should be emitted.
21017 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
21018 ScalarToEEs;
21019 SmallDenseSet<Value *, 4> UsedInserts;
21020 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
21021 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
21022 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
21023 // Extract all of the elements with the external uses.
21024 for (const auto &ExternalUse : ExternalUses) {
21025 Value *Scalar = ExternalUse.Scalar;
21026 llvm::User *User = ExternalUse.User;
21027
21028 // Skip users that we already RAUW. This happens when one instruction
21029 // has multiple uses of the same value.
21030 if (User && !is_contained(Scalar->users(), User))
21031 continue;
21032 const TreeEntry *E = &ExternalUse.E;
21033 assert(E && "Invalid scalar");
21034 assert(!E->isGather() && "Extracting from a gather list");
21035 // Non-instruction pointers are not deleted, just skip them.
21036 if (E->getOpcode() == Instruction::GetElementPtr &&
21037 !isa<GetElementPtrInst>(Scalar))
21038 continue;
21039
21040 Value *Vec = E->VectorizedValue;
21041 assert(Vec && "Can't find vectorizable value");
21042
21043 Value *Lane = Builder.getInt32(ExternalUse.Lane);
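// Extracts the requested lane from Vec, reusing an extract already emitted in
// the current block when possible, and sign-/zero-extends the result back to
// the original scalar type if the tree entry was narrowed.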
21044 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21045 if (Scalar->getType() != Vec->getType()) {
21046 Value *Ex = nullptr;
21047 Value *ExV = nullptr;
21048 auto *Inst = dyn_cast<Instruction>(Scalar);
21049 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
21050 auto It = ScalarToEEs.find(Scalar);
21051 if (It != ScalarToEEs.end()) {
21052 // No need to emit many extracts, just move the only one in the
21053 // current block.
21054 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
21055 : Builder.GetInsertBlock());
21056 if (EEIt != It->second.end()) {
21057 Value *PrevV = EEIt->second.first;
21058 if (auto *I = dyn_cast<Instruction>(PrevV);
21059 I && !ReplaceInst &&
21060 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21061 Builder.GetInsertPoint()->comesBefore(I)) {
21062 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
21063 Builder.GetInsertPoint());
21064 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
21065 CI->moveAfter(I);
21066 }
21067 Ex = PrevV;
21068 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21069 }
21070 }
21071 if (!Ex) {
21072 // "Reuse" the existing extract to improve final codegen.
21073 if (ReplaceInst) {
21074 // Leave the instruction as is, if extracting is cheaper and all
21075 // operands are scalar.
21076 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
21077 IgnoredExtracts.insert(EE);
21078 Ex = EE;
21079 } else {
21080 auto *CloneInst = Inst->clone();
21081 CloneInst->insertBefore(Inst->getIterator());
21082 if (Inst->hasName())
21083 CloneInst->takeName(Inst);
21084 Ex = CloneInst;
21085 }
21086 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
21087 ES && isa<Instruction>(Vec)) {
21088 Value *V = ES->getVectorOperand();
21089 auto *IVec = cast<Instruction>(Vec);
21090 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
21091 V = ETEs.front()->VectorizedValue;
21092 if (auto *IV = dyn_cast<Instruction>(V);
21093 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21094 IV->comesBefore(IVec))
21095 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
21096 else
21097 Ex = Builder.CreateExtractElement(Vec, Lane);
21098 } else if (auto *VecTy =
21099 dyn_cast<FixedVectorType>(Scalar->getType())) {
21100 assert(SLPReVec && "FixedVectorType is not expected.");
21101 unsigned VecTyNumElements = VecTy->getNumElements();
21102 // When REVEC is enabled, we need to extract a vector.
21103 // Note: The element size of Scalar may be different from the
21104 // element size of Vec.
21105 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
21106 ExternalUse.Lane * VecTyNumElements);
21107 } else {
21108 Ex = Builder.CreateExtractElement(Vec, Lane);
21109 }
21110 // If necessary, sign-extend or zero-extend ScalarRoot
21111 // to the larger type.
21112 ExV = Ex;
21113 if (Scalar->getType() != Ex->getType())
21114 ExV = Builder.CreateIntCast(
21115 Ex, Scalar->getType(),
21116 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
21117 auto *I = dyn_cast<Instruction>(Ex);
21118 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
21119 : &F->getEntryBlock(),
21120 std::make_pair(Ex, ExV));
21121 }
21122 // The then-branch of the previous if may produce constants, since
21123 // operand 0 might be a constant.
21124 if (auto *ExI = dyn_cast<Instruction>(Ex);
21125 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
21126 GatherShuffleExtractSeq.insert(ExI);
21127 CSEBlocks.insert(ExI->getParent());
21128 }
21129 return ExV;
21130 }
21131 assert(isa<FixedVectorType>(Scalar->getType()) &&
21132 isa<InsertElementInst>(Scalar) &&
21133 "In-tree scalar of vector type is not insertelement?");
21134 auto *IE = cast<InsertElementInst>(Scalar);
21135 VectorToInsertElement.try_emplace(Vec, IE);
21136 return Vec;
21137 };
21138 // If User == nullptr, the Scalar remains as scalar in vectorized
21139 // instructions or is used as extra arg. Generate ExtractElement instruction
21140 // and update the record for this scalar in ExternallyUsedValues.
21141 if (!User) {
21142 if (!ScalarsWithNullptrUser.insert(Scalar).second)
21143 continue;
21144 assert(
21145 (ExternallyUsedValues.count(Scalar) ||
21146 ExternalUsesWithNonUsers.count(Scalar) ||
21147 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21148 any_of(
21149 Scalar->users(),
21150 [&, TTI = TTI](llvm::User *U) {
21151 if (ExternalUsesAsOriginalScalar.contains(U))
21152 return true;
21153 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21154 return !UseEntries.empty() &&
21155 (E->State == TreeEntry::Vectorize ||
21156 E->State == TreeEntry::StridedVectorize ||
21157 E->State == TreeEntry::CompressVectorize) &&
21158 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21159 return (UseEntry->State == TreeEntry::Vectorize ||
21160 UseEntry->State ==
21161 TreeEntry::StridedVectorize ||
21162 UseEntry->State ==
21163 TreeEntry::CompressVectorize) &&
21164 doesInTreeUserNeedToExtract(
21165 Scalar, getRootEntryInstruction(*UseEntry),
21166 TLI, TTI);
21167 });
21168 })) &&
21169 "Scalar with nullptr User must be registered in "
21170 "ExternallyUsedValues map or remain as scalar in vectorized "
21171 "instructions");
21172 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21173 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
21174 if (PHI->getParent()->isLandingPad())
21175 Builder.SetInsertPoint(
21176 PHI->getParent(),
21177 std::next(
21178 PHI->getParent()->getLandingPadInst()->getIterator()));
21179 else
21180 Builder.SetInsertPoint(PHI->getParent(),
21181 PHI->getParent()->getFirstNonPHIIt());
21182 } else {
21183 Builder.SetInsertPoint(VecI->getParent(),
21184 std::next(VecI->getIterator()));
21185 }
21186 } else {
21187 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21188 }
21189 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21190 // Required to update internally referenced instructions.
21191 if (Scalar != NewInst) {
21192 assert((!isa<ExtractElementInst>(Scalar) ||
21193 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
21194 "Extractelements should not be replaced.");
21195 Scalar->replaceAllUsesWith(NewInst);
21196 }
21197 continue;
21198 }
21199
21200 if (auto *VU = dyn_cast<InsertElementInst>(User);
21201 VU && VU->getOperand(1) == Scalar) {
21202 // Skip if the scalar is another vector op or Vec is not an instruction.
21203 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
21204 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
21205 if (!UsedInserts.insert(VU).second)
21206 continue;
21207 // Need to use original vector, if the root is truncated.
21208 auto BWIt = MinBWs.find(E);
21209 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
21210 auto *ScalarTy = FTy->getElementType();
21211 auto Key = std::make_pair(Vec, ScalarTy);
21212 auto VecIt = VectorCasts.find(Key);
21213 if (VecIt == VectorCasts.end()) {
21214 IRBuilderBase::InsertPointGuard Guard(Builder);
21215 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
21216 if (IVec->getParent()->isLandingPad())
21217 Builder.SetInsertPoint(IVec->getParent(),
21218 std::next(IVec->getParent()
21219 ->getLandingPadInst()
21220 ->getIterator()));
21221 else
21222 Builder.SetInsertPoint(
21223 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
21224 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
21225 Builder.SetInsertPoint(IVec->getNextNode());
21226 }
21227 Vec = Builder.CreateIntCast(
21228 Vec,
21229 getWidenedType(
21230 ScalarTy,
21231 cast<FixedVectorType>(Vec->getType())->getNumElements()),
21232 BWIt->second.second);
21233 VectorCasts.try_emplace(Key, Vec);
21234 } else {
21235 Vec = VecIt->second;
21236 }
21237 }
21238
21239 std::optional<unsigned> InsertIdx = getElementIndex(VU);
21240 if (InsertIdx) {
21241 auto *It = find_if(
21242 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
21243 // Checks if 2 insertelements are from the same buildvector.
21244 InsertElementInst *VecInsert = Data.InsertElements.front();
21245 return areTwoInsertFromSameBuildVector(
21246 VU, VecInsert,
21247 [](InsertElementInst *II) { return II->getOperand(0); });
21248 });
21249 unsigned Idx = *InsertIdx;
21250 if (It == ShuffledInserts.end()) {
21251 (void)ShuffledInserts.emplace_back();
21252 It = std::next(ShuffledInserts.begin(),
21253 ShuffledInserts.size() - 1);
21254 }
21255 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
21256 if (Mask.empty())
21257 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
21258 Mask[Idx] = ExternalUse.Lane;
21259 It->InsertElements.push_back(cast<InsertElementInst>(User));
21260 continue;
21261 }
21262 }
21263 }
21264 }
21265
21266 // Generate extracts for out-of-tree users.
21267 // Find the insertion point for the extractelement lane.
21268 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21269 if (PHINode *PH = dyn_cast<PHINode>(User)) {
21270 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
21271 if (PH->getIncomingValue(I) == Scalar) {
21272 Instruction *IncomingTerminator =
21273 PH->getIncomingBlock(I)->getTerminator();
21274 if (isa<CatchSwitchInst>(IncomingTerminator)) {
21275 Builder.SetInsertPoint(VecI->getParent(),
21276 std::next(VecI->getIterator()));
21277 } else {
21278 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
21279 }
21280 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21281 PH->setOperand(I, NewInst);
21282 }
21283 }
21284 } else {
21285 Builder.SetInsertPoint(cast<Instruction>(User));
21286 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21287 User->replaceUsesOfWith(Scalar, NewInst);
21288 }
21289 } else {
21290 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21291 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21292 User->replaceUsesOfWith(Scalar, NewInst);
21293 }
21294
21295 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
21296 }
21297
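// Creates a two-source shuffle of V1/V2 by splitting the combined mask into a
// separate mask for each source vector.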
21298 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
21299 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
21300 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
21301 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
21302 for (int I = 0, E = Mask.size(); I < E; ++I) {
21303 if (Mask[I] < VF)
21304 CombinedMask1[I] = Mask[I];
21305 else
21306 CombinedMask2[I] = Mask[I] - VF;
21307 }
21308 ShuffleInstructionBuilder ShuffleBuilder(
21309 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
21310 ShuffleBuilder.add(V1, CombinedMask1);
21311 if (V2)
21312 ShuffleBuilder.add(V2, CombinedMask2);
21313 return ShuffleBuilder.finalize({}, {}, {});
21314 };
21315
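// Resizes Vec to the vector factor expected by Mask; the returned flag is true
// if the mask has already been applied to the resized vector.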
21316 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
21317 bool ForSingleMask) {
21318 unsigned VF = Mask.size();
21319 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
21320 if (VF != VecVF) {
21321 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
21322 Vec = CreateShuffle(Vec, nullptr, Mask);
21323 return std::make_pair(Vec, true);
21324 }
21325 if (!ForSingleMask) {
21326 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
21327 for (unsigned I = 0; I < VF; ++I) {
21328 if (Mask[I] != PoisonMaskElem)
21329 ResizeMask[Mask[I]] = Mask[I];
21330 }
21331 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
21332 }
21333 }
21334
21335 return std::make_pair(Vec, false);
21336 };
21337 // Perform shuffling of the vectorized tree entries for better handling of
21338 // external extracts.
21339 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
21340 // Find the first and the last instruction in the list of insertelements.
21341 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
21342 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
21343 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
21344 Builder.SetInsertPoint(LastInsert);
21345 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
21346 Value *NewInst = performExtractsShuffleAction<Value>(
21347 MutableArrayRef(Vector.data(), Vector.size()),
21348 FirstInsert->getOperand(0),
21349 [](Value *Vec) {
21350 return cast<VectorType>(Vec->getType())
21351 ->getElementCount()
21352 .getKnownMinValue();
21353 },
21354 ResizeToVF,
21355 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
21356 ArrayRef<Value *> Vals) {
21357 assert((Vals.size() == 1 || Vals.size() == 2) &&
21358 "Expected exactly 1 or 2 input values.");
21359 if (Vals.size() == 1) {
21360 // Do not create shuffle if the mask is a simple identity
21361 // non-resizing mask.
21362 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
21363 ->getNumElements() ||
21364 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
21365 return CreateShuffle(Vals.front(), nullptr, Mask);
21366 return Vals.front();
21367 }
21368 return CreateShuffle(Vals.front() ? Vals.front()
21369 : FirstInsert->getOperand(0),
21370 Vals.back(), Mask);
21371 });
21372 auto It = ShuffledInserts[I].InsertElements.rbegin();
21373 // Rebuild buildvector chain.
21374 InsertElementInst *II = nullptr;
21375 if (It != ShuffledInserts[I].InsertElements.rend())
21376 II = *It;
21377 SmallVector<Instruction *> Inserts;
21378 while (It != ShuffledInserts[I].InsertElements.rend()) {
21379 assert(II && "Must be an insertelement instruction.");
21380 if (*It == II)
21381 ++It;
21382 else
21383 Inserts.push_back(cast<Instruction>(II));
21384 II = dyn_cast<InsertElementInst>(II->getOperand(0));
21385 }
21386 for (Instruction *II : reverse(Inserts)) {
21387 II->replaceUsesOfWith(II->getOperand(0), NewInst);
21388 if (auto *NewI = dyn_cast<Instruction>(NewInst))
21389 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
21390 II->moveAfter(NewI);
21391 NewInst = II;
21392 }
21393 LastInsert->replaceAllUsesWith(NewInst);
21394 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
21395 IE->replaceUsesOfWith(IE->getOperand(0),
21396 PoisonValue::get(IE->getOperand(0)->getType()));
21397 IE->replaceUsesOfWith(IE->getOperand(1),
21398 PoisonValue::get(IE->getOperand(1)->getType()));
21399 eraseInstruction(IE);
21400 }
21401 CSEBlocks.insert(LastInsert->getParent());
21402 }
21403
21404 SmallVector<Instruction *> RemovedInsts;
21405 // For each vectorized value:
21406 for (auto &TEPtr : VectorizableTree) {
21407 TreeEntry *Entry = TEPtr.get();
21408
21409 // No need to handle users of gathered values.
21410 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
21411 DeletedNodes.contains(Entry) ||
21412 TransformedToGatherNodes.contains(Entry))
21413 continue;
21414
21415 assert(Entry->VectorizedValue && "Can't find vectorizable value");
21416
21417 // For each lane:
21418 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
21419 Value *Scalar = Entry->Scalars[Lane];
21420
21421 if (Entry->getOpcode() == Instruction::GetElementPtr &&
21422 !isa<GetElementPtrInst>(Scalar))
21423 continue;
21424 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
21425 EE && IgnoredExtracts.contains(EE))
21426 continue;
21427 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
21428 continue;
21429#ifndef NDEBUG
21430 Type *Ty = Scalar->getType();
21431 if (!Ty->isVoidTy()) {
21432 for (User *U : Scalar->users()) {
21433 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
21434
21435 // It is legal to delete users in the ignorelist.
21436 assert((isVectorized(U) ||
21437 (UserIgnoreList && UserIgnoreList->contains(U)) ||
21438 (isa_and_nonnull<Instruction>(U) &&
21439 isDeleted(cast<Instruction>(U)))) &&
21440 "Deleting out-of-tree value");
21441 }
21442 }
21443#endif
21444 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
21445 auto *I = cast<Instruction>(Scalar);
21446 RemovedInsts.push_back(I);
21447 }
21448 }
21449
21450 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
21451 // new vector instruction.
21452 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
21453 V->mergeDIAssignID(RemovedInsts);
21454
21455 // Clear up reduction references, if any.
21456 if (UserIgnoreList) {
21457 for (Instruction *I : RemovedInsts) {
21458 const TreeEntry *IE = getTreeEntries(I).front();
21459 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(I);
21460 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
21461 IE = SplitEntries.front();
21462 if (IE->Idx != 0 &&
21463 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
21464 (ValueToGatherNodes.lookup(I).contains(
21465 VectorizableTree.front().get()) ||
21466 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
21467 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
21468 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
21469 IE->UserTreeIndex &&
21470 is_contained(VectorizableTree.front()->Scalars, I)) &&
21471 !(GatheredLoadsEntriesFirst.has_value() &&
21472 IE->Idx >= *GatheredLoadsEntriesFirst &&
21473 VectorizableTree.front()->isGather() &&
21474 is_contained(VectorizableTree.front()->Scalars, I)) &&
21475 !(!VectorizableTree.front()->isGather() &&
21476 VectorizableTree.front()->isCopyableElement(I)))
21477 continue;
21478 SmallVector<SelectInst *> LogicalOpSelects;
21479 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
21480 // Do not replace condition of the logical op in form select <cond>.
21481 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
21482 (match(U.getUser(), m_LogicalAnd()) ||
21483 match(U.getUser(), m_LogicalOr())) &&
21484 U.getOperandNo() == 0;
21485 if (IsPoisoningLogicalOp) {
21486 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
21487 return false;
21488 }
21489 return UserIgnoreList->contains(U.getUser());
21490 });
21491 // Replace conditions of the poisoning logical ops with the non-poison
21492 // constant value.
21493 for (SelectInst *SI : LogicalOpSelects)
21494 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
21495 }
21496 }
21497 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
21498 // cache correctness.
21499 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
21500 // - they are not deleted until later.
21501 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
21502
21503 Builder.ClearInsertionPoint();
21504 InstrElementSize.clear();
21505
21506 const TreeEntry &RootTE = *VectorizableTree.front();
21507 Value *Vec = RootTE.VectorizedValue;
21508 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
21509 It != MinBWs.end() &&
21510 ReductionBitWidth != It->second.first) {
21511 IRBuilder<>::InsertPointGuard Guard(Builder);
21512 Builder.SetInsertPoint(ReductionRoot->getParent(),
21513 ReductionRoot->getIterator());
21514 Vec = Builder.CreateIntCast(
21515 Vec,
21516 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
21517 cast<VectorType>(Vec->getType())->getElementCount()),
21518 It->second.second);
21519 }
21520 return Vec;
21521}
21522 
21523 void BoUpSLP::optimizeGatherSequence() {
21524 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
21525 << " gather sequence instructions.\n");
21526 // LICM InsertElementInst sequences.
21527 for (Instruction *I : GatherShuffleExtractSeq) {
21528 if (isDeleted(I))
21529 continue;
21530
21531 // Check if this block is inside a loop.
21532 Loop *L = LI->getLoopFor(I->getParent());
21533 if (!L)
21534 continue;
21535
21536 // Check if it has a preheader.
21537 BasicBlock *PreHeader = L->getLoopPreheader();
21538 if (!PreHeader)
21539 continue;
21540
21541 // If the vector or the element that we insert into it are
21542 // instructions that are defined in this basic block then we can't
21543 // hoist this instruction.
21544 if (any_of(I->operands(), [L](Value *V) {
21545 auto *OpI = dyn_cast<Instruction>(V);
21546 return OpI && L->contains(OpI);
21547 }))
21548 continue;
21549
21550 // We can hoist this instruction. Move it to the pre-header.
21551 I->moveBefore(PreHeader->getTerminator()->getIterator());
21552 CSEBlocks.insert(PreHeader);
21553 }
21554
21555 // Make a list of all reachable blocks in our CSE queue.
21556 SmallVector<const DomTreeNode *, 8> CSEWorkList;
21557 CSEWorkList.reserve(CSEBlocks.size());
21558 for (BasicBlock *BB : CSEBlocks)
21559 if (DomTreeNode *N = DT->getNode(BB)) {
21560 assert(DT->isReachableFromEntry(N));
21561 CSEWorkList.push_back(N);
21562 }
21563
21564 // Sort blocks by domination. This ensures we visit a block after all blocks
21565 // dominating it are visited.
21566 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
21567 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21568 "Different nodes should have different DFS numbers");
21569 return A->getDFSNumIn() < B->getDFSNumIn();
21570 });
21571
21572 // Less defined shuffles can be replaced by the more defined copies.
21573 // Of two shuffles, one is less defined if it has the same vector operands
21574 // and its mask indices are the same as in the other one or are undefs. E.g.
21575 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
21576 // poison, <0, 0, 0, 0>.
21577 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
21578 Instruction *I2,
21579 SmallVectorImpl<int> &NewMask) {
21580 if (I1->getType() != I2->getType())
21581 return false;
21582 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
21583 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
21584 if (!SI1 || !SI2)
21585 return I1->isIdenticalTo(I2);
21586 if (SI1->isIdenticalTo(SI2))
21587 return true;
21588 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
21589 if (SI1->getOperand(I) != SI2->getOperand(I))
21590 return false;
21591 // Check if the second instruction is more defined than the first one.
21592 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
21593 ArrayRef<int> SM1 = SI1->getShuffleMask();
21594 // Count trailing undefs in the mask to check the final number of used
21595 // registers.
21596 unsigned LastUndefsCnt = 0;
21597 for (int I = 0, E = NewMask.size(); I < E; ++I) {
21598 if (SM1[I] == PoisonMaskElem)
21599 ++LastUndefsCnt;
21600 else
21601 LastUndefsCnt = 0;
21602 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
21603 NewMask[I] != SM1[I])
21604 return false;
21605 if (NewMask[I] == PoisonMaskElem)
21606 NewMask[I] = SM1[I];
21607 }
21608 // Check if the last undefs actually change the final number of used vector
21609 // registers.
21610 return SM1.size() - LastUndefsCnt > 1 &&
21611 ::getNumberOfParts(*TTI, SI1->getType()) ==
21613 *TTI, getWidenedType(SI1->getType()->getElementType(),
21614 SM1.size() - LastUndefsCnt));
21615 };
21616 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
21617 // instructions. TODO: We can further optimize this scan if we split the
21618 // instructions into different buckets based on the insert lane.
21619 SmallVector<Instruction *, 16> Visited;
21620 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21621 assert(*I &&
21622 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
21623 "Worklist not sorted properly!");
21624 BasicBlock *BB = (*I)->getBlock();
21625 // For all instructions in blocks containing gather sequences:
21626 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
21627 if (isDeleted(&In))
21628 continue;
21629 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
21630 !GatherShuffleExtractSeq.contains(&In))
21631 continue;
21632
21633 // Check if we can replace this instruction with any of the
21634 // visited instructions.
21635 bool Replaced = false;
21636 for (Instruction *&V : Visited) {
21637 SmallVector<int> NewMask;
21638 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
21639 DT->dominates(V->getParent(), In.getParent())) {
21640 In.replaceAllUsesWith(V);
21641 eraseInstruction(&In);
21642 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
21643 if (!NewMask.empty())
21644 SI->setShuffleMask(NewMask);
21645 Replaced = true;
21646 break;
21647 }
21648 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
21649 GatherShuffleExtractSeq.contains(V) &&
21650 IsIdenticalOrLessDefined(V, &In, NewMask) &&
21651 DT->dominates(In.getParent(), V->getParent())) {
21652 In.moveAfter(V);
21653 V->replaceAllUsesWith(&In);
21654 eraseInstruction(V);
21655 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
21656 if (!NewMask.empty())
21657 SI->setShuffleMask(NewMask);
21658 V = &In;
21659 Replaced = true;
21660 break;
21661 }
21662 }
21663 if (!Replaced) {
21664 assert(!is_contained(Visited, &In));
21665 Visited.push_back(&In);
21666 }
21667 }
21668 }
21669 CSEBlocks.clear();
21670 GatherShuffleExtractSeq.clear();
21671}
21672
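// Builds a ScheduleBundle for VL: copyable elements are modeled with
// ScheduleCopyableData, all other schedulable instructions contribute their
// ScheduleData and are registered in ScheduledBundles.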
21673BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
21674 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
21675 auto &BundlePtr =
21676 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
21677 for (Value *V : VL) {
21678 if (S.isNonSchedulable(V))
21679 continue;
21680 auto *I = cast<Instruction>(V);
21681 if (S.isCopyableElement(V)) {
21682 // Add a copyable element model.
21683 ScheduleCopyableData &SD =
21684 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
21685 // Group the instructions to a bundle.
21686 BundlePtr->add(&SD);
21687 continue;
21688 }
21689 ScheduleData *BundleMember = getScheduleData(V);
21690 assert(BundleMember && "no ScheduleData for bundle member "
21691 "(maybe not in same basic block)");
21692 // Group the instructions to a bundle.
21693 BundlePtr->add(BundleMember);
21694 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
21695 BundlePtr.get());
21696 }
21697 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
21698 return *BundlePtr;
21699}
21700
21701 // Groups the instructions into a bundle (which is then a single scheduling
21702 // entity) and schedules instructions until the bundle is ready.
21703std::optional<BoUpSLP::ScheduleBundle *>
21704BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
21705 const InstructionsState &S,
21706 const EdgeInfo &EI) {
21707 // No need to schedule PHIs, insertelement, extractelement and extractvalue
21708 // instructions.
21709 if (isa<PHINode>(S.getMainOp()) ||
21710 isVectorLikeInstWithConstOps(S.getMainOp()))
21711 return nullptr;
21712 // If the parent node is non-schedulable and the current node is copyable, and
21713 // any of the parent's instructions are used across several basic blocks or in
21714 // a bin-op node - cancel scheduling; it may cause wrong def-use deps in the
21715 // analysis, leading to a crash.
21716 // Non-scheduled nodes may not have a related ScheduleData model, which may
21717 // lead to a skipped dep analysis.
21718 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21719 EI.UserTE->doesNotNeedToSchedule() &&
21720 EI.UserTE->getOpcode() != Instruction::PHI &&
21721 any_of(EI.UserTE->Scalars, [](Value *V) {
21722 auto *I = dyn_cast<Instruction>(V);
21723 if (!I || I->hasOneUser())
21724 return false;
21725 for (User *U : I->users()) {
21726 auto *UI = cast<Instruction>(U);
21727 if (isa<BinaryOperator>(UI))
21728 return true;
21729 }
21730 return false;
21731 }))
21732 return std::nullopt;
21733 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21734 EI.UserTE->hasCopyableElements() &&
21735 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21736 all_of(VL, [&](Value *V) {
21737 if (S.isCopyableElement(V))
21738 return true;
21739 return isUsedOutsideBlock(V);
21740 }))
21741 return std::nullopt;
21742 // If any instruction is only used outside the block and its operand is placed
21743 // immediately before it, do not schedule; it may cause a wrong def-use chain.
21744 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
21745 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
21746 return false;
21747 if (isUsedOutsideBlock(V)) {
21748 for (Value *Op : cast<Instruction>(V)->operands()) {
21749 auto *I = dyn_cast<Instruction>(Op);
21750 if (!I)
21751 continue;
21752 return SLP->isVectorized(I) && I->getNextNode() == V;
21753 }
21754 }
21755 return false;
21756 }))
21757 return std::nullopt;
21758 if (S.areInstructionsWithCopyableElements() && EI) {
21759 bool IsNonSchedulableWithParentPhiNode =
21760 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
21761 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
21762 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21763 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21764 if (IsNonSchedulableWithParentPhiNode) {
21765 SmallSet<std::pair<Value *, Value *>, 4> Values;
21766 for (const auto [Idx, V] :
21767 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
21768 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
21769 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
21770 auto *I = dyn_cast<Instruction>(Op);
21771 if (!I || !isCommutative(I))
21772 continue;
21773 if (!Values.insert(std::make_pair(V, Op)).second)
21774 return std::nullopt;
21775 }
21776 }
21777 }
21778 bool HasCopyables = S.areInstructionsWithCopyableElements();
21779 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21780 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21781 // If all operands were replaced by copyables, the operands of this node
21782 // might not be, so we need to recalculate dependencies for schedule data
21783 // replaced by copyable schedule data.
21784 SmallVector<ScheduleData *> ControlDependentMembers;
21785 for (Value *V : VL) {
21786 auto *I = dyn_cast<Instruction>(V);
21787 if (!I || (HasCopyables && S.isCopyableElement(V)))
21788 continue;
21789 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21790 for (const Use &U : I->operands()) {
21791 unsigned &NumOps =
21792 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21793 .first->getSecond();
21794 ++NumOps;
21795 if (auto *Op = dyn_cast<Instruction>(U.get());
21796 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21797 if (ScheduleData *OpSD = getScheduleData(Op);
21798 OpSD && OpSD->hasValidDependencies())
21799 // TODO: investigate how to improve it instead of early exiting.
21800 return std::nullopt;
21801 }
21802 }
21803 }
21804 return nullptr;
21805 }
21806
21807 // Initialize the instruction bundle.
21808 Instruction *OldScheduleEnd = ScheduleEnd;
21809 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21810
21811 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21812 // Clear deps or recalculate the region, if the memory instruction is a
21813 // copyable. It may have memory deps, which must be recalculated.
21814 SmallVector<ScheduleData *> ControlDependentMembers;
21815 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21816 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21817 for (ScheduleEntity *SE : Bundle.getBundle()) {
21818 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21819 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21820 BundleMember && BundleMember->hasValidDependencies()) {
21821 BundleMember->clearDirectDependencies();
21822 if (RegionHasStackSave ||
21824 BundleMember->getInst()))
21825 ControlDependentMembers.push_back(BundleMember);
21826 }
21827 continue;
21828 }
21829 auto *SD = cast<ScheduleData>(SE);
21830 if (SD->hasValidDependencies() &&
21831 (!S.areInstructionsWithCopyableElements() ||
21832 !S.isCopyableElement(SD->getInst())) &&
21833 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21834 EI.UserTE->hasState() &&
21835 (!EI.UserTE->hasCopyableElements() ||
21836 !EI.UserTE->isCopyableElement(SD->getInst())))
21837 SD->clearDirectDependencies();
21838 for (const Use &U : SD->getInst()->operands()) {
21839 unsigned &NumOps =
21840 UserOpToNumOps
21841 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21842 .first->getSecond();
21843 ++NumOps;
21844 if (auto *Op = dyn_cast<Instruction>(U.get());
21845 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21846 *SLP, NumOps)) {
21847 if (ScheduleData *OpSD = getScheduleData(Op);
21848 OpSD && OpSD->hasValidDependencies()) {
21849 OpSD->clearDirectDependencies();
21850 if (RegionHasStackSave ||
21852 ControlDependentMembers.push_back(OpSD);
21853 }
21854 }
21855 }
21856 }
21857 };
21858 // The scheduling region got new instructions at the lower end (or it is a
21859 // new region for the first bundle). This makes it necessary to
21860 // recalculate all dependencies.
21861 // It is seldom that this needs to be done a second time after adding the
21862 // initial bundle to the region.
21863 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21864 for_each(ScheduleDataMap, [&](auto &P) {
21865 if (BB != P.first->getParent())
21866 return;
21867 ScheduleData *SD = P.second;
21868 if (isInSchedulingRegion(*SD))
21869 SD->clearDependencies();
21870 });
21871 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21872 for_each(P.second, [&](ScheduleCopyableData *SD) {
21873 if (isInSchedulingRegion(*SD))
21874 SD->clearDependencies();
21875 });
21876 });
21877 ReSchedule = true;
21878 }
21879 // Check if the bundle data already has deps for copyable elements. In
21880 // this case we need to reset the deps and recalculate them.
21881 if (Bundle && !Bundle.getBundle().empty()) {
21882 if (S.areInstructionsWithCopyableElements() ||
21883 !ScheduleCopyableDataMap.empty())
21884 CheckIfNeedToClearDeps(Bundle);
21885 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21886 << BB->getName() << "\n");
21887 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21888 ControlDependentMembers);
21889 } else if (!ControlDependentMembers.empty()) {
21890 ScheduleBundle Invalid = ScheduleBundle::invalid();
21891 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21892 ControlDependentMembers);
21893 }
21894
21895 if (ReSchedule) {
21896 resetSchedule();
21897 initialFillReadyList(ReadyInsts);
21898 }
21899
21900 // Now try to schedule the new bundle or (if no bundle) just calculate
21901 // dependencies. As soon as the bundle is "ready" it means that there are no
21902 // cyclic dependencies and we can schedule it. Note that it's important that
21903 // we don't "schedule" the bundle yet.
21904 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21905 !ReadyInsts.empty()) {
21906 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21907 assert(Picked->isReady() && "must be ready to schedule");
21908 schedule(*SLP, S, EI, Picked, ReadyInsts);
21909 if (Picked == &Bundle)
21910 break;
21911 }
21912 };
21913
21914 // Make sure that the scheduling region contains all
21915 // instructions of the bundle.
21916 for (Value *V : VL) {
21917 if (S.isNonSchedulable(V))
21918 continue;
21919 if (!extendSchedulingRegion(V, S)) {
21920 // If the scheduling region got new instructions at the lower end (or it
21921 // is a new region for the first bundle), it becomes necessary to
21922 // recalculate all dependencies.
21923 // Otherwise the compiler may crash trying to incorrectly calculate
21924 // dependencies and emit instructions in the wrong order at the actual
21925 // scheduling.
21926 ScheduleBundle Invalid = ScheduleBundle::invalid();
21927 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21928 return std::nullopt;
21929 }
21930 }
21931
21932 bool ReSchedule = false;
21933 for (Value *V : VL) {
21934 if (S.isNonSchedulable(V))
21935 continue;
21936 SmallVector<ScheduleCopyableData *> CopyableData =
21937 getScheduleCopyableData(cast<Instruction>(V));
21938 if (!CopyableData.empty()) {
21939 for (ScheduleCopyableData *SD : CopyableData)
21940 ReadyInsts.remove(SD);
21941 }
21942 ScheduleData *BundleMember = getScheduleData(V);
21943 assert((BundleMember || S.isCopyableElement(V)) &&
21944 "no ScheduleData for bundle member (maybe not in same basic block)");
21945 if (!BundleMember)
21946 continue;
21947
21948 // Make sure we don't leave the pieces of the bundle in the ready list when
21949 // whole bundle might not be ready.
21950 ReadyInsts.remove(BundleMember);
21951 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21952 !Bundles.empty()) {
21953 for (ScheduleBundle *B : Bundles)
21954 ReadyInsts.remove(B);
21955 }
21956
21957 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21958 continue;
21959 // A bundle member was scheduled as a single instruction before and now
21960 // needs to be scheduled as part of the bundle. We just get rid of the
21961 // existing schedule.
21962 // A bundle member may also have had its deps calculated before it became a
21963 // copyable element - in that case we need to reschedule.
21964 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21965 << " was already scheduled\n");
21966 ReSchedule = true;
21967 }
21968
21969 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21970 TryScheduleBundleImpl(ReSchedule, Bundle);
21971 if (!Bundle.isReady()) {
21972 for (ScheduleEntity *BD : Bundle.getBundle()) {
21973 // Copyable data scheduling is just removed.
21974 if (isa<ScheduleCopyableData>(BD))
21975 continue;
21976 if (BD->isReady()) {
21977 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21978 if (Bundles.empty()) {
21979 ReadyInsts.insert(BD);
21980 continue;
21981 }
21982 for (ScheduleBundle *B : Bundles)
21983 if (B->isReady())
21984 ReadyInsts.insert(B);
21985 }
21986 }
21987 ScheduledBundlesList.pop_back();
21988 SmallVector<ScheduleData *> ControlDependentMembers;
21989 for (Value *V : VL) {
21990 if (S.isNonSchedulable(V))
21991 continue;
21992 auto *I = cast<Instruction>(V);
21993 if (S.isCopyableElement(I)) {
21994 // Remove the copyable data from the scheduling region and restore
21995 // previous mappings.
21996 auto KV = std::make_pair(EI, I);
21997 assert(ScheduleCopyableDataMap.contains(KV) &&
21998 "no ScheduleCopyableData for copyable element");
21999 ScheduleCopyableData *SD =
22000 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
22001 ScheduleCopyableDataMapByUsers[I].remove(SD);
22002 if (EI.UserTE) {
22003 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22004 const auto *It = find(Op, I);
22005 assert(It != Op.end() && "Lane not set");
22006 SmallPtrSet<Instruction *, 4> Visited;
22007 do {
22008 int Lane = std::distance(Op.begin(), It);
22009 assert(Lane >= 0 && "Lane not set");
22010 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22011 !EI.UserTE->ReorderIndices.empty())
22012 Lane = EI.UserTE->ReorderIndices[Lane];
22013 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22014 "Couldn't find extract lane");
22015 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22016 if (!Visited.insert(In).second) {
22017 It = find(make_range(std::next(It), Op.end()), I);
22018 break;
22019 }
22020 ScheduleCopyableDataMapByInstUser
22021 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
22022 .pop_back();
22023 It = find(make_range(std::next(It), Op.end()), I);
22024 } while (It != Op.end());
22025 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
22026 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
22027 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
22028 }
22029 if (ScheduleCopyableDataMapByUsers[I].empty())
22030 ScheduleCopyableDataMapByUsers.erase(I);
22031 ScheduleCopyableDataMap.erase(KV);
22032 // Need to recalculate dependencies for the actual schedule data.
22033 if (ScheduleData *OpSD = getScheduleData(I);
22034 OpSD && OpSD->hasValidDependencies()) {
22035 OpSD->clearDirectDependencies();
22036 if (RegionHasStackSave ||
22038 ControlDependentMembers.push_back(OpSD);
22039 }
22040 continue;
22041 }
22042 ScheduledBundles.find(I)->getSecond().pop_back();
22043 }
22044 if (!ControlDependentMembers.empty()) {
22045 ScheduleBundle Invalid = ScheduleBundle::invalid();
22046 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
22047 ControlDependentMembers);
22048 }
22049 return std::nullopt;
22050 }
22051 return &Bundle;
22052}
22053
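// ScheduleData objects are pool-allocated in fixed-size chunks; a new chunk is
// started once the current one is exhausted.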
22054BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22055 // Allocate a new ScheduleData for the instruction.
22056 if (ChunkPos >= ChunkSize) {
22057 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
22058 ChunkPos = 0;
22059 }
22060 return &(ScheduleDataChunks.back()[ChunkPos++]);
22061}
22062
22063bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
22064 Value *V, const InstructionsState &S) {
22065 auto *I = dyn_cast<Instruction>(V);
22066 assert(I && "bundle member must be an instruction");
22067 if (getScheduleData(I))
22068 return true;
22069 if (!ScheduleStart) {
22070 // It's the first instruction in the new region.
22071 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
22072 ScheduleStart = I;
22073 ScheduleEnd = I->getNextNode();
22074 assert(ScheduleEnd && "tried to vectorize a terminator?");
22075 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
22076 return true;
22077 }
22078 // Search up and down at the same time, because we don't know if the new
22079 // instruction is above or below the existing scheduling region.
22080 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
22081 // counted against the budget. Otherwise debug info could affect codegen.
22082 BasicBlock::reverse_iterator UpIter =
22083 ++ScheduleStart->getIterator().getReverse();
22084 BasicBlock::reverse_iterator UpperEnd = BB->rend();
22085 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
22086 BasicBlock::iterator LowerEnd = BB->end();
22087 auto IsAssumeLikeIntr = [](const Instruction &I) {
22088 if (auto *II = dyn_cast<IntrinsicInst>(&I))
22089 return II->isAssumeLikeIntrinsic();
22090 return false;
22091 };
22092 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22093 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22094 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
22095 &*DownIter != I) {
22096 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
22097 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
22098 return false;
22099 }
22100
22101 ++UpIter;
22102 ++DownIter;
22103
22104 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22105 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22106 }
22107 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
22108 assert(I->getParent() == ScheduleStart->getParent() &&
22109 "Instruction is in wrong basic block.");
22110 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
22111 ScheduleStart = I;
22112 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
22113 << "\n");
22114 return true;
22115 }
22116 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
22117 "Expected to reach top of the basic block or instruction down the "
22118 "lower end.");
22119 assert(I->getParent() == ScheduleEnd->getParent() &&
22120 "Instruction is in wrong basic block.");
22121 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
22122 nullptr);
22123 ScheduleEnd = I->getNextNode();
22124 assert(ScheduleEnd && "tried to vectorize a terminator?");
22125 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
22126 return true;
22127}
22128
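// Creates and initializes ScheduleData for every non-PHI instruction in
// [FromI, ToI) and links memory-accessing instructions into the region's
// load/store chain between PrevLoadStore and NextLoadStore.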
22129void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
22130 Instruction *ToI,
22131 ScheduleData *PrevLoadStore,
22132 ScheduleData *NextLoadStore) {
22133 ScheduleData *CurrentLoadStore = PrevLoadStore;
22134 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
22135 // No need to allocate data for non-schedulable instructions.
22136 if (isa<PHINode>(I))
22137 continue;
22138 ScheduleData *SD = ScheduleDataMap.lookup(I);
22139 if (!SD) {
22140 SD = allocateScheduleDataChunks();
22141 ScheduleDataMap[I] = SD;
22142 }
22143 assert(!isInSchedulingRegion(*SD) &&
22144 "new ScheduleData already in scheduling region");
22145 SD->init(SchedulingRegionID, I);
22146
22147 auto CanIgnoreLoad = [](const Instruction *I) {
22148 const auto *LI = dyn_cast<LoadInst>(I);
22149 // If there is a simple load marked as invariant, we can ignore it.
22150 // But, in the (unlikely) case of non-simple invariant load,
22151 // we should not ignore it.
22152 return LI && LI->isSimple() &&
22153 LI->getMetadata(LLVMContext::MD_invariant_load);
22154 };
22155
22156 if (I->mayReadOrWriteMemory() &&
22157 // Simple InvariantLoad does not depend on other memory accesses.
22158 !CanIgnoreLoad(I) &&
22159 (!isa<IntrinsicInst>(I) ||
22160 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
22161 cast<IntrinsicInst>(I)->getIntrinsicID() !=
22162 Intrinsic::pseudoprobe))) {
22163 // Update the linked list of memory accessing instructions.
22164 if (CurrentLoadStore) {
22165 CurrentLoadStore->setNextLoadStore(SD);
22166 } else {
22167 FirstLoadStoreInRegion = SD;
22168 }
22169 CurrentLoadStore = SD;
22170 }
22171
22172 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22173 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22174 RegionHasStackSave = true;
22175 }
22176 if (NextLoadStore) {
22177 if (CurrentLoadStore)
22178 CurrentLoadStore->setNextLoadStore(NextLoadStore);
22179 } else {
22180 LastLoadStoreInRegion = CurrentLoadStore;
22181 }
22182}
22183
22184void BoUpSLP::BlockScheduling::calculateDependencies(
22185 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
22186 ArrayRef<ScheduleData *> ControlDeps) {
22187 SmallVector<ScheduleEntity *> WorkList;
22188 auto ProcessNode = [&](ScheduleEntity *SE) {
22189 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
22190 if (CD->hasValidDependencies())
22191 return;
22192 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
22193 CD->initDependencies();
22194 CD->resetUnscheduledDeps();
22195 const EdgeInfo &EI = CD->getEdgeInfo();
22196 if (EI.UserTE) {
22197 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22198 const auto *It = find(Op, CD->getInst());
22199 assert(It != Op.end() && "Lane not set");
22200 SmallPtrSet<Instruction *, 4> Visited;
22201 do {
22202 int Lane = std::distance(Op.begin(), It);
22203 assert(Lane >= 0 && "Lane not set");
22204 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22205 !EI.UserTE->ReorderIndices.empty())
22206 Lane = EI.UserTE->ReorderIndices[Lane];
22207 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22208 "Couldn't find extract lane");
22209 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22210 if (EI.UserTE->isCopyableElement(In)) {
22211 // We may not have related copyable scheduling data if the
22212 // instruction is non-schedulable.
22213 if (ScheduleCopyableData *UseSD =
22214 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
22215 CD->incDependencies();
22216 if (!UseSD->isScheduled())
22217 CD->incrementUnscheduledDeps(1);
22218 if (!UseSD->hasValidDependencies() ||
22219 (InsertInReadyList && UseSD->isReady()))
22220 WorkList.push_back(UseSD);
22221 }
22222 } else if (Visited.insert(In).second) {
22223 if (ScheduleData *UseSD = getScheduleData(In)) {
22224 CD->incDependencies();
22225 if (!UseSD->isScheduled())
22226 CD->incrementUnscheduledDeps(1);
22227 if (!UseSD->hasValidDependencies() ||
22228 (InsertInReadyList && UseSD->isReady()))
22229 WorkList.push_back(UseSD);
22230 }
22231 }
22232 It = find(make_range(std::next(It), Op.end()), CD->getInst());
22233 } while (It != Op.end());
22234 if (CD->isReady() && CD->getDependencies() == 0 &&
22235 (EI.UserTE->hasState() &&
22236 (EI.UserTE->getMainOp()->getParent() !=
22237 CD->getInst()->getParent() ||
22238 (isa<PHINode>(EI.UserTE->getMainOp()) &&
22239 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
22240 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
22241 auto *IU = dyn_cast<Instruction>(U);
22242 if (!IU)
22243 return true;
22244 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
22245 })))))) {
22246 // If no uses in the block - mark as having pseudo-use, which cannot
22247 // be scheduled.
22248 // Prevents incorrect def-use tracking between external user and
22249 // actual instruction.
22250 CD->incDependencies();
22251 CD->incrementUnscheduledDeps(1);
22252 }
22253 }
22254 return;
22255 }
22256 auto *BundleMember = cast<ScheduleData>(SE);
22257 if (BundleMember->hasValidDependencies())
22258 return;
22259 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
22260 BundleMember->initDependencies();
22261 BundleMember->resetUnscheduledDeps();
22262 // Handle def-use chain dependencies.
22263 SmallDenseMap<Value *, unsigned> UserToNumOps;
22264 for (User *U : BundleMember->getInst()->users()) {
22265 if (isa<PHINode>(U))
22266 continue;
22267 if (ScheduleData *UseSD = getScheduleData(U)) {
22268 // The operand is a copyable element - skip.
22269 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
22270 ++NumOps;
22271 if (areAllOperandsReplacedByCopyableData(
22272 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
22273 continue;
22274 BundleMember->incDependencies();
22275 if (!UseSD->isScheduled())
22276 BundleMember->incrementUnscheduledDeps(1);
22277 if (!UseSD->hasValidDependencies() ||
22278 (InsertInReadyList && UseSD->isReady()))
22279 WorkList.push_back(UseSD);
22280 }
22281 }
22282 for (ScheduleCopyableData *UseSD :
22283 getScheduleCopyableDataUsers(BundleMember->getInst())) {
22284 BundleMember->incDependencies();
22285 if (!UseSD->isScheduled())
22286 BundleMember->incrementUnscheduledDeps(1);
22287 if (!UseSD->hasValidDependencies() ||
22288 (InsertInReadyList && UseSD->isReady()))
22289 WorkList.push_back(UseSD);
22290 }
22291
22292 SmallPtrSet<const Instruction *, 4> Visited;
22293 auto MakeControlDependent = [&](Instruction *I) {
22294 // Do not mark control dependent twice.
22295 if (!Visited.insert(I).second)
22296 return;
22297 auto *DepDest = getScheduleData(I);
22298 assert(DepDest && "must be in schedule window");
22299 DepDest->addControlDependency(BundleMember);
22300 BundleMember->incDependencies();
22301 if (!DepDest->isScheduled())
22302 BundleMember->incrementUnscheduledDeps(1);
22303 if (!DepDest->hasValidDependencies() ||
22304 (InsertInReadyList && DepDest->isReady()))
22305 WorkList.push_back(DepDest);
22306 };
22307
22308 // Any instruction which isn't safe to speculate at the beginning of the
22309 // block is control dependent on any early exit or non-willreturn call
22310 // which precedes it.
22311 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
22312 for (Instruction *I = BundleMember->getInst()->getNextNode();
22313 I != ScheduleEnd; I = I->getNextNode()) {
22314 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
22315 continue;
22316
22317 // Add the dependency
22318 MakeControlDependent(I);
22319
22320 if (!isGuaranteedToTransferExecutionToSuccessor(I))
22321 // Everything past here must be control dependent on I.
22322 break;
22323 }
22324 }
22325
22326 if (RegionHasStackSave) {
22327 // If we have an inalloca alloca instruction, it needs to be scheduled
22328 // after any preceding stacksave. We also need to prevent any alloca
22329 // from reordering above a preceding stackrestore.
22330 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
22331 match(BundleMember->getInst(),
22332 m_Intrinsic<Intrinsic::stackrestore>())) {
22333 for (Instruction *I = BundleMember->getInst()->getNextNode();
22334 I != ScheduleEnd; I = I->getNextNode()) {
22335 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22336 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22337 // Any allocas past here must be control dependent on I, and I
22338 // must be memory dependent on BundleMember->Inst.
22339 break;
22340
22341 if (!isa<AllocaInst>(I))
22342 continue;
22343
22344 // Add the dependency
22345 MakeControlDependent(I);
22346 }
22347 }
22348
22349 // In addition to the cases handled just above, we need to prevent
22350 // allocas and loads/stores from moving below a stacksave or a
22351 // stackrestore. Avoiding moving allocas below a stackrestore is currently
22352 // thought to be conservatism. Moving loads/stores below a stackrestore
22353 // can lead to incorrect code.
22354 if (isa<AllocaInst>(BundleMember->getInst()) ||
22355 BundleMember->getInst()->mayReadOrWriteMemory()) {
22356 for (Instruction *I = BundleMember->getInst()->getNextNode();
22357 I != ScheduleEnd; I = I->getNextNode()) {
22358 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
22359 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22360 continue;
22361
22362 // Add the dependency
22363 MakeControlDependent(I);
22364 break;
22365 }
22366 }
22367 }
22368
22369 // Handle the memory dependencies (if any).
22370 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
22371 if (!NextLoadStore)
22372 return;
22373 Instruction *SrcInst = BundleMember->getInst();
22374 assert(SrcInst->mayReadOrWriteMemory() &&
22375 "NextLoadStore list for non memory effecting bundle?");
22376 MemoryLocation SrcLoc = getLocation(SrcInst);
22377 bool SrcMayWrite = SrcInst->mayWriteToMemory();
22378 unsigned NumAliased = 0;
22379 unsigned DistToSrc = 1;
22380 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
22381
22382 for (ScheduleData *DepDest = NextLoadStore; DepDest;
22383 DepDest = DepDest->getNextLoadStore()) {
22384 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
22385
22386 // We have two limits to reduce the complexity:
22387 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
22388 // SLP->isAliased (which is the expensive part in this loop).
22389 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
22390 // the whole loop (even if the loop is fast, it's quadratic).
22391 // It's important for the loop break condition (see below) to
22392 // check this limit even between two read-only instructions.
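// As a rough illustration (the numbers here are only examples; the real
// limits are cl::opt defaults defined earlier in this file): with, say,
// AliasedCheckLimit = 10 and MaxMemDepDistance = 160, once ten aliasing
// dependencies have been recorded for SrcInst, further may-write pairs are
// treated as dependent without calling SLP->isAliased() at all, and once
// DepDest is 160 or more instructions away every remaining instruction in the
// list gets a dependency regardless of aliasing.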
22393 if (DistToSrc >= MaxMemDepDistance ||
22394 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
22395 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
22396 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
22397
22398 // We increment the counter only if the locations are aliased
22399 // (instead of counting all alias checks). This gives a better
22400 // balance between reduced runtime and accurate dependencies.
22401 NumAliased++;
22402
22403 DepDest->addMemoryDependency(BundleMember);
22404 BundleMember->incDependencies();
22405 if (!DepDest->isScheduled())
22406 BundleMember->incrementUnscheduledDeps(1);
22407 if (!DepDest->hasValidDependencies() ||
22408 (InsertInReadyList && DepDest->isReady()))
22409 WorkList.push_back(DepDest);
22410 }
22411
22412 // Example, explaining the loop break condition: Let's assume our
22413 // starting instruction is i0 and MaxMemDepDistance = 3.
22414 //
22415 // +--------v--v--v
22416 // i0,i1,i2,i3,i4,i5,i6,i7,i8
22417 // +--------^--^--^
22418 //
22419 // MaxMemDepDistance let us stop alias-checking at i3 and we add
22420 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
22421 // Previously we already added dependencies from i3 to i6,i7,i8
22422 // (because of MaxMemDepDistance). As we added a dependency from
22423 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
22424 // and we can abort this loop at i6.
22425 if (DistToSrc >= 2 * MaxMemDepDistance)
22426 break;
22427 DistToSrc++;
22428 }
22429 };
22430
22431 assert((Bundle || !ControlDeps.empty()) &&
22432 "expected at least one instruction to schedule");
22433 if (Bundle)
22434 WorkList.push_back(Bundle.getBundle().front());
22435 WorkList.append(ControlDeps.begin(), ControlDeps.end());
22436 SmallPtrSet<ScheduleBundle *, 16> Visited;
22437 while (!WorkList.empty()) {
22438 ScheduleEntity *SD = WorkList.pop_back_val();
22439 SmallVector<ScheduleBundle *, 1> CopyableBundle;
22440 ArrayRef<ScheduleBundle *> Bundles;
22441 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
22442 CopyableBundle.push_back(&CD->getBundle());
22443 Bundles = CopyableBundle;
22444 } else {
22445 Bundles = getScheduleBundles(SD->getInst());
22446 }
22447 if (Bundles.empty()) {
22448 if (!SD->hasValidDependencies())
22449 ProcessNode(SD);
22450 if (InsertInReadyList && SD->isReady()) {
22451 ReadyInsts.insert(SD);
22452 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
22453 }
22454 continue;
22455 }
22456 for (ScheduleBundle *Bundle : Bundles) {
22457 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
22458 continue;
22459 assert(isInSchedulingRegion(*Bundle) &&
22460 "ScheduleData not in scheduling region");
22461 for_each(Bundle->getBundle(), ProcessNode);
22462 }
22463 if (InsertInReadyList && SD->isReady()) {
22464 for (ScheduleBundle *Bundle : Bundles) {
22465 assert(isInSchedulingRegion(*Bundle) &&
22466 "ScheduleData not in scheduling region");
22467 if (!Bundle->isReady())
22468 continue;
22469 ReadyInsts.insert(Bundle);
22470 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
22471 << "\n");
22472 }
22473 }
22474 }
22475}
22476
22477void BoUpSLP::BlockScheduling::resetSchedule() {
22478 assert(ScheduleStart &&
22479 "tried to reset schedule on block which has not been scheduled");
22480 for_each(ScheduleDataMap, [&](auto &P) {
22481 if (BB != P.first->getParent())
22482 return;
22483 ScheduleData *SD = P.second;
22484 if (isInSchedulingRegion(*SD)) {
22485 SD->setScheduled(/*Scheduled=*/false);
22486 SD->resetUnscheduledDeps();
22487 }
22488 });
22489 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
22490 for_each(P.second, [&](ScheduleCopyableData *SD) {
22491 if (isInSchedulingRegion(*SD)) {
22492 SD->setScheduled(/*Scheduled=*/false);
22493 SD->resetUnscheduledDeps();
22494 }
22495 });
22496 });
22497 for_each(ScheduledBundles, [&](auto &P) {
22498 for_each(P.second, [&](ScheduleBundle *Bundle) {
22499 if (isInSchedulingRegion(*Bundle))
22500 Bundle->setScheduled(/*Scheduled=*/false);
22501 });
22502 });
22503 // Reset schedule data for copyable elements.
22504 for (auto &P : ScheduleCopyableDataMap) {
22505 if (isInSchedulingRegion(*P.second)) {
22506 P.second->setScheduled(/*Scheduled=*/false);
22507 P.second->resetUnscheduledDeps();
22508 }
22509 }
22510 ReadyInsts.clear();
22511}
22512
22513void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
22514 if (!BS->ScheduleStart)
22515 return;
22516
22517 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
22518
22519 // A key point - if we got here, pre-scheduling was able to find a valid
22520 // scheduling of the sub-graph of the scheduling window which consists
22521 // of all vector bundles and their transitive users. As such, we do not
22522 // need to reschedule anything *outside of* that subgraph.
22523
22524 BS->resetSchedule();
22525
22526 // For the real scheduling we use a more sophisticated ready-list: it is
22527 // sorted by the original instruction location. This lets the final schedule
22528 // be as close as possible to the original instruction order.
22529 // WARNING: If changing this order causes a correctness issue, that means
22530 // there is some missing dependence edge in the schedule data graph.
22531 struct ScheduleDataCompare {
22532 bool operator()(const ScheduleEntity *SD1,
22533 const ScheduleEntity *SD2) const {
22534 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
22535 }
22536 };
22537 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
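// To illustrate the comparator above: priorities are assigned below in
// increasing block order (Idx counts up from ScheduleStart), and the set
// sorts by descending priority, so *ReadyInsts.begin() is the ready entity
// that sits lowest in the original block. Because scheduled instructions are
// placed bottom-up just before LastScheduledInst, picking the lowest-placed
// ready entity first keeps the final order close to the original one.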
22538
22539 // Ensure that all dependency data is updated (for nodes in the sub-graph)
22540 // and fill the ready-list with initial instructions.
22541 int Idx = 0;
22542 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22543 I = I->getNextNode()) {
22544 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22545 if (!Bundles.empty()) {
22546 for (ScheduleBundle *Bundle : Bundles) {
22547 Bundle->setSchedulingPriority(Idx++);
22548 if (!Bundle->hasValidDependencies())
22549 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
22550 }
22551 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
22552 for (ScheduleCopyableData *SD : reverse(SDs)) {
22553 ScheduleBundle &Bundle = SD->getBundle();
22554 Bundle.setSchedulingPriority(Idx++);
22555 if (!Bundle.hasValidDependencies())
22556 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22557 }
22558 continue;
22559 }
22560 SmallVector<ScheduleCopyableData *> CopyableData =
22561 BS->getScheduleCopyableDataUsers(I);
22562 if (ScheduleData *SD = BS->getScheduleData(I)) {
22563 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
22564 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
22565 SDTEs.front()->doesNotNeedToSchedule() ||
22567 "scheduler and vectorizer bundle mismatch");
22568 SD->setSchedulingPriority(Idx++);
22569 if (!SD->hasValidDependencies() &&
22570 (!CopyableData.empty() ||
22571 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
22572 assert(TE->isGather() && "expected gather node");
22573 return TE->hasState() && TE->hasCopyableElements() &&
22574 TE->isCopyableElement(I);
22575 }))) {
22576 // Need to calculate deps for these nodes to correctly handle copyable
22577 // dependencies, even if they were cancelled.
22578 // If the copyable bundle was cancelled, the deps are cleared and need to
22579 // be recalculated.
22580 ScheduleBundle Bundle;
22581 Bundle.add(SD);
22582 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22583 }
22584 }
22585 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
22586 ScheduleBundle &Bundle = SD->getBundle();
22587 Bundle.setSchedulingPriority(Idx++);
22588 if (!Bundle.hasValidDependencies())
22589 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22590 }
22591 }
22592 BS->initialFillReadyList(ReadyInsts);
22593
22594 Instruction *LastScheduledInst = BS->ScheduleEnd;
22595
22596 // Do the "real" scheduling.
22597 SmallPtrSet<Instruction *, 16> Scheduled;
22598 while (!ReadyInsts.empty()) {
22599 auto *Picked = *ReadyInsts.begin();
22600 ReadyInsts.erase(ReadyInsts.begin());
22601
22602 // Move the scheduled instruction(s) to their dedicated places, if not
22603 // there yet.
22604 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
22605 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
22606 Instruction *PickedInst = BundleMember->getInst();
22607 // If the copyable must be scheduled as part of something else, skip it.
22608 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
22609 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
22610 (!IsCopyable && !Scheduled.insert(PickedInst).second))
22611 continue;
22612 if (PickedInst->getNextNode() != LastScheduledInst)
22613 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22614 LastScheduledInst = PickedInst;
22615 }
22616 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
22617 LastScheduledInst);
22618 } else {
22619 auto *SD = cast<ScheduleData>(Picked);
22620 Instruction *PickedInst = SD->getInst();
22621 if (PickedInst->getNextNode() != LastScheduledInst)
22622 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22623 LastScheduledInst = PickedInst;
22624 }
22625 auto Invalid = InstructionsState::invalid();
22626 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
22627 }
22628
22629 // Check that we didn't break any of our invariants.
22630#ifdef EXPENSIVE_CHECKS
22631 BS->verify();
22632#endif
22633
22634#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
22635 // Check that all schedulable entities got scheduled
22636 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22637 I = I->getNextNode()) {
22638 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22639 assert(all_of(Bundles,
22640 [](const ScheduleBundle *Bundle) {
22641 return Bundle->isScheduled();
22642 }) &&
22643 "must be scheduled at this point");
22644 }
22645#endif
22646
22647 // Avoid duplicate scheduling of the block.
22648 BS->ScheduleStart = nullptr;
22649}
22650
22651unsigned BoUpSLP::getVectorElementSize(Value *V) {
22652 // If V is a store, just return the width of the stored value (or value
22653 // truncated just before storing) without traversing the expression tree.
22654 // This is the common case.
22655 if (auto *Store = dyn_cast<StoreInst>(V))
22656 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
22657
22658 if (auto *IEI = dyn_cast<InsertElementInst>(V))
22659 return getVectorElementSize(IEI->getOperand(1));
22660
22661 auto E = InstrElementSize.find(V);
22662 if (E != InstrElementSize.end())
22663 return E->second;
22664
22665 // If V is not a store, we can traverse the expression tree to find loads
22666 // that feed it. The type of the loaded value may indicate a more suitable
22667 // width than V's type. We want to base the vector element size on the width
22668 // of memory operations where possible.
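// For example (hypothetical IR), when V is the i32 add below, the traversal
// reaches the two i16 loads through the zexts, so the element size reported
// for V is expected to be 16 rather than 32:
//   %a = load i16, ptr %p
//   %b = load i16, ptr %q
//   %aw = zext i16 %a to i32
//   %bw = zext i16 %b to i32
//   %v = add i32 %aw, %bw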
22669 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
22670 SmallPtrSet<Instruction *, 16> Visited;
22671 if (auto *I = dyn_cast<Instruction>(V)) {
22672 Worklist.emplace_back(I, I->getParent(), 0);
22673 Visited.insert(I);
22674 }
22675
22676 // Traverse the expression tree in bottom-up order looking for loads. If we
22677 // encounter an instruction we don't yet handle, we give up.
22678 auto Width = 0u;
22679 Value *FirstNonBool = nullptr;
22680 while (!Worklist.empty()) {
22681 auto [I, Parent, Level] = Worklist.pop_back_val();
22682
22683 // We should only be looking at scalar instructions here. If the current
22684 // instruction has a vector type, skip.
22685 auto *Ty = I->getType();
22686 if (isa<VectorType>(Ty))
22687 continue;
22688 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
22689 FirstNonBool = I;
22690 if (Level > RecursionMaxDepth)
22691 continue;
22692
22693 // If the current instruction is a load, update MaxWidth to reflect the
22694 // width of the loaded value.
22695 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
22696 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
22697
22698 // Otherwise, we need to visit the operands of the instruction. We only
22699 // handle the interesting cases from buildTree here. If an operand is an
22700 // instruction we haven't yet visited and from the same basic block as the
22701 // user or the use is a PHI node, we add it to the worklist.
22702 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
22703 BinaryOperator, UnaryOperator>(I)) {
22704 for (Use &U : I->operands()) {
22705 if (auto *J = dyn_cast<Instruction>(U.get()))
22706 if (Visited.insert(J).second &&
22707 (isa<PHINode>(I) || J->getParent() == Parent)) {
22708 Worklist.emplace_back(J, J->getParent(), Level + 1);
22709 continue;
22710 }
22711 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
22712 FirstNonBool = U.get();
22713 }
22714 } else {
22715 break;
22716 }
22717 }
22718
22719 // If we didn't encounter a memory access in the expression tree, or if we
22720 // gave up for some reason, just return the width of V. Otherwise, return the
22721 // maximum width we found.
22722 if (!Width) {
22723 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
22724 V = FirstNonBool;
22725 Width = DL->getTypeSizeInBits(V->getType());
22726 }
22727
22728 for (Instruction *I : Visited)
22729 InstrElementSize[I] = Width;
22730
22731 return Width;
22732}
22733
22734bool BoUpSLP::collectValuesToDemote(
22735 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
22736 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
22737 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
22738 bool &IsProfitableToDemote, bool IsTruncRoot) const {
22739 // We can always demote constants.
22740 if (all_of(E.Scalars, IsaPred<Constant>))
22741 return true;
22742
22743 unsigned OrigBitWidth =
22744 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
22745 if (OrigBitWidth == BitWidth) {
22746 MaxDepthLevel = 1;
22747 return true;
22748 }
22749
22750 // Check if the node was analyzed already and must keep its original bitwidth.
22751 if (NodesToKeepBWs.contains(E.Idx))
22752 return false;
22753
22754 // If the value is not a vectorized instruction in the expression and not used
22755 // by the insertelement instruction and not used in multiple vector nodes, it
22756 // cannot be demoted.
22757 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
22758 if (isa<PoisonValue>(R))
22759 return false;
22760 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22761 });
22762 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
22763 if (isa<PoisonValue>(V))
22764 return true;
22765 if (getTreeEntries(V).size() > 1)
22766 return false;
22767 // For a late shuffle of sext/zext with many uses we need to check the extra
22768 // bit for unsigned values, otherwise we may have incorrect casting for
22769 // reused scalars.
22770 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
22771 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22772 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22773 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22774 return true;
22775 }
22776 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22777 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22778 if (IsSignedNode)
22779 ++BitWidth1;
22780 if (auto *I = dyn_cast<Instruction>(V)) {
22781 APInt Mask = DB->getDemandedBits(I);
22782 unsigned BitWidth2 =
22783 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22784 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22785 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22786 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22787 break;
22788 BitWidth2 *= 2;
22789 }
22790 BitWidth1 = std::min(BitWidth1, BitWidth2);
22791 }
22792 BitWidth = std::max(BitWidth, BitWidth1);
22793 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22794 };
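// A rough worked example for IsPotentiallyTruncated above: for an i32 value
// with 25 known sign bits in a signed node, BitWidth1 = 32 - 25 = 7, bumped
// to 8 for the sign bit; if DemandedBits reports only the low 12 bits as
// used, BitWidth2 = 12 and BitWidth1 stays at min(8, 12) = 8, so the lambda
// reports the value as truncatable because OrigBitWidth (32) >= 2 * BitWidth
// (16).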
22795 auto FinalAnalysis = [&, TTI = TTI]() {
22796 if (!IsProfitableToDemote)
22797 return false;
22798 bool Res = all_of(
22799 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22800 // Demote gathers.
22801 if (Res && E.isGather()) {
22802 if (E.hasState()) {
22803 if (const TreeEntry *SameTE =
22804 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22805 SameTE)
22806 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22807 ToDemote, Visited, NodesToKeepBWs,
22808 MaxDepthLevel, IsProfitableToDemote,
22809 IsTruncRoot)) {
22810 ToDemote.push_back(E.Idx);
22811 return true;
22812 }
22813 }
22814 // Check possible extractelement instruction bases and the final vector
22815 // length.
22816 SmallPtrSet<Value *, 4> UniqueBases;
22817 for (Value *V : E.Scalars) {
22818 auto *EE = dyn_cast<ExtractElementInst>(V);
22819 if (!EE)
22820 continue;
22821 UniqueBases.insert(EE->getVectorOperand());
22822 }
22823 const unsigned VF = E.Scalars.size();
22824 Type *OrigScalarTy = E.Scalars.front()->getType();
22825 if (UniqueBases.size() <= 2 ||
22826 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22827 ::getNumberOfParts(
22828 *TTI,
22829 getWidenedType(
22830 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22831 VF))) {
22832 ToDemote.push_back(E.Idx);
22833 return true;
22834 }
22835 }
22836 return Res;
22837 };
22838 if (E.isGather() || !Visited.insert(&E).second ||
22839 any_of(E.Scalars, [&](Value *V) {
22840 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22841 return isa<InsertElementInst>(U) && !isVectorized(U);
22842 });
22843 }))
22844 return FinalAnalysis();
22845
22846 if (any_of(E.Scalars, [&](Value *V) {
22847 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22848 return isVectorized(U) ||
22849 (E.Idx == 0 && UserIgnoreList &&
22850 UserIgnoreList->contains(U)) ||
22851 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22852 !U->getType()->isScalableTy() &&
22853 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22854 }) && !IsPotentiallyTruncated(V, BitWidth);
22855 }))
22856 return false;
22857
22858 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22859 bool &NeedToExit) {
22860 NeedToExit = false;
22861 unsigned InitLevel = MaxDepthLevel;
22862 for (const TreeEntry *Op : Operands) {
22863 unsigned Level = InitLevel;
22864 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22865 ToDemote, Visited, NodesToKeepBWs, Level,
22866 IsProfitableToDemote, IsTruncRoot)) {
22867 if (!IsProfitableToDemote)
22868 return false;
22869 NeedToExit = true;
22870 if (!FinalAnalysis())
22871 return false;
22872 continue;
22873 }
22874 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22875 }
22876 return true;
22877 };
22878 auto AttemptCheckBitwidth =
22879 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22880 // Try all bitwidth < OrigBitWidth.
22881 NeedToExit = false;
22882 unsigned BestFailBitwidth = 0;
22883 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22884 if (Checker(BitWidth, OrigBitWidth))
22885 return true;
22886 if (BestFailBitwidth == 0 && FinalAnalysis())
22887 BestFailBitwidth = BitWidth;
22888 }
22889 if (BitWidth >= OrigBitWidth) {
22890 if (BestFailBitwidth == 0) {
22891 BitWidth = OrigBitWidth;
22892 return false;
22893 }
22894 MaxDepthLevel = 1;
22895 BitWidth = BestFailBitwidth;
22896 NeedToExit = true;
22897 return true;
22898 }
22899 return false;
22900 };
22901 auto TryProcessInstruction =
22902 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22903 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22904 if (Operands.empty()) {
22905 if (!IsTruncRoot)
22906 MaxDepthLevel = 1;
22907 for (Value *V : E.Scalars)
22908 (void)IsPotentiallyTruncated(V, BitWidth);
22909 } else {
22910 // Several vectorized uses? Check if we can truncate it, otherwise -
22911 // exit.
22912 if (any_of(E.Scalars, [&](Value *V) {
22913 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22914 }))
22915 return false;
22916 bool NeedToExit = false;
22917 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22918 return false;
22919 if (NeedToExit)
22920 return true;
22921 if (!ProcessOperands(Operands, NeedToExit))
22922 return false;
22923 if (NeedToExit)
22924 return true;
22925 }
22926
22927 ++MaxDepthLevel;
22928 // Record the entry that we can demote.
22929 ToDemote.push_back(E.Idx);
22930 return IsProfitableToDemote;
22931 };
22932
22933 if (E.State == TreeEntry::SplitVectorize)
22934 return TryProcessInstruction(
22935 BitWidth,
22936 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22937 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22938
22939 if (E.isAltShuffle()) {
22940 // Combining these opcodes may lead to incorrect analysis, skip for now.
22941 auto IsDangerousOpcode = [](unsigned Opcode) {
22942 switch (Opcode) {
22943 case Instruction::Shl:
22944 case Instruction::AShr:
22945 case Instruction::LShr:
22946 case Instruction::UDiv:
22947 case Instruction::SDiv:
22948 case Instruction::URem:
22949 case Instruction::SRem:
22950 return true;
22951 default:
22952 break;
22953 }
22954 return false;
22955 };
22956 if (IsDangerousOpcode(E.getAltOpcode()))
22957 return FinalAnalysis();
22958 }
22959
22960 switch (E.getOpcode()) {
22961
22962 // We can always demote truncations and extensions. Since truncations can
22963 // seed additional demotion, we save the truncated value.
22964 case Instruction::Trunc:
22965 if (IsProfitableToDemoteRoot)
22966 IsProfitableToDemote = true;
22967 return TryProcessInstruction(BitWidth);
22968 case Instruction::ZExt:
22969 case Instruction::SExt:
22970 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22971 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22972 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22973 return false;
22974 IsProfitableToDemote = true;
22975 return TryProcessInstruction(BitWidth);
22976
22977 // We can demote certain binary operations if we can demote both of their
22978 // operands.
22979 case Instruction::Add:
22980 case Instruction::Sub:
22981 case Instruction::Mul:
22982 case Instruction::And:
22983 case Instruction::Or:
22984 case Instruction::Xor: {
22985 return TryProcessInstruction(
22986 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22987 }
22988 case Instruction::Freeze:
22989 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22990 case Instruction::Shl: {
22991 // If we are truncating the result of this SHL, and if it's a shift of an
22992 // in-range amount, we can always perform a SHL in a smaller type.
22993 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22994 return all_of(E.Scalars, [&](Value *V) {
22995 if (isa<PoisonValue>(V))
22996 return true;
22997 if (E.isCopyableElement(V))
22998 return true;
22999 auto *I = cast<Instruction>(V);
23000 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23001 return AmtKnownBits.getMaxValue().ult(BitWidth);
23002 });
23003 };
23004 return TryProcessInstruction(
23005 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
23006 }
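// In other words, assuming an i32 -> i8 demotion, the rewrite relied upon is
//   trunc i32 (shl i32 %x, %amt) to i8  ==  shl i8 (trunc i32 %x to i8), %amt
// which is only well-defined in the narrow type when %amt is known to be
// smaller than 8, which is exactly what the known-bits check above enforces.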
23007 case Instruction::LShr: {
23008 // If this is a truncate of a logical shr, we can truncate it to a smaller
23009 // lshr iff we know that the bits we would otherwise be shifting in are
23010 // already zeros.
23011 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23012 return all_of(E.Scalars, [&](Value *V) {
23013 if (isa<PoisonValue>(V))
23014 return true;
23015 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23016 if (E.isCopyableElement(V))
23017 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
23018 auto *I = cast<Instruction>(V);
23019 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23020 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23021 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
23022 SimplifyQuery(*DL));
23023 });
23024 };
23025 return TryProcessInstruction(
23026 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23027 LShrChecker);
23028 }
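// For an i32 -> i8 demotion this corresponds to rewriting
//   trunc i32 (lshr i32 %x, %amt) to i8  as  lshr i8 (trunc i32 %x to i8), %amt
// which is safe when %amt < 8 and bits [8, 32) of %x are known to be zero, so
// the bits shifted into the low 8 bits are zero in both forms.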
23029 case Instruction::AShr: {
23030 // If this is a truncate of an arithmetic shr, we can truncate it to a
23031 // smaller ashr iff we know that all the bits from the sign bit of the
23032 // original type and the sign bit of the truncate type are similar.
23033 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23034 return all_of(E.Scalars, [&](Value *V) {
23035 if (isa<PoisonValue>(V))
23036 return true;
23037 auto *I = cast<Instruction>(V);
23038 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23039 unsigned ShiftedBits = OrigBitWidth - BitWidth;
23040 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23041 ShiftedBits <
23042 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23043 });
23044 };
23045 return TryProcessInstruction(
23046 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23047 AShrChecker);
23048 }
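// For an i32 -> i8 demotion this corresponds to rewriting
//   trunc i32 (ashr i32 %x, %amt) to i8  as  ashr i8 (trunc i32 %x to i8), %amt
// which is safe when %amt < 8 and %x has more than 24 sign bits, i.e. the
// bits shifted in from above are just copies of the narrow type's sign bit.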
23049 case Instruction::UDiv:
23050 case Instruction::URem: {
23051 // UDiv and URem can be truncated if all the truncated bits are zero.
23052 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23053 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23054 return all_of(E.Scalars, [&](Value *V) {
23055 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23056 if (E.hasCopyableElements() && E.isCopyableElement(V))
23057 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23058 auto *I = cast<Instruction>(V);
23059 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
23060 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23061 });
23062 };
23063 return TryProcessInstruction(
23064 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
23065 }
23066
23067 // We can demote selects if we can demote their true and false values.
23068 case Instruction::Select: {
23069 return TryProcessInstruction(
23070 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
23071 }
23072
23073 // We can demote phis if we can demote all their incoming operands.
23074 case Instruction::PHI: {
23075 const unsigned NumOps = E.getNumOperands();
23076 SmallVector<const TreeEntry *> Ops(NumOps);
23077 transform(seq<unsigned>(0, NumOps), Ops.begin(),
23078 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
23079
23080 return TryProcessInstruction(BitWidth, Ops);
23081 }
23082
23083 case Instruction::Call: {
23084 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
23085 if (!IC)
23086 break;
23087 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
23088 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
23089 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
23090 break;
23091 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
23092 function_ref<bool(unsigned, unsigned)> CallChecker;
23093 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23094 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23095 return all_of(E.Scalars, [&](Value *V) {
23096 auto *I = cast<Instruction>(V);
23097 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
23098 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23099 return MaskedValueIsZero(I->getOperand(0), Mask,
23100 SimplifyQuery(*DL)) &&
23101 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23102 }
23103 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
23104 "Expected min/max intrinsics only.");
23105 unsigned SignBits = OrigBitWidth - BitWidth;
23106 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23107 unsigned Op0SignBits =
23108 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23109 unsigned Op1SignBits =
23110 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
23111 return SignBits <= Op0SignBits &&
23112 ((SignBits != Op0SignBits &&
23113 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23114 MaskedValueIsZero(I->getOperand(0), Mask,
23115 SimplifyQuery(*DL))) &&
23116 SignBits <= Op1SignBits &&
23117 ((SignBits != Op1SignBits &&
23118 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
23119 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
23120 });
23121 };
23122 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23123 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23124 return all_of(E.Scalars, [&](Value *V) {
23125 auto *I = cast<Instruction>(V);
23126 unsigned SignBits = OrigBitWidth - BitWidth;
23127 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23128 unsigned Op0SignBits =
23129 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23130 return SignBits <= Op0SignBits &&
23131 ((SignBits != Op0SignBits &&
23132 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23133 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
23134 });
23135 };
23136 if (ID != Intrinsic::abs) {
23137 Operands.push_back(getOperandEntry(&E, 1));
23138 CallChecker = CompChecker;
23139 } else {
23140 CallChecker = AbsChecker;
23141 }
23142 InstructionCost BestCost =
23143 std::numeric_limits<InstructionCost::CostType>::max();
23144 unsigned BestBitWidth = BitWidth;
23145 unsigned VF = E.Scalars.size();
23146 // Choose the best bitwidth based on cost estimations.
23147 auto Checker = [&](unsigned BitWidth, unsigned) {
23148 unsigned MinBW = PowerOf2Ceil(BitWidth);
23149 SmallVector<Type *> ArgTys =
23150 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
23151 auto VecCallCosts = getVectorCallCosts(
23152 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
23153 TTI, TLI, ArgTys);
23154 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
23155 if (Cost < BestCost) {
23156 BestCost = Cost;
23157 BestBitWidth = BitWidth;
23158 }
23159 return false;
23160 };
23161 [[maybe_unused]] bool NeedToExit;
23162 (void)AttemptCheckBitwidth(Checker, NeedToExit);
23163 BitWidth = BestBitWidth;
23164 return TryProcessInstruction(BitWidth, Operands, CallChecker);
23165 }
23166
23167 // Otherwise, conservatively give up.
23168 default:
23169 break;
23170 }
23171 MaxDepthLevel = 1;
23172 return FinalAnalysis();
23173}
23174
23175static RecurKind getRdxKind(Value *V);
23176
23177void BoUpSLP::computeMinimumValueSizes() {
23178 // We only attempt to truncate integer expressions.
23179 bool IsStoreOrInsertElt =
23180 VectorizableTree.front()->hasState() &&
23181 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
23182 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
23183 if ((IsStoreOrInsertElt || UserIgnoreList) &&
23184 ExtraBitWidthNodes.size() <= 1 &&
23185 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
23186 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
23187 return;
23188
23189 unsigned NodeIdx = 0;
23190 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
23191 NodeIdx = 1;
23192
23193 // Ensure the roots of the vectorizable tree don't form a cycle.
23194 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
23195 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
23196 "Unexpected tree is graph.");
23197
23198 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
23199 // resize to the final type.
23200 bool IsTruncRoot = false;
23201 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
23202 SmallVector<unsigned> RootDemotes;
23203 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
23204 if (NodeIdx != 0 &&
23205 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23206 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23207 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
23208 IsTruncRoot = true;
23209 RootDemotes.push_back(NodeIdx);
23210 IsProfitableToDemoteRoot = true;
23211 ++NodeIdx;
23212 }
23213
23214 // The reduction was already analyzed and found not profitable - exit.
23215 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
23216 return;
23217
23218 SmallVector<unsigned> ToDemote;
23219 auto ComputeMaxBitWidth =
23220 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
23221 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
23222 ToDemote.clear();
23223 // If the root is a trunc and the next node is a gather/buildvector, keep the
23224 // trunc in scalars, which is free in most cases.
23225 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
23226 !NodesToKeepBWs.contains(E.Idx) &&
23227 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
23228 all_of(E.Scalars, [&](Value *V) {
23229 return V->hasOneUse() || isa<Constant>(V) ||
23230 (!V->hasNUsesOrMore(UsesLimit) &&
23231 none_of(V->users(), [&](User *U) {
23232 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
23233 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23234 if (TEs.empty() || is_contained(TEs, UserTE))
23235 return false;
23236 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23237 SelectInst>(U) ||
23238 isa<SIToFPInst, UIToFPInst>(U) ||
23239 (UserTE->hasState() &&
23240 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23241 SelectInst>(UserTE->getMainOp()) ||
23242 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
23243 return true;
23244 unsigned UserTESz = DL->getTypeSizeInBits(
23245 UserTE->Scalars.front()->getType());
23246 if (all_of(TEs, [&](const TreeEntry *TE) {
23247 auto It = MinBWs.find(TE);
23248 return It != MinBWs.end() &&
23249 It->second.first > UserTESz;
23250 }))
23251 return true;
23252 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
23253 }));
23254 })) {
23255 ToDemote.push_back(E.Idx);
23256 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23257 auto It = MinBWs.find(UserTE);
23258 if (It != MinBWs.end())
23259 return It->second.first;
23260 unsigned MaxBitWidth =
23261 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
23262 MaxBitWidth = bit_ceil(MaxBitWidth);
23263 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23264 MaxBitWidth = 8;
23265 return MaxBitWidth;
23266 }
23267
23268 if (!E.hasState())
23269 return 0u;
23270
23271 unsigned VF = E.getVectorFactor();
23272 Type *ScalarTy = E.Scalars.front()->getType();
23273 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
23274 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
23275 if (!TreeRootIT)
23276 return 0u;
23277
23278 if (any_of(E.Scalars,
23279 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
23280 return 0u;
23281
23282 unsigned NumParts = ::getNumberOfParts(
23283 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
23284
23285 // The maximum bit width required to represent all the values that can be
23286 // demoted without loss of precision. It would be safe to truncate the roots
23287 // of the expression to this width.
23288 unsigned MaxBitWidth = 1u;
23289
23290 // True if the roots can be zero-extended back to their original type,
23291 // rather than sign-extended. We know that if the leading bits are not
23292 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
23293 // True.
23294 // Determine if the sign bit of all the roots is known to be zero. If not,
23295 // IsKnownPositive is set to False.
23296 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
23297 if (isa<PoisonValue>(R))
23298 return true;
23299 KnownBits Known = computeKnownBits(R, *DL);
23300 return Known.isNonNegative();
23301 });
23302
23303 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
23304 E.UserTreeIndex.UserTE->hasState() &&
23305 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
23306 MaxBitWidth =
23307 std::min(DL->getTypeSizeInBits(
23308 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
23309 DL->getTypeSizeInBits(ScalarTy));
23310
23311 // We first check if all the bits of the roots are demanded. If they're not,
23312 // we can truncate the roots to this narrower type.
23313 for (Value *Root : E.Scalars) {
23314 if (isa<PoisonValue>(Root))
23315 continue;
23316 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
23317 TypeSize NumTypeBits =
23318 DL->getTypeSizeInBits(Root->getType()->getScalarType());
23319 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23320 // If we can't prove that the sign bit is zero, we must add one to the
23321 // maximum bit width to account for the unknown sign bit. This preserves
23322 // the existing sign bit so we can safely sign-extend the root back to the
23323 // original type. Otherwise, if we know the sign bit is zero, we will
23324 // zero-extend the root instead.
23325 //
23326 // FIXME: This is somewhat suboptimal, as there will be cases where adding
23327 // one to the maximum bit width will yield a larger-than-necessary
23328 // type. In general, we need to add an extra bit only if we can't
23329 // prove that the upper bit of the original type is equal to the
23330 // upper bit of the proposed smaller type. If these two bits are
23331 // the same (either zero or one) we know that sign-extending from
23332 // the smaller type will result in the same value. Here, since we
23333 // can't yet prove this, we are just making the proposed smaller
23334 // type larger to ensure correctness.
23335 if (!IsKnownPositive)
23336 ++BitWidth1;
23337
23338 auto *I = dyn_cast<Instruction>(Root);
23339 if (!I) {
23340 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
23341 continue;
23342 }
23343 APInt Mask = DB->getDemandedBits(I);
23344 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23345 MaxBitWidth =
23346 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
23347 }
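// As a rough example of the calculation above: for an i32 root with 25 known
// sign bits, BitWidth1 = 32 - 25 = 7, bumped to 8 when the sign bit is not
// known to be zero; if DemandedBits reports only the low 6 bits as live,
// BitWidth2 = 6 and this root contributes min(8, 6) = 6 to MaxBitWidth, which
// the clamp below then raises to the 8-bit floor.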
23348
23349 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23350 MaxBitWidth = 8;
23351
23352 // If the original type is large, but the reduced type does not improve
23353 // register use - ignore it.
23354 if (NumParts > 1 &&
23355 NumParts ==
23356 ::getNumberOfParts(
23357 *TTI, getWidenedType(IntegerType::get(F->getContext(),
23358 bit_ceil(MaxBitWidth)),
23359 VF)))
23360 return 0u;
23361
23362 unsigned Opcode = E.getOpcode();
23363 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
23364 Opcode == Instruction::SExt ||
23365 Opcode == Instruction::ZExt || NumParts > 1;
23366 // Conservatively determine if we can actually truncate the roots of the
23367 // expression. Collect the values that can be demoted in ToDemote and
23368 // additional roots that require investigating in Roots.
23369 DenseSet<const TreeEntry *> Visited;
23370 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
23371 bool NeedToDemote = IsProfitableToDemote;
23372
23373 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
23374 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
23375 NeedToDemote, IsTruncRoot) ||
23376 (MaxDepthLevel <= Limit &&
23377 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
23378 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
23379 DL->getTypeSizeInBits(TreeRootIT) /
23380 DL->getTypeSizeInBits(
23381 E.getMainOp()->getOperand(0)->getType()) >
23382 2)))))
23383 return 0u;
23384 // Round MaxBitWidth up to the next power-of-two.
23385 MaxBitWidth = bit_ceil(MaxBitWidth);
23386
23387 return MaxBitWidth;
23388 };
23389
23390 // If we can truncate the root, we must collect additional values that might
23391 // be demoted as a result. That is, those seeded by truncations we will
23392 // modify.
23393 // Add reduction ops sizes, if any.
23394 if (UserIgnoreList &&
23395 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
23396 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
23397 // x i1> to in)).
23398 if (all_of(*UserIgnoreList,
23399 [](Value *V) {
23400 return isa<PoisonValue>(V) ||
23401 cast<Instruction>(V)->getOpcode() == Instruction::Add;
23402 }) &&
23403 VectorizableTree.front()->State == TreeEntry::Vectorize &&
23404 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
23405 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
23406 Builder.getInt1Ty()) {
23407 ReductionBitWidth = 1;
23408 } else {
23409 for (Value *V : *UserIgnoreList) {
23410 if (isa<PoisonValue>(V))
23411 continue;
23412 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
23413 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
23414 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23415 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
23416 ++BitWidth1;
23417 unsigned BitWidth2 = BitWidth1;
23418 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
23419 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
23420 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23421 }
23422 ReductionBitWidth =
23423 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
23424 }
23425 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
23426 ReductionBitWidth = 8;
23427
23428 ReductionBitWidth = bit_ceil(ReductionBitWidth);
23429 }
23430 }
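// As an illustration of the i1-reduction special case above (hypothetical
// IR): a reduction such as
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// simply counts the set lanes of %m, so with ReductionBitWidth = 1 it can
// later be emitted as
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %c to i32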
23431 bool IsTopRoot = NodeIdx == 0;
23432 while (NodeIdx < VectorizableTree.size() &&
23433 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23434 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23435 RootDemotes.push_back(NodeIdx);
23436 ++NodeIdx;
23437 IsTruncRoot = true;
23438 }
23439 bool IsSignedCmp = false;
23440 if (UserIgnoreList &&
23441 all_of(*UserIgnoreList,
23443 m_SMax(m_Value(), m_Value())))))
23444 IsSignedCmp = true;
23445 while (NodeIdx < VectorizableTree.size()) {
23446 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
23447 unsigned Limit = 2;
23448 if (IsTopRoot &&
23449 ReductionBitWidth ==
23450 DL->getTypeSizeInBits(
23451 VectorizableTree.front()->Scalars.front()->getType()))
23452 Limit = 3;
23453 unsigned MaxBitWidth = ComputeMaxBitWidth(
23454 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23455 IsTruncRoot, IsSignedCmp);
23456 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23457 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23458 ReductionBitWidth = bit_ceil(MaxBitWidth);
23459 else if (MaxBitWidth == 0)
23460 ReductionBitWidth = 0;
23461 }
23462
23463 for (unsigned Idx : RootDemotes) {
23464 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
23465 uint32_t OrigBitWidth =
23466 DL->getTypeSizeInBits(V->getType()->getScalarType());
23467 if (OrigBitWidth > MaxBitWidth) {
23468 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
23469 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23470 }
23471 return false;
23472 }))
23473 ToDemote.push_back(Idx);
23474 }
23475 RootDemotes.clear();
23476 IsTopRoot = false;
23477 IsProfitableToDemoteRoot = true;
23478
23479 if (ExtraBitWidthNodes.empty()) {
23480 NodeIdx = VectorizableTree.size();
23481 } else {
23482 unsigned NewIdx = 0;
23483 do {
23484 NewIdx = *ExtraBitWidthNodes.begin();
23485 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
23486 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23487 NodeIdx = NewIdx;
23488 IsTruncRoot =
23489 NodeIdx < VectorizableTree.size() &&
23490 VectorizableTree[NodeIdx]->UserTreeIndex &&
23491 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23492 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23493 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23494 Instruction::Trunc &&
23495 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23496 IsSignedCmp =
23497 NodeIdx < VectorizableTree.size() &&
23498 VectorizableTree[NodeIdx]->UserTreeIndex &&
23499 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23500 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23501 Instruction::ICmp &&
23502 any_of(
23503 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23504 [&](Value *V) {
23505 auto *IC = dyn_cast<ICmpInst>(V);
23506 return IC && (IC->isSigned() ||
23507 !isKnownNonNegative(IC->getOperand(0),
23508 SimplifyQuery(*DL)) ||
23509 !isKnownNonNegative(IC->getOperand(1),
23510 SimplifyQuery(*DL)));
23511 });
23512 }
23513
23514 // If the maximum bit width we compute is less than the width of the roots'
23515 // type, we can proceed with the narrowing. Otherwise, do nothing.
23516 if (MaxBitWidth == 0 ||
23517 MaxBitWidth >=
23518 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
23519 ->getBitWidth()) {
23520 if (UserIgnoreList)
23521 AnalyzedMinBWVals.insert_range(TreeRoot);
23522 NodesToKeepBWs.insert_range(ToDemote);
23523 continue;
23524 }
23525
23526 // Finally, map the values we can demote to the maximum bit width we
23527 // computed.
23528 for (unsigned Idx : ToDemote) {
23529 TreeEntry *TE = VectorizableTree[Idx].get();
23530 if (MinBWs.contains(TE))
23531 continue;
23532 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
23533 if (isa<PoisonValue>(R))
23534 return false;
23535 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23536 });
23537 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
23538 }
23539 }
23540}
23541
23542PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
23543 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
23544 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
23545 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
23546 auto *AA = &AM.getResult<AAManager>(F);
23547 auto *LI = &AM.getResult<LoopAnalysis>(F);
23548 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
23549 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
23550 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
23551 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
23552
23553 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
23554 if (!Changed)
23555 return PreservedAnalyses::all();
23556
23557 PreservedAnalyses PA;
23558 PA.preserveSet<CFGAnalyses>();
23559 return PA;
23560}
23561
23562bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
23563 TargetTransformInfo *TTI_,
23564 TargetLibraryInfo *TLI_, AAResults *AA_,
23565 LoopInfo *LI_, DominatorTree *DT_,
23566 AssumptionCache *AC_, DemandedBits *DB_,
23567 OptimizationRemarkEmitter *ORE_) {
23568 if (!RunSLPVectorization)
23569 return false;
23570 SE = SE_;
23571 TTI = TTI_;
23572 TLI = TLI_;
23573 AA = AA_;
23574 LI = LI_;
23575 DT = DT_;
23576 AC = AC_;
23577 DB = DB_;
23578 DL = &F.getDataLayout();
23579
23580 Stores.clear();
23581 GEPs.clear();
23582 bool Changed = false;
23583
23584 // If the target claims to have no vector registers don't attempt
23585 // vectorization.
23586 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
23587 LLVM_DEBUG(
23588 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
23589 return false;
23590 }
23591
23592 // Don't vectorize when the attribute NoImplicitFloat is used.
23593 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
23594 return false;
23595
23596 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
23597
23598 // Use the bottom up slp vectorizer to construct chains that start with
23599 // store instructions.
23600 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
23601
23602 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
23603 // delete instructions.
23604
23605 // Update DFS numbers now so that we can use them for ordering.
23606 DT->updateDFSNumbers();
23607
23608 // Scan the blocks in the function in post order.
23609 for (auto *BB : post_order(&F.getEntryBlock())) {
23610 if (!DT->isReachableFromEntry(BB))
23611 continue;
23612
23613 // Start new block - clear the list of reduction roots.
23614 R.clearReductionData();
23615 collectSeedInstructions(BB);
23616
23617 // Vectorize trees that end at stores.
23618 if (!Stores.empty()) {
23619 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
23620 << " underlying objects.\n");
23621 Changed |= vectorizeStoreChains(R);
23622 }
23623
23624 // Vectorize trees that end at reductions.
23625 Changed |= vectorizeChainsInBlock(BB, R);
23626
23627 // Vectorize the index computations of getelementptr instructions. This
23628 // is primarily intended to catch gather-like idioms ending at
23629 // non-consecutive loads.
23630 if (!GEPs.empty()) {
23631 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
23632 << " underlying objects.\n");
23633 Changed |= vectorizeGEPIndices(BB, R);
23634 }
23635 }
23636
23637 if (Changed) {
23638 R.optimizeGatherSequence();
23639 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
23640 }
23641 return Changed;
23642}
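// Illustrative example (not part of the pass): the canonical transformation
// this driver is after. Given four consecutive, same-typed scalar stores such
// as
//   store i32 %a, ptr %p
//   store i32 %b, ptr %p1   ; %p1 = getelementptr i32, ptr %p, i64 1
//   store i32 %c, ptr %p2   ; %p2 = getelementptr i32, ptr %p, i64 2
//   store i32 %d, ptr %p3   ; %p3 = getelementptr i32, ptr %p, i64 3
// a profitable SLP tree rooted at the store chain is rewritten, roughly, into
//   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
//   %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
//   %v3 = insertelement <4 x i32> %v2, i32 %d, i32 3
//   store <4 x i32> %v3, ptr %p
// (if %a..%d are themselves isomorphic instructions, they are vectorized as
// well instead of being gathered with insertelement).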
23643
23644std::optional<bool>
23645SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
23646 unsigned Idx, unsigned MinVF,
23647 unsigned &Size) {
23648 Size = 0;
23649 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
23650 << "\n");
23651 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23652 unsigned VF = Chain.size();
23653
23654 if (!has_single_bit(Sz) ||
23655 !hasFullVectorsOrPowerOf2(
23656 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
23657 VF) ||
23658 VF < 2 || VF < MinVF) {
23659 // Check if vectorizing with a non-power-of-2 VF should be considered. At
23660 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
23661 // all vector lanes are used.
23662 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
23663 return false;
23664 }
23665
23666 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
23667 << "\n");
23668
23669 SetVector<Value *> ValOps;
23670 for (Value *V : Chain)
23671 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
23672 // Exit if the value operands do not form same/alternate-opcode instructions or the number of unique values is not an allowed (power-of-2) vector size.
23673 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
23674 InstructionsState S = Analysis.buildInstructionsState(
23675 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
23676 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
23677 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
23678 bool IsAllowedSize =
23679 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
23680 ValOps.size()) ||
23681 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
23682 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23683 (!S.getMainOp()->isSafeToRemove() ||
23684 any_of(ValOps.getArrayRef(),
23685 [&](Value *V) {
23686 return !isa<ExtractElementInst>(V) &&
23687 (V->getNumUses() > Chain.size() ||
23688 any_of(V->users(), [&](User *U) {
23689 return !Stores.contains(U);
23690 }));
23691 }))) ||
23692 (ValOps.size() > Chain.size() / 2 && !S)) {
23693 Size = (!IsAllowedSize && S) ? 1 : 2;
23694 return false;
23695 }
23696 }
23697 if (R.isLoadCombineCandidate(Chain))
23698 return true;
23699 R.buildTree(Chain);
23700 // Check if the tree is tiny and the store itself or its value is not vectorized.
23701 if (R.isTreeTinyAndNotFullyVectorizable()) {
23702 if (R.isGathered(Chain.front()) ||
23703 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
23704 return std::nullopt;
23705 Size = R.getCanonicalGraphSize();
23706 return false;
23707 }
23708 if (R.isProfitableToReorder()) {
23709 R.reorderTopToBottom();
23710 R.reorderBottomToTop();
23711 }
23712 R.transformNodes();
23713 R.computeMinimumValueSizes();
23714
23715 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
23716 R.buildExternalUses();
23717
23718 Size = R.getCanonicalGraphSize();
23719 if (S && S.getOpcode() == Instruction::Load)
23720 Size = 2; // cut off masked gather small trees
23721 InstructionCost Cost = R.getTreeCost(TreeCost);
23722
23723 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
23724 if (Cost < -SLPCostThreshold) {
23725 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
23726
23727 using namespace ore;
23728
23729 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
23730 cast<StoreInst>(Chain[0]))
23731 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
23732 << " and with tree size "
23733 << NV("TreeSize", R.getTreeSize()));
23734
23735 R.vectorizeTree();
23736 return true;
23737 }
23738
23739 return false;
23740}
23741
23742/// Checks that the tree sizes are tightly clustered around their mean: the variance of the sizes (entries equal to 1 are ignored) must be below Mean * Mean / 96.
23743static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
23744 bool First) {
23745 unsigned Num = 0;
23746 uint64_t Sum = std::accumulate(
23747 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23748 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23749 unsigned Size = First ? Val.first : Val.second;
23750 if (Size == 1)
23751 return V;
23752 ++Num;
23753 return V + Size;
23754 });
23755 if (Num == 0)
23756 return true;
23757 uint64_t Mean = Sum / Num;
23758 if (Mean == 0)
23759 return true;
23760 uint64_t Dev = std::accumulate(
23761 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23762 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23763 unsigned P = First ? Val.first : Val.second;
23764 if (P == 1)
23765 return V;
23766 return V + (P - Mean) * (P - Mean);
23767 }) /
23768 Num;
23769 return Dev * 96 / (Mean * Mean) == 0;
23770}
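// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the pass): a standalone restatement of the
// acceptance criterion implemented above, written over plain sizes instead of
// (first, second) pairs. Sizes equal to 1 mean "no tree was built" and are
// ignored; the remaining sizes pass only when their variance stays below
// Mean * Mean / 96, i.e. when they are tightly clustered around the mean.
// The helper name is hypothetical.
// ---------------------------------------------------------------------------
inline bool treeSizesAreUniform(ArrayRef<unsigned> Sizes) {
  uint64_t Sum = 0, Num = 0;
  for (unsigned S : Sizes) {
    if (S == 1)
      continue;
    Sum += S;
    ++Num;
  }
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (unsigned S : Sizes) {
    if (S == 1)
      continue;
    uint64_t D = S > Mean ? S - Mean : Mean - S;
    Dev += D * D;
  }
  Dev /= Num;
  // E.g. {4, 4, 5, 4} passes (variance rounds to 0), while {2, 16, 2, 16} does not.
  return Dev * 96 / (Mean * Mean) == 0;
}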
23771
23772namespace {
23773
23774/// A group of stores that we'll try to bundle together using vector ops.
23775/// They are ordered using the signed distance of their address operand to the
23776/// address of this group's BaseInstr.
23777class RelatedStoreInsts {
23778public:
23779 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23780 : AllStores(AllStores) {
23781 reset(BaseInstrIdx);
23782 }
23783
23784 void reset(unsigned NewBaseInstr) {
23785 assert(NewBaseInstr < AllStores.size() &&
23786 "Instruction index out of bounds");
23787 BaseInstrIdx = NewBaseInstr;
23788 Instrs.clear();
23789 insertOrLookup(NewBaseInstr, 0);
23790 }
23791
23792 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23793 /// \p PtrDist.
23794 /// Does nothing if there is already a store with that \p PtrDist.
23795 /// \returns The previously associated Instruction index, or std::nullopt
23796 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23797 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23798 return Inserted ? std::nullopt : std::make_optional(It->second);
23799 }
23800
23801 using DistToInstMap = std::map<int64_t, unsigned>;
23802 const DistToInstMap &getStores() const { return Instrs; }
23803
23804 /// If \p SI is related to this group of stores, return the distance of its
23805 /// pointer operand to the one the group's BaseInstr.
23806 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23807 ScalarEvolution &SE) const {
23808 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23809 return getPointersDiff(
23810 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23811 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23812 /*StrictCheck=*/true);
23813 }
23814
23815 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23816 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23817 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23818 int64_t DistFromCurBase) {
23819 DistToInstMap PrevSet = std::move(Instrs);
23820 reset(NewBaseInstIdx);
23821
23822 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23823 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23824 // reference.
23825 for (auto [Dist, InstIdx] : PrevSet) {
23826 if (InstIdx >= MinSafeIdx)
23827 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23828 }
23829 }
23830
23831 /// Remove all stores that have been vectorized from this group.
23832 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23833 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23834 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23835 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23836 });
23837
23838 // Get a forward iterator pointing after the last vectorized store and erase
23839 // all stores before it so we don't try to vectorize them again.
23840 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23841 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23842 }
23843
23844private:
23845 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23846 unsigned BaseInstrIdx;
23847
23848 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23849 DistToInstMap Instrs;
23850
23851 /// Reference to all the stores in the BB being analyzed.
23852 ArrayRef<StoreInst *> AllStores;
23853};
23854
23855} // end anonymous namespace
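// ---------------------------------------------------------------------------
// Usage sketch (not part of the pass): how the distance map above behaves for
// the running example used later in vectorizeStores(). With stores #0..#4
// writing to %p, %p+1, %p+2, %p and %p+3 respectively, and store #4 chosen as
// the base, the signed element distances give the map
//   {-3 -> 3, -2 -> 1, -1 -> 2, 0 -> 4}
// and inserting store #0 (also at distance -3) is rejected and reports the
// previously stored index 3, which is the event that triggers a vectorization
// attempt on the collected group. The function name below is hypothetical.
// ---------------------------------------------------------------------------
inline void demoRelatedStoreGrouping() {
  std::map<int64_t, unsigned> DistToIdx; // Same shape as DistToInstMap.
  DistToIdx.emplace(0, 4u);              // Base store #4 at distance 0.
  DistToIdx.emplace(-3, 3u);             // Store #3: %p relative to %p+3.
  DistToIdx.emplace(-2, 1u);             // Store #1: %p+1 relative to %p+3.
  DistToIdx.emplace(-1, 2u);             // Store #2: %p+2 relative to %p+3.
  auto [It, Inserted] = DistToIdx.emplace(-3, 0u); // Store #0 collides.
  assert(!Inserted && It->second == 3 &&
         "a duplicate distance keeps the earlier store");
  (void)It;
  (void)Inserted;
}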
23856
23857bool SLPVectorizerPass::vectorizeStores(
23858 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23859 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23860 &Visited) {
23861 // We may run into multiple chains that merge into a single chain. We mark the
23862 // stores that we vectorized so that we don't visit the same store twice.
23863 BoUpSLP::ValueSet VectorizedStores;
23864 bool Changed = false;
23865
23866 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23867 int64_t PrevDist = -1;
23868 BoUpSLP::ValueList Operands;
23869 // Collect the chain into a list.
23870 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23871 auto &[Dist, InstIdx] = Data;
23872 if (Operands.empty() || Dist - PrevDist == 1) {
23873 Operands.push_back(Stores[InstIdx]);
23874 PrevDist = Dist;
23875 if (Idx != StoreSeq.size() - 1)
23876 continue;
23877 }
23878 llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
23879 Operands.clear();
23880 Operands.push_back(Stores[InstIdx]);
23881 PrevDist = Dist;
23882 });
23883
23884 if (Operands.size() <= 1 ||
23885 !Visited
23886 .insert({Operands.front(),
23887 cast<StoreInst>(Operands.front())->getValueOperand(),
23888 Operands.back(),
23889 cast<StoreInst>(Operands.back())->getValueOperand(),
23890 Operands.size()})
23891 .second)
23892 continue;
23893
23894 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23895 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23896 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23897
23898 unsigned MaxVF =
23899 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23900 auto *Store = cast<StoreInst>(Operands[0]);
23901 Type *StoreTy = Store->getValueOperand()->getType();
23902 Type *ValueTy = StoreTy;
23903 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23904 ValueTy = Trunc->getSrcTy();
23905 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23906 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23907 // we need to use the element type of StoreTy and ValueTy to retrieve the
23908 // VF and then transform it back.
23909 // Remember: VF is defined as the number we want to vectorize, not the
23910 // number of elements in the final vector.
23911 Type *StoreScalarTy = StoreTy->getScalarType();
23912 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23913 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23914 ValueTy->getScalarType()));
23915 MinVF /= getNumElements(StoreTy);
23916 MinVF = std::max<unsigned>(2, MinVF);
23917
23918 if (MaxVF < MinVF) {
23919 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23920 << ") < "
23921 << "MinVF (" << MinVF << ")\n");
23922 continue;
23923 }
23924
23925 unsigned NonPowerOf2VF = 0;
23926 if (VectorizeNonPowerOf2) {
23927 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23928 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23929 // lanes are used.
23930 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23931 if (has_single_bit(CandVF + 1)) {
23932 NonPowerOf2VF = CandVF;
23933 assert(NonPowerOf2VF != MaxVF &&
23934 "Non-power-of-2 VF should not be equal to MaxVF");
23935 }
23936 }
23937
23938 // MaxRegVF represents the number of instructions (scalar, or vector in
23939 // case of revec) that can be vectorized to naturally fit in a vector
23940 // register.
23941 unsigned MaxRegVF = MaxVF;
23942
23943 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23944 if (MaxVF < MinVF) {
23945 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23946 << ") < "
23947 << "MinVF (" << MinVF << ")\n");
23948 continue;
23949 }
23950
23951 SmallVector<unsigned> CandidateVFs;
23952 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23953 VF = divideCeil(VF, 2))
23954 CandidateVFs.push_back(VF);
23955
23956 unsigned End = Operands.size();
23957 unsigned Repeat = 0;
23958 constexpr unsigned MaxAttempts = 4;
23959 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23960 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23961 P.first = P.second = 1;
23962 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23963 auto IsNotVectorized = [](bool First,
23964 const std::pair<unsigned, unsigned> &P) {
23965 return First ? P.first > 0 : P.second > 0;
23966 };
23967 auto IsVectorized = [](bool First,
23968 const std::pair<unsigned, unsigned> &P) {
23969 return First ? P.first == 0 : P.second == 0;
23970 };
23971 auto VFIsProfitable = [](bool First, unsigned Size,
23972 const std::pair<unsigned, unsigned> &P) {
23973 return First ? Size >= P.first : Size >= P.second;
23974 };
23975 auto FirstSizeSame = [](unsigned Size,
23976 const std::pair<unsigned, unsigned> &P) {
23977 return Size == P.first;
23978 };
23979 while (true) {
23980 ++Repeat;
23981 bool RepeatChanged = false;
23982 bool AnyProfitableGraph = false;
23983 for (unsigned VF : CandidateVFs) {
23984 AnyProfitableGraph = false;
23985 unsigned FirstUnvecStore =
23986 std::distance(RangeSizes.begin(),
23987 find_if(RangeSizes, std::bind(IsNotVectorized,
23988 VF >= MaxRegVF, _1)));
23989
23990 // Form slices of size VF starting from FirstUnvecStore and try to
23991 // vectorize them.
23992 while (FirstUnvecStore < End) {
23993 unsigned FirstVecStore = std::distance(
23994 RangeSizes.begin(),
23995 find_if(RangeSizes.drop_front(FirstUnvecStore),
23996 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23997 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23998 for (unsigned SliceStartIdx = FirstUnvecStore;
23999 SliceStartIdx + VF <= MaxSliceEnd;) {
24000 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
24001 VF >= MaxRegVF)) {
24002 ++SliceStartIdx;
24003 continue;
24004 }
24005 ArrayRef<Value *> Slice =
24006 ArrayRef(Operands).slice(SliceStartIdx, VF);
24007 assert(all_of(Slice,
24008 [&](Value *V) {
24009 return cast<StoreInst>(V)
24010 ->getValueOperand()
24011 ->getType() ==
24012 cast<StoreInst>(Slice.front())
24013 ->getValueOperand()
24014 ->getType();
24015 }) &&
24016 "Expected all operands of same type.");
24017 if (!NonSchedulable.empty()) {
24018 auto [NonSchedSizeMax, NonSchedSizeMin] =
24019 NonSchedulable.lookup(Slice.front());
24020 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24021 // VF is too ambitious. Try to vectorize another slice before
24022 // trying a smaller VF.
24023 SliceStartIdx += NonSchedSizeMax;
24024 continue;
24025 }
24026 }
24027 unsigned TreeSize;
24028 std::optional<bool> Res =
24029 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
24030 if (!Res) {
24031 // Update the range of non schedulable VFs for slices starting
24032 // at SliceStartIdx.
24033 NonSchedulable
24034 .try_emplace(Slice.front(), std::make_pair(VF, VF))
24035 .first->getSecond()
24036 .second = VF;
24037 } else if (*Res) {
24038 // Mark the vectorized stores so that we don't vectorize them
24039 // again.
24040 VectorizedStores.insert_range(Slice);
24043 AnyProfitableGraph = RepeatChanged = Changed = true;
24044 // If we vectorized initial block, no need to try to vectorize
24045 // it again.
24046 for (std::pair<unsigned, unsigned> &P :
24047 RangeSizes.slice(SliceStartIdx, VF))
24048 P.first = P.second = 0;
24049 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24050 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
24051 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
24052 P.first = P.second = 0;
24053 FirstUnvecStore = SliceStartIdx + VF;
24054 }
24055 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24056 for (std::pair<unsigned, unsigned> &P :
24057 RangeSizes.slice(SliceStartIdx + VF,
24058 MaxSliceEnd - (SliceStartIdx + VF)))
24059 P.first = P.second = 0;
24060 if (MaxSliceEnd == End)
24061 End = SliceStartIdx;
24062 MaxSliceEnd = SliceStartIdx;
24063 }
24064 SliceStartIdx += VF;
24065 continue;
24066 }
24067 if (VF > 2 && Res &&
24068 !all_of(RangeSizes.slice(SliceStartIdx, VF),
24069 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
24070 _1))) {
24071 SliceStartIdx += VF;
24072 continue;
24073 }
24074 // For very big VFs, check that we are not rebuilding the same
24075 // trees, just with a larger number of elements.
24076 if (VF > MaxRegVF && TreeSize > 1 &&
24077 all_of(RangeSizes.slice(SliceStartIdx, VF),
24078 std::bind(FirstSizeSame, TreeSize, _1))) {
24079 SliceStartIdx += VF;
24080 while (SliceStartIdx != MaxSliceEnd &&
24081 RangeSizes[SliceStartIdx].first == TreeSize)
24082 ++SliceStartIdx;
24083 continue;
24084 }
24085 if (TreeSize > 1) {
24086 for (std::pair<unsigned, unsigned> &P :
24087 RangeSizes.slice(SliceStartIdx, VF)) {
24088 if (VF >= MaxRegVF)
24089 P.second = std::max(P.second, TreeSize);
24090 else
24091 P.first = std::max(P.first, TreeSize);
24092 }
24093 }
24094 ++SliceStartIdx;
24095 AnyProfitableGraph = true;
24096 }
24097 if (FirstUnvecStore >= End)
24098 break;
24099 if (MaxSliceEnd - FirstUnvecStore < VF &&
24100 MaxSliceEnd - FirstUnvecStore >= MinVF)
24101 AnyProfitableGraph = true;
24102 FirstUnvecStore = std::distance(
24103 RangeSizes.begin(),
24104 find_if(RangeSizes.drop_front(MaxSliceEnd),
24105 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
24106 }
24107 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
24108 break;
24109 }
24110 // All values vectorized - exit.
24111 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
24112 return P.first == 0 && P.second == 0;
24113 }))
24114 break;
24115 // Check if we have tried all attempts or there is no need for any further attempts at all.
24116 if (Repeat >= MaxAttempts ||
24117 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24118 break;
24119 constexpr unsigned StoresLimit = 64;
24120 const unsigned MaxTotalNum = std::min<unsigned>(
24121 Operands.size(),
24122 static_cast<unsigned>(
24123 End -
24124 std::distance(
24125 RangeSizes.begin(),
24126 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
24127 1));
24128 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
24129 unsigned Limit =
24130 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
24131 CandidateVFs.clear();
24132 if (bit_floor(Limit) == VF)
24133 CandidateVFs.push_back(Limit);
24134 if (VF > MaxTotalNum || VF >= StoresLimit)
24135 break;
24136 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
24137 if (P.first != 0)
24138 P.first = std::max(P.second, P.first);
24139 }
24140 // Last attempt to vectorize max number of elements, if all previous
24141 // attempts were unsuccessful because of the cost issues.
24142 CandidateVFs.push_back(VF);
24143 }
24144 }
24145 };
24146
24147 /// Groups of stores to vectorize
24148 SmallVector<RelatedStoreInsts> SortedStores;
24149
24150 // Inserts the specified store SI with the given index Idx to the set of the
24151 // stores. If a store with the same distance is already present, stop the
24152 // insertion and try to vectorize the stores collected so far. If some stores
24153 // from this sequence were not vectorized, try to vectorize them together with
24154 // the new store later, but apply this only to the stores that come before the
24155 // previous store with the same distance.
24156 // Example:
24157 // 1. store x, %p
24158 // 2. store y, %p+1
24159 // 3. store z, %p+2
24160 // 4. store a, %p
24161 // 5. store b, %p+3
24162 // - Scan this from the last to first store. The very first bunch of stores is
24163 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
24164 // vector).
24165 // - The next store in the list - #1 - has the same distance from store #5 as
24166 // the store #4.
24167 // - Try to vectorize sequence of stores 4,2,3,5.
24168 // - If all these stores are vectorized - just drop them.
24169 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
24170 // - Start new stores sequence.
24171 // The new bunch of stores is {1, {1, 0}}.
24172 // - Add the stores from previous sequence, that were not vectorized.
24173 // Here we consider the stores in reverse order relative to how they appear in
24174 // the IR (Stores is already reversed, see the vectorizeStoreChains() function).
24175 // Store #3 can be added -> comes after store #4 with the same distance as
24176 // store #1.
24177 // Store #5 cannot be added - comes before store #4.
24178 // This logic improves compile time: we assume that stores coming after a
24179 // previous store with the same distance most likely have memory dependencies,
24180 // so there is no need to spend compile time trying to vectorize them.
24181 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
24182 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
24183 std::optional<int64_t> PtrDist;
24184 auto *RelatedStores = find_if(
24185 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
24186 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
24187 return PtrDist.has_value();
24188 });
24189
24190 // We did not find a comparable store, start a new group.
24191 if (RelatedStores == SortedStores.end()) {
24192 SortedStores.emplace_back(Idx, Stores);
24193 return;
24194 }
24195
24196 // If there is already a store in the group with the same PtrDiff, try to
24197 // vectorize the existing instructions before adding the current store.
24198 // Otherwise, insert this store and keep collecting.
24199 if (std::optional<unsigned> PrevInst =
24200 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
24201 TryToVectorize(RelatedStores->getStores());
24202 RelatedStores->clearVectorizedStores(VectorizedStores);
24203 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
24204 /*NewBaseInstIdx=*/Idx,
24205 /*DistFromCurBase=*/*PtrDist);
24206 }
24207 };
24208 Type *PrevValTy = nullptr;
24209 for (auto [I, SI] : enumerate(Stores)) {
24210 if (R.isDeleted(SI))
24211 continue;
24212 if (!PrevValTy)
24213 PrevValTy = SI->getValueOperand()->getType();
24214 // Check that we do not try to vectorize stores of different types.
24215 if (PrevValTy != SI->getValueOperand()->getType()) {
24216 for (RelatedStoreInsts &StoreSeq : SortedStores)
24217 TryToVectorize(StoreSeq.getStores());
24218 SortedStores.clear();
24219 PrevValTy = SI->getValueOperand()->getType();
24220 }
24221 FillStoresSet(I, SI);
24222 }
24223
24224 // Final vectorization attempt.
24225 for (RelatedStoreInsts &StoreSeq : SortedStores)
24226 TryToVectorize(StoreSeq.getStores());
24227
24228 return Changed;
24229}
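// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the pass): the shape of the candidate-VF
// ladder tried above. Starting from the largest feasible VF (possibly a
// non-power-of-2 value when VF + 1 is a power of 2), the search keeps halving
// with divideCeil until MinVF is reached; the later "repeat with a larger VF"
// logic and the per-slice bookkeeping are omitted here. The helper name is
// hypothetical; with MinVF = 2 and MaxVF = 16 it returns {16, 8, 4, 2}.
// ---------------------------------------------------------------------------
inline SmallVector<unsigned> demoCandidateVFs(unsigned MinVF, unsigned MaxVF) {
  assert(MinVF >= 2 && MinVF <= MaxVF && "expected a sane VF range");
  SmallVector<unsigned> VFs;
  for (unsigned VF = MaxVF; VF >= MinVF; VF = divideCeil(VF, 2))
    VFs.push_back(VF);
  return VFs;
}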
24230
24231void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24232 // Initialize the collections. We will make a single pass over the block.
24233 Stores.clear();
24234 GEPs.clear();
24235
24236 // Visit the store and getelementptr instructions in BB and organize them in
24237 // Stores and GEPs according to the underlying objects of their pointer
24238 // operands.
24239 for (Instruction &I : *BB) {
24240 // Ignore store instructions that are volatile or have a pointer operand
24241 // that doesn't point to a scalar type.
24242 if (auto *SI = dyn_cast<StoreInst>(&I)) {
24243 if (!SI->isSimple())
24244 continue;
24245 if (!isValidElementType(SI->getValueOperand()->getType()))
24246 continue;
24247 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
24248 }
24249
24250 // Ignore getelementptr instructions that have more than one index, a
24251 // constant index, or a pointer operand that doesn't point to a scalar
24252 // type.
24253 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
24254 if (GEP->getNumIndices() != 1)
24255 continue;
24256 Value *Idx = GEP->idx_begin()->get();
24257 if (isa<Constant>(Idx))
24258 continue;
24259 if (!isValidElementType(Idx->getType()))
24260 continue;
24261 if (GEP->getType()->isVectorTy())
24262 continue;
24263 GEPs[GEP->getPointerOperand()].push_back(GEP);
24264 }
24265 }
24266}
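// ---------------------------------------------------------------------------
// Simplified picture (not part of the pass) of the bucketing performed above:
// seed stores are keyed by the underlying object of their pointer operand, so
// the later chain analysis only compares stores that could ever be
// consecutive. Plain integers stand in for the Value* bases; the helper name
// is hypothetical.
// ---------------------------------------------------------------------------
inline void demoSeedBuckets() {
  std::map<int, SmallVector<unsigned>> Buckets; // base object -> store indices
  const std::pair<int, unsigned> Seeds[] = {
      {/*%A*/ 0, 0}, {/*%B*/ 1, 1}, {/*%A*/ 0, 2}, {/*%A*/ 0, 3}};
  for (const auto &[Base, Idx] : Seeds)
    Buckets[Base].push_back(Idx);
  assert(Buckets[0].size() == 3 && Buckets[1].size() == 1 &&
         "stores into distinct underlying objects land in distinct buckets");
}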
24267
24268bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
24269 bool MaxVFOnly) {
24270 if (VL.size() < 2)
24271 return false;
24272
24273 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
24274 << VL.size() << ".\n");
24275
24276 // Check that all of the parts are instructions of the same type,
24277 // we permit an alternate opcode via InstructionsState.
24278 InstructionsState S = getSameOpcode(VL, *TLI);
24279 if (!S)
24280 return false;
24281
24282 Instruction *I0 = S.getMainOp();
24283 // Make sure invalid types (including vector type) are rejected before
24284 // determining vectorization factor for scalar instructions.
24285 for (Value *V : VL) {
24286 Type *Ty = V->getType();
24287 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
24288 // NOTE: the following will give user internal llvm type name, which may
24289 // not be useful.
24290 R.getORE()->emit([&]() {
24291 std::string TypeStr;
24292 llvm::raw_string_ostream OS(TypeStr);
24293 Ty->print(OS);
24294 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
24295 << "Cannot SLP vectorize list: type "
24296 << TypeStr + " is unsupported by vectorizer";
24297 });
24298 return false;
24299 }
24300 }
24301
24302 Type *ScalarTy = getValueType(VL[0]);
24303 unsigned Sz = R.getVectorElementSize(I0);
24304 unsigned MinVF = R.getMinVF(Sz);
24305 unsigned MaxVF = std::max<unsigned>(
24306 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
24307 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
24308 if (MaxVF < 2) {
24309 R.getORE()->emit([&]() {
24310 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
24311 << "Cannot SLP vectorize list: vectorization factor "
24312 << "less than 2 is not supported";
24313 });
24314 return false;
24315 }
24316
24317 bool Changed = false;
24318 bool CandidateFound = false;
24319 InstructionCost MinCost = SLPCostThreshold.getValue();
24320
24321 unsigned NextInst = 0, MaxInst = VL.size();
24322 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24323 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
24324 // No actual vectorization should happen, if number of parts is the same as
24325 // provided vectorization factor (i.e. the scalar type is used for vector
24326 // code during codegen).
24327 auto *VecTy = getWidenedType(ScalarTy, VF);
24328 if (TTI->getNumberOfParts(VecTy) == VF)
24329 continue;
24330 for (unsigned I = NextInst; I < MaxInst; ++I) {
24331 unsigned ActualVF = std::min(MaxInst - I, VF);
24332
24333 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
24334 continue;
24335
24336 if (MaxVFOnly && ActualVF < MaxVF)
24337 break;
24338 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24339 break;
24340
24341 SmallVector<Value *> Ops(ActualVF, nullptr);
24342 unsigned Idx = 0;
24343 for (Value *V : VL.drop_front(I)) {
24344 // Check that a previous iteration of this loop did not delete the
24345 // Value.
24346 if (auto *Inst = dyn_cast<Instruction>(V);
24347 !Inst || !R.isDeleted(Inst)) {
24348 Ops[Idx] = V;
24349 ++Idx;
24350 if (Idx == ActualVF)
24351 break;
24352 }
24353 }
24354 // Not enough vectorizable instructions - exit.
24355 if (Idx != ActualVF)
24356 break;
24357
24358 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
24359 << "\n");
24360
24361 R.buildTree(Ops);
24362 if (R.isTreeTinyAndNotFullyVectorizable())
24363 continue;
24364 if (R.isProfitableToReorder()) {
24365 R.reorderTopToBottom();
24366 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
24367 }
24368 R.transformNodes();
24369 R.computeMinimumValueSizes();
24370 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
24371 R.buildExternalUses();
24372
24373 InstructionCost Cost = R.getTreeCost(TreeCost);
24374 CandidateFound = true;
24375 MinCost = std::min(MinCost, Cost);
24376
24377 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24378 << " for VF=" << ActualVF << "\n");
24379 if (Cost < -SLPCostThreshold) {
24380 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
24381 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
24383 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
24384 << " and with tree size "
24385 << ore::NV("TreeSize", R.getTreeSize()));
24386
24387 R.vectorizeTree();
24388 // Move to the next bundle.
24389 I += VF - 1;
24390 NextInst = I + 1;
24391 Changed = true;
24392 }
24393 }
24394 }
24395
24396 if (!Changed && CandidateFound) {
24397 R.getORE()->emit([&]() {
24398 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
24399 << "List vectorization was possible but not beneficial with cost "
24400 << ore::NV("Cost", MinCost) << " >= "
24401 << ore::NV("Treshold", -SLPCostThreshold);
24402 });
24403 } else if (!Changed) {
24404 R.getORE()->emit([&]() {
24405 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
24406 << "Cannot SLP vectorize list: vectorization was impossible"
24407 << " with available vectorization factors";
24408 });
24409 }
24410 return Changed;
24411}
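// ---------------------------------------------------------------------------
// Stripped-down model (not part of the pass) of the search loop above: for
// each candidate VF, slide a window of VF values over the list, and on a
// successful vectorization jump past the vectorized bundle. The real loop
// additionally skips deleted instructions, steps the VF down via
// getFloorFullVectorNumberOfElements(), and honors MaxVFOnly; TryVectorize is
// a hypothetical stand-in for buildTree()/getTreeCost()/vectorizeTree().
// ---------------------------------------------------------------------------
template <typename CallableT>
bool demoListSearch(ArrayRef<Value *> VL, unsigned MinVF, unsigned MaxVF,
                    CallableT TryVectorize) {
  assert(MinVF >= 2 && "VFs below 2 are never vectorized");
  bool Changed = false;
  unsigned NextInst = 0;
  for (unsigned VF = MaxVF; NextInst + 1 < VL.size() && VF >= MinVF; VF /= 2) {
    for (unsigned I = NextInst; I + VF <= VL.size(); ++I) {
      if (!TryVectorize(VL.slice(I, VF)))
        continue;
      Changed = true;
      I += VF - 1; // Move to the next bundle.
      NextInst = I + 1;
    }
  }
  return Changed;
}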
24412
24413namespace {
24414
24415/// Model horizontal reductions.
24416///
24417/// A horizontal reduction is a tree of reduction instructions that has values
24418/// that can be put into a vector as its leaves. For example:
24419///
24420/// mul mul mul mul
24421/// \ / \ /
24422/// + +
24423/// \ /
24424/// +
24425/// This tree has "mul" as its leaf values and "+" as its reduction
24426/// instructions. A reduction can feed into a store or a binary operation
24427/// feeding a phi.
24428/// ...
24429/// \ /
24430/// +
24431/// |
24432/// phi +=
24433///
24434/// Or:
24435/// ...
24436/// \ /
24437/// +
24438/// |
24439/// *p =
24440///
24441class HorizontalReduction {
24442 using ReductionOpsType = SmallVector<Value *, 16>;
24443 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24444 ReductionOpsListType ReductionOps;
24445 /// List of possibly reduced values.
24446 SmallVector<SmallVector<Value *>> ReducedVals;
24447 /// Maps reduced value to the corresponding reduction operation.
24448 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24449 WeakTrackingVH ReductionRoot;
24450 /// The type of reduction operation.
24451 RecurKind RdxKind;
24452 /// Checks if the optimization of original scalar identity operations on
24453 /// matched horizontal reductions is enabled and allowed.
24454 bool IsSupportedHorRdxIdentityOp = false;
24455 /// The minimum number of the reduced values.
24456 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
24457 /// Contains vector values for reduction including their scale factor and
24458 /// signedness.
24459 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
24460
24461 static bool isCmpSelMinMax(Instruction *I) {
24462 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
24463 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
24464 }
24465
24466 // And/or are potentially poison-safe logical patterns like:
24467 // select x, y, false
24468 // select x, true, y
24469 static bool isBoolLogicOp(Instruction *I) {
24470 return isa<SelectInst>(I) &&
24471 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
24472 }
24473
24474 /// Checks if instruction is associative and can be vectorized.
24475 static bool isVectorizable(RecurKind Kind, Instruction *I,
24476 bool TwoElementReduction = false) {
24477 if (Kind == RecurKind::None)
24478 return false;
24479
24480 // Integer ops that map to select instructions or intrinsics are fine.
24481 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
24482 isBoolLogicOp(I))
24483 return true;
24484
24485 // No need to check for associativity, if 2 reduced values.
24486 if (TwoElementReduction)
24487 return true;
24488
24489 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24490 // FP min/max are associative except for NaN and -0.0. We do not
24491 // have to rule out -0.0 here because the intrinsic semantics do not
24492 // specify a fixed result for it.
24493 return I->getFastMathFlags().noNaNs();
24494 }
24495
24496 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24497 return true;
24498
24499 return I->isAssociative();
24500 }
24501
24502 static Value *getRdxOperand(Instruction *I, unsigned Index) {
24503 // Poison-safe 'or' takes the form: select X, true, Y
24504 // To make that work with the normal operand processing, we skip the
24505 // true value operand.
24506 // TODO: Change the code and data structures to handle this without a hack.
24507 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
24508 return I->getOperand(2);
24509 return I->getOperand(Index);
24510 }
24511
24512 /// Creates reduction operation with the current opcode.
24513 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
24514 Value *RHS, const Twine &Name, bool UseSelect) {
24515 Type *OpTy = LHS->getType();
24516 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
24517 switch (Kind) {
24518 case RecurKind::Or: {
24519 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24520 return Builder.CreateSelectWithUnknownProfile(
24521 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
24522 RHS, DEBUG_TYPE, Name);
24523 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24524 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24525 Name);
24526 }
24527 case RecurKind::And: {
24528 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24529 return Builder.CreateSelectWithUnknownProfile(
24530 LHS, RHS,
24531 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
24532 DEBUG_TYPE, Name);
24533 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24534 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24535 Name);
24536 }
24537 case RecurKind::Add:
24538 case RecurKind::Mul:
24539 case RecurKind::Xor:
24540 case RecurKind::FAdd:
24541 case RecurKind::FMul: {
24542 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24543 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24544 Name);
24545 }
24546 case RecurKind::SMax:
24547 case RecurKind::SMin:
24548 case RecurKind::UMax:
24549 case RecurKind::UMin:
24550 if (UseSelect) {
24551 CmpInst::Predicate Pred = getMinMaxReductionPredicate(Kind);
24552 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
24553 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
24554 Name);
24555 }
24556 [[fallthrough]];
24557 case RecurKind::FMax:
24558 case RecurKind::FMin:
24559 case RecurKind::FMaximum:
24560 case RecurKind::FMinimum:
24561 case RecurKind::FMaximumNum:
24562 case RecurKind::FMinimumNum: {
24563 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(Kind);
24564 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
24565 }
24566 default:
24567 llvm_unreachable("Unknown reduction operation.");
24568 }
24569 }
24570
24571 /// Creates reduction operation with the current opcode with the IR flags
24572 /// from \p ReductionOps, dropping nuw/nsw flags.
24573 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
24574 Value *RHS, const Twine &Name,
24575 const ReductionOpsListType &ReductionOps) {
24576 bool UseSelect = ReductionOps.size() == 2 ||
24577 // Logical or/and.
24578 (ReductionOps.size() == 1 &&
24579 any_of(ReductionOps.front(), IsaPred<SelectInst>));
24580 assert((!UseSelect || ReductionOps.size() != 2 ||
24581 isa<SelectInst>(ReductionOps[1][0])) &&
24582 "Expected cmp + select pairs for reduction");
24583 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
24584 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
24585 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
24586 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
24587 /*IncludeWrapFlags=*/false);
24588 propagateIRFlags(Op, ReductionOps[1], nullptr,
24589 /*IncludeWrapFlags=*/false);
24590 return Op;
24591 }
24592 }
24593 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
24594 return Op;
24595 }
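 // Illustrative example (not part of the pass): with UseSelect == true the two
 // boolean reductions handled above are emitted in their poison-safe select
 // form rather than as plain bitwise instructions, e.g. for i1 operands
 //   or : %r = select i1 %lhs, i1 true, i1 %rhs
 //   and: %r = select i1 %lhs, i1 %rhs, i1 false
 // which keeps the poison-blocking semantics of the original logical && / ||
 // chains, while integer min/max fall back to an icmp feeding a select when
 // select-based reductions are requested.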
24596
24597public:
24598 static RecurKind getRdxKind(Value *V) {
24599 auto *I = dyn_cast<Instruction>(V);
24600 if (!I)
24601 return RecurKind::None;
24602 if (match(I, m_Add(m_Value(), m_Value())))
24603 return RecurKind::Add;
24604 if (match(I, m_Mul(m_Value(), m_Value())))
24605 return RecurKind::Mul;
24606 if (match(I, m_And(m_Value(), m_Value())) ||
24607 match(I, m_LogicalAnd(m_Value(), m_Value())))
24608 return RecurKind::And;
24609 if (match(I, m_Or(m_Value(), m_Value())) ||
24610 match(I, m_LogicalOr(m_Value(), m_Value())))
24611 return RecurKind::Or;
24612 if (match(I, m_Xor(m_Value(), m_Value())))
24613 return RecurKind::Xor;
24614 if (match(I, m_FAdd(m_Value(), m_Value())))
24615 return RecurKind::FAdd;
24616 if (match(I, m_FMul(m_Value(), m_Value())))
24617 return RecurKind::FMul;
24618
24620 return RecurKind::FMax;
24622 return RecurKind::FMin;
24623
24624 if (match(I, m_FMaximum(m_Value(), m_Value())))
24625 return RecurKind::FMaximum;
24626 if (match(I, m_FMinimum(m_Value(), m_Value())))
24627 return RecurKind::FMinimum;
24628 // This matches either cmp+select or intrinsics. SLP is expected to handle
24629 // either form.
24630 // TODO: If we are canonicalizing to intrinsics, we can remove several
24631 // special-case paths that deal with selects.
24632 if (match(I, m_SMax(m_Value(), m_Value())))
24633 return RecurKind::SMax;
24634 if (match(I, m_SMin(m_Value(), m_Value())))
24635 return RecurKind::SMin;
24636 if (match(I, m_UMax(m_Value(), m_Value())))
24637 return RecurKind::UMax;
24638 if (match(I, m_UMin(m_Value(), m_Value())))
24639 return RecurKind::UMin;
24640
24641 if (auto *Select = dyn_cast<SelectInst>(I)) {
24642 // Try harder: look for min/max pattern based on instructions producing
24643 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
24644 // During the intermediate stages of SLP, it's very common to have
24645 // a pattern like this (since optimizeGatherSequence is run only once
24646 // at the end):
24647 // %1 = extractelement <2 x i32> %a, i32 0
24648 // %2 = extractelement <2 x i32> %a, i32 1
24649 // %cond = icmp sgt i32 %1, %2
24650 // %3 = extractelement <2 x i32> %a, i32 0
24651 // %4 = extractelement <2 x i32> %a, i32 1
24652 // %select = select i1 %cond, i32 %3, i32 %4
24653 CmpPredicate Pred;
24654 Instruction *L1;
24655 Instruction *L2;
24656
24657 Value *LHS = Select->getTrueValue();
24658 Value *RHS = Select->getFalseValue();
24659 Value *Cond = Select->getCondition();
24660
24661 // TODO: Support inverse predicates.
24662 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
24663 if (!isa<ExtractElementInst>(RHS) ||
24664 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24665 return RecurKind::None;
24666 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
24667 if (!isa<ExtractElementInst>(LHS) ||
24668 !L1->isIdenticalTo(cast<Instruction>(LHS)))
24669 return RecurKind::None;
24670 } else {
24671 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
24672 return RecurKind::None;
24673 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
24674 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
24675 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24676 return RecurKind::None;
24677 }
24678
24679 switch (Pred) {
24680 default:
24681 return RecurKind::None;
24682 case CmpInst::ICMP_SGT:
24683 case CmpInst::ICMP_SGE:
24684 return RecurKind::SMax;
24685 case CmpInst::ICMP_SLT:
24686 case CmpInst::ICMP_SLE:
24687 return RecurKind::SMin;
24688 case CmpInst::ICMP_UGT:
24689 case CmpInst::ICMP_UGE:
24690 return RecurKind::UMax;
24691 case CmpInst::ICMP_ULT:
24692 case CmpInst::ICMP_ULE:
24693 return RecurKind::UMin;
24694 }
24695 }
24696 return RecurKind::None;
24697 }
24698
24699 /// Get the index of the first operand.
24700 static unsigned getFirstOperandIndex(Instruction *I) {
24701 return isCmpSelMinMax(I) ? 1 : 0;
24702 }
24703
24704private:
24705 /// Total number of operands in the reduction operation.
24706 static unsigned getNumberOfOperands(Instruction *I) {
24707 return isCmpSelMinMax(I) ? 3 : 2;
24708 }
24709
24710 /// Checks if the instruction is in basic block \p BB.
24711 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
24712 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24713 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24714 auto *Sel = cast<SelectInst>(I);
24715 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
24716 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24717 }
24718 return I->getParent() == BB;
24719 }
24720
24721 /// Expected number of uses for reduction operations/reduced values.
24722 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24723 if (IsCmpSelMinMax) {
24724 // SelectInst must be used twice while the condition op must have single
24725 // use only.
24726 if (auto *Sel = dyn_cast<SelectInst>(I))
24727 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24728 return I->hasNUses(2);
24729 }
24730
24731 // Arithmetic reduction operation must be used once only.
24732 return I->hasOneUse();
24733 }
24734
24735 /// Initializes the list of reduction operations.
24736 void initReductionOps(Instruction *I) {
24737 if (isCmpSelMinMax(I))
24738 ReductionOps.assign(2, ReductionOpsType());
24739 else
24740 ReductionOps.assign(1, ReductionOpsType());
24741 }
24742
24743 /// Add all reduction operations for the reduction instruction \p I.
24744 void addReductionOps(Instruction *I) {
24745 if (isCmpSelMinMax(I)) {
24746 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
24747 ReductionOps[1].emplace_back(I);
24748 } else {
24749 ReductionOps[0].emplace_back(I);
24750 }
24751 }
24752
24753 static bool isGoodForReduction(ArrayRef<Value *> Data) {
24754 int Sz = Data.size();
24755 auto *I = dyn_cast<Instruction>(Data.front());
24756 return Sz > 1 || isConstant(Data.front()) ||
24757 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
24758 }
24759
24760public:
24761 HorizontalReduction() = default;
24762 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
24763 : ReductionRoot(I), ReductionLimit(2) {
24764 RdxKind = HorizontalReduction::getRdxKind(I);
24765 ReductionOps.emplace_back().push_back(I);
24766 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
24767 for (Value *V : Ops)
24768 ReducedValsToOps[V].push_back(I);
24769 }
24770
24771 bool matchReductionForOperands() const {
24772 // Analyze "regular" integer/FP types for reductions - no target-specific
24773 // types or pointers.
24774 assert(ReductionRoot && "Reduction root is not set!");
24775 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
24776 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24777 return Ops.size() == 2;
24778 })))
24779 return false;
24780
24781 return true;
24782 }
24783
24784 /// Try to find a reduction tree.
24785 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24786 ScalarEvolution &SE, const DataLayout &DL,
24787 const TargetLibraryInfo &TLI) {
24788 RdxKind = HorizontalReduction::getRdxKind(Root);
24789 if (!isVectorizable(RdxKind, Root))
24790 return false;
24791
24792 // Analyze "regular" integer/FP types for reductions - no target-specific
24793 // types or pointers.
24794 Type *Ty = Root->getType();
24795 if (!isValidElementType(Ty) || Ty->isPointerTy())
24796 return false;
24797
24798 // Though the ultimate reduction may have multiple uses, its condition must
24799 // have only single use.
24800 if (auto *Sel = dyn_cast<SelectInst>(Root))
24801 if (!Sel->getCondition()->hasOneUse())
24802 return false;
24803
24804 ReductionRoot = Root;
24805
24806 // Iterate through all the operands of the possible reduction tree and
24807 // gather all the reduced values, sorting them by their value id.
24808 BasicBlock *BB = Root->getParent();
24809 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24810 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24811 1, std::make_pair(Root, 0));
24812 // Checks if the operands of the \p TreeN instruction are also reduction
24813 // operations or should be treated as reduced values or an extra argument,
24814 // which is not part of the reduction.
24815 auto CheckOperands = [&](Instruction *TreeN,
24816 SmallVectorImpl<Value *> &PossibleReducedVals,
24817 SmallVectorImpl<Instruction *> &ReductionOps,
24818 unsigned Level) {
24819 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24820 getNumberOfOperands(TreeN)))) {
24821 Value *EdgeVal = getRdxOperand(TreeN, I);
24822 ReducedValsToOps[EdgeVal].push_back(TreeN);
24823 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24824 // If the edge is not an instruction, or it is different from the main
24825 // reduction opcode or has too many uses - possible reduced value.
24826 // Also, do not try to reduce const values, if the operation is not
24827 // foldable.
24828 if (!EdgeInst || Level > RecursionMaxDepth ||
24829 getRdxKind(EdgeInst) != RdxKind ||
24830 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24831 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24832 !isVectorizable(RdxKind, EdgeInst) ||
24833 (R.isAnalyzedReductionRoot(EdgeInst) &&
24834 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24835 PossibleReducedVals.push_back(EdgeVal);
24836 continue;
24837 }
24838 ReductionOps.push_back(EdgeInst);
24839 }
24840 };
24841 // Try to regroup reduced values so that it gets more profitable to try to
24842 // reduce them. Values are grouped by their value ids, instructions - by
24843 // instruction op id and/or alternate op id, plus do extra analysis for
24844 // loads (grouping them by the distance between pointers) and cmp
24845 // instructions (grouping them by the predicate).
24846 SmallMapVector<
24847 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24848 8>
24849 PossibleReducedVals;
24850 initReductionOps(Root);
24851 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24852 SmallSet<size_t, 2> LoadKeyUsed;
24853
24854 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24855 Key = hash_combine(hash_value(LI->getParent()), Key);
24856 Value *Ptr =
24857 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
24858 if (!LoadKeyUsed.insert(Key).second) {
24859 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24860 if (LIt != LoadsMap.end()) {
24861 for (LoadInst *RLI : LIt->second) {
24862 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24863 LI->getType(), LI->getPointerOperand(), DL, SE,
24864 /*StrictCheck=*/true))
24865 return hash_value(RLI->getPointerOperand());
24866 }
24867 for (LoadInst *RLI : LIt->second) {
24868 if (arePointersCompatible(RLI->getPointerOperand(),
24869 LI->getPointerOperand(), TLI)) {
24870 hash_code SubKey = hash_value(RLI->getPointerOperand());
24871 return SubKey;
24872 }
24873 }
24874 if (LIt->second.size() > 2) {
24875 hash_code SubKey =
24876 hash_value(LIt->second.back()->getPointerOperand());
24877 return SubKey;
24878 }
24879 }
24880 }
24881 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24882 .first->second.push_back(LI);
24883 return hash_value(LI->getPointerOperand());
24884 };
24885
24886 while (!Worklist.empty()) {
24887 auto [TreeN, Level] = Worklist.pop_back_val();
24888 SmallVector<Value *> PossibleRedVals;
24889 SmallVector<Instruction *> PossibleReductionOps;
24890 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24891 addReductionOps(TreeN);
24892 // Add reduction values. The values are sorted for better vectorization
24893 // results.
24894 for (Value *V : PossibleRedVals) {
24895 size_t Key, Idx;
24896 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24897 /*AllowAlternate=*/false);
24898 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24899 }
24900 for (Instruction *I : reverse(PossibleReductionOps))
24901 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24902 }
24903 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24904 // Sort values by the total number of value kinds so that the reduction
24905 // starts from the longest possible sequences of reduced values.
24906 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24907 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24908 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24909 for (auto &Slice : PossibleRedVals) {
24910 PossibleRedValsVect.emplace_back();
24911 auto RedValsVect = Slice.second.takeVector();
24912 stable_sort(RedValsVect, llvm::less_second());
24913 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24914 PossibleRedValsVect.back().append(Data.second, Data.first);
24915 }
24916 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24917 return P1.size() > P2.size();
24918 });
24919 bool First = true;
24920 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24921 if (First) {
24922 First = false;
24923 ReducedVals.emplace_back();
24924 } else if (!isGoodForReduction(Data)) {
24925 auto *LI = dyn_cast<LoadInst>(Data.front());
24926 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24927 if (!LI || !LastLI ||
24928 getUnderlyingObject(LI->getPointerOperand()) !=
24929 getUnderlyingObject(LastLI->getPointerOperand()))
24930 ReducedVals.emplace_back();
24931 }
24932 ReducedVals.back().append(Data.rbegin(), Data.rend());
24933 }
24934 }
24935 // Sort the reduced values by number of same/alternate opcode and/or pointer
24936 // operand.
24937 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24938 return P1.size() > P2.size();
24939 });
24940 return true;
24941 }
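 // Illustrative example (not part of the pass): for an integer add reduction
 // rooted at the final "+" of a chain such as
 //   %t0 = add i32 %a, %b
 //   %t1 = add i32 %t0, %c
 //   %t2 = add i32 %t1, %d
 // the reduced values {%a, %b, %c, %d} gathered above are later turned, by
 // tryToReduce(), into a single vector reduction of the form
 //   %v = ... build <4 x i32> from %a, %b, %c, %d ...
 //   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
 // with %r replacing %t2 (plus fix-ups for any extra uses of the intermediate
 // adds).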
24942
24943 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24944 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24945 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24946 DominatorTree &DT) {
24947 constexpr unsigned RegMaxNumber = 4;
24948 constexpr unsigned RedValsMaxNumber = 128;
24949 // If there are a sufficient number of reduction values, reduce
24950 // to a nearby power-of-2. We can safely generate oversized
24951 // vectors and rely on the backend to split them to legal sizes.
24952 if (unsigned NumReducedVals = std::accumulate(
24953 ReducedVals.begin(), ReducedVals.end(), 0,
24954 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24955 if (!isGoodForReduction(Vals))
24956 return Num;
24957 return Num + Vals.size();
24958 });
24959 NumReducedVals < ReductionLimit &&
24960 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24961 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24962 })) {
24963 for (ReductionOpsType &RdxOps : ReductionOps)
24964 for (Value *RdxOp : RdxOps)
24965 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24966 return nullptr;
24967 }
24968
24969 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24970 TargetFolder(DL));
24971 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24972
24973 // Track the reduced values in case they are replaced by extractelement
24974 // instructions because of the vectorization.
24975 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24976 ReducedVals.front().size());
24977
24978 // The compare instruction of a min/max is the insertion point for new
24979 // instructions and may be replaced with a new compare instruction.
24980 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24981 assert(isa<SelectInst>(RdxRootInst) &&
24982 "Expected min/max reduction to have select root instruction");
24983 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24984 assert(isa<Instruction>(ScalarCond) &&
24985 "Expected min/max reduction to have compare condition");
24986 return cast<Instruction>(ScalarCond);
24987 };
24988
24989 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24990 return isBoolLogicOp(cast<Instruction>(V));
24991 });
24992 // Return new VectorizedTree, based on previous value.
24993 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24994 if (VectorizedTree) {
24995 // Update the final value in the reduction.
24996 Builder.SetCurrentDebugLocation(
24997 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24998 if (AnyBoolLogicOp) {
24999 auto It = ReducedValsToOps.find(VectorizedTree);
25000 auto It1 = ReducedValsToOps.find(Res);
25001 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
25002 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
25003 (It != ReducedValsToOps.end() &&
25004 any_of(It->getSecond(), [&](Instruction *I) {
25005 return isBoolLogicOp(I) &&
25006 getRdxOperand(I, 0) == VectorizedTree;
25007 }))) {
25008 ;
25009 } else if (isGuaranteedNotToBePoison(Res, AC) ||
25010 (It1 != ReducedValsToOps.end() &&
25011 any_of(It1->getSecond(), [&](Instruction *I) {
25012 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
25013 }))) {
25014 std::swap(VectorizedTree, Res);
25015 } else {
25016 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
25017 }
25018 }
25019
25020 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
25021 ReductionOps);
25022 }
25023 // Initialize the final value in the reduction.
25024 return Res;
25025 };
25026 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25027 ReductionOps.front().size());
25028 for (ReductionOpsType &RdxOps : ReductionOps)
25029 for (Value *RdxOp : RdxOps) {
25030 if (!RdxOp)
25031 continue;
25032 IgnoreList.insert(RdxOp);
25033 }
25034 // Intersect the fast-math-flags from all reduction operations.
25035 FastMathFlags RdxFMF;
25036 RdxFMF.set();
25037 for (Value *U : IgnoreList)
25038 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
25039 RdxFMF &= FPMO->getFastMathFlags();
25040 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
25041
25042 // Need to track reduced vals, they may be changed during vectorization of
25043 // subvectors.
25044 for (ArrayRef<Value *> Candidates : ReducedVals)
25045 for (Value *V : Candidates)
25046 TrackedVals.try_emplace(V, V);
25047
25048 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25049 Value *V) -> unsigned & {
25050 auto *It = MV.find(V);
25051 assert(It != MV.end() && "Unable to find given key.");
25052 return It->second;
25053 };
25054
25055 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
25056 // List of the values that were reduced in other trees as part of gather
25057 // nodes and thus requiring extract if fully vectorized in other trees.
25058 SmallPtrSet<Value *, 4> RequiredExtract;
25059 WeakTrackingVH VectorizedTree = nullptr;
25060 bool CheckForReusedReductionOps = false;
25061 // Try to vectorize elements based on their type.
25062 SmallVector<InstructionsState> States;
25063 SmallVector<SmallVector<Value *>> LocalReducedVals;
25064 // Try to merge consecutive reduced values into a single vectorizable group
25065 // and check if they can be vectorized as copyable elements.
25066 for (ArrayRef<Value *> RV : ReducedVals) {
25067 // Loads are not very compatible with undefs.
25068 if (isa<UndefValue>(RV.front()) &&
25069 (States.empty() || !States.back() ||
25070 States.back().getOpcode() == Instruction::Load)) {
25071 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25072 States.push_back(InstructionsState::invalid());
25073 continue;
25074 }
25075 if (!LocalReducedVals.empty() &&
25076 isa<UndefValue>(LocalReducedVals.back().front()) &&
25077 isa<LoadInst>(RV.front())) {
25078 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25079 States.push_back(getSameOpcode(RV, TLI));
25080 continue;
25081 }
25082 SmallVector<Value *> Ops;
25083 if (!LocalReducedVals.empty())
25084 Ops = LocalReducedVals.back();
25085 Ops.append(RV.begin(), RV.end());
25086 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
25087 InstructionsState OpS =
25088 Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
25089 if (LocalReducedVals.empty()) {
25090 LocalReducedVals.push_back(Ops);
25091 States.push_back(OpS);
25092 continue;
25093 }
25094 if (OpS) {
25095 LocalReducedVals.back().swap(Ops);
25096 States.back() = OpS;
25097 continue;
25098 }
25099 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25100 States.push_back(getSameOpcode(RV, TLI));
25101 }
25102 ReducedVals.swap(LocalReducedVals);
25103 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
25104 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
25105 InstructionsState S = States[I];
25106 SmallVector<Value *> Candidates;
25107 Candidates.reserve(2 * OrigReducedVals.size());
25108 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
25109 for (Value *ReducedVal : OrigReducedVals) {
25110 Value *RdxVal = TrackedVals.at(ReducedVal);
25111 // Check whether the reduction value was overridden by an extractelement
25112 // instruction because of the vectorization, and exclude it if it is not
25113 // compatible with the other values.
25114 // Also check if the instruction was folded to constant/other value.
25115 auto *Inst = dyn_cast<Instruction>(RdxVal);
25116 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
25117 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
25118 !S.isCopyableElement(Inst)))) ||
25119 (S && !Inst && !isa<PoisonValue>(RdxVal) &&
25120 !S.isCopyableElement(RdxVal)))
25121 continue;
25122 Candidates.push_back(RdxVal);
25123 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
25124 }
25125 bool ShuffledExtracts = false;
25126 // Try to handle shuffled extractelements.
25127 if (S && S.getOpcode() == Instruction::ExtractElement &&
25128 !S.isAltShuffle() && I + 1 < E) {
25129 SmallVector<Value *> CommonCandidates(Candidates);
25130 for (Value *RV : ReducedVals[I + 1]) {
25131 Value *RdxVal = TrackedVals.at(RV);
25132 // Check whether the reduction value was overridden by an
25133 // extractelement instruction because of the vectorization, and
25134 // exclude it if it is not compatible with the other values.
25135 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
25136 if (!Inst)
25137 continue;
25138 CommonCandidates.push_back(RdxVal);
25139 TrackedToOrig.try_emplace(RdxVal, RV);
25140 }
25141 SmallVector<int> Mask;
25142 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
25143 ++I;
25144 Candidates.swap(CommonCandidates);
25145 ShuffledExtracts = true;
25146 }
25147 }
25148
25149 // Emit code for constant values.
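// For instance, add-reduction candidates <2, 3, 5> are folded below into the
// scalar chain ((2 + 3) + 5), which the IRBuilder will normally constant-fold
// away instead of emitting real "const.rdx" instructions.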
25150 if (Candidates.size() > 1 && allConstant(Candidates)) {
25151 Value *Res = Candidates.front();
25152 Value *OrigV = TrackedToOrig.at(Candidates.front());
25153 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25154 for (Value *VC : ArrayRef(Candidates).drop_front()) {
25155 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
25156 Value *OrigV = TrackedToOrig.at(VC);
25157 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25158 if (auto *ResI = dyn_cast<Instruction>(Res))
25159 V.analyzedReductionRoot(ResI);
25160 }
25161 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25162 continue;
25163 }
25164
25165 unsigned NumReducedVals = Candidates.size();
25166 if (NumReducedVals < ReductionLimit &&
25167 (NumReducedVals < 2 || !isSplat(Candidates)))
25168 continue;
25169
25170 // Check if we support repeated scalar values processing (optimization of
25171 // original scalar identity operations on matched horizontal reductions).
25172 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25173 RdxKind != RecurKind::FMul &&
25174 RdxKind != RecurKind::FMulAdd;
25175 // Gather same values.
25176 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25177 if (IsSupportedHorRdxIdentityOp)
25178 for (Value *V : Candidates) {
25179 Value *OrigV = TrackedToOrig.at(V);
25180 ++SameValuesCounter.try_emplace(OrigV).first->second;
25181 }
25182 // Used to check if the reduced values are used the same number of times. In
25183 // this case the compiler may produce better code. E.g. if the reduced values
25184 // are aabbccdd (8 x values), then the first node of the tree will have a node
25185 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
25186 // Plus, the final reduction will be performed on <8 x aabbccdd>.
25187 // Instead, the compiler may build the <4 x abcd> tree immediately and emit
25188 // reduction(4 x abcd) * 2.
25189 // Currently it only handles add/fadd/xor; and/or/min/max do not require
25190 // this analysis, other operations may require an extra estimation of
25191 // the profitability.
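// For instance, for aabbccdd every value repeats exactly twice, so
// SameScaleFactor can be set and an add reduction may later be emitted as
// (a + b + c + d) * 2; for aabbbccd the repeat counts (2, 3, 2, 1) differ and
// the generic per-value scaling path is used instead.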
25192 bool SameScaleFactor = false;
25193 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25194 SameValuesCounter.size() != Candidates.size();
25195 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
25196 if (OptReusedScalars) {
25197 SameScaleFactor =
25198 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25199 RdxKind == RecurKind::Xor) &&
25200 all_of(drop_begin(SameValuesCounter),
25201 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
25202 return P.second == SameValuesCounter.front().second;
25203 });
25204 Candidates.resize(SameValuesCounter.size());
25205 transform(SameValuesCounter, Candidates.begin(),
25206 [&](const auto &P) { return TrackedVals.at(P.first); });
25207 NumReducedVals = Candidates.size();
25208 // Have a reduction of the same element.
25209 if (NumReducedVals == 1) {
25210 Value *OrigV = TrackedToOrig.at(Candidates.front());
25211 unsigned Cnt = At(SameValuesCounter, OrigV);
25212 Value *RedVal =
25213 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
25214 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25215 VectorizedVals.try_emplace(OrigV, Cnt);
25216 ExternallyUsedValues.insert(OrigV);
25217 continue;
25218 }
25219 }
25220
25221 unsigned MaxVecRegSize = V.getMaxVecRegSize();
25222 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
25223 const unsigned MaxElts = std::clamp<unsigned>(
25224 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
25225 RegMaxNumber * RedValsMaxNumber);
25226
25227 unsigned ReduxWidth = NumReducedVals;
25228 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
25229 unsigned NumParts, NumRegs;
25230 Type *ScalarTy = Candidates.front()->getType();
25231 ReduxWidth =
25232 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
25233 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25234 NumParts = ::getNumberOfParts(TTI, Tp);
25235 NumRegs =
25236 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25237 while (NumParts > NumRegs) {
25238 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
25239 ReduxWidth = bit_floor(ReduxWidth - 1);
25240 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25241 NumParts = ::getNumberOfParts(TTI, Tp);
25242 NumRegs =
25243 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25244 }
25245 if (NumParts > NumRegs / 2)
25246 ReduxWidth = bit_floor(ReduxWidth);
25247 return ReduxWidth;
25248 };
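// E.g., if the chosen width would be split into more parts than the target
// has vector registers, the width is rounded down (say, from 24 lanes to 16)
// until the widened type fits the available registers.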
25249 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
25250 ReduxWidth = GetVectorFactor(ReduxWidth);
25251 ReduxWidth = std::min(ReduxWidth, MaxElts);
25252
25253 unsigned Start = 0;
25254 unsigned Pos = Start;
25255 // Restarts vectorization attempt with lower vector factor.
25256 unsigned PrevReduxWidth = ReduxWidth;
25257 bool CheckForReusedReductionOpsLocal = false;
25258 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
25259 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
25260 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25261 // Check if any of the reduction ops are gathered. If so, it is worth
25262 // trying again with a smaller number of reduction ops.
25263 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25264 }
25265 ++Pos;
25266 if (Pos < NumReducedVals - ReduxWidth + 1)
25267 return IsAnyRedOpGathered;
25268 Pos = Start;
25269 --ReduxWidth;
25270 if (ReduxWidth > 1)
25271 ReduxWidth = GetVectorFactor(ReduxWidth);
25272 return IsAnyRedOpGathered;
25273 };
25274 bool AnyVectorized = false;
25275 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25276 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25277 ReduxWidth >= ReductionLimit) {
25278 // Dependency in tree of the reduction ops - drop this attempt, try
25279 // later.
25280 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25281 Start == 0) {
25282 CheckForReusedReductionOps = true;
25283 break;
25284 }
25285 PrevReduxWidth = ReduxWidth;
25286 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
25287 // Been analyzed already - skip.
25288 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
25289 (!has_single_bit(ReduxWidth) &&
25290 (IgnoredCandidates.contains(
25291 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
25292 IgnoredCandidates.contains(
25293 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
25294 bit_floor(ReduxWidth))))) ||
25295 V.areAnalyzedReductionVals(VL)) {
25296 (void)AdjustReducedVals(/*IgnoreVL=*/true);
25297 continue;
25298 }
25299 // Early exit if any of the reduction values were deleted during
25300 // previous vectorization attempts.
25301 if (any_of(VL, [&V](Value *RedVal) {
25302 auto *RedValI = dyn_cast<Instruction>(RedVal);
25303 return RedValI && V.isDeleted(RedValI);
25304 }))
25305 break;
25306 V.buildTree(VL, IgnoreList);
25307 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
25308 if (!AdjustReducedVals())
25309 V.analyzedReductionVals(VL);
25310 continue;
25311 }
25312 if (V.isLoadCombineReductionCandidate(RdxKind)) {
25313 if (!AdjustReducedVals())
25314 V.analyzedReductionVals(VL);
25315 continue;
25316 }
25317 V.reorderTopToBottom();
25318 // No need to reorder the root node at all for reassociative reduction.
25319 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
25320 VL.front()->getType()->isIntOrIntVectorTy() ||
25321 ReductionLimit > 2);
25322 // Keep other extracted reduction values if they are used in the
25323 // vectorization trees.
25324 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
25325 ExternallyUsedValues);
25326 // The reduction root is used as the insertion point for new
25327 // instructions, so set it as externally used to prevent it from being
25328 // deleted.
25329 LocalExternallyUsedValues.insert(ReductionRoot);
25330 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
25331 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
25332 continue;
25333 for (Value *V : ReducedVals[Cnt])
25334 if (isa<Instruction>(V))
25335 LocalExternallyUsedValues.insert(TrackedVals[V]);
25336 }
25337 if (!IsSupportedHorRdxIdentityOp) {
25338 // Number of uses of the candidates in the vector of values.
25339 assert(SameValuesCounter.empty() &&
25340 "Reused values counter map is not empty");
25341 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25342 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25343 continue;
25344 Value *V = Candidates[Cnt];
25345 Value *OrigV = TrackedToOrig.at(V);
25346 ++SameValuesCounter.try_emplace(OrigV).first->second;
25347 }
25348 }
25349 V.transformNodes();
25350 V.computeMinimumValueSizes();
25351 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
25352
25353 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
25354 // Gather externally used values.
25355 SmallPtrSet<Value *, 4> Visited;
25356 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25357 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25358 continue;
25359 Value *RdxVal = Candidates[Cnt];
25360 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
25361 RdxVal = It->second;
25362 if (!Visited.insert(RdxVal).second)
25363 continue;
25364 // Check if the scalar was vectorized as part of the vectorization
25365 // tree but not the top node.
25366 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
25367 LocalExternallyUsedValues.insert(RdxVal);
25368 continue;
25369 }
25370 Value *OrigV = TrackedToOrig.at(RdxVal);
25371 unsigned NumOps =
25372 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
25373 if (NumOps != ReducedValsToOps.at(OrigV).size())
25374 LocalExternallyUsedValues.insert(RdxVal);
25375 }
25376 // Do not need the list of reused scalars in regular mode anymore.
25377 if (!IsSupportedHorRdxIdentityOp)
25378 SameValuesCounter.clear();
25379 for (Value *RdxVal : VL)
25380 if (RequiredExtract.contains(RdxVal))
25381 LocalExternallyUsedValues.insert(RdxVal);
25382 V.buildExternalUses(LocalExternallyUsedValues);
25383
25384 // Estimate cost.
25385 InstructionCost ReductionCost =
25386 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
25387 InstructionCost Cost = V.getTreeCost(TreeCost, VL, ReductionCost);
25388 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
25389 << " for reduction\n");
25390 if (!Cost.isValid())
25391 break;
25392 if (Cost >= -SLPCostThreshold) {
25393 V.getORE()->emit([&]() {
25394 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
25395 ReducedValsToOps.at(VL[0]).front())
25396 << "Vectorizing horizontal reduction is possible "
25397 << "but not beneficial with cost " << ore::NV("Cost", Cost)
25398 << " and threshold "
25399 << ore::NV("Threshold", -SLPCostThreshold);
25400 });
25401 if (!AdjustReducedVals()) {
25402 V.analyzedReductionVals(VL);
25403 unsigned Offset = Pos == Start ? Pos : Pos - 1;
25404 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
25405 // Add subvectors of VL to the list of the analyzed values.
25406 for (unsigned VF = getFloorFullVectorNumberOfElements(
25407 *TTI, VL.front()->getType(), ReduxWidth - 1);
25408 VF >= ReductionLimit;
25409 VF = getFloorFullVectorNumberOfElements(
25410 *TTI, VL.front()->getType(), VF - 1)) {
25411 if (has_single_bit(VF) &&
25412 V.getCanonicalGraphSize() != V.getTreeSize())
25413 continue;
25414 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
25415 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
25416 }
25417 }
25418 }
25419 continue;
25420 }
25421
25422 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
25423 << Cost << ". (HorRdx)\n");
25424 V.getORE()->emit([&]() {
25425 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
25426 ReducedValsToOps.at(VL[0]).front())
25427 << "Vectorized horizontal reduction with cost "
25428 << ore::NV("Cost", Cost) << " and with tree size "
25429 << ore::NV("TreeSize", V.getTreeSize());
25430 });
25431
25432 Builder.setFastMathFlags(RdxFMF);
25433
25434 // Emit a reduction. If the root is a select (min/max idiom), the insert
25435 // point is the compare condition of that select.
25436 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
25437 Instruction *InsertPt = RdxRootInst;
25438 if (IsCmpSelMinMax)
25439 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25440
25441 // Vectorize a tree.
25442 Value *VectorizedRoot = V.vectorizeTree(
25443 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
25444 // Update TrackedToOrig mapping, since the tracked values might be
25445 // updated.
25446 for (Value *RdxVal : Candidates) {
25447 Value *OrigVal = TrackedToOrig.at(RdxVal);
25448 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
25449 if (TransformedRdxVal != RdxVal)
25450 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
25451 }
25452
25453 Builder.SetInsertPoint(InsertPt);
25454
25455 // To prevent poison from leaking across what used to be sequential,
25456 // safe, scalar boolean logic operations, the reduction operand must be
25457 // frozen.
25458 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
25459 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
25460
25461 // Emit code to correctly handle reused reduced values, if required.
25462 if (OptReusedScalars && !SameScaleFactor) {
25463 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
25464 SameValuesCounter, TrackedToOrig);
25465 }
25466
25467 Type *ScalarTy = VL.front()->getType();
25468 Type *VecTy = VectorizedRoot->getType();
25469 Type *RedScalarTy = VecTy->getScalarType();
25470 VectorValuesAndScales.emplace_back(
25471 VectorizedRoot,
25472 OptReusedScalars && SameScaleFactor
25473 ? SameValuesCounter.front().second
25474 : 1,
25475 RedScalarTy != ScalarTy->getScalarType()
25476 ? V.isSignedMinBitwidthRootNode()
25477 : true);
25478
25479 // Count vectorized reduced values to exclude them from final reduction.
25480 for (Value *RdxVal : VL) {
25481 Value *OrigV = TrackedToOrig.at(RdxVal);
25482 if (IsSupportedHorRdxIdentityOp) {
25483 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
25484 continue;
25485 }
25486 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25487 if (!V.isVectorized(RdxVal))
25488 RequiredExtract.insert(RdxVal);
25489 }
25490 Pos += ReduxWidth;
25491 Start = Pos;
25492 ReduxWidth = NumReducedVals - Pos;
25493 if (ReduxWidth > 1)
25494 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25495 AnyVectorized = true;
25496 }
25497 if (OptReusedScalars && !AnyVectorized) {
25498 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
25499 Value *RdxVal = TrackedVals.at(P.first);
25500 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
25501 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25502 VectorizedVals.try_emplace(P.first, P.second);
25503 }
25504 continue;
25505 }
25506 }
25507 if (!VectorValuesAndScales.empty())
25508 VectorizedTree = GetNewVectorizedTree(
25509 VectorizedTree,
25510 emitReduction(Builder, *TTI, ReductionRoot->getType()));
25511
25512 if (!VectorizedTree) {
25513 if (!CheckForReusedReductionOps) {
25514 for (ReductionOpsType &RdxOps : ReductionOps)
25515 for (Value *RdxOp : RdxOps)
25516 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
25517 }
25518 return nullptr;
25519 }
25520
25521 // Reorder operands of bool logical op in the natural order to avoid
25522 // possible problem with poison propagation. If not possible to reorder
25523 // (both operands are originally RHS), emit an extra freeze instruction
25524 // for the LHS operand.
25525 // I.e., if we have original code like this:
25526 // RedOp1 = select i1 ?, i1 LHS, i1 false
25527 // RedOp2 = select i1 RHS, i1 ?, i1 false
25528
25529 // Then, we swap LHS/RHS to create a new op that matches the poison
25530 // semantics of the original code.
25531
25532 // If we have original code like this and both values could be poison:
25533 // RedOp1 = select i1 ?, i1 LHS, i1 false
25534 // RedOp2 = select i1 ?, i1 RHS, i1 false
25535
25536 // Then, we must freeze LHS in the new op.
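// For a boolean and-like reduction this typically results in either
//   %op.rdx = select i1 %RHS, i1 %LHS, i1 false   ; operands swapped
// or, when swapping is not possible,
//   %LHS.fr = freeze i1 %LHS
//   %op.rdx = select i1 %LHS.fr, i1 %RHS, i1 false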
25537 auto FixBoolLogicalOps =
25538 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
25539 Instruction *RedOp2, bool InitStep) {
25540 if (!AnyBoolLogicOp)
25541 return;
25542 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
25543 getRdxOperand(RedOp1, 0) == LHS ||
25545 return;
25546 bool NeedFreeze = LHS != VectorizedTree;
25547 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
25548 getRdxOperand(RedOp2, 0) == RHS ||
25550 // If RedOp2 was used as a second operand - do not swap.
25551 if ((InitStep || RHS != VectorizedTree) &&
25552 getRdxOperand(RedOp2, 0) == RHS &&
25553 ((isBoolLogicOp(RedOp1) &&
25554 getRdxOperand(RedOp1, 1) == RedOp2) ||
25555 any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
25556 return any_of(Ops, [&](Value *Op) {
25557 auto *OpI = dyn_cast<Instruction>(Op);
25558 return OpI && isBoolLogicOp(OpI) &&
25559 getRdxOperand(OpI, 1) == RedOp2;
25560 });
25561 }))) {
25562 NeedFreeze = false;
25563 } else {
25564 std::swap(LHS, RHS);
25565 return;
25566 }
25567 }
25568 if (NeedFreeze)
25569 LHS = Builder.CreateFreeze(LHS);
25570 };
25571 // Finish the reduction.
25572 // Need to add extra arguments and not vectorized possible reduction values.
25573 // Try to avoid dependencies between the scalar remainders after reductions.
25574 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
25575 bool InitStep) {
25576 unsigned Sz = InstVals.size();
25577 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
25578 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
25579 Instruction *RedOp = InstVals[I + 1].first;
25580 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
25581 Value *RdxVal1 = InstVals[I].second;
25582 Value *StableRdxVal1 = RdxVal1;
25583 auto It1 = TrackedVals.find(RdxVal1);
25584 if (It1 != TrackedVals.end())
25585 StableRdxVal1 = It1->second;
25586 Value *RdxVal2 = InstVals[I + 1].second;
25587 Value *StableRdxVal2 = RdxVal2;
25588 auto It2 = TrackedVals.find(RdxVal2);
25589 if (It2 != TrackedVals.end())
25590 StableRdxVal2 = It2->second;
25591 // To prevent poison from leaking across what used to be sequential,
25592 // safe, scalar boolean logic operations, the reduction operand must be
25593 // frozen.
25594 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
25595 RedOp, InitStep);
25596 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
25597 StableRdxVal2, "op.rdx", ReductionOps);
25598 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
25599 }
25600 if (Sz % 2 == 1)
25601 ExtraReds[Sz / 2] = InstVals.back();
25602 return ExtraReds;
25603 };
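// E.g., four remaining scalars a, b, c, d are combined pairwise as (a op b)
// and (c op d) in the first pass, and those partial results are combined in
// the next pass, keeping the scalar remainder chains short and independent.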
25604 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
25605 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
25606 VectorizedTree);
25607 SmallPtrSet<Value *, 8> Visited;
25608 for (ArrayRef<Value *> Candidates : ReducedVals) {
25609 for (Value *RdxVal : Candidates) {
25610 if (!Visited.insert(RdxVal).second)
25611 continue;
25612 unsigned NumOps = VectorizedVals.lookup(RdxVal);
25613 for (Instruction *RedOp :
25614 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
25615 ExtraReductions.emplace_back(RedOp, RdxVal);
25616 }
25617 }
25618 // Iterate through all not-vectorized reduction values/extra arguments.
25619 bool InitStep = true;
25620 while (ExtraReductions.size() > 1) {
25621 SmallVector<std::pair<Instruction *, Value *>> NewReds =
25622 FinalGen(ExtraReductions, InitStep);
25623 ExtraReductions.swap(NewReds);
25624 InitStep = false;
25625 }
25626 VectorizedTree = ExtraReductions.front().second;
25627
25628 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25629
25630 // The original scalar reduction is expected to have no remaining
25631 // uses outside the reduction tree itself. Assert that we got this
25632 // correct, replace internal uses with poison, and mark for eventual
25633 // deletion.
25634#ifndef NDEBUG
25635 SmallPtrSet<Value *, 4> IgnoreSet;
25636 for (ArrayRef<Value *> RdxOps : ReductionOps)
25637 IgnoreSet.insert_range(RdxOps);
25638#endif
25639 for (ArrayRef<Value *> RdxOps : ReductionOps) {
25640 for (Value *Ignore : RdxOps) {
25641 if (!Ignore)
25642 continue;
25643#ifndef NDEBUG
25644 for (auto *U : Ignore->users()) {
25645 assert(IgnoreSet.count(U) &&
25646 "All users must be either in the reduction ops list.");
25647 }
25648#endif
25649 if (!Ignore->use_empty()) {
25650 Value *P = PoisonValue::get(Ignore->getType());
25651 Ignore->replaceAllUsesWith(P);
25652 }
25653 }
25654 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25655 }
25656 return VectorizedTree;
25657 }
25658
25659private:
25660 /// Creates the reduction from the given \p Vec vector value with the given
25661 /// scale \p Scale and signedness \p IsSigned.
25662 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25663 Value *Vec, unsigned Scale, bool IsSigned,
25664 Type *DestTy) {
25665 Value *Rdx;
25666 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
25667 unsigned DestTyNumElements = getNumElements(VecTy);
25668 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
25669 Rdx = PoisonValue::get(
25670 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
25671 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
25672 // Do reduction for each lane.
25673 // e.g., do reduce add for
25674 // VL[0] = <4 x Ty> <a, b, c, d>
25675 // VL[1] = <4 x Ty> <e, f, g, h>
25676 // Lane[0] = <2 x Ty> <a, e>
25677 // Lane[1] = <2 x Ty> <b, f>
25678 // Lane[2] = <2 x Ty> <c, g>
25679 // Lane[3] = <2 x Ty> <d, h>
25680 // result[0] = reduce add Lane[0]
25681 // result[1] = reduce add Lane[1]
25682 // result[2] = reduce add Lane[2]
25683 // result[3] = reduce add Lane[3]
25684 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
25685 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
25686 Rdx = Builder.CreateInsertElement(
25687 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
25688 }
25689 } else {
25690 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
25691 }
25692 if (Rdx->getType() != DestTy)
25693 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
25694 // Improved analysis for add/fadd/xor reductions with same scale
25695 // factor for all operands of reductions. We can emit scalar ops for
25696 // them instead.
25697 if (Scale > 1)
25698 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25699 return Rdx;
25700 }
25701
25702 /// Calculate the cost of a reduction.
25703 InstructionCost getReductionCost(TargetTransformInfo *TTI,
25704 ArrayRef<Value *> ReducedVals,
25705 bool IsCmpSelMinMax, FastMathFlags FMF,
25706 const BoUpSLP &R, DominatorTree &DT,
25707 const DataLayout &DL,
25708 const TargetLibraryInfo &TLI) {
25709 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25710 Type *ScalarTy = ReducedVals.front()->getType();
25711 unsigned ReduxWidth = ReducedVals.size();
25712 FixedVectorType *VectorTy = R.getReductionType();
25713 InstructionCost VectorCost = 0, ScalarCost;
25714 // If all of the reduced values are constant, the vector cost is 0, since
25715 // the reduction value can be calculated at compile time.
25716 bool AllConsts = allConstant(ReducedVals);
25717 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
25718 InstructionCost Cost = 0;
25719 // Scalar cost is repeated for N-1 elements.
25720 int Cnt = ReducedVals.size();
25721 for (Value *RdxVal : ReducedVals) {
25722 if (!isa<Instruction>(RdxVal))
25723 continue;
25724 if (Cnt == 1)
25725 break;
25726 --Cnt;
25727 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
25728 Cost += GenCostFn();
25729 continue;
25730 }
25731 InstructionCost ScalarCost = 0;
25732 for (User *U : RdxVal->users()) {
25733 auto *RdxOp = cast<Instruction>(U);
25734 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25735 if (RdxKind == RecurKind::FAdd) {
25736 InstructionCost FMACost = canConvertToFMA(
25737 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
25738 if (FMACost.isValid()) {
25739 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
25740 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
25741 // Also, exclude scalar fmul cost.
25742 InstructionCost FMulCost =
25743 TTI->getInstructionCost(I, CostKind);
25744 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
25745 FMACost -= FMulCost;
25746 }
25747 ScalarCost += FMACost;
25748 continue;
25749 }
25750 }
25751 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
25752 continue;
25753 }
25754 ScalarCost = InstructionCost::getInvalid();
25755 break;
25756 }
25757 if (ScalarCost.isValid())
25758 Cost += ScalarCost;
25759 else
25760 Cost += GenCostFn();
25761 }
25762 return Cost;
25763 };
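// Roughly, for a scalar chain like ((a + b) + c) + d this sums the cost of
// the N-1 scalar reduction ops that vectorization would remove, falling back
// to GenCostFn when a value's uses cannot all be attributed to the reduction.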
25764 // Require the reduction cost if:
25765 // 1. This type is not a full register type and no other vectors with the
25766 // same type are in the storage (first vector with a small type).
25767 // 2. The storage does not have any vector with full vector use (first
25768 // vector with full register use).
25769 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
25770 switch (RdxKind) {
25771 case RecurKind::Add:
25772 case RecurKind::Mul:
25773 case RecurKind::Or:
25774 case RecurKind::And:
25775 case RecurKind::Xor:
25776 case RecurKind::FAdd:
25777 case RecurKind::FMul: {
25778 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
25779 if (!AllConsts) {
25780 if (DoesRequireReductionOp) {
25781 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
25782 assert(SLPReVec && "FixedVectorType is not expected.");
25783 unsigned ScalarTyNumElements = VecTy->getNumElements();
25784 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
25785 VectorCost += TTI->getShuffleCost(
25786 TTI::SK_PermuteSingleSrc,
25787 getWidenedType(VecTy->getElementType(),
25788 ReducedVals.size()),
25789 VectorTy,
25790 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
25791 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
25792 FMF, CostKind);
25793 }
25794 VectorCost += TTI->getScalarizationOverhead(
25795 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
25796 /*Extract*/ false, TTI::TCK_RecipThroughput);
25797 } else {
25798 Type *RedTy = VectorTy->getElementType();
25799 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25800 std::make_pair(RedTy, true));
25801 if (RType == RedTy) {
25802 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
25803 FMF, CostKind);
25804 } else {
25805 VectorCost = TTI->getExtendedReductionCost(
25806 RdxOpcode, !IsSigned, RedTy,
25807 getWidenedType(RType, ReduxWidth), FMF, CostKind);
25808 }
25809 }
25810 } else {
25811 Type *RedTy = VectorTy->getElementType();
25812 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25813 std::make_pair(RedTy, true));
25814 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25815 InstructionCost FMACost = InstructionCost::getInvalid();
25816 if (RdxKind == RecurKind::FAdd) {
25817 // Check if the reduction operands can be converted to FMA.
25818 SmallVector<Value *> Ops;
25819 FastMathFlags FMF;
25820 FMF.set();
25821 for (Value *RdxVal : ReducedVals) {
25822 if (!RdxVal->hasOneUse()) {
25823 Ops.clear();
25824 break;
25825 }
25826 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
25827 FMF &= FPCI->getFastMathFlags();
25828 Ops.push_back(RdxVal->user_back());
25829 }
25830 if (!Ops.empty()) {
25831 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
25832 *TTI, TLI);
25833 if (FMACost.isValid()) {
25834 // Calculate actual FMAD cost.
25835 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25836 {RVecTy, RVecTy, RVecTy}, FMF);
25837 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
25838
25839 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
25840 // Also, exclude vector fmul cost.
25841 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
25842 Instruction::FMul, RVecTy, CostKind);
25843 LLVM_DEBUG(dbgs()
25844 << "Minus vector FMul cost: " << FMulCost << "\n");
25845 FMACost -= FMulCost;
25846 }
25847 }
25848 }
25849 if (FMACost.isValid())
25850 VectorCost += FMACost;
25851 else
25852 VectorCost +=
25853 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
25854 if (RType != RedTy) {
25855 unsigned Opcode = Instruction::Trunc;
25856 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25857 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25858 VectorCost += TTI->getCastInstrCost(
25859 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25860 }
25861 }
25862 }
25863 ScalarCost = EvaluateScalarCost([&]() {
25864 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
25865 });
25866 break;
25867 }
25868 case RecurKind::FMax:
25869 case RecurKind::FMin:
25870 case RecurKind::FMaximum:
25871 case RecurKind::FMinimum:
25872 case RecurKind::SMax:
25873 case RecurKind::SMin:
25874 case RecurKind::UMax:
25875 case RecurKind::UMin: {
25876 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
25877 if (!AllConsts) {
25878 if (DoesRequireReductionOp) {
25879 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
25880 } else {
25881 // Check if the previous reduction already exists and account for it as a
25882 // series of operations + a single reduction.
25883 Type *RedTy = VectorTy->getElementType();
25884 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25885 std::make_pair(RedTy, true));
25886 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25887 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25888 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
25889 if (RType != RedTy) {
25890 unsigned Opcode = Instruction::Trunc;
25891 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25892 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25893 VectorCost += TTI->getCastInstrCost(
25894 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25895 }
25896 }
25897 }
25898 ScalarCost = EvaluateScalarCost([&]() {
25899 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25900 return TTI->getIntrinsicInstrCost(ICA, CostKind);
25901 });
25902 break;
25903 }
25904 default:
25905 llvm_unreachable("Expected arithmetic or min/max reduction operation");
25906 }
25907
25908 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25909 << " for reduction of " << shortBundleName(ReducedVals)
25910 << " (It is a splitting reduction)\n");
25911 return VectorCost - ScalarCost;
25912 }
25913
25914 /// Splits the values, stored in VectorValuesAndScales, into registers/free
25915 /// sub-registers, combines them with the given reduction operation as a
25916 /// vector operation and then performs a single (small enough) reduction.
25917 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25918 Type *DestTy) {
25919 Value *ReducedSubTree = nullptr;
25920 // Creates reduction and combines with the previous reduction.
25921 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25922 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25923 if (ReducedSubTree)
25924 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25925 "op.rdx", ReductionOps);
25926 else
25927 ReducedSubTree = Rdx;
25928 };
25929 if (VectorValuesAndScales.size() == 1) {
25930 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25931 CreateSingleOp(Vec, Scale, IsSigned);
25932 return ReducedSubTree;
25933 }
25934 // Scales Vec using the given Cnt scale factor and then combines it with the
25935 // previous value of VecRes.
25936 Value *VecRes = nullptr;
25937 bool VecResSignedness = false;
25938 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25939 Type *ScalarTy = Vec->getType()->getScalarType();
25940 // Scale Vec using given Cnt scale factor.
25941 if (Cnt > 1) {
25942 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25943 switch (RdxKind) {
25944 case RecurKind::Add: {
25945 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25946 unsigned VF = getNumElements(Vec->getType());
25947 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
25948 << ". (HorRdx)\n");
25949 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25950 for (unsigned I : seq<unsigned>(Cnt))
25951 std::iota(std::next(Mask.begin(), VF * I),
25952 std::next(Mask.begin(), VF * (I + 1)), 0);
25953 ++NumVectorInstructions;
25954 Vec = Builder.CreateShuffleVector(Vec, Mask);
25955 break;
25956 }
25957 // res = mul vv, n
25958 if (ScalarTy != DestTy->getScalarType())
25959 Vec = Builder.CreateIntCast(
25960 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25961 IsSigned);
25962 Value *Scale = ConstantVector::getSplat(
25963 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
25964 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25965 << ". (HorRdx)\n");
25966 ++NumVectorInstructions;
25967 Vec = Builder.CreateMul(Vec, Scale);
25968 break;
25969 }
25970 case RecurKind::Xor: {
25971 // res = n % 2 ? 0 : vv
25973 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25974 if (Cnt % 2 == 0)
25975 Vec = Constant::getNullValue(Vec->getType());
25976 break;
25977 }
25978 case RecurKind::FAdd: {
25979 // res = fmul v, n
25980 Value *Scale =
25981 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
25982 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
25983 << ". (HorRdx)\n");
25984 ++NumVectorInstructions;
25985 Vec = Builder.CreateFMul(Vec, Scale);
25986 break;
25987 }
25988 case RecurKind::And:
25989 case RecurKind::Or:
25990 case RecurKind::SMax:
25991 case RecurKind::SMin:
25992 case RecurKind::UMax:
25993 case RecurKind::UMin:
25994 case RecurKind::FMax:
25995 case RecurKind::FMin:
25996 case RecurKind::FMaximum:
25997 case RecurKind::FMinimum:
25998 // res = vv
25999 break;
26000 case RecurKind::Sub:
26001 case RecurKind::AddChainWithSubs:
26002 case RecurKind::Mul:
26003 case RecurKind::FMul:
26004 case RecurKind::FMulAdd:
26005 case RecurKind::AnyOf:
26006 case RecurKind::FindFirstIVSMin:
26007 case RecurKind::FindFirstIVUMin:
26008 case RecurKind::FindLastIVSMax:
26009 case RecurKind::FindLastIVUMax:
26010 case RecurKind::FMaxNum:
26011 case RecurKind::FMinNum:
26012 case RecurKind::FMaximumNum:
26013 case RecurKind::FMinimumNum:
26014 case RecurKind::None:
26015 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26016 }
26017 }
26018 // Combine Vec with the previous VecOp.
26019 if (!VecRes) {
26020 VecRes = Vec;
26021 VecResSignedness = IsSigned;
26022 } else {
26023 ++NumVectorInstructions;
26024 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
26025 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
26026 // Handle ctpop.
26027 unsigned VecResVF = getNumElements(VecRes->getType());
26028 unsigned VecVF = getNumElements(Vec->getType());
26029 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
26030 std::iota(Mask.begin(), Mask.end(), 0);
26031 // Ensure that VecRes is always larger than Vec
26032 if (VecResVF < VecVF) {
26033 std::swap(VecRes, Vec);
26034 std::swap(VecResVF, VecVF);
26035 }
26036 if (VecResVF != VecVF) {
26037 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
26038 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
26039 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
26040 }
26041 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
26042 return;
26043 }
26044 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
26045 VecRes = Builder.CreateIntCast(
26046 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
26047 VecResSignedness);
26048 if (ScalarTy != DestTy->getScalarType())
26049 Vec = Builder.CreateIntCast(
26050 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
26051 IsSigned);
26052 unsigned VecResVF = getNumElements(VecRes->getType());
26053 unsigned VecVF = getNumElements(Vec->getType());
26054 // Ensure that VecRes is always larger than Vec
26055 if (VecResVF < VecVF) {
26056 std::swap(VecRes, Vec);
26057 std::swap(VecResVF, VecVF);
26058 }
26059 // extract + op + insert
26060 Value *Op = VecRes;
26061 if (VecResVF != VecVF)
26062 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
26063 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
26064 if (VecResVF != VecVF)
26065 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
26066 VecRes = Op;
26067 }
26068 };
26069 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26070 CreateVecOp(Vec, Scale, IsSigned);
26071 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
26072
26073 return ReducedSubTree;
26074 }
26075
26076 /// Emit a horizontal reduction of the vectorized value.
26077 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26078 const TargetTransformInfo *TTI, Type *DestTy) {
26079 assert(VectorizedValue && "Need to have a vectorized tree node");
26080 assert(RdxKind != RecurKind::FMulAdd &&
26081 "A call to the llvm.fmuladd intrinsic is not handled yet");
26082
26083 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
26084 if (FTy->getScalarType() == Builder.getInt1Ty() &&
26085 RdxKind == RecurKind::Add &&
26086 DestTy->getScalarType() != FTy->getScalarType()) {
26087 // Convert vector_reduce_add(ZExt(<n x i1>)) to
26088 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
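// E.g., for an <8 x i1> input reduced into an i32 result:
//   %bits = bitcast <8 x i1> %v to i8
//   %cnt  = call i8 @llvm.ctpop.i8(i8 %bits)
// and the caller later widens %cnt to the destination type.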
26089 Value *V = Builder.CreateBitCast(
26090 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
26091 ++NumVectorInstructions;
26092 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
26093 }
26094 ++NumVectorInstructions;
26095 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
26096 }
26097
26098 /// Emits optimized code for a unique scalar value reused \p Cnt times.
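/// For example, with \p Cnt == 3 an integer add reduction becomes a single
/// 'mul %v, 3', an xor reduction with an even \p Cnt folds to zero (x ^ x == 0),
/// and min/max/and/or reductions return the value unchanged.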
26099 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
26100 unsigned Cnt) {
26101 assert(IsSupportedHorRdxIdentityOp &&
26102 "The optimization of matched scalar identity horizontal reductions "
26103 "must be supported.");
26104 if (Cnt == 1)
26105 return VectorizedValue;
26106 switch (RdxKind) {
26107 case RecurKind::Add: {
26108 // res = mul vv, n
26109 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
26110 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
26111 << VectorizedValue << ". (HorRdx)\n");
26112 return Builder.CreateMul(VectorizedValue, Scale);
26113 }
26114 case RecurKind::Xor: {
26115 // res = n % 2 ? 0 : vv
26116 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
26117 << ". (HorRdx)\n");
26118 if (Cnt % 2 == 0)
26119 return Constant::getNullValue(VectorizedValue->getType());
26120 return VectorizedValue;
26121 }
26122 case RecurKind::FAdd: {
26123 // res = fmul v, n
26124 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
26125 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
26126 << VectorizedValue << ". (HorRdx)\n");
26127 return Builder.CreateFMul(VectorizedValue, Scale);
26128 }
26129 case RecurKind::And:
26130 case RecurKind::Or:
26131 case RecurKind::SMax:
26132 case RecurKind::SMin:
26133 case RecurKind::UMax:
26134 case RecurKind::UMin:
26135 case RecurKind::FMax:
26136 case RecurKind::FMin:
26137 case RecurKind::FMaximum:
26138 case RecurKind::FMinimum:
26139 // res = vv
26140 return VectorizedValue;
26141 case RecurKind::Sub:
26142 case RecurKind::AddChainWithSubs:
26143 case RecurKind::Mul:
26144 case RecurKind::FMul:
26145 case RecurKind::FMulAdd:
26146 case RecurKind::AnyOf:
26147 case RecurKind::FindFirstIVSMin:
26148 case RecurKind::FindFirstIVUMin:
26149 case RecurKind::FindLastIVSMax:
26150 case RecurKind::FindLastIVUMax:
26151 case RecurKind::FMaxNum:
26152 case RecurKind::FMinNum:
26153 case RecurKind::FMaximumNum:
26154 case RecurKind::FMinimumNum:
26155 case RecurKind::None:
26156 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26157 }
26158 return nullptr;
26159 }
26160
26161 /// Emits actual operation for the scalar identity values, found during
26162 /// horizontal reduction analysis.
26163 Value *
26164 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26165 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26166 const DenseMap<Value *, Value *> &TrackedToOrig) {
26167 assert(IsSupportedHorRdxIdentityOp &&
26168 "The optimization of matched scalar identity horizontal reductions "
26169 "must be supported.");
26170 ArrayRef<Value *> VL = R.getRootNodeScalars();
26171 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
26172 if (VTy->getElementType() != VL.front()->getType()) {
26173 VectorizedValue = Builder.CreateIntCast(
26174 VectorizedValue,
26175 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
26176 R.isSignedMinBitwidthRootNode());
26177 }
26178 switch (RdxKind) {
26179 case RecurKind::Add: {
26180 // root = mul prev_root, <1, 1, n, 1>
26181 SmallVector<Constant *> Vals;
26182 for (Value *V : VL) {
26183 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26184 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
26185 }
26186 auto *Scale = ConstantVector::get(Vals);
26187 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
26188 << VectorizedValue << ". (HorRdx)\n");
26189 return Builder.CreateMul(VectorizedValue, Scale);
26190 }
26191 case RecurKind::And:
26192 case RecurKind::Or:
26193 // No need for multiple or/and(s).
26194 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
26195 << ". (HorRdx)\n");
26196 return VectorizedValue;
26197 case RecurKind::SMax:
26198 case RecurKind::SMin:
26199 case RecurKind::UMax:
26200 case RecurKind::UMin:
26201 case RecurKind::FMax:
26202 case RecurKind::FMin:
26203 case RecurKind::FMaximum:
26204 case RecurKind::FMinimum:
26205 // No need for multiple min/max(s) of the same value.
26206 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
26207 << ". (HorRdx)\n");
26208 return VectorizedValue;
26209 case RecurKind::Xor: {
26210 // Replace values with an even number of repeats with 0, since
26211 // x xor x = 0.
26212 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
26213 // 7>, if the 4th and 6th elements have an even number of repeats.
26214 SmallVector<int> Mask(
26215 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
26216 PoisonMaskElem);
26217 std::iota(Mask.begin(), Mask.end(), 0);
26218 bool NeedShuffle = false;
26219 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
26220 Value *V = VL[I];
26221 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26222 if (Cnt % 2 == 0) {
26223 Mask[I] = VF;
26224 NeedShuffle = true;
26225 }
26226 }
26227 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
26228 : Mask) dbgs()
26229 << I << " ";
26230 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
26231 if (NeedShuffle)
26232 VectorizedValue = Builder.CreateShuffleVector(
26233 VectorizedValue,
26234 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
26235 return VectorizedValue;
26236 }
26237 case RecurKind::FAdd: {
26238 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
26239 SmallVector<Constant *> Vals;
26240 for (Value *V : VL) {
26241 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26242 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
26243 }
26244 auto *Scale = ConstantVector::get(Vals);
26245 return Builder.CreateFMul(VectorizedValue, Scale);
26246 }
26247 case RecurKind::Sub:
26248 case RecurKind::AddChainWithSubs:
26249 case RecurKind::Mul:
26250 case RecurKind::FMul:
26251 case RecurKind::FMulAdd:
26252 case RecurKind::AnyOf:
26253 case RecurKind::FindFirstIVSMin:
26254 case RecurKind::FindFirstIVUMin:
26255 case RecurKind::FindLastIVSMax:
26256 case RecurKind::FindLastIVUMax:
26257 case RecurKind::FMaxNum:
26258 case RecurKind::FMinNum:
26259 case RecurKind::FMaximumNum:
26260 case RecurKind::FMinimumNum:
26261 case RecurKind::None:
26262 llvm_unreachable("Unexpected reduction kind for reused scalars.");
26263 }
26264 return nullptr;
26265 }
26266};
26267} // end anonymous namespace
26268
26269/// Gets recurrence kind from the specified value.
26270 static RecurKind getRdxKind(Value *V) {
26271 return HorizontalReduction::getRdxKind(V);
26272}
26273static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
26274 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
26275 return cast<FixedVectorType>(IE->getType())->getNumElements();
26276
26277 unsigned AggregateSize = 1;
26278 auto *IV = cast<InsertValueInst>(InsertInst);
26279 Type *CurrentType = IV->getType();
26280 do {
26281 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
26282 for (auto *Elt : ST->elements())
26283 if (Elt != ST->getElementType(0)) // check homogeneity
26284 return std::nullopt;
26285 AggregateSize *= ST->getNumElements();
26286 CurrentType = ST->getElementType(0);
26287 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
26288 AggregateSize *= AT->getNumElements();
26289 CurrentType = AT->getElementType();
26290 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
26291 AggregateSize *= VT->getNumElements();
26292 return AggregateSize;
26293 } else if (CurrentType->isSingleValueType()) {
26294 return AggregateSize;
26295 } else {
26296 return std::nullopt;
26297 }
26298 } while (true);
26299}
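// For example, the homogeneous aggregate [2 x {float, float}] flattens to an
// aggregate size of 4, as does {<2 x float>, <2 x float>}, while a
// non-homogeneous struct such as {i32, float} yields std::nullopt.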
26300
26301static void findBuildAggregateRec(Instruction *LastInsertInst,
26302 TargetTransformInfo *TTI,
26303 SmallVectorImpl<Value *> &BuildVectorOpds,
26304 SmallVectorImpl<Value *> &InsertElts,
26305 unsigned OperandOffset, const BoUpSLP &R) {
26306 do {
26307 Value *InsertedOperand = LastInsertInst->getOperand(1);
26308 std::optional<unsigned> OperandIndex =
26309 getElementIndex(LastInsertInst, OperandOffset);
26310 if (!OperandIndex || R.isDeleted(LastInsertInst))
26311 return;
26312 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
26313 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
26314 BuildVectorOpds, InsertElts, *OperandIndex, R);
26315
26316 } else {
26317 BuildVectorOpds[*OperandIndex] = InsertedOperand;
26318 InsertElts[*OperandIndex] = LastInsertInst;
26319 }
26320 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
26321 } while (LastInsertInst != nullptr &&
26322 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
26323 LastInsertInst->hasOneUse());
26324}
26325
26326/// Recognize construction of vectors like
26327/// %ra = insertelement <4 x float> poison, float %s0, i32 0
26328/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
26329/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
26330/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
26331/// starting from the last insertelement or insertvalue instruction.
26332///
26333/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
26334/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
26335/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
26336///
26337/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
26338///
26339/// \return true if it matches.
26340static bool findBuildAggregate(Instruction *LastInsertInst,
26341 TargetTransformInfo *TTI,
26342 SmallVectorImpl<Value *> &BuildVectorOpds,
26343 SmallVectorImpl<Value *> &InsertElts,
26344 const BoUpSLP &R) {
26345
26346 assert((isa<InsertElementInst>(LastInsertInst) ||
26347 isa<InsertValueInst>(LastInsertInst)) &&
26348 "Expected insertelement or insertvalue instruction!");
26349
26350 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
26351 "Expected empty result vectors!");
26352
26353 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
26354 if (!AggregateSize)
26355 return false;
26356 BuildVectorOpds.resize(*AggregateSize);
26357 InsertElts.resize(*AggregateSize);
26358
26359 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
26360 llvm::erase(BuildVectorOpds, nullptr);
26361 llvm::erase(InsertElts, nullptr);
26362 if (BuildVectorOpds.size() >= 2)
26363 return true;
26364
26365 return false;
26366}
26367
26368/// Try and get a reduction instruction from a phi node.
26369///
26370/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
26371/// if they come from either \p ParentBB or a containing loop latch.
26372///
26373/// \returns A candidate reduction value if possible, or \code nullptr \endcode
26374/// if not possible.
26375 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
26376 BasicBlock *ParentBB, LoopInfo *LI) {
26377 // There are situations where the reduction value is not dominated by the
26378 // reduction phi. Vectorizing such cases has been reported to cause
26379 // miscompiles. See PR25787.
26380 auto DominatedReduxValue = [&](Value *R) {
26381 return isa<Instruction>(R) &&
26382 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
26383 };
26384
26385 Instruction *Rdx = nullptr;
26386
26387 // Return the incoming value if it comes from the same BB as the phi node.
26388 if (P->getIncomingBlock(0) == ParentBB) {
26389 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26390 } else if (P->getIncomingBlock(1) == ParentBB) {
26391 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26392 }
26393
26394 if (Rdx && DominatedReduxValue(Rdx))
26395 return Rdx;
26396
26397 // Otherwise, check whether we have a loop latch to look at.
26398 Loop *BBL = LI->getLoopFor(ParentBB);
26399 if (!BBL)
26400 return nullptr;
26401 BasicBlock *BBLatch = BBL->getLoopLatch();
26402 if (!BBLatch)
26403 return nullptr;
26404
26405 // There is a loop latch, return the incoming value if it comes from
26406 // that. This reduction pattern occasionally turns up.
26407 if (P->getIncomingBlock(0) == BBLatch) {
26408 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26409 } else if (P->getIncomingBlock(1) == BBLatch) {
26410 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26411 }
26412
26413 if (Rdx && DominatedReduxValue(Rdx))
26414 return Rdx;
26415
26416 return nullptr;
26417}
26418
26419static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
26420 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
26421 return true;
26422 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
26423 return true;
26424 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
26425 return true;
26426 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
26427 return true;
26428 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
26429 return true;
26430 if (match(I, m_SMax(m_Value(V0), m_Value(V1))))
26431 return true;
26432 if (match(I, m_SMin(m_Value(V0), m_Value(V1))))
26433 return true;
26434 if (match(I, m_UMax(m_Value(V0), m_Value(V1))))
26435 return true;
26436 if (match(I, m_UMin(m_Value(V0), m_Value(V1))))
26437 return true;
26438 return false;
26439}
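// E.g., both '%a = fadd fast float %x, %y' and
// '%m = call float @llvm.maxnum.f32(float %x, float %y)' match here, binding
// V0 and V1 to the two reduced operands.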
26440
26441/// We could have an initial reduction that is not an add.
26442/// r *= v1 + v2 + v3 + v4
26443/// In such a case start looking for a tree rooted in the first '+'.
26444/// \Returns the new root if found, which may be nullptr if not an instruction.
26445 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
26446 Instruction *Root) {
26447 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
26448 isa<IntrinsicInst>(Root)) &&
26449 "Expected binop, select, or intrinsic for reduction matching");
26450 Value *LHS =
26451 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
26452 Value *RHS =
26453 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
26454 if (LHS == Phi)
26455 return dyn_cast<Instruction>(RHS);
26456 if (RHS == Phi)
26457 return dyn_cast<Instruction>(LHS);
26458 return nullptr;
26459}
26460
26461 /// \returns the first operand of \p I that does not match \p Phi. If the
26462 /// operand is not an instruction it returns nullptr.
26463 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
26464 Value *Op0 = nullptr;
26465 Value *Op1 = nullptr;
26466 if (!matchRdxBop(I, Op0, Op1))
26467 return nullptr;
26468 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
26469}
26470
26471 /// \returns true if \p I is a candidate instruction for reduction vectorization.
26472 static bool isReductionCandidate(Instruction *I) {
26473 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
26474 Value *B0 = nullptr, *B1 = nullptr;
26475 bool IsBinop = matchRdxBop(I, B0, B1);
26476 return IsBinop || IsSelect;
26477}
26478
26479bool SLPVectorizerPass::vectorizeHorReduction(
26480 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
26481 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26482 if (!ShouldVectorizeHor)
26483 return false;
26484 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
26485
26486 if (Root->getParent() != BB || isa<PHINode>(Root))
26487 return false;
26488
26489 // If we can find a secondary reduction root, use that instead.
26490 auto SelectRoot = [&]() {
26491 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
26492 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
26493 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
26494 return NewRoot;
26495 return Root;
26496 };
26497
26498 // Start the analysis from the Root instruction. If a horizontal reduction is
26499 // found, try to vectorize it. If it is not a horizontal reduction, or
26500 // vectorization is not possible or not effective, and the currently analyzed
26501 // instruction is a binary operation, try to vectorize the operands using
26502 // pre-order DFS traversal order. If the operands were not vectorized, repeat
26503 // the same procedure considering each operand as a possible root of the
26504 // horizontal reduction.
26505 // Interrupt the process if the Root instruction itself was vectorized or all
26506 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
26507 // If a horizontal reduction was not matched or vectorized, we collect the
26508 // instructions for possible later vectorization attempts.
26509 std::queue<std::pair<Instruction *, unsigned>> Stack;
26510 Stack.emplace(SelectRoot(), 0);
26511 SmallPtrSet<Value *, 8> VisitedInstrs;
26512 bool Res = false;
26513 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
26514 if (R.isAnalyzedReductionRoot(Inst))
26515 return nullptr;
26516 if (!isReductionCandidate(Inst))
26517 return nullptr;
26518 HorizontalReduction HorRdx;
26519 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
26520 return nullptr;
26521 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
26522 };
26523 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
26524 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26525 FutureSeed = getNonPhiOperand(Root, P);
26526 if (!FutureSeed)
26527 return false;
26528 }
26529 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
26530 // analysis is done separately.
26531 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
26532 PostponedInsts.push_back(FutureSeed);
26533 return true;
26534 };
26535
26536 while (!Stack.empty()) {
26537 Instruction *Inst;
26538 unsigned Level;
26539 std::tie(Inst, Level) = Stack.front();
26540 Stack.pop();
26541 // Do not try to analyze instruction that has already been vectorized.
26542 // This may happen when we vectorize instruction operands on a previous
26543 // iteration while stack was populated before that happened.
26544 if (R.isDeleted(Inst))
26545 continue;
26546 if (Value *VectorizedV = TryToReduce(Inst)) {
26547 Res = true;
26548 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
26549 // Try to find another reduction.
26550 Stack.emplace(I, Level);
26551 continue;
26552 }
26553 if (R.isDeleted(Inst))
26554 continue;
26555 } else {
26556 // We could not vectorize `Inst` so try to use it as a future seed.
26557 if (!TryAppendToPostponedInsts(Inst)) {
26558 assert(Stack.empty() && "Expected empty stack");
26559 break;
26560 }
26561 }
26562
26563 // Try to vectorize operands.
26564 // Continue analysis for the instruction from the same basic block only to
26565 // save compile time.
26566 if (++Level < RecursionMaxDepth)
26567 for (auto *Op : Inst->operand_values())
26568 if (VisitedInstrs.insert(Op).second)
26569 if (auto *I = dyn_cast<Instruction>(Op))
26570 // Do not try to vectorize CmpInst operands, this is done
26571 // separately.
26572 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
26573 !R.isDeleted(I) && I->getParent() == BB)
26574 Stack.emplace(I, Level);
26575 }
26576 return Res;
26577}
26578
26579bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
26580 if (!I)
26581 return false;
26582
26583 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
26584 return false;
26585 // Skip potential FMA candidates.
26586 if ((I->getOpcode() == Instruction::FAdd ||
26587 I->getOpcode() == Instruction::FSub) &&
26588 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
26589 .isValid())
26590 return false;
26591
26592 Value *P = I->getParent();
26593
26594 // Vectorize in current basic block only.
26595 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
26596 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
26597 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
26598 R.isDeleted(Op0) || R.isDeleted(Op1))
26599 return false;
26600
26601 // First collect all possible candidates
26602 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
26603 Candidates.emplace_back(Op0, Op1);
26604
26605 auto *A = dyn_cast<BinaryOperator>(Op0);
26606 auto *B = dyn_cast<BinaryOperator>(Op1);
26607 // Try to skip B.
26608 if (A && B && B->hasOneUse()) {
26609 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
26610 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
26611 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
26612 Candidates.emplace_back(A, B0);
26613 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
26614 Candidates.emplace_back(A, B1);
26615 }
26616 // Try to skip A.
26617 if (B && A && A->hasOneUse()) {
26618 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
26619 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
26620 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
26621 Candidates.emplace_back(A0, B);
26622 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
26623 Candidates.emplace_back(A1, B);
26624 }
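// For example (illustrative): for I = (a0 + a1) + (b0 + b1), where
// A = a0 + a1 and B = b0 + b1 each have one use and a0, a1, b0 and b1 are
// binary operators in this block, the candidate root pairs collected above
// are (A, B), (A, b0), (A, b1), (a0, B) and (a1, B); the best pair is then
// selected via findBestRootPair below.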
26625
26626 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
26627 ArrayRef<Value *> Ops) {
26628 if (!isReductionCandidate(Inst))
26629 return false;
26630 Type *Ty = Inst->getType();
26631 if (!isValidElementType(Ty) || Ty->isPointerTy())
26632 return false;
26633 HorizontalReduction HorRdx(Inst, Ops);
26634 if (!HorRdx.matchReductionForOperands())
26635 return false;
26636 // Check the cost of operations.
26637 VectorType *VecTy = getWidenedType(Ty, Ops.size());
26638 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
26639 InstructionCost ScalarCost =
26640 TTI.getScalarizationOverhead(
26641 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
26642 /*Extract=*/true, CostKind) +
26643 TTI.getInstructionCost(Inst, CostKind);
26644 InstructionCost RedCost;
26645 switch (::getRdxKind(Inst)) {
26646 case RecurKind::Add:
26647 case RecurKind::Mul:
26648 case RecurKind::Or:
26649 case RecurKind::And:
26650 case RecurKind::Xor:
26651 case RecurKind::FAdd:
26652 case RecurKind::FMul: {
26653 FastMathFlags FMF;
26654 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
26655 FMF = FPCI->getFastMathFlags();
26656 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
26657 CostKind);
26658 break;
26659 }
26660 default:
26661 return false;
26662 }
26663 if (RedCost >= ScalarCost)
26664 return false;
26665
26666 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
26667 };
26668 if (Candidates.size() == 1)
26669 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
26670
26671 // We have multiple options. Try to pick the single best.
26672 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
26673 if (!BestCandidate)
26674 return false;
26675 return (*BestCandidate == 0 &&
26676 TryToReduce(I, {Candidates[*BestCandidate].first,
26677 Candidates[*BestCandidate].second})) ||
26678 tryToVectorizeList({Candidates[*BestCandidate].first,
26679 Candidates[*BestCandidate].second},
26680 R);
26681}
26682
26683bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
26684 BasicBlock *BB, BoUpSLP &R) {
26685 SmallVector<WeakTrackingVH> PostponedInsts;
26686 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
26687 Res |= tryToVectorize(PostponedInsts, R);
26688 return Res;
26689}
26690
26691bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
26692 BoUpSLP &R) {
26693 bool Res = false;
26694 for (Value *V : Insts)
26695 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
26696 Res |= tryToVectorize(Inst, R);
26697 return Res;
26698}
26699
26700bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
26701 BasicBlock *BB, BoUpSLP &R,
26702 bool MaxVFOnly) {
26703 if (!R.canMapToVector(IVI->getType()))
26704 return false;
26705
26706 SmallVector<Value *, 16> BuildVectorOpds;
26707 SmallVector<Value *, 16> BuildVectorInsts;
26708 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
26709 return false;
26710
26711 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
26712 R.getORE()->emit([&]() {
26713 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
26714 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
26715 "trying reduction first.";
26716 });
26717 return false;
26718 }
26719 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
26720 // Aggregate value is unlikely to be processed in a vector register.
26721 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
26722}
26723
26724bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
26725 BasicBlock *BB, BoUpSLP &R,
26726 bool MaxVFOnly) {
26727 SmallVector<Value *, 16> BuildVectorInsts;
26728 SmallVector<Value *, 16> BuildVectorOpds;
26729 SmallVector<int> Mask;
26730 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
26731 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
26732 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
26733 return false;
26734
26735 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
26736 R.getORE()->emit([&]() {
26737 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
26738 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
26739 "trying reduction first.";
26740 });
26741 return false;
26742 }
26743 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
26744 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
26745}
26746
26747template <typename T>
26748static bool tryToVectorizeSequence(
26749 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
26750 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
26751 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
26752 bool MaxVFOnly, BoUpSLP &R) {
26753 bool Changed = false;
26754 // Sort by type, parent, operands.
26755 stable_sort(Incoming, Comparator);
26756
26757 // Try to vectorize elements based on their type.
26758 SmallVector<T *> Candidates;
26759 SmallVector<T *> VL;
26760 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
26761 VL.clear()) {
26762 // Look for the next elements with the same type, parent and operand
26763 // kinds.
26764 auto *I = dyn_cast<Instruction>(*IncIt);
26765 if (!I || R.isDeleted(I)) {
26766 ++IncIt;
26767 continue;
26768 }
26769 auto *SameTypeIt = IncIt;
26770 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
26771 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26772 AreCompatible(VL, *SameTypeIt))) {
26773 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26774 ++SameTypeIt;
26775 if (I && !R.isDeleted(I))
26776 VL.push_back(cast<T>(I));
26777 }
26778
26779 // Try to vectorize them.
26780 unsigned NumElts = VL.size();
26781 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
26782 << NumElts << ")\n");
26783 // The vectorization is a three-stage attempt:
26784 // 1. Try to vectorize instructions with the same/alternate opcodes at the
26785 // maximal register size first.
26786 // 2. Try to vectorize the remaining instructions with the same type, if
26787 // possible. This may produce better vectorization results than trying to
26788 // vectorize only instructions with the same/alternate opcodes.
26789 // 3. Make a final attempt to vectorize all instructions with the
26790 // same/alternate ops only; this may result in some extra final
26791 // vectorization.
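// For example (illustrative): with a 4-element maximal register and six
// compatible scalar adds, stage 1 may vectorize four of them at the maximal
// VF, while the remaining two are retried in stages 2/3, possibly together
// with other instructions of the same type and with smaller vectors.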
26792 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
26793 // Success; start over because instructions might have been changed.
26794 Changed = true;
26795 VL.swap(Candidates);
26796 Candidates.clear();
26797 for (T *V : VL) {
26798 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26799 Candidates.push_back(V);
26800 }
26801 } else {
26802 /// \Returns the minimum number of elements that we will attempt to
26803 /// vectorize.
26804 auto GetMinNumElements = [&R](Value *V) {
26805 unsigned EltSize = R.getVectorElementSize(V);
26806 return std::max(2U, R.getMaxVecRegSize() / EltSize);
26807 };
26808 if (NumElts < GetMinNumElements(*IncIt) &&
26809 (Candidates.empty() ||
26810 Candidates.front()->getType() == (*IncIt)->getType())) {
26811 for (T *V : VL) {
26812 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26813 Candidates.push_back(V);
26814 }
26815 }
26816 }
26817 // Final attempt to vectorize instructions with the same types.
26818 if (Candidates.size() > 1 &&
26819 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
26820 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
26821 // Success; start over because instructions might have been changed.
26822 Changed = true;
26823 } else if (MaxVFOnly) {
26824 // Try to vectorize using small vectors.
26825 SmallVector<T *> VL;
26826 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
26827 VL.clear()) {
26828 auto *I = dyn_cast<Instruction>(*It);
26829 if (!I || R.isDeleted(I)) {
26830 ++It;
26831 continue;
26832 }
26833 auto *SameTypeIt = It;
26834 while (SameTypeIt != End &&
26835 (!isa<Instruction>(*SameTypeIt) ||
26836 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26837 AreCompatible(*SameTypeIt, *It))) {
26838 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26839 ++SameTypeIt;
26840 if (I && !R.isDeleted(I))
26841 VL.push_back(cast<T>(I));
26842 }
26843 unsigned NumElts = VL.size();
26844 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26845 /*MaxVFOnly=*/false))
26846 Changed = true;
26847 It = SameTypeIt;
26848 }
26849 }
26850 Candidates.clear();
26851 }
26852
26853 // Start over at the next instruction of a different type (or the end).
26854 IncIt = SameTypeIt;
26855 }
26856 return Changed;
26857}
26858
26859/// Compare two cmp instructions. If IsCompatibility is true, the function
26860/// returns true if the 2 cmps have same/swapped predicates and compatible
26861/// corresponding operands. If IsCompatibility is false, the function implements
26862/// a strict weak ordering relation between two cmp instructions, returning true
26863/// if the first instruction is "less" than the second, i.e. its predicate is
26864/// less than the predicate of the second or the operand IDs are less than the
26865/// operand IDs of the second cmp instruction.
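/// For example (illustrative): %c1 = icmp slt i32 %x, %y and
/// %c2 = icmp sgt i32 %y, %x have swapped predicates and swapped operands, so
/// compareCmp<true> treats them as compatible, while compareCmp<false> orders
/// neither one before the other.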
26866template <bool IsCompatibility>
26867static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
26868 const DominatorTree &DT) {
26869 assert(isValidElementType(V->getType()) &&
26870 isValidElementType(V2->getType()) &&
26871 "Expected valid element types only.");
26872 if (V == V2)
26873 return IsCompatibility;
26874 auto *CI1 = cast<CmpInst>(V);
26875 auto *CI2 = cast<CmpInst>(V2);
26876 if (CI1->getOperand(0)->getType()->getTypeID() <
26877 CI2->getOperand(0)->getType()->getTypeID())
26878 return !IsCompatibility;
26879 if (CI1->getOperand(0)->getType()->getTypeID() >
26880 CI2->getOperand(0)->getType()->getTypeID())
26881 return false;
26882 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26883 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26884 return !IsCompatibility;
26885 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26886 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26887 return false;
26888 CmpInst::Predicate Pred1 = CI1->getPredicate();
26889 CmpInst::Predicate Pred2 = CI2->getPredicate();
26890 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
26891 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
26892 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
26893 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
26894 if (BasePred1 < BasePred2)
26895 return !IsCompatibility;
26896 if (BasePred1 > BasePred2)
26897 return false;
26898 // Compare operands.
26899 bool CI1Preds = Pred1 == BasePred1;
26900 bool CI2Preds = Pred2 == BasePred1;
26901 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26902 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26903 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
26904 if (Op1 == Op2)
26905 continue;
26906 if (Op1->getValueID() < Op2->getValueID())
26907 return !IsCompatibility;
26908 if (Op1->getValueID() > Op2->getValueID())
26909 return false;
26910 if (auto *I1 = dyn_cast<Instruction>(Op1))
26911 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
26912 if (IsCompatibility) {
26913 if (I1->getParent() != I2->getParent())
26914 return false;
26915 } else {
26916 // Try to compare nodes with same parent.
26917 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
26918 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
26919 if (!NodeI1)
26920 return NodeI2 != nullptr;
26921 if (!NodeI2)
26922 return false;
26923 assert((NodeI1 == NodeI2) ==
26924 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26925 "Different nodes should have different DFS numbers");
26926 if (NodeI1 != NodeI2)
26927 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26928 }
26929 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26930 if (S && (IsCompatibility || !S.isAltShuffle()))
26931 continue;
26932 if (IsCompatibility)
26933 return false;
26934 if (I1->getOpcode() != I2->getOpcode())
26935 return I1->getOpcode() < I2->getOpcode();
26936 }
26937 }
26938 return IsCompatibility;
26939}
26940
26941template <typename ItT>
26942bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26943 BasicBlock *BB, BoUpSLP &R) {
26944 bool Changed = false;
26945 // Try to find reductions first.
26946 for (CmpInst *I : CmpInsts) {
26947 if (R.isDeleted(I))
26948 continue;
26949 for (Value *Op : I->operands())
26950 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26951 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26952 if (R.isDeleted(I))
26953 break;
26954 }
26955 }
26956 // Try to vectorize operands as vector bundles.
26957 for (CmpInst *I : CmpInsts) {
26958 if (R.isDeleted(I))
26959 continue;
26960 Changed |= tryToVectorize(I, R);
26961 }
26962 // Try to vectorize list of compares.
26963 // Sort by type, compare predicate, etc.
26964 auto CompareSorter = [&](Value *V, Value *V2) {
26965 if (V == V2)
26966 return false;
26967 return compareCmp<false>(V, V2, *TLI, *DT);
26968 };
26969
26970 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
26971 if (VL.empty() || VL.back() == V1)
26972 return true;
26973 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
26974 };
26975
26976 SmallVector<Value *> Vals;
26977 for (Instruction *V : CmpInsts)
26978 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
26979 Vals.push_back(V);
26980 if (Vals.size() <= 1)
26981 return Changed;
26982 Changed |= tryToVectorizeSequence<Value>(
26983 Vals, CompareSorter, AreCompatibleCompares,
26984 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26985 // Exclude possible reductions from other blocks.
26986 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26987 return any_of(V->users(), [V](User *U) {
26988 auto *Select = dyn_cast<SelectInst>(U);
26989 return Select &&
26990 Select->getParent() != cast<Instruction>(V)->getParent();
26991 });
26992 });
26993 if (ArePossiblyReducedInOtherBlock)
26994 return false;
26995 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26996 },
26997 /*MaxVFOnly=*/true, R);
26998 return Changed;
26999}
27000
27001bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
27002 BasicBlock *BB, BoUpSLP &R) {
27003 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
27004 "This function only accepts Insert instructions");
27005 bool OpsChanged = false;
27006 SmallVector<WeakTrackingVH> PostponedInsts;
27007 for (auto *I : reverse(Instructions)) {
27008 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
27009 if (R.isDeleted(I) || isa<CmpInst>(I))
27010 continue;
27011 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27012 OpsChanged |=
27013 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
27014 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27015 OpsChanged |=
27016 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
27017 }
27018 // pass2 - try to vectorize reductions only
27019 if (R.isDeleted(I))
27020 continue;
27021 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
27022 if (R.isDeleted(I) || isa<CmpInst>(I))
27023 continue;
27024 // pass3 - try to match and vectorize a buildvector sequence.
27025 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27026 OpsChanged |=
27027 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
27028 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27029 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
27030 /*MaxVFOnly=*/false);
27031 }
27032 }
27033 // Now try to vectorize postponed instructions.
27034 OpsChanged |= tryToVectorize(PostponedInsts, R);
27035
27036 Instructions.clear();
27037 return OpsChanged;
27038}
27039
27040bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
27041 bool Changed = false;
27042 SmallVector<Value *, 4> Incoming;
27043 SmallPtrSet<Value *, 16> VisitedInstrs;
27044 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
27045 // node. This makes it easier to identify the chains that can be vectorized
27046 // in a better way.
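// For example (illustrative): for %p = phi [ %a, %bb0 ], [ %q, %bb1 ], where
// %q is itself a phi of %b and %c, the map entry for %p collects the non-phi
// values {%a, %b, %c} reachable through the phi chain (see the collection
// loop below).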
27047 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
27048 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
27049 assert(isValidElementType(V1->getType()) &&
27050 isValidElementType(V2->getType()) &&
27051 "Expected vectorizable types only.");
27052 if (V1 == V2)
27053 return false;
27054 // It is fine to compare type IDs here, since we expect only vectorizable
27055 // types, like ints, floats and pointers; we don't care about other types.
27056 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
27057 return true;
27058 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
27059 return false;
27060 if (V1->getType()->getScalarSizeInBits() <
27061 V2->getType()->getScalarSizeInBits())
27062 return true;
27063 if (V1->getType()->getScalarSizeInBits() >
27064 V2->getType()->getScalarSizeInBits())
27065 return false;
27066 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27067 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27068 if (Opcodes1.size() < Opcodes2.size())
27069 return true;
27070 if (Opcodes1.size() > Opcodes2.size())
27071 return false;
27072 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27073 {
27074 // Instructions come first.
27075 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
27076 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
27077 if (I1 && I2) {
27078 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27079 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27080 if (!NodeI1)
27081 return NodeI2 != nullptr;
27082 if (!NodeI2)
27083 return false;
27084 assert((NodeI1 == NodeI2) ==
27085 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27086 "Different nodes should have different DFS numbers");
27087 if (NodeI1 != NodeI2)
27088 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27089 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
27090 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27091 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
27092 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
27093 if (!E1 || !E2)
27094 continue;
27095
27096 // Sort on ExtractElementInsts primarily by vector operands. Prefer
27097 // program order of the vector operands.
27098 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
27099 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
27100 if (V1 != V2) {
27101 if (V1 && !V2)
27102 return true;
27103 if (!V1 && V2)
27104 return false;
27105 DomTreeNodeBase<BasicBlock> *NodeI1 =
27106 DT->getNode(V1->getParent());
27107 DomTreeNodeBase<BasicBlock> *NodeI2 =
27108 DT->getNode(V2->getParent());
27109 if (!NodeI1)
27110 return NodeI2 != nullptr;
27111 if (!NodeI2)
27112 return false;
27113 assert((NodeI1 == NodeI2) ==
27114 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27115 "Different nodes should have different DFS numbers");
27116 if (NodeI1 != NodeI2)
27117 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27118 return V1->comesBefore(V2);
27119 }
27120 // If we have the same vector operand, try to sort by constant
27121 // index.
27122 std::optional<unsigned> Id1 = getExtractIndex(E1);
27123 std::optional<unsigned> Id2 = getExtractIndex(E2);
27124 // Bring constants to the top
27125 if (Id1 && !Id2)
27126 return true;
27127 if (!Id1 && Id2)
27128 return false;
27129 // First elements come first.
27130 if (Id1 && Id2)
27131 return *Id1 < *Id2;
27132
27133 continue;
27134 }
27135 if (I1->getOpcode() == I2->getOpcode())
27136 continue;
27137 return I1->getOpcode() < I2->getOpcode();
27138 }
27139 if (I1)
27140 return true;
27141 if (I2)
27142 return false;
27143 }
27144 {
27145 // Non-undef constants come next.
27146 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
27147 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
27148 if (C1 && C2)
27149 continue;
27150 if (C1)
27151 return true;
27152 if (C2)
27153 return false;
27154 }
27155 bool U1 = isa<UndefValue>(Opcodes1[I]);
27156 bool U2 = isa<UndefValue>(Opcodes2[I]);
27157 {
27158 // Non-constant non-instructions come next.
27159 if (!U1 && !U2) {
27160 auto ValID1 = Opcodes1[I]->getValueID();
27161 auto ValID2 = Opcodes2[I]->getValueID();
27162 if (ValID1 == ValID2)
27163 continue;
27164 if (ValID1 < ValID2)
27165 return true;
27166 if (ValID1 > ValID2)
27167 return false;
27168 }
27169 if (!U1)
27170 return true;
27171 if (!U2)
27172 return false;
27173 }
27174 // Undefs come last.
27175 assert(U1 && U2 && "The only thing left should be undef & undef.");
27176 }
27177 return false;
27178 };
27179 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
27180 Value *V1) {
27181 if (VL.empty() || V1 == VL.back())
27182 return true;
27183 Value *V2 = VL.back();
27184 if (V1->getType() != V2->getType())
27185 return false;
27186 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27187 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27188 if (Opcodes1.size() != Opcodes2.size())
27189 return false;
27190 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27191 // Undefs are compatible with any other value.
27192 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
27193 continue;
27194 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
27195 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
27196 if (R.isDeleted(I1) || R.isDeleted(I2))
27197 return false;
27198 if (I1->getParent() != I2->getParent())
27199 return false;
27200 if (getSameOpcode({I1, I2}, *TLI))
27201 continue;
27202 return false;
27203 }
27204 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
27205 continue;
27206 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
27207 return false;
27208 }
27209 return true;
27210 };
27211
27212 bool HaveVectorizedPhiNodes = false;
27213 do {
27214 // Collect the incoming values from the PHIs.
27215 Incoming.clear();
27216 for (Instruction &I : *BB) {
27217 auto *P = dyn_cast<PHINode>(&I);
27218 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
27219 break;
27220
27221 // No need to analyze deleted, vectorized and non-vectorizable
27222 // instructions.
27223 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
27224 isValidElementType(P->getType()))
27225 Incoming.push_back(P);
27226 }
27227
27228 if (Incoming.size() <= 1)
27229 break;
27230
27231 // Find the corresponding non-phi nodes for better matching when trying to
27232 // build the tree.
27233 for (Value *V : Incoming) {
27234 SmallVectorImpl<Value *> &Opcodes =
27235 PHIToOpcodes.try_emplace(V).first->getSecond();
27236 if (!Opcodes.empty())
27237 continue;
27238 SmallVector<Value *, 4> Nodes(1, V);
27239 SmallPtrSet<Value *, 4> Visited;
27240 while (!Nodes.empty()) {
27241 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
27242 if (!Visited.insert(PHI).second)
27243 continue;
27244 for (Value *V : PHI->incoming_values()) {
27245 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
27246 Nodes.push_back(PHI1);
27247 continue;
27248 }
27249 Opcodes.emplace_back(V);
27250 }
27251 }
27252 }
27253
27254 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
27255 Incoming, PHICompare, AreCompatiblePHIs,
27256 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27257 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27258 },
27259 /*MaxVFOnly=*/true, R);
27260 Changed |= HaveVectorizedPhiNodes;
27261 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
27262 auto *PHI = dyn_cast<PHINode>(P.first);
27263 return !PHI || R.isDeleted(PHI);
27264 }))
27265 PHIToOpcodes.clear();
27266 VisitedInstrs.insert_range(Incoming);
27267 } while (HaveVectorizedPhiNodes);
27268
27269 VisitedInstrs.clear();
27270
27271 InstSetVector PostProcessInserts;
27272 SmallSetVector<CmpInst *, 8> PostProcessCmps;
27273 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
27274 // also vectorizes `PostProcessCmps`.
27275 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
27276 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
27277 if (VectorizeCmps) {
27278 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
27279 PostProcessCmps.clear();
27280 }
27281 PostProcessInserts.clear();
27282 return Changed;
27283 };
27284 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
27285 auto IsInPostProcessInstrs = [&](Instruction *I) {
27286 if (auto *Cmp = dyn_cast<CmpInst>(I))
27287 return PostProcessCmps.contains(Cmp);
27288 return isa<InsertElementInst, InsertValueInst>(I) &&
27289 PostProcessInserts.contains(I);
27290 };
27291 // Returns true if `I` is an instruction without users, like a terminator, a
27292 // store, or a function call with an ignored return value. Unused instructions
27293 // are detected based on the instruction type (except for CallInst and InvokeInst).
27294 auto HasNoUsers = [](Instruction *I) {
27295 return I->use_empty() &&
27296 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
27297 };
27298 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
27299 // Skip instructions with scalable type. The number of elements is unknown
27300 // at compile time for scalable types.
27301 if (isa<ScalableVectorType>(It->getType()))
27302 continue;
27303
27304 // Skip instructions marked for deletion.
27305 if (R.isDeleted(&*It))
27306 continue;
27307 // We may go through BB multiple times, so skip the ones we have already checked.
27308 if (!VisitedInstrs.insert(&*It).second) {
27309 if (HasNoUsers(&*It) &&
27310 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
27311 // We would like to start over since some instructions are deleted
27312 // and the iterator may become invalid.
27313 Changed = true;
27314 It = BB->begin();
27315 E = BB->end();
27316 }
27317 continue;
27318 }
27319
27320 // Try to vectorize reductions that use PHINodes.
27321 if (PHINode *P = dyn_cast<PHINode>(It)) {
27322 // Check that the PHI is a reduction PHI.
27323 if (P->getNumIncomingValues() == 2) {
27324 // Try to match and vectorize a horizontal reduction.
27325 Instruction *Root = getReductionInstr(DT, P, BB, LI);
27326 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
27327 Changed = true;
27328 It = BB->begin();
27329 E = BB->end();
27330 continue;
27331 }
27332 }
27333 // Try to vectorize the incoming values of the PHI, to catch reductions
27334 // that feed into PHIs.
27335 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
27336 // Skip if the incoming block is the current BB for now. Also, bypass
27337 // unreachable IR for efficiency and to avoid crashing.
27338 // TODO: Collect the skipped incoming values and try to vectorize them
27339 // after processing BB.
27340 if (BB == P->getIncomingBlock(I) ||
27341 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
27342 continue;
27343
27344 // Postponed instructions should not be vectorized here, delay their
27345 // vectorization.
27346 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
27347 PI && !IsInPostProcessInstrs(PI)) {
27348 bool Res =
27349 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
27350 Changed |= Res;
27351 if (Res && R.isDeleted(P)) {
27352 It = BB->begin();
27353 E = BB->end();
27354 break;
27355 }
27356 }
27357 }
27358 continue;
27359 }
27360
27361 if (HasNoUsers(&*It)) {
27362 bool OpsChanged = false;
27363 auto *SI = dyn_cast<StoreInst>(It);
27364 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
27365 if (SI) {
27366 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
27367 // Try to vectorize chain in store, if this is the only store to the
27368 // address in the block.
27369 // TODO: This is just a temporary solution to save compile time. Need
27370 // to investigate if we can safely turn on slp-vectorize-hor-store
27371 // instead to allow lookup for reduction chains in all non-vectorized
27372 // stores (need to check side effects and compile time).
27373 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
27374 SI->getValueOperand()->hasOneUse();
27375 }
27376 if (TryToVectorizeRoot) {
27377 for (auto *V : It->operand_values()) {
27378 // Postponed instructions should not be vectorized here, delay their
27379 // vectorization.
27380 if (auto *VI = dyn_cast<Instruction>(V);
27381 VI && !IsInPostProcessInstrs(VI))
27382 // Try to match and vectorize a horizontal reduction.
27383 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
27384 }
27385 }
27386 // Start vectorization of post-process list of instructions from the
27387 // top-tree instructions to try to vectorize as many instructions as
27388 // possible.
27389 OpsChanged |=
27390 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
27391 if (OpsChanged) {
27392 // We would like to start over since some instructions are deleted
27393 // and the iterator may become invalid.
27394 Changed = true;
27395 It = BB->begin();
27396 E = BB->end();
27397 continue;
27398 }
27399 }
27400
27401 if (isa<InsertElementInst, InsertValueInst>(It))
27402 PostProcessInserts.insert(&*It);
27403 else if (isa<CmpInst>(It))
27404 PostProcessCmps.insert(cast<CmpInst>(&*It));
27405 }
27406
27407 return Changed;
27408}
27409
27410bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
27411 auto Changed = false;
27412 for (auto &Entry : GEPs) {
27413 // If the getelementptr list has fewer than two elements, there's nothing
27414 // to do.
27415 if (Entry.second.size() < 2)
27416 continue;
27417
27418 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
27419 << Entry.second.size() << ".\n");
27420
27421 // Process the GEP list in chunks suitable for the target's supported
27422 // vector size. If a vector register can't hold 1 element, we are done. We
27423 // are trying to vectorize the index computations, so the maximum number of
27424 // elements is based on the size of the index expression, rather than the
27425 // size of the GEP itself (the target's pointer size).
27426 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
27427 return !R.isDeleted(GEP);
27428 });
27429 if (It == Entry.second.end())
27430 continue;
27431 unsigned MaxVecRegSize = R.getMaxVecRegSize();
27432 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
27433 if (MaxVecRegSize < EltSize)
27434 continue;
27435
27436 unsigned MaxElts = MaxVecRegSize / EltSize;
27437 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
27438 auto Len = std::min<unsigned>(BE - BI, MaxElts);
27439 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
27440
27441 // Initialize a set of candidate getelementptrs. Note that we use a
27442 // SetVector here to preserve program order. If the index computations
27443 // are vectorizable and begin with loads, we want to minimize the chance
27444 // of having to reorder them later.
27445 SetVector<Value *> Candidates(llvm::from_range, GEPList);
27446
27447 // Some of the candidates may have already been vectorized after we
27448 // initially collected them, or their index was optimized to a constant value.
27449 // If so, they are marked as deleted, so remove them from the set of
27450 // candidates.
27451 Candidates.remove_if([&R](Value *I) {
27452 return R.isDeleted(cast<Instruction>(I)) ||
27453 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
27454 });
27455
27456 // Remove from the set of candidates all pairs of getelementptrs with
27457 // constant differences. Such getelementptrs are likely not good
27458 // candidates for vectorization in a bottom-up phase since one can be
27459 // computed from the other. We also ensure all candidate getelementptr
27460 // indices are unique.
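// For example (illustrative): %gep1 = getelementptr i32, ptr %p, i64 %i and
// %gep2 = getelementptr i32, ptr %p, i64 %j, with %j = add i64 %i, 4, have a
// constant SCEV difference, so both are dropped from the candidate set.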
27461 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
27462 auto *GEPI = GEPList[I];
27463 if (!Candidates.count(GEPI))
27464 continue;
27465 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
27466 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
27467 auto *GEPJ = GEPList[J];
27468 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
27469 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
27470 Candidates.remove(GEPI);
27471 Candidates.remove(GEPJ);
27472 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
27473 Candidates.remove(GEPJ);
27474 }
27475 }
27476 }
27477
27478 // We break out of the above computation as soon as we know there are
27479 // fewer than two candidates remaining.
27480 if (Candidates.size() < 2)
27481 continue;
27482
27483 // Add the single, non-constant index of each candidate to the bundle. We
27484 // ensured the indices met these constraints when we originally collected
27485 // the getelementptrs.
27486 SmallVector<Value *, 16> Bundle(Candidates.size());
27487 auto BundleIndex = 0u;
27488 for (auto *V : Candidates) {
27489 auto *GEP = cast<GetElementPtrInst>(V);
27490 auto *GEPIdx = GEP->idx_begin()->get();
27491 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
27492 Bundle[BundleIndex++] = GEPIdx;
27493 }
27494
27495 // Try and vectorize the indices. We are currently only interested in
27496 // gather-like cases of the form:
27497 //
27498 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
27499 //
27500 // where the loads of "a", the loads of "b", and the subtractions can be
27501 // performed in parallel. It's likely that detecting this pattern in a
27502 // bottom-up phase will be simpler and less costly than building a
27503 // full-blown top-down phase beginning at the consecutive loads.
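// An illustrative IR shape of that pattern (the bundle built above holds the
// index values %d0, %d1, ...):
//   %d0 = sub i64 %a0, %b0
//   %g0 = getelementptr inbounds i32, ptr %g, i64 %d0
//   %d1 = sub i64 %a1, %b1
//   %g1 = getelementptr inbounds i32, ptr %g, i64 %d1
// Vectorizing the subtractions (and the loads feeding them) can be profitable
// even though the loads through %g0/%g1 remain scalar gathers.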
27504 Changed |= tryToVectorizeList(Bundle, R);
27505 }
27506 }
27507 return Changed;
27508}
27509
27510bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
27511 bool Changed = false;
27512 // Sort by type, base pointer and value operand. Value operands must be
27513 // compatible (have the same opcode, same parent), otherwise it is
27514 // definitely not profitable to try to vectorize them.
27515 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
27516 if (V->getValueOperand()->getType()->getTypeID() <
27517 V2->getValueOperand()->getType()->getTypeID())
27518 return true;
27519 if (V->getValueOperand()->getType()->getTypeID() >
27520 V2->getValueOperand()->getType()->getTypeID())
27521 return false;
27522 if (V->getPointerOperandType()->getTypeID() <
27523 V2->getPointerOperandType()->getTypeID())
27524 return true;
27525 if (V->getPointerOperandType()->getTypeID() >
27526 V2->getPointerOperandType()->getTypeID())
27527 return false;
27528 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
27529 V2->getValueOperand()->getType()->getScalarSizeInBits())
27530 return true;
27531 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
27532 V2->getValueOperand()->getType()->getScalarSizeInBits())
27533 return false;
27534 // UndefValues are compatible with all other values.
27535 auto *I1 = dyn_cast<Instruction>(V->getValueOperand());
27536 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
27537 if (I1 && I2) {
27538 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27539 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27540 assert(NodeI1 && "Should only process reachable instructions");
27541 assert(NodeI2 && "Should only process reachable instructions");
27542 assert((NodeI1 == NodeI2) ==
27543 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27544 "Different nodes should have different DFS numbers");
27545 if (NodeI1 != NodeI2)
27546 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27547 return I1->getOpcode() < I2->getOpcode();
27548 }
27549 if (I1 && !I2)
27550 return true;
27551 if (!I1 && I2)
27552 return false;
27553 return V->getValueOperand()->getValueID() <
27554 V2->getValueOperand()->getValueID();
27555 };
27556
27557 bool SameParent = true;
27558 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
27559 if (VL.empty()) {
27560 SameParent = true;
27561 return true;
27562 }
27563 StoreInst *V2 = VL.back();
27564 if (V1 == V2)
27565 return true;
27566 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
27567 return false;
27568 if (V1->getPointerOperandType() != V2->getPointerOperandType())
27569 return false;
27570 // Undefs are compatible with any other value.
27571 if (isa<UndefValue>(V1->getValueOperand()) ||
27572 isa<UndefValue>(V2->getValueOperand()))
27573 return true;
27574 if (isa<Constant>(V1->getValueOperand()) &&
27575 isa<Constant>(V2->getValueOperand()))
27576 return true;
27577 // Check if the operands of the stores can be vectorized. They can be
27578 // vectorized if they have compatible operands, or operands that can be
27579 // vectorized as copyables.
27580 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
27581 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
27582 if (I1 || I2) {
27583 // Accept only tail-following non-compatible values for now.
27584 // TODO: investigate if it is possible to vectorize incompatible values,
27585 // if the copyables are first in the list.
27586 if (I1 && !I2)
27587 return false;
27588 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
27589 SmallVector<Value *> NewVL(VL.size() + 1);
27590 for (auto [SI, V] : zip(VL, NewVL))
27591 V = SI->getValueOperand();
27592 NewVL.back() = V1->getValueOperand();
27593 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
27594 InstructionsState S = Analysis.buildInstructionsState(
27595 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
27596 /*SkipSameCodeCheck=*/!SameParent);
27597 if (S)
27598 return true;
27599 if (!SameParent)
27600 return false;
27601 }
27602 return V1->getValueOperand()->getValueID() ==
27603 V2->getValueOperand()->getValueID();
27604 };
27605
27606 // Attempt to sort and vectorize each of the store-groups.
27607 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
27608 for (auto &Pair : Stores) {
27609 if (Pair.second.size() < 2)
27610 continue;
27611
27612 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
27613 << Pair.second.size() << ".\n");
27614
27615 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
27616 continue;
27617
27618 // Reverse the stores to do bottom-to-top analysis. This is important if
27619 // values are stored to the same address several times; in this case we need
27620 // to follow the store order (reversed to meet the memory dependencies).
27621 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
27622 Pair.second.rend());
27623 Changed |= tryToVectorizeSequence<StoreInst>(
27624 ReversedStores, StoreSorter, AreCompatibleStores,
27625 [&](ArrayRef<StoreInst *> Candidates, bool) {
27626 return vectorizeStores(Candidates, R, Attempted);
27627 },
27628 /*MaxVFOnly=*/false, R);
27629 }
27630 return Changed;
27631}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:646
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
static Type * getIndexType(Value *In)
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1415
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1549
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1339
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1677
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1405
void negate()
Negate this APInt in place.
Definition APInt.h:1477
unsigned logBase2() const
Definition APInt.h:1770
void setAllBits()
Set every bit to 1.
Definition APInt.h:1328
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1376
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
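A minimal sketch of the APInt bit-manipulation calls listed above; the 8-bit width, the concrete values, and the helper name are illustrative only.

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

// Hypothetical helper exercising the APInt operations documented above.
static uint64_t apintSketch() {
  APInt Bits = APInt::getZero(8);   // 0b00000000
  Bits.setBit(3);                   // 0b00001000
  assert(Bits.isPowerOf2() && Bits.logBase2() == 3);
  Bits.setAllBits();                // 0b11111111
  Bits.clearBit(0);                 // 0b11111110
  return Bits.getZExtValue();       // 254
}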
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:178
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:219
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:195
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:157
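A minimal sketch of the ArrayRef accessors listed above; the local buffer and the helper name are hypothetical.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Hypothetical helper showing non-owning views over a local buffer.
static int arrayRefSketch() {
  SmallVector<int, 4> Storage = {1, 2, 3, 4};
  ArrayRef<int> A(Storage);            // view, no copy
  ArrayRef<int> Tail = A.drop_front(); // {2, 3, 4}
  ArrayRef<int> Mid = A.slice(1, 2);   // {2, 3}
  return A.front() + Tail.back() + (int)Mid.size(); // 1 + 4 + 2
}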
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:488
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:491
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:718
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
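A minimal sketch combining the BasicBlock queries listed above; the helper name is hypothetical.

#include "llvm/IR/BasicBlock.h"
using namespace llvm;

// Hypothetical helper: skip EH pads, otherwise land after leading PHIs,
// debug intrinsics and static allocas.
static BasicBlock::const_iterator safeInsertPoint(const BasicBlock &BB) {
  if (BB.isEHPad())
    return BB.end();
  return BB.getFirstNonPHIOrDbgOrAlloca();
}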
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
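A minimal sketch of the CallBase accessors listed above; the helper name and the direct/non-builtin filter are illustrative assumptions.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Hypothetical helper: collect argument types of a direct, non-builtin call.
static SmallVector<Type *> directCallArgTypes(const CallBase &CB) {
  SmallVector<Type *> Tys;
  if (!CB.getCalledFunction() || CB.isNoBuiltin())
    return Tys;
  for (unsigned I = 0, E = CB.arg_size(); I != E; ++I)
    Tys.push_back(CB.getArgOperand(I)->getType());
  return Tys;
}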
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
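A minimal sketch of the static predicate helpers listed above, using ICMP_SLT as the example predicate; the helper name is hypothetical.

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Hypothetical helper demonstrating the predicate transforms.
static void predicateSketch() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;
  CmpInst::Predicate Swapped = CmpInst::getSwappedPredicate(P); // ICMP_SGT
  CmpInst::Predicate Inverse = CmpInst::getInversePredicate(P); // ICMP_SGE
  (void)Swapped;
  (void)Inverse;
}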
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static bool shouldExecute(CounterInfo &Counter)
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
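A minimal sketch of the DenseMap operations listed above; the Value*-to-lane mapping and the helper name are hypothetical.

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Hypothetical helper mapping a value to a lane index.
static unsigned lookupLane(Value *V) {
  DenseMap<Value *, unsigned> Lane;
  Lane.try_emplace(V, 0u);     // inserts only if the key is not yet present
  if (Lane.contains(V))
    return Lane.lookup(V);     // default-constructed value if the key is absent
  return ~0u;
}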
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2585
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2651
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2607
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
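A minimal sketch of the IRBuilder entry points listed above, assuming an existing block BB and two i32 values L and R; the helper and value names are hypothetical.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: append (L + R) < R followed by a freeze to block BB.
static Value *irBuilderSketch(BasicBlock *BB, Value *L, Value *R) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB);   // append created instructions to the end of BB
  Value *Sum = Builder.CreateBinOp(Instruction::Add, L, R, "sum");
  Value *Cmp = Builder.CreateICmp(CmpInst::ICMP_SLT, Sum, R, "cmp");
  return Builder.CreateFreeze(Cmp, "fr");
}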
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:154
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
T & front() const
front - Get the first element.
Definition ArrayRef.h:349
iterator end() const
Definition ArrayRef.h:343
iterator begin() const
Definition ArrayRef.h:342
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
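A minimal sketch of the SCEV helpers listed above, assuming P0 and P1 are pointer-typed values; the helper name is hypothetical, and a non-constant or incomparable difference simply yields std::nullopt.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include <optional>
using namespace llvm;

// Hypothetical helper: constant byte distance between two pointers, if any.
static std::optional<int64_t> constantDistance(ScalarEvolution &SE, Value *P0,
                                               Value *P1) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(P1), SE.getSCEV(P0));
  if (const auto *C = dyn_cast<SCEVConstant>(Diff))
    return C->getAPInt().getSExtValue();
  return std::nullopt;
}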
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:91
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:132
void insert_range(Range &&R)
Definition SetVector.h:176
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:94
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
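A minimal sketch of the SetVector operations listed above: set semantics with deterministic, insertion-order iteration; the helper name is hypothetical.

#include "llvm/ADT/SetVector.h"
using namespace llvm;

// Hypothetical helper: duplicates are ignored, order of first insertion is kept.
static unsigned setVectorSketch() {
  SetVector<int> SV;
  SV.insert(3);
  SV.insert(1);
  SV.insert(3);               // duplicate, ignored
  bool Has = SV.contains(1);  // true
  (void)Has;
  return SV.size();           // 2; iteration order is {3, 1}
}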
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
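A minimal sketch of the static mask classifiers listed above, applied to a hand-written reverse mask over 4 source elements; the helper name is hypothetical.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper classifying the mask {3, 2, 1, 0}.
static void maskSketch() {
  SmallVector<int, 4> Mask = {3, 2, 1, 0};
  bool IsReverse = ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/4);   // true
  bool IsIdentity = ShuffleVectorInst::isIdentityMask(Mask, /*NumSrcElts=*/4); // false
  int Index;
  bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(Mask, 4, Index);  // false
  (void)IsReverse;
  (void)IsIdentity;
  (void)IsExtract;
}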
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
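A minimal sketch of the SmallBitVector queries listed above; the width and the helper name are illustrative.

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

// Hypothetical helper: set two bits and walk them.
static void bitVectorSketch() {
  SmallBitVector BV(8);            // 8 bits, all clear
  BV.set(2);
  BV.set(5);
  bool HasAny = BV.any();          // true
  int First = BV.find_first();     // 2
  int Next = BV.find_next(First);  // 5
  unsigned Count = BV.count();     // 2
  (void)HasAny;
  (void)Next;
  (void)Count;
}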
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
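A minimal sketch of the TargetTransformInfo cost queries listed above; the helper name and the scalar-vs-vector comparison are illustrative, not how the pass itself accumulates costs.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical helper: compare the vector form of an add against VF scalar
// adds at reciprocal throughput; a negative result favors the vector form.
static InstructionCost vectorAddSaving(const TargetTransformInfo &TTI,
                                       Type *EltTy, unsigned VF) {
  auto *VecTy = FixedVectorType::get(EltTy, VF);
  InstructionCost ScalarCost = TTI.getArithmeticInstrCost(
      Instruction::Add, EltTy, TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost VectorCost = TTI.getArithmeticInstrCost(
      Instruction::Add, VecTy, TargetTransformInfo::TCK_RecipThroughput);
  return VectorCost - ScalarCost * VF;
}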
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:180
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
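A minimal sketch of the Type queries listed above; the helper name is hypothetical.

#include "llvm/IR/Type.h"
using namespace llvm;

// Hypothetical helper: element bit-width for integer/FP scalars and vectors.
static unsigned elementBitWidth(Type *Ty) {
  Type *Scalar = Ty->getScalarType();  // element type for vectors, Ty otherwise
  if (Scalar->isIntegerTy() || Scalar->isFloatingPointTy())
    return Scalar->getScalarSizeInBits();
  return 0;
}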
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:285
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:233
unsigned getNumOperands() const
Definition User.h:255
iterator_range< value_op_iterator > operand_values()
Definition User.h:317
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
bool hasUseList() const
Check if this Value has a use-list.
Definition Value.h:344
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1106
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
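A minimal sketch of the use-list queries listed above; the helper name is hypothetical.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Hypothetical helper: count users of V that are themselves instructions.
static unsigned countInstructionUsers(Value *V) {
  if (V->use_empty())
    return 0;
  unsigned N = 0;
  for (User *U : V->users())
    if (isa<Instruction>(U))
      ++N;
  return N;
}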
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is known to be non-vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
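The BoUpSLP members above suggest a rough driving sequence. The sketch below is not the pass's actual driver: it only strings together members listed here, assumes the surrounding file's includes and usings are in effect, omits reordering, scheduling and diagnostics, and uses a hypothetical Threshold parameter for the profitability check.

// Hedged sketch only: one plausible way to drive the members listed above.
static void trySLPOnBundle(BoUpSLP &R, ArrayRef<Value *> Roots,
                           const SmallDenseSet<Value *> &Ignore,
                           InstructionCost Threshold) {
  R.deleteTree();                       // reset state from a previous attempt
  R.buildTree(Roots, Ignore);           // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return;
  R.buildExternalUses();                // scalars still used outside the tree
  R.computeMinimumValueSizes();         // minimum bit-width analysis
  R.transformNodes();                   // target-specific node transforms
  InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
  if (R.getTreeCost(TreeCost) < Threshold) // vectorize only if profitable
    R.vectorizeTree();
}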
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
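The matchers above compose into predicates passed to match(). Purely as an illustrative sketch (not code taken from this file; the helper name and bound variables are hypothetical), a single-use add whose left operand is a zero-extended load can be matched and its operands captured like this:
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Hypothetical helper: bind the pointer of a zero-extended, single-use load
// and the other addend of the surrounding add.
static bool matchZExtLoadPlusValue(Value *V, Value *&Ptr, Value *&Addend) {
  return match(V, m_Add(m_OneUse(m_ZExt(m_Load(m_Value(Ptr)))),
                        m_Value(Addend)));
}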
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
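As an illustrative sketch only (the function name is hypothetical), zip iterates two ranges in lock step and stops with the shorter one:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;
// Hypothetical example: pairwise multiply-accumulate over two ranges.
static int dotProduct(ArrayRef<int> A, ArrayRef<int> B) {
  int Sum = 0;
  for (auto [X, Y] : zip(A, B)) // zip_shortest: ends with the shorter range
    Sum += X * Y;
  return Sum;
}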
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2170
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
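A minimal sketch of the range-based predicate wrappers, assuming a hypothetical scalar list VL; any_of and none_of follow the same shape:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Hypothetical helper: true when every scalar in VL is a load.
static bool allLoads(ArrayRef<Value *> VL) {
  return all_of(VL, [](Value *V) { return isa<LoadInst>(V); });
}
The IsaPred functor listed further below can express the same check as all_of(VL, IsaPred<LoadInst>).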
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
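Illustrative sketch only (the function and variable names are hypothetical): enumerate pairs each element with its zero-based position, exposed via index() and value():
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"
using namespace llvm;
static void visitLanes(ArrayRef<Value *> VL) {
  for (const auto &P : enumerate(VL)) {
    size_t Lane = P.index();   // position within VL, starting at 0
    Value *Scalar = P.value(); // the element itself
    (void)Lane;
    (void)Scalar;
  }
}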
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
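A small sketch of the cast/dyn_cast/isa idiom, under the assumption that V may or may not be a store (the helper name is hypothetical):
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;
// dyn_cast returns nullptr on a type mismatch, so it doubles as the test.
static Value *storedValueOrNull(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand();
  return nullptr;
}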
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1731
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2303
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
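An illustrative sketch of the usual erase-while-iterating idiom built on make_early_inc_range together with isInstructionTriviallyDead (listed further below); the helper name is hypothetical:
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
// The iterator is advanced before the current instruction is handed out,
// so erasing the instruction does not invalidate the loop.
static void dropDeadInstructions(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}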
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:2029
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
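For concreteness, a few worked values for the power-of-two helpers (illustrative only; the wrapper function is hypothetical):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;
static void powerOfTwoExamples() {
  assert(bit_ceil(5u) == 8u);   // smallest power of two >= 5
  assert(PowerOf2Ceil(5) == 8); // same rounding for uint64_t inputs
  assert(has_single_bit(8u));   // 8 is an exact power of two
}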
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2016
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1775
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:361
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
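A hedged sketch of how getPointersDiff can be used to test whether two loads access adjacent elements; the helper is hypothetical, and DL/SE are assumed to come from the surrounding analyses:
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;
// Two loads are consecutive when the element-typed pointer distance is one.
static bool areConsecutiveLoads(LoadInst *L0, LoadInst *L1,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int64_t> Diff =
      getPointersDiff(L0->getType(), L0->getPointerOperand(), L1->getType(),
                      L1->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}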
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1968
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
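A one-line worked example (illustrative only; the wrapper function is hypothetical): packing 10 scalars into groups of 4 needs 3 groups.
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void divideCeilExample() {
  assert(llvm::divideCeil(10u, 4u) == 3u); // ceil(10 / 4) == 3
}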
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Add
Sum of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
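Another small worked example (illustrative; the wrapper is hypothetical): rounding a 10-byte size up to an 8-byte alignment boundary.
#include "llvm/Support/Alignment.h"
#include <cassert>
using namespace llvm;
static void alignToExample() {
  assert(alignTo(10, Align(8)) == 16); // next multiple of 8 at or above 10
}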
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1883
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
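Illustrative sketch (hypothetical helper): seq produces the half-open range [Begin, End), which is handy for iterating vector lanes:
#include "llvm/ADT/Sequence.h"
using namespace llvm;
static unsigned sumLanes(unsigned VF) {
  unsigned Sum = 0;
  for (unsigned Lane : seq<unsigned>(0, VF)) // yields 0, 1, ..., VF - 1
    Sum += Lane;
  return Sum;
}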
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
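A minimal sketch (hypothetical function) of combining two fields into one hash value, the shape typically used inside a DenseMapInfo::getHashValue such as the EdgeInfo specialization listed below:
#include "llvm/ADT/Hashing.h"
#include <cstddef>
using namespace llvm;
static unsigned hashPair(const void *Ptr, unsigned Idx) {
  size_t H = hash_combine(Ptr, Idx); // hash_code converts to size_t
  return static_cast<unsigned>(H);
}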
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1437
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1446
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)