SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115
116DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
117 "Controls which SLP graphs should be vectorized.");
118
119static cl::opt<bool>
120 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
121 cl::desc("Run the SLP vectorization passes"));
122
123static cl::opt<bool>
124 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
125 cl::desc("Enable vectorization for wider vector utilization"));
126
127static cl::opt<int>
128 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
129 cl::desc("Only vectorize if you gain more than this "
130 "number "));
131
133 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
134 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
136
137static cl::opt<bool>
138ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
139 cl::desc("Attempt to vectorize horizontal reductions"));
140
142 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
143 cl::desc(
144 "Attempt to vectorize horizontal reductions feeding into a store"));
145
147 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
148 cl::desc("Improve the code quality by splitting alternate instructions"));
149
150static cl::opt<int>
151 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
152 cl::desc("Attempt to vectorize for this register size in bits"));
153
156 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
157
158/// Limits the size of scheduling regions in a block.
159/// It avoids long compile times for _very_ large blocks where vector
160/// instructions are spread over a wide range.
161/// This limit is way higher than needed by real-world functions.
162static cl::opt<int>
163ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
164 cl::desc("Limit the size of the SLP scheduling region per block"));
165
167 "slp-min-reg-size", cl::init(128), cl::Hidden,
168 cl::desc("Attempt to vectorize for this register size in bits"));
169
171 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
172 cl::desc("Limit the recursion depth when building a vectorizable tree"));
173
175 "slp-min-tree-size", cl::init(3), cl::Hidden,
176 cl::desc("Only vectorize small trees if they are fully vectorizable"));
177
178// The maximum depth that the look-ahead score heuristic will explore.
179// The higher this value, the higher the compilation time overhead.
181 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
182 cl::desc("The maximum look-ahead depth for operand reordering scores"));
183
184// The maximum depth that the look-ahead score heuristic will explore
185// when probing among candidates for vectorization tree roots.
186// The higher this value, the higher the compilation time overhead, but unlike
187// the similar limit for operand ordering this is used less frequently, so the
188// impact of a higher value is less noticeable.
190 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
191 cl::desc("The maximum look-ahead depth for searching best rooting option"));
192
194 "slp-min-strided-loads", cl::init(2), cl::Hidden,
195 cl::desc("The minimum number of loads, which should be considered strided, "
196 "if the stride is > 1 or is runtime value"));
197
199 "slp-max-stride", cl::init(8), cl::Hidden,
200 cl::desc("The maximum stride, considered to be profitable."));
201
202static cl::opt<bool>
203 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
204 cl::desc("Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
206
207static cl::opt<bool>
208 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
209 cl::desc("Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
211
212static cl::opt<bool>
213 ViewSLPTree("view-slp-tree", cl::Hidden,
214 cl::desc("Display the SLP trees with Graphviz"));
215
217 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
218 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
219
220/// Enables vectorization of copyable elements.
222 "slp-copyable-elements", cl::init(true), cl::Hidden,
223 cl::desc("Try to replace values with the idempotent instructions for "
224 "better vectorization."));
225
226// Limit the number of alias checks. The limit is chosen so that
227// it has no negative effect on the llvm benchmarks.
228static const unsigned AliasedCheckLimit = 10;
229
230// Limit of the number of uses for potentially transformed instructions/values,
231// used in checks to avoid compile-time explosion.
232static constexpr int UsesLimit = 64;
233
234// Another limit for the alias checks: The maximum distance between load/store
235// instructions where alias checks are done.
236// This limit is useful for very large basic blocks.
237static const unsigned MaxMemDepDistance = 160;
238
239/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
240/// regions to be handled.
241static const int MinScheduleRegionSize = 16;
242
243/// Maximum allowed number of operands in the PHI nodes.
244static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important thing to filter here are types which are invalid in LLVM
249/// vectors. We also filter target specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
255 if (SLPReVec && isa<FixedVectorType>(Ty))
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
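// Illustrative note: scalar types such as i32 or float pass this check, while
// x86_fp80 and ppc_fp128 are rejected; with REVEC enabled (-slp-revec) a fixed
// vector such as <4 x i32> is validated via its scalar element type.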
260
261/// Returns the type of the given value/instruction \p V. If it is a store,
262/// returns the type of its value operand, for Cmp - the type of the compare
263/// operands and for insertelement - the type of the inserted operand.
264/// Otherwise, just the type of the value is returned.
265static Type *getValueType(Value *V) {
266 if (auto *SI = dyn_cast<StoreInst>(V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(V))
269 return CI->getOperand(0)->getType();
270 if (auto *IE = dyn_cast<InsertElementInst>(V))
271 return IE->getOperand(1)->getType();
272 return V->getType();
273}
274
275/// \returns the number of elements for Ty.
276static unsigned getNumElements(Type *Ty) {
278 "ScalableVectorType is not supported.");
279 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
280 return VecTy->getNumElements();
281 return 1;
282}
283
284/// \returns the vector type of ScalarTy based on vectorization factor.
285static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
286 return FixedVectorType::get(ScalarTy->getScalarType(),
287 VF * getNumElements(ScalarTy));
288}
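// Illustrative example: getWidenedType(i32, 4) produces <4 x i32>; when
// ScalarTy is itself a fixed vector (REVEC), getWidenedType(<2 x i16>, 4)
// produces <8 x i16>, i.e. VF * getNumElements(ScalarTy) scalar elements.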
289
290/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
291/// that forms a type which \p TTI splits into whole vector types during
292/// legalization.
293static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
294 Type *Ty, unsigned Sz) {
295 if (!isValidElementType(Ty))
296 return bit_ceil(Sz);
297 // Find the number of elements, which forms full vectors.
298 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
299 if (NumParts == 0 || NumParts >= Sz)
300 return bit_ceil(Sz);
301 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
302}
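// Illustrative example (assuming a target where <5 x i32> legalizes into
// NumParts = 2 registers): the result is bit_ceil(divideCeil(5, 2)) * 2 =
// 4 * 2 = 8 elements.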
303
304/// Returns the number of elements of the given type \p Ty, not greater than \p
305/// Sz, that forms a type which \p TTI splits into whole vector types during
306/// legalization.
307static unsigned
308getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
309 unsigned Sz) {
310 if (!isValidElementType(Ty))
311 return bit_floor(Sz);
312 // Find the number of elements, which forms full vectors.
313 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
314 if (NumParts == 0 || NumParts >= Sz)
315 return bit_floor(Sz);
316 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
317 if (RegVF > Sz)
318 return bit_floor(Sz);
319 return (Sz / RegVF) * RegVF;
320}
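// Illustrative example (same assumption of NumParts = 2 for Sz = 5):
// RegVF = bit_ceil(divideCeil(5, 2)) = 4, so the floor variant returns
// (5 / 4) * 4 = 4 elements.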
321
322static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
323 SmallVectorImpl<int> &Mask) {
324 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
325 // But the element has a different meaning for SLP (scalar) and REVEC
326 // (vector). We need to expand Mask into masks which shufflevector can use
327 // directly.
328 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
329 for (unsigned I : seq<unsigned>(Mask.size()))
330 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
331 I * VecTyNumElements, VecTyNumElements)))
332 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
333 : Mask[I] * VecTyNumElements + J;
334 Mask.swap(NewMask);
335}
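// Illustrative example: with VecTyNumElements = 2 and Mask = {1, 0}, the
// expanded mask is {2, 3, 0, 1}; a PoisonMaskElem entry expands into
// VecTyNumElements poison elements.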
336
337/// \returns the number of groups of shufflevector
338/// A group has the following features:
339/// 1. All values in a group are shufflevectors.
340/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
341/// 3. Together, the masks of the shufflevectors use all of the elements of the source.
342/// e.g., it is 1 group (%0)
343/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
344/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
345/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
346/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
347/// it is 2 groups (%3 and %4)
348/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
351/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
352/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
353/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
354/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
355/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
356/// it is 0 group
357/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
358/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
359/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
360/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
362 if (VL.empty())
363 return 0;
364 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
365 return 0;
366 auto *SV = cast<ShuffleVectorInst>(VL.front());
367 unsigned SVNumElements =
368 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
369 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
370 if (SVNumElements % ShuffleMaskSize != 0)
371 return 0;
372 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
373 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
374 return 0;
375 unsigned NumGroup = 0;
376 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
377 auto *SV = cast<ShuffleVectorInst>(VL[I]);
378 Value *Src = SV->getOperand(0);
379 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
380 SmallBitVector ExpectedIndex(GroupSize);
381 if (!all_of(Group, [&](Value *V) {
382 auto *SV = cast<ShuffleVectorInst>(V);
383 // From the same source.
384 if (SV->getOperand(0) != Src)
385 return false;
386 int Index;
387 if (!SV->isExtractSubvectorMask(Index))
388 return false;
389 ExpectedIndex.set(Index / ShuffleMaskSize);
390 return true;
391 }))
392 return 0;
393 if (!ExpectedIndex.all())
394 return 0;
395 ++NumGroup;
396 }
397 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
398 return NumGroup;
399}
400
401/// \returns a shufflevector mask which is used to vectorize shufflevectors
402/// e.g.,
403/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
408/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
409/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
410/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
411/// the result is
412/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
414 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
415 auto *SV = cast<ShuffleVectorInst>(VL.front());
416 unsigned SVNumElements =
417 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
418 SmallVector<int> Mask;
419 unsigned AccumulateLength = 0;
420 for (Value *V : VL) {
421 auto *SV = cast<ShuffleVectorInst>(V);
422 for (int M : SV->getShuffleMask())
423 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
424 : AccumulateLength + M);
425 AccumulateLength += SVNumElements;
426 }
427 return Mask;
428}
429
430/// \returns True if the value is a constant (but not globals/constant
431/// expressions).
432static bool isConstant(Value *V) {
433 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
434}
435
436/// Checks if \p V is one of vector-like instructions, i.e. undef,
437/// insertelement/extractelement with constant indices for fixed vector type or
438/// extractvalue instruction.
439static bool isVectorLikeInstWithConstOps(Value *V) {
440 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
441 !isa<ExtractValueInst, UndefValue>(V))
442 return false;
443 auto *I = dyn_cast<Instruction>(V);
444 if (!I || isa<ExtractValueInst>(I))
445 return true;
446 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
447 return false;
448 if (isa<ExtractElementInst>(I))
449 return isConstant(I->getOperand(1));
450 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
451 return isConstant(I->getOperand(2));
452}
453
454/// Returns power-of-2 number of elements in a single register (part), given the
455/// total number of elements \p Size and number of registers (parts) \p
456/// NumParts.
457static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
458 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
459}
460
461/// Returns correct remaining number of elements, considering total amount \p
462/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
463/// and current register (part) \p Part.
464static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
465 unsigned Part) {
466 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
467}
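// Illustrative example: for Size = 10 and NumParts = 3,
// getPartNumElems(10, 3) = min(10, bit_ceil(divideCeil(10, 3))) = 4, and
// getNumElems(10, 4, Part) yields 4, 4 and 2 elements for Part = 0, 1, 2.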
468
469#if !defined(NDEBUG)
470/// Print a short descriptor of the instruction bundle suitable for debug output.
471static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
472 std::string Result;
473 raw_string_ostream OS(Result);
474 if (Idx >= 0)
475 OS << "Idx: " << Idx << ", ";
476 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
477 return Result;
478}
479#endif
480
481/// \returns true if all of the instructions in \p VL are in the same block or
482/// false otherwise.
483static bool allSameBlock(ArrayRef<Value *> VL) {
484 auto *It = find_if(VL, IsaPred<Instruction>);
485 if (It == VL.end())
486 return false;
487 Instruction *I0 = cast<Instruction>(*It);
488 if (all_of(VL, isVectorLikeInstWithConstOps))
489 return true;
490
491 BasicBlock *BB = I0->getParent();
492 for (Value *V : iterator_range(It, VL.end())) {
493 if (isa<PoisonValue>(V))
494 continue;
495 auto *II = dyn_cast<Instruction>(V);
496 if (!II)
497 return false;
498
499 if (BB != II->getParent())
500 return false;
501 }
502 return true;
503}
504
505/// \returns True if all of the values in \p VL are constants (but not
506/// globals/constant expressions).
507static bool allConstant(ArrayRef<Value *> VL) {
508 // Constant expressions and globals can't be vectorized like normal integer/FP
509 // constants.
510 return all_of(VL, isConstant);
511}
512
513/// \returns True if all of the values in \p VL are identical or some of them
514/// are UndefValue.
515static bool isSplat(ArrayRef<Value *> VL) {
516 Value *FirstNonUndef = nullptr;
517 for (Value *V : VL) {
518 if (isa<UndefValue>(V))
519 continue;
520 if (!FirstNonUndef) {
521 FirstNonUndef = V;
522 continue;
523 }
524 if (V != FirstNonUndef)
525 return false;
526 }
527 return FirstNonUndef != nullptr;
528}
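// Illustrative example: {%a, undef, %a} is a splat, while {undef, undef} is
// not (no non-undef value is present) and {%a, %b, %a} is not.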
529
530/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
531/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
532/// patterns that make it effectively commutative (like equality comparisons
533/// with zero).
534/// In most cases, users should not call this function directly (since \p I and
535/// \p ValWithUses are the same). However, when analyzing interchangeable
536/// instructions, we need to use the converted opcode along with the original
537/// uses.
538/// \param I The instruction to check for commutativity
539/// \param ValWithUses The value whose uses are analyzed for special
540/// patterns
541static bool isCommutative(Instruction *I, Value *ValWithUses,
542 bool IsCopyable = false) {
543 if (auto *Cmp = dyn_cast<CmpInst>(I))
544 return Cmp->isCommutative();
545 if (auto *BO = dyn_cast<BinaryOperator>(I))
546 return BO->isCommutative() ||
547 (BO->getOpcode() == Instruction::Sub &&
548 ValWithUses->hasUseList() &&
549 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
550 all_of(
551 ValWithUses->uses(),
552 [&](const Use &U) {
553 // Commutative, if icmp eq/ne sub, 0
554 CmpPredicate Pred;
555 if (match(U.getUser(),
556 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
557 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
558 return true;
559 // Commutative, if abs(sub nsw, true) or abs(sub, false).
560 ConstantInt *Flag;
561 auto *I = dyn_cast<BinaryOperator>(U.get());
562 return match(U.getUser(),
563 m_Intrinsic<Intrinsic::abs>(
564 m_Specific(U.get()), m_ConstantInt(Flag))) &&
565 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
566 Flag->isOne());
567 })) ||
568 (BO->getOpcode() == Instruction::FSub &&
569 ValWithUses->hasUseList() &&
570 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
571 all_of(ValWithUses->uses(), [](const Use &U) {
572 return match(U.getUser(),
573 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
574 }));
575 return I->isCommutative();
576}
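// Illustrative example: "%s = sub i32 %a, %b" is treated as commutative here
// when all of its uses look like "icmp eq i32 %s, 0" (or icmp ne), since
// swapping the sub operands does not change whether the result is zero.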
577
578/// Checks if the operand is commutative. In commutative operations, not all
579/// operands may be commutable, e.g. for fmuladd only the first 2 operands are
580/// commutable.
581static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
582 bool IsCopyable = false) {
583 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
584 "The instruction is not commutative.");
585 if (isa<CmpInst>(I))
586 return true;
587 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
588 switch (BO->getOpcode()) {
589 case Instruction::Sub:
590 case Instruction::FSub:
591 return true;
592 default:
593 break;
594 }
595 }
596 return I->isCommutableOperand(Op);
597}
598
599/// This is a helper function to check whether \p I is commutative.
600/// This is a convenience wrapper that calls the two-parameter version of
601/// isCommutative with the same instruction for both parameters. This is
602/// the common case where the instruction being checked for commutativity
603/// is the same as the instruction whose uses are analyzed for special
604/// patterns (see the two-parameter version above for details).
605/// \param I The instruction to check for commutativity
606/// \returns true if the instruction is commutative, false otherwise
607static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
608
609/// \returns number of operands of \p I, considering commutativity. Returns 2
610/// for commutative intrinsics.
611/// \param I The instruction to check for commutativity
614 // IntrinsicInst::isCommutative returns true if swapping the first "two"
615 // arguments to the intrinsic produces the same result.
616 constexpr unsigned IntrinsicNumOperands = 2;
617 return IntrinsicNumOperands;
618 }
619 return I->getNumOperands();
620}
621
622template <typename T>
623static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
624 unsigned Offset) {
625 static_assert(std::is_same_v<T, InsertElementInst> ||
626 std::is_same_v<T, ExtractElementInst>,
627 "unsupported T");
628 int Index = Offset;
629 if (const auto *IE = dyn_cast<T>(Inst)) {
630 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
631 if (!VT)
632 return std::nullopt;
633 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
634 if (!CI)
635 return std::nullopt;
636 if (CI->getValue().uge(VT->getNumElements()))
637 return std::nullopt;
638 Index *= VT->getNumElements();
639 Index += CI->getZExtValue();
640 return Index;
641 }
642 return std::nullopt;
643}
644
645/// \returns inserting or extracting index of InsertElement, ExtractElement or
646/// InsertValue instruction, using Offset as base offset for index.
647/// \returns std::nullopt if the index is not an immediate.
648static std::optional<unsigned> getElementIndex(const Value *Inst,
649 unsigned Offset = 0) {
650 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
651 return Index;
652 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
653 return Index;
654
655 int Index = Offset;
656
657 const auto *IV = dyn_cast<InsertValueInst>(Inst);
658 if (!IV)
659 return std::nullopt;
660
661 Type *CurrentType = IV->getType();
662 for (unsigned I : IV->indices()) {
663 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
664 Index *= ST->getNumElements();
665 CurrentType = ST->getElementType(I);
666 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
667 Index *= AT->getNumElements();
668 CurrentType = AT->getElementType();
669 } else {
670 return std::nullopt;
671 }
672 Index += I;
673 }
674 return Index;
675}
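// Illustrative example: for "insertvalue [2 x { i32, i32 }] %agg, i32 %v, 1, 0"
// the flattened index is (0 * 2 + 1) * 2 + 0 = 2, i.e. aggregate indices are
// linearized across nested struct/array levels.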
676
677/// \returns true if all of the values in \p VL use the same opcode.
678/// For comparison instructions, also checks if predicates match.
679/// PoisonValues are considered matching.
680/// Interchangeable instructions are not considered.
682 auto *It = find_if(VL, IsaPred<Instruction>);
683 if (It == VL.end())
684 return true;
685 Instruction *MainOp = cast<Instruction>(*It);
686 unsigned Opcode = MainOp->getOpcode();
687 bool IsCmpOp = isa<CmpInst>(MainOp);
688 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
689 : CmpInst::BAD_ICMP_PREDICATE;
690 return std::all_of(It, VL.end(), [&](Value *V) {
691 if (auto *CI = dyn_cast<CmpInst>(V))
692 return BasePred == CI->getPredicate();
693 if (auto *I = dyn_cast<Instruction>(V))
694 return I->getOpcode() == Opcode;
695 return isa<PoisonValue>(V);
696 });
697}
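// Illustrative example: {add %a, %b; add %c, %d; poison} satisfies this
// check, whereas {icmp eq %a, %b; icmp slt %c, %d} does not because the
// compare predicates differ.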
698
699namespace {
700/// Specifies the way the mask should be analyzed for undefs/poisonous elements
701/// in the shuffle mask.
702enum class UseMask {
703 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
704 ///< check for the mask elements for the first argument (mask
705 ///< indices are in range [0:VF)).
706 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
707 ///< for the mask elements for the second argument (mask indices
708 ///< are in range [VF:2*VF))
709 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
710 ///< future shuffle elements and mark them as ones as being used
711 ///< in future. Non-undef elements are considered as unused since
712 ///< they're already marked as used in the mask.
713};
714} // namespace
715
716/// Prepares a use bitset for the given mask either for the first argument or
717/// for the second.
718static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
719 UseMask MaskArg) {
720 SmallBitVector UseMask(VF, true);
721 for (auto [Idx, Value] : enumerate(Mask)) {
722 if (Value == PoisonMaskElem) {
723 if (MaskArg == UseMask::UndefsAsMask)
724 UseMask.reset(Idx);
725 continue;
726 }
727 if (MaskArg == UseMask::FirstArg && Value < VF)
728 UseMask.reset(Value);
729 else if (MaskArg == UseMask::SecondArg && Value >= VF)
730 UseMask.reset(Value - VF);
731 }
732 return UseMask;
733}
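// Illustrative example: for VF = 4 and Mask = {0, 5, PoisonMaskElem, 2}, the
// FirstArg use mask clears bits 0 and 2 (lanes of the first operand that the
// mask references), and the SecondArg use mask clears bit 1 (element 5 - VF).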
734
735/// Checks if the given value is actually an undefined constant vector.
736/// Also, if the \p UseMask is not empty, tries to check if the non-masked
737/// elements actually mask the insertelement buildvector, if any.
738template <bool IsPoisonOnly = false>
739static SmallBitVector isUndefVector(const Value *V,
740 const SmallBitVector &UseMask = {}) {
741 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
742 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
743 if (isa<T>(V))
744 return Res;
745 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
746 if (!VecTy)
747 return Res.reset();
748 auto *C = dyn_cast<Constant>(V);
749 if (!C) {
750 if (!UseMask.empty()) {
751 const Value *Base = V;
752 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
753 Base = II->getOperand(0);
754 if (isa<T>(II->getOperand(1)))
755 continue;
756 std::optional<unsigned> Idx = getElementIndex(II);
757 if (!Idx) {
758 Res.reset();
759 return Res;
760 }
761 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
762 Res.reset(*Idx);
763 }
764 // TODO: Add analysis for shuffles here too.
765 if (V == Base) {
766 Res.reset();
767 } else {
768 SmallBitVector SubMask(UseMask.size(), false);
769 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
770 }
771 } else {
772 Res.reset();
773 }
774 return Res;
775 }
776 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
777 if (Constant *Elem = C->getAggregateElement(I))
778 if (!isa<T>(Elem) &&
779 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
780 Res.reset(I);
781 }
782 return Res;
783}
784
785/// Checks if the vector of instructions can be represented as a shuffle, like:
786/// %x0 = extractelement <4 x i8> %x, i32 0
787/// %x3 = extractelement <4 x i8> %x, i32 3
788/// %y1 = extractelement <4 x i8> %y, i32 1
789/// %y2 = extractelement <4 x i8> %y, i32 2
790/// %x0x0 = mul i8 %x0, %x0
791/// %x3x3 = mul i8 %x3, %x3
792/// %y1y1 = mul i8 %y1, %y1
793/// %y2y2 = mul i8 %y2, %y2
794/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
795/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
796/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
797/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
798/// ret <4 x i8> %ins4
799/// can be transformed into:
800/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
801/// i32 6>
802/// %2 = mul <4 x i8> %1, %1
803/// ret <4 x i8> %2
804/// Mask will return the Shuffle Mask equivalent to the extracted elements.
805/// TODO: Can we split off and reuse the shuffle mask detection from
806/// ShuffleVectorInst/getShuffleCost?
807static std::optional<TargetTransformInfo::ShuffleKind>
808isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
809 AssumptionCache *AC) {
810 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
811 if (It == VL.end())
812 return std::nullopt;
813 unsigned Size =
814 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
815 auto *EI = dyn_cast<ExtractElementInst>(V);
816 if (!EI)
817 return S;
818 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
819 if (!VTy)
820 return S;
821 return std::max(S, VTy->getNumElements());
822 });
823
824 Value *Vec1 = nullptr;
825 Value *Vec2 = nullptr;
826 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
827 auto *EE = dyn_cast<ExtractElementInst>(V);
828 if (!EE)
829 return false;
830 Value *Vec = EE->getVectorOperand();
831 if (isa<UndefValue>(Vec))
832 return false;
833 return isGuaranteedNotToBePoison(Vec, AC);
834 });
835 enum ShuffleMode { Unknown, Select, Permute };
836 ShuffleMode CommonShuffleMode = Unknown;
837 Mask.assign(VL.size(), PoisonMaskElem);
838 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
839 // Undef can be represented as an undef element in a vector.
840 if (isa<UndefValue>(VL[I]))
841 continue;
842 auto *EI = cast<ExtractElementInst>(VL[I]);
843 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
844 return std::nullopt;
845 auto *Vec = EI->getVectorOperand();
846 // We can extractelement from undef or poison vector.
847 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
848 continue;
849 // All vector operands must have the same number of vector elements.
850 if (isa<UndefValue>(Vec)) {
851 Mask[I] = I;
852 } else {
853 if (isa<UndefValue>(EI->getIndexOperand()))
854 continue;
855 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
856 if (!Idx)
857 return std::nullopt;
858 // Undefined behavior if Idx is negative or >= Size.
859 if (Idx->getValue().uge(Size))
860 continue;
861 unsigned IntIdx = Idx->getValue().getZExtValue();
862 Mask[I] = IntIdx;
863 }
864 if (isUndefVector(Vec).all() && HasNonUndefVec)
865 continue;
866 // For correct shuffling we have to have at most 2 different vector operands
867 // in all extractelement instructions.
868 if (!Vec1 || Vec1 == Vec) {
869 Vec1 = Vec;
870 } else if (!Vec2 || Vec2 == Vec) {
871 Vec2 = Vec;
872 Mask[I] += Size;
873 } else {
874 return std::nullopt;
875 }
876 if (CommonShuffleMode == Permute)
877 continue;
878 // If the extract index is not the same as the operation number, it is a
879 // permutation.
880 if (Mask[I] % Size != I) {
881 CommonShuffleMode = Permute;
882 continue;
883 }
884 CommonShuffleMode = Select;
885 }
886 // If we're not crossing lanes in different vectors, consider it as blending.
887 if (CommonShuffleMode == Select && Vec2)
888 return TargetTransformInfo::SK_Select;
889 // If Vec2 was never used, we have a permutation of a single vector, otherwise
890 // we have permutation of 2 vectors.
891 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
892 : TargetTransformInfo::SK_PermuteSingleSrc;
893}
894
895/// \returns True if Extract{Value,Element} instruction extracts element Idx.
896static std::optional<unsigned> getExtractIndex(const Instruction *E) {
897 unsigned Opcode = E->getOpcode();
898 assert((Opcode == Instruction::ExtractElement ||
899 Opcode == Instruction::ExtractValue) &&
900 "Expected extractelement or extractvalue instruction.");
901 if (Opcode == Instruction::ExtractElement) {
902 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
903 if (!CI)
904 return std::nullopt;
905 return CI->getZExtValue();
906 }
907 auto *EI = cast<ExtractValueInst>(E);
908 if (EI->getNumIndices() != 1)
909 return std::nullopt;
910 return *EI->idx_begin();
911}
912
913/// Checks if the provided value does not require scheduling. It does not
914/// require scheduling if this is not an instruction or it is an instruction
915/// that does not read/write memory and all operands are either not instructions
916/// or phi nodes or instructions from different blocks.
917static bool areAllOperandsNonInsts(Value *V);
918/// Checks if the provided value does not require scheduling. It does not
919/// require scheduling if this is not an instruction or it is an instruction
920/// that does not read/write memory and all users are phi nodes or instructions
921/// from the different blocks.
922static bool isUsedOutsideBlock(Value *V);
923/// Checks if the specified value does not require scheduling. It does not
924/// require scheduling if all operands and all users do not need to be scheduled
925/// in the current basic block.
926static bool doesNotNeedToBeScheduled(Value *V);
927
928/// \returns true if \p Opcode is allowed as part of the main/alternate
929/// instruction for SLP vectorization.
930///
931/// Example of unsupported opcode is SDIV that can potentially cause UB if the
932/// "shuffled out" lane would result in division by zero.
933static bool isValidForAlternation(unsigned Opcode) {
934 return !Instruction::isIntDivRem(Opcode);
935}
936
937namespace {
938
939/// Helper class that determines whether VL can use the same opcode.
940/// Alternate instructions are supported. In addition, it supports
941/// interchangeable instructions. An interchangeable instruction is an
942/// instruction that can be converted to another instruction with the same
943/// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
944class BinOpSameOpcodeHelper {
945 using MaskType = std::uint_fast16_t;
946 /// Sort SupportedOp because it is used by binary_search.
947 constexpr static std::initializer_list<unsigned> SupportedOp = {
948 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
949 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
950 enum : MaskType {
951 ShlBIT = 0b1,
952 AShrBIT = 0b10,
953 MulBIT = 0b100,
954 AddBIT = 0b1000,
955 SubBIT = 0b10000,
956 AndBIT = 0b100000,
957 OrBIT = 0b1000000,
958 XorBIT = 0b10000000,
959 MainOpBIT = 0b100000000,
961 };
962 /// Return a non-nullptr if either operand of I is a ConstantInt.
963 /// The second return value represents the operand position. We check the
964 /// right-hand side first (1). If the right hand side is not a ConstantInt and
965 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
966 /// side (0).
967 static std::pair<ConstantInt *, unsigned>
968 isBinOpWithConstantInt(const Instruction *I) {
969 unsigned Opcode = I->getOpcode();
970 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
971 (void)SupportedOp;
972 auto *BinOp = cast<BinaryOperator>(I);
973 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
974 return {CI, 1};
975 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
976 Opcode == Instruction::AShr)
977 return {nullptr, 0};
978 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
979 return {CI, 0};
980 return {nullptr, 0};
981 }
982 struct InterchangeableInfo {
983 const Instruction *I = nullptr;
984 /// The bit it sets represents whether MainOp can be converted to.
985 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
986 MulBIT | AShrBIT | ShlBIT;
987 /// We cannot create an interchangeable instruction that does not exist in
988 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
989 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
990 /// 1]. SeenBefore is used to know what operations have been seen before.
991 MaskType SeenBefore = 0;
992 InterchangeableInfo(const Instruction *I) : I(I) {}
993 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
994 /// instruction. Directly setting the mask will destroy the mask state,
995 /// preventing us from determining which instruction it should convert to.
996 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
997 if (Mask & InterchangeableMask) {
998 SeenBefore |= OpcodeInMaskForm;
999 Mask &= InterchangeableMask;
1000 return true;
1001 }
1002 return false;
1003 }
1004 bool equal(unsigned Opcode) {
1005 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1006 }
1007 unsigned getOpcode() const {
1008 MaskType Candidate = Mask & SeenBefore;
1009 if (Candidate & MainOpBIT)
1010 return I->getOpcode();
1011 if (Candidate & ShlBIT)
1012 return Instruction::Shl;
1013 if (Candidate & AShrBIT)
1014 return Instruction::AShr;
1015 if (Candidate & MulBIT)
1016 return Instruction::Mul;
1017 if (Candidate & AddBIT)
1018 return Instruction::Add;
1019 if (Candidate & SubBIT)
1020 return Instruction::Sub;
1021 if (Candidate & AndBIT)
1022 return Instruction::And;
1023 if (Candidate & OrBIT)
1024 return Instruction::Or;
1025 if (Candidate & XorBIT)
1026 return Instruction::Xor;
1027 llvm_unreachable("Cannot find interchangeable instruction.");
1028 }
1029
1030 /// Return true if the instruction can be converted to \p Opcode.
1031 bool hasCandidateOpcode(unsigned Opcode) const {
1032 MaskType Candidate = Mask & SeenBefore;
1033 switch (Opcode) {
1034 case Instruction::Shl:
1035 return Candidate & ShlBIT;
1036 case Instruction::AShr:
1037 return Candidate & AShrBIT;
1038 case Instruction::Mul:
1039 return Candidate & MulBIT;
1040 case Instruction::Add:
1041 return Candidate & AddBIT;
1042 case Instruction::Sub:
1043 return Candidate & SubBIT;
1044 case Instruction::And:
1045 return Candidate & AndBIT;
1046 case Instruction::Or:
1047 return Candidate & OrBIT;
1048 case Instruction::Xor:
1049 return Candidate & XorBIT;
1050 case Instruction::LShr:
1051 case Instruction::FAdd:
1052 case Instruction::FSub:
1053 case Instruction::FMul:
1054 case Instruction::SDiv:
1055 case Instruction::UDiv:
1056 case Instruction::FDiv:
1057 case Instruction::SRem:
1058 case Instruction::URem:
1059 case Instruction::FRem:
1060 return false;
1061 default:
1062 break;
1063 }
1064 llvm_unreachable("Cannot find interchangeable instruction.");
1065 }
1066
1067 SmallVector<Value *> getOperand(const Instruction *To) const {
1068 unsigned ToOpcode = To->getOpcode();
1069 unsigned FromOpcode = I->getOpcode();
1070 if (FromOpcode == ToOpcode)
1071 return SmallVector<Value *>(I->operands());
1072 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1073 auto [CI, Pos] = isBinOpWithConstantInt(I);
1074 const APInt &FromCIValue = CI->getValue();
1075 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1076 APInt ToCIValue;
1077 switch (FromOpcode) {
1078 case Instruction::Shl:
1079 if (ToOpcode == Instruction::Mul) {
1080 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1081 FromCIValue.getZExtValue());
1082 } else {
1083 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1084 ToCIValue = ToOpcode == Instruction::And
1085 ? APInt::getAllOnes(FromCIValueBitWidth)
1086 : APInt::getZero(FromCIValueBitWidth);
1087 }
1088 break;
1089 case Instruction::Mul:
1090 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1091 if (ToOpcode == Instruction::Shl) {
1092 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1093 } else {
1094 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1095 ToCIValue = ToOpcode == Instruction::And
1096 ? APInt::getAllOnes(FromCIValueBitWidth)
1097 : APInt::getZero(FromCIValueBitWidth);
1098 }
1099 break;
1100 case Instruction::Add:
1101 case Instruction::Sub:
1102 if (FromCIValue.isZero()) {
1103 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1104 } else {
1105 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1106 "Cannot convert the instruction.");
1107 ToCIValue = FromCIValue;
1108 ToCIValue.negate();
1109 }
1110 break;
1111 case Instruction::And:
1112 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1113 ToCIValue = ToOpcode == Instruction::Mul
1114 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1115 : APInt::getZero(FromCIValueBitWidth);
1116 break;
1117 default:
1118 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1119 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1120 break;
1121 }
1122 Value *LHS = I->getOperand(1 - Pos);
1123 Constant *RHS =
1124 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1125 // constant + x cannot be -constant - x
1126 // instead, it should be x - -constant
1127 if (Pos == 1 ||
1128 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1129 FromOpcode == Instruction::Xor) &&
1130 ToOpcode == Instruction::Sub))
1131 return SmallVector<Value *>({LHS, RHS});
1132 return SmallVector<Value *>({RHS, LHS});
1133 }
1134 };
1135 InterchangeableInfo MainOp;
1136 InterchangeableInfo AltOp;
1137 bool isValidForAlternation(const Instruction *I) const {
1138 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1139 ::isValidForAlternation(I->getOpcode());
1140 }
1141 bool initializeAltOp(const Instruction *I) {
1142 if (AltOp.I)
1143 return true;
1144 if (!isValidForAlternation(I))
1145 return false;
1146 AltOp.I = I;
1147 return true;
1148 }
1149
1150public:
1151 BinOpSameOpcodeHelper(const Instruction *MainOp,
1152 const Instruction *AltOp = nullptr)
1153 : MainOp(MainOp), AltOp(AltOp) {
1154 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1155 }
1156 bool add(const Instruction *I) {
1158 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1159 unsigned Opcode = I->getOpcode();
1160 MaskType OpcodeInMaskForm;
1161 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1162 switch (Opcode) {
1163 case Instruction::Shl:
1164 OpcodeInMaskForm = ShlBIT;
1165 break;
1166 case Instruction::AShr:
1167 OpcodeInMaskForm = AShrBIT;
1168 break;
1169 case Instruction::Mul:
1170 OpcodeInMaskForm = MulBIT;
1171 break;
1172 case Instruction::Add:
1173 OpcodeInMaskForm = AddBIT;
1174 break;
1175 case Instruction::Sub:
1176 OpcodeInMaskForm = SubBIT;
1177 break;
1178 case Instruction::And:
1179 OpcodeInMaskForm = AndBIT;
1180 break;
1181 case Instruction::Or:
1182 OpcodeInMaskForm = OrBIT;
1183 break;
1184 case Instruction::Xor:
1185 OpcodeInMaskForm = XorBIT;
1186 break;
1187 default:
1188 return MainOp.equal(Opcode) ||
1189 (initializeAltOp(I) && AltOp.equal(Opcode));
1190 }
1191 MaskType InterchangeableMask = OpcodeInMaskForm;
1192 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1193 if (CI) {
1194 constexpr MaskType CanBeAll =
1195 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1196 const APInt &CIValue = CI->getValue();
1197 switch (Opcode) {
1198 case Instruction::Shl:
1199 if (CIValue.ult(CIValue.getBitWidth()))
1200 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1201 break;
1202 case Instruction::Mul:
1203 if (CIValue.isOne()) {
1204 InterchangeableMask = CanBeAll;
1205 break;
1206 }
1207 if (CIValue.isPowerOf2())
1208 InterchangeableMask = MulBIT | ShlBIT;
1209 break;
1210 case Instruction::Add:
1211 case Instruction::Sub:
1212 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1213 break;
1214 case Instruction::And:
1215 if (CIValue.isAllOnes())
1216 InterchangeableMask = CanBeAll;
1217 break;
1218 case Instruction::Xor:
1219 if (CIValue.isZero())
1220 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1221 break;
1222 default:
1223 if (CIValue.isZero())
1224 InterchangeableMask = CanBeAll;
1225 break;
1226 }
1227 }
1228 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1229 (initializeAltOp(I) &&
1230 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1231 }
1232 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1233 /// Checks if the list of potential opcodes includes \p Opcode.
1234 bool hasCandidateOpcode(unsigned Opcode) const {
1235 return MainOp.hasCandidateOpcode(Opcode);
1236 }
1237 bool hasAltOp() const { return AltOp.I; }
1238 unsigned getAltOpcode() const {
1239 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1240 }
1241 SmallVector<Value *> getOperand(const Instruction *I) const {
1242 return MainOp.getOperand(I);
1243 }
1244};
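// Illustrative example: for VL = {shl i32 %a, 1; mul i32 %b, 2} both
// instructions fall into the interchangeable Shl/Mul set, so the helper
// reports a common opcode (Shl is preferred here), and getOperand() rewrites
// "mul %b, 2" into its shift form by converting the constant 2 into the
// shift amount 1.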
1245
1246/// Main data required for vectorization of instructions.
1247class InstructionsState {
1248 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1249 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1250 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1251 /// isAltShuffle).
1252 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1253 /// from getMainAltOpsNoStateVL.
1254 /// For those InstructionsState that use alternate instructions, the resulting
1255 /// vectorized output ultimately comes from a shufflevector. For example,
1256 /// given a vector list (VL):
1257 /// VL[0] = add i32 a, e
1258 /// VL[1] = sub i32 b, f
1259 /// VL[2] = add i32 c, g
1260 /// VL[3] = sub i32 d, h
1261 /// The vectorized result would be:
1262 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1263 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1264 /// result = shufflevector <4 x i32> intermediated_0,
1265 /// <4 x i32> intermediated_1,
1266 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1267 /// Since shufflevector is used in the final result, when calculating the cost
1268 /// (getEntryCost), we must account for the usage of shufflevector in
1269 /// GetVectorCost.
1270 Instruction *MainOp = nullptr;
1271 Instruction *AltOp = nullptr;
1272 /// Whether the instruction state represents copyable instructions.
1273 bool HasCopyables = false;
1274
1275public:
1276 Instruction *getMainOp() const {
1277 assert(valid() && "InstructionsState is invalid.");
1278 return MainOp;
1279 }
1280
1281 Instruction *getAltOp() const {
1282 assert(valid() && "InstructionsState is invalid.");
1283 return AltOp;
1284 }
1285
1286 /// The main/alternate opcodes for the list of instructions.
1287 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1288
1289 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1290
1291 /// Some of the instructions in the list have alternate opcodes.
1292 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1293
1294 /// Checks if the instruction matches either the main or alternate opcode.
1295 /// \returns
1296 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1297 /// to it
1298 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1299 /// it
1300 /// - nullptr if \param I cannot be matched or converted to either opcode
1301 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1302 assert(MainOp && "MainOp cannot be nullptr.");
1303 if (I->getOpcode() == MainOp->getOpcode())
1304 return MainOp;
1305 // Prefer AltOp instead of interchangeable instruction of MainOp.
1306 assert(AltOp && "AltOp cannot be nullptr.");
1307 if (I->getOpcode() == AltOp->getOpcode())
1308 return AltOp;
1309 if (!I->isBinaryOp())
1310 return nullptr;
1311 BinOpSameOpcodeHelper Converter(MainOp);
1312 if (!Converter.add(I) || !Converter.add(MainOp))
1313 return nullptr;
1314 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1315 BinOpSameOpcodeHelper AltConverter(AltOp);
1316 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1317 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1318 return AltOp;
1319 }
1320 if (Converter.hasAltOp() && !isAltShuffle())
1321 return nullptr;
1322 return Converter.hasAltOp() ? AltOp : MainOp;
1323 }
1324
1325 /// Checks if main/alt instructions are shift operations.
1326 bool isShiftOp() const {
1327 return getMainOp()->isShift() && getAltOp()->isShift();
1328 }
1329
1330 /// Checks if main/alt instructions are bitwise logic operations.
1331 bool isBitwiseLogicOp() const {
1332 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1333 }
1334
1335 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1336 bool isMulDivLikeOp() const {
1337 constexpr std::array<unsigned, 8> MulDiv = {
1338 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1339 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1340 Instruction::URem, Instruction::FRem};
1341 return is_contained(MulDiv, getOpcode()) &&
1342 is_contained(MulDiv, getAltOpcode());
1343 }
1344
1345 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1346 bool isAddSubLikeOp() const {
1347 constexpr std::array<unsigned, 4> AddSub = {
1348 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1349 Instruction::FSub};
1350 return is_contained(AddSub, getOpcode()) &&
1351 is_contained(AddSub, getAltOpcode());
1352 }
1353
1354 /// Checks if main/alt instructions are cmp operations.
1355 bool isCmpOp() const {
1356 return (getOpcode() == Instruction::ICmp ||
1357 getOpcode() == Instruction::FCmp) &&
1358 getAltOpcode() == getOpcode();
1359 }
1360
1361 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1362 bool valid() const { return MainOp && AltOp; }
1363
1364 explicit operator bool() const { return valid(); }
1365
1366 InstructionsState() = delete;
1367 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1368 bool HasCopyables = false)
1369 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1370 static InstructionsState invalid() { return {nullptr, nullptr}; }
1371
1372 /// Checks if the value is a copyable element.
1373 bool isCopyableElement(Value *V) const {
1374 assert(valid() && "InstructionsState is invalid.");
1375 if (!HasCopyables)
1376 return false;
1377 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1378 return false;
1379 auto *I = dyn_cast<Instruction>(V);
1380 if (!I)
1381 return !isa<PoisonValue>(V);
1382 if (I->getParent() != MainOp->getParent() &&
1384 (!isVectorLikeInstWithConstOps(I) || !isVectorLikeInstWithConstOps(MainOp)))
1385 return true;
1386 if (I->getOpcode() == MainOp->getOpcode())
1387 return false;
1388 if (!I->isBinaryOp())
1389 return true;
1390 BinOpSameOpcodeHelper Converter(MainOp);
1391 return !Converter.add(I) || !Converter.add(MainOp) ||
1392 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1393 }
1394
1395 /// Checks if the value is non-schedulable.
1396 bool isNonSchedulable(Value *V) const {
1397 assert(valid() && "InstructionsState is invalid.");
1398 auto *I = dyn_cast<Instruction>(V);
1399 if (!HasCopyables)
1400 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1401 doesNotNeedToBeScheduled(V);
1402 // MainOp for copyables always schedulable to correctly identify
1403 // non-schedulable copyables.
1404 if (getMainOp() == V)
1405 return false;
1406 if (isCopyableElement(V)) {
1407 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1408 auto *I = dyn_cast<Instruction>(V);
1409 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1410 (doesNotNeedToBeScheduled(V) &&
1411 // If the copyable instruction comes after MainOp
1412 // (non-schedulable, but used in the block) - cannot vectorize
1413 // it, will possibly generate use before def.
1414 !MainOp->comesBefore(I));
1415 };
1416
1417 return IsNonSchedulableCopyableElement(V);
1418 }
1419 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1420 doesNotNeedToBeScheduled(V);
1421 }
1422
1423 /// Checks if the state represents copyable instructions.
1424 bool areInstructionsWithCopyableElements() const {
1425 assert(valid() && "InstructionsState is invalid.");
1426 return HasCopyables;
1427 }
1428};
1429
1430std::pair<Instruction *, SmallVector<Value *>>
1431convertTo(Instruction *I, const InstructionsState &S) {
1432 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1433 assert(SelectedOp && "Cannot convert the instruction.");
1434 if (I->isBinaryOp()) {
1435 BinOpSameOpcodeHelper Converter(I);
1436 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1437 }
1438 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1439}
1440
1441} // end anonymous namespace
1442
1443static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1444 const TargetLibraryInfo &TLI);
1445
1446/// Find an instruction with a specific opcode in VL.
1447/// \param VL Array of values to search through. Must contain only Instructions
1448/// and PoisonValues.
1449/// \param Opcode The instruction opcode to search for
1450/// \returns
1451/// - The first instruction found with matching opcode
1452/// - nullptr if no matching instruction is found
1454 unsigned Opcode) {
1455 for (Value *V : VL) {
1456 if (isa<PoisonValue>(V))
1457 continue;
1458 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1459 auto *Inst = cast<Instruction>(V);
1460 if (Inst->getOpcode() == Opcode)
1461 return Inst;
1462 }
1463 return nullptr;
1464}
1465
1466/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1467/// compatible instructions or constants, or just some other regular values.
1468static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1469 Value *Op1, const TargetLibraryInfo &TLI) {
1470 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1471 (isConstant(BaseOp1) && isConstant(Op1)) ||
1472 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1473 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1474 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1475 getSameOpcode({BaseOp0, Op0}, TLI) ||
1476 getSameOpcode({BaseOp1, Op1}, TLI);
1477}
1478
1479/// \returns true if a compare instruction \p CI has similar "look" and
1480/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1481/// swapped, false otherwise.
1482static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1483 const TargetLibraryInfo &TLI) {
1484 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1485 "Assessing comparisons of different types?");
1486 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1487 CmpInst::Predicate Pred = CI->getPredicate();
1488 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1489
1490 Value *BaseOp0 = BaseCI->getOperand(0);
1491 Value *BaseOp1 = BaseCI->getOperand(1);
1492 Value *Op0 = CI->getOperand(0);
1493 Value *Op1 = CI->getOperand(1);
1494
1495 return (BasePred == Pred &&
1496 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1497 (BasePred == SwappedPred &&
1498 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1499}
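// Illustrative example: "icmp sgt i32 %a, %b" and "icmp slt i32 %b, %a" are
// considered the same comparison, since swapping both the predicate and the
// operands of the second yields the first.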
1500
1501/// \returns analysis of the Instructions in \p VL described in
1502/// InstructionsState, i.e. the opcode with which we suppose the whole list
1503/// could be vectorized even if its structure is diverse.
1504static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1505 const TargetLibraryInfo &TLI) {
1506 // Make sure these are all Instructions.
1507 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1508 return InstructionsState::invalid();
1509
1510 auto *It = find_if(VL, IsaPred<Instruction>);
1511 if (It == VL.end())
1512 return InstructionsState::invalid();
1513
1514 Instruction *MainOp = cast<Instruction>(*It);
1515 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1516 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1517 (VL.size() == 2 && InstCnt < 2))
1518 return InstructionsState::invalid();
1519
1520 bool IsCastOp = isa<CastInst>(MainOp);
1521 bool IsBinOp = isa<BinaryOperator>(MainOp);
1522 bool IsCmpOp = isa<CmpInst>(MainOp);
1523 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1524 : CmpInst::BAD_ICMP_PREDICATE;
1525 Instruction *AltOp = MainOp;
1526 unsigned Opcode = MainOp->getOpcode();
1527 unsigned AltOpcode = Opcode;
1528
1529 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1530 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1531 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1532 UniquePreds.insert(BasePred);
1533 UniqueNonSwappedPreds.insert(BasePred);
1534 for (Value *V : VL) {
1535 auto *I = dyn_cast<CmpInst>(V);
1536 if (!I)
1537 return false;
1538 CmpInst::Predicate CurrentPred = I->getPredicate();
1539 CmpInst::Predicate SwappedCurrentPred =
1540 CmpInst::getSwappedPredicate(CurrentPred);
1541 UniqueNonSwappedPreds.insert(CurrentPred);
1542 if (!UniquePreds.contains(CurrentPred) &&
1543 !UniquePreds.contains(SwappedCurrentPred))
1544 UniquePreds.insert(CurrentPred);
1545 }
1546 // The total number of predicates is > 2, but only 2 remain once swapped
1547 // predicates are treated as compatible; in that case consider the swappable
1548 // predicates as compatible opcodes, not alternate.
1549 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1550 }();
1551 // Check for one alternate opcode from another BinaryOperator.
1552 // TODO - generalize to support all operators (types, calls etc.).
1553 Intrinsic::ID BaseID = 0;
1554 SmallVector<VFInfo> BaseMappings;
1555 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1556 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1557 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1558 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1559 return InstructionsState::invalid();
1560 }
1561 bool AnyPoison = InstCnt != VL.size();
1562 // Check MainOp too to be sure that it matches the requirements for the
1563 // instructions.
1564 for (Value *V : iterator_range(It, VL.end())) {
1565 auto *I = dyn_cast<Instruction>(V);
1566 if (!I)
1567 continue;
1568
1569 // Cannot combine poison and divisions.
1570 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1571 // intrinsics/functions only.
1572 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1573 return InstructionsState::invalid();
1574 unsigned InstOpcode = I->getOpcode();
1575 if (IsBinOp && isa<BinaryOperator>(I)) {
1576 if (BinOpHelper.add(I))
1577 continue;
1578 } else if (IsCastOp && isa<CastInst>(I)) {
1579 Value *Op0 = MainOp->getOperand(0);
1580 Type *Ty0 = Op0->getType();
1581 Value *Op1 = I->getOperand(0);
1582 Type *Ty1 = Op1->getType();
1583 if (Ty0 == Ty1) {
1584 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1585 continue;
1586 if (Opcode == AltOpcode) {
1587 assert(isValidForAlternation(Opcode) &&
1588 isValidForAlternation(InstOpcode) &&
1589 "Cast isn't safe for alternation, logic needs to be updated!");
1590 AltOpcode = InstOpcode;
1591 AltOp = I;
1592 continue;
1593 }
1594 }
1595 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1596 auto *BaseInst = cast<CmpInst>(MainOp);
1597 Type *Ty0 = BaseInst->getOperand(0)->getType();
1598 Type *Ty1 = Inst->getOperand(0)->getType();
1599 if (Ty0 == Ty1) {
1600 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1601 assert(InstOpcode == AltOpcode &&
1602 "Alternate instructions are only supported by BinaryOperator "
1603 "and CastInst.");
1604 // Check for compatible operands. If the corresponding operands are not
1605 // compatible - need to perform alternate vectorization.
1606 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1607 CmpInst::Predicate SwappedCurrentPred =
1608 CmpInst::getSwappedPredicate(CurrentPred);
1609
1610 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1611 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1612 continue;
1613
1614 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1615 continue;
1616 auto *AltInst = cast<CmpInst>(AltOp);
1617 if (MainOp != AltOp) {
1618 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1619 continue;
1620 } else if (BasePred != CurrentPred) {
1621 assert(
1622 isValidForAlternation(InstOpcode) &&
1623 "CmpInst isn't safe for alternation, logic needs to be updated!");
1624 AltOp = I;
1625 continue;
1626 }
1627 CmpInst::Predicate AltPred = AltInst->getPredicate();
1628 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1629 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1630 continue;
1631 }
1632 } else if (InstOpcode == Opcode) {
1633 assert(InstOpcode == AltOpcode &&
1634 "Alternate instructions are only supported by BinaryOperator and "
1635 "CastInst.");
1636 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1637 if (Gep->getNumOperands() != 2 ||
1638 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1639 return InstructionsState::invalid();
1640 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1641 if (!isVectorLikeInstWithConstOps(EI))
1642 return InstructionsState::invalid();
1643 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1644 auto *BaseLI = cast<LoadInst>(MainOp);
1645 if (!LI->isSimple() || !BaseLI->isSimple())
1646 return InstructionsState::invalid();
1647 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1648 auto *CallBase = cast<CallInst>(MainOp);
1649 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1650 return InstructionsState::invalid();
1651 if (Call->hasOperandBundles() &&
1652 (!CallBase->hasOperandBundles() ||
1653 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1654 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1655 CallBase->op_begin() +
1656 CallBase->getBundleOperandsStartIndex())))
1657 return InstructionsState::invalid();
1658 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1659 if (ID != BaseID)
1660 return InstructionsState::invalid();
1661 if (!ID) {
1662 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1663 if (Mappings.size() != BaseMappings.size() ||
1664 Mappings.front().ISA != BaseMappings.front().ISA ||
1665 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1666 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1667 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1668 Mappings.front().Shape.Parameters !=
1669 BaseMappings.front().Shape.Parameters)
1670 return InstructionsState::invalid();
1671 }
1672 }
1673 continue;
1674 }
1675 return InstructionsState::invalid();
1676 }
1677
1678 if (IsBinOp) {
1679 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1680 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1681 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1682 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1683 }
1684 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1685 "Incorrect implementation of allSameOpcode.");
1686 InstructionsState S(MainOp, AltOp);
1687 assert(all_of(VL,
1688 [&](Value *V) {
1689 return isa<PoisonValue>(V) ||
1690 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1691 }) &&
1692 "Invalid InstructionsState.");
1693 return S;
1694}
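// Illustrative sketch (not part of the original source): a hypothetical IR
// bundle that getSameOpcode classifies as an alternating Add/Sub group:
//   %a0 = add i32 %x0, %y0
//   %s1 = sub i32 %x1, %y1
//   %a2 = add i32 %x2, %y2
//   %s3 = sub i32 %x3, %y3
// Here the returned InstructionsState has main opcode Add and alternate
// opcode Sub; bundles whose instructions cannot be described by such a
// main/alternate pair come back as InstructionsState::invalid().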
1695
1696/// \returns true if all of the values in \p VL have the same type or false
1697/// otherwise.
1698 static bool allSameType(ArrayRef<Value *> VL) {
1699 Type *Ty = VL.consume_front()->getType();
1700 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1701}
1702
1703/// \returns True if in-tree use also needs extract. This refers to
1704/// possible scalar operand in vectorized instruction.
1705static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1706 TargetLibraryInfo *TLI,
1707 const TargetTransformInfo *TTI) {
1708 if (!UserInst)
1709 return false;
1710 unsigned Opcode = UserInst->getOpcode();
1711 switch (Opcode) {
1712 case Instruction::Load: {
1713 LoadInst *LI = cast<LoadInst>(UserInst);
1714 return (LI->getPointerOperand() == Scalar);
1715 }
1716 case Instruction::Store: {
1717 StoreInst *SI = cast<StoreInst>(UserInst);
1718 return (SI->getPointerOperand() == Scalar);
1719 }
1720 case Instruction::Call: {
1721 CallInst *CI = cast<CallInst>(UserInst);
1722 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1723 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1724 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1725 Arg.value().get() == Scalar;
1726 });
1727 }
1728 default:
1729 return false;
1730 }
1731}
1732
1733 /// \returns the AA location that is being accessed by the instruction.
1734 static MemoryLocation getLocation(Instruction *I) {
1735 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1736 return MemoryLocation::get(SI);
1737 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1738 return MemoryLocation::get(LI);
1739 return MemoryLocation();
1740}
1741
1742/// \returns True if the instruction is not a volatile or atomic load/store.
1743static bool isSimple(Instruction *I) {
1744 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1745 return LI->isSimple();
1746 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1747 return SI->isSimple();
1748 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1749 return !MI->isVolatile();
1750 return true;
1751}
1752
1753/// Shuffles \p Mask in accordance with the given \p SubMask.
1754/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1755/// one but two input vectors.
1756static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1757 bool ExtendingManyInputs = false) {
1758 if (SubMask.empty())
1759 return;
1760 assert(
1761 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1762 // Check if input scalars were extended to match the size of other node.
1763 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1764 "SubMask with many inputs support must be larger than the mask.");
1765 if (Mask.empty()) {
1766 Mask.append(SubMask.begin(), SubMask.end());
1767 return;
1768 }
1769 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1770 int TermValue = std::min(Mask.size(), SubMask.size());
1771 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1772 if (SubMask[I] == PoisonMaskElem ||
1773 (!ExtendingManyInputs &&
1774 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1775 continue;
1776 NewMask[I] = Mask[SubMask[I]];
1777 }
1778 Mask.swap(NewMask);
1779}
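// Illustrative sketch (not part of the original source): composing two
// shuffle masks with addMask, assuming both are 4 elements wide. Each SubMask
// entry indexes into the existing Mask, so the masks are chained:
//   SmallVector<int> Mask = {3, 2, 1, 0};
//   addMask(Mask, {1, 0, 3, 2}); // Mask becomes {2, 3, 0, 1}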
1780
1781/// Order may have elements assigned special value (size) which is out of
1782/// bounds. Such indices only appear on places which correspond to undef values
1783/// (see canReuseExtract for details) and used in order to avoid undef values
1784/// have effect on operands ordering.
1785/// The first loop below simply finds all unused indices and then the next loop
1786/// nest assigns these indices for undef values positions.
1787/// As an example below Order has two undef positions and they have assigned
1788/// values 3 and 7 respectively:
1789/// before: 6 9 5 4 9 2 1 0
1790/// after: 6 3 5 4 7 2 1 0
1791 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1792 const size_t Sz = Order.size();
1793 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1794 SmallBitVector MaskedIndices(Sz);
1795 for (unsigned I = 0; I < Sz; ++I) {
1796 if (Order[I] < Sz)
1797 UnusedIndices.reset(Order[I]);
1798 else
1799 MaskedIndices.set(I);
1800 }
1801 if (MaskedIndices.none())
1802 return;
1803 assert(UnusedIndices.count() == MaskedIndices.count() &&
1804 "Non-synced masked/available indices.");
1805 int Idx = UnusedIndices.find_first();
1806 int MIdx = MaskedIndices.find_first();
1807 while (MIdx >= 0) {
1808 assert(Idx >= 0 && "Indices must be synced.");
1809 Order[MIdx] = Idx;
1810 Idx = UnusedIndices.find_next(Idx);
1811 MIdx = MaskedIndices.find_next(MIdx);
1812 }
1813}
1814
1815/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1816/// Opcode1.
1817 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1818 unsigned Opcode0, unsigned Opcode1) {
1819 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1820 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1821 for (unsigned Lane : seq<unsigned>(VL.size())) {
1822 if (isa<PoisonValue>(VL[Lane]))
1823 continue;
1824 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1825 OpcodeMask.set(Lane * ScalarTyNumElements,
1826 Lane * ScalarTyNumElements + ScalarTyNumElements);
1827 }
1828 return OpcodeMask;
1829}
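// Illustrative sketch (not part of the original source): for a 4-wide bundle
// {add, sub, add, sub} of scalar i32 (one element per lane) with
// Opcode0 = Add and Opcode1 = Sub, the returned SmallBitVector has 4 bits and
// lanes 1 and 3 are set, i.e. "true" selects the Sub lanes.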
1830
1831/// Replicates the given \p Val \p VF times.
1833 unsigned VF) {
1834 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1835 "Expected scalar constants.");
1836 SmallVector<Constant *> NewVal(Val.size() * VF);
1837 for (auto [I, V] : enumerate(Val))
1838 std::fill_n(NewVal.begin() + I * VF, VF, V);
1839 return NewVal;
1840}
1841
1842 static void inversePermutation(ArrayRef<unsigned> Indices,
1843 SmallVectorImpl<int> &Mask) {
1844 Mask.clear();
1845 const unsigned E = Indices.size();
1846 Mask.resize(E, PoisonMaskElem);
1847 for (unsigned I = 0; I < E; ++I)
1848 Mask[Indices[I]] = I;
1849}
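// Illustrative sketch (not part of the original source): inversePermutation
// builds the mask that undoes a reordering, filling Mask[Indices[I]] = I.
// For Indices = {2, 0, 1}:
//   SmallVector<int> Mask;
//   inversePermutation(ArrayRef<unsigned>({2, 0, 1}), Mask); // Mask == {1, 2, 0}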
1850
1851/// Reorders the list of scalars in accordance with the given \p Mask.
1852 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1853 ArrayRef<int> Mask) {
1854 assert(!Mask.empty() && "Expected non-empty mask.");
1855 SmallVector<Value *> Prev(Scalars.size(),
1856 PoisonValue::get(Scalars.front()->getType()));
1857 Prev.swap(Scalars);
1858 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1859 if (Mask[I] != PoisonMaskElem)
1860 Scalars[Mask[I]] = Prev[I];
1861}
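// Illustrative sketch (not part of the original source): reorderScalars
// scatters each scalar to the lane named by the mask, Scalars[Mask[I]] =
// Prev[I]. With Scalars = {a, b, c, d} and Mask = {2, 0, 3, 1} the result is
// {b, d, a, c}; lanes whose mask entry is PoisonMaskElem keep a poison
// placeholder.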
1862
1863/// Checks if the provided value does not require scheduling. It does not
1864/// require scheduling if this is not an instruction or it is an instruction
1865/// that does not read/write memory and all operands are either not instructions
1866/// or phi nodes or instructions from different blocks.
1867 static bool areAllOperandsNonInsts(Value *V) {
1868 auto *I = dyn_cast<Instruction>(V);
1869 if (!I)
1870 return true;
1871 return !mayHaveNonDefUseDependency(*I) &&
1872 all_of(I->operands(), [I](Value *V) {
1873 auto *IO = dyn_cast<Instruction>(V);
1874 if (!IO)
1875 return true;
1876 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1877 });
1878}
1879
1880/// Checks if the provided value does not require scheduling. It does not
1881/// require scheduling if this is not an instruction or it is an instruction
1882/// that does not read/write memory and all users are phi nodes or instructions
1883/// from the different blocks.
1884static bool isUsedOutsideBlock(Value *V) {
1885 auto *I = dyn_cast<Instruction>(V);
1886 if (!I)
1887 return true;
1888 // Limits the number of uses to save compile time.
1889 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1890 all_of(I->users(), [I](User *U) {
1891 auto *IU = dyn_cast<Instruction>(U);
1892 if (!IU)
1893 return true;
1894 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1895 });
1896}
1897
1898/// Checks if the specified value does not require scheduling. It does not
1899/// require scheduling if all operands and all users do not need to be scheduled
1900/// in the current basic block.
1901 static bool doesNotNeedToBeScheduled(Value *V) {
1902 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1903 }
1904
1905/// Checks if the specified array of instructions does not require scheduling.
1906/// It is so if all either instructions have operands that do not require
1907/// scheduling or their users do not require scheduling since they are phis or
1908/// in other basic blocks.
1909 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1910 return !VL.empty() &&
1911 all_of(VL, doesNotNeedToBeScheduled);
1912 }
1913
1914/// Returns true if widened type of \p Ty elements with size \p Sz represents
1915/// full vector type, i.e. adding extra element results in extra parts upon type
1916/// legalization.
1917 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1918 unsigned Sz) {
1919 if (Sz <= 1)
1920 return false;
1921 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1922 return false;
1923 if (has_single_bit(Sz))
1924 return true;
1925 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1926 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1927 Sz % NumParts == 0;
1928}
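// Illustrative sketch (not part of the original source; the part counts are
// hypothetical and target-dependent): with i32 elements, Sz = 8 is accepted
// immediately because it is a power of two. For Sz = 12, if
// TTI.getNumberOfParts(<12 x i32>) were 3, then 12 % 3 == 0 and 12 / 3 = 4 is
// a power of two, so 12 is also accepted, whereas Sz = 7 is rejected.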
1929
1930/// Returns number of parts, the type \p VecTy will be split at the codegen
1931 /// phase. If the type is going to be scalarized or does not use whole
1932/// registers, returns 1.
1933static unsigned
1934 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1935 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1936 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1937 if (NumParts == 0 || NumParts >= Limit)
1938 return 1;
1939 unsigned Sz = getNumElements(VecTy);
1940 if (NumParts >= Sz || Sz % NumParts != 0 ||
1941 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1942 return 1;
1943 return NumParts;
1944}
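// Illustrative sketch (not part of the original source; register sizes are
// hypothetical): on a target with 128-bit vector registers, <8 x i32>
// legalizes to 2 parts; 8 % 2 == 0 and each <4 x i32> half fills a register,
// so the helper returns 2. A type that scalarizes (NumParts == 0) or splits
// unevenly falls back to 1.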
1945
1946/// Bottom Up SLP Vectorizer.
1947 class slpvectorizer::BoUpSLP {
1948 class TreeEntry;
1949 class ScheduleEntity;
1950 class ScheduleData;
1951 class ScheduleCopyableData;
1952 class ScheduleBundle;
1955
1956 /// If we decide to generate strided load / store, this struct contains all
1957 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1958 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1959 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1960 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1961 /// the element size of FixedVectorType.
1962 struct StridedPtrInfo {
1963 Value *StrideVal = nullptr;
1964 const SCEV *StrideSCEV = nullptr;
1965 FixedVectorType *Ty = nullptr;
1966 };
1967 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1968
1969public:
1970 /// Tracks the state we can represent the loads in the given sequence.
1978
1985
1986 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1987 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1988 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1989 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1990 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1991 AC(AC), DB(DB), DL(DL), ORE(ORE),
1992 Builder(Se->getContext(), TargetFolder(*DL)) {
1993 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1994 // Use the vector register size specified by the target unless overridden
1995 // by a command-line option.
1996 // TODO: It would be better to limit the vectorization factor based on
1997 // data type rather than just register size. For example, x86 AVX has
1998 // 256-bit registers, but it does not support integer operations
1999 // at that width (that requires AVX2).
2000 if (MaxVectorRegSizeOption.getNumOccurrences())
2001 MaxVecRegSize = MaxVectorRegSizeOption;
2002 else
2003 MaxVecRegSize =
2004 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
2005 .getFixedValue();
2006
2007 if (MinVectorRegSizeOption.getNumOccurrences())
2008 MinVecRegSize = MinVectorRegSizeOption;
2009 else
2010 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2011 }
2012
2013 /// Vectorize the tree that starts with the elements in \p VL.
2014 /// Returns the vectorized root.
2015 Value *vectorizeTree();
2016
2017 /// Vectorize the tree but with the list of externally used values \p
2018 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
2019 /// generated extractvalue instructions.
2020 Value *vectorizeTree(
2021 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2022 Instruction *ReductionRoot = nullptr,
2023 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2024
2025 /// \returns the cost incurred by unwanted spills and fills, caused by
2026 /// holding live values over call sites.
2027 InstructionCost getSpillCost();
2028
2029 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2030 /// final cost.
2033
2034 /// \returns the vectorization cost of the subtree that starts at \p VL.
2035 /// A negative number means that this is profitable.
2037 ArrayRef<Value *> VectorizedVals = {},
2038 InstructionCost ReductionCost = TTI::TCC_Free);
2039
2040 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2041 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2042 void buildTree(ArrayRef<Value *> Roots,
2043 const SmallDenseSet<Value *> &UserIgnoreLst);
2044
2045 /// Construct a vectorizable tree that starts at \p Roots.
2046 void buildTree(ArrayRef<Value *> Roots);
2047
2048 /// Return the scalars of the root node.
2050 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2051 return VectorizableTree.front()->Scalars;
2052 }
2053
2054 /// Returns the type/is-signed info for the root node in the graph without
2055 /// casting.
2056 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2057 const TreeEntry &Root = *VectorizableTree.front();
2058 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2059 !Root.Scalars.front()->getType()->isIntegerTy())
2060 return std::nullopt;
2061 auto It = MinBWs.find(&Root);
2062 if (It != MinBWs.end())
2063 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2064 It->second.first),
2065 It->second.second);
2066 if (Root.getOpcode() == Instruction::ZExt ||
2067 Root.getOpcode() == Instruction::SExt)
2068 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2069 Root.getOpcode() == Instruction::SExt);
2070 return std::nullopt;
2071 }
2072
2073 /// Checks if the root graph node can be emitted with narrower bitwidth at
2074 /// codegen and returns it signedness, if so.
2076 return MinBWs.at(VectorizableTree.front().get()).second;
2077 }
2078
2079 /// Returns reduction type after minbitdth analysis.
2081 if (ReductionBitWidth == 0 ||
2082 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2083 ReductionBitWidth >=
2084 DL->getTypeSizeInBits(
2085 VectorizableTree.front()->Scalars.front()->getType()))
2086 return getWidenedType(
2087 VectorizableTree.front()->Scalars.front()->getType(),
2088 VectorizableTree.front()->getVectorFactor());
2089 return getWidenedType(
2090 IntegerType::get(
2091 VectorizableTree.front()->Scalars.front()->getContext(),
2092 ReductionBitWidth),
2093 VectorizableTree.front()->getVectorFactor());
2094 }
2095
2096 /// Builds external uses of the vectorized scalars, i.e. the list of
2097 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2098 /// ExternallyUsedValues contains additional list of external uses to handle
2099 /// vectorization of reductions.
2100 void
2101 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2102
2103 /// Transforms graph nodes to target specific representations, if profitable.
2104 void transformNodes();
2105
2106 /// Clear the internal data structures that are created by 'buildTree'.
2107 void deleteTree() {
2108 VectorizableTree.clear();
2109 ScalarToTreeEntries.clear();
2110 DeletedNodes.clear();
2111 TransformedToGatherNodes.clear();
2112 OperandsToTreeEntry.clear();
2113 ScalarsInSplitNodes.clear();
2114 MustGather.clear();
2115 NonScheduledFirst.clear();
2116 EntryToLastInstruction.clear();
2117 LastInstructionToPos.clear();
2118 LoadEntriesToVectorize.clear();
2119 IsGraphTransformMode = false;
2120 GatheredLoadsEntriesFirst.reset();
2121 CompressEntryToData.clear();
2122 ExternalUses.clear();
2123 ExternalUsesAsOriginalScalar.clear();
2124 ExternalUsesWithNonUsers.clear();
2125 for (auto &Iter : BlocksSchedules) {
2126 BlockScheduling *BS = Iter.second.get();
2127 BS->clear();
2128 }
2129 MinBWs.clear();
2130 ReductionBitWidth = 0;
2131 BaseGraphSize = 1;
2132 CastMaxMinBWSizes.reset();
2133 ExtraBitWidthNodes.clear();
2134 InstrElementSize.clear();
2135 UserIgnoreList = nullptr;
2136 PostponedGathers.clear();
2137 ValueToGatherNodes.clear();
2138 TreeEntryToStridedPtrInfoMap.clear();
2139 }
2140
2141 unsigned getTreeSize() const { return VectorizableTree.size(); }
2142
2143 /// Returns the base graph size, before any transformations.
2144 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2145
2146 /// Perform LICM and CSE on the newly generated gather sequences.
2147 void optimizeGatherSequence();
2148
2149 /// Does this non-empty order represent an identity order? Identity
2150 /// should be represented as an empty order, so this is used to
2151 /// decide if we can canonicalize a computed order. Undef elements
2152 /// (represented as size) are ignored.
2153 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2154 assert(!Order.empty() && "expected non-empty order");
2155 const unsigned Sz = Order.size();
2156 return all_of(enumerate(Order), [&](const auto &P) {
2157 return P.value() == P.index() || P.value() == Sz;
2158 });
2159 }
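// Illustrative sketch (not part of the original source): with Sz = 4, both
// {0, 1, 2, 3} and {0, 4, 2, 4} are identity orders (4 marks an ignored
// undef position), while {1, 0, 2, 3} is not and therefore cannot be
// canonicalized to the empty order.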
2160
2161 /// Checks if the specified gather tree entry \p TE can be represented as a
2162 /// shuffled vector entry + (possibly) permutation with other gathers. It
2163 /// implements the checks only for possibly ordered scalars (Loads,
2164 /// ExtractElement, ExtractValue), which can be part of the graph.
2165 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2166 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2167 /// node might be ignored.
2168 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2169 bool TopToBottom,
2170 bool IgnoreReorder);
2171
2172 /// Sort loads into increasing pointers offsets to allow greater clustering.
2173 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2174
2175 /// Gets reordering data for the given tree entry. If the entry is vectorized
2176 /// - just return ReorderIndices, otherwise check if the scalars can be
2177 /// reordered and return the most optimal order.
2178 /// \return std::nullopt if ordering is not important, empty order, if
2179 /// identity order is important, or the actual order.
2180 /// \param TopToBottom If true, include the order of vectorized stores and
2181 /// insertelement nodes, otherwise skip them.
2182 /// \param IgnoreReorder true, if the root node order can be ignored.
2183 std::optional<OrdersType>
2184 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2185
2186 /// Checks if it is profitable to reorder the current tree.
2187 /// If the tree does not contain many profitable reordable nodes, better to
2188 /// skip it to save compile time.
2189 bool isProfitableToReorder() const;
2190
2191 /// Reorders the current graph to the most profitable order starting from the
2192 /// root node to the leaf nodes. The best order is chosen only from the nodes
2193 /// of the same size (vectorization factor). Smaller nodes are considered
2194 /// parts of subgraph with smaller VF and they are reordered independently. We
2195 /// can make it because we still need to extend smaller nodes to the wider VF
2196 /// and we can merge reordering shuffles with the widening shuffles.
2197 void reorderTopToBottom();
2198
2199 /// Reorders the current graph to the most profitable order starting from
2200 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2201 /// number of reshuffles if the leaf nodes use the same order. In this case we
2202 /// can merge the orders and just shuffle user node instead of shuffling its
2203 /// operands. Plus, even the leaf nodes have different orders, it allows to
2204 /// sink reordering in the graph closer to the root node and merge it later
2205 /// during analysis.
2206 void reorderBottomToTop(bool IgnoreReorder = false);
2207
2208 /// \return The vector element size in bits to use when vectorizing the
2209 /// expression tree ending at \p V. If V is a store, the size is the width of
2210 /// the stored value. Otherwise, the size is the width of the largest loaded
2211 /// value reaching V. This method is used by the vectorizer to calculate
2212 /// vectorization factors.
2213 unsigned getVectorElementSize(Value *V);
2214
2215 /// Compute the minimum type sizes required to represent the entries in a
2216 /// vectorizable tree.
2217 void computeMinimumValueSizes();
2218
2219 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2220 unsigned getMaxVecRegSize() const {
2221 return MaxVecRegSize;
2222 }
2223
2224 // \returns minimum vector register size as set by cl::opt.
2225 unsigned getMinVecRegSize() const {
2226 return MinVecRegSize;
2227 }
2228
2229 unsigned getMinVF(unsigned Sz) const {
2230 return std::max(2U, getMinVecRegSize() / Sz);
2231 }
2232
2233 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2234 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2235 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2236 return MaxVF ? MaxVF : UINT_MAX;
2237 }
2238
2239 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2240 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2241 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2242 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2243 ///
2244 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2245 unsigned canMapToVector(Type *T) const;
2246
2247 /// \returns True if the VectorizableTree is both tiny and not fully
2248 /// vectorizable. We do not vectorize such trees.
2249 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2250
2251 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2252 /// It may happen, if all gather nodes are loads and they cannot be
2253 /// "clusterized". In this case even subgraphs cannot be vectorized more
2254 /// effectively than the base graph.
2255 bool isTreeNotExtendable() const;
2256
2257 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2258 /// can be load combined in the backend. Load combining may not be allowed in
2259 /// the IR optimizer, so we do not want to alter the pattern. For example,
2260 /// partially transforming a scalar bswap() pattern into vector code is
2261 /// effectively impossible for the backend to undo.
2262 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2263 /// may not be necessary.
2264 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2265
2266 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2267 /// can be load combined in the backend. Load combining may not be allowed in
2268 /// the IR optimizer, so we do not want to alter the pattern. For example,
2269 /// partially transforming a scalar bswap() pattern into vector code is
2270 /// effectively impossible for the backend to undo.
2271 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2272 /// may not be necessary.
2273 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2274 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2275 Align Alignment, const int64_t Diff,
2276 const size_t Sz) const;
2277
2278 /// Return true if an array of scalar loads can be replaced with a strided
2279 /// load (with constant stride).
2280 ///
2281 /// It is possible that the load gets "widened". Suppose that originally each
2282 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2283 /// constant): %b + 0 * %s + 0, %b + 0 * %s + 1, %b + 0 * %s + 2,
2284 /// ...
2285 /// %b + 0 * %s + (w - 1)
2286 ///
2287 /// %b + 1 * %s + 0
2288 /// %b + 1 * %s + 1
2289 /// %b + 1 * %s + 2
2290 /// ...
2291 /// %b + 1 * %s + (w - 1)
2292 /// ...
2293 ///
2294 /// %b + (n - 1) * %s + 0
2295 /// %b + (n - 1) * %s + 1
2296 /// %b + (n - 1) * %s + 2
2297 /// ...
2298 /// %b + (n - 1) * %s + (w - 1)
2299 ///
2300 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2301 ///
2302 /// \param PointerOps list of pointer arguments of loads.
2303 /// \param ElemTy original scalar type of loads.
2304 /// \param Alignment alignment of the first load.
2305 /// \param SortedIndices is the order of PointerOps as returned by
2306 /// `sortPtrAccesses`
2307 /// \param Diff Pointer difference between the lowest and the highest pointer
2308 /// in `PointerOps` as returned by `getPointersDiff`.
2309 /// \param Ptr0 first pointer in `PointerOps`.
2310 /// \param PtrN last pointer in `PointerOps`.
2311 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2312 /// of `SPtrInfo` necessary to generate the strided load later.
2313 bool analyzeConstantStrideCandidate(
2314 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2315 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2316 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
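// Illustrative sketch (not part of the original source): a hypothetical
// widened strided load. For i8 loads (k = 1) from
//   %b + 0, %b + 1, %b + 2, %b + 3, %b + 64, %b + 65, %b + 66, %b + 67
// the pointers form n = 2 groups of w = 4 consecutive bytes with constant
// stride %s = 64, so the whole group can be emitted as one strided load of
// n = 2 elements of k * w = 4 bytes each instead of a gather of eight bytes.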
2317
2318 /// Return true if an array of scalar loads can be replaced with a strided
2319 /// load (with run-time stride).
2320 /// \param PointerOps list of pointer arguments of loads.
2321 /// \param ScalarTy type of loads.
2322 /// \param CommonAlignment common alignment of loads as computed by
2323 /// `computeCommonAlignment<LoadInst>`.
2324 /// \param SortedIndices is a list of indices computed by this function such
2325 /// that the sequence `PointerOps[SortedIndices[0]],
2326 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2327 /// ordered by the coefficient of the stride. For example, if PointerOps is
2328 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2329 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2330 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2331 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2332 /// of `SPtrInfo` necessary to generate the strided load later.
2333 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2334 Align CommonAlignment,
2335 SmallVectorImpl<unsigned> &SortedIndices,
2336 StridedPtrInfo &SPtrInfo) const;
2337
2338 /// Checks if the given array of loads can be represented as a vectorized,
2339 /// scatter or just simple gather.
2340 /// \param VL list of loads.
2341 /// \param VL0 main load value.
2342 /// \param Order returned order of load instructions.
2343 /// \param PointerOps returned list of pointer operands.
2344 /// \param BestVF return best vector factor, if recursive check found better
2345 /// vectorization sequences rather than masked gather.
2346 /// \param TryRecursiveCheck used to check if long masked gather can be
2347 /// represented as a series of loads/insert subvector, if profitable.
2348 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2349 SmallVectorImpl<unsigned> &Order,
2350 SmallVectorImpl<Value *> &PointerOps,
2351 StridedPtrInfo &SPtrInfo,
2352 unsigned *BestVF = nullptr,
2353 bool TryRecursiveCheck = true) const;
2354
2355 /// Registers non-vectorizable sequence of loads
2356 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2357 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2358 }
2359
2360 /// Checks if the given loads sequence is known as not vectorizable
2361 template <typename T>
2363 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2364 }
2365
2367
2368 /// This structure holds any data we need about the edges being traversed
2369 /// during buildTreeRec(). We keep track of:
2370 /// (i) the user TreeEntry index, and
2371 /// (ii) the index of the edge.
2372 struct EdgeInfo {
2373 EdgeInfo() = default;
2374 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2375 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2376 /// The user TreeEntry.
2377 TreeEntry *UserTE = nullptr;
2378 /// The operand index of the use.
2379 unsigned EdgeIdx = UINT_MAX;
2380#ifndef NDEBUG
2381 friend inline raw_ostream &operator<<(raw_ostream &OS,
2382 const BoUpSLP::EdgeInfo &EI) {
2383 EI.dump(OS);
2384 return OS;
2385 }
2386 /// Debug print.
2387 void dump(raw_ostream &OS) const {
2388 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2389 << " EdgeIdx:" << EdgeIdx << "}";
2390 }
2391 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2392#endif
2393 bool operator == (const EdgeInfo &Other) const {
2394 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2395 }
2396
2397 operator bool() const { return UserTE != nullptr; }
2398 };
2399 friend struct DenseMapInfo<EdgeInfo>;
2400
2401 /// A helper class used for scoring candidates for two consecutive lanes.
2402 class LookAheadHeuristics {
2403 const TargetLibraryInfo &TLI;
2404 const DataLayout &DL;
2405 ScalarEvolution &SE;
2406 const BoUpSLP &R;
2407 int NumLanes; // Total number of lanes (aka vectorization factor).
2408 int MaxLevel; // The maximum recursion depth for accumulating score.
2409
2410 public:
2411 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2412 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2413 int MaxLevel)
2414 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2415 MaxLevel(MaxLevel) {}
2416
2417 // The hard-coded scores listed here are not very important, though it shall
2418 // be higher for better matches to improve the resulting cost. When
2419 // computing the scores of matching one sub-tree with another, we are
2420 // basically counting the number of values that are matching. So even if all
2421 // scores are set to 1, we would still get a decent matching result.
2422 // However, sometimes we have to break ties. For example we may have to
2423 // choose between matching loads vs matching opcodes. This is what these
2424 // scores are helping us with: they provide the order of preference. Also,
2425 // this is important if the scalar is externally used or used in another
2426 // tree entry node in the different lane.
2427
2428 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2429 static const int ScoreConsecutiveLoads = 4;
2430 /// The same load multiple times. This should have a better score than
2431 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2432 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2433 /// a vector load and 1.0 for a broadcast.
2434 static const int ScoreSplatLoads = 3;
2435 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2436 static const int ScoreReversedLoads = 3;
2437 /// A load candidate for masked gather.
2438 static const int ScoreMaskedGatherCandidate = 1;
2439 /// ExtractElementInst from same vector and consecutive indexes.
2440 static const int ScoreConsecutiveExtracts = 4;
2441 /// ExtractElementInst from same vector and reversed indices.
2442 static const int ScoreReversedExtracts = 3;
2443 /// Constants.
2444 static const int ScoreConstants = 2;
2445 /// Instructions with the same opcode.
2446 static const int ScoreSameOpcode = 2;
2447 /// Instructions with alt opcodes (e.g, add + sub).
2448 static const int ScoreAltOpcodes = 1;
2449 /// Identical instructions (a.k.a. splat or broadcast).
2450 static const int ScoreSplat = 1;
2451 /// Matching with an undef is preferable to failing.
2452 static const int ScoreUndef = 1;
2453 /// Score for failing to find a decent match.
2454 static const int ScoreFail = 0;
2455 /// Score if all users are vectorized.
2456 static const int ScoreAllUserVectorized = 1;
2457
2458 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2459 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2460 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2461 /// MainAltOps.
2462 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2463 ArrayRef<Value *> MainAltOps) const {
2464 if (!isValidElementType(V1->getType()) ||
2465 !isValidElementType(V2->getType()))
2466 return LookAheadHeuristics::ScoreFail;
2467
2468 if (V1 == V2) {
2469 if (isa<LoadInst>(V1)) {
2470 // Returns true if the users of V1 and V2 won't need to be extracted.
2471 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2472 // Bail out if we have too many uses to save compilation time.
2473 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2474 return false;
2475
2476 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2477 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2478 return U == U1 || U == U2 || R.isVectorized(U);
2479 });
2480 };
2481 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2482 };
2483 // A broadcast of a load can be cheaper on some targets.
2484 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2485 ElementCount::getFixed(NumLanes)) &&
2486 ((int)V1->getNumUses() == NumLanes ||
2487 AllUsersAreInternal(V1, V2)))
2488 return LookAheadHeuristics::ScoreSplatLoads;
2489 }
2490 return LookAheadHeuristics::ScoreSplat;
2491 }
2492
2493 auto CheckSameEntryOrFail = [&]() {
2494 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2495 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2496 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2497 !TEs2.empty() &&
2498 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2499 return LookAheadHeuristics::ScoreSplatLoads;
2500 }
2501 return LookAheadHeuristics::ScoreFail;
2502 };
2503
2504 auto *LI1 = dyn_cast<LoadInst>(V1);
2505 auto *LI2 = dyn_cast<LoadInst>(V2);
2506 if (LI1 && LI2) {
2507 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2508 !LI2->isSimple())
2509 return CheckSameEntryOrFail();
2510
2511 std::optional<int64_t> Dist = getPointersDiff(
2512 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2513 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2514 if (!Dist || *Dist == 0) {
2515 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2516 getUnderlyingObject(LI2->getPointerOperand()) &&
2517 R.TTI->isLegalMaskedGather(
2518 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2519 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2520 return CheckSameEntryOrFail();
2521 }
2522 // The distance is too large - still may be profitable to use masked
2523 // loads/gathers.
2524 if (std::abs(*Dist) > NumLanes / 2)
2525 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2526 // This still will detect consecutive loads, but we might have "holes"
2527 // in some cases. It is ok for non-power-2 vectorization and may produce
2528 // better results. It should not affect current vectorization.
2529 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2530 : LookAheadHeuristics::ScoreReversedLoads;
2531 }
2532
2533 auto *C1 = dyn_cast<Constant>(V1);
2534 auto *C2 = dyn_cast<Constant>(V2);
2535 if (C1 && C2)
2536 return LookAheadHeuristics::ScoreConstants;
2537
2538 // Consider constants and buildvector compatible.
2539 if ((C1 && isa<InsertElementInst>(V2)) ||
2540 (C2 && isa<InsertElementInst>(V1)))
2541 return LookAheadHeuristics::ScoreConstants;
2542
2543 // Extracts from consecutive indexes of the same vector better score as
2544 // the extracts could be optimized away.
2545 Value *EV1;
2546 ConstantInt *Ex1Idx;
2547 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2548 // Undefs are always profitable for extractelements.
2549 // Compiler can easily combine poison and extractelement <non-poison> or
2550 // undef and extractelement <poison>. But combining undef +
2551 // extractelement <non-poison-but-may-produce-poison> requires some
2552 // extra operations.
2553 if (isa<UndefValue>(V2))
2554 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2555 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2556 : LookAheadHeuristics::ScoreSameOpcode;
2557 Value *EV2 = nullptr;
2558 ConstantInt *Ex2Idx = nullptr;
2559 if (match(V2,
2560 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2561 m_Undef())))) {
2562 // Undefs are always profitable for extractelements.
2563 if (!Ex2Idx)
2564 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2565 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2566 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2567 if (EV2 == EV1) {
2568 int Idx1 = Ex1Idx->getZExtValue();
2569 int Idx2 = Ex2Idx->getZExtValue();
2570 int Dist = Idx2 - Idx1;
2571 // The distance is too large - still may be profitable to use
2572 // shuffles.
2573 if (std::abs(Dist) == 0)
2574 return LookAheadHeuristics::ScoreSplat;
2575 if (std::abs(Dist) > NumLanes / 2)
2576 return LookAheadHeuristics::ScoreSameOpcode;
2577 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2578 : LookAheadHeuristics::ScoreReversedExtracts;
2579 }
2580 return LookAheadHeuristics::ScoreAltOpcodes;
2581 }
2582 return CheckSameEntryOrFail();
2583 }
2584
2585 auto *I1 = dyn_cast<Instruction>(V1);
2586 auto *I2 = dyn_cast<Instruction>(V2);
2587 if (I1 && I2) {
2588 if (I1->getParent() != I2->getParent())
2589 return CheckSameEntryOrFail();
2590 SmallVector<Value *, 4> Ops(MainAltOps);
2591 Ops.push_back(I1);
2592 Ops.push_back(I2);
2593 InstructionsState S = getSameOpcode(Ops, TLI);
2594 // Note: Only consider instructions with <= 2 operands to avoid
2595 // complexity explosion.
2596 if (S &&
2597 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2598 !S.isAltShuffle()) &&
2599 all_of(Ops, [&S](Value *V) {
2600 return isa<PoisonValue>(V) ||
2601 cast<Instruction>(V)->getNumOperands() ==
2602 S.getMainOp()->getNumOperands();
2603 }))
2604 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2605 : LookAheadHeuristics::ScoreSameOpcode;
2606 }
2607
2608 if (I1 && isa<PoisonValue>(V2))
2609 return LookAheadHeuristics::ScoreSameOpcode;
2610
2611 if (isa<UndefValue>(V2))
2612 return LookAheadHeuristics::ScoreUndef;
2613
2614 return CheckSameEntryOrFail();
2615 }
2616
2617 /// Go through the operands of \p LHS and \p RHS recursively until
2618 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2619 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2620 /// of \p U1 and \p U2), except at the beginning of the recursion where
2621 /// these are set to nullptr.
2622 ///
2623 /// For example:
2624 /// \verbatim
2625 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2626 /// \ / \ / \ / \ /
2627 /// + + + +
2628 /// G1 G2 G3 G4
2629 /// \endverbatim
2630 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2631 /// each level recursively, accumulating the score. It starts from matching
2632 /// the additions at level 0, then moves on to the loads (level 1). The
2633 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2634 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2635 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2636 /// Please note that the order of the operands does not matter, as we
2637 /// evaluate the score of all profitable combinations of operands. In
2638 /// other words the score of G1 and G4 is the same as G1 and G2. This
2639 /// heuristic is based on ideas described in:
2640 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2641 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2642 /// Luís F. W. Góes
2644 Instruction *U2, int CurrLevel,
2645 ArrayRef<Value *> MainAltOps) const {
2646
2647 // Get the shallow score of V1 and V2.
2648 int ShallowScoreAtThisLevel =
2649 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2650
2651 // If reached MaxLevel,
2652 // or if V1 and V2 are not instructions,
2653 // or if they are SPLAT,
2654 // or if they are not consecutive,
2655 // or if profitable to vectorize loads or extractelements, early return
2656 // the current cost.
2657 auto *I1 = dyn_cast<Instruction>(LHS);
2658 auto *I2 = dyn_cast<Instruction>(RHS);
2659 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2660 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2661 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2662 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2663 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2664 ShallowScoreAtThisLevel))
2665 return ShallowScoreAtThisLevel;
2666 assert(I1 && I2 && "Should have early exited.");
2667
2668 // Contains the I2 operand indexes that got matched with I1 operands.
2669 SmallSet<unsigned, 4> Op2Used;
2670
2671 // Recursion towards the operands of I1 and I2. We are trying all possible
2672 // operand pairs, and keeping track of the best score.
2673 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2674 OpIdx1 != NumOperands1; ++OpIdx1) {
2675 // Try to pair op1I with the best operand of I2.
2676 int MaxTmpScore = 0;
2677 unsigned MaxOpIdx2 = 0;
2678 bool FoundBest = false;
2679 // If I2 is commutative try all combinations.
2680 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2681 unsigned ToIdx = isCommutative(I2)
2682 ? I2->getNumOperands()
2683 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2684 assert(FromIdx <= ToIdx && "Bad index");
2685 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2686 // Skip operands already paired with OpIdx1.
2687 if (Op2Used.count(OpIdx2))
2688 continue;
2689 // Recursively calculate the cost at each level
2690 int TmpScore =
2691 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2692 I1, I2, CurrLevel + 1, {});
2693 // Look for the best score.
2694 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2695 TmpScore > MaxTmpScore) {
2696 MaxTmpScore = TmpScore;
2697 MaxOpIdx2 = OpIdx2;
2698 FoundBest = true;
2699 }
2700 }
2701 if (FoundBest) {
2702 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2703 Op2Used.insert(MaxOpIdx2);
2704 ShallowScoreAtThisLevel += MaxTmpScore;
2705 }
2706 }
2707 return ShallowScoreAtThisLevel;
2708 }
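// Illustrative sketch (not part of the original source; assumes the
// recursion depth reaches the loads and the loads are simple and adjacent):
// matching G1 = A[0] + B[0] against G2 = A[1] + B[1] first scores the two
// adds (ScoreSameOpcode = 2), then pairs {A[0], A[1]} and {B[0], B[1]} as
// consecutive loads (ScoreConsecutiveLoads = 4 each), for a total of 10;
// matching G1 against G3 = C[0] + D[0] only earns the opcode score of 2.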
2709 };
2710 /// A helper data structure to hold the operands of a vector of instructions.
2711 /// This supports a fixed vector length for all operand vectors.
2712 class VLOperands {
2713 /// For each operand we need (i) the value, and (ii) the opcode that it
2714 /// would be attached to if the expression was in a left-linearized form.
2715 /// This is required to avoid illegal operand reordering.
2716 /// For example:
2717 /// \verbatim
2718 /// 0 Op1
2719 /// |/
2720 /// Op1 Op2 Linearized + Op2
2721 /// \ / ----------> |/
2722 /// - -
2723 ///
2724 /// Op1 - Op2 (0 + Op1) - Op2
2725 /// \endverbatim
2726 ///
2727 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2728 ///
2729 /// Another way to think of this is to track all the operations across the
2730 /// path from the operand all the way to the root of the tree and to
2731 /// calculate the operation that corresponds to this path. For example, the
2732 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2733 /// corresponding operation is a '-' (which matches the one in the
2734 /// linearized tree, as shown above).
2735 ///
2736 /// For lack of a better term, we refer to this operation as Accumulated
2737 /// Path Operation (APO).
2738 struct OperandData {
2739 OperandData() = default;
2740 OperandData(Value *V, bool APO, bool IsUsed)
2741 : V(V), APO(APO), IsUsed(IsUsed) {}
2742 /// The operand value.
2743 Value *V = nullptr;
2744 /// TreeEntries only allow a single opcode, or an alternate sequence of
2745 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2746 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2747 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2748 /// (e.g., Add/Mul)
2749 bool APO = false;
2750 /// Helper data for the reordering function.
2751 bool IsUsed = false;
2752 };
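// Illustrative sketch (not part of the original source): for the expression
// (x - y) - z the left-linearized form is ((0 + x) - y) - z, so the operand
// data records APO = false for x (attached to '+') and APO = true for y and
// z (both attached to '-'); reordering may only exchange operands whose APO
// flags match.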
2753
2754 /// During operand reordering, we are trying to select the operand at lane
2755 /// that matches best with the operand at the neighboring lane. Our
2756 /// selection is based on the type of value we are looking for. For example,
2757 /// if the neighboring lane has a load, we need to look for a load that is
2758 /// accessing a consecutive address. These strategies are summarized in the
2759 /// 'ReorderingMode' enumerator.
2760 enum class ReorderingMode {
2761 Load, ///< Matching loads to consecutive memory addresses
2762 Opcode, ///< Matching instructions based on opcode (same or alternate)
2763 Constant, ///< Matching constants
2764 Splat, ///< Matching the same instruction multiple times (broadcast)
2765 Failed, ///< We failed to create a vectorizable group
2766 };
2767
2768 using OperandDataVec = SmallVector<OperandData, 2>;
2769
2770 /// A vector of operand vectors.
2771 SmallVector<OperandDataVec, 4> OpsVec;
2772 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2773 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2774 unsigned ArgSize = 0;
2775
2776 const TargetLibraryInfo &TLI;
2777 const DataLayout &DL;
2778 ScalarEvolution &SE;
2779 const BoUpSLP &R;
2780 const Loop *L = nullptr;
2781
2782 /// \returns the operand data at \p OpIdx and \p Lane.
2783 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2784 return OpsVec[OpIdx][Lane];
2785 }
2786
2787 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2788 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2789 return OpsVec[OpIdx][Lane];
2790 }
2791
2792 /// Clears the used flag for all entries.
2793 void clearUsed() {
2794 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2795 OpIdx != NumOperands; ++OpIdx)
2796 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2797 ++Lane)
2798 OpsVec[OpIdx][Lane].IsUsed = false;
2799 }
2800
2801 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2802 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2803 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2804 }
2805
2806 /// \param Lane lane of the operands under analysis.
2807 /// \param OpIdx operand index in \p Lane lane we're looking the best
2808 /// candidate for.
2809 /// \param Idx operand index of the current candidate value.
2810 /// \returns The additional score due to possible broadcasting of the
2811 /// elements in the lane. It is more profitable to have power-of-2 unique
2812 /// elements in the lane, it will be vectorized with higher probability
2813 /// after removing duplicates. Currently the SLP vectorizer supports only
2814 /// vectorization of the power-of-2 number of unique scalars.
2815 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2816 const SmallBitVector &UsedLanes) const {
2817 Value *IdxLaneV = getData(Idx, Lane).V;
2818 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2819 isa<ExtractElementInst>(IdxLaneV))
2820 return 0;
2821 SmallDenseMap<Value *, unsigned, 4> Uniques;
2822 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2823 if (Ln == Lane)
2824 continue;
2825 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2826 if (!isa<Instruction>(OpIdxLnV))
2827 return 0;
2828 Uniques.try_emplace(OpIdxLnV, Ln);
2829 }
2830 unsigned UniquesCount = Uniques.size();
2831 auto IdxIt = Uniques.find(IdxLaneV);
2832 unsigned UniquesCntWithIdxLaneV =
2833 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2834 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2835 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2836 unsigned UniquesCntWithOpIdxLaneV =
2837 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2838 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2839 return 0;
2840 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2841 UniquesCntWithOpIdxLaneV,
2842 UniquesCntWithOpIdxLaneV -
2843 bit_floor(UniquesCntWithOpIdxLaneV)) -
2844 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2845 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2846 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2847 }
2848
2849 /// \param Lane lane of the operands under analysis.
2850 /// \param OpIdx operand index in \p Lane lane we're looking the best
2851 /// candidate for.
2852 /// \param Idx operand index of the current candidate value.
2853 /// \returns The additional score for the scalar which users are all
2854 /// vectorized.
2855 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2856 Value *IdxLaneV = getData(Idx, Lane).V;
2857 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2858 // Do not care about number of uses for vector-like instructions
2859 // (extractelement/extractvalue with constant indices), they are extracts
2860 // themselves and already externally used. Vectorization of such
2861 // instructions does not add extra extractelement instruction, just may
2862 // remove it.
2863 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2864 isVectorLikeInstWithConstOps(OpIdxLaneV))
2865 return LookAheadHeuristics::ScoreAllUserVectorized;
2866 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2867 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2868 return 0;
2869 return R.areAllUsersVectorized(IdxLaneI)
2870 ? LookAheadHeuristics::ScoreAllUserVectorized
2871 : 0;
2872 }
2873
2874 /// Score scaling factor for fully compatible instructions but with
2875 /// different number of external uses. Allows better selection of the
2876 /// instructions with less external uses.
2877 static const int ScoreScaleFactor = 10;
2878
2879 /// \Returns the look-ahead score, which tells us how much the sub-trees
2880 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2881 /// score. This helps break ties in an informed way when we cannot decide on
2882 /// the order of the operands by just considering the immediate
2883 /// predecessors.
2884 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2885 int Lane, unsigned OpIdx, unsigned Idx,
2886 bool &IsUsed, const SmallBitVector &UsedLanes) {
2887 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2888 LookAheadMaxDepth);
2889 // Keep track of the instruction stack as we recurse into the operands
2890 // during the look-ahead score exploration.
2891 int Score =
2892 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2893 /*CurrLevel=*/1, MainAltOps);
2894 if (Score) {
2895 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2896 if (Score <= -SplatScore) {
2897 // Failed score.
2898 Score = 0;
2899 } else {
2900 Score += SplatScore;
2901 // Scale score to see the difference between different operands
2902 // and similar operands but all vectorized/not all vectorized
2903 // uses. It does not affect actual selection of the best
2904 // compatible operand in general, just allows to select the
2905 // operand with all vectorized uses.
2906 Score *= ScoreScaleFactor;
2907 Score += getExternalUseScore(Lane, OpIdx, Idx);
2908 IsUsed = true;
2909 }
2910 }
2911 return Score;
2912 }
2913
2914 /// Best defined scores per lanes between the passes. Used to choose the
2915 /// best operand (with the highest score) between the passes.
2916 /// The key - {Operand Index, Lane}.
2917 /// The value - the best score between the passes for the lane and the
2918 /// operand.
2919 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2920 BestScoresPerLanes;
2921
2922 // Search all operands in Ops[*][Lane] for the one that matches best
2923 // Ops[OpIdx][LastLane] and return its operand index.
2924 // If no good match can be found, return std::nullopt.
2925 std::optional<unsigned>
2926 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2927 ArrayRef<ReorderingMode> ReorderingModes,
2928 ArrayRef<Value *> MainAltOps,
2929 const SmallBitVector &UsedLanes) {
2930 unsigned NumOperands = getNumOperands();
2931
2932 // The operand of the previous lane at OpIdx.
2933 Value *OpLastLane = getData(OpIdx, LastLane).V;
2934
2935 // Our strategy mode for OpIdx.
2936 ReorderingMode RMode = ReorderingModes[OpIdx];
2937 if (RMode == ReorderingMode::Failed)
2938 return std::nullopt;
2939
2940 // The linearized opcode of the operand at OpIdx, Lane.
2941 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2942
2943 // The best operand index and its score.
2944 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2945 // are using the score to differentiate between the two.
2946 struct BestOpData {
2947 std::optional<unsigned> Idx;
2948 unsigned Score = 0;
2949 } BestOp;
2950 BestOp.Score =
2951 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2952 .first->second;
2953
2954 // Track if the operand must be marked as used. If the operand is set to
2955 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2956 // want to reestimate the operands again on the following iterations).
2957 bool IsUsed = RMode == ReorderingMode::Splat ||
2958 RMode == ReorderingMode::Constant ||
2959 RMode == ReorderingMode::Load;
2960 // Iterate through all unused operands and look for the best.
2961 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2962 // Get the operand at Idx and Lane.
2963 OperandData &OpData = getData(Idx, Lane);
2964 Value *Op = OpData.V;
2965 bool OpAPO = OpData.APO;
2966
2967 // Skip already selected operands.
2968 if (OpData.IsUsed)
2969 continue;
2970
2971 // Skip if we are trying to move the operand to a position with a
2972 // different opcode in the linearized tree form. This would break the
2973 // semantics.
2974 if (OpAPO != OpIdxAPO)
2975 continue;
2976
2977 // Look for an operand that matches the current mode.
2978 switch (RMode) {
2979 case ReorderingMode::Load:
2980 case ReorderingMode::Opcode: {
2981 bool LeftToRight = Lane > LastLane;
2982 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2983 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2984 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2985 OpIdx, Idx, IsUsed, UsedLanes);
2986 if (Score > static_cast<int>(BestOp.Score) ||
2987 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2988 Idx == OpIdx)) {
2989 BestOp.Idx = Idx;
2990 BestOp.Score = Score;
2991 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2992 }
2993 break;
2994 }
2995 case ReorderingMode::Constant:
2996 if (isa<Constant>(Op) ||
2997 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2998 BestOp.Idx = Idx;
2999 if (isa<Constant>(Op)) {
3000 BestOp.Score = LookAheadHeuristics::ScoreConstants;
3001 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3002 LookAheadHeuristics::ScoreConstants;
3003 }
3004 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
3005 IsUsed = false;
3006 }
3007 break;
3008 case ReorderingMode::Splat:
3009 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
3010 IsUsed = Op == OpLastLane;
3011 if (Op == OpLastLane) {
3012 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3013 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3014 LookAheadHeuristics::ScoreSplat;
3015 }
3016 BestOp.Idx = Idx;
3017 }
3018 break;
3019 case ReorderingMode::Failed:
3020 llvm_unreachable("Not expected Failed reordering mode.");
3021 }
3022 }
3023
3024 if (BestOp.Idx) {
3025 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3026 return BestOp.Idx;
3027 }
3028 // If we could not find a good match return std::nullopt.
3029 return std::nullopt;
3030 }
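 // For illustration (hypothetical scalars, not taken from real IR): given two
 // lanes of a commutative add
 //   Lane 0: a0 = b0 + c0   // operands already fixed for OpIdx 0 and 1
 //   Lane 1: a1 = c1 + b1
 // a call such as getBestOperand(/*OpIdx=*/0, /*Lane=*/1, /*LastLane=*/0, ...)
 // compares the look-ahead scores of the pairs (b0, c1) and (b0, b1). If b0
 // and b1 are, say, consecutive loads, the second pair scores higher, so the
 // function returns the index of b1 and reorder() swaps it into operand
 // position 0 of Lane 1.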
3031
3032 /// Helper for reorder().
3033 /// \returns the lane that we should start reordering from. This is the one
3034 /// whose operands have the least freedom to move, or which is the least
3035 /// profitable to change because it already has the most optimal set of operands.
3036 unsigned getBestLaneToStartReordering() const {
3037 unsigned Min = UINT_MAX;
3038 unsigned SameOpNumber = 0;
3039 // std::pair<unsigned, unsigned> is used to implement a simple voting
3040 // algorithm and choose the lane with the least number of operands that
3041 // can freely move about, or which is the least profitable to change because
3042 // it already has the most optimal set of operands. The first unsigned is a
3043 // counter for voting, the second unsigned is the counter of lanes with
3044 // instructions with same/alternate opcodes and same parent basic block.
3045 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
3046 // Try to be closer to the original results, if we have multiple lanes
3047 // with same cost. If 2 lanes have the same cost, use the one with the
3048 // highest index.
3049 for (int I = getNumLanes(); I > 0; --I) {
3050 unsigned Lane = I - 1;
3051 OperandsOrderData NumFreeOpsHash =
3052 getMaxNumOperandsThatCanBeReordered(Lane);
3053 // Compare the number of operands that can move and choose the one with
3054 // the least number.
3055 if (NumFreeOpsHash.NumOfAPOs < Min) {
3056 Min = NumFreeOpsHash.NumOfAPOs;
3057 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3058 HashMap.clear();
3059 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3060 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3061 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3062 // Select the most optimal lane in terms of number of operands that
3063 // should be moved around.
3064 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3065 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3066 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3067 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3068 auto [It, Inserted] =
3069 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3070 if (!Inserted)
3071 ++It->second.first;
3072 }
3073 }
3074 // Select the lane with the minimum counter.
3075 unsigned BestLane = 0;
3076 unsigned CntMin = UINT_MAX;
3077 for (const auto &Data : reverse(HashMap)) {
3078 if (Data.second.first < CntMin) {
3079 CntMin = Data.second.first;
3080 BestLane = Data.second.second;
3081 }
3082 }
3083 return BestLane;
3084 }
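 // For example, for the four lanes shown in the comment inside reorder()
 // below (two adds and two subs), each subtraction lane has only one operand
 // per APO class (NumOfAPOs == 1), while the addition lanes have two, so one
 // of the subtraction lanes wins the vote and becomes the lane that reorder()
 // starts from.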
3085
3086 /// Data structure that helps to reorder operands.
3087 struct OperandsOrderData {
3088 /// The best number of operands with the same APOs, which can be
3089 /// reordered.
3090 unsigned NumOfAPOs = UINT_MAX;
3091 /// Number of operands with the same/alternate instruction opcode and
3092 /// parent.
3093 unsigned NumOpsWithSameOpcodeParent = 0;
3094 /// Hash for the actual operands ordering.
3095 /// Used to count operands, actually their position id and opcode
3096 /// value. It is used in the voting mechanism to find the lane with the
3097 /// least number of operands that can freely move about, or which is the
3098 /// least profitable to change because it already has the most optimal set
3099 /// of operands. It could be replaced with a SmallVector<unsigned>, but the
3100 /// hash code is faster and requires less memory.
3101 unsigned Hash = 0;
3102 };
3103 /// \returns the maximum number of operands that are allowed to be reordered
3104 /// for \p Lane and the number of compatible instructions (with the same
3105 /// parent/opcode). This is used as a heuristic for selecting the first lane
3106 /// to start operand reordering.
3107 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3108 unsigned CntTrue = 0;
3109 unsigned NumOperands = getNumOperands();
3110 // Operands with the same APO can be reordered. We therefore need to count
3111 // how many of them we have for each APO, like this: Cnt[APO] = x.
3112 // Since we only have two APOs, namely true and false, we can avoid using
3113 // a map. Instead we can simply count the number of operands that
3114 // correspond to one of them (in this case the 'true' APO), and calculate
3115 // the other by subtracting it from the total number of operands.
3116 // Operands with the same instruction opcode and parent are more
3117 // profitable since we don't need to move them in many cases, with a high
3118 // probability such lane already can be vectorized effectively.
3119 bool AllUndefs = true;
3120 unsigned NumOpsWithSameOpcodeParent = 0;
3121 Instruction *OpcodeI = nullptr;
3122 BasicBlock *Parent = nullptr;
3123 unsigned Hash = 0;
3124 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3125 const OperandData &OpData = getData(OpIdx, Lane);
3126 if (OpData.APO)
3127 ++CntTrue;
3128 // Use Boyer-Moore majority voting for finding the majority opcode and
3129 // the number of times it occurs.
3130 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3131 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3132 I->getParent() != Parent) {
3133 if (NumOpsWithSameOpcodeParent == 0) {
3134 NumOpsWithSameOpcodeParent = 1;
3135 OpcodeI = I;
3136 Parent = I->getParent();
3137 } else {
3138 --NumOpsWithSameOpcodeParent;
3139 }
3140 } else {
3141 ++NumOpsWithSameOpcodeParent;
3142 }
3143 }
3144 Hash = hash_combine(
3145 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3146 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3147 }
3148 if (AllUndefs)
3149 return {};
3150 OperandsOrderData Data;
3151 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3152 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3153 Data.Hash = Hash;
3154 return Data;
3155 }
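 // The opcode/parent tracking above is the classic Boyer-Moore majority vote.
 // A minimal standalone sketch of the same idea (hypothetical helper; plain
 // ints stand in for (opcode, parent) pairs):
 //   int majorityCandidate(ArrayRef<int> Xs) {
 //     int Candidate = 0;
 //     unsigned Count = 0;
 //     for (int X : Xs) {
 //       if (Count == 0) {
 //         Candidate = X;
 //         Count = 1;
 //       } else if (X == Candidate) {
 //         ++Count;
 //       } else {
 //         --Count;
 //       }
 //     }
 //     return Candidate; // The majority element, if one exists.
 //   }
 // As in the loop above, the surviving count is only a conservative estimate
 // of how often the candidate occurs unless a second pass verifies it.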
3156
3157 /// Go through the instructions in VL and append their operands.
3158 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3159 const InstructionsState &S) {
3160 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3161 assert((empty() || all_of(Operands,
3162 [this](const ValueList &VL) {
3163 return VL.size() == getNumLanes();
3164 })) &&
3165 "Expected same number of lanes");
3166 assert(S.valid() && "InstructionsState is invalid.");
3167 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3168 // arguments to the intrinsic produces the same result.
3169 Instruction *MainOp = S.getMainOp();
3170 unsigned NumOperands = MainOp->getNumOperands();
3171 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3172 OpsVec.resize(ArgSize);
3173 unsigned NumLanes = VL.size();
3174 for (OperandDataVec &Ops : OpsVec)
3175 Ops.resize(NumLanes);
3176 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3177 // Our tree has just 3 nodes: the root and two operands.
3178 // It is therefore trivial to get the APO. We only need to check the
3179 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3180 // operand. The LHS operand of both add and sub is never attached to an
3181 // inverse operation in the linearized form, therefore its APO is
3182 // false. The APO of the RHS is true only if V is an inverse operation.
3183
3184 // Since operand reordering is performed on groups of commutative
3185 // operations or alternating sequences (e.g., +, -), we can safely tell
3186 // the inverse operations by checking commutativity.
3187 auto *I = dyn_cast<Instruction>(VL[Lane]);
3188 if (!I && isa<PoisonValue>(VL[Lane])) {
3189 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3190 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3191 continue;
3192 }
3193 bool IsInverseOperation = false;
3194 if (S.isCopyableElement(VL[Lane])) {
3195 // The value is a copyable element.
3196 IsInverseOperation =
3197 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3198 } else {
3199 assert(I && "Expected instruction");
3200 auto [SelectedOp, Ops] = convertTo(I, S);
3201 // We cannot check commutativity by the converted instruction
3202 // (SelectedOp) because isCommutative also examines def-use
3203 // relationships.
3204 IsInverseOperation = !isCommutative(SelectedOp, I);
3205 }
3206 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3207 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3208 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3209 }
3210 }
3211 }
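 // For example (hypothetical scalars), for the alternating sequence
 //   Lane 0: a0 = b0 + c0
 //   Lane 1: a1 = b1 - c1
 // the add is commutative, so both of its operands get APO == false, while
 // for the non-commutative sub only the LHS (b1) gets APO == false and the
 // RHS (c1) gets APO == true. getBestOperand() later refuses to move an
 // operand into a position with a different APO, which preserves the +/-
 // semantics of the linearized form.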
3212
3213 /// \returns the number of operands.
3214 unsigned getNumOperands() const { return ArgSize; }
3215
3216 /// \returns the number of lanes.
3217 unsigned getNumLanes() const { return OpsVec[0].size(); }
3218
3219 /// \returns the operand value at \p OpIdx and \p Lane.
3220 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3221 return getData(OpIdx, Lane).V;
3222 }
3223
3224 /// \returns true if the data structure is empty.
3225 bool empty() const { return OpsVec.empty(); }
3226
3227 /// Clears the data.
3228 void clear() { OpsVec.clear(); }
3229
3230 /// \Returns true if there are enough operands identical to \p Op to fill
3231 /// the whole vector (possibly mixed with constants or loop-invariant values).
3232 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3233 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3234 assert(Op == getValue(OpIdx, Lane) &&
3235 "Op is expected to be getValue(OpIdx, Lane).");
3236 // Small number of loads - try load matching.
3237 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3238 return false;
3239 bool OpAPO = getData(OpIdx, Lane).APO;
3240 bool IsInvariant = L && L->isLoopInvariant(Op);
3241 unsigned Cnt = 0;
3242 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3243 if (Ln == Lane)
3244 continue;
3245 // This is set to true if we found a candidate for broadcast at Lane.
3246 bool FoundCandidate = false;
3247 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3248 OperandData &Data = getData(OpI, Ln);
3249 if (Data.APO != OpAPO || Data.IsUsed)
3250 continue;
3251 Value *OpILane = getValue(OpI, Lane);
3252 bool IsConstantOp = isa<Constant>(OpILane);
3253 // Consider the broadcast candidate if:
3254 // 1. Same value is found in one of the operands.
3255 if (Data.V == Op ||
3256 // 2. The operand in the given lane is not constant but there is a
3257 // constant operand in another lane (which can be moved to the
3258 // given lane). In this case we can represent it as a simple
3259 // permutation of constant and broadcast.
3260 (!IsConstantOp &&
3261 ((Lns > 2 && isa<Constant>(Data.V)) ||
3262 // 2.1. If we have only 2 lanes, need to check that value in the
3263 // next lane does not build same opcode sequence.
3264 (Lns == 2 &&
3265 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3266 isa<Constant>(Data.V)))) ||
3267 // 3. The operand in the current lane is loop invariant (can be
3268 // hoisted out) and another operand is also a loop invariant
3269 // (though not a constant). In this case the whole vector can be
3270 // hoisted out.
3271 // FIXME: need to teach the cost model about this case for better
3272 // estimation.
3273 (IsInvariant && !isa<Constant>(Data.V) &&
3274 !getSameOpcode({Op, Data.V}, TLI) &&
3275 L->isLoopInvariant(Data.V))) {
3276 FoundCandidate = true;
3277 Data.IsUsed = Data.V == Op;
3278 if (Data.V == Op)
3279 ++Cnt;
3280 break;
3281 }
3282 }
3283 if (!FoundCandidate)
3284 return false;
3285 }
3286 return getNumLanes() == 2 || Cnt > 1;
3287 }
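 // For illustration (hypothetical 4-lane example): if the operand position
 // being queried holds
 //   Lane 0: x   Lane 1: 7   Lane 2: x   Lane 3: x
 // and no other position in those lanes matches first, shouldBroadcast(x, ...)
 // from Lane 0 succeeds: Lanes 2 and 3 contribute the same value x (Cnt > 1)
 // and Lane 1 offers a constant that can be moved to another position, so the
 // operand can be emitted as a broadcast of x plus a cheap constant
 // permutation.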
3288
3289 /// Checks if there is at least a single operand in lanes other than
3290 /// \p Lane that is compatible with the operand \p Op.
3291 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3292 assert(Op == getValue(OpIdx, Lane) &&
3293 "Op is expected to be getValue(OpIdx, Lane).");
3294 bool OpAPO = getData(OpIdx, Lane).APO;
3295 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3296 if (Ln == Lane)
3297 continue;
3298 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3299 const OperandData &Data = getData(OpI, Ln);
3300 if (Data.APO != OpAPO || Data.IsUsed)
3301 return true;
3302 Value *OpILn = getValue(OpI, Ln);
3303 return (L && L->isLoopInvariant(OpILn)) ||
3304 (getSameOpcode({Op, OpILn}, TLI) &&
3305 allSameBlock({Op, OpILn}));
3306 }))
3307 return true;
3308 }
3309 return false;
3310 }
3311
3312 public:
3313 /// Initialize with all the operands of the instruction vector \p RootVL.
3314 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3315 const InstructionsState &S, const BoUpSLP &R)
3316 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3317 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3318 // Append all the operands of RootVL.
3319 appendOperands(RootVL, Operands, S);
3320 }
3321
3322 /// \Returns a value vector with the operands across all lanes for the
3323 /// operand at \p OpIdx.
3324 ValueList getVL(unsigned OpIdx) const {
3325 ValueList OpVL(OpsVec[OpIdx].size());
3326 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3327 "Expected same num of lanes across all operands");
3328 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3329 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3330 return OpVL;
3331 }
3332
3333 // Performs operand reordering for 2 or more operands.
3334 // The original operands are in OpsVec[OpIdx][Lane].
3335 // The reordered operands are written back into OpsVec[OpIdx][Lane].
3336 void reorder() {
3337 unsigned NumOperands = getNumOperands();
3338 unsigned NumLanes = getNumLanes();
3339 // Each operand has its own mode. We are using this mode to help us select
3340 // the instructions for each lane, so that they match best with the ones
3341 // we have selected so far.
3342 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3343
3344 // This is a greedy single-pass algorithm. We are going over each lane
3345 // once and deciding on the best order right away with no back-tracking.
3346 // However, in order to increase its effectiveness, we start with the lane
3347 // that has operands that can move the least. For example, given the
3348 // following lanes:
3349 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3350 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3351 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3352 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3353 // we will start at Lane 1, since the operands of the subtraction cannot
3354 // be reordered. Then we will visit the rest of the lanes in a circular
3355 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
3356
3357 // Find the first lane that we will start our search from.
3358 unsigned FirstLane = getBestLaneToStartReordering();
3359
3360 // Initialize the modes.
3361 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3362 Value *OpLane0 = getValue(OpIdx, FirstLane);
3363 // Keep track if we have instructions with all the same opcode on one
3364 // side.
3365 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3366 // Check if OpLane0 should be broadcast.
3367 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3368 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3369 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3370 else if (isa<LoadInst>(OpILane0))
3371 ReorderingModes[OpIdx] = ReorderingMode::Load;
3372 else
3373 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3374 } else if (isa<Constant>(OpLane0)) {
3375 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3376 } else if (isa<Argument>(OpLane0)) {
3377 // Our best hope is a Splat. It may save some cost in some cases.
3378 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3379 } else {
3380 llvm_unreachable("Unexpected value kind.");
3381 }
3382 }
3383
3384 // Check that we don't have the same operands in all lanes. There is no need
3385 // to reorder if the operands are a perfect diamond or shuffled diamond
3386 // match. Skip this only for possible broadcasts or a non-power-of-2 number
3387 // of scalars (just for now).
3388 auto &&SkipReordering = [this]() {
3389 SmallPtrSet<Value *, 4> UniqueValues;
3390 ArrayRef<OperandData> Op0 = OpsVec.front();
3391 for (const OperandData &Data : Op0)
3392 UniqueValues.insert(Data.V);
3393 for (ArrayRef<OperandData> Op :
3394 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3395 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3396 return !UniqueValues.contains(Data.V);
3397 }))
3398 return false;
3399 }
3400 // TODO: Check if we can remove a check for non-power-2 number of
3401 // scalars after full support of non-power-2 vectorization.
3402 return UniqueValues.size() != 2 &&
3403 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3404 UniqueValues.size());
3405 };
3406
3407 // If the initial strategy fails for any of the operand indexes, then we
3408 // perform reordering again in a second pass. This helps avoid assigning
3409 // high priority to the failed strategy, and should improve reordering for
3410 // the non-failed operand indexes.
3411 for (int Pass = 0; Pass != 2; ++Pass) {
3412 // Check if there is no need to reorder operands since they are a perfect or
3413 // shuffled diamond match.
3414 // Need to do it to avoid extra external use cost counting for
3415 // shuffled matches, which may cause regressions.
3416 if (SkipReordering())
3417 break;
3418 // Skip the second pass if the first pass did not fail.
3419 bool StrategyFailed = false;
3420 // Mark all operand data as free to use.
3421 clearUsed();
3422 // We keep the original operand order for the FirstLane, so reorder the
3423 // rest of the lanes. We are visiting the nodes in a circular fashion,
3424 // using FirstLane as the center point and increasing the radius
3425 // distance.
3426 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3427 for (unsigned I = 0; I < NumOperands; ++I)
3428 MainAltOps[I].push_back(getData(I, FirstLane).V);
3429
3430 SmallBitVector UsedLanes(NumLanes);
3431 UsedLanes.set(FirstLane);
3432 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3433 // Visit the lane on the right and then the lane on the left.
3434 for (int Direction : {+1, -1}) {
3435 int Lane = FirstLane + Direction * Distance;
3436 if (Lane < 0 || Lane >= (int)NumLanes)
3437 continue;
3438 UsedLanes.set(Lane);
3439 int LastLane = Lane - Direction;
3440 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3441 "Out of bounds");
3442 // Look for a good match for each operand.
3443 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3444 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3445 std::optional<unsigned> BestIdx =
3446 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3447 MainAltOps[OpIdx], UsedLanes);
3448 // By not selecting a value, we allow the operands that follow to
3449 // select a better matching value. We will get a non-null value in
3450 // the next run of getBestOperand().
3451 if (BestIdx) {
3452 // Swap the current operand with the one returned by
3453 // getBestOperand().
3454 swap(OpIdx, *BestIdx, Lane);
3455 } else {
3456 // Enable the second pass.
3457 StrategyFailed = true;
3458 }
3459 // Try to get the alternate opcode and follow it during analysis.
3460 if (MainAltOps[OpIdx].size() != 2) {
3461 OperandData &AltOp = getData(OpIdx, Lane);
3462 InstructionsState OpS =
3463 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3464 if (OpS && OpS.isAltShuffle())
3465 MainAltOps[OpIdx].push_back(AltOp.V);
3466 }
3467 }
3468 }
3469 }
3470 // Skip second pass if the strategy did not fail.
3471 if (!StrategyFailed)
3472 break;
3473 }
3474 }
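 // The lane visitation order used above can be reproduced with a small
 // standalone sketch (hypothetical helper, not part of this class):
 //   SmallVector<unsigned> circularOrder(unsigned FirstLane, unsigned NumLanes) {
 //     SmallVector<unsigned> Order(1, FirstLane);
 //     for (unsigned Distance = 1; Distance != NumLanes; ++Distance)
 //       for (int Direction : {+1, -1}) {
 //         int Lane = int(FirstLane) + Direction * int(Distance);
 //         if (Lane >= 0 && Lane < int(NumLanes))
 //           Order.push_back(Lane);
 //       }
 //     return Order;
 //   }
 // For example, circularOrder(1, 4) yields {1, 2, 0, 3}, matching the
 // "Lane 1, Lane 2, Lane 0, Lane 3" walk described at the top of reorder().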
3475
3476#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3477 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3478 switch (RMode) {
3479 case ReorderingMode::Load:
3480 return "Load";
3481 case ReorderingMode::Opcode:
3482 return "Opcode";
3483 case ReorderingMode::Constant:
3484 return "Constant";
3485 case ReorderingMode::Splat:
3486 return "Splat";
3487 case ReorderingMode::Failed:
3488 return "Failed";
3489 }
3490 llvm_unreachable("Unimplemented Reordering Type");
3491 }
3492
3493 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3494 raw_ostream &OS) {
3495 return OS << getModeStr(RMode);
3496 }
3497
3498 /// Debug print.
3499 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3500 printMode(RMode, dbgs());
3501 }
3502
3503 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3504 return printMode(RMode, OS);
3505 }
3506
3507 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3508 const unsigned Indent = 2;
3509 unsigned Cnt = 0;
3510 for (const OperandDataVec &OpDataVec : OpsVec) {
3511 OS << "Operand " << Cnt++ << "\n";
3512 for (const OperandData &OpData : OpDataVec) {
3513 OS.indent(Indent) << "{";
3514 if (Value *V = OpData.V)
3515 OS << *V;
3516 else
3517 OS << "null";
3518 OS << ", APO:" << OpData.APO << "}\n";
3519 }
3520 OS << "\n";
3521 }
3522 return OS;
3523 }
3524
3525 /// Debug print.
3526 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3527#endif
3528 };
3529
3530 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3531 /// of the pair with the highest score, deemed to have the best chance to form
3532 /// the root of a profitable tree to vectorize. Return std::nullopt if no
3533 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
3534 /// limit of the cost, considered to be a good enough score.
3535 std::optional<int>
3536 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3537 int Limit = LookAheadHeuristics::ScoreFail) const {
3538 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3539 RootLookAheadMaxDepth);
3540 int BestScore = Limit;
3541 std::optional<int> Index;
3542 for (int I : seq<int>(0, Candidates.size())) {
3543 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3544 Candidates[I].second,
3545 /*U1=*/nullptr, /*U2=*/nullptr,
3546 /*CurrLevel=*/1, {});
3547 if (Score > BestScore) {
3548 BestScore = Score;
3549 Index = I;
3550 }
3551 }
3552 return Index;
3553 }
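 // A typical (hypothetical) use: given candidate root pairs gathered from two
 // scalar chains, pick the pair most likely to seed a profitable tree:
 //   SmallVector<std::pair<Value *, Value *>> Candidates = {{A0, B0}, {A1, B1}};
 //   if (std::optional<int> Best = R.findBestRootPair(Candidates))
 //     ... vectorize starting from Candidates[*Best] ...   // illustrative only
 // Pairs scoring at or below \p Limit (ScoreFail by default) are rejected.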
3554
3555 /// Checks if the instruction is marked for deletion.
3556 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3557
3558 /// Removes an instruction from its block and eventually deletes it.
3559 /// It's like Instruction::eraseFromParent() except that the actual deletion
3560 /// is delayed until BoUpSLP is destructed.
3561 void eraseInstruction(Instruction *I) {
3562 DeletedInstructions.insert(I);
3563 }
3564
3565 /// Remove instructions from the parent function and clear the operands of \p
3566 /// DeadVals instructions, marking for deletion trivially dead operands.
3567 template <typename T>
3568 void removeInstructionsAndOperands(
3569 ArrayRef<T *> DeadVals,
3570 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3571 SmallVector<WeakTrackingVH> DeadInsts;
3572 for (T *V : DeadVals) {
3573 auto *I = cast<Instruction>(V);
3574 eraseInstruction(I);
3575 }
3576 DenseSet<Value *> Processed;
3577 for (T *V : DeadVals) {
3578 if (!V || !Processed.insert(V).second)
3579 continue;
3580 auto *I = cast<Instruction>(V);
3582 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3583 for (Use &U : I->operands()) {
3584 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3585 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3586 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3587 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3588 return Entry->VectorizedValue == OpI;
3589 })))
3590 DeadInsts.push_back(OpI);
3591 }
3592 I->dropAllReferences();
3593 }
3594 for (T *V : DeadVals) {
3595 auto *I = cast<Instruction>(V);
3596 if (!I->getParent())
3597 continue;
3598 assert((I->use_empty() || all_of(I->uses(),
3599 [&](Use &U) {
3600 return isDeleted(
3601 cast<Instruction>(U.getUser()));
3602 })) &&
3603 "trying to erase instruction with users.");
3604 I->removeFromParent();
3605 SE->forgetValue(I);
3606 }
3607 // Process the dead instruction list until empty.
3608 while (!DeadInsts.empty()) {
3609 Value *V = DeadInsts.pop_back_val();
3610 Instruction *VI = cast_or_null<Instruction>(V);
3611 if (!VI || !VI->getParent())
3612 continue;
3613 assert(isInstructionTriviallyDead(VI, TLI) &&
3614 "Live instruction found in dead worklist!");
3615 assert(VI->use_empty() && "Instructions with uses are not dead.");
3616
3617 // Don't lose the debug info while deleting the instructions.
3618 salvageDebugInfo(*VI);
3619
3620 // Null out all of the instruction's operands to see if any operand
3621 // becomes dead as we go.
3622 for (Use &OpU : VI->operands()) {
3623 Value *OpV = OpU.get();
3624 if (!OpV)
3625 continue;
3626 OpU.set(nullptr);
3627
3628 if (!OpV->use_empty())
3629 continue;
3630
3631 // If the operand is an instruction that became dead as we nulled out
3632 // the operand, and if it is 'trivially' dead, delete it in a future
3633 // loop iteration.
3634 if (auto *OpI = dyn_cast<Instruction>(OpV))
3635 if (!DeletedInstructions.contains(OpI) &&
3636 (!OpI->getType()->isVectorTy() ||
3637 none_of(VectorValuesAndScales,
3638 [&](const std::tuple<Value *, unsigned, bool> &V) {
3639 return std::get<0>(V) == OpI;
3640 })) &&
3641 isInstructionTriviallyDead(OpI, TLI))
3642 DeadInsts.push_back(OpI);
3643 }
3644
3645 VI->removeFromParent();
3646 eraseInstruction(VI);
3647 SE->forgetValue(VI);
3648 }
3649 }
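 // The worklist loop above follows the usual recursive trivial-DCE pattern
 // (compare RecursivelyDeleteTriviallyDeadInstructions in Transforms/Utils):
 // dropping one dead instruction may leave its operands with no remaining
 // uses, so they are queued and erased on a later iteration. For example
 // (hypothetical IR), erasing the scalar add below eventually erases the two
 // loads as well, once their only use is gone:
 //   %x = load i32, ptr %p
 //   %y = load i32, ptr %q
 //   %a = add i32 %x, %y   ; in DeadVals -> %x and %y become trivially dead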
3650
3651 /// Checks if the instruction was already analyzed for being possible
3652 /// reduction root.
3653 bool isAnalyzedReductionRoot(Instruction *I) const {
3654 return AnalyzedReductionsRoots.count(I);
3655 }
3656 /// Register given instruction as already analyzed for being possible
3657 /// reduction root.
3658 void analyzedReductionRoot(Instruction *I) {
3659 AnalyzedReductionsRoots.insert(I);
3660 }
3661 /// Checks if the provided list of reduced values was checked already for
3662 /// vectorization.
3663 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3664 return AnalyzedReductionVals.contains(hash_value(VL));
3665 }
3666 /// Adds the list of reduced values to list of already checked values for the
3667 /// vectorization.
3668 void analyzedReductionVals(ArrayRef<Value *> VL) {
3669 AnalyzedReductionVals.insert(hash_value(VL));
3670 }
3671 /// Clear the list of the analyzed reduction root instructions.
3672 void clearReductionData() {
3673 AnalyzedReductionsRoots.clear();
3674 AnalyzedReductionVals.clear();
3675 AnalyzedMinBWVals.clear();
3676 }
3677 /// Checks if the given value is gathered in one of the nodes.
3678 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3679 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3680 }
3681 /// Checks if the given value is gathered in one of the nodes.
3682 bool isGathered(const Value *V) const {
3683 return MustGather.contains(V);
3684 }
3685 /// Checks if the specified value was not scheduled.
3686 bool isNotScheduled(const Value *V) const {
3687 return NonScheduledFirst.contains(V);
3688 }
3689
3690 /// Check if the value is vectorized in the tree.
3691 bool isVectorized(const Value *V) const {
3692 assert(V && "V cannot be nullptr.");
3693 return ScalarToTreeEntries.contains(V);
3694 }
3695
3696 ~BoUpSLP();
3697
3698private:
3699 /// Determine if a node \p E can be demoted to a smaller type with a
3700 /// truncation. We collect the entries that will be demoted in ToDemote.
3701 /// \param E Node for analysis
3702 /// \param ToDemote indices of the nodes to be demoted.
3703 bool collectValuesToDemote(
3704 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3705 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3706 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3707 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3708
3709 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3710 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3711 /// they have only one user and are reorderable).
3712 /// \param ReorderableGathers List of all gather nodes that require reordering
3713 /// (e.g., gathers of extractelements or partially vectorizable loads).
3714 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3715 /// reordering, subset of \p NonVectorized.
3716 void buildReorderableOperands(
3717 TreeEntry *UserTE,
3718 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3719 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3720 SmallVectorImpl<TreeEntry *> &GatherOps);
3721
3722 /// Checks if the given \p TE is a gather node with clustered reused scalars
3723 /// and reorders it per given \p Mask.
3724 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3725
3726 /// Checks if all users of \p I are the part of the vectorization tree.
3727 bool areAllUsersVectorized(
3728 Instruction *I,
3729 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3730
3731 /// Return information about the vector formed for the specified index
3732 /// of a vector of (the same) instruction.
3733 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3734
3735 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3736 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3737 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3738 return const_cast<TreeEntry *>(
3739 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3740 }
3741
3742 /// Gets the root instruction for the given node. If the node is a strided
3743 /// load/store node with the reverse order, the root instruction is the last
3744 /// one.
3745 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3746
3747 /// \returns Cast context for the given graph node.
3748 TargetTransformInfo::CastContextHint
3749 getCastContextHint(const TreeEntry &TE) const;
3750
3751 /// \returns the cost of the vectorizable entry.
3752 InstructionCost getEntryCost(const TreeEntry *E,
3753 ArrayRef<Value *> VectorizedVals,
3754 SmallPtrSetImpl<Value *> &CheckedExtracts);
3755
3756 /// Checks if it is legal and profitable to build SplitVectorize node for the
3757 /// given \p VL.
3758 /// \param Op1 first homogeneous scalars.
3759 /// \param Op2 second homogeneous scalars.
3760 /// \param ReorderIndices indices to reorder the scalars.
3761 /// \returns true if the node was successfully built.
3762 bool canBuildSplitNode(ArrayRef<Value *> VL,
3763 const InstructionsState &LocalState,
3764 SmallVectorImpl<Value *> &Op1,
3765 SmallVectorImpl<Value *> &Op2,
3766 OrdersType &ReorderIndices) const;
3767
3768 /// This is the recursive part of buildTree.
3769 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3770 unsigned InterleaveFactor = 0);
3771
3772 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3773 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3774 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3775 /// returns false, setting \p CurrentOrder to either an empty vector or a
3776 /// non-identity permutation that allows to reuse extract instructions.
3777 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3778 /// extract order.
3779 bool canReuseExtract(ArrayRef<Value *> VL,
3780 SmallVectorImpl<unsigned> &CurrentOrder,
3781 bool ResizeAllowed = false) const;
3782
3783 /// Vectorize a single entry in the tree.
3784 Value *vectorizeTree(TreeEntry *E);
3785
3786 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3787 /// \p E.
3788 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3789
3790 /// Create a new vector from a list of scalar values. Produces a sequence
3791 /// which exploits values reused across lanes, and arranges the inserts
3792 /// for ease of later optimization.
3793 template <typename BVTy, typename ResTy, typename... Args>
3794 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3795
3796 /// Create a new vector from a list of scalar values. Produces a sequence
3797 /// which exploits values reused across lanes, and arranges the inserts
3798 /// for ease of later optimization.
3799 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3800
3801 /// Returns the instruction in the bundle, which can be used as a base point
3802 /// for scheduling. Usually it is the last instruction in the bundle, except
3803 /// for the case when all operands are external (in this case, it is the first
3804 /// instruction in the list).
3805 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3806
3807 /// Tries to find extractelement instructions with constant indices from fixed
3808 /// vector type and gather such instructions into a bunch, which is highly
3809 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3810 /// was successful, the matched scalars are replaced by poison values in \p VL
3811 /// for future analysis.
3812 std::optional<TargetTransformInfo::ShuffleKind>
3813 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3814 SmallVectorImpl<int> &Mask) const;
3815
3816 /// Tries to find extractelement instructions with constant indices from fixed
3817 /// vector type and gather such instructions into a bunch, which is highly
3818 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3819 /// was successful, the matched scalars are replaced by poison values in \p VL
3820 /// for future analysis.
3821 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3822 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3823 SmallVectorImpl<int> &Mask,
3824 unsigned NumParts) const;
3825
3826 /// Checks if the gathered \p VL can be represented as a single register
3827 /// shuffle(s) of previous tree entries.
3828 /// \param TE Tree entry checked for permutation.
3829 /// \param VL List of scalars (a subset of the TE scalars), checked for
3830 /// permutations. Must form single-register vector.
3831 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3832 /// commands to build the mask using the original vector value, without
3833 /// relying on the potential reordering.
3834 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3835 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3836 std::optional<TargetTransformInfo::ShuffleKind>
3837 isGatherShuffledSingleRegisterEntry(
3838 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3839 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3840 bool ForOrder);
3841
3842 /// Checks if the gathered \p VL can be represented as multi-register
3843 /// shuffle(s) of previous tree entries.
3844 /// \param TE Tree entry checked for permutation.
3845 /// \param VL List of scalars (a subset of the TE scalars), checked for
3846 /// permutations.
3847 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3848 /// commands to build the mask using the original vector value, without
3849 /// relying on the potential reordering.
3850 /// \returns per-register series of ShuffleKind, if gathered values can be
3851 /// represented as shuffles of previous tree entries. \p Mask is filled with
3852 /// the shuffle mask (also on per-register base).
3853 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3854 isGatherShuffledEntry(
3855 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3856 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3857 unsigned NumParts, bool ForOrder = false);
3858
3859 /// \returns the cost of gathering (inserting) the values in \p VL into a
3860 /// vector.
3861 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3862 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3863 Type *ScalarTy) const;
3864
3865 /// Set the Builder insert point to one after the last instruction in
3866 /// the bundle
3867 void setInsertPointAfterBundle(const TreeEntry *E);
3868
3869 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3870 /// specified, the starting vector value is poison.
3871 Value *
3872 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3873 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3874
3875 /// \returns whether the VectorizableTree is fully vectorizable and will
3876 /// be beneficial even when the tree height is tiny.
3877 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3878
3879 /// Run through the list of all gathered loads in the graph and try to find
3880 /// vector loads/masked gathers instead of regular gathers. Later these loads
3881 /// are reshuffled to build the final gathered nodes.
3882 void tryToVectorizeGatheredLoads(
3883 const SmallMapVector<
3884 std::tuple<BasicBlock *, Value *, Type *>,
3885 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3886 &GatheredLoads);
3887
3888 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3889 /// users of \p TE and collects the stores. It returns the map from the store
3890 /// pointers to the collected stores.
3892 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3893
3894 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3895 /// stores in \p StoresVec can form a vector instruction. If so it returns
3896 /// true and populates \p ReorderIndices with the shuffle indices of the
3897 /// stores when compared to the sorted vector.
3898 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3899 OrdersType &ReorderIndices) const;
3900
3901 /// Iterates through the users of \p TE, looking for scalar stores that can be
3902 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3903 /// their order and builds an order index vector for each store bundle. It
3904 /// returns all these order vectors found.
3905 /// We run this after the tree has formed, otherwise we may come across user
3906 /// instructions that are not yet in the tree.
3907 SmallVector<OrdersType, 1>
3908 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3909
3910 /// Tries to reorder the gathering node for better vectorization
3911 /// opportunities.
3912 void reorderGatherNode(TreeEntry &TE);
3913
3914 class TreeEntry {
3915 public:
3916 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3917 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3918
3919 /// \returns Common mask for reorder indices and reused scalars.
3920 SmallVector<int> getCommonMask() const {
3921 if (State == TreeEntry::SplitVectorize)
3922 return {};
3923 SmallVector<int> Mask;
3924 inversePermutation(ReorderIndices, Mask);
3925 ::addMask(Mask, ReuseShuffleIndices);
3926 return Mask;
3927 }
3928
3929 /// \returns The mask for split nodes.
3930 SmallVector<int> getSplitMask() const {
3931 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3932 "Expected only split vectorize node.");
3933 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3934 unsigned CommonVF = std::max<unsigned>(
3935 CombinedEntriesWithIndices.back().second,
3936 Scalars.size() - CombinedEntriesWithIndices.back().second);
3937 for (auto [Idx, I] : enumerate(ReorderIndices))
3938 Mask[I] =
3939 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3940 ? CommonVF - CombinedEntriesWithIndices.back().second
3941 : 0);
3942 return Mask;
3943 }
3944
3945 /// Updates (reorders) SplitVectorize node according to the given mask \p
3946 /// Mask and order \p MaskOrder.
3947 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3948 ArrayRef<int> MaskOrder);
3949
3950 /// \returns true if the scalars in VL are equal to this entry.
3951 bool isSame(ArrayRef<Value *> VL) const {
3952 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3953 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3954 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3955 return VL.size() == Mask.size() &&
3956 std::equal(VL.begin(), VL.end(), Mask.begin(),
3957 [Scalars](Value *V, int Idx) {
3958 return (isa<UndefValue>(V) &&
3959 Idx == PoisonMaskElem) ||
3960 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3961 });
3962 };
3963 if (!ReorderIndices.empty()) {
3964 // TODO: implement matching if the nodes are just reordered, still can
3965 // treat the vector as the same if the list of scalars matches VL
3966 // directly, without reordering.
3967 SmallVector<int> Mask;
3968 inversePermutation(ReorderIndices, Mask);
3969 if (VL.size() == Scalars.size())
3970 return IsSame(Scalars, Mask);
3971 if (VL.size() == ReuseShuffleIndices.size()) {
3972 ::addMask(Mask, ReuseShuffleIndices);
3973 return IsSame(Scalars, Mask);
3974 }
3975 return false;
3976 }
3977 return IsSame(Scalars, ReuseShuffleIndices);
3978 }
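 // For example (hypothetical entry): with Scalars = {a, b, c, d} and
 // ReorderIndices = {2, 3, 0, 1}, inversePermutation() produces Mask =
 // {2, 3, 0, 1}, and the lambda compares VL[i] against Scalars[Mask[i]], so
 // VL = {c, d, a, b} is recognized as the same entry. If ReuseShuffleIndices
 // is also set, the mask is remapped through it first via ::addMask().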
3979
3980 /// \returns true if current entry has same operands as \p TE.
3981 bool hasEqualOperands(const TreeEntry &TE) const {
3982 if (TE.getNumOperands() != getNumOperands())
3983 return false;
3984 SmallBitVector Used(getNumOperands());
3985 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3986 unsigned PrevCount = Used.count();
3987 for (unsigned K = 0; K < E; ++K) {
3988 if (Used.test(K))
3989 continue;
3990 if (getOperand(K) == TE.getOperand(I)) {
3991 Used.set(K);
3992 break;
3993 }
3994 }
3995 // Check if we actually found the matching operand.
3996 if (PrevCount == Used.count())
3997 return false;
3998 }
3999 return true;
4000 }
4001
4002 /// \return Final vectorization factor for the node. Defined by the total
4003 /// number of vectorized scalars, including those used several times in the
4004 /// entry and counted in the \a ReuseShuffleIndices, if any.
4005 unsigned getVectorFactor() const {
4006 if (!ReuseShuffleIndices.empty())
4007 return ReuseShuffleIndices.size();
4008 return Scalars.size();
4009 };
4010
4011 /// Checks if the current node is a gather node.
4012 bool isGather() const { return State == NeedToGather; }
4013
4014 /// A vector of scalars.
4015 ValueList Scalars;
4016
4017 /// The Scalars are vectorized into this value. It is initialized to Null.
4018 WeakTrackingVH VectorizedValue = nullptr;
4019
4020 /// Do we need to gather this sequence or vectorize it
4021 /// (either with vector instruction or with scatter/gather
4022 /// intrinsics for store/load)?
4023 enum EntryState {
4024 Vectorize, ///< The node is regularly vectorized.
4025 ScatterVectorize, ///< Masked scatter/gather node.
4026 StridedVectorize, ///< Strided loads (and stores)
4027 CompressVectorize, ///< (Masked) load with compress.
4028 NeedToGather, ///< Gather/buildvector node.
4029 CombinedVectorize, ///< Vectorized node, combined with its user into more
4030 ///< complex node like select/cmp to minmax, mul/add to
4031 ///< fma, etc. Must be used for the following nodes in
4032 ///< the pattern, not the very first one.
4033 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4034 ///< independently and then combines back.
4035 };
4036 EntryState State;
4037
4038 /// List of combined opcodes supported by the vectorizer.
4039 enum CombinedOpcode {
4040 NotCombinedOp = -1,
4041 MinMax = Instruction::OtherOpsEnd + 1,
4042 FMulAdd,
4043 };
4044 CombinedOpcode CombinedOp = NotCombinedOp;
4045
4046 /// Does this sequence require some shuffling?
4047 SmallVector<int, 4> ReuseShuffleIndices;
4048
4049 /// Does this entry require reordering?
4050 SmallVector<unsigned, 4> ReorderIndices;
4051
4052 /// Points back to the VectorizableTree.
4053 ///
4054 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4055 /// to be a pointer and needs to be able to initialize the child iterator.
4056 /// Thus we need a reference back to the container to translate the indices
4057 /// to entries.
4058 VecTreeTy &Container;
4059
4060 /// The TreeEntry index containing the user of this entry.
4061 EdgeInfo UserTreeIndex;
4062
4063 /// The index of this treeEntry in VectorizableTree.
4064 unsigned Idx = 0;
4065
4066 /// For gather/buildvector/alt opcode nodes, which are combined from
4067 /// other nodes as a series of insertvector instructions.
4068 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4069
4070 private:
4071 /// The operands of each instruction in each lane Operands[op_index][lane].
4072 /// Note: This helps avoid the replication of the code that performs the
4073 /// reordering of operands during buildTreeRec() and vectorizeTree().
4074 SmallVector<ValueList, 2> Operands;
4075
4076 /// Copyable elements of the entry node.
4077 SmallPtrSet<const Value *, 4> CopyableElements;
4078
4079 /// MainOp and AltOp are recorded inside. S should be obtained from
4080 /// newTreeEntry.
4081 InstructionsState S = InstructionsState::invalid();
4082
4083 /// Interleaving factor for interleaved loads Vectorize nodes.
4084 unsigned InterleaveFactor = 0;
4085
4086 /// True if the node does not require scheduling.
4087 bool DoesNotNeedToSchedule = false;
4088
4089 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4090 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4091 if (Operands.size() < OpIdx + 1)
4092 Operands.resize(OpIdx + 1);
4093 assert(Operands[OpIdx].empty() && "Already resized?");
4094 assert(OpVL.size() <= Scalars.size() &&
4095 "Number of operands is greater than the number of scalars.");
4096 Operands[OpIdx].resize(OpVL.size());
4097 copy(OpVL, Operands[OpIdx].begin());
4098 }
4099
4100 public:
4101 /// Returns interleave factor for interleave nodes.
4102 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4103 /// Sets interleaving factor for the interleaving nodes.
4104 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4105
4106 /// Marks the node as one that does not require scheduling.
4107 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4108 /// Returns true if the node is marked as one that does not require
4109 /// scheduling.
4110 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4111
4112 /// Set this bundle's operands from \p Operands.
4113 void setOperands(ArrayRef<ValueList> Operands) {
4114 for (unsigned I : seq<unsigned>(Operands.size()))
4115 setOperand(I, Operands[I]);
4116 }
4117
4118 /// Reorders operands of the node to the given mask \p Mask.
4119 void reorderOperands(ArrayRef<int> Mask) {
4120 for (ValueList &Operand : Operands)
4121 reorderScalars(Operand, Mask);
4122 }
4123
4124 /// \returns the \p OpIdx operand of this TreeEntry.
4125 ValueList &getOperand(unsigned OpIdx) {
4126 assert(OpIdx < Operands.size() && "Off bounds");
4127 return Operands[OpIdx];
4128 }
4129
4130 /// \returns the \p OpIdx operand of this TreeEntry.
4131 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4132 assert(OpIdx < Operands.size() && "Off bounds");
4133 return Operands[OpIdx];
4134 }
4135
4136 /// \returns the number of operands.
4137 unsigned getNumOperands() const { return Operands.size(); }
4138
4139 /// \return the single \p OpIdx operand.
4140 Value *getSingleOperand(unsigned OpIdx) const {
4141 assert(OpIdx < Operands.size() && "Off bounds");
4142 assert(!Operands[OpIdx].empty() && "No operand available");
4143 return Operands[OpIdx][0];
4144 }
4145
4146 /// Some of the instructions in the list have alternate opcodes.
4147 bool isAltShuffle() const { return S.isAltShuffle(); }
4148
4149 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4150 return S.getMatchingMainOpOrAltOp(I);
4151 }
4152
4153 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4154 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
4155 /// key is the main operation.
4156 Value *isOneOf(Value *Op) const {
4157 auto *I = dyn_cast<Instruction>(Op);
4158 if (I && getMatchingMainOpOrAltOp(I))
4159 return Op;
4160 return S.getMainOp();
4161 }
4162
4163 void setOperations(const InstructionsState &S) {
4164 assert(S && "InstructionsState is invalid.");
4165 this->S = S;
4166 }
4167
4168 Instruction *getMainOp() const { return S.getMainOp(); }
4169
4170 Instruction *getAltOp() const { return S.getAltOp(); }
4171
4172 /// The main/alternate opcodes for the list of instructions.
4173 unsigned getOpcode() const { return S.getOpcode(); }
4174
4175 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4176
4177 bool hasState() const { return S.valid(); }
4178
4179 /// Add \p V to the list of copyable elements.
4180 void addCopyableElement(Value *V) {
4181 assert(S.isCopyableElement(V) && "Not a copyable element.");
4182 CopyableElements.insert(V);
4183 }
4184
4185 /// Returns true if \p V is a copyable element.
4186 bool isCopyableElement(Value *V) const {
4187 return CopyableElements.contains(V);
4188 }
4189
4190 /// Returns true if any scalar in the list is a copyable element.
4191 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4192
4193 /// Returns the state of the operations.
4194 const InstructionsState &getOperations() const { return S; }
4195
4196 /// When ReuseShuffleIndices is empty it just returns the position of \p
4197 /// V within the vector of Scalars. Otherwise, tries to remap via its reuse index.
4198 unsigned findLaneForValue(Value *V) const {
4199 unsigned FoundLane = getVectorFactor();
4200 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4201 std::advance(It, 1)) {
4202 if (*It != V)
4203 continue;
4204 FoundLane = std::distance(Scalars.begin(), It);
4205 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4206 if (!ReorderIndices.empty())
4207 FoundLane = ReorderIndices[FoundLane];
4208 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4209 if (ReuseShuffleIndices.empty())
4210 break;
4211 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4212 RIt != ReuseShuffleIndices.end()) {
4213 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4214 break;
4215 }
4216 }
4217 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4218 return FoundLane;
4219 }
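 // For example (hypothetical entry): with Scalars = {a, b}, empty
 // ReorderIndices and ReuseShuffleIndices = {1, 0, 1, 0} (vector factor 4),
 // findLaneForValue(b) first locates b at position 1 of Scalars and then
 // remaps it through the reuse mask, returning 0, the first vector lane
 // whose reuse index reads element 1.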
4220
4221 /// Build a shuffle mask for graph entry which represents a merge of main
4222 /// and alternate operations.
4223 void
4224 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4225 SmallVectorImpl<int> &Mask,
4226 SmallVectorImpl<Value *> *OpScalars = nullptr,
4227 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4228
4229 /// Return true if this is a non-power-of-2 node.
4230 bool isNonPowOf2Vec() const {
4231 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4232 return IsNonPowerOf2;
4233 }
4234
4235 /// Return true if this node vectorizes a number of elements that does not
4236 /// form whole vector registers and is not a power of 2.
4237 bool
4238 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4239 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4240 TTI, getValueType(Scalars.front()), Scalars.size());
4241 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4242 "Reshuffling not supported with non-power-of-2 vectors yet.");
4243 return IsNonPowerOf2;
4244 }
4245
4246 Value *getOrdered(unsigned Idx) const {
4247 if (ReorderIndices.empty())
4248 return Scalars[Idx];
4249 SmallVector<int> Mask;
4250 inversePermutation(ReorderIndices, Mask);
4251 return Scalars[Mask[Idx]];
4252 }
4253
4254#ifndef NDEBUG
4255 /// Debug printer.
4256 LLVM_DUMP_METHOD void dump() const {
4257 dbgs() << Idx << ".\n";
4258 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4259 dbgs() << "Operand " << OpI << ":\n";
4260 for (const Value *V : Operands[OpI])
4261 dbgs().indent(2) << *V << "\n";
4262 }
4263 dbgs() << "Scalars: \n";
4264 for (Value *V : Scalars)
4265 dbgs().indent(2) << *V << "\n";
4266 dbgs() << "State: ";
4267 if (S && hasCopyableElements())
4268 dbgs() << "[[Copyable]] ";
4269 switch (State) {
4270 case Vectorize:
4271 if (InterleaveFactor > 0) {
4272 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4273 << "\n";
4274 } else {
4275 dbgs() << "Vectorize\n";
4276 }
4277 break;
4278 case ScatterVectorize:
4279 dbgs() << "ScatterVectorize\n";
4280 break;
4281 case StridedVectorize:
4282 dbgs() << "StridedVectorize\n";
4283 break;
4284 case CompressVectorize:
4285 dbgs() << "CompressVectorize\n";
4286 break;
4287 case NeedToGather:
4288 dbgs() << "NeedToGather\n";
4289 break;
4290 case CombinedVectorize:
4291 dbgs() << "CombinedVectorize\n";
4292 break;
4293 case SplitVectorize:
4294 dbgs() << "SplitVectorize\n";
4295 break;
4296 }
4297 if (S) {
4298 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4299 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4300 } else {
4301 dbgs() << "MainOp: NULL\n";
4302 dbgs() << "AltOp: NULL\n";
4303 }
4304 dbgs() << "VectorizedValue: ";
4305 if (VectorizedValue)
4306 dbgs() << *VectorizedValue << "\n";
4307 else
4308 dbgs() << "NULL\n";
4309 dbgs() << "ReuseShuffleIndices: ";
4310 if (ReuseShuffleIndices.empty())
4311 dbgs() << "Empty";
4312 else
4313 for (int ReuseIdx : ReuseShuffleIndices)
4314 dbgs() << ReuseIdx << ", ";
4315 dbgs() << "\n";
4316 dbgs() << "ReorderIndices: ";
4317 for (unsigned ReorderIdx : ReorderIndices)
4318 dbgs() << ReorderIdx << ", ";
4319 dbgs() << "\n";
4320 dbgs() << "UserTreeIndex: ";
4321 if (UserTreeIndex)
4322 dbgs() << UserTreeIndex;
4323 else
4324 dbgs() << "<invalid>";
4325 dbgs() << "\n";
4326 if (!CombinedEntriesWithIndices.empty()) {
4327 dbgs() << "Combined entries: ";
4328 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4329 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4330 });
4331 dbgs() << "\n";
4332 }
4333 }
4334#endif
4335 };
4336
4337#ifndef NDEBUG
4338 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4339 InstructionCost VecCost, InstructionCost ScalarCost,
4340 StringRef Banner) const {
4341 dbgs() << "SLP: " << Banner << ":\n";
4342 E->dump();
4343 dbgs() << "SLP: Costs:\n";
4344 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4345 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4346 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4347 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4348 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4349 }
4350#endif
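 // These cost dumps (and the TreeEntry dumps above) are only emitted in
 // builds with assertions or LLVM_ENABLE_DUMP, under the SLP debug type.
 // A typical invocation (illustrative; exact flags depend on the build) is:
 //   opt -passes=slp-vectorizer -debug-only=SLP -S input.ll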
4351
4352 /// Create a new gather TreeEntry
4353 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4354 const InstructionsState &S,
4355 const EdgeInfo &UserTreeIdx,
4356 ArrayRef<int> ReuseShuffleIndices = {}) {
4357 auto Invalid = ScheduleBundle::invalid();
4358 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4359 }
4360
4361 /// Create a new VectorizableTree entry.
4362 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4363 const InstructionsState &S,
4364 const EdgeInfo &UserTreeIdx,
4365 ArrayRef<int> ReuseShuffleIndices = {},
4366 ArrayRef<unsigned> ReorderIndices = {},
4367 unsigned InterleaveFactor = 0) {
4368 TreeEntry::EntryState EntryState =
4369 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4370 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4371 ReuseShuffleIndices, ReorderIndices);
4372 if (E && InterleaveFactor > 0)
4373 E->setInterleave(InterleaveFactor);
4374 return E;
4375 }
4376
4377 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4378 TreeEntry::EntryState EntryState,
4379 ScheduleBundle &Bundle, const InstructionsState &S,
4380 const EdgeInfo &UserTreeIdx,
4381 ArrayRef<int> ReuseShuffleIndices = {},
4382 ArrayRef<unsigned> ReorderIndices = {}) {
4383 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4384 EntryState == TreeEntry::SplitVectorize)) ||
4385 (Bundle && EntryState != TreeEntry::NeedToGather &&
4386 EntryState != TreeEntry::SplitVectorize)) &&
4387 "Need to vectorize gather entry?");
4388 // Gathered loads still gathered? Do not create entry, use the original one.
4389 if (GatheredLoadsEntriesFirst.has_value() &&
4390 EntryState == TreeEntry::NeedToGather && S &&
4391 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4392 !UserTreeIdx.UserTE)
4393 return nullptr;
4394 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4395 TreeEntry *Last = VectorizableTree.back().get();
4396 Last->Idx = VectorizableTree.size() - 1;
4397 Last->State = EntryState;
4398 if (UserTreeIdx.UserTE)
4399 OperandsToTreeEntry.try_emplace(
4400 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4401 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4402 // for non-power-of-two vectors.
4403 assert(
4404 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4405 ReuseShuffleIndices.empty()) &&
4406 "Reshuffling scalars not yet supported for nodes with padding");
4407 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4408 ReuseShuffleIndices.end());
4409 if (ReorderIndices.empty()) {
4410 Last->Scalars.assign(VL.begin(), VL.end());
4411 if (S)
4412 Last->setOperations(S);
4413 } else {
4414 // Reorder scalars and build final mask.
4415 Last->Scalars.assign(VL.size(), nullptr);
4416 transform(ReorderIndices, Last->Scalars.begin(),
4417 [VL](unsigned Idx) -> Value * {
4418 if (Idx >= VL.size())
4419 return UndefValue::get(VL.front()->getType());
4420 return VL[Idx];
4421 });
4422 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4423 if (S)
4424 Last->setOperations(S);
4425 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4426 }
4427 if (EntryState == TreeEntry::SplitVectorize) {
4428 assert(S && "Split nodes must have operations.");
4429 Last->setOperations(S);
4430 SmallPtrSet<Value *, 4> Processed;
4431 for (Value *V : VL) {
4432 auto *I = dyn_cast<Instruction>(V);
4433 if (!I)
4434 continue;
4435 auto It = ScalarsInSplitNodes.find(V);
4436 if (It == ScalarsInSplitNodes.end()) {
4437 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4438 (void)Processed.insert(V);
4439 } else if (Processed.insert(V).second) {
4440 assert(!is_contained(It->getSecond(), Last) &&
4441 "Value already associated with the node.");
4442 It->getSecond().push_back(Last);
4443 }
4444 }
4445 } else if (!Last->isGather()) {
4446 if (isa<PHINode>(S.getMainOp()) ||
4447 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4448 (!S.areInstructionsWithCopyableElements() &&
4449 doesNotNeedToSchedule(VL)) ||
4450 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4451 Last->setDoesNotNeedToSchedule();
4452 SmallPtrSet<Value *, 4> Processed;
4453 for (Value *V : VL) {
4454 if (isa<PoisonValue>(V))
4455 continue;
4456 if (S.isCopyableElement(V)) {
4457 Last->addCopyableElement(V);
4458 continue;
4459 }
4460 auto It = ScalarToTreeEntries.find(V);
4461 if (It == ScalarToTreeEntries.end()) {
4462 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4463 (void)Processed.insert(V);
4464 } else if (Processed.insert(V).second) {
4465 assert(!is_contained(It->getSecond(), Last) &&
4466 "Value already associated with the node.");
4467 It->getSecond().push_back(Last);
4468 }
4469 }
4470 // Update the scheduler bundle to point to this TreeEntry.
4471 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4472 "Bundle and VL out of sync");
4473 if (!Bundle.getBundle().empty()) {
4474#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4475 auto *BundleMember = Bundle.getBundle().begin();
4476 SmallPtrSet<Value *, 4> Processed;
4477 for (Value *V : VL) {
4478 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4479 continue;
4480 ++BundleMember;
4481 }
4482 assert(BundleMember == Bundle.getBundle().end() &&
4483 "Bundle and VL out of sync");
4484#endif
4485 Bundle.setTreeEntry(Last);
4486 }
4487 } else {
4488 // Build a map for gathered scalars to the nodes where they are used.
4489 bool AllConstsOrCasts = true;
4490 for (Value *V : VL) {
4491 if (S && S.areInstructionsWithCopyableElements() &&
4492 S.isCopyableElement(V))
4493 Last->addCopyableElement(V);
4494 if (!isConstant(V)) {
4495 auto *I = dyn_cast<CastInst>(V);
4496 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4497 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4498 !UserTreeIdx.UserTE->isGather())
4499 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4500 }
4501 }
4502 if (AllConstsOrCasts)
4503 CastMaxMinBWSizes =
4504 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4505 MustGather.insert_range(VL);
4506 }
4507
4508 if (UserTreeIdx.UserTE)
4509 Last->UserTreeIndex = UserTreeIdx;
4510 return Last;
4511 }
4512
4513 /// -- Vectorization State --
4514 /// Holds all of the tree entries.
4515 TreeEntry::VecTreeTy VectorizableTree;
4516
4517#ifndef NDEBUG
4518 /// Debug printer.
4519 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4520 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4521 VectorizableTree[Id]->dump();
4522 dbgs() << "\n";
4523 }
4524 }
4525#endif
4526
4527 /// Get list of vector entries, associated with the value \p V.
4528 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4529 assert(V && "V cannot be nullptr.");
4530 auto It = ScalarToTreeEntries.find(V);
4531 if (It == ScalarToTreeEntries.end())
4532 return {};
4533 return It->getSecond();
4534 }
4535
4536 /// Get list of split vector entries, associated with the value \p V.
4537 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4538 assert(V && "V cannot be nullptr.");
4539 auto It = ScalarsInSplitNodes.find(V);
4540 if (It == ScalarsInSplitNodes.end())
4541 return {};
4542 return It->getSecond();
4543 }
4544
4545 /// Returns first vector node for value \p V, matching values \p VL.
4546 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4547 bool SameVF = false) const {
4548 assert(V && "V cannot be nullptr.");
4549 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4550 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4551 return TE;
4552 return nullptr;
4553 }
4554
4555 /// Checks that the operand node of an alternate node does not generate a
4556 /// buildvector sequence. If it does, it is likely not worth building an
4557 /// alternate shuffle, since the number of buildvector operands plus the
4558 /// alternate instruction would exceed the number of buildvector instructions.
4559 /// \param S the instructions state of the analyzed values.
4560 /// \param VL list of the instructions with alternate opcodes.
4561 bool areAltOperandsProfitable(const InstructionsState &S,
4562 ArrayRef<Value *> VL) const;
4563
4564 /// Contains all the outputs of legality analysis for a list of values to
4565 /// vectorize.
4566 class ScalarsVectorizationLegality {
4567 InstructionsState S;
4568 bool IsLegal;
4569 bool TryToFindDuplicates;
4570 bool TrySplitVectorize;
4571
4572 public:
4573 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4574 bool TryToFindDuplicates = true,
4575 bool TrySplitVectorize = false)
4576 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4577 TrySplitVectorize(TrySplitVectorize) {
4578 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4579 "Inconsistent state");
4580 }
4581 const InstructionsState &getInstructionsState() const { return S; };
4582 bool isLegal() const { return IsLegal; }
4583 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4584 bool trySplitVectorize() const { return TrySplitVectorize; }
4585 };
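 // A minimal usage sketch (hypothetical call site, shown only to illustrate
 // how the flags are meant to be consumed together with
 // getScalarsVectorizationLegality() declared below):
 //   ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
 //       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
 //   if (!Legality.isLegal()) {
 //     if (Legality.trySplitVectorize())
 //       ; // try to build a split-vectorize node instead
 //     else if (Legality.tryToFindDuplicates())
 //       ; // retry after deduplicating the scalars
 //   }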
4586
4587 /// Checks if the specified list of the instructions/values can be vectorized
4588 /// in general.
4589 ScalarsVectorizationLegality
4590 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4591 const EdgeInfo &UserTreeIdx,
4592 bool TryCopyableElementsVectorization) const;
4593
4594 /// Checks if the specified list of the instructions/values can be vectorized
4595 /// and fills required data before actual scheduling of the instructions.
4596 TreeEntry::EntryState getScalarsVectorizationState(
4597 const InstructionsState &S, ArrayRef<Value *> VL,
4598 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4599 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4600
4601 /// Maps a specific scalar to its tree entry(ies).
4602 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4603
4604 /// List of deleted non-profitable nodes.
4605 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4606
4607 /// List of nodes transformed to gather nodes, with their conservative
4608 /// gather/buildvector cost estimation.
4609 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4610
4611 /// Maps the operand index and entry to the corresponding tree entry.
4612 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4613 OperandsToTreeEntry;
4614
4615 /// Scalars, used in split vectorize nodes.
4616 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4617
4618 /// Maps a value to the proposed vectorizable size.
4619 SmallDenseMap<Value *, unsigned> InstrElementSize;
4620
4621 /// A list of scalars that we found that we need to keep as scalars.
4622 ValueSet MustGather;
4623
4624 /// A set of first non-schedulable values.
4625 ValueSet NonScheduledFirst;
4626
4627 /// A map between the vectorized entries and the last instructions in the
4628 /// bundles. The bundles are built in use order, not in the def order of the
4629 /// instructions. So, we cannot rely directly on the last instruction in the
4630 /// bundle being the last instruction in program order during the
4631 /// vectorization process, since the basic blocks are modified; the last
4632 /// instructions need to be pre-gathered beforehand.
4633 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4634
4635 /// Keeps the mapping between the last instructions and their insertion
4636 /// points, which is an instruction-after-the-last-instruction.
4637 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4638
4639 /// List of gather nodes that depend on other gather/vector nodes and should
4640 /// be emitted after the vector instruction emission process to correctly
4641 /// handle the order of the vector instructions and shuffles.
4642 SetVector<const TreeEntry *> PostponedGathers;
4643
4644 using ValueToGatherNodesMap =
4645 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4646 ValueToGatherNodesMap ValueToGatherNodes;
4647
4648 /// A list of the load entries (node indices) which can be vectorized using
4649 /// a strided or masked gather approach, but which we attempt to represent
4650 /// as contiguous loads.
4651 SetVector<unsigned> LoadEntriesToVectorize;
4652
4653 /// true if graph nodes transforming mode is on.
4654 bool IsGraphTransformMode = false;
4655
4656 /// The index of the first gathered load entry in the VectorizeTree.
4657 std::optional<unsigned> GatheredLoadsEntriesFirst;
4658
4659 /// Maps compress entries to their mask data for the final codegen.
4660 SmallDenseMap<const TreeEntry *,
4661 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4662 CompressEntryToData;
4663
4664 /// This POD struct describes one external user in the vectorized tree.
4665 struct ExternalUser {
4666 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4667 : Scalar(S), User(U), E(E), Lane(L) {}
4668
4669 /// Which scalar in our function.
4670 Value *Scalar = nullptr;
4671
4672 /// Which user that uses the scalar.
4673 llvm::User *User = nullptr;
4674
4675 /// Vector node, the value is part of.
4676 const TreeEntry &E;
4677
4678 /// Which lane does the scalar belong to.
4679 unsigned Lane;
4680 };
4681 using UserList = SmallVector<ExternalUser, 16>;
4682
4683 /// Checks if two instructions may access the same memory.
4684 ///
4685 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4686 /// is invariant in the calling loop.
4687 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4688 Instruction *Inst2) {
4689 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4690 // First check if the result is already in the cache.
4691 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4692 auto Res = AliasCache.try_emplace(Key);
4693 if (!Res.second)
4694 return Res.first->second;
4695 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4696 // Store the result in the cache.
4697 Res.first->getSecond() = Aliased;
4698 return Aliased;
4699 }
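 // Illustrative (hedged) example: dependency calculation may ask about the
 // same pair of instructions repeatedly, so only the first query per
 // (Inst1, Inst2) pair reaches BatchAA; SI and OtherMemInst are hypothetical
 // names:
 //   MemoryLocation Loc = MemoryLocation::get(SI); // location of Inst1
 //   bool A1 = isAliased(Loc, SI, OtherMemInst);   // BatchAA query, cached
 //   bool A2 = isAliased(Loc, SI, OtherMemInst);   // cache hit, A2 == A1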
4700
4701 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4702
4703 /// Cache for alias results.
4704 /// TODO: consider moving this to the AliasAnalysis itself.
4705 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4706
4707 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4708 // globally through SLP because we don't perform any action which
4709 // invalidates capture results.
4710 BatchAAResults BatchAA;
4711
4712 /// Temporary store for deleted instructions. Instructions will be deleted
4713 /// eventually when the BoUpSLP is destructed. The deferral is required to
4714 /// ensure that there are no incorrect collisions in the AliasCache, which
4715 /// can happen if a new instruction is allocated at the same address as a
4716 /// previously deleted instruction.
4717 DenseSet<Instruction *> DeletedInstructions;
4718
4719 /// Set of the instructions already analyzed for reductions.
4720 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4721
4722 /// Set of hashes for the list of reduction values already being analyzed.
4723 DenseSet<size_t> AnalyzedReductionVals;
4724
4725 /// Values already analyzed for minimal bitwidth and found to be
4726 /// non-profitable.
4727 DenseSet<Value *> AnalyzedMinBWVals;
4728
4729 /// A list of values that need to be extracted out of the tree.
4730 /// This list holds pairs of (Internal Scalar : External User). External User
4731 /// can be nullptr, it means that this Internal Scalar will be used later,
4732 /// after vectorization.
4733 UserList ExternalUses;
4734
4735 /// A list of GEPs which can be replaced by scalar GEPs instead of
4736 /// extractelement instructions.
4737 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4738
4739 /// A list of scalars to be extracted without a specific user because of too
4740 /// many uses.
4741 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4742
4743 /// Values used only by @llvm.assume calls.
4744 SmallPtrSet<const Value *, 32> EphValues;
4745
4746 /// Holds all of the instructions that we gathered, shuffle instructions and
4747 /// extractelements.
4748 SetVector<Instruction *> GatherShuffleExtractSeq;
4749
4750 /// A list of blocks that we are going to CSE.
4751 DenseSet<BasicBlock *> CSEBlocks;
4752
4753 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4754 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4755
4756 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4757 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a single
4758 /// instruction, while ScheduleBundle represents a batch of instructions that
4759 /// are going to be grouped together. ScheduleCopyableData models an extra user
4760 /// for "copyable" instructions.
4761 class ScheduleEntity {
4762 friend class ScheduleBundle;
4763 friend class ScheduleData;
4764 friend class ScheduleCopyableData;
4765
4766 protected:
4767 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4768 Kind getKind() const { return K; }
4769 ScheduleEntity(Kind K) : K(K) {}
4770
4771 private:
4772 /// Used for getting a "good" final ordering of instructions.
4773 int SchedulingPriority = 0;
4774 /// True if this instruction (or bundle) is scheduled (or considered as
4775 /// scheduled in the dry-run).
4776 bool IsScheduled = false;
4777 /// The kind of the ScheduleEntity.
4778 const Kind K = Kind::ScheduleData;
4779
4780 public:
4781 ScheduleEntity() = delete;
4782 /// Gets/sets the scheduling priority.
4783 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4784 int getSchedulingPriority() const { return SchedulingPriority; }
4785 bool isReady() const {
4786 if (const auto *SD = dyn_cast<ScheduleData>(this))
4787 return SD->isReady();
4788 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4789 return CD->isReady();
4790 return cast<ScheduleBundle>(this)->isReady();
4791 }
4792 /// Returns true if the dependency information has been calculated.
4793 /// Note that dependency validity can vary between instructions within
4794 /// a single bundle.
4795 bool hasValidDependencies() const {
4796 if (const auto *SD = dyn_cast<ScheduleData>(this))
4797 return SD->hasValidDependencies();
4798 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4799 return CD->hasValidDependencies();
4800 return cast<ScheduleBundle>(this)->hasValidDependencies();
4801 }
4802 /// Gets the number of unscheduled dependencies.
4803 int getUnscheduledDeps() const {
4804 if (const auto *SD = dyn_cast<ScheduleData>(this))
4805 return SD->getUnscheduledDeps();
4806 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4807 return CD->getUnscheduledDeps();
4808 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4809 }
4810 /// Increments the number of unscheduled dependencies.
4811 int incrementUnscheduledDeps(int Incr) {
4812 if (auto *SD = dyn_cast<ScheduleData>(this))
4813 return SD->incrementUnscheduledDeps(Incr);
4814 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4815 }
4816 /// Gets the number of dependencies.
4817 int getDependencies() const {
4818 if (const auto *SD = dyn_cast<ScheduleData>(this))
4819 return SD->getDependencies();
4820 return cast<ScheduleCopyableData>(this)->getDependencies();
4821 }
4822 /// Gets the instruction.
4823 Instruction *getInst() const {
4824 if (const auto *SD = dyn_cast<ScheduleData>(this))
4825 return SD->getInst();
4826 return cast<ScheduleCopyableData>(this)->getInst();
4827 }
4828
4829 /// Gets/sets if the bundle is scheduled.
4830 bool isScheduled() const { return IsScheduled; }
4831 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4832
4833 static bool classof(const ScheduleEntity *) { return true; }
4834
4835#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4836 void dump(raw_ostream &OS) const {
4837 if (const auto *SD = dyn_cast<ScheduleData>(this))
4838 return SD->dump(OS);
4839 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4840 return CD->dump(OS);
4841 return cast<ScheduleBundle>(this)->dump(OS);
4842 }
4843
4844 LLVM_DUMP_METHOD void dump() const {
4845 dump(dbgs());
4846 dbgs() << '\n';
4847 }
4848#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4849 };
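 // The common interface above lets ready-list consumers treat all three
 // entity kinds uniformly; a hypothetical sketch (not verbatim from the
 // scheduler):
 //   for (ScheduleEntity *SE : ReadyInsts)
 //     if (SE->isReady() && !SE->isScheduled())
 //       ; // pick SE next, e.g. ordered by SE->getSchedulingPriority()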
4850
4851#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4852 friend inline raw_ostream &operator<<(raw_ostream &OS,
4853 const BoUpSLP::ScheduleEntity &SE) {
4854 SE.dump(OS);
4855 return OS;
4856 }
4857#endif
4858
4859 /// Contains all scheduling relevant data for an instruction.
4860 /// A ScheduleData either represents a single instruction or a member of an
4861 /// instruction bundle (= a group of instructions which is combined into a
4862 /// vector instruction).
4863 class ScheduleData final : public ScheduleEntity {
4864 public:
4865 // The initial value for the dependency counters. It means that the
4866 // dependencies are not calculated yet.
4867 enum { InvalidDeps = -1 };
4868
4869 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4870 static bool classof(const ScheduleEntity *Entity) {
4871 return Entity->getKind() == Kind::ScheduleData;
4872 }
4873
4874 void init(int BlockSchedulingRegionID, Instruction *I) {
4875 NextLoadStore = nullptr;
4876 IsScheduled = false;
4877 SchedulingRegionID = BlockSchedulingRegionID;
4878 clearDependencies();
4879 Inst = I;
4880 }
4881
4882 /// Verify basic self consistency properties
4883 void verify() {
4884 if (hasValidDependencies()) {
4885 assert(UnscheduledDeps <= Dependencies && "invariant");
4886 } else {
4887 assert(UnscheduledDeps == Dependencies && "invariant");
4888 }
4889
4890 if (IsScheduled) {
4891 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4892 "unexpected scheduled state");
4893 }
4894 }
4895
4896 /// Returns true if the dependency information has been calculated.
4897 /// Note that dependency validity can vary between instructions within
4898 /// a single bundle.
4899 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4900
4901 /// Returns true if it is ready for scheduling, i.e. it has no more
4902 /// unscheduled depending instructions/bundles.
4903 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4904
4905 /// Modifies the number of unscheduled dependencies for this instruction,
4906 /// and returns the number of remaining dependencies for the containing
4907 /// bundle.
4908 int incrementUnscheduledDeps(int Incr) {
4909 assert(hasValidDependencies() &&
4910 "increment of unscheduled deps would be meaningless");
4911 UnscheduledDeps += Incr;
4912 assert(UnscheduledDeps >= 0 &&
4913 "Expected valid number of unscheduled deps");
4914 return UnscheduledDeps;
4915 }
4916
4917 /// Sets the number of unscheduled dependencies to the number of
4918 /// dependencies.
4919 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4920
4921 /// Clears all dependency information.
4922 void clearDependencies() {
4923 clearDirectDependencies();
4924 MemoryDependencies.clear();
4925 ControlDependencies.clear();
4926 }
4927
4928 /// Clears all direct dependencies only, except for control and memory
4929 /// dependencies.
4930 /// Required for copyable elements to correctly handle control/memory deps
4931 /// and avoid extra recalculation of such deps.
4932 void clearDirectDependencies() {
4933 Dependencies = InvalidDeps;
4934 resetUnscheduledDeps();
4935 IsScheduled = false;
4936 }
4937
4938 /// Gets the number of unscheduled dependencies.
4939 int getUnscheduledDeps() const { return UnscheduledDeps; }
4940 /// Gets the number of dependencies.
4941 int getDependencies() const { return Dependencies; }
4942 /// Initializes the number of dependencies.
4943 void initDependencies() { Dependencies = 0; }
4944 /// Increments the number of dependencies.
4945 void incDependencies() { Dependencies++; }
4946
4947 /// Gets scheduling region ID.
4948 int getSchedulingRegionID() const { return SchedulingRegionID; }
4949
4950 /// Gets the instruction.
4951 Instruction *getInst() const { return Inst; }
4952
4953 /// Gets the list of memory dependencies.
4954 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4955 return MemoryDependencies;
4956 }
4957 /// Adds a memory dependency.
4958 void addMemoryDependency(ScheduleData *Dep) {
4959 MemoryDependencies.push_back(Dep);
4960 }
4961 /// Gets the list of control dependencies.
4962 ArrayRef<ScheduleData *> getControlDependencies() const {
4963 return ControlDependencies;
4964 }
4965 /// Adds a control dependency.
4966 void addControlDependency(ScheduleData *Dep) {
4967 ControlDependencies.push_back(Dep);
4968 }
4969 /// Gets/sets the next load/store instruction in the block.
4970 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4971 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4972
4973 void dump(raw_ostream &OS) const { OS << *Inst; }
4974
4975 LLVM_DUMP_METHOD void dump() const {
4976 dump(dbgs());
4977 dbgs() << '\n';
4978 }
4979
4980 private:
4981 Instruction *Inst = nullptr;
4982
4983 /// Single linked list of all memory instructions (e.g. load, store, call)
4984 /// in the block - until the end of the scheduling region.
4985 ScheduleData *NextLoadStore = nullptr;
4986
4987 /// The dependent memory instructions.
4988 /// This list is derived on demand in calculateDependencies().
4989 SmallVector<ScheduleData *> MemoryDependencies;
4990
4991 /// List of instructions which this instruction could be control dependent
4992 /// on. Allowing such nodes to be scheduled below this one could introduce
4993 /// a runtime fault which didn't exist in the original program.
4994 /// ex: this is a load or udiv following a readonly call which inf loops
4995 SmallVector<ScheduleData *> ControlDependencies;
4996
4997 /// This ScheduleData is in the current scheduling region if this matches
4998 /// the current SchedulingRegionID of BlockScheduling.
4999 int SchedulingRegionID = 0;
5000
5001 /// The number of dependencies. Consists of the number of users of the
5002 /// instruction plus the number of dependent memory instructions (if any).
5003 /// This value is calculated on demand.
5004 /// If InvalidDeps, the number of dependencies is not calculated yet.
5005 int Dependencies = InvalidDeps;
5006
5007 /// The number of dependencies minus the number of dependencies of scheduled
5008 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5009 /// for scheduling.
5010 /// Note that this is negative as long as Dependencies is not calculated.
5011 int UnscheduledDeps = InvalidDeps;
5012 };
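 // The ScheduleData counters follow a simple protocol; an illustrative (not
 // verbatim) sequence from dependency calculation to scheduling:
 //   SD->initDependencies();           // Dependencies = 0, i.e. now valid
 //   SD->incDependencies();            // once per def-use/memory/control dep
 //   SD->resetUnscheduledDeps();       // UnscheduledDeps = Dependencies
 //   ...
 //   SD->incrementUnscheduledDeps(-1); // a dependency got scheduled
 //   if (SD->isReady())                // UnscheduledDeps == 0, not scheduled
 //     ; // SD can be moved to the ready list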
5013
5014#ifndef NDEBUG
5015 friend inline raw_ostream &operator<<(raw_ostream &OS,
5016 const BoUpSLP::ScheduleData &SD) {
5017 SD.dump(OS);
5018 return OS;
5019 }
5020#endif
5021
5022 class ScheduleBundle final : public ScheduleEntity {
5023 /// The schedule data for the instructions in the bundle.
5024 SmallVector<ScheduleEntity *, 4> Bundle;
5025 /// True if this bundle is valid.
5026 bool IsValid = true;
5027 /// The TreeEntry that this instruction corresponds to.
5028 TreeEntry *TE = nullptr;
5029 ScheduleBundle(bool IsValid)
5030 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5031
5032 public:
5033 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5034 static bool classof(const ScheduleEntity *Entity) {
5035 return Entity->getKind() == Kind::ScheduleBundle;
5036 }
5037
5038 /// Verify basic self consistency properties
5039 void verify() const {
5040 for (const ScheduleEntity *SD : Bundle) {
5041 if (SD->hasValidDependencies()) {
5042 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5043 "invariant");
5044 } else {
5045 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5046 "invariant");
5047 }
5048
5049 if (isScheduled()) {
5050 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5051 "unexpected scheduled state");
5052 }
5053 }
5054 }
5055
5056 /// Returns the number of unscheduled dependencies in the bundle.
5057 int unscheduledDepsInBundle() const {
5058 assert(*this && "bundle must not be empty");
5059 int Sum = 0;
5060 for (const ScheduleEntity *BundleMember : Bundle) {
5061 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5062 return ScheduleData::InvalidDeps;
5063 Sum += BundleMember->getUnscheduledDeps();
5064 }
5065 return Sum;
5066 }
5067
5068 /// Returns true if the dependency information has been calculated.
5069 /// Note that dependency validity can vary between instructions within
5070 /// a single bundle.
5071 bool hasValidDependencies() const {
5072 return all_of(Bundle, [](const ScheduleEntity *SD) {
5073 return SD->hasValidDependencies();
5074 });
5075 }
5076
5077 /// Returns true if it is ready for scheduling, i.e. it has no more
5078 /// unscheduled depending instructions/bundles.
5079 bool isReady() const {
5080 assert(*this && "bundle must not be empty");
5081 return unscheduledDepsInBundle() == 0 && !isScheduled();
5082 }
5083
5084 /// Returns the bundle of scheduling data, associated with the current
5085 /// instruction.
5086 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5087 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5088 /// Adds an instruction to the bundle.
5089 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5090
5091 /// Gets/sets the associated tree entry.
5092 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5093 TreeEntry *getTreeEntry() const { return TE; }
5094
5095 static ScheduleBundle invalid() { return {false}; }
5096
5097 operator bool() const { return IsValid; }
5098
5099#ifndef NDEBUG
5100 void dump(raw_ostream &OS) const {
5101 if (!*this) {
5102 OS << "[]";
5103 return;
5104 }
5105 OS << '[';
5106 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5108 OS << "<Copyable>";
5109 OS << *SD->getInst();
5110 });
5111 OS << ']';
5112 }
5113
5114 LLVM_DUMP_METHOD void dump() const {
5115 dump(dbgs());
5116 dbgs() << '\n';
5117 }
5118#endif // NDEBUG
5119 };
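 // A bundle's unscheduled dependencies are the sum over its members, so a
 // bundle becomes ready only once every member is ready. The check used by
 // the scheduler (cf. initialFillReadyList below) is simply:
 //   if (Bundle->hasValidDependencies() && Bundle->isReady())
 //     ReadyList.insert(Bundle);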
5120
5121#ifndef NDEBUG
5122 friend inline raw_ostream &operator<<(raw_ostream &OS,
5123 const BoUpSLP::ScheduleBundle &Bundle) {
5124 Bundle.dump(OS);
5125 return OS;
5126 }
5127#endif
5128
5129 /// Contains all scheduling relevant data for the copyable instruction.
5130 /// It models the virtual instructions, supposed to replace the original
5131 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5132 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5133 /// instruction %virt = add %0, 0.
5134 class ScheduleCopyableData final : public ScheduleEntity {
5135 /// The source schedule data for the instruction.
5136 Instruction *Inst = nullptr;
5137 /// The edge information for the instruction.
5138 const EdgeInfo EI;
5139 /// This ScheduleData is in the current scheduling region if this matches
5140 /// the current SchedulingRegionID of BlockScheduling.
5141 int SchedulingRegionID = 0;
5142 /// Bundle, this data is part of.
5143 ScheduleBundle &Bundle;
5144
5145 public:
5146 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5147 const EdgeInfo &EI, ScheduleBundle &Bundle)
5148 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5149 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5150 static bool classof(const ScheduleEntity *Entity) {
5151 return Entity->getKind() == Kind::ScheduleCopyableData;
5152 }
5153
5154 /// Verify basic self consistency properties
5155 void verify() {
5156 if (hasValidDependencies()) {
5157 assert(UnscheduledDeps <= Dependencies && "invariant");
5158 } else {
5159 assert(UnscheduledDeps == Dependencies && "invariant");
5160 }
5161
5162 if (IsScheduled) {
5163 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5164 "unexpected scheduled state");
5165 }
5166 }
5167
5168 /// Returns true if the dependency information has been calculated.
5169 /// Note that dependency validity can vary between instructions within
5170 /// a single bundle.
5171 bool hasValidDependencies() const {
5172 return Dependencies != ScheduleData::InvalidDeps;
5173 }
5174
5175 /// Returns true if it is ready for scheduling, i.e. it has no more
5176 /// unscheduled depending instructions/bundles.
5177 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5178
5179 /// Modifies the number of unscheduled dependencies for this instruction,
5180 /// and returns the number of remaining dependencies for the containing
5181 /// bundle.
5182 int incrementUnscheduledDeps(int Incr) {
5183 assert(hasValidDependencies() &&
5184 "increment of unscheduled deps would be meaningless");
5185 UnscheduledDeps += Incr;
5186 assert(UnscheduledDeps >= 0 && "invariant");
5187 return UnscheduledDeps;
5188 }
5189
5190 /// Sets the number of unscheduled dependencies to the number of
5191 /// dependencies.
5192 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5193
5194 /// Gets the number of unscheduled dependencies.
5195 int getUnscheduledDeps() const { return UnscheduledDeps; }
5196 /// Gets the number of dependencies.
5197 int getDependencies() const { return Dependencies; }
5198 /// Initializes the number of dependencies.
5199 void initDependencies() { Dependencies = 0; }
5200 /// Increments the number of dependencies.
5201 void incDependencies() { Dependencies++; }
5202
5203 /// Gets scheduling region ID.
5204 int getSchedulingRegionID() const { return SchedulingRegionID; }
5205
5206 /// Gets the instruction.
5207 Instruction *getInst() const { return Inst; }
5208
5209 /// Clears all dependency information.
5210 void clearDependencies() {
5211 Dependencies = ScheduleData::InvalidDeps;
5212 UnscheduledDeps = ScheduleData::InvalidDeps;
5213 IsScheduled = false;
5214 }
5215
5216 /// Gets the edge information.
5217 const EdgeInfo &getEdgeInfo() const { return EI; }
5218
5219 /// Gets the bundle.
5220 ScheduleBundle &getBundle() { return Bundle; }
5221 const ScheduleBundle &getBundle() const { return Bundle; }
5222
5223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5224 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5225
5226 LLVM_DUMP_METHOD void dump() const {
5227 dump(dbgs());
5228 dbgs() << '\n';
5229 }
5230#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5231
5232 private:
5233 /// The number of dependencies; valid once calculated. These nodes always
5234 /// have only a single dependency.
5235 int Dependencies = ScheduleData::InvalidDeps;
5236
5237 /// The number of dependencies minus the number of dependencies of scheduled
5238 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5239 /// for scheduling.
5240 /// Note that this is negative as long as Dependencies is not calculated.
5241 int UnscheduledDeps = ScheduleData::InvalidDeps;
5242 };
5243
5244#ifndef NDEBUG
5245 friend inline raw_ostream &
5246 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5247 SD.dump(OS);
5248 return OS;
5249 }
5250#endif
5251
5252 friend struct GraphTraits<BoUpSLP *>;
5253 friend struct DOTGraphTraits<BoUpSLP *>;
5254
5255 /// Contains all scheduling data for a basic block.
5256 /// It does not schedule instructions which are not memory read/write
5257 /// instructions and whose operands are either constants, arguments, phis, or
5258 /// instructions from other blocks, or whose users are phis or live in other
5259 /// blocks. The resulting vector instructions can be placed at the
5260 /// beginning of the basic block without scheduling (if the operands do not
5261 /// need to be scheduled) or at the end of the block (if the users are outside
5262 /// of the block). This saves some compile time and memory used by the
5263 /// compiler.
5264 /// ScheduleData is assigned to each instruction between the boundaries of the
5265 /// tree entry, even to those which are not part of the graph. This is
5266 /// required to correctly follow the dependencies between the instructions and
5267 /// to schedule them correctly. ScheduleData is not allocated for
5268 /// instructions which do not require scheduling, such as phis, nodes with
5269 /// only extractelements/insertelements, or nodes whose instructions have
5270 /// uses/operands outside of the block.
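 /// For example (an illustrative IR sketch, not taken from a test case):
 ///   %p  = phi i32 ...          ; never gets ScheduleData (phis are skipped)
 ///   %l0 = load i32, ptr %a     ; gets ScheduleData (memory access)
 ///   %l1 = load i32, ptr %b     ; gets ScheduleData
 ///   %s  = add i32 %l0, %l1     ; gets ScheduleData while it lies between
 ///                              ; ScheduleStart and ScheduleEnd
 /// Only instructions inside the current scheduling region are tracked.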
5271 struct BlockScheduling {
5272 BlockScheduling(BasicBlock *BB)
5273 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5274
5275 void clear() {
5276 ScheduledBundles.clear();
5277 ScheduledBundlesList.clear();
5278 ScheduleCopyableDataMap.clear();
5279 ScheduleCopyableDataMapByInst.clear();
5280 ScheduleCopyableDataMapByInstUser.clear();
5281 ScheduleCopyableDataMapByUsers.clear();
5282 ReadyInsts.clear();
5283 ScheduleStart = nullptr;
5284 ScheduleEnd = nullptr;
5285 FirstLoadStoreInRegion = nullptr;
5286 LastLoadStoreInRegion = nullptr;
5287 RegionHasStackSave = false;
5288
5289 // Reduce the maximum schedule region size by the size of the
5290 // previous scheduling run.
5291 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5292 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5293 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5294 ScheduleRegionSize = 0;
5295
5296 // Make a new scheduling region, i.e. all existing ScheduleData is not
5297 // in the new region yet.
5298 ++SchedulingRegionID;
5299 }
5300
5301 ScheduleData *getScheduleData(Instruction *I) {
5302 if (!I)
5303 return nullptr;
5304 if (BB != I->getParent())
5305 // Avoid lookup if can't possibly be in map.
5306 return nullptr;
5307 ScheduleData *SD = ScheduleDataMap.lookup(I);
5308 if (SD && isInSchedulingRegion(*SD))
5309 return SD;
5310 return nullptr;
5311 }
5312
5313 ScheduleData *getScheduleData(Value *V) {
5314 return getScheduleData(dyn_cast<Instruction>(V));
5315 }
5316
5317 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5318 /// operand number) and value.
5319 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5320 const Value *V) const {
5321 if (ScheduleCopyableDataMap.empty())
5322 return nullptr;
5323 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5324 if (It == ScheduleCopyableDataMap.end())
5325 return nullptr;
5326 ScheduleCopyableData *SD = It->getSecond().get();
5327 if (!isInSchedulingRegion(*SD))
5328 return nullptr;
5329 return SD;
5330 }
5331
5332 /// Returns the ScheduleCopyableData for the given user \p User, operand
5333 /// number and operand \p V.
5334 SmallVector<ScheduleCopyableData *>
5335 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5336 const Value *V) {
5337 if (ScheduleCopyableDataMapByInstUser.empty())
5338 return {};
5339 const auto It = ScheduleCopyableDataMapByInstUser.find(
5340 std::make_pair(std::make_pair(User, OperandIdx), V));
5341 if (It == ScheduleCopyableDataMapByInstUser.end())
5342 return {};
5343 SmallVector<ScheduleCopyableData *> Res;
5344 for (ScheduleCopyableData *SD : It->getSecond()) {
5345 if (isInSchedulingRegion(*SD))
5346 Res.push_back(SD);
5347 }
5348 return Res;
5349 }
5350
5351 /// Returns true if all operands of the given instruction \p User are
5352 /// replaced by copyable data.
5353 /// \param User The user instruction.
5354 /// \param Op The operand, which might be replaced by the copyable data.
5355 /// \param SLP The SLP tree.
5356 /// \param NumOps The number of operands used. If the instruction uses the
5357 /// same operand several times, check for the first use, then the second,
5358 /// etc.
5359 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5360 Instruction *Op, BoUpSLP &SLP,
5361 unsigned NumOps) const {
5362 assert(NumOps > 0 && "No operands");
5363 if (ScheduleCopyableDataMap.empty())
5364 return false;
5365 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5366 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5367 if (Entries.empty())
5368 return false;
5369 unsigned CurNumOps = 0;
5370 for (const Use &U : User->operands()) {
5371 if (U.get() != Op)
5372 continue;
5373 ++CurNumOps;
5374 // Check all tree entries, if they have operands replaced by copyable
5375 // data.
5376 for (TreeEntry *TE : Entries) {
5377 unsigned Inc = 0;
5378 bool IsNonSchedulableWithParentPhiNode =
5379 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5380 TE->UserTreeIndex.UserTE->hasState() &&
5381 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5382 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5383 // Count the number of unique phi nodes that make up the parent
5384 // entry, and exit once all the unique phis are processed.
5385 if (IsNonSchedulableWithParentPhiNode) {
5386 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5387 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5388 for (Value *V : ParentTE->Scalars) {
5389 auto *PHI = dyn_cast<PHINode>(V);
5390 if (!PHI)
5391 continue;
5392 if (ParentsUniqueUsers.insert(PHI).second &&
5393 is_contained(PHI->incoming_values(), User))
5394 ++Inc;
5395 }
5396 } else {
5397 Inc = count(TE->Scalars, User);
5398 }
5399
5400 // Check if the user is commutative.
5401 // The commutatives are handled later, as their operands can be
5402 // reordered.
5403 // Same applies even for non-commutative cmps, because we can invert
5404 // their predicate potentially and, thus, reorder the operands.
5405 bool IsCommutativeUser =
5406 ::isCommutative(User) &&
5407 ::isCommutableOperand(User, User, U.getOperandNo());
5408 if (!IsCommutativeUser) {
5409 Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
5410 IsCommutativeUser =
5411 ::isCommutative(MainOp, User) &&
5412 ::isCommutableOperand(MainOp, User, U.getOperandNo());
5413 }
5414 // The commutative user with the same operands can be safely
5415 // considered as non-commutative, operands reordering does not change
5416 // the semantics.
5417 assert(
5418 (!IsCommutativeUser ||
5419 (((::isCommutative(User) &&
5420 ::isCommutableOperand(User, User, 0) &&
5421 ::isCommutableOperand(User, User, 1)) ||
5422 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
5423 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5424 User, 0) &&
5425 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5426 User, 1))))) &&
5427 "Expected commutative user with 2 first commutable operands");
5428 bool IsCommutativeWithSameOps =
5429 IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
5430 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5431 !isa<CmpInst>(User)) {
5432 EdgeInfo EI(TE, U.getOperandNo());
5433 if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
5434 continue;
5435 return false;
5436 }
5437 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5438 .first->getSecond() += Inc;
5439 }
5440 }
5441 if (PotentiallyReorderedEntriesCount.empty())
5442 return true;
5443 // Check the commutative/cmp entries.
5444 for (auto &P : PotentiallyReorderedEntriesCount) {
5445 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5446 bool IsNonSchedulableWithParentPhiNode =
5447 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5448 P.first->UserTreeIndex.UserTE->hasState() &&
5449 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5450 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5451 auto *It = find(P.first->Scalars, User);
5452 do {
5453 assert(It != P.first->Scalars.end() &&
5454 "User is not in the tree entry");
5455 int Lane = std::distance(P.first->Scalars.begin(), It);
5456 assert(Lane >= 0 && "Lane is not found");
5457 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5458 Lane = P.first->ReorderIndices[Lane];
5459 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5460 "Couldn't find extract lane");
5461 // Count the number of unique phi nodes that make up the parent
5462 // entry, and exit once all the unique phis are processed.
5463 if (IsNonSchedulableWithParentPhiNode) {
5464 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5465 Value *User = ParentTE->Scalars[Lane];
5466 if (!ParentsUniqueUsers.insert(User).second) {
5467 It =
5468 find(make_range(std::next(It), P.first->Scalars.end()), User);
5469 continue;
5470 }
5471 }
5472 for (unsigned OpIdx :
5474 P.first->getMainOp()))) {
5475 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5476 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5477 --P.getSecond();
5478 }
5479 // If parent node is schedulable, it will be handled correctly.
5480 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5481 } while (It != P.first->Scalars.end());
5482 }
5483 return all_of(PotentiallyReorderedEntriesCount,
5484 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5485 return P.second == NumOps - 1;
5486 });
5487 }
5488
5489 SmallVector<ScheduleCopyableData *>
5490 getScheduleCopyableData(const Instruction *I) const {
5491 if (ScheduleCopyableDataMapByInst.empty())
5492 return {};
5493 const auto It = ScheduleCopyableDataMapByInst.find(I);
5494 if (It == ScheduleCopyableDataMapByInst.end())
5495 return {};
5496 SmallVector<ScheduleCopyableData *> Res;
5497 for (ScheduleCopyableData *SD : It->getSecond()) {
5498 if (isInSchedulingRegion(*SD))
5499 Res.push_back(SD);
5500 }
5501 return Res;
5502 }
5503
5504 SmallVector<ScheduleCopyableData *>
5505 getScheduleCopyableDataUsers(const Instruction *User) const {
5506 if (ScheduleCopyableDataMapByUsers.empty())
5507 return {};
5508 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5509 if (It == ScheduleCopyableDataMapByUsers.end())
5510 return {};
5511 SmallVector<ScheduleCopyableData *> Res;
5512 for (ScheduleCopyableData *SD : It->getSecond()) {
5513 if (isInSchedulingRegion(*SD))
5514 Res.push_back(SD);
5515 }
5516 return Res;
5517 }
5518
5519 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5520 Instruction *I,
5521 int SchedulingRegionID,
5522 ScheduleBundle &Bundle) {
5523 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5524 ScheduleCopyableData *CD =
5525 ScheduleCopyableDataMap
5526 .try_emplace(std::make_pair(EI, I),
5527 std::make_unique<ScheduleCopyableData>(
5528 SchedulingRegionID, I, EI, Bundle))
5529 .first->getSecond()
5530 .get();
5531 ScheduleCopyableDataMapByInst[I].push_back(CD);
5532 if (EI.UserTE) {
5533 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5534 const auto *It = find(Op, I);
5535 assert(It != Op.end() && "Lane not set");
5536 SmallPtrSet<Instruction *, 4> Visited;
5537 do {
5538 int Lane = std::distance(Op.begin(), It);
5539 assert(Lane >= 0 && "Lane not set");
5540 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5541 !EI.UserTE->ReorderIndices.empty())
5542 Lane = EI.UserTE->ReorderIndices[Lane];
5543 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5544 "Couldn't find extract lane");
5545 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5546 if (!Visited.insert(In).second) {
5547 It = find(make_range(std::next(It), Op.end()), I);
5548 continue;
5549 }
5550 ScheduleCopyableDataMapByInstUser
5551 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5552 .first->getSecond()
5553 .push_back(CD);
5554 ScheduleCopyableDataMapByUsers.try_emplace(I)
5555 .first->getSecond()
5556 .insert(CD);
5557 // Remove extra deps for users that become non-immediate users of the
5558 // instruction. This may happen if a chain of the same copyable elements
5559 // appears in the tree.
5560 if (In == I) {
5561 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5562 if (ScheduleCopyableData *UserCD =
5563 getScheduleCopyableData(UserEI, In))
5564 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5565 }
5566 It = find(make_range(std::next(It), Op.end()), I);
5567 } while (It != Op.end());
5568 } else {
5569 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5570 CD);
5571 }
5572 return *CD;
5573 }
5574
5575 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5576 auto *I = dyn_cast<Instruction>(V);
5577 if (!I)
5578 return {};
5579 auto It = ScheduledBundles.find(I);
5580 if (It == ScheduledBundles.end())
5581 return {};
5582 return It->getSecond();
5583 }
5584
5585 /// Returns true if the entity is in the scheduling region.
5586 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5587 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5588 return Data->getSchedulingRegionID() == SchedulingRegionID;
5589 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5590 return CD->getSchedulingRegionID() == SchedulingRegionID;
5591 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5592 [&](const ScheduleEntity *BundleMember) {
5593 return isInSchedulingRegion(*BundleMember);
5594 });
5595 }
5596
5597 /// Marks an instruction as scheduled and puts all dependent ready
5598 /// instructions into the ready-list.
5599 template <typename ReadyListType>
5600 void schedule(const BoUpSLP &R, const InstructionsState &S,
5601 const EdgeInfo &EI, ScheduleEntity *Data,
5602 ReadyListType &ReadyList) {
5603 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5604 ArrayRef<ScheduleBundle *> Bundles) {
5605 // Handle the def-use chain dependencies.
5606
5607 // Decrement the unscheduled counter and insert to ready list if ready.
5608 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5609 if ((IsControl || Data->hasValidDependencies()) &&
5610 Data->incrementUnscheduledDeps(-1) == 0) {
5611 // There are no more unscheduled dependencies after
5612 // decrementing, so we can put the dependent instruction
5613 // into the ready list.
5614 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5615 ArrayRef<ScheduleBundle *> Bundles;
5616 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5617 CopyableBundle.push_back(&CD->getBundle());
5618 Bundles = CopyableBundle;
5619 } else {
5620 Bundles = getScheduleBundles(Data->getInst());
5621 }
5622 if (!Bundles.empty()) {
5623 for (ScheduleBundle *Bundle : Bundles) {
5624 if (Bundle->unscheduledDepsInBundle() == 0) {
5625 assert(!Bundle->isScheduled() &&
5626 "already scheduled bundle gets ready");
5627 ReadyList.insert(Bundle);
5629 << "SLP: gets ready: " << *Bundle << "\n");
5630 }
5631 }
5632 return;
5633 }
5634 assert(!Data->isScheduled() &&
5635 "already scheduled bundle gets ready");
5637 "Expected non-copyable data");
5638 ReadyList.insert(Data);
5639 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5640 }
5641 };
5642
5643 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5644 Instruction *I) {
5645 if (!ScheduleCopyableDataMap.empty()) {
5646 SmallVector<ScheduleCopyableData *> CopyableData =
5647 getScheduleCopyableData(User, OpIdx, I);
5648 for (ScheduleCopyableData *CD : CopyableData)
5649 DecrUnsched(CD, /*IsControl=*/false);
5650 if (!CopyableData.empty())
5651 return;
5652 }
5653 if (ScheduleData *OpSD = getScheduleData(I))
5654 DecrUnsched(OpSD, /*IsControl=*/false);
5655 };
5656
5657 // If BundleMember is a vector bundle, its operands may have been
5658 // reordered during buildTree(). We therefore need to get its operands
5659 // through the TreeEntry.
5660 if (!Bundles.empty()) {
5661 auto *In = BundleMember->getInst();
5662 // Count uses of each instruction operand.
5663 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5664 unsigned TotalOpCount = 0;
5665 if (isa<ScheduleCopyableData>(BundleMember)) {
5666 // Copyable data is used only once (uses itself).
5667 TotalOpCount = OperandsUses[In] = 1;
5668 } else {
5669 for (const Use &U : In->operands()) {
5670 if (auto *I = dyn_cast<Instruction>(U.get())) {
5671 auto Res = OperandsUses.try_emplace(I, 0);
5672 ++Res.first->getSecond();
5673 ++TotalOpCount;
5674 }
5675 }
5676 }
5677 // Decrement the unscheduled counter and insert to ready list if
5678 // ready.
5679 auto DecrUnschedForInst =
5680 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5681 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5682 &Checked) {
5683 if (!ScheduleCopyableDataMap.empty()) {
5684 const EdgeInfo EI = {UserTE, OpIdx};
5685 if (ScheduleCopyableData *CD =
5686 getScheduleCopyableData(EI, I)) {
5687 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5688 return;
5689 DecrUnsched(CD, /*IsControl=*/false);
5690 return;
5691 }
5692 }
5693 auto It = OperandsUses.find(I);
5694 assert(It != OperandsUses.end() && "Operand not found");
5695 if (It->second > 0) {
5696 if (ScheduleData *OpSD = getScheduleData(I)) {
5697 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5698 return;
5699 --It->getSecond();
5700 assert(TotalOpCount > 0 && "No more operands to decrement");
5701 --TotalOpCount;
5702 DecrUnsched(OpSD, /*IsControl=*/false);
5703 } else {
5704 --It->getSecond();
5705 assert(TotalOpCount > 0 && "No more operands to decrement");
5706 --TotalOpCount;
5707 }
5708 }
5709 };
5710
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5712 for (ScheduleBundle *Bundle : Bundles) {
5713 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5714 break;
5715 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5716 // Need to search for the lane since the tree entry can be
5717 // reordered.
5718 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5719 bool IsNonSchedulableWithParentPhiNode =
5720 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5721 Bundle->getTreeEntry()->UserTreeIndex &&
5722 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5723 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5724 TreeEntry::SplitVectorize &&
5725 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5726 Instruction::PHI;
5727 do {
5728 int Lane =
5729 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5730 assert(Lane >= 0 && "Lane not set");
5731 if (isa<StoreInst>(In) &&
5732 !Bundle->getTreeEntry()->ReorderIndices.empty())
5733 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5734 assert(Lane < static_cast<int>(
5735 Bundle->getTreeEntry()->Scalars.size()) &&
5736 "Couldn't find extract lane");
5737
5738 // Since vectorization tree is being built recursively this
5739 // assertion ensures that the tree entry has all operands set
5740 // before reaching this code. Couple of exceptions known at the
5741 // moment are extracts where their second (immediate) operand is
5742 // not added. Since immediates do not affect scheduler behavior
5743 // this is considered okay.
5744 assert(In &&
5746 In->getNumOperands() ==
5747 Bundle->getTreeEntry()->getNumOperands() ||
5748 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5749 "Missed TreeEntry operands?");
5750
5751 // Count the number of unique phi nodes that make up the parent
5752 // entry, and exit once all the unique phis are processed.
5753 if (IsNonSchedulableWithParentPhiNode) {
5754 const TreeEntry *ParentTE =
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5756 Value *User = ParentTE->Scalars[Lane];
5757 if (!ParentsUniqueUsers.insert(User).second) {
5758 It = std::find(std::next(It),
5759 Bundle->getTreeEntry()->Scalars.end(), In);
5760 continue;
5761 }
5762 }
5763
5764 for (unsigned OpIdx :
5765 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5766 if (auto *I = dyn_cast<Instruction>(
5767 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5768 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5769 << *I << "\n");
5770 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5771 }
5772 // If parent node is schedulable, it will be handled correctly.
5773 if (Bundle->getTreeEntry()->isCopyableElement(In))
5774 break;
5775 It = std::find(std::next(It),
5776 Bundle->getTreeEntry()->Scalars.end(), In);
5777 } while (It != Bundle->getTreeEntry()->Scalars.end());
5778 }
5779 } else {
5780 // If BundleMember is a stand-alone instruction, no operand reordering
5781 // has taken place, so we directly access its operands.
5782 for (Use &U : BundleMember->getInst()->operands()) {
5783 if (auto *I = dyn_cast<Instruction>(U.get())) {
5785 << "SLP: check for readiness (def): " << *I << "\n");
5786 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5787 }
5788 }
5789 }
5790 // Handle the memory dependencies.
5791 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5792 if (!SD)
5793 return;
5794 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5795 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5796 if (!VisitedMemory.insert(MemoryDep).second)
5797 continue;
5798 // There are no more unscheduled dependencies after decrementing,
5799 // so we can put the dependent instruction into the ready list.
5800 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5801 << *MemoryDep << "\n");
5802 DecrUnsched(MemoryDep);
5803 }
5804 // Handle the control dependencies.
5805 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5806 for (ScheduleData *Dep : SD->getControlDependencies()) {
5807 if (!VisitedControl.insert(Dep).second)
5808 continue;
5809 // There are no more unscheduled dependencies after decrementing,
5810 // so we can put the dependent instruction into the ready list.
5812 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5813 DecrUnsched(Dep, /*IsControl=*/true);
5814 }
5815 };
5816 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5817 SD->setScheduled(/*Scheduled=*/true);
5818 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5819 SmallVector<ScheduleBundle *, 1> Bundles;
5820 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5821 Instruction *In = SD->getInst();
5822 if (R.isVectorized(In)) {
5823 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5824 for (TreeEntry *TE : Entries) {
5826 In->getNumOperands() != TE->getNumOperands())
5827 continue;
5828 auto &BundlePtr =
5829 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5830 BundlePtr->setTreeEntry(TE);
5831 BundlePtr->add(SD);
5832 Bundles.push_back(BundlePtr.get());
5833 }
5834 }
5835 ProcessBundleMember(SD, Bundles);
5836 } else {
5837 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5838 Bundle.setScheduled(/*Scheduled=*/true);
5839 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5840 auto AreAllBundlesScheduled =
5841 [&](const ScheduleEntity *SD,
5842 ArrayRef<ScheduleBundle *> SDBundles) {
5843 if (isa<ScheduleCopyableData>(SD))
5844 return true;
5845 return !SDBundles.empty() &&
5846 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5847 return SDBundle->isScheduled();
5848 });
5849 };
5850 for (ScheduleEntity *SD : Bundle.getBundle()) {
5853 SDBundles = getScheduleBundles(SD->getInst());
5854 if (AreAllBundlesScheduled(SD, SDBundles)) {
5855 SD->setScheduled(/*Scheduled=*/true);
5856 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5857 : SDBundles);
5858 }
5859 }
5860 }
5861 }
5862
5863 /// Verify basic self consistency properties of the data structure.
5864 void verify() {
5865 if (!ScheduleStart)
5866 return;
5867
5868 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5869 ScheduleStart->comesBefore(ScheduleEnd) &&
5870 "Not a valid scheduling region?");
5871
5872 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5873 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5874 if (!Bundles.empty()) {
5875 for (ScheduleBundle *Bundle : Bundles) {
5876 assert(isInSchedulingRegion(*Bundle) &&
5877 "primary schedule data not in window?");
5878 Bundle->verify();
5879 }
5880 continue;
5881 }
5882 auto *SD = getScheduleData(I);
5883 if (!SD)
5884 continue;
5885 assert(isInSchedulingRegion(*SD) &&
5886 "primary schedule data not in window?");
5887 SD->verify();
5888 }
5889
5890 assert(all_of(ReadyInsts,
5891 [](const ScheduleEntity *Bundle) {
5892 return Bundle->isReady();
5893 }) &&
5894 "item in ready list not ready?");
5895 }
5896
5897 /// Put all instructions into the ReadyList which are ready for scheduling.
5898 template <typename ReadyListType>
5899 void initialFillReadyList(ReadyListType &ReadyList) {
5900 SmallPtrSet<ScheduleBundle *, 16> Visited;
5901 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5902 ScheduleData *SD = getScheduleData(I);
5903 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5904 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5905 !Bundles.empty()) {
5906 for (ScheduleBundle *Bundle : Bundles) {
5907 if (!Visited.insert(Bundle).second)
5908 continue;
5909 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5910 ReadyList.insert(Bundle);
5911 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5912 << *Bundle << "\n");
5913 }
5914 }
5915 continue;
5916 }
5917 ReadyList.insert(SD);
5919 << "SLP: initially in ready list: " << *SD << "\n");
5920 }
5921 }
5922 }
5923
5924 /// Build a bundle from the ScheduleData nodes corresponding to the
5925 /// scalar instruction for each lane.
5926 /// \param VL The list of scalar instructions.
5927 /// \param S The state of the instructions.
5928 /// \param EI The edge in the SLP graph or the user node/operand number.
5929 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5930 const InstructionsState &S, const EdgeInfo &EI);
5931
5932 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5933 /// cyclic dependencies. This is only a dry-run, no instructions are
5934 /// actually moved at this stage.
5935 /// \returns the scheduling bundle. The returned Optional value is not
5936 /// std::nullopt if \p VL is allowed to be scheduled.
5937 std::optional<ScheduleBundle *>
5938 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5939 const InstructionsState &S, const EdgeInfo &EI);
5940
5941 /// Allocates schedule data chunk.
5942 ScheduleData *allocateScheduleDataChunks();
5943
5944 /// Extends the scheduling region so that V is inside the region.
5945 /// \returns true if the region size is within the limit.
5946 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5947
5948 /// Initialize the ScheduleData structures for new instructions in the
5949 /// scheduling region.
5950 void initScheduleData(Instruction *FromI, Instruction *ToI,
5951 ScheduleData *PrevLoadStore,
5952 ScheduleData *NextLoadStore);
5953
5954 /// Updates the dependency information of a bundle and of all instructions/
5955 /// bundles which depend on the original bundle.
5956 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5957 BoUpSLP *SLP,
5958 ArrayRef<ScheduleData *> ControlDeps = {});
5959
5960  /// Sets all instructions in the scheduling region to un-scheduled.
5961 void resetSchedule();
5962
5963 BasicBlock *BB;
5964
5965 /// Simple memory allocation for ScheduleData.
5966  std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5967
5968 /// The size of a ScheduleData array in ScheduleDataChunks.
5969 int ChunkSize;
5970
5971 /// The allocator position in the current chunk, which is the last entry
5972 /// of ScheduleDataChunks.
5973 int ChunkPos;
5974
5975 /// Attaches ScheduleData to Instruction.
5976 /// Note that the mapping survives during all vectorization iterations, i.e.
5977 /// ScheduleData structures are recycled.
5978 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5979
5980 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5981 /// number) and the operand instruction, represented as copyable element.
5982 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5983 std::unique_ptr<ScheduleCopyableData>>
5984 ScheduleCopyableDataMap;
5985
5986 /// Represents mapping between instruction and all related
5987  /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5988 /// element). The SLP tree may contain several representations of the same
5989 /// instruction.
5990 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5991 ScheduleCopyableDataMapByInst;
5992
5993 /// Represents mapping between user value and operand number, the operand
5994 /// value and all related ScheduleCopyableData. The relation is 1:n, because
5995  /// the same user may reference the same operand in different tree entries
5996  /// and the operand may be modeled by different copyable data elements.
5997 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5998                SmallVector<ScheduleCopyableData *>>
5999      ScheduleCopyableDataMapByInstUser;
6000
6001 /// Represents mapping between instruction and all related
6002 /// ScheduleCopyableData. It represents the mapping between the actual
6003 /// instruction and the last copyable data element in the chain. E.g., if
6004 /// the graph models the following instructions:
6005 /// %0 = non-add instruction ...
6006 /// ...
6007 /// %4 = add %3, 1
6008 /// %5 = add %4, 1
6009 /// %6 = insertelement poison, %0, 0
6010 /// %7 = insertelement %6, %5, 1
6011 /// And the graph is modeled as:
6012 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6013 /// -> [1, 0] -> [%1, 0]
6014 ///
6015 /// this map will map %0 only to the copyable element <1>, which is the last
6016 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
6017 /// keep the map to <0>, not the %0.
6018 SmallDenseMap<const Instruction *,
6019 SmallSetVector<ScheduleCopyableData *, 4>>
6020 ScheduleCopyableDataMapByUsers;
6021
6022 /// Attaches ScheduleBundle to Instruction.
6023 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6024 ScheduledBundles;
6025 /// The list of ScheduleBundles.
6026 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6027
6028 /// The ready-list for scheduling (only used for the dry-run).
6029 SetVector<ScheduleEntity *> ReadyInsts;
6030
6031 /// The first instruction of the scheduling region.
6032 Instruction *ScheduleStart = nullptr;
6033
6034 /// The first instruction _after_ the scheduling region.
6035 Instruction *ScheduleEnd = nullptr;
6036
6037 /// The first memory accessing instruction in the scheduling region
6038 /// (can be null).
6039 ScheduleData *FirstLoadStoreInRegion = nullptr;
6040
6041 /// The last memory accessing instruction in the scheduling region
6042 /// (can be null).
6043 ScheduleData *LastLoadStoreInRegion = nullptr;
6044
6045 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6046 /// region? Used to optimize the dependence calculation for the
6047 /// common case where there isn't.
6048 bool RegionHasStackSave = false;
6049
6050 /// The current size of the scheduling region.
6051 int ScheduleRegionSize = 0;
6052
6053 /// The maximum size allowed for the scheduling region.
6054 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6055
6056 /// The ID of the scheduling region. For a new vectorization iteration this
6057  /// is incremented, which "removes" all ScheduleData from the region.
6058 /// Make sure that the initial SchedulingRegionID is greater than the
6059 /// initial SchedulingRegionID in ScheduleData (which is 0).
6060 int SchedulingRegionID = 1;
6061 };
6062
6063 /// Attaches the BlockScheduling structures to basic blocks.
6064 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6065
6066 /// Performs the "real" scheduling. Done before vectorization is actually
6067 /// performed in a basic block.
6068 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6069
6070 /// List of users to ignore during scheduling and that don't need extracting.
6071 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6072
6073 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6074 /// sorted SmallVectors of unsigned.
6075 struct OrdersTypeDenseMapInfo {
6076 static OrdersType getEmptyKey() {
6077 OrdersType V;
6078 V.push_back(~1U);
6079 return V;
6080 }
6081
6082 static OrdersType getTombstoneKey() {
6083 OrdersType V;
6084 V.push_back(~2U);
6085 return V;
6086 }
6087
6088 static unsigned getHashValue(const OrdersType &V) {
6089 return static_cast<unsigned>(hash_combine_range(V));
6090 }
6091
6092 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6093 return LHS == RHS;
6094 }
6095 };
6096
6097 // Analysis and block reference.
6098 Function *F;
6099 ScalarEvolution *SE;
6100 TargetTransformInfo *TTI;
6101 TargetLibraryInfo *TLI;
6102 LoopInfo *LI;
6103 DominatorTree *DT;
6104 AssumptionCache *AC;
6105 DemandedBits *DB;
6106 const DataLayout *DL;
6107 OptimizationRemarkEmitter *ORE;
6108
6109 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6110 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6111
6112 /// Instruction builder to construct the vectorized tree.
6113 IRBuilder<TargetFolder> Builder;
6114
6115 /// A map of scalar integer values to the smallest bit width with which they
6116 /// can legally be represented. The values map to (width, signed) pairs,
6117 /// where "width" indicates the minimum bit width and "signed" is True if the
6118 /// value must be signed-extended, rather than zero-extended, back to its
6119 /// original width.
6120 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6121
6122 /// Final size of the reduced vector, if the current graph represents the
6123 /// input for the reduction and it was possible to narrow the size of the
6124 /// reduction.
6125 unsigned ReductionBitWidth = 0;
6126
6127 /// Canonical graph size before the transformations.
6128 unsigned BaseGraphSize = 1;
6129
6130 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6131 /// type sizes, used in the tree.
6132 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6133
6134  /// Indices of the vectorized nodes, which are supposed to be the roots of the new
6135 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6136 DenseSet<unsigned> ExtraBitWidthNodes;
6137};
6138
6139template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6140  using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6141  using SecondInfo = DenseMapInfo<unsigned>;
6142  static BoUpSLP::EdgeInfo getEmptyKey() {
6143    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6144                             SecondInfo::getEmptyKey());
6145  }
6146
6147  static BoUpSLP::EdgeInfo getTombstoneKey() {
6148    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6149                             SecondInfo::getTombstoneKey());
6150  }
6151
6152 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6153 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6154 SecondInfo::getHashValue(Val.EdgeIdx));
6155 }
6156
6157 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6158 const BoUpSLP::EdgeInfo &RHS) {
6159 return LHS == RHS;
6160 }
6161};
6162
6163template <> struct llvm::GraphTraits<BoUpSLP *> {
6164 using TreeEntry = BoUpSLP::TreeEntry;
6165
6166 /// NodeRef has to be a pointer per the GraphWriter.
6167  using NodeRef = TreeEntry *;
6168
6169 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6170
6171 /// Add the VectorizableTree to the index iterator to be able to return
6172 /// TreeEntry pointers.
6174 : public iterator_adaptor_base<
6175 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6177
6181
6182 NodeRef operator*() { return I->UserTE; }
6183 };
6184
6185  static NodeRef getEntryNode(BoUpSLP &R) {
6186    return R.VectorizableTree[0].get();
6187 }
6188
6189  static ChildIteratorType child_begin(NodeRef N) {
6190    return {&N->UserTreeIndex, N->Container};
6191 }
6192
6193  static ChildIteratorType child_end(NodeRef N) {
6194    return {&N->UserTreeIndex + 1, N->Container};
6195 }
6196
6197 /// For the node iterator we just need to turn the TreeEntry iterator into a
6198 /// TreeEntry* iterator so that it dereferences to NodeRef.
6199  class nodes_iterator {
6200    using ItTy = ContainerTy::iterator;
6201 ItTy It;
6202
6203 public:
6204 nodes_iterator(const ItTy &It2) : It(It2) {}
6205 NodeRef operator*() { return It->get(); }
6206    nodes_iterator operator++() {
6207      ++It;
6208 return *this;
6209 }
6210 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6211 };
6212
6213  static nodes_iterator nodes_begin(BoUpSLP *R) {
6214    return nodes_iterator(R->VectorizableTree.begin());
6215 }
6216
6217  static nodes_iterator nodes_end(BoUpSLP *R) {
6218    return nodes_iterator(R->VectorizableTree.end());
6219 }
6220
6221 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6222};
6223
6224template <>
6225struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6226  using TreeEntry = BoUpSLP::TreeEntry;
6227
6228 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6229
6230 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6231 std::string Str;
6232 raw_string_ostream OS(Str);
6233 OS << Entry->Idx << ".\n";
6234 if (isSplat(Entry->Scalars))
6235 OS << "<splat> ";
6236 for (auto *V : Entry->Scalars) {
6237 OS << *V;
6238 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6239 return EU.Scalar == V;
6240 }))
6241 OS << " <extract>";
6242 OS << "\n";
6243 }
6244 return Str;
6245 }
6246
6247 static std::string getNodeAttributes(const TreeEntry *Entry,
6248 const BoUpSLP *) {
6249 if (Entry->isGather())
6250 return "color=red";
6251 if (Entry->State == TreeEntry::ScatterVectorize ||
6252 Entry->State == TreeEntry::StridedVectorize ||
6253 Entry->State == TreeEntry::CompressVectorize)
6254 return "color=blue";
6255 return "";
6256 }
6257};
6258
6259BoUpSLP::~BoUpSLP() {
6260  SmallVector<WeakTrackingVH> DeadInsts;
6261  for (auto *I : DeletedInstructions) {
6262 if (!I->getParent()) {
6263      // Temporarily insert instructions back to erase them from the parent
6264      // and from memory later.
6265 if (isa<PHINode>(I))
6266 // Phi nodes must be the very first instructions in the block.
6267 I->insertBefore(F->getEntryBlock(),
6268 F->getEntryBlock().getFirstNonPHIIt());
6269 else
6270 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6271 continue;
6272 }
6273 for (Use &U : I->operands()) {
6274 auto *Op = dyn_cast<Instruction>(U.get());
6275 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6276          wouldInstructionBeTriviallyDead(Op, TLI))
6277        DeadInsts.emplace_back(Op);
6278 }
6279 I->dropAllReferences();
6280 }
6281 for (auto *I : DeletedInstructions) {
6282 assert(I->use_empty() &&
6283 "trying to erase instruction with users.");
6284 I->eraseFromParent();
6285 }
6286
6287 // Cleanup any dead scalar code feeding the vectorized instructions
6288  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6289
6290#ifdef EXPENSIVE_CHECKS
6291 // If we could guarantee that this call is not extremely slow, we could
6292 // remove the ifdef limitation (see PR47712).
6293 assert(!verifyFunction(*F, &dbgs()));
6294#endif
6295}
6296
6297/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6298/// contains the original mask for the scalars reused in the node. The
6299/// procedure transforms this mask in accordance with the given \p Mask.
6300static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6301  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6302 "Expected non-empty mask.");
6303 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6304 Prev.swap(Reuses);
6305 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6306 if (Mask[I] != PoisonMaskElem)
6307 Reuses[Mask[I]] = Prev[I];
6308}
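// Illustrative example (hypothetical values, not taken from a real node):
//   Reuses = {3, 2, 1, 0}, Mask = {1, 0, 3, 2}
// Each previous entry Prev[I] moves to position Mask[I]:
//   Reuses[1] = 3, Reuses[0] = 2, Reuses[3] = 1, Reuses[2] = 0
// so the reordered mask is Reuses = {2, 3, 0, 1}.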
6309
6310/// Reorders the given \p Order according to the given \p Mask. \p Order is
6311/// the original order of the scalars. The procedure transforms the provided
6312/// order in accordance with the given \p Mask. If the resulting \p Order is an
6313/// identity order, \p Order is cleared.
6314static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6315                         bool BottomOrder = false) {
6316 assert(!Mask.empty() && "Expected non-empty mask.");
6317 unsigned Sz = Mask.size();
6318 if (BottomOrder) {
6319 SmallVector<unsigned> PrevOrder;
6320 if (Order.empty()) {
6321 PrevOrder.resize(Sz);
6322 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6323 } else {
6324 PrevOrder.swap(Order);
6325 }
6326 Order.assign(Sz, Sz);
6327 for (unsigned I = 0; I < Sz; ++I)
6328 if (Mask[I] != PoisonMaskElem)
6329 Order[I] = PrevOrder[Mask[I]];
6330 if (all_of(enumerate(Order), [&](const auto &Data) {
6331 return Data.value() == Sz || Data.index() == Data.value();
6332 })) {
6333 Order.clear();
6334 return;
6335 }
6336 fixupOrderingIndices(Order);
6337 return;
6338 }
6339 SmallVector<int> MaskOrder;
6340 if (Order.empty()) {
6341 MaskOrder.resize(Sz);
6342 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6343 } else {
6344 inversePermutation(Order, MaskOrder);
6345 }
6346 reorderReuses(MaskOrder, Mask);
6347 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6348 Order.clear();
6349 return;
6350 }
6351 Order.assign(Sz, Sz);
6352 for (unsigned I = 0; I < Sz; ++I)
6353 if (MaskOrder[I] != PoisonMaskElem)
6354 Order[MaskOrder[I]] = I;
6355 fixupOrderingIndices(Order);
6356}
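// Illustrative example (hypothetical values): with an empty \p Order (identity)
// and Mask = {2, 0, 1, 3}, MaskOrder becomes {1, 2, 0, 3} after reorderReuses,
// which is not an identity mask, so the resulting Order is {2, 0, 1, 3}.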
6357
6358std::optional<BoUpSLP::OrdersType>
6359BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6360 bool TopToBottom, bool IgnoreReorder) {
6361 assert(TE.isGather() && "Expected gather node only.");
6362 // Try to find subvector extract/insert patterns and reorder only such
6363 // patterns.
6364 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6365 Type *ScalarTy = GatheredScalars.front()->getType();
6366 size_t NumScalars = GatheredScalars.size();
6367 if (!isValidElementType(ScalarTy))
6368 return std::nullopt;
6369 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6370 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6371 SmallVector<int> ExtractMask;
6372  SmallVector<int> Mask;
6373  SmallVector<SmallVector<const TreeEntry *>> Entries;
6374  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6375      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6376  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6377      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6378 /*ForOrder=*/true);
6379 // No shuffled operands - ignore.
6380 if (GatherShuffles.empty() && ExtractShuffles.empty())
6381 return std::nullopt;
6382 OrdersType CurrentOrder(NumScalars, NumScalars);
6383 if (GatherShuffles.size() == 1 &&
6384 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6385 Entries.front().front()->isSame(TE.Scalars)) {
6386    // If the fully matched node is covered by a whole-tree rotation, there is
6387    // no need to consider the matching order; the whole tree is rotated instead.
6388 if (TopToBottom)
6389 return std::nullopt;
6390 // No need to keep the order for the same user node.
6391 if (Entries.front().front()->UserTreeIndex.UserTE ==
6392 TE.UserTreeIndex.UserTE)
6393 return std::nullopt;
6394 // No need to keep the order for the matched root node, if it can be freely
6395 // reordered.
6396 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6397 return std::nullopt;
6398 // If shuffling 2 elements only and the matching node has reverse reuses -
6399 // no need to count order, both work fine.
6400 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6401 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6402 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6403 [](const auto &P) {
6404 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6405 }))
6406 return std::nullopt;
6407
6408 // Perfect match in the graph, will reuse the previously vectorized
6409 // node. Cost is 0.
6410 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6411 return CurrentOrder;
6412 }
6413 auto IsSplatMask = [](ArrayRef<int> Mask) {
6414 int SingleElt = PoisonMaskElem;
6415 return all_of(Mask, [&](int I) {
6416 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6417 SingleElt = I;
6418 return I == PoisonMaskElem || I == SingleElt;
6419 });
6420 };
6421 // Exclusive broadcast mask - ignore.
6422 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6423 (Entries.size() != 1 ||
6424 Entries.front().front()->ReorderIndices.empty())) ||
6425 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6426 return std::nullopt;
6427 SmallBitVector ShuffledSubMasks(NumParts);
6428 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6429 ArrayRef<int> Mask, int PartSz, int NumParts,
6430 function_ref<unsigned(unsigned)> GetVF) {
6431 for (int I : seq<int>(0, NumParts)) {
6432 if (ShuffledSubMasks.test(I))
6433 continue;
6434 const int VF = GetVF(I);
6435 if (VF == 0)
6436 continue;
6437 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6438 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6439 // Shuffle of at least 2 vectors - ignore.
6440 if (any_of(Slice, not_equal_to(NumScalars))) {
6441 llvm::fill(Slice, NumScalars);
6442 ShuffledSubMasks.set(I);
6443 continue;
6444 }
6445      // Try to include as many elements from the mask as possible.
6446 int FirstMin = INT_MAX;
6447      bool SecondVecFound = false;
6448 for (int K : seq<int>(Limit)) {
6449 int Idx = Mask[I * PartSz + K];
6450 if (Idx == PoisonMaskElem) {
6451 Value *V = GatheredScalars[I * PartSz + K];
6452 if (isConstant(V) && !isa<PoisonValue>(V)) {
6453 SecondVecFound = true;
6454 break;
6455 }
6456 continue;
6457 }
6458 if (Idx < VF) {
6459 if (FirstMin > Idx)
6460 FirstMin = Idx;
6461 } else {
6462 SecondVecFound = true;
6463 break;
6464 }
6465 }
6466 FirstMin = (FirstMin / PartSz) * PartSz;
6467 // Shuffle of at least 2 vectors - ignore.
6468 if (SecondVecFound) {
6469 llvm::fill(Slice, NumScalars);
6470 ShuffledSubMasks.set(I);
6471 continue;
6472 }
6473 for (int K : seq<int>(Limit)) {
6474 int Idx = Mask[I * PartSz + K];
6475 if (Idx == PoisonMaskElem)
6476 continue;
6477 Idx -= FirstMin;
6478 if (Idx >= PartSz) {
6479 SecondVecFound = true;
6480 break;
6481 }
6482 if (CurrentOrder[I * PartSz + Idx] >
6483 static_cast<unsigned>(I * PartSz + K) &&
6484 CurrentOrder[I * PartSz + Idx] !=
6485 static_cast<unsigned>(I * PartSz + Idx))
6486 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6487 }
6488 // Shuffle of at least 2 vectors - ignore.
6489 if (SecondVecFound) {
6490 llvm::fill(Slice, NumScalars);
6491 ShuffledSubMasks.set(I);
6492 continue;
6493 }
6494 }
6495 };
6496 int PartSz = getPartNumElems(NumScalars, NumParts);
6497 if (!ExtractShuffles.empty())
6498 TransformMaskToOrder(
6499 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6500 if (!ExtractShuffles[I])
6501 return 0U;
6502 unsigned VF = 0;
6503 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6504 for (unsigned Idx : seq<unsigned>(Sz)) {
6505 int K = I * PartSz + Idx;
6506 if (ExtractMask[K] == PoisonMaskElem)
6507 continue;
6508 if (!TE.ReuseShuffleIndices.empty())
6509 K = TE.ReuseShuffleIndices[K];
6510 if (K == PoisonMaskElem)
6511 continue;
6512 if (!TE.ReorderIndices.empty())
6513 K = std::distance(TE.ReorderIndices.begin(),
6514 find(TE.ReorderIndices, K));
6515 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6516 if (!EI)
6517 continue;
6518 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6519 ->getElementCount()
6520 .getKnownMinValue());
6521 }
6522 return VF;
6523 });
6524 // Check special corner case - single shuffle of the same entry.
6525 if (GatherShuffles.size() == 1 && NumParts != 1) {
6526 if (ShuffledSubMasks.any())
6527 return std::nullopt;
6528 PartSz = NumScalars;
6529 NumParts = 1;
6530 }
6531 if (!Entries.empty())
6532 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6533 if (!GatherShuffles[I])
6534 return 0U;
6535 return std::max(Entries[I].front()->getVectorFactor(),
6536 Entries[I].back()->getVectorFactor());
6537 });
6538 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6539 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6540 return std::nullopt;
6541 return std::move(CurrentOrder);
6542}
6543
6544static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6545 const TargetLibraryInfo &TLI,
6546 bool CompareOpcodes = true) {
6547  if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6548      getUnderlyingObject(Ptr2, RecursionMaxDepth))
6549    return false;
6550 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6551 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6552 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6553 (!GEP2 || GEP2->getNumOperands() == 2) &&
6554 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6555 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6556 !CompareOpcodes ||
6557 (GEP1 && GEP2 &&
6558 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6559}
6560
6561/// Calculates minimal alignment as a common alignment.
6562template <typename T>
6564 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6565 for (Value *V : VL)
6566 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6567 return CommonAlignment;
6568}
6569
6570/// Check if \p Order represents reverse order.
6572 assert(!Order.empty() &&
6573 "Order is empty. Please check it before using isReverseOrder.");
6574 unsigned Sz = Order.size();
6575 return all_of(enumerate(Order), [&](const auto &Pair) {
6576 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6577 });
6578}
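// Illustrative example: for Order = {3, 2, 1, 0} this returns true. Entries
// equal to the order size are treated as "don't care", so Order = {3, 4, 1, 0}
// (with size 4) is also accepted as a reverse order.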
6579
6580/// Checks if the provided list of pointers \p PointerOps represents strided
6581/// pointers for type ElemTy. If they are not, nullptr is returned.
6582/// Otherwise, SCEV* of the stride value is returned.
6583/// If `PointerOps` can be rearranged into the following sequence:
6584/// ```
6585/// %x + c_0 * stride,
6586/// %x + c_1 * stride,
6587/// %x + c_2 * stride
6588/// ...
6589/// ```
6590/// where each `c_i` is a constant, then `Coeffs` will contain `c_0, c_1, c_2, ...`
6591/// and the SCEV of the `stride` will be returned.
6592static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6593 const DataLayout &DL, ScalarEvolution &SE,
6594 SmallVectorImpl<unsigned> &SortedIndices,
6595 SmallVectorImpl<int64_t> &Coeffs) {
6596 assert(Coeffs.size() == PointerOps.size() &&
6597 "Coeffs vector needs to be of correct size");
6598  SmallVector<const SCEV *> SCEVs;
6599  const SCEV *PtrSCEVLowest = nullptr;
6600 const SCEV *PtrSCEVHighest = nullptr;
6601 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6602 // addresses).
6603 for (Value *Ptr : PointerOps) {
6604 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6605 if (!PtrSCEV)
6606 return nullptr;
6607 SCEVs.push_back(PtrSCEV);
6608 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6609 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6610 continue;
6611 }
6612 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6613 if (isa<SCEVCouldNotCompute>(Diff))
6614 return nullptr;
6615 if (Diff->isNonConstantNegative()) {
6616 PtrSCEVLowest = PtrSCEV;
6617 continue;
6618 }
6619 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6620 if (isa<SCEVCouldNotCompute>(Diff1))
6621 return nullptr;
6622 if (Diff1->isNonConstantNegative()) {
6623 PtrSCEVHighest = PtrSCEV;
6624 continue;
6625 }
6626 }
6627 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6628 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6629 if (isa<SCEVCouldNotCompute>(Dist))
6630 return nullptr;
6631 int Size = DL.getTypeStoreSize(ElemTy);
6632 auto TryGetStride = [&](const SCEV *Dist,
6633 const SCEV *Multiplier) -> const SCEV * {
6634 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6635 if (M->getOperand(0) == Multiplier)
6636 return M->getOperand(1);
6637 if (M->getOperand(1) == Multiplier)
6638 return M->getOperand(0);
6639 return nullptr;
6640 }
6641 if (Multiplier == Dist)
6642 return SE.getConstant(Dist->getType(), 1);
6643 return SE.getUDivExactExpr(Dist, Multiplier);
6644 };
6645  // Stride = Dist / (element_size * (num_elems - 1)).
6646 const SCEV *Stride = nullptr;
6647 if (Size != 1 || SCEVs.size() > 2) {
6648 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6649 Stride = TryGetStride(Dist, Sz);
6650 if (!Stride)
6651 return nullptr;
6652 }
6653 if (!Stride || isa<SCEVConstant>(Stride))
6654 return nullptr;
6655 // Iterate through all pointers and check if all distances are
6656 // unique multiple of Stride.
6657 using DistOrdPair = std::pair<int64_t, int>;
6658 auto Compare = llvm::less_first();
6659 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6660 int Cnt = 0;
6661 bool IsConsecutive = true;
6662 for (const auto [Idx, PtrSCEV] : enumerate(SCEVs)) {
6663 unsigned Dist = 0;
6664 if (PtrSCEV != PtrSCEVLowest) {
6665 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6666 const SCEV *Coeff = TryGetStride(Diff, Stride);
6667 if (!Coeff)
6668 return nullptr;
6669 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6670 if (!SC || isa<SCEVCouldNotCompute>(SC))
6671 return nullptr;
6672 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6673 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6674 SE.getMulExpr(Stride, SC)))
6675 ->isZero())
6676 return nullptr;
6677 Dist = SC->getAPInt().getZExtValue();
6678 } else {
6679 Coeffs[Idx] = 0;
6680 }
6681 // If the strides are not the same or repeated, we can't vectorize.
6682 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6683 return nullptr;
6684 auto Res = Offsets.emplace(Dist, Cnt);
6685 if (!Res.second)
6686 return nullptr;
6687 // Consecutive order if the inserted element is the last one.
6688 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6689 ++Cnt;
6690 }
6691 if (Offsets.size() != SCEVs.size())
6692 return nullptr;
6693 SortedIndices.clear();
6694 if (!IsConsecutive) {
6695 // Fill SortedIndices array only if it is non-consecutive.
6696 SortedIndices.resize(PointerOps.size());
6697 Cnt = 0;
6698 for (const std::pair<int64_t, int> &Pair : Offsets) {
6699 SortedIndices[Cnt] = Pair.second;
6700 ++Cnt;
6701 }
6702 }
6703 return Stride;
6704}
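// Illustrative example (i8 elements; %p and %s are hypothetical values):
//   PointerOps = { %p, %p + %s, %p + 2 * %s }
// Dist = 2 * %s and element_size * (num_elems - 1) = 2, so Stride = %s.
// The coefficients come out as Coeffs = {0, 1, 2}; the offsets are already
// consecutive, so SortedIndices stays empty and the SCEV of %s is returned.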
6705
6706static std::pair<InstructionCost, InstructionCost>
6707getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6708            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6709 Type *ScalarTy, VectorType *VecTy);
6710
6711/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6712/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
6713/// insert-subvector pattern.
6714static InstructionCost
6715getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6716               VectorType *Tp, ArrayRef<int> Mask = {},
6717               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6718               int Index = 0, VectorType *SubTp = nullptr,
6719               ArrayRef<Value *> Args = {}) {
6720 VectorType *DstTy = Tp;
6721 if (!Mask.empty())
6722 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6723
6724 if (Kind != TTI::SK_PermuteTwoSrc)
6725 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6726 Args);
6727 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6728 int NumSubElts;
6729  if (ShuffleVectorInst::isInsertSubvectorMask(
6730          Mask, NumSrcElts, NumSubElts, Index)) {
6731 if (Index + NumSubElts > NumSrcElts &&
6732 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6733 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6734 TTI::TCK_RecipThroughput, Index, Tp);
6735 }
6736 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6737 Args);
6738}
6739
6740/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6741/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6742/// instead of a scalar.
6743static InstructionCost
6744getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6745                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
6746 bool Extract, TTI::TargetCostKind CostKind,
6747 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6749 "ScalableVectorType is not supported.");
6750 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6751 getNumElements(Ty) &&
6752 "Incorrect usage.");
6753 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6754 assert(SLPReVec && "Only supported by REVEC.");
6755 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6756 // of CreateInsertElement.
6757 unsigned ScalarTyNumElements = VecTy->getNumElements();
6758 InstructionCost Cost = 0;
6759 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6760 if (!DemandedElts[I])
6761 continue;
6762 if (Insert)
6763        Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
6764                                 I * ScalarTyNumElements, VecTy);
6765 if (Extract)
6766        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
6767                                 I * ScalarTyNumElements, VecTy);
6768 }
6769 return Cost;
6770 }
6771 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6772 CostKind, ForPoisonSrc, VL);
6773}
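// Illustrative example (REVEC case, hypothetical types): with ScalarTy =
// <2 x i32>, Ty = <8 x i32> and all four demanded elements set, each requested
// insert or extract is costed as a subvector shuffle of <2 x i32> at element
// offsets 0, 2, 4 and 6 instead of as a scalar insert/extract.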
6774
6775/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6776/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6777static InstructionCost getVectorInstrCost(
6778    const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6779 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6780 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6781 if (Opcode == Instruction::ExtractElement) {
6782 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6783 assert(SLPReVec && "Only supported by REVEC.");
6784 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6785      return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
6786                              cast<VectorType>(Val), {}, CostKind,
6787 Index * VecTy->getNumElements(), VecTy);
6788 }
6789 }
6790 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6791 ScalarUserAndIdx);
6792}
6793
6794/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6795/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6796static InstructionCost getExtractWithExtendCost(
6797    const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6798 VectorType *VecTy, unsigned Index,
6799    TTI::TargetCostKind CostKind) {
6800  if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6801 assert(SLPReVec && "Only supported by REVEC.");
6802 auto *SubTp =
6803 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6804    return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
6805                            Index * ScalarTy->getNumElements(), SubTp) +
6806 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6807 CostKind);
6808 }
6809 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6810}
6811
6812/// Creates subvector insert. Generates shuffle using \p Generator or
6813/// using default shuffle.
6814static Value *createInsertVector(
6815    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6816 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6817 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6818 return Vec;
6819 const unsigned SubVecVF = getNumElements(V->getType());
6820  // Create a shuffle instead; insertvector requires that the index is a
6821  // multiple of the subvector length.
6822 const unsigned VecVF = getNumElements(Vec->getType());
6823  SmallVector<int> Mask(VecVF, PoisonMaskElem);
6824  if (isa<PoisonValue>(Vec)) {
6825 auto *Begin = std::next(Mask.begin(), Index);
6826 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6827 Vec = Builder.CreateShuffleVector(V, Mask);
6828 return Vec;
6829 }
6830 std::iota(Mask.begin(), Mask.end(), 0);
6831 std::iota(std::next(Mask.begin(), Index),
6832 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6833 if (Generator)
6834 return Generator(Vec, V, Mask);
6835 // 1. Resize V to the size of Vec.
6836 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6837 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6838 V = Builder.CreateShuffleVector(V, ResizeMask);
6839 // 2. Insert V into Vec.
6840 return Builder.CreateShuffleVector(Vec, V, Mask);
6841}
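// Illustrative example (hypothetical widths): inserting a 4-element V into an
// 8-element non-poison Vec at Index 4 first resizes V with the mask
// {0, 1, 2, 3, poison, poison, poison, poison} and then emits a two-source
// shuffle with the mask {0, 1, 2, 3, 8, 9, 10, 11}, keeping the low half of
// Vec and taking the high half from V.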
6842
6843/// Generates subvector extract using \p Generator or using default shuffle.
6844static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6845                                  unsigned SubVecVF, unsigned Index) {
6846 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6847 std::iota(Mask.begin(), Mask.end(), Index);
6848 return Builder.CreateShuffleVector(Vec, Mask);
6849}
6850
6851/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6852/// with \p Order.
6853/// \return true if the mask represents strided access, false otherwise.
6854static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6855                              ArrayRef<unsigned> Order, Type *ScalarTy,
6856 const DataLayout &DL, ScalarEvolution &SE,
6857 SmallVectorImpl<int> &CompressMask) {
6858 const unsigned Sz = PointerOps.size();
6859 CompressMask.assign(Sz, PoisonMaskElem);
6860  // The first element is always set.
6861 CompressMask[0] = 0;
6862 // Check if the mask represents strided access.
6863 std::optional<unsigned> Stride = 0;
6864 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6865 for (unsigned I : seq<unsigned>(1, Sz)) {
6866 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6867 std::optional<int64_t> OptPos =
6868 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6869 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6870 return false;
6871 unsigned Pos = static_cast<unsigned>(*OptPos);
6872 CompressMask[I] = Pos;
6873 if (!Stride)
6874 continue;
6875 if (*Stride == 0) {
6876 *Stride = Pos;
6877 continue;
6878 }
6879 if (Pos != *Stride * I)
6880 Stride.reset();
6881 }
6882 return Stride.has_value();
6883}
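// Illustrative example (hypothetical element offsets from the first pointer):
// offsets {0, 2, 4, 6} give CompressMask = {0, 2, 4, 6} and the function
// returns true (stride 2); offsets {0, 1, 3, 7} give CompressMask = {0, 1, 3, 7}
// and the function returns false, although the mask can still be used for a
// load + compress shuffle.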
6884
6885/// Checks if the \p VL can be transformed to a (masked)load + compress or
6886/// (masked) interleaved load.
6887static bool isMaskedLoadCompress(
6888    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6889    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6890    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6891    const DominatorTree &DT, const TargetLibraryInfo &TLI,
6892 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6893 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6894 VectorType *&LoadVecTy) {
6895 InterleaveFactor = 0;
6896 Type *ScalarTy = VL.front()->getType();
6897 const size_t Sz = VL.size();
6898 auto *VecTy = getWidenedType(ScalarTy, Sz);
6899  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6900  SmallVector<int> Mask;
6901 if (!Order.empty())
6902 inversePermutation(Order, Mask);
6903 // Check external uses.
6904 for (const auto [I, V] : enumerate(VL)) {
6905 if (AreAllUsersVectorized(V))
6906 continue;
6907 InstructionCost ExtractCost =
6908 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6909 Mask.empty() ? I : Mask[I]);
6910 InstructionCost ScalarCost =
6911 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6912 if (ExtractCost <= ScalarCost)
6913 return false;
6914 }
6915 Value *Ptr0;
6916 Value *PtrN;
6917 if (Order.empty()) {
6918 Ptr0 = PointerOps.front();
6919 PtrN = PointerOps.back();
6920 } else {
6921 Ptr0 = PointerOps[Order.front()];
6922 PtrN = PointerOps[Order.back()];
6923 }
6924 std::optional<int64_t> Diff =
6925 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6926 if (!Diff)
6927 return false;
6928 const size_t MaxRegSize =
6929      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6930          .getFixedValue();
6931 // Check for very large distances between elements.
6932 if (*Diff / Sz >= MaxRegSize / 8)
6933 return false;
6934 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6935 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6936 Align CommonAlignment = LI->getAlign();
6937 IsMasked = !isSafeToLoadUnconditionally(
6938 Ptr0, LoadVecTy, CommonAlignment, DL,
6939 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6940 &TLI);
6941 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6942 LI->getPointerAddressSpace()))
6943 return false;
6944 // TODO: perform the analysis of each scalar load for better
6945 // safe-load-unconditionally analysis.
6946 bool IsStrided =
6947 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6948 assert(CompressMask.size() >= 2 && "At least two elements are required");
6949 SmallVector<Value *> OrderedPointerOps(PointerOps);
6950 if (!Order.empty())
6951 reorderScalars(OrderedPointerOps, Mask);
6952 auto [ScalarGEPCost, VectorGEPCost] =
6953 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6954 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6955 // The cost of scalar loads.
6956 InstructionCost ScalarLoadsCost =
6957 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6958 [&](InstructionCost C, Value *V) {
6959 return C + TTI.getInstructionCost(cast<Instruction>(V),
6960 CostKind);
6961 }) +
6962 ScalarGEPCost;
6963 APInt DemandedElts = APInt::getAllOnes(Sz);
6964 InstructionCost GatherCost =
6965 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6966 /*Insert=*/true,
6967 /*Extract=*/false, CostKind) +
6968 ScalarLoadsCost;
6969 InstructionCost LoadCost = 0;
6970 if (IsMasked) {
6971 LoadCost = TTI.getMemIntrinsicInstrCost(
6972 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
6973 CommonAlignment,
6974 LI->getPointerAddressSpace()),
6975 CostKind);
6976 } else {
6977 LoadCost =
6978 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6979 LI->getPointerAddressSpace(), CostKind);
6980 }
6981 if (IsStrided && !IsMasked && Order.empty()) {
6982 // Check for potential segmented(interleaved) loads.
6983 VectorType *AlignedLoadVecTy = getWidenedType(
6984 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6985 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6986 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6987 &TLI))
6988 AlignedLoadVecTy = LoadVecTy;
6989 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6990 CommonAlignment,
6991 LI->getPointerAddressSpace())) {
6992 InstructionCost InterleavedCost =
6993 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6994 Instruction::Load, AlignedLoadVecTy,
6995 CompressMask[1], {}, CommonAlignment,
6996 LI->getPointerAddressSpace(), CostKind, IsMasked);
6997 if (InterleavedCost < GatherCost) {
6998 InterleaveFactor = CompressMask[1];
6999 LoadVecTy = AlignedLoadVecTy;
7000 return true;
7001 }
7002 }
7003 }
7004 InstructionCost CompressCost = ::getShuffleCost(
7005 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
7006 if (!Order.empty()) {
7007 SmallVector<int> NewMask(Sz, PoisonMaskElem);
7008 for (unsigned I : seq<unsigned>(Sz)) {
7009 NewMask[I] = CompressMask[Mask[I]];
7010 }
7011 CompressMask.swap(NewMask);
7012 }
7013 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7014 return TotalVecCost < GatherCost;
7015}
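// Illustrative example (hypothetical layout): four i32 loads at element
// offsets {0, 2, 3, 7} from the first pointer can be turned into a single
// (possibly masked) load of 8 consecutive i32 elements followed by a compress
// shuffle with mask {0, 2, 3, 7}; the transformation is reported only if this
// load + shuffle is cheaper than gathering the scalar loads.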
7016
7017/// Checks if the \p VL can be transformed to a (masked)load + compress or
7018/// (masked) interleaved load.
7019static bool
7022 const DataLayout &DL, ScalarEvolution &SE,
7023 AssumptionCache &AC, const DominatorTree &DT,
7024 const TargetLibraryInfo &TLI,
7025 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7026 bool IsMasked;
7027 unsigned InterleaveFactor;
7028 SmallVector<int> CompressMask;
7029 VectorType *LoadVecTy;
7030 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7031 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7032 CompressMask, LoadVecTy);
7033}
7034
7035/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7036/// PointerOps:
7037/// 1. Target with strided load support is detected.
7038/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7039/// potential stride <= MaxProfitableLoadStride and the potential stride is
7040/// power-of-2 (to avoid perf regressions for the very small number of loads)
7041/// and max distance > number of loads, or potential stride is -1.
7042/// 3. The loads are ordered, or number of unordered loads <=
7043/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7044/// to avoid extra costs for very expensive shuffles).
7045/// 4. Any pointer operand is an instruction with the users outside of the
7046/// current graph (for masked gathers extra extractelement instructions
7047/// might be required).
7048bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
7049                            Align Alignment, const int64_t Diff,
7050 const size_t Sz) const {
7051 if (Diff % (Sz - 1) != 0)
7052 return false;
7053
7054 // Try to generate strided load node.
7055 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
7056 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
7057 return !isVectorized(U) && !MustGather.contains(U);
7058 });
7059 });
7060
7061 const uint64_t AbsoluteDiff = std::abs(Diff);
7062 auto *VecTy = getWidenedType(ScalarTy, Sz);
7063 if (IsAnyPointerUsedOutGraph ||
7064 (AbsoluteDiff > Sz &&
7065       (Sz > MinProfitableStridedLoads ||
7066        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7067 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7068 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7069 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7070 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7071 return false;
7072 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7073 return false;
7074 return true;
7075 }
7076 return false;
7077}
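// Illustrative example: for Sz = 4 loads whose element distance from first to
// last is Diff = -3 (consecutive loads in reversed order), the stride is
// Diff / (Sz - 1) = -1, and the node is accepted if the target reports a legal
// strided load/store for the vector type.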
7078
7079bool BoUpSLP::analyzeConstantStrideCandidate(
7080    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7081 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7082 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7083 const size_t Sz = PointerOps.size();
7084 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7085 // Go through `PointerOps` in sorted order and record offsets from
7086 // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
7087 // sortPtrAccesses only validates getPointersDiff for pairs relative to
7088 // PointerOps[0]. This is safe since only offset differences are used below.
7089 for (unsigned I : seq<unsigned>(Sz)) {
7090 Value *Ptr =
7091 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7092 std::optional<int64_t> Offset =
7093 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr, *DL, *SE);
7094 assert(Offset && "sortPtrAccesses should have validated this pointer");
7095 SortedOffsetsFromBase[I] = *Offset;
7096 }
7097
7098 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7099 // ```
7100 // [
7101 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7102  //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7103 // ...
7104 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7105 // GroupSize - 1}), // last group
7106 // ]
7107 // ```
7108  // The distances between consecutive elements within each group should all
7109  // equal `StrideWithinGroup`. The distances between the first elements of
7110  // consecutive groups should all equal `StrideBetweenGroups`.
7111
7112 int64_t StrideWithinGroup =
7113 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7114 // Determine size of the first group. Later we will check that all other
7115 // groups have the same size.
7116 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7117 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7118 StrideWithinGroup;
7119 };
7120 auto Indices = seq<unsigned>(1, Sz);
7121 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7122 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7123
7124 unsigned VecSz = Sz;
7125 Type *NewScalarTy = ScalarTy;
7126
7127 // Quick detour: at this point we can say what the type of strided load would
7128 // be if all the checks pass. Check if this type is legal for the target.
7129 bool NeedsWidening = Sz != GroupSize;
7130 if (NeedsWidening) {
7131 if (Sz % GroupSize != 0)
7132 return false;
7133
7134 if (StrideWithinGroup != 1)
7135 return false;
7136 VecSz = Sz / GroupSize;
7137 NewScalarTy = Type::getIntNTy(
7138 SE->getContext(),
7139 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7140 }
7141
7142 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7143 return false;
7144
7145 int64_t StrideIntVal = StrideWithinGroup;
7146 if (NeedsWidening) {
7147 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7148 // Check that the strides between groups are all the same.
7149 unsigned CurrentGroupStartIdx = GroupSize;
7150 int64_t StrideBetweenGroups =
7151 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7152 StrideIntVal = StrideBetweenGroups;
7153 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7154 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7155 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7156 StrideBetweenGroups)
7157 return false;
7158 }
7159
7160 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7161 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7162 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7163 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7164 return GroupEndIdx - StartIdx == GroupSize;
7165 };
7166 for (unsigned I = 0; I < Sz; I += GroupSize) {
7167 if (!CheckGroup(I))
7168 return false;
7169 }
7170 }
7171
7172 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7173 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
7174 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7175 return true;
7176}
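// Illustrative example (hypothetical offsets, i32 elements): sorted offsets
// {0, 1, 100, 101, 200, 201} form three groups of two consecutive elements
// (StrideWithinGroup = 1, GroupSize = 2, StrideBetweenGroups = 100), so the
// candidate is treated as a <3 x i64> strided load with a constant stride of
// 100 i32 elements between groups.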
7177
7178bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
7179                                       Type *ScalarTy, Align CommonAlignment,
7180 SmallVectorImpl<unsigned> &SortedIndices,
7181 StridedPtrInfo &SPtrInfo) const {
7182 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
7183 // is constant, we partition `PointerOps` sequence into subsequences of
7184 // pointers with the same offset. For each offset we record values from
7185  // `PointerOps` and their indices in `PointerOps`.
7186  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
7187      OffsetToPointerOpIdxMap;
7188 for (auto [Idx, Ptr] : enumerate(PointerOps)) {
7189 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7190 if (!PtrSCEV)
7191 return false;
7192
7193 const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
7194 int64_t Offset = 0;
7195 if (Add) {
7196 // `Offset` is non-zero.
7197 for (int I : seq<int>(Add->getNumOperands())) {
7198 const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
7199 if (!SC)
7200 continue;
7201 Offset = SC->getAPInt().getSExtValue();
7202 break;
7203 }
7204 }
7205 OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
7206 OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
7207 }
7208 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7209
7210 // Quick detour: at this point we can say what the type of strided load would
7211 // be if all the checks pass. Check if this type is legal for the target.
7212 const unsigned Sz = PointerOps.size();
7213 unsigned VecSz = Sz;
7214 Type *NewScalarTy = ScalarTy;
7215 if (NumOffsets > 1) {
7216 if (Sz % NumOffsets != 0)
7217 return false;
7218 VecSz = Sz / NumOffsets;
7219 NewScalarTy = Type::getIntNTy(
7220 SE->getContext(),
7221 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7222 }
7223 FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
7224 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7225 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7226 return false;
7227
7228 // Check if the offsets are contiguous and that each group has the required
7229 // size.
7230 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7231 for (auto [Idx, MapPair] : enumerate(OffsetToPointerOpIdxMap)) {
7232 if (MapPair.second.first.size() != VecSz)
7233 return false;
7234 SortedOffsetsV[Idx] = MapPair.first;
7235 }
7236 sort(SortedOffsetsV);
7237
7238 if (NumOffsets > 1) {
7239 for (int I : seq<int>(1, SortedOffsetsV.size())) {
7240 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
7241 return false;
7242 }
7243 }
7244
7245 // Introduce some notation for the explanations below. Let `PointerOps_j`
7246 // denote the subsequence of `PointerOps` with offsets equal to
7247  // `SortedOffsetsV[j]`. Let `SortedIndices_j` be such that the sequence
7248 // ```
7249 // PointerOps_j[SortedIndices_j[0]],
7250 // PointerOps_j[SortedIndices_j[1]],
7251 // PointerOps_j[SortedIndices_j[2]],
7252 // ...
7253 // ```
7254 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7255 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7256 // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7257 // The entire sorted `PointerOps` looks like this:
7258 // ```
7259 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7260 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7261 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7262 // ...
7263 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7264 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7265 //
7266 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7267 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7268 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7269 // ...
7270 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7271 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7272 //
7273 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7274 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7275 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7276 // ...
7277 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7278 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7279 // ...
7280 // ...
7281 // ...
7282 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7283 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7284 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7285 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7286 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7287 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7288 // ...
7289 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7290 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7291 // ```
7292 // In order to be able to generate a strided load, we need the following
7293 // checks to pass:
7294 //
7295 // (1) for each `PointerOps_j` check that the distance
7296 // between adjacent pointers are all equal to the same value (stride).
7297 // (2) for each `PointerOps_j` check that coefficients calculated by
7298 // `calculateRtStride` are all the same.
7299 //
7300 // As we do that, also calculate SortedIndices. Since we should not modify
7301 // `SortedIndices` unless we know that all the checks succeed, record the
7302  // indices into `SortedIndicesDraft`.
7303 SmallVector<unsigned> SortedIndicesDraft(Sz);
7304
7305 // Given sorted indices for a particular offset (as calculated by
7306 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7307 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7308 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7309 // \param `IndicesInAllPointerOps` vector of indices of the
7310 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7311 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7312 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7313 auto UpdateSortedIndices =
7314 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7315 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
7316 if (SortedIndicesForOffset.empty()) {
7317 SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
7318 std::iota(SortedIndicesForOffset.begin(),
7319 SortedIndicesForOffset.end(), 0);
7320 }
7321 for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
7322 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7323 IndicesInAllPointerOps[Idx];
7324 }
7325 };
7326
7327 int64_t LowestOffset = SortedOffsetsV[0];
7328 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7329
7330 SmallVector<int64_t> Coeffs0(VecSz);
7331 SmallVector<unsigned> SortedIndicesForOffset0;
7332 const SCEV *Stride0 = calculateRtStride(PointerOps0, ScalarTy, *DL, *SE,
7333 SortedIndicesForOffset0, Coeffs0);
7334 if (!Stride0)
7335 return false;
7336 unsigned NumCoeffs0 = Coeffs0.size();
7337 if (NumCoeffs0 * NumOffsets != Sz)
7338 return false;
7339 sort(Coeffs0);
7340
7341 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7342 OffsetToPointerOpIdxMap[LowestOffset].second;
7343 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7344
7345  // Now that we know what the common stride and coefficients have to be,
7346  // check the remaining `PointerOps_j`.
7347 SmallVector<int64_t> Coeffs;
7348 SmallVector<unsigned> SortedIndicesForOffset;
7349 for (int J : seq<int>(1, NumOffsets)) {
7350 Coeffs.clear();
7351 Coeffs.resize(VecSz);
7352 SortedIndicesForOffset.clear();
7353
7354 int64_t Offset = SortedOffsetsV[J];
7355 ArrayRef<Value *> PointerOpsForOffset =
7356 OffsetToPointerOpIdxMap[Offset].first;
7357 ArrayRef<unsigned> IndicesInAllPointerOps =
7358 OffsetToPointerOpIdxMap[Offset].second;
7359 const SCEV *StrideWithinGroup =
7360 calculateRtStride(PointerOpsForOffset, ScalarTy, *DL, *SE,
7361 SortedIndicesForOffset, Coeffs);
7362
7363 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7364 return false;
7365 if (Coeffs.size() != NumCoeffs0)
7366 return false;
7367 sort(Coeffs);
7368 if (Coeffs != Coeffs0)
7369 return false;
7370
7371 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7372 }
7373
7374 SortedIndices.clear();
7375 SortedIndices = SortedIndicesDraft;
7376 SPtrInfo.StrideSCEV = Stride0;
7377 SPtrInfo.Ty = StridedLoadTy;
7378 return true;
7379}
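// Illustrative example (hypothetical values %p and %s, i8 elements): pointers
// of the form %p + j + i * %s with j in {0, 1} and i in {0, 1, 2} split into
// two offset groups; each group has the same runtime stride %s and the same
// sorted coefficients {0, 1, 2}, so the six i8 loads become a <3 x i16>
// strided load whose stride is the SCEV of %s.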
7380
7381BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
7382    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7383 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7384 unsigned *BestVF, bool TryRecursiveCheck) const {
7385 // Check that a vectorized load would load the same memory as a scalar
7386 // load. For example, we don't want to vectorize loads that are smaller
7387 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7388 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7389 // from such a struct, we read/write packed bits disagreeing with the
7390 // unvectorized version.
7391 if (BestVF)
7392 *BestVF = 0;
7393  if (areKnownNonVectorizableLoads(VL))
7394    return LoadsState::Gather;
7395 Type *ScalarTy = VL0->getType();
7396
7397 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7398 return LoadsState::Gather;
7399
7400 // Make sure all loads in the bundle are simple - we can't vectorize
7401 // atomic or volatile loads.
7402 PointerOps.clear();
7403 const size_t Sz = VL.size();
7404 PointerOps.resize(Sz);
7405 auto *POIter = PointerOps.begin();
7406 for (Value *V : VL) {
7407 auto *L = dyn_cast<LoadInst>(V);
7408 if (!L || !L->isSimple())
7409 return LoadsState::Gather;
7410 *POIter = L->getPointerOperand();
7411 ++POIter;
7412 }
7413
7414 Order.clear();
7415 // Check the order of pointer operands or that all pointers are the same.
7416 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7417
7418 auto *VecTy = getWidenedType(ScalarTy, Sz);
7419 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7420 if (!IsSorted) {
7421 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7422 SPtrInfo))
7423      return LoadsState::StridedVectorize;
7424
7425 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7426 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7427 return LoadsState::Gather;
7428
7429 if (!all_of(PointerOps, [&](Value *P) {
7430 return arePointersCompatible(P, PointerOps.front(), *TLI);
7431 }))
7432 return LoadsState::Gather;
7433
7434 } else {
7435 Value *Ptr0;
7436 Value *PtrN;
7437 if (Order.empty()) {
7438 Ptr0 = PointerOps.front();
7439 PtrN = PointerOps.back();
7440 } else {
7441 Ptr0 = PointerOps[Order.front()];
7442 PtrN = PointerOps[Order.back()];
7443 }
7444 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7445 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7446 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
7447 std::optional<int64_t> Diff0 =
7448 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr0, *DL, *SE);
7449 std::optional<int64_t> DiffN =
7450 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, PtrN, *DL, *SE);
7451 assert(Diff0 && DiffN &&
7452 "sortPtrAccesses should have validated these pointers");
7453 int64_t Diff = *DiffN - *Diff0;
7454 // Check that the sorted loads are consecutive.
7455 if (static_cast<uint64_t>(Diff) == Sz - 1)
7456 return LoadsState::Vectorize;
7457 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7458 *TLI, [&](Value *V) {
7459 return areAllUsersVectorized(
7460 cast<Instruction>(V), UserIgnoreList);
7461 }))
7462      return LoadsState::CompressVectorize;
7463    Align Alignment =
7464 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7465 ->getAlign();
7466 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7467 Diff, Ptr0, PtrN, SPtrInfo))
7468      return LoadsState::StridedVectorize;
7469  }
7470 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7471 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7472 return LoadsState::Gather;
7473  // Compare the cost of loads + shuffles against strided/masked gather loads.
7474  // Returns true if the vectorized + shuffles representation is better than
7475  // just gather.
7476 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7477 unsigned *BestVF,
7478 bool ProfitableGatherPointers) {
7479 if (BestVF)
7480 *BestVF = 0;
7481 // Compare masked gather cost and loads + insert subvector costs.
7482 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7483 auto [ScalarGEPCost, VectorGEPCost] =
7484 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7485 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7486 // Estimate the cost of masked gather GEP. If not a splat, roughly
7487 // estimate as a buildvector, otherwise estimate as splat.
7488 APInt DemandedElts = APInt::getAllOnes(Sz);
7489 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7490 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7491 if (static_cast<unsigned>(count_if(
7492 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7493 any_of(PointerOps, [&](Value *V) {
7494 return getUnderlyingObject(V) !=
7495 getUnderlyingObject(PointerOps.front());
7496 }))
7497 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7498 DemandedElts, /*Insert=*/true,
7499 /*Extract=*/false, CostKind);
7500 else
7501 VectorGEPCost +=
7502 getScalarizationOverhead(
7503 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7504 /*Insert=*/true, /*Extract=*/false, CostKind) +
7505 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7506 // The cost of scalar loads.
7507 InstructionCost ScalarLoadsCost =
7508 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7509 [&](InstructionCost C, Value *V) {
7510 return C + TTI.getInstructionCost(
7511 cast<Instruction>(V), CostKind);
7512 }) +
7513 ScalarGEPCost;
7514 // The cost of masked gather.
7515 InstructionCost MaskedGatherCost =
7516 TTI.getMemIntrinsicInstrCost(
7517 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7518 cast<LoadInst>(VL0)->getPointerOperand(),
7519 /*VariableMask=*/false, CommonAlignment),
7520 CostKind) +
7521 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7522 InstructionCost GatherCost =
7523 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7524 /*Insert=*/true,
7525 /*Extract=*/false, CostKind) +
7526 ScalarLoadsCost;
7527 // The list of loads is small, or we already performed a partial check -
7528 // directly compare the masked gather cost and the gather cost.
7529 constexpr unsigned ListLimit = 4;
7530 if (!TryRecursiveCheck || VL.size() < ListLimit)
7531 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
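// The delta MaskedGatherCost - GatherCost is compared against
// -SLPCostThreshold: returning true means the masked gather is not clearly
// cheaper than gathering scalar loads, so the caller falls back to a gather
// node instead of LoadsState::ScatterVectorize.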
7532
7533 // FIXME: The following code has not been updated for non-power-of-2
7534 // vectors (and not whole registers). The splitting logic here does not
7535 // cover the original vector if the vector factor is not a power of two.
7536 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7537 return false;
7538
7539 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7540 unsigned MinVF = getMinVF(2 * Sz);
7541 DemandedElts.clearAllBits();
7542 // Iterate through possible vectorization factors and check if vectorized +
7543 // shuffles is better than just gather.
7544 for (unsigned VF =
7545 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7546 VF >= MinVF;
7547 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7548 SmallVector<LoadsState> States;
7549 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7550 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7551 SmallVector<unsigned> Order;
7552 SmallVector<Value *> PointerOps;
7553 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7554 PointerOps, SPtrInfo, BestVF,
7555 /*TryRecursiveCheck=*/false);
7556 // Check that the sorted loads are consecutive.
7557 if (LS == LoadsState::Gather) {
7558 if (BestVF) {
7559 DemandedElts.setAllBits();
7560 break;
7561 }
7562 DemandedElts.setBits(Cnt, Cnt + VF);
7563 continue;
7564 }
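// E.g. with VL.size() == 8 and VF == 4, a gathered second slice sets bits
// 4..7; those lanes are costed below as scalar loads plus insertelements.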
7565 // If reordering is needed - consider it as a high-cost masked gather for now.
7566 if ((LS == LoadsState::Vectorize ||
7567 LS == LoadsState::StridedVectorize ||
7568 LS == LoadsState::CompressVectorize) &&
7569 !Order.empty() && !isReverseOrder(Order))
7570 LS = LoadsState::ScatterVectorize;
7571 States.push_back(LS);
7572 }
7573 if (DemandedElts.isAllOnes())
7574 // All loads gathered - try smaller VF.
7575 continue;
7576 // Can be vectorized later as a series of loads/insertelements.
7577 InstructionCost VecLdCost = 0;
7578 if (!DemandedElts.isZero()) {
7579 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7580 /*Insert=*/true,
7581 /*Extract=*/false, CostKind) +
7582 ScalarGEPCost;
7583 for (unsigned Idx : seq<unsigned>(VL.size()))
7584 if (DemandedElts[Idx])
7585 VecLdCost +=
7586 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7587 }
7588 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7589 for (auto [I, LS] : enumerate(States)) {
7590 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7591 InstructionCost VectorGEPCost =
7592 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7593 ? 0
7594 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7595 LI0->getPointerOperand(),
7596 Instruction::GetElementPtr, CostKind, ScalarTy,
7597 SubVecTy)
7598 .second;
7599 if (LS == LoadsState::ScatterVectorize) {
7600 if (static_cast<unsigned>(
7601 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7602 PointerOps.size() - 1 ||
7603 any_of(PointerOps, [&](Value *V) {
7604 return getUnderlyingObject(V) !=
7605 getUnderlyingObject(PointerOps.front());
7606 }))
7607 VectorGEPCost += getScalarizationOverhead(
7608 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7609 /*Insert=*/true, /*Extract=*/false, CostKind);
7610 else
7611 VectorGEPCost +=
7613 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7614 /*Insert=*/true, /*Extract=*/false, CostKind) +
7615 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7616 CostKind);
7617 }
7618 switch (LS) {
7619 case LoadsState::Vectorize:
7620 VecLdCost +=
7621 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7622 LI0->getPointerAddressSpace(), CostKind,
7623 TTI::OperandValueInfo()) +
7624 VectorGEPCost;
7625 break;
7626 case LoadsState::StridedVectorize:
7627 VecLdCost += TTI.getMemIntrinsicInstrCost(
7628 MemIntrinsicCostAttributes(
7629 Intrinsic::experimental_vp_strided_load,
7630 SubVecTy, LI0->getPointerOperand(),
7631 /*VariableMask=*/false, CommonAlignment),
7632 CostKind) +
7633 VectorGEPCost;
7634 break;
7635 case LoadsState::CompressVectorize:
7636 VecLdCost += TTI.getMemIntrinsicInstrCost(
7637 MemIntrinsicCostAttributes(
7638 Intrinsic::masked_load, SubVecTy,
7639 CommonAlignment, LI0->getPointerAddressSpace()),
7640 CostKind) +
7641 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7642 {}, CostKind);
7643 break;
7644 case LoadsState::ScatterVectorize:
7645 VecLdCost += TTI.getMemIntrinsicInstrCost(
7646 MemIntrinsicCostAttributes(
7647 Intrinsic::masked_gather, SubVecTy,
7648 LI0->getPointerOperand(),
7649 /*VariableMask=*/false, CommonAlignment),
7650 CostKind) +
7651 VectorGEPCost;
7652 break;
7653 case LoadsState::Gather:
7654 // Gathers are already calculated - ignore.
7655 continue;
7656 }
7657 SmallVector<int> ShuffleMask(VL.size());
7658 for (int Idx : seq<int>(0, VL.size()))
7659 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
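// E.g. with VL.size() == 8, VF == 4 and I == 1 the mask is
// <0, 1, 2, 3, 8, 9, 10, 11>: the freshly vectorized slice is inserted into
// the upper lanes of the accumulated vector.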
7660 if (I > 0)
7661 VecLdCost +=
7662 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7663 CostKind, I * VF, SubVecTy);
7664 }
7665 // If masked gather cost is higher - better to vectorize, so
7666 // consider it as a gather node. It will be better estimated
7667 // later.
7668 if (MaskedGatherCost >= VecLdCost &&
7669 VecLdCost - GatherCost < -SLPCostThreshold) {
7670 if (BestVF)
7671 *BestVF = VF;
7672 return true;
7673 }
7674 }
7675 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7676 };
7677 // TODO: need to improve analysis of the pointers, if not all of them are
7678 // GEPs or have > 2 operands, we end up with a gather node, which just
7679 // increases the cost.
7680 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7681 bool ProfitableGatherPointers =
7682 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7683 return L->isLoopInvariant(V);
7684 })) <= Sz / 2;
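// I.e. pointers are treated as profitable for a masked gather only when the
// bundle sits inside a loop and at most half of them are loop-invariant; in
// that case CheckForShuffledLoads does not add the vector GEP cost on top of
// the masked gather cost.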
7685 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7686 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7687 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7688 (GEP && GEP->getNumOperands() == 2 &&
7689 isa<Constant, Instruction>(GEP->getOperand(1)));
7690 })) {
7691 // Check if potential masked gather can be represented as series
7692 // of loads + insertsubvectors.
7693 // If masked gather cost is higher - better to vectorize, so
7694 // consider it as a gather node. It will be better estimated
7695 // later.
7696 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7697 ProfitableGatherPointers))
7698 return LoadsState::ScatterVectorize;
7699 }
7700
7701 return LoadsState::Gather;
7702}
7703
7704 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7705 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7706 const DataLayout &DL, ScalarEvolution &SE,
7707 SmallVectorImpl<unsigned> &SortedIndices) {
7708 assert(
7709 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7710 "Expected list of pointer operands.");
7711 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7712 // Ptr into, sort and return the sorted indices with values next to one
7713 // another.
7714 SmallMapVector<
7715 std::pair<BasicBlock *, Value *>,
7716 SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
7717 Bases;
7718 Bases
7719 .try_emplace(std::make_pair(
7720 BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
7721 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7722
7723 SortedIndices.clear();
7724 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7725 auto Key = std::make_pair(BBs[Cnt + 1],
7726 getUnderlyingObject(Ptr, RecursionMaxDepth));
7727 bool Found = any_of(Bases.try_emplace(Key).first->second,
7728 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7729 std::optional<int64_t> Diff =
7730 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7731 ElemTy, Ptr, DL, SE,
7732 /*StrictCheck=*/true);
7733 if (!Diff)
7734 return false;
7735
7736 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7737 return true;
7738 });
7739
7740 if (!Found) {
7741 // If we haven't found enough to usefully cluster, return early.
7742 if (Bases.size() > VL.size() / 2 - 1)
7743 return false;
7744
7745 // Not found already - add a new Base
7746 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7747 }
7748 }
7749
7750 if (Bases.size() == VL.size())
7751 return false;
7752
7753 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7754 Bases.front().second.size() == VL.size()))
7755 return false;
7756
7757 // For each of the bases sort the pointers by Offset and check if any of the
7758 // bases become consecutively allocated.
7759 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7760 SmallPtrSet<Value *, 13> FirstPointers;
7761 SmallPtrSet<Value *, 13> SecondPointers;
7762 Value *P1 = Ptr1;
7763 Value *P2 = Ptr2;
7764 unsigned Depth = 0;
7765 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7766 if (P1 == P2 || Depth > RecursionMaxDepth)
7767 return false;
7768 FirstPointers.insert(P1);
7769 SecondPointers.insert(P2);
7770 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7771 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7772 ++Depth;
7773 }
7774 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7775 "Unable to find matching root.");
7776 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7777 };
7778 for (auto &Base : Bases) {
7779 for (auto &Vec : Base.second) {
7780 if (Vec.size() > 1) {
7781 stable_sort(Vec, [](const auto &X, const auto &Y) { return std::get<1>(X) < std::get<1>(Y); });
7782 int64_t InitialOffset = std::get<1>(Vec[0]);
7783 bool AnyConsecutive =
7784 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7785 return std::get<1>(P.value()) ==
7786 int64_t(P.index()) + InitialOffset;
7787 });
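// E.g. sorted offsets <10, 11, 12> relative to the base are consecutive,
// while <10, 12, 13> are not and make the whole clustering attempt fail.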
7788 // Fill SortedIndices array only if it looks worth-while to sort the
7789 // ptrs.
7790 if (!AnyConsecutive)
7791 return false;
7792 }
7793 }
7794 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7795 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7796 });
7797 }
7798
7799 for (auto &T : Bases)
7800 for (const auto &Vec : T.second)
7801 for (const auto &P : Vec)
7802 SortedIndices.push_back(std::get<2>(P));
7803
7804 assert(SortedIndices.size() == VL.size() &&
7805 "Expected SortedIndices to be the size of VL");
7806 return true;
7807}
7808
7809std::optional<BoUpSLP::OrdersType>
7810BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7811 assert(TE.isGather() && "Expected gather node only.");
7812 Type *ScalarTy = TE.Scalars[0]->getType();
7813
7814 SmallVector<Value *> Ptrs;
7815 Ptrs.reserve(TE.Scalars.size());
7816 SmallVector<BasicBlock *> BBs;
7817 BBs.reserve(TE.Scalars.size());
7818 for (Value *V : TE.Scalars) {
7819 auto *L = dyn_cast<LoadInst>(V);
7820 if (!L || !L->isSimple())
7821 return std::nullopt;
7822 Ptrs.push_back(L->getPointerOperand());
7823 BBs.push_back(L->getParent());
7824 }
7825
7826 BoUpSLP::OrdersType Order;
7827 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7828 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7829 return std::move(Order);
7830 return std::nullopt;
7831}
7832
7833/// Check if two insertelement instructions are from the same buildvector.
7834 static bool areTwoInsertFromSameBuildVector(
7835 InsertElementInst *VU, InsertElementInst *V,
7836 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7837 // Instructions must be from the same basic blocks.
7838 if (VU->getParent() != V->getParent())
7839 return false;
7840 // Checks if 2 insertelements are from the same buildvector.
7841 if (VU->getType() != V->getType())
7842 return false;
7843 // Multiple used inserts are separate nodes.
7844 if (!VU->hasOneUse() && !V->hasOneUse())
7845 return false;
7846 auto *IE1 = VU;
7847 auto *IE2 = V;
7848 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7849 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7850 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7851 return false;
7852 // Go through the vector operand of insertelement instructions trying to find
7853 // either VU as the original vector for IE2 or V as the original vector for
7854 // IE1.
7855 SmallBitVector ReusedIdx(
7856 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7857 bool IsReusedIdx = false;
7858 do {
7859 if (IE2 == VU && !IE1)
7860 return VU->hasOneUse();
7861 if (IE1 == V && !IE2)
7862 return V->hasOneUse();
7863 if (IE1 && IE1 != V) {
7864 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7865 IsReusedIdx |= ReusedIdx.test(Idx1);
7866 ReusedIdx.set(Idx1);
7867 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7868 IE1 = nullptr;
7869 else
7870 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7871 }
7872 if (IE2 && IE2 != VU) {
7873 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7874 IsReusedIdx |= ReusedIdx.test(Idx2);
7875 ReusedIdx.set(Idx2);
7876 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7877 IE2 = nullptr;
7878 else
7879 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7880 }
7881 } while (!IsReusedIdx && (IE1 || IE2));
7882 return false;
7883}
7884
7885/// Checks if the specified instruction \p I is an alternate operation for
7886/// the given \p MainOp and \p AltOp instructions.
7887static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7888 Instruction *AltOp,
7889 const TargetLibraryInfo &TLI);
7890
7891std::optional<BoUpSLP::OrdersType>
7892BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7893 bool IgnoreReorder) {
7894 // No need to reorder if we need to shuffle reuses - the node still needs
7895 // to be shuffled anyway.
7896 if (!TE.ReuseShuffleIndices.empty()) {
7897 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7898 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7899 "Reshuffling scalars not yet supported for nodes with padding");
7900
7901 if (isSplat(TE.Scalars))
7902 return std::nullopt;
7903 // Check if reuse shuffle indices can be improved by reordering.
7904 // For this, check that reuse mask is "clustered", i.e. each scalar values
7905 // is used once in each submask of size <number_of_scalars>.
7906 // Example: 4 scalar values.
7907 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7908 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7909 // element 3 is used twice in the second submask.
7910 unsigned Sz = TE.Scalars.size();
7911 if (TE.isGather()) {
7912 if (std::optional<OrdersType> CurrentOrder =
7913 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7914 SmallVector<int> Mask;
7915 fixupOrderingIndices(*CurrentOrder);
7916 inversePermutation(*CurrentOrder, Mask);
7917 ::addMask(Mask, TE.ReuseShuffleIndices);
7918 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7919 unsigned Sz = TE.Scalars.size();
7920 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7921 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7922 if (Idx != PoisonMaskElem)
7923 Res[Idx + K * Sz] = I + K * Sz;
7924 }
7925 return std::move(Res);
7926 }
7927 }
7928 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7929 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7930 2 * TE.getVectorFactor())) == 1)
7931 return std::nullopt;
7932 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7933 return std::nullopt;
7934 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7935 Sz)) {
7936 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7937 if (TE.ReorderIndices.empty())
7938 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7939 else
7940 inversePermutation(TE.ReorderIndices, ReorderMask);
7941 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7942 unsigned VF = ReorderMask.size();
7943 OrdersType ResOrder(VF, VF);
7944 unsigned NumParts = divideCeil(VF, Sz);
7945 SmallBitVector UsedVals(NumParts);
7946 for (unsigned I = 0; I < VF; I += Sz) {
7947 int Val = PoisonMaskElem;
7948 unsigned UndefCnt = 0;
7949 unsigned Limit = std::min(Sz, VF - I);
7950 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7951 [&](int Idx) {
7952 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7953 Val = Idx;
7954 if (Idx == PoisonMaskElem)
7955 ++UndefCnt;
7956 return Idx != PoisonMaskElem && Idx != Val;
7957 }) ||
7958 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7959 UndefCnt > Sz / 2)
7960 return std::nullopt;
7961 UsedVals.set(Val);
7962 for (unsigned K = 0; K < NumParts; ++K) {
7963 unsigned Idx = Val + Sz * K;
7964 if (Idx < VF && I + K < VF)
7965 ResOrder[Idx] = I + K;
7966 }
7967 }
7968 return std::move(ResOrder);
7969 }
7970 unsigned VF = TE.getVectorFactor();
7971 // Try to build the correct order for extractelement instructions.
7972 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7973 TE.ReuseShuffleIndices.end());
7974 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7975 all_of(TE.Scalars, [Sz](Value *V) {
7976 if (isa<PoisonValue>(V))
7977 return true;
7978 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7979 return Idx && *Idx < Sz;
7980 })) {
7981 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7982 "by BinaryOperator and CastInst.");
7983 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7984 if (TE.ReorderIndices.empty())
7985 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7986 else
7987 inversePermutation(TE.ReorderIndices, ReorderMask);
7988 for (unsigned I = 0; I < VF; ++I) {
7989 int &Idx = ReusedMask[I];
7990 if (Idx == PoisonMaskElem)
7991 continue;
7992 Value *V = TE.Scalars[ReorderMask[Idx]];
7993 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7994 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7995 }
7996 }
7997 // Build the order of VF size; reuses shuffles need to be reordered, and
7998 // they are always of VF size.
7999 OrdersType ResOrder(VF);
8000 std::iota(ResOrder.begin(), ResOrder.end(), 0);
8001 auto *It = ResOrder.begin();
8002 for (unsigned K = 0; K < VF; K += Sz) {
8003 OrdersType CurrentOrder(TE.ReorderIndices);
8004 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
8005 if (SubMask.front() == PoisonMaskElem)
8006 std::iota(SubMask.begin(), SubMask.end(), 0);
8007 reorderOrder(CurrentOrder, SubMask);
8008 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
8009 std::advance(It, Sz);
8010 }
8011 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
8012 return Data.index() == Data.value();
8013 }))
8014 return std::nullopt; // No need to reorder.
8015 return std::move(ResOrder);
8016 }
8017 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8018 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8019 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
8020 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
8021 return std::nullopt;
8022 if (TE.State == TreeEntry::SplitVectorize ||
8023 ((TE.State == TreeEntry::Vectorize ||
8024 TE.State == TreeEntry::StridedVectorize ||
8025 TE.State == TreeEntry::CompressVectorize) &&
8026 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
8027 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
8028 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8029 "Alternate instructions are only supported by "
8030 "BinaryOperator and CastInst.");
8031 return TE.ReorderIndices;
8032 }
8033 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8034 TE.isAltShuffle()) {
8035 assert(TE.ReuseShuffleIndices.empty() &&
8036 "ReuseShuffleIndices should be "
8037 "empty for alternate instructions.");
8038 SmallVector<int> Mask;
8039 TE.buildAltOpShuffleMask(
8040 [&](Instruction *I) {
8041 assert(TE.getMatchingMainOpOrAltOp(I) &&
8042 "Unexpected main/alternate opcode");
8043 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
8044 },
8045 Mask);
8046 const int VF = TE.getVectorFactor();
8047 OrdersType ResOrder(VF, VF);
8048 for (unsigned I : seq<unsigned>(VF)) {
8049 if (Mask[I] == PoisonMaskElem)
8050 continue;
8051 ResOrder[Mask[I] % VF] = I;
8052 }
8053 return std::move(ResOrder);
8054 }
8055 if (!TE.ReorderIndices.empty())
8056 return TE.ReorderIndices;
8057 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8058 if (!TE.ReorderIndices.empty())
8059 return TE.ReorderIndices;
8060
8061 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8062 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
8063 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
8064 continue;
8065 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
8066 if (!II)
8067 continue;
8068 Instruction *BVHead = nullptr;
8069 BasicBlock *BB = II->getParent();
8070 while (II && II->hasOneUse() && II->getParent() == BB) {
8071 BVHead = II;
8072 II = dyn_cast<InsertElementInst>(II->getOperand(0));
8073 }
8074 I = BVHead;
8075 }
8076
8077 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8078 assert(BB1 != BB2 && "Expected different basic blocks.");
8079 if (!DT->isReachableFromEntry(BB1))
8080 return false;
8081 if (!DT->isReachableFromEntry(BB2))
8082 return true;
8083 auto *NodeA = DT->getNode(BB1);
8084 auto *NodeB = DT->getNode(BB2);
8085 assert(NodeA && "Should only process reachable instructions");
8086 assert(NodeB && "Should only process reachable instructions");
8087 assert((NodeA == NodeB) ==
8088 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8089 "Different nodes should have different DFS numbers");
8090 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8091 };
8092 auto PHICompare = [&](unsigned I1, unsigned I2) {
8093 Value *V1 = TE.Scalars[I1];
8094 Value *V2 = TE.Scalars[I2];
8095 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8096 return false;
8097 if (isa<PoisonValue>(V1))
8098 return true;
8099 if (isa<PoisonValue>(V2))
8100 return false;
8101 if (V1->getNumUses() < V2->getNumUses())
8102 return true;
8103 if (V1->getNumUses() > V2->getNumUses())
8104 return false;
8105 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
8106 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
8107 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8108 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8109 FirstUserOfPhi2->getParent());
8110 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
8111 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
8112 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
8113 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
8114 if (IE1 && !IE2)
8115 return true;
8116 if (!IE1 && IE2)
8117 return false;
8118 if (IE1 && IE2) {
8119 if (UserBVHead[I1] && !UserBVHead[I2])
8120 return true;
8121 if (!UserBVHead[I1])
8122 return false;
8123 if (UserBVHead[I1] == UserBVHead[I2])
8124 return getElementIndex(IE1) < getElementIndex(IE2);
8125 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8126 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8127 UserBVHead[I2]->getParent());
8128 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8129 }
8130 if (EE1 && !EE2)
8131 return true;
8132 if (!EE1 && EE2)
8133 return false;
8134 if (EE1 && EE2) {
8135 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
8136 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
8137 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
8138 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
8139 if (!Inst2 && !P2)
8140 return Inst1 || P1;
8141 if (EE1->getOperand(0) == EE2->getOperand(0))
8142 return getElementIndex(EE1) < getElementIndex(EE2);
8143 if (!Inst1 && Inst2)
8144 return false;
8145 if (Inst1 && Inst2) {
8146 if (Inst1->getParent() != Inst2->getParent())
8147 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8148 return Inst1->comesBefore(Inst2);
8149 }
8150 if (!P1 && P2)
8151 return false;
8152 assert(P1 && P2 &&
8153 "Expected either instructions or arguments vector operands.");
8154 return P1->getArgNo() < P2->getArgNo();
8155 }
8156 return false;
8157 };
8158 OrdersType Phis(TE.Scalars.size());
8159 std::iota(Phis.begin(), Phis.end(), 0);
8160 stable_sort(Phis, PHICompare);
8161 if (isIdentityOrder(Phis))
8162 return std::nullopt; // No need to reorder.
8163 return std::move(Phis);
8164 }
8165 if (TE.isGather() &&
8166 (!TE.hasState() || !TE.isAltShuffle() ||
8167 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8168 allSameType(TE.Scalars)) {
8169 // TODO: add analysis of other gather nodes with extractelement
8170 // instructions and other values/instructions, not only undefs.
8171 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8172 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
8173 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
8174 all_of(TE.Scalars, [](Value *V) {
8175 auto *EE = dyn_cast<ExtractElementInst>(V);
8176 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8177 })) {
8178 // Check that gather of extractelements can be represented as
8179 // just a shuffle of a single vector.
8180 OrdersType CurrentOrder;
8181 bool Reuse =
8182 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8183 if (Reuse || !CurrentOrder.empty())
8184 return std::move(CurrentOrder);
8185 }
8186 // If the gather node is <undef, v, .., poison> and
8187 // insertelement poison, v, 0 [+ permute]
8188 // is cheaper than
8189 // insertelement poison, v, n - try to reorder.
8190 // If rotating the whole graph, exclude the permute cost, the whole graph
8191 // might be transformed.
8192 int Sz = TE.Scalars.size();
8193 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
8194 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
8195 const auto *It = find_if_not(TE.Scalars, isConstant);
8196 if (It == TE.Scalars.begin())
8197 return OrdersType();
8198 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
8199 if (It != TE.Scalars.end()) {
8200 OrdersType Order(Sz, Sz);
8201 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8202 Order[Idx] = 0;
8203 fixupOrderingIndices(Order);
8204 SmallVector<int> Mask;
8205 inversePermutation(Order, Mask);
8206 InstructionCost PermuteCost =
8207 TopToBottom
8208 ? 0
8209 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
8210 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8211 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
8212 PoisonValue::get(Ty), *It);
8213 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8214 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
8215 PoisonValue::get(Ty), *It);
8216 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8217 OrdersType Order(Sz, Sz);
8218 Order[Idx] = 0;
8219 return std::move(Order);
8220 }
8221 }
8222 }
8223 if (isSplat(TE.Scalars))
8224 return std::nullopt;
8225 if (TE.Scalars.size() >= 3)
8226 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8227 return Order;
8228 // Check if we can include the order of vectorized loads. For masked gathers
8229 // do extra analysis later, so include such nodes into a special list.
8230 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8231 SmallVector<Value *> PointerOps;
8232 StridedPtrInfo SPtrInfo;
8233 OrdersType CurrentOrder;
8234 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
8235 CurrentOrder, PointerOps, SPtrInfo);
8236 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
8237 Res == LoadsState::CompressVectorize)
8238 return std::move(CurrentOrder);
8239 }
8240 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
8241 // has been audited for correctness with non-power-of-two vectors.
8242 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
8243 if (std::optional<OrdersType> CurrentOrder =
8244 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8245 return CurrentOrder;
8246 }
8247 return std::nullopt;
8248}
8249
8250/// Checks if the given mask is a "clustered" mask with the same clusters of
8251/// size \p Sz, which are not identity submasks.
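/// E.g. for \p Sz == 4 the mask <1, 0, 3, 2, 1, 0, 3, 2> is such a repeated
/// non-identity cluster, while <0, 1, 2, 3, 0, 1, 2, 3> is not.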
8252 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
8253 unsigned Sz) {
8254 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
8255 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
8256 return false;
8257 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8258 ArrayRef<int> Cluster = Mask.slice(I, Sz);
8259 if (Cluster != FirstCluster)
8260 return false;
8261 }
8262 return true;
8263}
8264
8265void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8266 // Reorder reuses mask.
8267 reorderReuses(TE.ReuseShuffleIndices, Mask);
8268 const unsigned Sz = TE.Scalars.size();
8269 // For vectorized nodes and non-clustered reuses, no need to do anything else.
8270 if (!TE.isGather() ||
8271 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8272 Sz) ||
8273 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8274 return;
8275 SmallVector<int> NewMask;
8276 inversePermutation(TE.ReorderIndices, NewMask);
8277 addMask(NewMask, TE.ReuseShuffleIndices);
8278 // Clear reorder since it is going to be applied to the new mask.
8279 TE.ReorderIndices.clear();
8280 // Try to improve gathered nodes with clustered reuses, if possible.
8281 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8282 SmallVector<unsigned> NewOrder(Slice);
8283 inversePermutation(NewOrder, NewMask);
8284 reorderScalars(TE.Scalars, NewMask);
8285 // Fill the reuses mask with the identity submasks.
8286 for (auto *It = TE.ReuseShuffleIndices.begin(),
8287 *End = TE.ReuseShuffleIndices.end();
8288 It != End; std::advance(It, Sz))
8289 std::iota(It, std::next(It, Sz), 0);
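// E.g. for Sz == 4 and a vector factor of 8 the reuse mask ends up as
// <0, 1, 2, 3, 0, 1, 2, 3> once the scalars themselves have been reordered.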
8290}
8291
8292 static void combineOrders(MutableArrayRef<unsigned> Order,
8293 ArrayRef<unsigned> SecondaryOrder) {
8294 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8295 "Expected same size of orders");
8296 size_t Sz = Order.size();
8297 SmallBitVector UsedIndices(Sz);
8298 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8299 if (Order[Idx] != Sz)
8300 UsedIndices.set(Order[Idx]);
8301 }
8302 if (SecondaryOrder.empty()) {
8303 for (unsigned Idx : seq<unsigned>(0, Sz))
8304 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8305 Order[Idx] = Idx;
8306 } else {
8307 for (unsigned Idx : seq<unsigned>(0, Sz))
8308 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8309 !UsedIndices.test(SecondaryOrder[Idx]))
8310 Order[Idx] = SecondaryOrder[Idx];
8311 }
8312}
8313
8316 return false;
8317
8318 constexpr unsigned TinyVF = 2;
8319 constexpr unsigned TinyTree = 10;
8320 constexpr unsigned PhiOpsLimit = 12;
8321 constexpr unsigned GatherLoadsLimit = 2;
8322 if (VectorizableTree.size() <= TinyTree)
8323 return true;
8324 if (VectorizableTree.front()->hasState() &&
8325 !VectorizableTree.front()->isGather() &&
8326 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8327 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8328 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8329 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8330 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8331 VectorizableTree.front()->ReorderIndices.empty()) {
8332 // Check if the tree has only a single store and a single (unordered) load
8333 // node, while the other nodes are phis or geps/binops combined with phis,
8334 // and/or a single gather load node.
8335 if (VectorizableTree.front()->hasState() &&
8336 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8337 VectorizableTree.front()->Scalars.size() == TinyVF &&
8338 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8339 return false;
8340 // Single node which requires reordering - skip.
8341 if (VectorizableTree.front()->hasState() &&
8342 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8343 VectorizableTree.front()->ReorderIndices.empty()) {
8344 const unsigned ReorderedSplitsCnt =
8345 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8346 return TE->State == TreeEntry::SplitVectorize &&
8347 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8348 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8349 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8350 });
8351 if (ReorderedSplitsCnt <= 1 &&
8352 static_cast<unsigned>(count_if(
8353 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8354 return ((!TE->isGather() &&
8355 (TE->ReorderIndices.empty() ||
8356 (TE->UserTreeIndex.UserTE &&
8357 TE->UserTreeIndex.UserTE->State ==
8358 TreeEntry::Vectorize &&
8359 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8360 .empty()))) ||
8361 (TE->isGather() && TE->ReorderIndices.empty() &&
8362 (!TE->hasState() || TE->isAltShuffle() ||
8363 TE->getOpcode() == Instruction::Load ||
8364 TE->getOpcode() == Instruction::ZExt ||
8365 TE->getOpcode() == Instruction::SExt))) &&
8366 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8367 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8368 return !isConstant(V) && isVectorized(V);
8369 }));
8370 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8371 return false;
8372 }
8373 bool HasPhis = false;
8374 bool HasLoad = true;
8375 unsigned GatherLoads = 0;
8376 for (const std::unique_ptr<TreeEntry> &TE :
8377 ArrayRef(VectorizableTree).drop_front()) {
8378 if (TE->State == TreeEntry::SplitVectorize)
8379 continue;
8380 if (!TE->hasState()) {
8381 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8383 continue;
8384 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8386 continue;
8387 return true;
8388 }
8389 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8390 if (!TE->isGather()) {
8391 HasLoad = false;
8392 continue;
8393 }
8394 if (HasLoad)
8395 return true;
8396 ++GatherLoads;
8397 if (GatherLoads >= GatherLoadsLimit)
8398 return true;
8399 }
8400 if (TE->getOpcode() == Instruction::GetElementPtr ||
8401 Instruction::isBinaryOp(TE->getOpcode()))
8402 continue;
8403 if (TE->getOpcode() != Instruction::PHI &&
8404 (!TE->hasCopyableElements() ||
8405 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8406 TE->Scalars.size() / 2))
8407 return true;
8408 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8409 TE->getNumOperands() > PhiOpsLimit)
8410 return false;
8411 HasPhis = true;
8412 }
8413 return !HasPhis;
8414 }
8415 return true;
8416}
8417
8418void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8419 ArrayRef<int> MaskOrder) {
8420 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8421 SmallVector<int> NewMask(getVectorFactor());
8422 SmallVector<int> NewMaskOrder(getVectorFactor());
8423 std::iota(NewMask.begin(), NewMask.end(), 0);
8424 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8425 if (Idx == 0) {
8426 copy(Mask, NewMask.begin());
8427 copy(MaskOrder, NewMaskOrder.begin());
8428 } else {
8429 assert(Idx == 1 && "Expected either 0 or 1 index.");
8430 unsigned Offset = CombinedEntriesWithIndices.back().second;
8431 for (unsigned I : seq<unsigned>(Mask.size())) {
8432 NewMask[I + Offset] = Mask[I] + Offset;
8433 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8434 }
8435 }
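// E.g. for two halves of size 4 and Offset == 4, a second-operand mask
// <1, 0, 3, 2> yields NewMask == <0, 1, 2, 3, 5, 4, 7, 6>, leaving the
// first half untouched.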
8436 reorderScalars(Scalars, NewMask);
8437 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8438 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8439 ReorderIndices.clear();
8440}
8441
8442 void BoUpSLP::reorderTopToBottom() {
8443 // Maps VF to the graph nodes.
8444 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8445 // ExtractElement gather nodes which can be vectorized and need to handle
8446 // their ordering.
8447 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8448
8449 // Phi nodes can have preferred ordering based on their result users
8450 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8451
8452 // AltShuffles can also have a preferred ordering that leads to fewer
8453 // instructions, e.g., the addsub instruction in x86.
8454 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8455
8456 // Maps a TreeEntry to the reorder indices of external users.
8457 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8458 ExternalUserReorderMap;
8459 // Find all reorderable nodes with the given VF.
8460 // Currently these are vectorized stores, loads, extracts + some gathering
8461 // of extracts.
8462 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8463 const std::unique_ptr<TreeEntry> &TE) {
8464 // Look for external users that will probably be vectorized.
8465 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8466 findExternalStoreUsersReorderIndices(TE.get());
8467 if (!ExternalUserReorderIndices.empty()) {
8468 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8469 ExternalUserReorderMap.try_emplace(TE.get(),
8470 std::move(ExternalUserReorderIndices));
8471 }
8472
8473 // Patterns like [fadd,fsub] can be combined into a single instruction in
8474 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8475 // to take into account their order when looking for the most used order.
8476 if (TE->hasState() && TE->isAltShuffle() &&
8477 TE->State != TreeEntry::SplitVectorize) {
8478 Type *ScalarTy = TE->Scalars[0]->getType();
8479 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8480 unsigned Opcode0 = TE->getOpcode();
8481 unsigned Opcode1 = TE->getAltOpcode();
8482 SmallBitVector OpcodeMask(
8483 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8484 // If this pattern is supported by the target then we consider the order.
8485 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8486 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8487 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8488 }
8489 // TODO: Check the reverse order too.
8490 }
8491
8492 bool IgnoreReorder =
8493 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8494 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8495 VectorizableTree.front()->getOpcode() == Instruction::Store);
8496 if (std::optional<OrdersType> CurrentOrder =
8497 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8498 // Do not include ordering for nodes used in the alt opcode vectorization,
8499 // better to reorder them during bottom-to-top stage. If follow the order
8500 // here, it causes reordering of the whole graph though actually it is
8501 // profitable just to reorder the subgraph that starts from the alternate
8502 // opcode vectorization node. Such nodes already end-up with the shuffle
8503 // instruction and it is just enough to change this shuffle rather than
8504 // rotate the scalars for the whole graph.
8505 unsigned Cnt = 0;
8506 const TreeEntry *UserTE = TE.get();
8507 while (UserTE && Cnt < RecursionMaxDepth) {
8508 if (!UserTE->UserTreeIndex)
8509 break;
8510 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8511 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8512 UserTE->UserTreeIndex.UserTE->Idx != 0)
8513 return;
8514 UserTE = UserTE->UserTreeIndex.UserTE;
8515 ++Cnt;
8516 }
8517 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8518 if (!(TE->State == TreeEntry::Vectorize ||
8519 TE->State == TreeEntry::StridedVectorize ||
8520 TE->State == TreeEntry::SplitVectorize ||
8521 TE->State == TreeEntry::CompressVectorize) ||
8522 !TE->ReuseShuffleIndices.empty())
8523 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8524 if (TE->State == TreeEntry::Vectorize &&
8525 TE->getOpcode() == Instruction::PHI)
8526 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8527 }
8528 });
8529
8530 // Reorder the graph nodes according to their vectorization factor.
8531 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8532 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8533 auto It = VFToOrderedEntries.find(VF);
8534 if (It == VFToOrderedEntries.end())
8535 continue;
8536 // Try to find the most profitable order. We are just looking for the most
8537 // used order and reorder the scalar elements in the nodes according to
8538 // this most used order.
8539 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8540 // Delete VF entry upon exit.
8541 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(It); });
8542
8543 // All operands are reordered and used only in this node - propagate the
8544 // most used order to the user node.
8545 MapVector<OrdersType, unsigned,
8546 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8547 OrdersUses;
8548 for (const TreeEntry *OpTE : OrderedEntries) {
8549 // No need to reorder these nodes, still need to extend and to use shuffle,
8550 // just need to merge the reordering shuffle and the reuse shuffle.
8551 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8552 OpTE->State != TreeEntry::SplitVectorize)
8553 continue;
8554 // Count number of orders uses.
8555 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8556 &PhisToOrders]() -> const OrdersType & {
8557 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8558 auto It = GathersToOrders.find(OpTE);
8559 if (It != GathersToOrders.end())
8560 return It->second;
8561 }
8562 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8563 auto It = AltShufflesToOrders.find(OpTE);
8564 if (It != AltShufflesToOrders.end())
8565 return It->second;
8566 }
8567 if (OpTE->State == TreeEntry::Vectorize &&
8568 OpTE->getOpcode() == Instruction::PHI) {
8569 auto It = PhisToOrders.find(OpTE);
8570 if (It != PhisToOrders.end())
8571 return It->second;
8572 }
8573 return OpTE->ReorderIndices;
8574 }();
8575 // First consider the order of the external scalar users.
8576 auto It = ExternalUserReorderMap.find(OpTE);
8577 if (It != ExternalUserReorderMap.end()) {
8578 const auto &ExternalUserReorderIndices = It->second;
8579 // If the OpTE vector factor != number of scalars - use natural order,
8580 // this is an attempt to reorder a node with reused scalars but with
8581 // external uses.
8582 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8583 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8584 ExternalUserReorderIndices.size();
8585 } else {
8586 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8587 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8588 }
8589 // No other useful reorder data in this entry.
8590 if (Order.empty())
8591 continue;
8592 }
8593 // Stores actually store the mask, not the order, need to invert.
8594 if (OpTE->State == TreeEntry::Vectorize &&
8595 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8596 assert(!OpTE->isAltShuffle() &&
8597 "Alternate instructions are only supported by BinaryOperator "
8598 "and CastInst.");
8599 SmallVector<int> Mask;
8600 inversePermutation(Order, Mask);
8601 unsigned E = Order.size();
8602 OrdersType CurrentOrder(E, E);
8603 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8604 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8605 });
8606 fixupOrderingIndices(CurrentOrder);
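// E.g. a store mask <2, 0, 1, 3> becomes the order <1, 2, 0, 3> here, so it
// can be counted together with the orders of the other nodes.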
8607 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8608 } else {
8609 ++OrdersUses.try_emplace(Order, 0).first->second;
8610 }
8611 }
8612 if (OrdersUses.empty())
8613 continue;
8614 // Choose the most used order.
8615 unsigned IdentityCnt = 0;
8616 unsigned FilledIdentityCnt = 0;
8617 OrdersType IdentityOrder(VF, VF);
8618 for (auto &Pair : OrdersUses) {
8619 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8620 if (!Pair.first.empty())
8621 FilledIdentityCnt += Pair.second;
8622 IdentityCnt += Pair.second;
8623 combineOrders(IdentityOrder, Pair.first);
8624 }
8625 }
8626 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8627 unsigned Cnt = IdentityCnt;
8628 for (auto &Pair : OrdersUses) {
8629 // Prefer the identity order. But if a filled identity (non-empty order)
8630 // was found with the same number of uses as the new candidate order, we
8631 // can choose this candidate order instead.
8632 if (Cnt < Pair.second ||
8633 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8634 Cnt == Pair.second && !BestOrder.empty() &&
8635 isIdentityOrder(BestOrder))) {
8636 combineOrders(Pair.first, BestOrder);
8637 BestOrder = Pair.first;
8638 Cnt = Pair.second;
8639 } else {
8640 combineOrders(BestOrder, Pair.first);
8641 }
8642 }
8643 // Set order of the user node.
8644 if (isIdentityOrder(BestOrder))
8645 continue;
8646 fixupOrderingIndices(BestOrder);
8647 SmallVector<int> Mask;
8648 inversePermutation(BestOrder, Mask);
8649 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8650 unsigned E = BestOrder.size();
8651 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8652 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8653 });
8654 // Do an actual reordering, if profitable.
8655 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8656 // Just do the reordering for the nodes with the given VF.
8657 if (TE->Scalars.size() != VF) {
8658 if (TE->ReuseShuffleIndices.size() == VF) {
8659 assert(TE->State != TreeEntry::SplitVectorize &&
8660 "Split vectorized not expected.");
8661 // Need to reorder the reuses masks of the operands with smaller VF to
8662 // be able to find the match between the graph nodes and scalar
8663 // operands of the given node during vectorization/cost estimation.
8664 assert(
8665 (!TE->UserTreeIndex ||
8666 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8667 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8668 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8669 "All users must be of VF size.");
8670 if (SLPReVec) {
8671 assert(SLPReVec && "Only supported by REVEC.");
8672 // ShuffleVectorInst does not do reorderOperands (and it should not
8673 // because ShuffleVectorInst supports only a limited set of
8674 // patterns). Only do reorderNodeWithReuses if the user is not
8675 // ShuffleVectorInst.
8676 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8677 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8678 continue;
8679 }
8680 // Update ordering of the operands with the smaller VF than the given
8681 // one.
8682 reorderNodeWithReuses(*TE, Mask);
8683 // Update orders in user split vectorize nodes.
8684 if (TE->UserTreeIndex &&
8685 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8686 TE->UserTreeIndex.UserTE->reorderSplitNode(
8687 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8688 }
8689 continue;
8690 }
8691 if ((TE->State == TreeEntry::SplitVectorize &&
8692 TE->ReuseShuffleIndices.empty()) ||
8693 ((TE->State == TreeEntry::Vectorize ||
8694 TE->State == TreeEntry::StridedVectorize ||
8695 TE->State == TreeEntry::CompressVectorize) &&
8697 InsertElementInst>(TE->getMainOp()) ||
8698 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8699 assert(
8700 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8701 TE->ReuseShuffleIndices.empty())) &&
8702 "Alternate instructions are only supported by BinaryOperator "
8703 "and CastInst.");
8704 // Build correct orders for extract{element,value}, loads,
8705 // stores and alternate (split) nodes.
8706 reorderOrder(TE->ReorderIndices, Mask);
8707 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8708 TE->reorderOperands(Mask);
8709 } else {
8710 // Reorder the node and its operands.
8711 TE->reorderOperands(Mask);
8712 assert(TE->ReorderIndices.empty() &&
8713 "Expected empty reorder sequence.");
8714 reorderScalars(TE->Scalars, Mask);
8715 }
8716 if (!TE->ReuseShuffleIndices.empty()) {
8717 // Apply reversed order to keep the original ordering of the reused
8718 // elements to avoid extra reorder indices shuffling.
8719 OrdersType CurrentOrder;
8720 reorderOrder(CurrentOrder, MaskOrder);
8721 SmallVector<int> NewReuses;
8722 inversePermutation(CurrentOrder, NewReuses);
8723 addMask(NewReuses, TE->ReuseShuffleIndices);
8724 TE->ReuseShuffleIndices.swap(NewReuses);
8725 } else if (TE->UserTreeIndex &&
8726 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8727 // Update orders in user split vectorize nodes.
8728 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8729 Mask, MaskOrder);
8730 }
8731 }
8732}
8733
8734void BoUpSLP::buildReorderableOperands(
8735 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8736 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8737 SmallVectorImpl<TreeEntry *> &GatherOps) {
8738 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8739 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8740 return OpData.first == I &&
8741 (OpData.second->State == TreeEntry::Vectorize ||
8742 OpData.second->State == TreeEntry::StridedVectorize ||
8743 OpData.second->State == TreeEntry::CompressVectorize ||
8744 OpData.second->State == TreeEntry::SplitVectorize);
8745 }))
8746 continue;
8747 // Do not request operands, if they do not exist.
8748 if (UserTE->hasState()) {
8749 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8750 UserTE->getOpcode() == Instruction::ExtractValue)
8751 continue;
8752 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8753 continue;
8754 if (UserTE->getOpcode() == Instruction::Store &&
8755 UserTE->State == TreeEntry::Vectorize && I == 1)
8756 continue;
8757 if (UserTE->getOpcode() == Instruction::Load &&
8758 (UserTE->State == TreeEntry::Vectorize ||
8759 UserTE->State == TreeEntry::StridedVectorize ||
8760 UserTE->State == TreeEntry::CompressVectorize))
8761 continue;
8762 }
8763 TreeEntry *TE = getOperandEntry(UserTE, I);
8764 assert(TE && "Expected operand entry.");
8765 if (!TE->isGather()) {
8766 // Add the node to the list of the ordered nodes with the identity
8767 // order.
8768 Edges.emplace_back(I, TE);
8769 // Add ScatterVectorize nodes to the list of operands, where just
8770 // reordering of the scalars is required. Similar to the gathers, so
8771 // simply add to the list of gathered ops.
8772 // If there are reused scalars, process this node as a regular vectorize
8773 // node, just reorder reuses mask.
8774 if (TE->State == TreeEntry::ScatterVectorize &&
8775 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8776 GatherOps.push_back(TE);
8777 continue;
8778 }
8779 if (ReorderableGathers.contains(TE))
8780 GatherOps.push_back(TE);
8781 }
8782}
8783
8784void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8785 struct TreeEntryCompare {
8786 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8787 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8788 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8789 return LHS->Idx < RHS->Idx;
8790 }
8791 };
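// Entries are popped with the largest user-node index first, so all operands
// of the same user node are processed together before moving to users
// created earlier in the graph.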
8792 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8793 DenseSet<const TreeEntry *> GathersToOrders;
8794 // Find all reorderable leaf nodes with the given VF.
8795 // Currently these are vectorized loads, extracts without alternate operands
8796 // + some gathering of extracts.
8797 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8798 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8799 if (TE->State != TreeEntry::Vectorize &&
8800 TE->State != TreeEntry::StridedVectorize &&
8801 TE->State != TreeEntry::CompressVectorize &&
8802 TE->State != TreeEntry::SplitVectorize)
8803 NonVectorized.insert(TE.get());
8804 if (std::optional<OrdersType> CurrentOrder =
8805 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8806 Queue.push(TE.get());
8807 if (!(TE->State == TreeEntry::Vectorize ||
8808 TE->State == TreeEntry::StridedVectorize ||
8809 TE->State == TreeEntry::CompressVectorize ||
8810 TE->State == TreeEntry::SplitVectorize) ||
8811 !TE->ReuseShuffleIndices.empty())
8812 GathersToOrders.insert(TE.get());
8813 }
8814 }
8815
8816 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8817 // I.e., if the node has operands, that are reordered, try to make at least
8818 // one operand order in the natural order and reorder others + reorder the
8819 // user node itself.
8820 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8821 while (!Queue.empty()) {
8822 // 1. Filter out only reordered nodes.
8823 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8824 TreeEntry *TE = Queue.top();
8825 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8826 Queue.pop();
8827 SmallVector<TreeEntry *> OrderedOps(1, TE);
8828 while (!Queue.empty()) {
8829 TE = Queue.top();
8830 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8831 break;
8832 Queue.pop();
8833 OrderedOps.push_back(TE);
8834 }
8835 for (TreeEntry *TE : OrderedOps) {
8836 if (!(TE->State == TreeEntry::Vectorize ||
8837 TE->State == TreeEntry::StridedVectorize ||
8838 TE->State == TreeEntry::CompressVectorize ||
8839 TE->State == TreeEntry::SplitVectorize ||
8840 (TE->isGather() && GathersToOrders.contains(TE))) ||
8841 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8842 !Visited.insert(TE).second)
8843 continue;
8844 // Build a map between user nodes and their operand order to speed up the
8845 // search. The graph currently does not provide this dependency directly.
8846 Users.first = TE->UserTreeIndex.UserTE;
8847 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8848 }
8849 if (Users.first) {
8850 auto &Data = Users;
8851 if (Data.first->State == TreeEntry::SplitVectorize) {
8852 assert(
8853 Data.second.size() <= 2 &&
8854 "Expected not greater than 2 operands for split vectorize node.");
8855 if (any_of(Data.second,
8856 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8857 continue;
8858 // Update orders in user split vectorize nodes.
8859 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8860 "Expected exactly 2 entries.");
8861 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8862 TreeEntry &OpTE = *VectorizableTree[P.first];
8863 OrdersType Order = OpTE.ReorderIndices;
8864 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8865 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8866 continue;
8867 const auto BestOrder =
8868 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8869 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8870 continue;
8871 Order = *BestOrder;
8872 }
8873 fixupOrderingIndices(Order);
8874 SmallVector<int> Mask;
8875 inversePermutation(Order, Mask);
8876 const unsigned E = Order.size();
8877 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8878 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8879 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8880 });
8881 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8882 // Clear ordering of the operand.
8883 if (!OpTE.ReorderIndices.empty()) {
8884 OpTE.ReorderIndices.clear();
8885 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8886 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8887 } else {
8888 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8889 reorderScalars(OpTE.Scalars, Mask);
8890 }
8891 }
8892 if (Data.first->ReuseShuffleIndices.empty() &&
8893 !Data.first->ReorderIndices.empty()) {
8894 // Insert user node to the list to try to sink reordering deeper in
8895 // the graph.
8896 Queue.push(Data.first);
8897 }
8898 continue;
8899 }
8900 // Check that operands are used only in the User node.
8901 SmallVector<TreeEntry *> GatherOps;
8902 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8903 GatherOps);
8904 // All operands are reordered and used only in this node - propagate the
8905 // most used order to the user node.
8906 MapVector<OrdersType, unsigned,
8907 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8908 OrdersUses;
8909 // Do the analysis for each tree entry only once, otherwise the order of
8910 // the same node may be considered several times, though it might not be
8911 // profitable.
8912 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8913 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8914 for (const auto &Op : Data.second) {
8915 TreeEntry *OpTE = Op.second;
8916 if (!VisitedOps.insert(OpTE).second)
8917 continue;
8918 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8919 continue;
8920 const auto Order = [&]() -> const OrdersType {
8921 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8922 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8923 IgnoreReorder)
8924 .value_or(OrdersType(1));
8925 return OpTE->ReorderIndices;
8926 }();
8927 // The order is partially ordered, skip it in favor of fully non-ordered
8928 // orders.
8929 if (Order.size() == 1)
8930 continue;
8931
8932 // Check that the reordering does not increase the number of shuffles, i.e.
8933 // same-values nodes have the same parents or their parents have the same parents.
8934 if (!Order.empty() && !isIdentityOrder(Order)) {
8935 Value *Root = OpTE->hasState()
8936 ? OpTE->getMainOp()
8937 : *find_if_not(OpTE->Scalars, isConstant);
8938 auto GetSameNodesUsers = [&](Value *Root) {
8939 SmallSetVector<TreeEntry *, 4> Res;
8940 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8941 if (TE != OpTE && TE->UserTreeIndex &&
8942 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8943 TE->Scalars.size() == OpTE->Scalars.size() &&
8944 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8945 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8946 Res.insert(TE->UserTreeIndex.UserTE);
8947 }
8948 for (const TreeEntry *TE : getTreeEntries(Root)) {
8949 if (TE != OpTE && TE->UserTreeIndex &&
8950 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8951 TE->Scalars.size() == OpTE->Scalars.size() &&
8952 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8953 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8954 Res.insert(TE->UserTreeIndex.UserTE);
8955 }
8956 return Res.takeVector();
8957 };
8958 auto GetNumOperands = [](const TreeEntry *TE) {
8959 if (TE->State == TreeEntry::SplitVectorize)
8960 return TE->getNumOperands();
8961 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8962 return CI->arg_size();
8963 return TE->getNumOperands();
8964 };
8965 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8966 const TreeEntry *TE) {
8967 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8968 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8969 ID = getVectorIntrinsicIDForCall(CI, TLI);
8970 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8971 if (ID != Intrinsic::not_intrinsic &&
8972 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8973 continue;
8974 const TreeEntry *Op = getOperandEntry(TE, Idx);
8975 if (Op->isGather() && Op->hasState()) {
8976 const TreeEntry *VecOp =
8977 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8978 if (VecOp)
8979 Op = VecOp;
8980 }
8981 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8982 return false;
8983 }
8984 return true;
8985 };
8986 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8987 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8988 if (!RevisitedOps.insert(UTE).second)
8989 return false;
8990 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8991 !UTE->ReuseShuffleIndices.empty() ||
8992 (UTE->UserTreeIndex &&
8993 UTE->UserTreeIndex.UserTE == Data.first) ||
8994 (Data.first->UserTreeIndex &&
8995 Data.first->UserTreeIndex.UserTE == UTE) ||
8996 (IgnoreReorder && UTE->UserTreeIndex &&
8997 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8998 NodeShouldBeReorderedWithOperands(UTE);
8999 }))
9000 continue;
9001 for (TreeEntry *UTE : Users) {
9002 Intrinsic::ID ID = Intrinsic::not_intrinsic;
9003 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
9004 ID = getVectorIntrinsicIDForCall(CI, TLI);
9005 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
9006 if (ID != Intrinsic::not_intrinsic &&
9007 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
9008 continue;
9009 const TreeEntry *Op = getOperandEntry(UTE, Idx);
9010 Visited.erase(Op);
9011 Queue.push(const_cast<TreeEntry *>(Op));
9012 }
9013 }
9014 }
9015 unsigned NumOps = count_if(
9016 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9017 return P.second == OpTE;
9018 });
9019 // Stores actually store the mask, not the order; we need to invert it.
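// Illustrative note (hypothetical values): if the stored mask is
// Order = {1, 2, 0}, inversePermutation() produces Mask = {2, 0, 1}
// (Mask[Order[I]] = I), and CurrentOrder below becomes {2, 0, 1} (the inverse
// permutation), which is what gets counted in OrdersUses.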
9020 if (OpTE->State == TreeEntry::Vectorize &&
9021 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9022 assert(!OpTE->isAltShuffle() &&
9023 "Alternate instructions are only supported by BinaryOperator "
9024 "and CastInst.");
9025 SmallVector<int> Mask;
9026 inversePermutation(Order, Mask);
9027 unsigned E = Order.size();
9028 OrdersType CurrentOrder(E, E);
9029 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9030 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9031 });
9032 fixupOrderingIndices(CurrentOrder);
9033 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9034 } else {
9035 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9036 }
9037 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9038 const auto AllowsReordering = [&](const TreeEntry *TE) {
9039 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9040 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9041 (IgnoreReorder && TE->Idx == 0))
9042 return true;
9043 if (TE->isGather()) {
9044 if (GathersToOrders.contains(TE))
9045 return !getReorderingData(*TE, /*TopToBottom=*/false,
9046 IgnoreReorder)
9047 .value_or(OrdersType(1))
9048 .empty();
9049 return true;
9050 }
9051 return false;
9052 };
9053 if (OpTE->UserTreeIndex) {
9054 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9055 if (!VisitedUsers.insert(UserTE).second)
9056 continue;
9057 // May reorder user node if it requires reordering, has reused
9058 // scalars, is an alternate op vectorize node or its op nodes require
9059 // reordering.
9060 if (AllowsReordering(UserTE))
9061 continue;
9062 // Check if users allow reordering.
9063 // Currently look up just 1 level of operands to avoid increase of
9064 // the compile time.
9065 // Profitable to reorder if definitely more operands allow
9066 // reordering rather than those with natural order.
9067 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
9068 if (static_cast<unsigned>(count_if(
9069 Ops, [UserTE, &AllowsReordering](
9070 const std::pair<unsigned, TreeEntry *> &Op) {
9071 return AllowsReordering(Op.second) &&
9072 Op.second->UserTreeIndex.UserTE == UserTE;
9073 })) <= Ops.size() / 2)
9074 ++Res.first->second;
9075 }
9076 }
9077 if (OrdersUses.empty()) {
9078 Visited.insert_range(llvm::make_second_range(Data.second));
9079 continue;
9080 }
9081 // Choose the most used order.
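// For example (hypothetical counts): if OrdersUses holds {identity: 3,
// {1, 0, 3, 2}: 2}, the identity order is not replaced below, since a
// candidate order takes over only when it is used strictly more often.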
9082 unsigned IdentityCnt = 0;
9083 unsigned VF = Data.second.front().second->getVectorFactor();
9084 OrdersType IdentityOrder(VF, VF);
9085 for (auto &Pair : OrdersUses) {
9086 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9087 IdentityCnt += Pair.second;
9088 combineOrders(IdentityOrder, Pair.first);
9089 }
9090 }
9091 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9092 unsigned Cnt = IdentityCnt;
9093 for (auto &Pair : OrdersUses) {
9094 // Prefer the identity order. But if a filled identity (non-empty order) is
9095 // found with the same number of uses as the new candidate order, we can
9096 // choose this candidate order.
9097 if (Cnt < Pair.second) {
9098 combineOrders(Pair.first, BestOrder);
9099 BestOrder = Pair.first;
9100 Cnt = Pair.second;
9101 } else {
9102 combineOrders(BestOrder, Pair.first);
9103 }
9104 }
9105 // Set order of the user node.
9106 if (isIdentityOrder(BestOrder)) {
9107 Visited.insert_range(llvm::make_second_range(Data.second));
9108 continue;
9109 }
9110 fixupOrderingIndices(BestOrder);
9111 // Erase operands from OrderedEntries list and adjust their orders.
9112 VisitedOps.clear();
9113 SmallVector<int> Mask;
9114 inversePermutation(BestOrder, Mask);
9115 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9116 unsigned E = BestOrder.size();
9117 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9118 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9119 });
9120 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9121 TreeEntry *TE = Op.second;
9122 if (!VisitedOps.insert(TE).second)
9123 continue;
9124 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9125 reorderNodeWithReuses(*TE, Mask);
9126 continue;
9127 }
9128 // Gathers are processed separately.
9129 if (TE->State != TreeEntry::Vectorize &&
9130 TE->State != TreeEntry::StridedVectorize &&
9131 TE->State != TreeEntry::CompressVectorize &&
9132 TE->State != TreeEntry::SplitVectorize &&
9133 (TE->State != TreeEntry::ScatterVectorize ||
9134 TE->ReorderIndices.empty()))
9135 continue;
9136 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9137 TE->ReorderIndices.empty()) &&
9138 "Non-matching sizes of user/operand entries.");
9139 reorderOrder(TE->ReorderIndices, Mask);
9140 if (IgnoreReorder && TE == VectorizableTree.front().get())
9141 IgnoreReorder = false;
9142 }
9143 // For gathers, we just need to reorder their scalars.
9144 for (TreeEntry *Gather : GatherOps) {
9145 assert(Gather->ReorderIndices.empty() &&
9146 "Unexpected reordering of gathers.");
9147 if (!Gather->ReuseShuffleIndices.empty()) {
9148 // Just reorder reuses indices.
9149 reorderReuses(Gather->ReuseShuffleIndices, Mask);
9150 continue;
9151 }
9152 reorderScalars(Gather->Scalars, Mask);
9153 Visited.insert(Gather);
9154 }
9155 // Reorder operands of the user node and set the ordering for the user
9156 // node itself.
9157 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9158 return TE.isAltShuffle() &&
9159 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9160 TE.ReorderIndices.empty());
9161 };
9162 if (Data.first->State != TreeEntry::Vectorize ||
9163 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
9164 Data.first->getMainOp()) ||
9165 IsNotProfitableAltCodeNode(*Data.first))
9166 Data.first->reorderOperands(Mask);
9167 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
9168 IsNotProfitableAltCodeNode(*Data.first) ||
9169 Data.first->State == TreeEntry::StridedVectorize ||
9170 Data.first->State == TreeEntry::CompressVectorize) {
9171 reorderScalars(Data.first->Scalars, Mask);
9172 reorderOrder(Data.first->ReorderIndices, MaskOrder,
9173 /*BottomOrder=*/true);
9174 if (Data.first->ReuseShuffleIndices.empty() &&
9175 !Data.first->ReorderIndices.empty() &&
9176 !IsNotProfitableAltCodeNode(*Data.first)) {
9177 // Insert user node to the list to try to sink reordering deeper in
9178 // the graph.
9179 Queue.push(Data.first);
9180 }
9181 } else {
9182 reorderOrder(Data.first->ReorderIndices, Mask);
9183 }
9184 }
9185 }
9186 // If the reordering is unnecessary, just remove the reorder.
9187 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9188 VectorizableTree.front()->ReuseShuffleIndices.empty())
9189 VectorizableTree.front()->ReorderIndices.clear();
9190}
9191
9192Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9193 if (Entry.hasState() &&
9194 (Entry.getOpcode() == Instruction::Store ||
9195 Entry.getOpcode() == Instruction::Load) &&
9196 Entry.State == TreeEntry::StridedVectorize &&
9197 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9198 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
9199 return dyn_cast<Instruction>(Entry.Scalars.front());
9200}
9201
9202 void BoUpSLP::buildExternalUses(
9203 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
9204 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9205 DenseMap<Value *, unsigned> ScalarToExtUses;
9206 // Collect the values that we need to extract from the tree.
9207 for (auto &TEPtr : VectorizableTree) {
9208 TreeEntry *Entry = TEPtr.get();
9209
9210 // No need to handle users of gathered values.
9211 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9212 DeletedNodes.contains(Entry) ||
9213 TransformedToGatherNodes.contains(Entry))
9214 continue;
9215
9216 // For each lane:
9217 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9218 Value *Scalar = Entry->Scalars[Lane];
9219 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
9220 continue;
9221
9222 // All uses must be replaced already? No need to do it again.
9223 auto It = ScalarToExtUses.find(Scalar);
9224 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9225 continue;
9226
9227 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9228 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9229 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9230 << " from " << *Scalar << "for many users.\n");
9231 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9232 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9233 ExternalUsesWithNonUsers.insert(Scalar);
9234 continue;
9235 }
9236
9237 // Check if the scalar is externally used as an extra arg.
9238 const auto ExtI = ExternallyUsedValues.find(Scalar);
9239 if (ExtI != ExternallyUsedValues.end()) {
9240 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9241 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9242 << FoundLane << " from " << *Scalar << ".\n");
9243 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9244 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9245 continue;
9246 }
9247 for (User *U : Scalar->users()) {
9248 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9249
9250 Instruction *UserInst = dyn_cast<Instruction>(U);
9251 if (!UserInst || isDeleted(UserInst))
9252 continue;
9253
9254 // Ignore users in the user ignore list.
9255 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9256 continue;
9257
9258 // Skip in-tree scalars that become vectors
9259 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
9260 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9261 return !DeletedNodes.contains(UseEntry) &&
9262 !TransformedToGatherNodes.contains(UseEntry);
9263 })) {
9264 // Some in-tree scalars will remain as scalar in vectorized
9265 // instructions. If that is the case, the one in FoundLane will
9266 // be used.
9267 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9268 isa<LoadInst, StoreInst>(UserInst)) ||
9269 isa<CallInst>(UserInst)) ||
9270 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9271 if (DeletedNodes.contains(UseEntry) ||
9272 TransformedToGatherNodes.contains(UseEntry))
9273 return true;
9274 return UseEntry->State == TreeEntry::ScatterVectorize ||
9275 !doesInTreeUserNeedToExtract(
9276 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9277 TTI);
9278 })) {
9279 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9280 << ".\n");
9281 assert(none_of(UseEntries,
9282 [](TreeEntry *UseEntry) {
9283 return UseEntry->isGather();
9284 }) &&
9285 "Bad state");
9286 continue;
9287 }
9288 U = nullptr;
9289 if (It != ScalarToExtUses.end()) {
9290 ExternalUses[It->second].User = nullptr;
9291 break;
9292 }
9293 }
9294
9295 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9296 U = nullptr;
9297 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9298 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9299 << " from lane " << FoundLane << " from " << *Scalar
9300 << ".\n");
9301 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9302 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9303 ExternalUsesWithNonUsers.insert(Scalar);
9304 if (!U)
9305 break;
9306 }
9307 }
9308 }
9309}
9310
9311 SmallVector<SmallVector<StoreInst *>>
9312 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9313 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
9314 SmallVector<StoreInst *>, 8>
9315 PtrToStoresMap;
9316 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9317 Value *V = TE->Scalars[Lane];
9318 // Don't iterate over the users of constant data.
9319 if (!isa<Instruction>(V))
9320 continue;
9321 // To save compilation time we don't visit if we have too many users.
9322 if (V->hasNUsesOrMore(UsesLimit))
9323 break;
9324
9325 // Collect stores per pointer object.
9326 for (User *U : V->users()) {
9327 auto *SI = dyn_cast<StoreInst>(U);
9328 // Test whether we can handle the store. V might be a global, which could
9329 // be used in a different function.
9330 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9331 !isValidElementType(SI->getValueOperand()->getType()))
9332 continue;
9333 // Skip entry if already vectorized.
9334 if (isVectorized(U))
9335 continue;
9336
9337 Value *Ptr =
9338 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9339 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9340 SI->getValueOperand()->getType(), Ptr}];
9341 // For now just keep one store per pointer object per lane.
9342 // TODO: Extend this to support multiple stores per pointer per lane
9343 if (StoresVec.size() > Lane)
9344 continue;
9345 if (!StoresVec.empty()) {
9346 std::optional<int64_t> Diff = getPointersDiff(
9347 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9348 SI->getValueOperand()->getType(),
9349 StoresVec.front()->getPointerOperand(), *DL, *SE,
9350 /*StrictCheck=*/true);
9351 // We failed to compare the pointers so just abandon this store.
9352 if (!Diff)
9353 continue;
9354 }
9355 StoresVec.push_back(SI);
9356 }
9357 }
9358 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9359 unsigned I = 0;
9360 for (auto &P : PtrToStoresMap) {
9361 Res[I].swap(P.second);
9362 ++I;
9363 }
9364 return Res;
9365}
9366
9367bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9368 OrdersType &ReorderIndices) const {
9369 // We check whether the stores in StoreVec can form a vector by sorting them
9370 // and checking whether they are consecutive.
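// Worked example (hypothetical stores): if the offsets of the stores from
// StoresVec[0], in elements, are {0, 2, 1, 3}, sorting by offset gives the
// consecutive sequence 0, 1, 2, 3 and the code below produces
// ReorderIndices = {0, 2, 1, 3}, mapping each store to its position in the
// sorted order.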
9371
9372 // To avoid calling getPointersDiff() while sorting we create a vector of
9373 // pairs {store, offset from first} and sort this instead.
9374 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9375 StoreInst *S0 = StoresVec[0];
9376 StoreOffsetVec.emplace_back(0, 0);
9377 Type *S0Ty = S0->getValueOperand()->getType();
9378 Value *S0Ptr = S0->getPointerOperand();
9379 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9380 StoreInst *SI = StoresVec[Idx];
9381 std::optional<int64_t> Diff =
9382 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9383 SI->getPointerOperand(), *DL, *SE,
9384 /*StrictCheck=*/true);
9385 StoreOffsetVec.emplace_back(*Diff, Idx);
9386 }
9387
9388 // Check if the stores are consecutive by checking if their difference is 1.
9389 if (StoreOffsetVec.size() != StoresVec.size())
9390 return false;
9391 sort(StoreOffsetVec, llvm::less_first());
9392 unsigned Idx = 0;
9393 int64_t PrevDist = 0;
9394 for (const auto &P : StoreOffsetVec) {
9395 if (Idx > 0 && P.first != PrevDist + 1)
9396 return false;
9397 PrevDist = P.first;
9398 ++Idx;
9399 }
9400
9401 // Calculate the shuffle indices according to their offset against the sorted
9402 // StoreOffsetVec.
9403 ReorderIndices.assign(StoresVec.size(), 0);
9404 bool IsIdentity = true;
9405 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9406 ReorderIndices[P.second] = I;
9407 IsIdentity &= P.second == I;
9408 }
9409 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9410 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9411 // same convention here.
9412 if (IsIdentity)
9413 ReorderIndices.clear();
9414
9415 return true;
9416}
9417
9418#ifndef NDEBUG
9419 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9420 for (unsigned Idx : Order)
9421 dbgs() << Idx << ", ";
9422 dbgs() << "\n";
9423}
9424#endif
9425
9426 SmallVector<BoUpSLP::OrdersType, 1>
9427 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9428 unsigned NumLanes = TE->Scalars.size();
9429
9430 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9431
9432 // Holds the reorder indices for each candidate store vector that is a user of
9433 // the current TreeEntry.
9434 SmallVector<OrdersType, 1> ExternalReorderIndices;
9435
9436 // Now inspect the stores collected per pointer and look for vectorization
9437 // candidates. For each candidate calculate the reorder index vector and push
9438 // it into `ExternalReorderIndices`
9439 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9440 // If we have fewer than NumLanes stores, then we can't form a vector.
9441 if (StoresVec.size() != NumLanes)
9442 continue;
9443
9444 // If the stores are not consecutive then abandon this StoresVec.
9445 OrdersType ReorderIndices;
9446 if (!canFormVector(StoresVec, ReorderIndices))
9447 continue;
9448
9449 // We now know that the scalars in StoresVec can form a vector instruction,
9450 // so set the reorder indices.
9451 ExternalReorderIndices.push_back(ReorderIndices);
9452 }
9453 return ExternalReorderIndices;
9454}
9455
9456 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9457 const SmallDenseSet<Value *> &UserIgnoreLst) {
9458 deleteTree();
9459 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9460 "TreeEntryToStridedPtrInfoMap is not cleared");
9461 UserIgnoreList = &UserIgnoreLst;
9462 if (!allSameType(Roots))
9463 return;
9464 buildTreeRec(Roots, 0, EdgeInfo());
9465}
9466
9467 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9468 deleteTree();
9469 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9470 "TreeEntryToStridedPtrInfoMap is not cleared");
9471 if (!allSameType(Roots))
9472 return;
9473 buildTreeRec(Roots, 0, EdgeInfo());
9474}
9475
9476 /// Tries to find a subvector of loads and builds a new vector of only loads if
9477 /// it can be profitable.
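/// As a rough illustration (hypothetical loads): for VL = {load a[0],
/// load a[3], load a[1], load b[0]} the loads are clustered per parent block,
/// type and underlying object, giving {a[0]:0, a[3]:3, a[1]:1} and {b[0]:0},
/// where the numbers are element distances from the first load of the cluster.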
9478 static void gatherPossiblyVectorizableLoads(
9479 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9480 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9481 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9482 bool AddNew = true) {
9483 if (VL.empty())
9484 return;
9485 Type *ScalarTy = getValueType(VL.front());
9486 if (!isValidElementType(ScalarTy))
9487 return;
9488 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9489 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9490 for (Value *V : VL) {
9491 auto *LI = dyn_cast<LoadInst>(V);
9492 if (!LI)
9493 continue;
9494 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9495 continue;
9496 bool IsFound = false;
9497 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9498 assert(LI->getParent() == Data.front().first->getParent() &&
9499 LI->getType() == Data.front().first->getType() &&
9500 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9501 getUnderlyingObject(Data.front().first->getPointerOperand(),
9503 "Expected loads with the same type, same parent and same "
9504 "underlying pointer.");
9505 std::optional<int64_t> Dist = getPointersDiff(
9506 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9507 Data.front().first->getPointerOperand(), DL, SE,
9508 /*StrictCheck=*/true);
9509 if (!Dist)
9510 continue;
9511 auto It = Map.find(*Dist);
9512 if (It != Map.end() && It->second != LI)
9513 continue;
9514 if (It == Map.end()) {
9515 Data.emplace_back(LI, *Dist);
9516 Map.try_emplace(*Dist, LI);
9517 }
9518 IsFound = true;
9519 break;
9520 }
9521 if (!IsFound) {
9522 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9523 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9524 }
9525 }
9526 auto FindMatchingLoads =
9527 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9528 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9529 &GatheredLoads,
9530 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9531 int64_t &Offset, unsigned &Start) {
9532 if (Loads.empty())
9533 return GatheredLoads.end();
9534 LoadInst *LI = Loads.front().first;
9535 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9536 if (Idx < Start)
9537 continue;
9538 ToAdd.clear();
9539 if (LI->getParent() != Data.front().first->getParent() ||
9540 LI->getType() != Data.front().first->getType())
9541 continue;
9542 std::optional<int64_t> Dist =
9543 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9544 Data.front().first->getType(),
9545 Data.front().first->getPointerOperand(), DL, SE,
9546 /*StrictCheck=*/true);
9547 if (!Dist)
9548 continue;
9549 SmallSet<int64_t, 4> DataDists;
9550 SmallPtrSet<LoadInst *, 4> DataLoads;
9551 for (std::pair<LoadInst *, int64_t> P : Data) {
9552 DataDists.insert(P.second);
9553 DataLoads.insert(P.first);
9554 }
9555 // Found matching gathered loads - check if all loads are unique or
9556 // can be effectively vectorized.
9557 unsigned NumUniques = 0;
9558 for (auto [Cnt, Pair] : enumerate(Loads)) {
9559 bool Used = DataLoads.contains(Pair.first);
9560 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9561 ++NumUniques;
9562 ToAdd.insert(Cnt);
9563 } else if (Used) {
9564 Repeated.insert(Cnt);
9565 }
9566 }
9567 if (NumUniques > 0 &&
9568 (Loads.size() == NumUniques ||
9569 (Loads.size() - NumUniques >= 2 &&
9570 Loads.size() - NumUniques >= Loads.size() / 2 &&
9571 (has_single_bit(Data.size() + NumUniques) ||
9572 bit_ceil(Data.size()) <
9573 bit_ceil(Data.size() + NumUniques))))) {
9574 Offset = *Dist;
9575 Start = Idx + 1;
9576 return std::next(GatheredLoads.begin(), Idx);
9577 }
9578 }
9579 ToAdd.clear();
9580 return GatheredLoads.end();
9581 };
9582 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9583 unsigned Start = 0;
9584 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9585 int64_t Offset = 0;
9586 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9587 Offset, Start);
9588 while (It != GatheredLoads.end()) {
9589 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9590 for (unsigned Idx : LocalToAdd)
9591 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9592 ToAdd.insert_range(LocalToAdd);
9593 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9594 Start);
9595 }
9596 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9597 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9598 })) {
9599 auto AddNewLoads =
9600 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9601 for (unsigned Idx : seq<unsigned>(Data.size())) {
9602 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9603 continue;
9604 Loads.push_back(Data[Idx]);
9605 }
9606 };
9607 if (!AddNew) {
9608 LoadInst *LI = Data.front().first;
9609 It = find_if(
9610 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9611 return PD.front().first->getParent() == LI->getParent() &&
9612 PD.front().first->getType() == LI->getType();
9613 });
9614 while (It != GatheredLoads.end()) {
9615 AddNewLoads(*It);
9616 It = std::find_if(
9617 std::next(It), GatheredLoads.end(),
9618 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9619 return PD.front().first->getParent() == LI->getParent() &&
9620 PD.front().first->getType() == LI->getType();
9621 });
9622 }
9623 }
9624 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9625 AddNewLoads(GatheredLoads.emplace_back());
9626 }
9627 }
9628}
9629
9630void BoUpSLP::tryToVectorizeGatheredLoads(
9631 const SmallMapVector<
9632 std::tuple<BasicBlock *, Value *, Type *>,
9633 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9634 &GatheredLoads) {
9635 GatheredLoadsEntriesFirst = VectorizableTree.size();
9636
9637 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9638 LoadEntriesToVectorize.size());
9639 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9640 Set.insert_range(VectorizableTree[Idx]->Scalars);
9641
9642 // Sort loads by distance.
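// Note: LoadSorter below sorts by decreasing distance, so the scan further
// down sees the largest distance first and detects consecutive runs via
// LastDist - L.second (e.g. sorted distances 5, 4, 3 yield differences 1 and
// then 2 from the front element, matching CurrentConsecutiveDist and forming
// one consecutive chain).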
9643 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9644 const std::pair<LoadInst *, int64_t> &L2) {
9645 return L1.second > L2.second;
9646 };
9647
9648 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9649 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9650 Loads.size());
9651 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9652 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9653 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9654 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9655 };
9656
9657 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9658 BoUpSLP::ValueSet &VectorizedLoads,
9659 SmallVectorImpl<LoadInst *> &NonVectorized,
9660 bool Final, unsigned MaxVF) {
9661 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9662 unsigned StartIdx = 0;
9663 SmallVector<int> CandidateVFs;
9664 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9665 CandidateVFs.push_back(MaxVF);
9666 for (int NumElts = getFloorFullVectorNumberOfElements(
9667 *TTI, Loads.front()->getType(), MaxVF);
9668 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9669 *TTI, Loads.front()->getType(), NumElts - 1)) {
9670 CandidateVFs.push_back(NumElts);
9671 if (VectorizeNonPowerOf2 && NumElts > 2)
9672 CandidateVFs.push_back(NumElts - 1);
9673 }
9674
9675 if (Final && CandidateVFs.empty())
9676 return Results;
9677
9678 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9679 for (unsigned NumElts : CandidateVFs) {
9680 if (Final && NumElts > BestVF)
9681 continue;
9682 SmallVector<unsigned> MaskedGatherVectorized;
9683 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9684 ++Cnt) {
9685 ArrayRef<LoadInst *> Slice =
9686 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9687 if (VectorizedLoads.count(Slice.front()) ||
9688 VectorizedLoads.count(Slice.back()) ||
9689 areKnownNonVectorizableLoads(Slice))
9690 continue;
9691 // Check if it is profitable to try vectorizing gathered loads. It is
9692 // profitable if we have more than 3 consecutive loads or if we have
9693 // less but all users are vectorized or deleted.
9694 bool AllowToVectorize = false;
9695 // Check if it is profitable to vectorize 2-element loads.
9696 if (NumElts == 2) {
9697 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9698 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9699 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9700 for (LoadInst *LI : Slice) {
9701 // If single use/user - allow to vectorize.
9702 if (LI->hasOneUse())
9703 continue;
9704 // 1. Check if number of uses equals number of users.
9705 // 2. All users are deleted.
9706 // 3. The load broadcasts are not allowed or the load is not
9707 // broadcasted.
9708 if (static_cast<unsigned int>(std::distance(
9709 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9710 return false;
9711 if (!IsLegalBroadcastLoad)
9712 continue;
9713 if (LI->hasNUsesOrMore(UsesLimit))
9714 return false;
9715 for (User *U : LI->users()) {
9716 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9717 continue;
9718 for (const TreeEntry *UTE : getTreeEntries(U)) {
9719 for (int I : seq<int>(UTE->getNumOperands())) {
9720 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9721 return V == LI || isa<PoisonValue>(V);
9722 }))
9723 // Found legal broadcast - do not vectorize.
9724 return false;
9725 }
9726 }
9727 }
9728 }
9729 return true;
9730 };
9731 AllowToVectorize = CheckIfAllowed(Slice);
9732 } else {
9733 AllowToVectorize =
9734 (NumElts >= 3 ||
9735 any_of(ValueToGatherNodes.at(Slice.front()),
9736 [=](const TreeEntry *TE) {
9737 return TE->Scalars.size() == 2 &&
9738 ((TE->Scalars.front() == Slice.front() &&
9739 TE->Scalars.back() == Slice.back()) ||
9740 (TE->Scalars.front() == Slice.back() &&
9741 TE->Scalars.back() == Slice.front()));
9742 })) &&
9743 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9744 Slice.size());
9745 }
9746 if (AllowToVectorize) {
9747 SmallVector<Value *> PointerOps;
9748 OrdersType CurrentOrder;
9749 // Try to build vector load.
9750 ArrayRef<Value *> Values(
9751 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9752 StridedPtrInfo SPtrInfo;
9753 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9754 PointerOps, SPtrInfo, &BestVF);
9755 if (LS != LoadsState::Gather ||
9756 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9757 if (LS == LoadsState::ScatterVectorize) {
9758 if (MaskedGatherVectorized.empty() ||
9759 Cnt >= MaskedGatherVectorized.back() + NumElts)
9760 MaskedGatherVectorized.push_back(Cnt);
9761 continue;
9762 }
9763 if (LS != LoadsState::Gather) {
9764 Results.emplace_back(Values, LS);
9765 VectorizedLoads.insert_range(Slice);
9766 // If we vectorized initial block, no need to try to vectorize it
9767 // again.
9768 if (Cnt == StartIdx)
9769 StartIdx += NumElts;
9770 }
9771 // Check if the whole array was vectorized already - exit.
9772 if (StartIdx >= Loads.size())
9773 break;
9774 // Erase last masked gather candidate, if another candidate within
9775 // the range is found to be better.
9776 if (!MaskedGatherVectorized.empty() &&
9777 Cnt < MaskedGatherVectorized.back() + NumElts)
9778 MaskedGatherVectorized.pop_back();
9779 Cnt += NumElts - 1;
9780 continue;
9781 }
9782 }
9783 if (!AllowToVectorize || BestVF == 0)
9784 registerNonVectorizableLoads(Slice);
9785 }
9786 // Mark masked gathers candidates as vectorized, if any.
9787 for (unsigned Cnt : MaskedGatherVectorized) {
9788 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9789 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9790 ArrayRef<Value *> Values(
9791 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9792 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9793 VectorizedLoads.insert_range(Slice);
9794 // If we vectorized initial block, no need to try to vectorize it again.
9795 if (Cnt == StartIdx)
9796 StartIdx += NumElts;
9797 }
9798 }
9799 for (LoadInst *LI : Loads) {
9800 if (!VectorizedLoads.contains(LI))
9801 NonVectorized.push_back(LI);
9802 }
9803 return Results;
9804 };
9805 auto ProcessGatheredLoads =
9806 [&, &TTI = *TTI](
9807 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9808 bool Final = false) {
9809 SmallVector<LoadInst *> NonVectorized;
9810 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9811 GatheredLoads) {
9812 if (LoadsDists.size() <= 1) {
9813 NonVectorized.push_back(LoadsDists.back().first);
9814 continue;
9815 }
9816 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9817 LoadsDists);
9818 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9819 stable_sort(LocalLoadsDists, LoadSorter);
9820 SmallVector<LoadInst *> Loads;
9821 unsigned MaxConsecutiveDistance = 0;
9822 unsigned CurrentConsecutiveDist = 1;
9823 int64_t LastDist = LocalLoadsDists.front().second;
9824 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9825 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9826 if (isVectorized(L.first))
9827 continue;
9828 assert(LastDist >= L.second &&
9829 "Expected first distance always not less than second");
9830 if (static_cast<uint64_t>(LastDist - L.second) ==
9831 CurrentConsecutiveDist) {
9832 ++CurrentConsecutiveDist;
9833 MaxConsecutiveDistance =
9834 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9835 Loads.push_back(L.first);
9836 continue;
9837 }
9838 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9839 !Loads.empty())
9840 Loads.pop_back();
9841 CurrentConsecutiveDist = 1;
9842 LastDist = L.second;
9843 Loads.push_back(L.first);
9844 }
9845 if (Loads.size() <= 1)
9846 continue;
9847 if (AllowMaskedGather)
9848 MaxConsecutiveDistance = Loads.size();
9849 else if (MaxConsecutiveDistance < 2)
9850 continue;
9851 BoUpSLP::ValueSet VectorizedLoads;
9852 SmallVector<LoadInst *> SortedNonVectorized;
9853 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9854 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9855 Final, MaxConsecutiveDistance);
9856 if (!Results.empty() && !SortedNonVectorized.empty() &&
9857 OriginalLoads.size() == Loads.size() &&
9858 MaxConsecutiveDistance == Loads.size() &&
9859 any_of(Results,
9860 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9861 return P.second == LoadsState::ScatterVectorize;
9862 })) {
9863 VectorizedLoads.clear();
9864 SmallVector<LoadInst *> UnsortedNonVectorized;
9865 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9866 UnsortedResults =
9867 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9868 UnsortedNonVectorized, Final,
9869 OriginalLoads.size());
9870 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9871 SortedNonVectorized.swap(UnsortedNonVectorized);
9872 Results.swap(UnsortedResults);
9873 }
9874 }
9875 for (auto [Slice, _] : Results) {
9876 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9877 << Slice.size() << ")\n");
9878 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9879 for (Value *L : Slice)
9880 if (!isVectorized(L))
9881 SortedNonVectorized.push_back(cast<LoadInst>(L));
9882 continue;
9883 }
9884
9885 // Select the maximum VF as the maximum over the user gathered nodes and the
9886 // distance between scalar loads in these nodes.
9887 unsigned MaxVF = Slice.size();
9888 unsigned UserMaxVF = 0;
9889 unsigned InterleaveFactor = 0;
9890 if (MaxVF == 2) {
9891 UserMaxVF = MaxVF;
9892 } else {
9893 // Found distance between segments of the interleaved loads.
9894 std::optional<unsigned> InterleavedLoadsDistance = 0;
9895 unsigned Order = 0;
9896 std::optional<unsigned> CommonVF = 0;
9897 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9898 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9899 for (auto [Idx, V] : enumerate(Slice)) {
9900 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9901 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9902 unsigned Pos =
9903 EntryToPosition.try_emplace(E, Idx).first->second;
9904 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9905 if (CommonVF) {
9906 if (*CommonVF == 0) {
9907 CommonVF = E->Scalars.size();
9908 continue;
9909 }
9910 if (*CommonVF != E->Scalars.size())
9911 CommonVF.reset();
9912 }
9913 // Check if the load is the part of the interleaved load.
9914 if (Pos != Idx && InterleavedLoadsDistance) {
9915 if (!DeinterleavedNodes.contains(E) &&
9916 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9917 if (isa<Constant>(V))
9918 return false;
9919 if (isVectorized(V))
9920 return true;
9921 const auto &Nodes = ValueToGatherNodes.at(V);
9922 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9923 !is_contained(Slice, V);
9924 })) {
9925 InterleavedLoadsDistance.reset();
9926 continue;
9927 }
9928 DeinterleavedNodes.insert(E);
9929 if (*InterleavedLoadsDistance == 0) {
9930 InterleavedLoadsDistance = Idx - Pos;
9931 continue;
9932 }
9933 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9934 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9935 InterleavedLoadsDistance.reset();
9936 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9937 }
9938 }
9939 }
9940 DeinterleavedNodes.clear();
9941 // Check if the large load represents interleaved load operation.
9942 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9943 CommonVF.value_or(0) != 0) {
9944 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9945 unsigned VF = *CommonVF;
9946 OrdersType Order;
9947 SmallVector<Value *> PointerOps;
9948 StridedPtrInfo SPtrInfo;
9949 // Segmented load detected - vectorize at maximum vector factor.
9950 if (InterleaveFactor <= Slice.size() &&
9951 TTI.isLegalInterleavedAccessType(
9952 getWidenedType(Slice.front()->getType(), VF),
9953 InterleaveFactor,
9954 cast<LoadInst>(Slice.front())->getAlign(),
9955 cast<LoadInst>(Slice.front())
9956 ->getPointerAddressSpace()) &&
9957 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9958 SPtrInfo) == LoadsState::Vectorize) {
9959 UserMaxVF = InterleaveFactor * VF;
9960 } else {
9961 InterleaveFactor = 0;
9962 }
9963 }
9964 // Cannot represent the loads as consecutive vectorizable nodes -
9965 // just exit.
9966 unsigned ConsecutiveNodesSize = 0;
9967 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9968 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9969 [&, Slice = Slice](const auto &P) {
9970 const auto *It = find_if(Slice, [&](Value *V) {
9971 return std::get<1>(P).contains(V);
9972 });
9973 if (It == Slice.end())
9974 return false;
9975 const TreeEntry &TE =
9976 *VectorizableTree[std::get<0>(P)];
9977 ArrayRef<Value *> VL = TE.Scalars;
9978 OrdersType Order;
9979 SmallVector<Value *> PointerOps;
9980 StridedPtrInfo SPtrInfo;
9981 LoadsState State = canVectorizeLoads(
9982 VL, VL.front(), Order, PointerOps, SPtrInfo);
9983 if (State == LoadsState::ScatterVectorize ||
9985 return false;
9986 ConsecutiveNodesSize += VL.size();
9987 size_t Start = std::distance(Slice.begin(), It);
9988 size_t Sz = Slice.size() - Start;
9989 return Sz < VL.size() ||
9990 Slice.slice(Start, VL.size()) != VL;
9991 }))
9992 continue;
9993 // Try to build long masked gather loads.
9994 UserMaxVF = bit_ceil(UserMaxVF);
9995 if (InterleaveFactor == 0 &&
9996 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9997 [&, Slice = Slice](unsigned Idx) {
9998 OrdersType Order;
9999 SmallVector<Value *> PointerOps;
10000 StridedPtrInfo SPtrInfo;
10001 return canVectorizeLoads(
10002 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10003 Slice[Idx * UserMaxVF], Order, PointerOps,
10004 SPtrInfo) == LoadsState::ScatterVectorize;
10005 }))
10006 UserMaxVF = MaxVF;
10007 if (Slice.size() != ConsecutiveNodesSize)
10008 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10009 }
10010 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10011 bool IsVectorized = true;
10012 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10013 ArrayRef<Value *> SubSlice =
10014 Slice.slice(I, std::min(VF, E - I));
10015 if (isVectorized(SubSlice.front()))
10016 continue;
10017 // Check if the subslice is a to-be-vectorized entry, which is not
10018 // equal to this entry.
10019 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10020 [&](const auto &P) {
10021 return !SubSlice.equals(
10022 VectorizableTree[std::get<0>(P)]
10023 ->Scalars) &&
10024 set_is_subset(SubSlice, std::get<1>(P));
10025 }))
10026 continue;
10027 unsigned Sz = VectorizableTree.size();
10028 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10029 if (Sz == VectorizableTree.size()) {
10030 IsVectorized = false;
10031 // Try non-interleaved vectorization with smaller vector
10032 // factor.
10033 if (InterleaveFactor > 0) {
10034 VF = 2 * (MaxVF / InterleaveFactor);
10035 InterleaveFactor = 0;
10036 }
10037 continue;
10038 }
10039 }
10040 if (IsVectorized)
10041 break;
10042 }
10043 }
10044 NonVectorized.append(SortedNonVectorized);
10045 }
10046 return NonVectorized;
10047 };
10048 for (const auto &GLs : GatheredLoads) {
10049 const auto &Ref = GLs.second;
10050 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10051 if (!Ref.empty() && !NonVectorized.empty() &&
10052 std::accumulate(
10053 Ref.begin(), Ref.end(), 0u,
10054 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10055 -> unsigned { return S + LoadsDists.size(); }) !=
10056 NonVectorized.size() &&
10057 IsMaskedGatherSupported(NonVectorized)) {
10058 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
10059 FinalGatheredLoads;
10060 for (LoadInst *LI : NonVectorized) {
10061 // Reinsert non-vectorized loads to other list of loads with the same
10062 // base pointers.
10063 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
10064 FinalGatheredLoads,
10065 /*AddNew=*/false);
10066 }
10067 // Final attempt to vectorize non-vectorized loads.
10068 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10069 }
10070 }
10071 // Try to vectorize postponed load entries, previously marked as gathered.
10072 for (unsigned Idx : LoadEntriesToVectorize) {
10073 const TreeEntry &E = *VectorizableTree[Idx];
10074 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10075 // Avoid reordering, if possible.
10076 if (!E.ReorderIndices.empty()) {
10077 // Build a mask out of the reorder indices and reorder scalars per this
10078 // mask.
10079 SmallVector<int> ReorderMask;
10080 inversePermutation(E.ReorderIndices, ReorderMask);
10081 reorderScalars(GatheredScalars, ReorderMask);
10082 }
10083 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10084 }
10085 // If no new entries were created, consider it as if no gathered-loads entries
10086 // must be handled.
10087 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10088 VectorizableTree.size())
10089 GatheredLoadsEntriesFirst.reset();
10090}
10091
10092 /// Generates a key/subkey pair for the given value to provide effective sorting
10093 /// of the values and better detection of vectorizable value sequences. The
10094 /// keys can be used for better sorting of the values themselves (keys) and
10095 /// within value subgroups (subkeys).
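/// Informally: two simple loads of the same type hash to the same key (load
/// opcode plus type), with the subkey supplied by LoadsSubkeyGenerator, so
/// they sort next to each other, while e.g. a load and a binary operator of
/// the same type end up under different keys.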
10096static std::pair<size_t, size_t> generateKeySubkey(
10097 Value *V, const TargetLibraryInfo *TLI,
10098 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10099 bool AllowAlternate) {
10100 hash_code Key = hash_value(V->getValueID() + 2);
10101 hash_code SubKey = hash_value(0);
10102 // Sort the loads by the distance between the pointers.
10103 if (auto *LI = dyn_cast<LoadInst>(V)) {
10104 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
10105 if (LI->isSimple())
10106 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
10107 else
10108 Key = SubKey = hash_value(LI);
10109 } else if (isVectorLikeInstWithConstOps(V)) {
10110 // Sort extracts by the vector operands.
10111 if (isa<ExtractElementInst, UndefValue>(V))
10112 Key = hash_value(Value::UndefValueVal + 1);
10113 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
10114 if (!isUndefVector(EI->getVectorOperand()).all() &&
10115 !isa<UndefValue>(EI->getIndexOperand()))
10116 SubKey = hash_value(EI->getVectorOperand());
10117 }
10118 } else if (auto *I = dyn_cast<Instruction>(V)) {
10119 // Sort other instructions just by the opcodes except for CMPInst.
10120 // For CMP also sort by the predicate kind.
10121 if (isa<BinaryOperator, CastInst>(I) &&
10122 isValidForAlternation(I->getOpcode())) {
10123 if (AllowAlternate)
10124 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
10125 else
10126 Key = hash_combine(hash_value(I->getOpcode()), Key);
10127 SubKey = hash_combine(
10128 hash_value(I->getOpcode()), hash_value(I->getType()),
10129 hash_value(isa<BinaryOperator>(I)
10130 ? I->getType()
10131 : cast<CastInst>(I)->getOperand(0)->getType()));
10132 // For casts, look through the only operand to improve compile time.
10133 if (isa<CastInst>(I)) {
10134 std::pair<size_t, size_t> OpVals =
10135 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
10136 /*AllowAlternate=*/true);
10137 Key = hash_combine(OpVals.first, Key);
10138 SubKey = hash_combine(OpVals.first, SubKey);
10139 }
10140 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
10141 CmpInst::Predicate Pred = CI->getPredicate();
10142 if (CI->isCommutative())
10143 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
10144 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
10145 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
10146 hash_value(SwapPred),
10147 hash_value(CI->getOperand(0)->getType()));
10148 } else if (auto *Call = dyn_cast<CallInst>(I)) {
10149 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
10150 if (isTriviallyVectorizable(ID)) {
10151 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
10152 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
10153 SubKey = hash_combine(hash_value(I->getOpcode()),
10154 hash_value(Call->getCalledFunction()));
10155 } else {
10157 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
10158 }
10159 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10160 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
10161 hash_value(Op.Tag), SubKey);
10162 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
10163 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
10164 SubKey = hash_value(Gep->getPointerOperand());
10165 else
10166 SubKey = hash_value(Gep);
10167 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
10168 !isa<ConstantInt>(I->getOperand(1))) {
10169 // Do not try to vectorize instructions with potentially high cost.
10170 SubKey = hash_value(I);
10171 } else {
10172 SubKey = hash_value(I->getOpcode());
10173 }
10174 Key = hash_combine(hash_value(I->getParent()), Key);
10175 }
10176 return std::make_pair(Key, SubKey);
10177}
10178
10179 /// Checks if the specified instruction \p I is a main operation for the given
10180 /// \p MainOp and \p AltOp instructions.
10181static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10182 Instruction *AltOp, const TargetLibraryInfo &TLI);
10183
10184bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
10185 ArrayRef<Value *> VL) const {
10186 Type *ScalarTy = S.getMainOp()->getType();
10187 unsigned Opcode0 = S.getOpcode();
10188 unsigned Opcode1 = S.getAltOpcode();
10189 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10190 // If this pattern is supported by the target then consider it profitable.
10191 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
10192 Opcode1, OpcodeMask))
10193 return true;
10194 SmallVector<ValueList> Operands;
10195 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
10196 Operands.emplace_back();
10197 // Prepare the operand vector.
10198 for (Value *V : VL) {
10199 if (isa<PoisonValue>(V)) {
10200 Operands.back().push_back(
10201 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
10202 continue;
10203 }
10204 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
10205 }
10206 }
10207 if (Operands.size() == 2) {
10208 // Try to find the best operand candidates.
10209 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
10211 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
10212 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
10213 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
10214 std::optional<int> Res = findBestRootPair(Candidates);
10215 switch (Res.value_or(0)) {
10216 case 0:
10217 break;
10218 case 1:
10219 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
10220 break;
10221 case 2:
10222 std::swap(Operands[0][I], Operands[1][I]);
10223 break;
10224 default:
10225 llvm_unreachable("Unexpected index.");
10226 }
10227 }
10228 }
10229 DenseSet<unsigned> UniqueOpcodes;
10230 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
10231 unsigned NonInstCnt = 0;
10232 // Estimate number of instructions, required for the vectorized node and for
10233 // the buildvector node.
10234 unsigned UndefCnt = 0;
10235 // Count the number of extra shuffles, required for vector nodes.
10236 unsigned ExtraShuffleInsts = 0;
10237 // Check that operands do not contain the same values and create either a
10238 // perfect diamond match or a shuffled match.
10239 if (Operands.size() == 2) {
10240 // Do not count same operands twice.
10241 if (Operands.front() == Operands.back()) {
10242 Operands.erase(Operands.begin());
10243 } else if (!allConstant(Operands.front()) &&
10244 all_of(Operands.front(), [&](Value *V) {
10245 return is_contained(Operands.back(), V);
10246 })) {
10247 Operands.erase(Operands.begin());
10248 ++ExtraShuffleInsts;
10249 }
10250 }
10251 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
10252 // Vectorize the node, if:
10253 // 1. At least a single operand is constant or splat.
10254 // 2. Operands have many loop invariants (the instructions are not loop
10255 // invariants).
10256 // 3. At least a single unique operand is supposed to be vectorized.
10257 return none_of(Operands,
10258 [&](ArrayRef<Value *> Op) {
10259 if (allConstant(Op) ||
10260 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
10261 getSameOpcode(Op, *TLI)))
10262 return false;
10263 DenseMap<Value *, unsigned> Uniques;
10264 for (Value *V : Op) {
10265 if (isConstant(V) ||
10266 isVectorized(V) || (L && L->isLoopInvariant(V))) {
10267 if (isa<UndefValue>(V))
10268 ++UndefCnt;
10269 continue;
10270 }
10271 auto Res = Uniques.try_emplace(V, 0);
10272 // Found first duplicate - need to add shuffle.
10273 if (!Res.second && Res.first->second == 1)
10274 ++ExtraShuffleInsts;
10275 ++Res.first->getSecond();
10276 if (auto *I = dyn_cast<Instruction>(V))
10277 UniqueOpcodes.insert(I->getOpcode());
10278 else if (Res.second)
10279 ++NonInstCnt;
10280 }
10281 return none_of(Uniques, [&](const auto &P) {
10282 return P.first->hasNUsesOrMore(P.second + 1) &&
10283 none_of(P.first->users(), [&](User *U) {
10284 return isVectorized(U) || Uniques.contains(U);
10285 });
10286 });
10287 }) ||
10288 // Do not vectorize the node if the estimated number of vector instructions
10289 // is greater than the estimated number of buildvector instructions. The
10290 // number of vector operands is the number of vector instructions + the
10291 // number of vector instructions for the operands (buildvectors). The number
10292 // of buildvector instructions is just number_of_operands * number_of_scalars.
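// As a rough numeric sketch (hypothetical counts): with 2 operands and
// VL.size() == 4 the buildvector estimate is 2 * 4 = 8 instructions, which
// the vector-side estimate (unique opcodes + non-instruction operands +
// extra shuffles + NumAltInsts) is compared against below.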
10293 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10294 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10295 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10296}
10297
10298/// Builds the arguments types vector for the given call instruction with the
10299/// given \p ID for the specified vector factor.
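/// For instance (a sketch, assuming the usual powi semantics): a call to
/// llvm.powi.f32.i32(float %x, i32 %p) with VF = 4 and MinBW = 0 yields
/// {<4 x float>, i32}, since the exponent is a scalar operand and is not
/// widened.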
10300 static SmallVector<Type *>
10301 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10302 const unsigned VF, unsigned MinBW,
10303 const TargetTransformInfo *TTI) {
10304 SmallVector<Type *> ArgTys;
10305 for (auto [Idx, Arg] : enumerate(CI->args())) {
10306 if (ID != Intrinsic::not_intrinsic) {
10307 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
10308 ArgTys.push_back(Arg->getType());
10309 continue;
10310 }
10311 if (MinBW > 0) {
10312 ArgTys.push_back(
10313 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10314 continue;
10315 }
10316 }
10317 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10318 }
10319 return ArgTys;
10320}
10321
10322/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10323/// function (if possible) calls. Returns invalid cost for the corresponding
10324/// calls, if they cannot be vectorized/will be scalarized.
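/// E.g., if a vector library provides a matching VF-wide variant of the
/// callee, LibCost reflects that call; otherwise LibCost stays invalid and
/// only the intrinsic cost (bounded by a scalarization limit) is meaningful.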
10325static std::pair<InstructionCost, InstructionCost>
10326 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10327 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10328 ArrayRef<Type *> ArgTys) {
10329 auto Shape = VFShape::get(CI->getFunctionType(),
10330 ElementCount::getFixed(VecTy->getNumElements()),
10331 false /*HasGlobalPred*/);
10332 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10333 auto LibCost = InstructionCost::getInvalid();
10334 if (!CI->isNoBuiltin() && VecFunc) {
10335 // Calculate the cost of the vector library call.
10336 // If the corresponding vector call is cheaper, return its cost.
10337 LibCost =
10338 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10339 }
10340 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10341
10342 // Calculate the cost of the vector intrinsic call.
10343 FastMathFlags FMF;
10344 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10345 FMF = FPCI->getFastMathFlags();
10346 const InstructionCost ScalarLimit = 10000;
10347 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10348 LibCost.isValid() ? LibCost : ScalarLimit);
10349 auto IntrinsicCost =
10350 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10351 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10352 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10353 IntrinsicCost = InstructionCost::getInvalid();
10354
10355 return {IntrinsicCost, LibCost};
10356}
10357
10358BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10359 const InstructionsState &S, ArrayRef<Value *> VL,
10360 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10361 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10362 assert(S.getMainOp() &&
10363 "Expected instructions with same/alternate opcodes only.");
10364
10365 unsigned ShuffleOrOp =
10366 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10367 Instruction *VL0 = S.getMainOp();
10368 switch (ShuffleOrOp) {
10369 case Instruction::PHI: {
10370 // Too many operands - gather, most probably won't be vectorized.
10371 if (VL0->getNumOperands() > MaxPHINumOperands)
10372 return TreeEntry::NeedToGather;
10373 // Check for terminator values (e.g. invoke).
10374 for (Value *V : VL) {
10375 auto *PHI = dyn_cast<PHINode>(V);
10376 if (!PHI)
10377 continue;
10378 for (Value *Incoming : PHI->incoming_values()) {
10379 Instruction *Term = dyn_cast<Instruction>(Incoming);
10380 if (Term && Term->isTerminator()) {
10381 LLVM_DEBUG(dbgs()
10382 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10383 return TreeEntry::NeedToGather;
10384 }
10385 }
10386 }
10387
10388 return TreeEntry::Vectorize;
10389 }
10390 case Instruction::ExtractElement:
10391 if (any_of(VL, [&](Value *V) {
10392 auto *EI = dyn_cast<ExtractElementInst>(V);
10393 if (!EI)
10394 return true;
10395 return isVectorized(EI->getOperand(0));
10396 }))
10397 return TreeEntry::NeedToGather;
10398 [[fallthrough]];
10399 case Instruction::ExtractValue: {
10400 bool Reuse = canReuseExtract(VL, CurrentOrder);
10401 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10402 // non-full registers).
10403 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10404 return TreeEntry::NeedToGather;
10405 if (Reuse || !CurrentOrder.empty())
10406 return TreeEntry::Vectorize;
10407 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10408 return TreeEntry::NeedToGather;
10409 }
10410 case Instruction::InsertElement: {
10411 // Check that we have a buildvector and not a shuffle of 2 or more
10412 // different vectors.
10413 ValueSet SourceVectors;
10414 for (Value *V : VL) {
10415 if (isa<PoisonValue>(V)) {
10416 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10417 return TreeEntry::NeedToGather;
10418 }
10419 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10420 assert(getElementIndex(V) != std::nullopt &&
10421 "Non-constant or undef index?");
10422 }
10423
10424 if (count_if(VL, [&SourceVectors](Value *V) {
10425 return !SourceVectors.contains(V);
10426 }) >= 2) {
10427 // Found 2nd source vector - cancel.
10428 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10429 "different source vectors.\n");
10430 return TreeEntry::NeedToGather;
10431 }
10432
10433 if (any_of(VL, [&SourceVectors](Value *V) {
10434 // The last InsertElement can have multiple uses.
10435 return SourceVectors.contains(V) && !V->hasOneUse();
10436 })) {
10437 assert(SLPReVec && "Only supported by REVEC.");
10438 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10439 "multiple uses.\n");
10440 return TreeEntry::NeedToGather;
10441 }
10442
10443 return TreeEntry::Vectorize;
10444 }
10445 case Instruction::Load: {
10446 // Check that a vectorized load would load the same memory as a scalar
10447 // load. For example, we don't want to vectorize loads that are smaller
10448 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
10449 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10450 // from such a struct, we read/write packed bits disagreeing with the
10451 // unvectorized version.
10452 auto IsGatheredNode = [&]() {
10453 if (!GatheredLoadsEntriesFirst)
10454 return false;
10455 return all_of(VL, [&](Value *V) {
10456 if (isa<PoisonValue>(V))
10457 return true;
10458 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10459 return TE->Idx >= *GatheredLoadsEntriesFirst;
10460 });
10461 });
10462 };
10463 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10464 case LoadsState::Vectorize:
10465 return TreeEntry::Vectorize;
10466 case LoadsState::CompressVectorize:
10467 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10468 // Delay slow vectorized nodes for better vectorization attempts.
10469 LoadEntriesToVectorize.insert(VectorizableTree.size());
10470 return TreeEntry::NeedToGather;
10471 }
10472 return IsGatheredNode() ? TreeEntry::NeedToGather
10473 : TreeEntry::CompressVectorize;
10474 case LoadsState::ScatterVectorize:
10475 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10476 // Delay slow vectorized nodes for better vectorization attempts.
10477 LoadEntriesToVectorize.insert(VectorizableTree.size());
10478 return TreeEntry::NeedToGather;
10479 }
10480 return IsGatheredNode() ? TreeEntry::NeedToGather
10481 : TreeEntry::ScatterVectorize;
10482 case LoadsState::StridedVectorize:
10483 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10484 // Delay slow vectorized nodes for better vectorization attempts.
10485 LoadEntriesToVectorize.insert(VectorizableTree.size());
10486 return TreeEntry::NeedToGather;
10487 }
10488 return IsGatheredNode() ? TreeEntry::NeedToGather
10489 : TreeEntry::StridedVectorize;
10490 case LoadsState::Gather:
10491#ifndef NDEBUG
10492 Type *ScalarTy = VL0->getType();
10493 if (DL->getTypeSizeInBits(ScalarTy) !=
10494 DL->getTypeAllocSizeInBits(ScalarTy))
10495 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10496 else if (any_of(VL, [](Value *V) {
10497 auto *LI = dyn_cast<LoadInst>(V);
10498 return !LI || !LI->isSimple();
10499 }))
10500 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10501 else
10502 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10503#endif // NDEBUG
10505 return TreeEntry::NeedToGather;
10506 }
10507 llvm_unreachable("Unexpected state of loads");
10508 }
10509 case Instruction::ZExt:
10510 case Instruction::SExt:
10511 case Instruction::FPToUI:
10512 case Instruction::FPToSI:
10513 case Instruction::FPExt:
10514 case Instruction::PtrToInt:
10515 case Instruction::IntToPtr:
10516 case Instruction::SIToFP:
10517 case Instruction::UIToFP:
10518 case Instruction::Trunc:
10519 case Instruction::FPTrunc:
10520 case Instruction::BitCast: {
10521 Type *SrcTy = VL0->getOperand(0)->getType();
10522 for (Value *V : VL) {
10523 if (isa<PoisonValue>(V))
10524 continue;
10525 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10526 if (Ty != SrcTy || !isValidElementType(Ty)) {
10527 LLVM_DEBUG(
10528 dbgs() << "SLP: Gathering casts with different src types.\n");
10529 return TreeEntry::NeedToGather;
10530 }
10531 }
10532 return TreeEntry::Vectorize;
10533 }
10534 case Instruction::ICmp:
10535 case Instruction::FCmp: {
10536 // Check that all of the compares have the same predicate.
10537 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10538 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10539 Type *ComparedTy = VL0->getOperand(0)->getType();
10540 for (Value *V : VL) {
10541 if (isa<PoisonValue>(V))
10542 continue;
10543 auto *Cmp = cast<CmpInst>(V);
10544 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10545 Cmp->getOperand(0)->getType() != ComparedTy) {
10546 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10547 return TreeEntry::NeedToGather;
10548 }
10549 }
10550 return TreeEntry::Vectorize;
10551 }
10552 case Instruction::Select:
10553 case Instruction::FNeg:
10554 case Instruction::Add:
10555 case Instruction::FAdd:
10556 case Instruction::Sub:
10557 case Instruction::FSub:
10558 case Instruction::Mul:
10559 case Instruction::FMul:
10560 case Instruction::UDiv:
10561 case Instruction::SDiv:
10562 case Instruction::FDiv:
10563 case Instruction::URem:
10564 case Instruction::SRem:
10565 case Instruction::FRem:
10566 case Instruction::Shl:
10567 case Instruction::LShr:
10568 case Instruction::AShr:
10569 case Instruction::And:
10570 case Instruction::Or:
10571 case Instruction::Xor:
10572 case Instruction::Freeze:
10573 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10574 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10575 auto *I = dyn_cast<Instruction>(V);
10576 return I && I->isBinaryOp() && !I->isFast();
10577 }))
10578 return TreeEntry::NeedToGather;
10579 return TreeEntry::Vectorize;
10580 case Instruction::GetElementPtr: {
10581 // We don't combine GEPs with complicated (nested) indexing.
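// (I.e. GEPs with more than a single index operand, e.g. gep %p, i64 0, i64 %i.)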
10582 for (Value *V : VL) {
10583 auto *I = dyn_cast<GetElementPtrInst>(V);
10584 if (!I)
10585 continue;
10586 if (I->getNumOperands() != 2) {
10587 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10588 return TreeEntry::NeedToGather;
10589 }
10590 }
10591
10592 // We can't combine several GEPs into one vector if they operate on
10593 // different types.
10594 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10595 for (Value *V : VL) {
10596 auto *GEP = dyn_cast<GEPOperator>(V);
10597 if (!GEP)
10598 continue;
10599 Type *CurTy = GEP->getSourceElementType();
10600 if (Ty0 != CurTy) {
10601 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10602 return TreeEntry::NeedToGather;
10603 }
10604 }
10605
10606 // We don't combine GEPs with non-constant indexes.
10607 Type *Ty1 = VL0->getOperand(1)->getType();
10608 for (Value *V : VL) {
10609 auto *I = dyn_cast<GetElementPtrInst>(V);
10610 if (!I)
10611 continue;
10612 auto *Op = I->getOperand(1);
10613 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10614 (Op->getType() != Ty1 &&
10615 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10616 Op->getType()->getScalarSizeInBits() >
10617 DL->getIndexSizeInBits(
10618 V->getType()->getPointerAddressSpace())))) {
10619 LLVM_DEBUG(
10620 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10621 return TreeEntry::NeedToGather;
10622 }
10623 }
10624
10625 return TreeEntry::Vectorize;
10626 }
10627 case Instruction::Store: {
10628 // Check if the stores are consecutive or if we need to swizzle them.
10629 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10630 // Avoid types that are padded when being allocated as scalars, while
10631 // being packed together in a vector (such as i1).
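// For example, a scalar i1 store writes a whole byte, while an <8 x i1> vector
// store writes packed bits, so the two layouts would disagree.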
10632 if (DL->getTypeSizeInBits(ScalarTy) !=
10633 DL->getTypeAllocSizeInBits(ScalarTy)) {
10634 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10635 return TreeEntry::NeedToGather;
10636 }
10637 // Make sure all stores in the bundle are simple - we can't vectorize
10638 // atomic or volatile stores.
10639 for (Value *V : VL) {
10640 auto *SI = cast<StoreInst>(V);
10641 if (!SI->isSimple()) {
10642 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10643 return TreeEntry::NeedToGather;
10644 }
10645 PointerOps.push_back(SI->getPointerOperand());
10646 }
10647
10648 // Check the order of pointer operands.
10649 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10650 Value *Ptr0;
10651 Value *PtrN;
10652 if (CurrentOrder.empty()) {
10653 Ptr0 = PointerOps.front();
10654 PtrN = PointerOps.back();
10655 } else {
10656 Ptr0 = PointerOps[CurrentOrder.front()];
10657 PtrN = PointerOps[CurrentOrder.back()];
10658 }
10659 std::optional<int64_t> Dist =
10660 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10661 // Check that the sorted pointer operands are consecutive.
10662 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10663 return TreeEntry::Vectorize;
10664 }
10665
10666 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10667 return TreeEntry::NeedToGather;
10668 }
10669 case Instruction::Call: {
10670 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10671 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10672 auto *I = dyn_cast<Instruction>(V);
10673 return I && !I->isFast();
10674 }))
10675 return TreeEntry::NeedToGather;
10676 // Check if the calls are all to the same vectorizable intrinsic or
10677 // library function.
10678 CallInst *CI = cast<CallInst>(VL0);
10679 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10680
10681 VFShape Shape = VFShape::get(
10682 CI->getFunctionType(),
10683 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10684 false /*HasGlobalPred*/);
10685 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10686
10687 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10688 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10689 return TreeEntry::NeedToGather;
10690 }
10691 Function *F = CI->getCalledFunction();
10692 unsigned NumArgs = CI->arg_size();
10693 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10694 for (unsigned J = 0; J != NumArgs; ++J)
10695 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10696 ScalarArgs[J] = CI->getArgOperand(J);
10697 for (Value *V : VL) {
10698 CallInst *CI2 = dyn_cast<CallInst>(V);
10699 if (!CI2 || CI2->getCalledFunction() != F ||
10700 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10701 (VecFunc &&
10702 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10703 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10704 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10705 << "\n");
10706 return TreeEntry::NeedToGather;
10707 }
10708 // Some intrinsics have scalar arguments and should be same in order for
10709 // them to be vectorized.
10710 for (unsigned J = 0; J != NumArgs; ++J) {
10711 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10712 Value *A1J = CI2->getArgOperand(J);
10713 if (ScalarArgs[J] != A1J) {
10714 LLVM_DEBUG(dbgs()
10715 << "SLP: mismatched arguments in call:" << *CI
10716 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10717 return TreeEntry::NeedToGather;
10718 }
10719 }
10720 }
10721 // Verify that the bundle operands are identical between the two calls.
10722 if (CI->hasOperandBundles() &&
10723 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10724 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10725 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10726 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10727 << "!=" << *V << '\n');
10728 return TreeEntry::NeedToGather;
10729 }
10730 }
10731 SmallVector<Type *> ArgTys =
10732 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10733 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10734 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10735 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10736 return TreeEntry::NeedToGather;
10737
10738 return TreeEntry::Vectorize;
10739 }
10740 case Instruction::ShuffleVector: {
10741 if (!S.isAltShuffle()) {
10742 // REVEC can support non-alternate shuffles.
10743 if (SLPReVec && getShufflevectorNumGroups(VL))
10744 return TreeEntry::Vectorize;
10745 // If this is not an alternate sequence of opcode like add-sub
10746 // then do not vectorize this instruction.
10747 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10748 return TreeEntry::NeedToGather;
10749 }
10750 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10751 LLVM_DEBUG(
10752 dbgs()
10753 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10754 "the whole alt sequence is not profitable.\n");
10755 return TreeEntry::NeedToGather;
10756 }
10757
10758 return TreeEntry::Vectorize;
10759 }
10760 default:
10761 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10762 return TreeEntry::NeedToGather;
10763 }
10764}
10765
10766namespace {
10767 /// Allows correct handling of the operands of phi nodes, based on the \p Main
10768 /// PHINode order of incoming basic blocks/values.
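/// For example, if \p Main has incoming blocks (BB0, BB1) while another phi in
/// \p Phis lists them as (BB1, BB0), getOperands(0) still returns, for every
/// phi, the value incoming from BB0.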
10769class PHIHandler {
10770 DominatorTree &DT;
10771 PHINode *Main = nullptr;
10772 SmallVector<Value *> Phis;
10773 SmallVector<SmallVector<Value *>> Operands;
10774
10775public:
10776 PHIHandler() = delete;
10777 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10778 : DT(DT), Main(Main), Phis(Phis),
10779 Operands(Main->getNumIncomingValues(),
10780 SmallVector<Value *>(Phis.size(), nullptr)) {}
10781 void buildOperands() {
10782 constexpr unsigned FastLimit = 4;
10783 if (Main->getNumIncomingValues() <= FastLimit) {
10784 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10785 BasicBlock *InBB = Main->getIncomingBlock(I);
10786 if (!DT.isReachableFromEntry(InBB)) {
10787 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10788 continue;
10789 }
10790 // Prepare the operand vector.
10791 for (auto [Idx, V] : enumerate(Phis)) {
10792 auto *P = dyn_cast<PHINode>(V);
10793 if (!P) {
10794 assert(isa<PoisonValue>(V) &&
10795 "Expected isa instruction or poison value.");
10796 Operands[I][Idx] = V;
10797 continue;
10798 }
10799 if (P->getIncomingBlock(I) == InBB)
10800 Operands[I][Idx] = P->getIncomingValue(I);
10801 else
10802 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10803 }
10804 }
10805 return;
10806 }
10807 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10808 Blocks;
10809 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10810 BasicBlock *InBB = Main->getIncomingBlock(I);
10811 if (!DT.isReachableFromEntry(InBB)) {
10812 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10813 continue;
10814 }
10815 Blocks.try_emplace(InBB).first->second.push_back(I);
10816 }
10817 for (auto [Idx, V] : enumerate(Phis)) {
10818 if (isa<PoisonValue>(V)) {
10819 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10820 Operands[I][Idx] = V;
10821 continue;
10822 }
10823 auto *P = cast<PHINode>(V);
10824 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10825 BasicBlock *InBB = P->getIncomingBlock(I);
10826 if (InBB == Main->getIncomingBlock(I)) {
10827 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10828 continue;
10829 Operands[I][Idx] = P->getIncomingValue(I);
10830 continue;
10831 }
10832 auto *It = Blocks.find(InBB);
10833 if (It == Blocks.end())
10834 continue;
10835 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10836 }
10837 }
10838 for (const auto &P : Blocks) {
10839 ArrayRef<unsigned> IncomingValues = P.second;
10840 if (IncomingValues.size() <= 1)
10841 continue;
10842 unsigned BasicI = IncomingValues.consume_front();
10843 for (unsigned I : IncomingValues) {
10844 assert(all_of(enumerate(Operands[I]),
10845 [&](const auto &Data) {
10846 return !Data.value() ||
10847 Data.value() == Operands[BasicI][Data.index()];
10848 }) &&
10849 "Expected empty operands list.");
10850 Operands[I] = Operands[BasicI];
10851 }
10852 }
10853 }
10854 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10855};
10856} // namespace
10857
10858 /// Returns main/alternate instructions for the given \p VL. Unlike
10859 /// getSameOpcode, it supports non-compatible instructions for better
10860 /// SplitVectorize node support.
10861 /// \returns the first main/alt instructions if the list contains only poisons
10862 /// and instructions with exactly 2 opcodes; returns a pair of nullptrs otherwise.
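/// For example, for instructions from one block, {add, sub, poison, add} yields
/// {add, sub}, while {add, sub, mul} yields {nullptr, nullptr}.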
10863static std::pair<Instruction *, Instruction *>
10864 getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10865 Instruction *MainOp = nullptr;
10866 Instruction *AltOp = nullptr;
10867 for (Value *V : VL) {
10868 if (isa<PoisonValue>(V))
10869 continue;
10870 auto *I = dyn_cast<Instruction>(V);
10871 if (!I)
10872 return {};
10873 if (!MainOp) {
10874 MainOp = I;
10875 continue;
10876 }
10877 if (MainOp->getOpcode() == I->getOpcode()) {
10878 if (I->getParent() != MainOp->getParent())
10879 return {};
10880 continue;
10881 }
10882 if (!AltOp) {
10883 AltOp = I;
10884 continue;
10885 }
10886 if (AltOp->getOpcode() == I->getOpcode()) {
10887 if (I->getParent() != AltOp->getParent())
10888 return {};
10889 continue;
10890 }
10891 return {};
10892 }
10893 if (!AltOp)
10894 return {};
10895 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10896 "Expected different main and alt instructions.");
10897 return std::make_pair(MainOp, AltOp);
10898}
10899
10900/// Checks that every instruction appears once in the list and if not, packs
10901/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10902/// unique scalars is extended by poison values to the whole register size.
10903///
10904/// \returns false if \p VL could not be uniquified, in which case \p VL is
10905/// unchanged and \p ReuseShuffleIndices is empty.
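/// For example, for VL = {a, b, a, b} with non-constant a and b, the mask
/// becomes {0, 1, 0, 1} and VL is shrunk to the unique scalars {a, b}.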
10906 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10907 SmallVectorImpl<int> &ReuseShuffleIndices,
10908 const TargetTransformInfo &TTI,
10909 const TargetLibraryInfo &TLI,
10910 const InstructionsState &S,
10911 const BoUpSLP::EdgeInfo &UserTreeIdx,
10912 bool TryPad = false) {
10913 // Check that every instruction appears once in this bundle.
10914 SmallVector<Value *> UniqueValues;
10915 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10916 for (Value *V : VL) {
10917 if (isConstant(V)) {
10918 // Constants are always considered distinct, even if the same constant
10919 // appears multiple times in VL.
10920 ReuseShuffleIndices.emplace_back(
10921 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10922 UniqueValues.emplace_back(V);
10923 continue;
10924 }
10925 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10926 ReuseShuffleIndices.emplace_back(Res.first->second);
10927 if (Res.second)
10928 UniqueValues.emplace_back(V);
10929 }
10930
10931 // Easy case: VL has unique values and a "natural" size
10932 size_t NumUniqueScalarValues = UniqueValues.size();
10933 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10934 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10935 if (NumUniqueScalarValues == VL.size() &&
10936 (VectorizeNonPowerOf2 || IsFullVectors)) {
10937 ReuseShuffleIndices.clear();
10938 return true;
10939 }
10940
10941 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10942 if ((UserTreeIdx.UserTE &&
10943 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10944 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10945 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10946 "for nodes with padding.\n");
10947 ReuseShuffleIndices.clear();
10948 return false;
10949 }
10950
10951 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10952 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10953 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10954 return isa<UndefValue>(V) || !isConstant(V);
10955 }))) {
10956 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10957 S.getMainOp()->isSafeToRemove() &&
10958 (S.areInstructionsWithCopyableElements() ||
10959 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10960 // Find the number of elements which form full vectors.
10961 unsigned PWSz = getFullVectorNumberOfElements(
10962 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10963 PWSz = std::min<unsigned>(PWSz, VL.size());
10964 if (PWSz == VL.size()) {
10965 // We ended up with the same size after removing duplicates and
10966 // upgrading the resulting vector size to a "nice size". Just keep
10967 // the initial VL then.
10968 ReuseShuffleIndices.clear();
10969 } else {
10970 // Pad unique values with poison to grow the vector to a "nice" size
10971 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10972 UniqueValues.end());
10973 PaddedUniqueValues.append(
10974 PWSz - UniqueValues.size(),
10975 PoisonValue::get(UniqueValues.front()->getType()));
10976 // Check that the operations, extended with poisons/copyables, are still
10977 // valid for vectorization (div/rem are not allowed).
10978 if ((!S.areInstructionsWithCopyableElements() &&
10979 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10980 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10981 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10982 isa<CallInst>(S.getMainOp())))) {
10983 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10984 ReuseShuffleIndices.clear();
10985 return false;
10986 }
10987 VL = std::move(PaddedUniqueValues);
10988 }
10989 return true;
10990 }
10991 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10992 ReuseShuffleIndices.clear();
10993 return false;
10994 }
10995 VL = std::move(UniqueValues);
10996 return true;
10997}
10998
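// A split node partitions a bundle such as {add, add, fsub, fsub, fsub, add,
// add, add} into Op1 (the adds) and Op2 (the fsubs) so that the two parts can
// be vectorized separately, with ReorderIndices restoring the original lane
// order.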
10999bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
11000 const InstructionsState &LocalState,
11001 SmallVectorImpl<Value *> &Op1,
11002 SmallVectorImpl<Value *> &Op2,
11003 OrdersType &ReorderIndices) const {
11004 constexpr unsigned SmallNodeSize = 4;
11005 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11006 !SplitAlternateInstructions)
11007 return false;
11008
11009 // Check if this is a duplicate of another split entry.
11010 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
11011 << ".\n");
11012 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11013 if (E->isSame(VL)) {
11014 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11015 << *LocalState.getMainOp() << ".\n");
11016 return false;
11017 }
11018 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11019 if (all_of(VL, [&](Value *V) {
11020 return isa<PoisonValue>(V) || Values.contains(V);
11021 })) {
11022 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11023 return false;
11024 }
11025 }
11026
11027 ReorderIndices.assign(VL.size(), VL.size());
11028 SmallBitVector Op1Indices(VL.size());
11029 for (auto [Idx, V] : enumerate(VL)) {
11030 auto *I = dyn_cast<Instruction>(V);
11031 if (!I) {
11032 Op1.push_back(V);
11033 Op1Indices.set(Idx);
11034 continue;
11035 }
11036 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11037 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
11038 *TLI)) ||
11039 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11040 !isAlternateInstruction(I, LocalState.getMainOp(),
11041 LocalState.getAltOp(), *TLI))) {
11042 Op1.push_back(V);
11043 Op1Indices.set(Idx);
11044 continue;
11045 }
11046 Op2.push_back(V);
11047 }
11048 Type *ScalarTy = getValueType(VL.front());
11049 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
11050 unsigned Opcode0 = LocalState.getOpcode();
11051 unsigned Opcode1 = LocalState.getAltOpcode();
11052 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11053 // Enable the split node only if the scalars do not form a legal alternate
11054 // instruction (like X86 addsub).
11055 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
11056 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
11057 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11058 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11059 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
11060 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
11061 return false;
11062 // Enable the split node only if all parts are power-of-2/full registers.
11063 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11064 for (unsigned Idx : seq<unsigned>(VL.size())) {
11065 if (Op1Indices.test(Idx)) {
11066 ReorderIndices[Op1Cnt] = Idx;
11067 ++Op1Cnt;
11068 } else {
11069 ReorderIndices[Op2Cnt] = Idx;
11070 ++Op2Cnt;
11071 }
11072 }
11073 if (isIdentityOrder(ReorderIndices))
11074 ReorderIndices.clear();
11075 SmallVector<int> Mask;
11076 if (!ReorderIndices.empty())
11077 inversePermutation(ReorderIndices, Mask);
11078 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11079 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
11080 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
11081 // Check for non-profitable single-register ops, which are better represented
11082 // as alternate ops.
11083 if (NumParts >= VL.size())
11084 return false;
11085 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11086 InstructionCost InsertCost = ::getShuffleCost(
11087 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
11088 FixedVectorType *SubVecTy =
11089 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
11090 InstructionCost NewShuffleCost =
11091 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
11092 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11093 (Mask.empty() || InsertCost >= NewShuffleCost))
11094 return false;
11095 if ((LocalState.getMainOp()->isBinaryOp() &&
11096 LocalState.getAltOp()->isBinaryOp() &&
11097 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11098 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11099 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11100 (LocalState.getMainOp()->isUnaryOp() &&
11101 LocalState.getAltOp()->isUnaryOp())) {
11102 InstructionCost OriginalVecOpsCost =
11103 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11104 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11105 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11106 for (unsigned Idx : seq<unsigned>(VL.size())) {
11107 if (isa<PoisonValue>(VL[Idx]))
11108 continue;
11109 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11110 }
11111 InstructionCost OriginalCost =
11112 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
11113 VecTy, OriginalMask, Kind);
11114 InstructionCost NewVecOpsCost =
11115 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11116 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11117 InstructionCost NewCost =
11118 NewVecOpsCost + InsertCost +
11119 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11120 VectorizableTree.front()->getOpcode() == Instruction::Store
11121 ? NewShuffleCost
11122 : 0);
11123 // If not profitable to split - exit.
11124 if (NewCost >= OriginalCost)
11125 return false;
11126 }
11127 return true;
11128}
11129
11130namespace {
11131 /// Class accepts an incoming list of values, checks if it is able to model
11132 /// "copyable" values as compatible operations, and generates the list of values
11133 /// for scheduling and the list of operands for the new nodes.
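/// For example, the bundle {add %a, %b; %c} may be modeled as
/// {add %a, %b; add %c, 0}, treating %c as a "copyable" element represented by
/// an idempotent add.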
11134class InstructionsCompatibilityAnalysis {
11135 DominatorTree &DT;
11136 const DataLayout &DL;
11137 const TargetTransformInfo &TTI;
11138 const TargetLibraryInfo &TLI;
11139 unsigned MainOpcode = 0;
11140 Instruction *MainOp = nullptr;
11141
11142 /// Checks if the opcode is supported as the main opcode for copyable
11143 /// elements.
11144 static bool isSupportedOpcode(const unsigned Opcode) {
11145 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11146 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11147 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11148 Opcode == Instruction::And || Opcode == Instruction::Or ||
11149 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11150 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11151 Opcode == Instruction::FDiv;
11152 }
11153
11154 /// Identifies the best candidate value, which represents the main opcode
11155 /// operation.
11156 /// Currently the best candidate is the Add instruction whose parent block has
11157 /// the highest DFS incoming number (the block dominated by the others).
11158 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11159 BasicBlock *Parent = nullptr;
11160 // Checks if the instruction has supported opcode.
11161 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11162 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11163 return false;
11164 return I && isSupportedOpcode(I->getOpcode()) &&
11165 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
11166 };
11167 // Exclude operand instructions immediately to improve compile time; they
11168 // would be impossible to schedule anyway.
11169 SmallDenseSet<Value *, 8> Operands;
11170 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11171 bool AnyUndef = false;
11172 for (Value *V : VL) {
11173 auto *I = dyn_cast<Instruction>(V);
11174 if (!I) {
11175 AnyUndef |= isa<UndefValue>(V);
11176 continue;
11177 }
11178 if (!DT.isReachableFromEntry(I->getParent()))
11179 continue;
11180 if (Candidates.empty()) {
11181 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11182 Parent = I->getParent();
11183 Operands.insert(I->op_begin(), I->op_end());
11184 continue;
11185 }
11186 if (Parent == I->getParent()) {
11187 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11188 Operands.insert(I->op_begin(), I->op_end());
11189 continue;
11190 }
11191 auto *NodeA = DT.getNode(Parent);
11192 auto *NodeB = DT.getNode(I->getParent());
11193 assert(NodeA && "Should only process reachable instructions");
11194 assert(NodeB && "Should only process reachable instructions");
11195 assert((NodeA == NodeB) ==
11196 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11197 "Different nodes should have different DFS numbers");
11198 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11199 Candidates.clear();
11200 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11201 Parent = I->getParent();
11202 Operands.clear();
11203 Operands.insert(I->op_begin(), I->op_end());
11204 }
11205 }
11206 unsigned BestOpcodeNum = 0;
11207 MainOp = nullptr;
11208 bool UsedOutside = false;
11209 for (const auto &P : Candidates) {
11210 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
11211 if (UsedOutside && !PUsedOutside)
11212 continue;
11213 if (!UsedOutside && PUsedOutside)
11214 BestOpcodeNum = 0;
11215 if (P.second.size() < BestOpcodeNum)
11216 continue;
11217 // Skip candidates that have inner dependencies.
11218 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11219 return Operands.contains(I);
11220 }))
11221 continue;
11222 UsedOutside = PUsedOutside;
11223 for (Instruction *I : P.second) {
11224 if (IsSupportedInstruction(I, AnyUndef)) {
11225 MainOp = I;
11226 BestOpcodeNum = P.second.size();
11227 break;
11228 }
11229 }
11230 }
11231 if (MainOp) {
11232 // Do not match, if any copyable is a terminator from the same block as
11233 // the main operation.
11234 if (any_of(VL, [&](Value *V) {
11235 auto *I = dyn_cast<Instruction>(V);
11236 return I && I->getParent() == MainOp->getParent() &&
11237 I->isTerminator();
11238 })) {
11239 MainOp = nullptr;
11240 return;
11241 }
11242 MainOpcode = MainOp->getOpcode();
11243 }
11244 }
11245
11246 /// Returns the idempotent value for the \p MainOp with the detected \p
11247 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11248 /// the operand itself, since V or V == V.
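/// Similarly, for Sub/Xor/Shl/LShr the identity is 0, and for And it is the
/// all-ones value (V & -1 == V).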
11249 Value *selectBestIdempotentValue() const {
11250 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11251 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
11252 !MainOp->isCommutative());
11253 }
11254
11255 /// Returns the value and operands for \p V: if it is an original instruction,
11256 /// its actual operands are returned; if it is a copyable element, it is
11257 /// represented as an idempotent instruction.
11258 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11259 if (isa<PoisonValue>(V))
11260 return {V, V};
11261 if (!S.isCopyableElement(V))
11262 return convertTo(cast<Instruction>(V), S).second;
11263 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11264 return {V, selectBestIdempotentValue()};
11265 }
11266
11267 /// Builds operands for the original instructions.
11268 void
11269 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11270 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11271
11272 unsigned ShuffleOrOp =
11273 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11274 Instruction *VL0 = S.getMainOp();
11275
11276 switch (ShuffleOrOp) {
11277 case Instruction::PHI: {
11278 auto *PH = cast<PHINode>(VL0);
11279
11280 // Keeps the reordered operands to avoid code duplication.
11281 PHIHandler Handler(DT, PH, VL);
11282 Handler.buildOperands();
11283 Operands.assign(PH->getNumOperands(), {});
11284 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11285 Operands[I].assign(Handler.getOperands(I).begin(),
11286 Handler.getOperands(I).end());
11287 return;
11288 }
11289 case Instruction::ExtractValue:
11290 case Instruction::ExtractElement:
11291 // This is a special case, as it does not gather, but at the same time
11292 // we are not extending buildTree_rec() towards the operands.
11293 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11294 return;
11295 case Instruction::InsertElement:
11296 Operands.assign(2, {VL.size(), nullptr});
11297 for (auto [Idx, V] : enumerate(VL)) {
11298 auto *IE = cast<InsertElementInst>(V);
11299 for (auto [OpIdx, Ops] : enumerate(Operands))
11300 Ops[Idx] = IE->getOperand(OpIdx);
11301 }
11302 return;
11303 case Instruction::Load:
11304 Operands.assign(
11305 1, {VL.size(),
11306 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11307 for (auto [V, Op] : zip(VL, Operands.back())) {
11308 auto *LI = dyn_cast<LoadInst>(V);
11309 if (!LI)
11310 continue;
11311 Op = LI->getPointerOperand();
11312 }
11313 return;
11314 case Instruction::ZExt:
11315 case Instruction::SExt:
11316 case Instruction::FPToUI:
11317 case Instruction::FPToSI:
11318 case Instruction::FPExt:
11319 case Instruction::PtrToInt:
11320 case Instruction::IntToPtr:
11321 case Instruction::SIToFP:
11322 case Instruction::UIToFP:
11323 case Instruction::Trunc:
11324 case Instruction::FPTrunc:
11325 case Instruction::BitCast:
11326 case Instruction::ICmp:
11327 case Instruction::FCmp:
11328 case Instruction::Select:
11329 case Instruction::FNeg:
11330 case Instruction::Add:
11331 case Instruction::FAdd:
11332 case Instruction::Sub:
11333 case Instruction::FSub:
11334 case Instruction::Mul:
11335 case Instruction::FMul:
11336 case Instruction::UDiv:
11337 case Instruction::SDiv:
11338 case Instruction::FDiv:
11339 case Instruction::URem:
11340 case Instruction::SRem:
11341 case Instruction::FRem:
11342 case Instruction::Shl:
11343 case Instruction::LShr:
11344 case Instruction::AShr:
11345 case Instruction::And:
11346 case Instruction::Or:
11347 case Instruction::Xor:
11348 case Instruction::Freeze:
11349 case Instruction::Store:
11350 case Instruction::ShuffleVector:
11351 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11352 for (auto [Idx, V] : enumerate(VL)) {
11353 auto *I = dyn_cast<Instruction>(V);
11354 if (!I) {
11355 for (auto [OpIdx, Ops] : enumerate(Operands))
11356 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11357 continue;
11358 }
11359 auto [Op, ConvertedOps] = convertTo(I, S);
11360 for (auto [OpIdx, Ops] : enumerate(Operands))
11361 Ops[Idx] = ConvertedOps[OpIdx];
11362 }
11363 return;
11364 case Instruction::GetElementPtr: {
11365 Operands.assign(2, {VL.size(), nullptr});
11366 // Need to cast all indices to the same type before vectorization to
11367 // avoid crash.
11368 // Required to be able to find correct matches between different gather
11369 // nodes and reuse the vectorized values rather than trying to gather them
11370 // again.
11371 const unsigned IndexIdx = 1;
11372 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11373 Type *Ty =
11374 all_of(VL,
11375 [&](Value *V) {
11376 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11377 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11378 })
11379 ? VL0Ty
11380 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11381 ->getPointerOperandType()
11382 ->getScalarType());
11383 for (auto [Idx, V] : enumerate(VL)) {
11384 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11385 if (!GEP) {
11386 Operands[0][Idx] = V;
11387 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11388 continue;
11389 }
11390 Operands[0][Idx] = GEP->getPointerOperand();
11391 auto *Op = GEP->getOperand(IndexIdx);
11392 auto *CI = dyn_cast<ConstantInt>(Op);
11393 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11394 CI, Ty, CI->getValue().isSignBitSet(), DL)
11395 : Op;
11396 }
11397 return;
11398 }
11399 case Instruction::Call: {
11400 auto *CI = cast<CallInst>(VL0);
11401 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
11402 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11403 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
11404 continue;
11405 auto &Ops = Operands.emplace_back();
11406 for (Value *V : VL) {
11407 auto *I = dyn_cast<Instruction>(V);
11408 Ops.push_back(I ? I->getOperand(Idx)
11409 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11410 }
11411 }
11412 return;
11413 }
11414 default:
11415 break;
11416 }
11417 llvm_unreachable("Unexpected vectorization of the instructions.");
11418 }
11419
11420public:
11421 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11422 const TargetTransformInfo &TTI,
11423 const TargetLibraryInfo &TLI)
11424 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11425
11426 InstructionsState
11427 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11428 bool TryCopyableElementsVectorization,
11429 bool WithProfitabilityCheck = false,
11430 bool SkipSameCodeCheck = false) {
11431 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11432 ? InstructionsState::invalid()
11433 : getSameOpcode(VL, TLI);
11434 if (S)
11435 return S;
11436 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11437 return S;
11438 findAndSetMainInstruction(VL, R);
11439 if (!MainOp)
11440 return InstructionsState::invalid();
11441 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11442 if (!WithProfitabilityCheck)
11443 return S;
11444 // Check if it is profitable to vectorize the instruction.
11445 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11446 auto BuildCandidates =
11447 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11448 Value *V2) {
11449 if (V1 != V2 && isa<PHINode>(V1))
11450 return;
11451 auto *I1 = dyn_cast<Instruction>(V1);
11452 auto *I2 = dyn_cast<Instruction>(V2);
11453 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11454 I1->getParent() != I2->getParent())
11455 return;
11456 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11457 };
11458 if (VL.size() == 2) {
11459 // Check if the operands allow better vectorization.
11460 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11461 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11462 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11463 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11464 R.findBestRootPair(Candidates1) &&
11465 R.findBestRootPair(Candidates2);
11466 if (!Res && isCommutative(MainOp)) {
11467 Candidates1.clear();
11468 Candidates2.clear();
11469 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11470 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11471 Res = !Candidates1.empty() && !Candidates2.empty() &&
11472 R.findBestRootPair(Candidates1) &&
11473 R.findBestRootPair(Candidates2);
11474 }
11475 if (!Res)
11476 return InstructionsState::invalid();
11477 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11478 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11479 InstructionCost VectorCost;
11480 FixedVectorType *VecTy =
11481 getWidenedType(S.getMainOp()->getType(), VL.size());
11482 switch (MainOpcode) {
11483 case Instruction::Add:
11484 case Instruction::Sub:
11485 case Instruction::LShr:
11486 case Instruction::Shl:
11487 case Instruction::SDiv:
11488 case Instruction::UDiv:
11489 case Instruction::And:
11490 case Instruction::Or:
11491 case Instruction::Xor:
11492 case Instruction::FAdd:
11493 case Instruction::FMul:
11494 case Instruction::FSub:
11495 case Instruction::FDiv:
11496 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11497 break;
11498 default:
11499 llvm_unreachable("Unexpected instruction.");
11500 }
11501 if (VectorCost > ScalarCost)
11502 return InstructionsState::invalid();
11503 return S;
11504 }
11505 assert(Operands.size() == 2 && "Unexpected number of operands!");
11506 unsigned CopyableNum =
11507 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11508 if (CopyableNum < VL.size() / 2)
11509 return S;
11510 // Too many phi copyables - exit.
11511 const unsigned Limit = VL.size() / 24;
11512 if ((CopyableNum >= VL.size() - Limit ||
11513 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11514 CopyableNum >= MaxPHINumOperands) &&
11515 all_of(VL, [&](Value *V) {
11516 return isa<PHINode>(V) || !S.isCopyableElement(V);
11517 }))
11518 return InstructionsState::invalid();
11519 // Check profitability if number of copyables > VL.size() / 2.
11520 // 1. Reorder operands for better matching.
11521 if (isCommutative(MainOp)) {
11522 for (auto &Ops : Operands) {
11523 // Make instructions the first operands.
11524 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11525 std::swap(Ops.front(), Ops.back());
11526 continue;
11527 }
11528 // Make constants the second operands.
11529 if (isa<Constant>(Ops.front())) {
11530 std::swap(Ops.front(), Ops.back());
11531 continue;
11532 }
11533 }
11534 }
11535 // 2. Check, if operands can be vectorized.
11536 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11537 return InstructionsState::invalid();
11538 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11539 if (allConstant(Ops) || isSplat(Ops))
11540 return true;
11541 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11542 // single one is different.
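// E.g. {a, a, a, b} counts as an almost-splat.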
11543 constexpr unsigned Limit = 4;
11544 if (Operands.front().size() >= Limit) {
11545 SmallDenseMap<const Value *, unsigned> Counters;
11546 for (Value *V : Ops) {
11547 if (isa<UndefValue>(V))
11548 continue;
11549 ++Counters[V];
11550 }
11551 if (Counters.size() == 2 &&
11552 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11553 return C.second == 1;
11554 }))
11555 return true;
11556 }
11557 // First operand not a constant or splat? Last attempt - check for
11558 // potential vectorization.
11559 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11560 InstructionsState OpS = Analysis.buildInstructionsState(
11561 Ops, R, /*TryCopyableElementsVectorization=*/true);
11562 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11563 return false;
11564 unsigned CopyableNum =
11565 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11566 return CopyableNum <= VL.size() / 2;
11567 };
11568 if (!CheckOperand(Operands.front()))
11569 return InstructionsState::invalid();
11570
11571 return S;
11572 }
11573
11574 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11575 ArrayRef<Value *> VL) {
11576 assert(S && "Invalid state!");
11577 SmallVector<BoUpSLP::ValueList> Operands;
11578 if (S.areInstructionsWithCopyableElements()) {
11579 MainOp = S.getMainOp();
11580 MainOpcode = S.getOpcode();
11581 Operands.assign(MainOp->getNumOperands(),
11582 BoUpSLP::ValueList(VL.size(), nullptr));
11583 for (auto [Idx, V] : enumerate(VL)) {
11584 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11585 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11586 Operands[OperandIdx][Idx] = Operand;
11587 }
11588 } else {
11589 buildOriginalOperands(S, VL, Operands);
11590 }
11591 return Operands;
11592 }
11593};
11594} // namespace
11595
11596BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11597 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11598 bool TryCopyableElementsVectorization) const {
11599 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11600
11601 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11602 InstructionsState S = Analysis.buildInstructionsState(
11603 VL, *this, TryCopyableElementsVectorization,
11604 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11605
11606 bool AreScatterAllGEPSameBlock = false;
11607 if (!S) {
11608 SmallVector<unsigned> SortedIndices;
11609 BasicBlock *BB = nullptr;
11610 bool IsScatterVectorizeUserTE =
11611 UserTreeIdx.UserTE &&
11612 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11613 AreScatterAllGEPSameBlock =
11614 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11615 VL.size() > 2 &&
11616 all_of(VL,
11617 [&BB](Value *V) {
11618 auto *I = dyn_cast<GetElementPtrInst>(V);
11619 if (!I)
11620 return doesNotNeedToBeScheduled(V);
11621 if (!BB)
11622 BB = I->getParent();
11623 return BB == I->getParent() && I->getNumOperands() == 2;
11624 }) &&
11625 BB &&
11626 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
11627 *SE, SortedIndices));
11628 if (!AreScatterAllGEPSameBlock) {
11629 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11630 "C,S,B,O, small shuffle. \n";
11631 dbgs() << "[";
11632 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11633 dbgs() << "]\n");
11634 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11635 /*TryToFindDuplicates=*/true,
11636 /*TrySplitVectorize=*/true);
11637 }
11638 // Reset S to make it GetElementPtr kind of node.
11639 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11640 assert(It != VL.end() && "Expected at least one GEP.");
11641 S = getSameOpcode(*It, *TLI);
11642 }
11643 assert(S && "Must be valid.");
11644
11645 // Don't handle vectors.
11646 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11647 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11648 // Do not try to pack to avoid extra instructions here.
11649 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11650 /*TryToFindDuplicates=*/false);
11651 }
11652
11653 // Check that all of the users of the scalars that we want to vectorize are
11654 // schedulable.
11655 BasicBlock *BB = S.getMainOp()->getParent();
11656
11657 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11658 !DT->isReachableFromEntry(BB)) {
11659 // Don't go into unreachable blocks. They may contain instructions with
11660 // dependency cycles which confuse the final scheduling.
11661 // Do not vectorize EH and non-returning blocks, not profitable in most
11662 // cases.
11663 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11664 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11665 }
11666
11667 // Don't go into catchswitch blocks, which can happen with PHIs.
11668 // Such blocks can only have PHIs and the catchswitch. There is no
11669 // place to insert a shuffle if we need to, so just avoid that issue.
11670 if (isa<CatchSwitchInst>(BB->getTerminator())) {
11671 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11672 // Do not try to pack to avoid extra instructions here.
11673 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11674 /*TryToFindDuplicates=*/false);
11675 }
11676
11677 // Don't handle scalable vectors
11678 if (S.getOpcode() == Instruction::ExtractElement &&
11679 isa<ScalableVectorType>(
11680 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11681 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11682 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11683 }
11684
11685 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11686 // a load), in which case peek through to include it in the tree, without
11687 // ballooning over-budget.
11688 if (Depth >= RecursionMaxDepth &&
11689 (S.isAltShuffle() || VL.size() < 4 ||
11690 !(match(S.getMainOp(), m_Load(m_Value())) ||
11691 all_of(VL, [&S](const Value *I) {
11692 return match(I,
11693 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11694 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11695 })))) {
11696 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11697 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11698 }
11699
11700 // Check if this is a duplicate of another entry.
11701 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11702 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11703 if (E->isSame(VL)) {
11704 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11705 << ".\n");
11706 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11707 }
11708 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11709 if (all_of(VL, [&](Value *V) {
11710 return isa<PoisonValue>(V) || Values.contains(V) ||
11711 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11712 LI->getLoopFor(S.getMainOp()->getParent()) &&
11713 isVectorized(V));
11714 })) {
11715 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11716 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11717 }
11718 }
11719
11720 // If all of the operands are identical or constant we have a simple solution.
11721 // If we deal with insert/extract instructions, they all must have constant
11722 // indices, otherwise we should gather them, not try to vectorize.
11723 // If alternate op node with 2 elements with gathered operands - do not
11724 // vectorize.
11725 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11726 if (!S || !S.isAltShuffle() || VL.size() > 2)
11727 return false;
11728 if (VectorizableTree.size() < MinTreeSize)
11729 return false;
11730 if (Depth >= RecursionMaxDepth - 1)
11731 return true;
11732 // Check if all operands are extracts, part of vector node or can build a
11733 // regular vectorize node.
11734 SmallVector<unsigned, 8> InstsCount;
11735 for (Value *V : VL) {
11736 auto *I = cast<Instruction>(V);
11737 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11738 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11739 }));
11740 }
11741 bool IsCommutative =
11742 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11743 if ((IsCommutative &&
11744 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11745 (!IsCommutative &&
11746 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11747 return true;
11748 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11749 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11750 auto *I1 = cast<Instruction>(VL.front());
11751 auto *I2 = cast<Instruction>(VL.back());
11752 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11753 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11754 I2->getOperand(Op));
11755 if (static_cast<unsigned>(count_if(
11756 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11757 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11758 })) >= S.getMainOp()->getNumOperands() / 2)
11759 return false;
11760 if (S.getMainOp()->getNumOperands() > 2)
11761 return true;
11762 if (IsCommutative) {
11763 // Check permuted operands.
11764 Candidates.clear();
11765 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11766 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11767 I2->getOperand((Op + 1) % E));
11768 if (any_of(
11769 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11770 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11771 }))
11772 return false;
11773 }
11774 return true;
11775 };
11776 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11777 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11778 if (!AreAllSameInsts || isSplat(VL) ||
11779 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11780 S.getMainOp()) &&
11781 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11782 NotProfitableForVectorization(VL)) {
11783 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11784 dbgs() << "[";
11785 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11786 dbgs() << "]\n");
11787 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11788 }
11789
11790 // Don't vectorize ephemeral values.
11791 if (!EphValues.empty()) {
11792 for (Value *V : VL) {
11793 if (EphValues.count(V)) {
11794 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11795 << ") is ephemeral.\n");
11796 // Do not try to pack to avoid extra instructions here.
11797 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11798 /*TryToFindDuplicates=*/false);
11799 }
11800 }
11801 }
11802
11803 // We now know that this is a vector of instructions of the same type from
11804 // the same block.
11805
11806 // Check that none of the instructions in the bundle are already in the tree
11807 // and that the node is not a small alternate node, which may not be
11808 // profitable to vectorize.
11809 if (S.isAltShuffle()) {
11810 auto GetNumVectorizedExtracted = [&]() {
11811 APInt Extracted = APInt::getZero(VL.size());
11812 APInt Vectorized = APInt::getAllOnes(VL.size());
11813 for (auto [Idx, V] : enumerate(VL)) {
11814 auto *I = dyn_cast<Instruction>(V);
11815 if (!I || doesNotNeedToBeScheduled(I) ||
11816 all_of(I->operands(), [&](const Use &U) {
11817 return isa<ExtractElementInst>(U.get());
11818 }))
11819 continue;
11820 if (isVectorized(I))
11821 Vectorized.clearBit(Idx);
11822 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11823 Extracted.setBit(Idx);
11824 }
11825 return std::make_pair(Vectorized, Extracted);
11826 };
11827 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11828 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11829 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11830 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11831 // Rough cost estimation to check if the vector code (+ potential extracts)
11832 // is more profitable than the scalar code + buildvector.
11833 Type *ScalarTy = VL.front()->getType();
11834 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11835 InstructionCost VectorizeCostEstimate =
11836 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11837 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11838 /*Insert=*/false, /*Extract=*/true, Kind);
11839 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11840 *TTI, ScalarTy, VecTy, Vectorized,
11841 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11842 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11843 }
11844 if (PreferScalarize) {
11845 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11846 "node is not profitable.\n");
11847 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11848 }
11849 }
11850
11851 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11852 if (UserIgnoreList && !UserIgnoreList->empty()) {
11853 for (Value *V : VL) {
11854 if (UserIgnoreList->contains(V)) {
11855 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11856 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11857 }
11858 }
11859 }
11860
11861 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11862}
11863
11864void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11865 const EdgeInfo &UserTreeIdx,
11866 unsigned InterleaveFactor) {
11867 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11868
11869 SmallVector<int> ReuseShuffleIndices;
11870 SmallVector<Value *> VL(VLRef);
11871
11872 // Tries to build split node.
11873 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11874 SmallVector<Value *> Op1, Op2;
11875 OrdersType ReorderIndices;
11876 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11877 return false;
11878
11879 auto Invalid = ScheduleBundle::invalid();
11880 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11881 UserTreeIdx, {}, ReorderIndices);
11882 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11883 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11884 InstructionsState S = getSameOpcode(Op, *TLI);
11885 if (S && (isa<LoadInst>(S.getMainOp()) ||
11886 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11887 // Build gather node for loads, they will be gathered later.
11888 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11889 Idx == 0 ? 0 : Op1.size());
11890 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11891 } else {
11892 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11893 Idx == 0 ? 0 : Op1.size());
11894 buildTreeRec(Op, Depth, {TE, Idx});
11895 }
11896 };
11897 AddNode(Op1, 0);
11898 AddNode(Op2, 1);
11899 return true;
11900 };
11901
11902 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11903 bool AreConsts = false;
11904 for (Value *V : VL) {
11905 if (isa<PoisonValue>(V))
11906 continue;
11907 if (isa<Constant>(V)) {
11908 AreConsts = true;
11909 continue;
11910 }
11911 if (!isa<PHINode>(V))
11912 return false;
11913 }
11914 return AreConsts;
11915 };
11916 if (AreOnlyConstsWithPHIs(VL)) {
11917 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11918 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11919 return;
11920 }
11921
11922 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11923 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11924 InstructionsState S = Legality.getInstructionsState();
11925 if (!Legality.isLegal()) {
11926 if (Legality.trySplitVectorize()) {
11927 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11928 // Last chance to try to vectorize alternate node.
11929 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11930 return;
11931 }
11932 if (!S)
11933 Legality = getScalarsVectorizationLegality(
11934 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11935 if (!Legality.isLegal()) {
11936 if (Legality.tryToFindDuplicates())
11937 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11938 UserTreeIdx);
11939
11940 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11941 return;
11942 }
11943 S = Legality.getInstructionsState();
11944 }
11945
11946 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11947 if (S.isAltShuffle() && TrySplitNode(S))
11948 return;
11949
11950 // Check that every instruction appears once in this bundle.
11951 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11952 /*TryPad=*/true)) {
11953 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11954 return;
11955 }
11956
11957 // Perform specific checks for each particular instruction kind.
11958 bool IsScatterVectorizeUserTE =
11959 UserTreeIdx.UserTE &&
11960 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11961 OrdersType CurrentOrder;
11962 SmallVector<Value *> PointerOps;
11963 StridedPtrInfo SPtrInfo;
11964 TreeEntry::EntryState State = getScalarsVectorizationState(
11965 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11966 if (State == TreeEntry::NeedToGather) {
11967 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11968 return;
11969 }
11970
11971 Instruction *VL0 = S.getMainOp();
11972 BasicBlock *BB = VL0->getParent();
11973 auto &BSRef = BlocksSchedules[BB];
11974 if (!BSRef)
11975 BSRef = std::make_unique<BlockScheduling>(BB);
11976
11977 BlockScheduling &BS = *BSRef;
11978
11979 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11980 std::optional<ScheduleBundle *> BundlePtr =
11981 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11982#ifdef EXPENSIVE_CHECKS
11983 // Make sure we didn't break any internal invariants
11984 BS.verify();
11985#endif
11986 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11987 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11988 // Last chance to try to vectorize alternate node.
11989 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11990 return;
11991 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11992 NonScheduledFirst.insert(VL.front());
11993 if (S.getOpcode() == Instruction::Load &&
11994 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11995 registerNonVectorizableLoads(VL);
11996 return;
11997 }
11998 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11999 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
12000 ScheduleBundle Empty;
12001 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
12002 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
12003
12004 unsigned ShuffleOrOp =
12005 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
12006 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
12007 // Postpone PHI nodes creation
12008 SmallVector<unsigned> PHIOps;
12009 for (unsigned I : seq<unsigned>(Operands.size())) {
12010 ArrayRef<Value *> Op = Operands[I];
12011 if (Op.empty())
12012 continue;
12013 InstructionsState S = getSameOpcode(Op, *TLI);
12014 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12015 buildTreeRec(Op, Depth + 1, {TE, I});
12016 else
12017 PHIOps.push_back(I);
12018 }
12019 for (unsigned I : PHIOps)
12020 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12021 };
12022 switch (ShuffleOrOp) {
12023 case Instruction::PHI: {
12024 TreeEntry *TE =
12025 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12026 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
12027 TE->dump());
12028
12029 TE->setOperands(Operands);
12030 CreateOperandNodes(TE, Operands);
12031 return;
12032 }
12033 case Instruction::ExtractValue:
12034 case Instruction::ExtractElement: {
12035 if (CurrentOrder.empty()) {
12036 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
12037 } else {
12038 LLVM_DEBUG({
12039 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
12040 "with order";
12041 for (unsigned Idx : CurrentOrder)
12042 dbgs() << " " << Idx;
12043 dbgs() << "\n";
12044 });
12045 fixupOrderingIndices(CurrentOrder);
12046 }
12047 // Insert new order with initial value 0, if it does not exist,
12048 // otherwise return the iterator to the existing one.
12049 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12050 ReuseShuffleIndices, CurrentOrder);
12051 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
12052 "(ExtractValueInst/ExtractElementInst).\n";
12053 TE->dump());
12054 // This is a special case, as it does not gather, but at the same time
12055 // we are not extending buildTreeRec() towards the operands.
12056 TE->setOperands(Operands);
12057 return;
12058 }
12059 case Instruction::InsertElement: {
12060 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
12061
12062 auto OrdCompare = [](const std::pair<int, int> &P1,
12063 const std::pair<int, int> &P2) {
12064 return P1.first > P2.first;
12065 };
12066 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
12067 decltype(OrdCompare)>
12068 Indices(OrdCompare);
12069 for (int I = 0, E = VL.size(); I < E; ++I) {
12070 unsigned Idx = *getElementIndex(VL[I]);
12071 Indices.emplace(Idx, I);
12072 }
12073 OrdersType CurrentOrder(VL.size(), VL.size());
12074 bool IsIdentity = true;
12075 for (int I = 0, E = VL.size(); I < E; ++I) {
12076 CurrentOrder[Indices.top().second] = I;
12077 IsIdentity &= Indices.top().second == I;
12078 Indices.pop();
12079 }
12080 if (IsIdentity)
12081 CurrentOrder.clear();
12082 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12083 {}, CurrentOrder);
12084 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
12085 TE->dump());
12086
12087 TE->setOperands(Operands);
12088 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
12089 return;
12090 }
12091 case Instruction::Load: {
12092 // Check that a vectorized load would load the same memory as a scalar
12093 // load. For example, we don't want to vectorize loads that are smaller
12094 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
12095 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
12096 // from such a struct, we read/write packed bits disagreeing with the
12097 // unvectorized version.
12098 TreeEntry *TE = nullptr;
12099 fixupOrderingIndices(CurrentOrder);
12100 switch (State) {
12101 case TreeEntry::Vectorize:
12102 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12103 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
12104 if (CurrentOrder.empty())
12105 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
12106 TE->dump());
12107 else
12108 LLVM_DEBUG(dbgs()
12109 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
12110 TE->dump());
12111 break;
12112 case TreeEntry::CompressVectorize:
12113 // Vectorizing non-consecutive loads with (masked)load + compress.
12114 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
12115 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12116 LLVM_DEBUG(
12117 dbgs()
12118 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12119 TE->dump());
12120 break;
12121 case TreeEntry::StridedVectorize:
12122 // Vectorizing non-consecutive loads as a strided load.
12123 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
12124 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12125 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
12126 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
12127 TE->dump());
12128 break;
12129 case TreeEntry::ScatterVectorize:
12130 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12131 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
12132 UserTreeIdx, ReuseShuffleIndices);
12133 LLVM_DEBUG(
12134 dbgs()
12135 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12136 TE->dump());
12137 break;
12138 case TreeEntry::CombinedVectorize:
12139 case TreeEntry::SplitVectorize:
12140 case TreeEntry::NeedToGather:
12141 llvm_unreachable("Unexpected loads state.");
12142 }
12143 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12144 assert(Operands.size() == 1 && "Expected a single operand only");
12145 SmallVector<int> Mask;
12146 inversePermutation(CurrentOrder, Mask);
12147 reorderScalars(Operands.front(), Mask);
12148 }
12149 TE->setOperands(Operands);
12150 if (State == TreeEntry::ScatterVectorize)
12151 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
12152 return;
12153 }
12154 case Instruction::ZExt:
12155 case Instruction::SExt:
12156 case Instruction::FPToUI:
12157 case Instruction::FPToSI:
12158 case Instruction::FPExt:
12159 case Instruction::PtrToInt:
12160 case Instruction::IntToPtr:
12161 case Instruction::SIToFP:
12162 case Instruction::UIToFP:
12163 case Instruction::Trunc:
12164 case Instruction::FPTrunc:
12165 case Instruction::BitCast: {
12166 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12167 std::make_pair(std::numeric_limits<unsigned>::min(),
12168 std::numeric_limits<unsigned>::max()));
12169 if (ShuffleOrOp == Instruction::ZExt ||
12170 ShuffleOrOp == Instruction::SExt) {
12171 CastMaxMinBWSizes = std::make_pair(
12172 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12173 PrevMaxBW),
12174 std::min<unsigned>(
12175 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12176 PrevMinBW));
12177 } else if (ShuffleOrOp == Instruction::Trunc) {
12178 CastMaxMinBWSizes = std::make_pair(
12179 std::max<unsigned>(
12180 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12181 PrevMaxBW),
12182 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12183 PrevMinBW));
12184 }
12185 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12186 ReuseShuffleIndices);
12187 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
12188 TE->dump());
12189
12190 TE->setOperands(Operands);
12191 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12192 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12193 if (ShuffleOrOp == Instruction::Trunc) {
12194 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12195 } else if (ShuffleOrOp == Instruction::SIToFP ||
12196 ShuffleOrOp == Instruction::UIToFP) {
12197 unsigned NumSignBits =
12198 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12199 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
12200 APInt Mask = DB->getDemandedBits(OpI);
12201 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
12202 }
12203 if (NumSignBits * 2 >=
12204 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12205 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12206 }
12207 return;
12208 }
12209 case Instruction::ICmp:
12210 case Instruction::FCmp: {
12211 // Check that all of the compares have the same predicate.
12212 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12213 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12214 ReuseShuffleIndices);
12215 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
12216 TE->dump());
12217
12218 VLOperands Ops(VL, Operands, S, *this);
12219 if (cast<CmpInst>(VL0)->isCommutative()) {
12220 // Commutative predicate - collect + sort operands of the instructions
12221 // so that each side is more likely to have the same opcode.
12222 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
12223 "Commutative Predicate mismatch");
12224 Ops.reorder();
12225 Operands.front() = Ops.getVL(0);
12226 Operands.back() = Ops.getVL(1);
12227 } else {
12228 // Collect operands - commute if it uses the swapped predicate.
12229 for (auto [Idx, V] : enumerate(VL)) {
12230 if (isa<PoisonValue>(V))
12231 continue;
12232 auto *Cmp = cast<CmpInst>(V);
12233 if (Cmp->getPredicate() != P0)
12234 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12235 }
12236 }
12237 TE->setOperands(Operands);
12238 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12239 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12240 if (ShuffleOrOp == Instruction::ICmp) {
12241 unsigned NumSignBits0 =
12242 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12243 if (NumSignBits0 * 2 >=
12244 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12245 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12246 unsigned NumSignBits1 =
12247 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
12248 if (NumSignBits1 * 2 >=
12249 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
12250 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
12251 }
12252 return;
12253 }
12254 case Instruction::Select:
12255 case Instruction::FNeg:
12256 case Instruction::Add:
12257 case Instruction::FAdd:
12258 case Instruction::Sub:
12259 case Instruction::FSub:
12260 case Instruction::Mul:
12261 case Instruction::FMul:
12262 case Instruction::UDiv:
12263 case Instruction::SDiv:
12264 case Instruction::FDiv:
12265 case Instruction::URem:
12266 case Instruction::SRem:
12267 case Instruction::FRem:
12268 case Instruction::Shl:
12269 case Instruction::LShr:
12270 case Instruction::AShr:
12271 case Instruction::And:
12272 case Instruction::Or:
12273 case Instruction::Xor:
12274 case Instruction::Freeze: {
12275 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12276 ReuseShuffleIndices);
12277 LLVM_DEBUG(
12278 dbgs() << "SLP: added a new TreeEntry "
12279 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12280 TE->dump());
12281
12282 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
12283 VLOperands Ops(VL, Operands, S, *this);
12284 Ops.reorder();
12285 Operands[0] = Ops.getVL(0);
12286 Operands[1] = Ops.getVL(1);
12287 }
12288 TE->setOperands(Operands);
12289 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12290 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12291 return;
12292 }
12293 case Instruction::GetElementPtr: {
12294 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12295 ReuseShuffleIndices);
12296 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12297 TE->dump());
12298 TE->setOperands(Operands);
12299
12300 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12301 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12302 return;
12303 }
12304 case Instruction::Store: {
12305 bool Consecutive = CurrentOrder.empty();
12306 if (!Consecutive)
12307 fixupOrderingIndices(CurrentOrder);
12308 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12309 ReuseShuffleIndices, CurrentOrder);
12310 if (Consecutive)
12311 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12312 TE->dump());
12313 else
12314 LLVM_DEBUG(
12315 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12316 TE->dump());
12317 TE->setOperands(Operands);
12318 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
12319 return;
12320 }
12321 case Instruction::Call: {
12322 // Check if the calls are all to the same vectorizable intrinsic or
12323 // library function.
12324 CallInst *CI = cast<CallInst>(VL0);
12325 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12326
12327 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12328 ReuseShuffleIndices);
12329 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12330 TE->dump());
12331 if (isCommutative(VL0)) {
12332 VLOperands Ops(VL, Operands, S, *this);
12333 Ops.reorder();
12334 Operands[0] = Ops.getVL(0);
12335 Operands[1] = Ops.getVL(1);
12336 }
12337 TE->setOperands(Operands);
12338 for (unsigned I : seq<unsigned>(CI->arg_size())) {
12339 // For scalar operands there is no need to create an entry, since there
12340 // is nothing to vectorize.
12341 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
12342 continue;
12343 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12344 }
12345 return;
12346 }
12347 case Instruction::ShuffleVector: {
12348 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12349 ReuseShuffleIndices);
12350 if (S.isAltShuffle()) {
12351 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12352 TE->dump());
12353 } else {
12354 assert(SLPReVec && "Only supported by REVEC.");
12355 LLVM_DEBUG(
12356 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12357 TE->dump());
12358 }
12359
12360 // Reorder operands if reordering would enable vectorization.
12361 auto *CI = dyn_cast<CmpInst>(VL0);
12362 if (CI && any_of(VL, [](Value *V) {
12363 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
12364 })) {
12365 auto *MainCI = cast<CmpInst>(S.getMainOp());
12366 auto *AltCI = cast<CmpInst>(S.getAltOp());
12367 CmpInst::Predicate MainP = MainCI->getPredicate();
12368 CmpInst::Predicate AltP = AltCI->getPredicate();
12369 assert(MainP != AltP &&
12370 "Expected different main/alternate predicates.");
12371 // Collect operands - commute if it uses the swapped predicate or
12372 // alternate operation.
12373 for (auto [Idx, V] : enumerate(VL)) {
12374 if (isa<PoisonValue>(V))
12375 continue;
12376 auto *Cmp = cast<CmpInst>(V);
12377
12378 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
12379 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12380 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12381 } else {
12382 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12383 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12384 }
12385 }
12386 TE->setOperands(Operands);
12387 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12388 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12389 return;
12390 }
12391
12392 if (isa<BinaryOperator>(VL0) || CI) {
12393 VLOperands Ops(VL, Operands, S, *this);
12394 Ops.reorder();
12395 Operands[0] = Ops.getVL(0);
12396 Operands[1] = Ops.getVL(1);
12397 }
12398 TE->setOperands(Operands);
12399 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12400 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12401 return;
12402 }
12403 default:
12404 break;
12405 }
12406 llvm_unreachable("Unexpected vectorization of the instructions.");
12407}
12408
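/// \returns the flattened element count N if the aggregate type \p T (a
/// homogeneous struct, array or fixed vector, possibly nested) can be mapped
/// to a vector of N elements, or 0 otherwise. For example, both
/// {float, float, float, float} and [2 x <2 x float>] count as 4 floats,
/// provided the widened type fits between the min/max vector register sizes
/// and matches the store size of \p T.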
12409unsigned BoUpSLP::canMapToVector(Type *T) const {
12410 unsigned N = 1;
12411 Type *EltTy = T;
12412
12414 if (EltTy->isEmptyTy())
12415 return 0;
12416 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12417 // Check that struct is homogeneous.
12418 for (const auto *Ty : ST->elements())
12419 if (Ty != *ST->element_begin())
12420 return 0;
12421 N *= ST->getNumElements();
12422 EltTy = *ST->element_begin();
12423 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12424 N *= AT->getNumElements();
12425 EltTy = AT->getElementType();
12426 } else {
12427 auto *VT = cast<FixedVectorType>(EltTy);
12428 N *= VT->getNumElements();
12429 EltTy = VT->getElementType();
12430 }
12431 }
12432
12433 if (!isValidElementType(EltTy))
12434 return 0;
12435 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12436 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12437 VTSize != DL->getTypeStoreSizeInBits(T))
12438 return 0;
12439 return N;
12440}
12441
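/// Checks whether all extractelement/extractvalue instructions in \p VL read
/// from one and the same source vector (or from a load that can be rewritten
/// as a vector load) and records in \p CurrentOrder the lane order in which
/// they appear. Returns true if the extracts are already in identity order
/// (CurrentOrder is then left empty), false otherwise.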
12442bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12443 SmallVectorImpl<unsigned> &CurrentOrder,
12444 bool ResizeAllowed) const {
12445 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12446 assert(It != VL.end() && "Expected at least one extract instruction.");
12447 auto *E0 = cast<Instruction>(*It);
12448 assert(
12449 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
12450 "Invalid opcode");
12451 // Check if all of the extracts come from the same vector and from the
12452 // correct offset.
12453 Value *Vec = E0->getOperand(0);
12454
12455 CurrentOrder.clear();
12456
12457 // We have to extract from a vector/aggregate with the same number of elements.
12458 unsigned NElts;
12459 if (E0->getOpcode() == Instruction::ExtractValue) {
12460 NElts = canMapToVector(Vec->getType());
12461 if (!NElts)
12462 return false;
12463 // Check if load can be rewritten as load of vector.
12464 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12465 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12466 return false;
12467 } else {
12468 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12469 }
12470
12471 unsigned E = VL.size();
12472 if (!ResizeAllowed && NElts != E)
12473 return false;
12474 SmallVector<int> Indices(E, PoisonMaskElem);
12475 unsigned MinIdx = NElts, MaxIdx = 0;
12476 for (auto [I, V] : enumerate(VL)) {
12477 auto *Inst = dyn_cast<Instruction>(V);
12478 if (!Inst)
12479 continue;
12480 if (Inst->getOperand(0) != Vec)
12481 return false;
12482 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12483 if (isa<UndefValue>(EE->getIndexOperand()))
12484 continue;
12485 std::optional<unsigned> Idx = getExtractIndex(Inst);
12486 if (!Idx)
12487 return false;
12488 const unsigned ExtIdx = *Idx;
12489 if (ExtIdx >= NElts)
12490 continue;
12491 Indices[I] = ExtIdx;
12492 if (MinIdx > ExtIdx)
12493 MinIdx = ExtIdx;
12494 if (MaxIdx < ExtIdx)
12495 MaxIdx = ExtIdx;
12496 }
12497 if (MaxIdx - MinIdx + 1 > E)
12498 return false;
12499 if (MaxIdx + 1 <= E)
12500 MinIdx = 0;
12501
12502 // Check that all of the indices extract from the correct offset.
12503 bool ShouldKeepOrder = true;
12504 // Assign to all items the initial value E so we can check if the extract
12505 // instruction index was used already.
12506 // Also, later we can check that all the indices are used and we have a
12507 // consecutive access in the extract instructions, by checking that no
12508 // element of CurrentOrder still has value E.
12509 CurrentOrder.assign(E, E);
12510 for (unsigned I = 0; I < E; ++I) {
12511 if (Indices[I] == PoisonMaskElem)
12512 continue;
12513 const unsigned ExtIdx = Indices[I] - MinIdx;
12514 if (CurrentOrder[ExtIdx] != E) {
12515 CurrentOrder.clear();
12516 return false;
12517 }
12518 ShouldKeepOrder &= ExtIdx == I;
12519 CurrentOrder[ExtIdx] = I;
12520 }
12521 if (ShouldKeepOrder)
12522 CurrentOrder.clear();
12523
12524 return ShouldKeepOrder;
12525}
12526
12527bool BoUpSLP::areAllUsersVectorized(
12528 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12529 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12530 all_of(I->users(), [this](User *U) {
12531 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12532 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12533 });
12534}
12535
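/// Builds the shuffle mask that blends the two vectorized alternate
/// operations: lane I receives index Idx for a main-opcode scalar and
/// Sz + Idx for an alternate-opcode scalar (Sz being the number of scalars),
/// with ReorderIndices and ReuseShuffleIndices applied on top. E.g., for 4
/// scalars where lanes 1 and 3 use the alternate opcode and no reordering is
/// present, the resulting mask is <0, 5, 2, 7>.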
12536void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12537 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12538 SmallVectorImpl<Value *> *OpScalars,
12539 SmallVectorImpl<Value *> *AltScalars) const {
12540 unsigned Sz = Scalars.size();
12541 Mask.assign(Sz, PoisonMaskElem);
12542 SmallVector<int> OrderMask;
12543 if (!ReorderIndices.empty())
12544 inversePermutation(ReorderIndices, OrderMask);
12545 for (unsigned I = 0; I < Sz; ++I) {
12546 unsigned Idx = I;
12547 if (!ReorderIndices.empty())
12548 Idx = OrderMask[I];
12549 if (isa<PoisonValue>(Scalars[Idx]))
12550 continue;
12551 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12552 if (IsAltOp(OpInst)) {
12553 Mask[I] = Sz + Idx;
12554 if (AltScalars)
12555 AltScalars->push_back(OpInst);
12556 } else {
12557 Mask[I] = Idx;
12558 if (OpScalars)
12559 OpScalars->push_back(OpInst);
12560 }
12561 }
12562 if (!ReuseShuffleIndices.empty()) {
12563 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12564 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12565 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12566 });
12567 Mask.swap(NewMask);
12568 }
12569}
12570
12572 Instruction *AltOp,
12573 const TargetLibraryInfo &TLI) {
12574 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12575}
12576
12577 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12578 Instruction *AltOp,
12579 const TargetLibraryInfo &TLI) {
12580 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12581 auto *AltCI = cast<CmpInst>(AltOp);
12582 CmpInst::Predicate MainP = MainCI->getPredicate();
12583 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12584 assert(MainP != AltP && "Expected different main/alternate predicates.");
12585 auto *CI = cast<CmpInst>(I);
12586 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12587 return false;
12588 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12589 return true;
12590 CmpInst::Predicate P = CI->getPredicate();
12591 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12592
12593 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12594 "CmpInst expected to match either main or alternate predicate or "
12595 "their swap.");
12596 return MainP != P && MainP != SwappedP;
12597 }
12598 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12599}
12600
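/// Summarizes the operand values in \p Ops for TTI cost queries: it reports
/// whether they are (uniform) constants and whether all constants are powers
/// of two or negated powers of two, so the cost model can, for example, price
/// a multiplication by a splat power-of-two constant more cheaply.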
12601TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12602 assert(!Ops.empty());
12603 const auto *Op0 = Ops.front();
12604
12605 const bool IsConstant = all_of(Ops, [](Value *V) {
12606 // TODO: We should allow undef elements here
12607 return isConstant(V) && !isa<UndefValue>(V);
12608 });
12609 const bool IsUniform = all_of(Ops, [=](Value *V) {
12610 // TODO: We should allow undef elements here
12611 return V == Op0;
12612 });
12613 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12614 // TODO: We should allow undef elements here
12615 if (auto *CI = dyn_cast<ConstantInt>(V))
12616 return CI->getValue().isPowerOf2();
12617 return false;
12618 });
12619 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12620 // TODO: We should allow undef elements here
12621 if (auto *CI = dyn_cast<ConstantInt>(V))
12622 return CI->getValue().isNegatedPowerOf2();
12623 return false;
12624 });
12625
12626 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12627 if (IsConstant && IsUniform)
12628 VK = TTI::OK_UniformConstantValue;
12629 else if (IsConstant)
12630 VK = TTI::OK_NonUniformConstantValue;
12631 else if (IsUniform)
12632 VK = TTI::OK_UniformValue;
12633
12634 TTI::OperandValueProperties VP = TTI::OP_None;
12635 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12636 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12637
12638 return {VK, VP};
12639}
12640
12641namespace {
12642/// The base class for shuffle instruction emission and shuffle cost estimation.
12643class BaseShuffleAnalysis {
12644protected:
12645 Type *ScalarTy = nullptr;
12646
12647 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12648
12649 /// V is expected to be a vectorized value.
12650 /// When REVEC is disabled, there is no difference between VF and
12651 /// VNumElements.
12652 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12653 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12654 /// of 8.
12655 unsigned getVF(Value *V) const {
12656 assert(V && "V cannot be nullptr");
12657 assert(isa<FixedVectorType>(V->getType()) &&
12658 "V does not have FixedVectorType");
12659 assert(ScalarTy && "ScalarTy cannot be nullptr");
12660 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12661 unsigned VNumElements =
12662 cast<FixedVectorType>(V->getType())->getNumElements();
12663 assert(VNumElements > ScalarTyNumElements &&
12664 "the number of elements of V is not large enough");
12665 assert(VNumElements % ScalarTyNumElements == 0 &&
12666 "the number of elements of V is not a vectorized value");
12667 return VNumElements / ScalarTyNumElements;
12668 }
12669
12670 /// Checks if the mask is an identity mask.
12671 /// \param IsStrict if is true the function returns false if mask size does
12672 /// not match vector size.
12673 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12674 bool IsStrict) {
12675 int Limit = Mask.size();
12676 int VF = VecTy->getNumElements();
12677 int Index = -1;
12678 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12679 return true;
12680 if (!IsStrict) {
12681 // Consider extract subvector starting from index 0.
12682 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12683 Index == 0)
12684 return true;
12685 // All VF-size submasks are identity (e.g.
12686 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12687 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12688 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12689 return all_of(Slice, equal_to(PoisonMaskElem)) ||
12690 ShuffleVectorInst::isIdentityMask(Slice, VF);
12691 }))
12692 return true;
12693 }
12694 return false;
12695 }
12696
12697 /// Tries to combine 2 different masks into single one.
12698 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12699 /// change the size of the vector, \p LocalVF is the original size of the
12700 /// shuffled vector.
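/// For example, combining Mask <1, 0> (LocalVF = 2) with ExtMask
/// <1, 0, 3, 2> yields the single mask <0, 1, 0, 1> over the original
/// vector, since indices in ExtMask wrap modulo the intermediate VF.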
12701 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12702 ArrayRef<int> ExtMask) {
12703 unsigned VF = Mask.size();
12704 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12705 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12706 if (ExtMask[I] == PoisonMaskElem)
12707 continue;
12708 int MaskedIdx = Mask[ExtMask[I] % VF];
12709 NewMask[I] =
12710 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12711 }
12712 Mask.swap(NewMask);
12713 }
12714
12715 /// Looks through shuffles trying to reduce final number of shuffles in the
12716 /// code. The function looks through the previously emitted shuffle
12717 /// instructions and properly marks indices in the mask as undef.
12718 /// For example, given the code
12719 /// \code
12720 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12721 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12722 /// \endcode
12723 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12724 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12725 /// <0, 1, 2, 3> for the shuffle.
12726 /// If 2 operands are of different size, the smallest one will be resized and
12727 /// the mask recalculated properly.
12728 /// For example, given the code
12729 /// \code
12730 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12731 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12732 /// \endcode
12733 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12734 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12735 /// <0, 1, 2, 3> for the shuffle.
12736 /// So, it tries to transform permutations to simple vector merge, if
12737 /// possible.
12738 /// \param V The input vector which must be shuffled using the given \p Mask.
12739 /// If the better candidate is found, \p V is set to this best candidate
12740 /// vector.
12741 /// \param Mask The input mask for the shuffle. If the best candidate is found
12742 /// during looking-through-shuffles attempt, it is updated accordingly.
12743 /// \param SinglePermute true if the shuffle operation is originally a
12744 /// single-value-permutation. In this case the look-through-shuffles procedure
12745 /// may look for resizing shuffles as the best candidates.
12746 /// \return true if the shuffle results in the non-resizing identity shuffle
12747 /// (and thus can be ignored), false - otherwise.
12748 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12749 bool SinglePermute) {
12750 Value *Op = V;
12751 ShuffleVectorInst *IdentityOp = nullptr;
12752 SmallVector<int> IdentityMask;
12753 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12754 // Exit if not a fixed vector type or changing size shuffle.
12755 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12756 if (!SVTy)
12757 break;
12758 // Remember the identity or broadcast mask, if it is not a resizing
12759 // shuffle. If no better candidates are found, this Op and Mask will be
12760 // used in the final shuffle.
12761 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12762 if (!IdentityOp || !SinglePermute ||
12763 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12764 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12765 IdentityMask.size()))) {
12766 IdentityOp = SV;
12767 // Store the current mask in IdentityMask so that we do not lose this
12768 // info later if IdentityOp is selected as the best candidate for the
12769 // permutation.
12770 IdentityMask.assign(Mask);
12771 }
12772 }
12773 // Remember the broadcast mask. If no better candidates are found, this Op
12774 // and Mask will be used in the final shuffle.
12775 // Zero splat can be used as identity too, since it might be used with
12776 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12777 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
12778 // is expensive, the analysis finds out that the source vector is just a
12779 // broadcast, this original mask can be transformed to identity mask <0,
12780 // 1, 2, 3>.
12781 // \code
12782 // %0 = shuffle %v, poison, zeroinitalizer
12783 // %res = shuffle %0, poison, <3, 1, 2, 0>
12784 // \endcode
12785 // may be transformed to
12786 // \code
12787 // %0 = shuffle %v, poison, zeroinitalizer
12788 // %res = shuffle %0, poison, <0, 1, 2, 3>
12789 // \endcode
12790 if (SV->isZeroEltSplat()) {
12791 IdentityOp = SV;
12792 IdentityMask.assign(Mask);
12793 }
12794 int LocalVF = Mask.size();
12795 if (auto *SVOpTy =
12796 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12797 LocalVF = SVOpTy->getNumElements();
12798 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12799 for (auto [Idx, I] : enumerate(Mask)) {
12800 if (I == PoisonMaskElem ||
12801 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12802 continue;
12803 ExtMask[Idx] = SV->getMaskValue(I);
12804 }
12805 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12806 SV->getOperand(0),
12807 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12808 .all();
12809 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12810 SV->getOperand(1),
12811 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12812 .all();
12813 if (!IsOp1Undef && !IsOp2Undef) {
12814 // Update mask and mark undef elems.
12815 for (int &I : Mask) {
12816 if (I == PoisonMaskElem)
12817 continue;
12818 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12819 PoisonMaskElem)
12820 I = PoisonMaskElem;
12821 }
12822 break;
12823 }
12824 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12825 combineMasks(LocalVF, ShuffleMask, Mask);
12826 Mask.swap(ShuffleMask);
12827 if (IsOp2Undef)
12828 Op = SV->getOperand(0);
12829 else
12830 Op = SV->getOperand(1);
12831 }
12832 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12833 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12834 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12835 if (IdentityOp) {
12836 V = IdentityOp;
12837 assert(Mask.size() == IdentityMask.size() &&
12838 "Expected masks of same sizes.");
12839 // Clear known poison elements.
12840 for (auto [I, Idx] : enumerate(Mask))
12841 if (Idx == PoisonMaskElem)
12842 IdentityMask[I] = PoisonMaskElem;
12843 Mask.swap(IdentityMask);
12844 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12845 return SinglePermute &&
12846 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12847 /*IsStrict=*/true) ||
12848 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12849 Shuffle->isZeroEltSplat() &&
12851 all_of(enumerate(Mask), [&](const auto &P) {
12852 return P.value() == PoisonMaskElem ||
12853 Shuffle->getShuffleMask()[P.index()] == 0;
12854 })));
12855 }
12856 V = Op;
12857 return false;
12858 }
12859 V = Op;
12860 return true;
12861 }
12862
12863 /// Smart shuffle instruction emission, walks through shuffles trees and
12864 /// tries to find the best matching vector for the actual shuffle
12865 /// instruction.
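/// The mask is split into per-operand halves, peekThroughShuffles() is run on
/// each side to replace V1/V2 by earlier, wider sources where possible, and
/// the result is emitted through \p Builder as a single shuffle, or as an
/// identity (no new instruction) when the combined mask degenerates to one.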
12866 template <typename T, typename ShuffleBuilderTy>
12867 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12868 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12869 assert(V1 && "Expected at least one vector value.");
12870 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12871 SmallVector<int> NewMask(Mask);
12872 if (ScalarTyNumElements != 1) {
12873 assert(SLPReVec && "FixedVectorType is not expected.");
12874 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12875 Mask = NewMask;
12876 }
12877 if (V2)
12878 Builder.resizeToMatch(V1, V2);
12879 int VF = Mask.size();
12880 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12881 VF = FTy->getNumElements();
12882 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12883 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12884 .all()) {
12885 // Peek through shuffles.
12886 Value *Op1 = V1;
12887 Value *Op2 = V2;
12888 int VF =
12889 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12890 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12891 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12892 for (int I = 0, E = Mask.size(); I < E; ++I) {
12893 if (Mask[I] < VF)
12894 CombinedMask1[I] = Mask[I];
12895 else
12896 CombinedMask2[I] = Mask[I] - VF;
12897 }
12898 Value *PrevOp1;
12899 Value *PrevOp2;
12900 do {
12901 PrevOp1 = Op1;
12902 PrevOp2 = Op2;
12903 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12904 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12905 // Check if we have 2 resizing shuffles - need to peek through operands
12906 // again.
12907 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12908 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12909 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12910 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12911 if (I == PoisonMaskElem)
12912 continue;
12913 ExtMask1[Idx] = SV1->getMaskValue(I);
12914 }
12915 SmallBitVector UseMask1 = buildUseMask(
12916 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12917 ->getNumElements(),
12918 ExtMask1, UseMask::SecondArg);
12919 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12920 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12921 if (I == PoisonMaskElem)
12922 continue;
12923 ExtMask2[Idx] = SV2->getMaskValue(I);
12924 }
12925 SmallBitVector UseMask2 = buildUseMask(
12926 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12927 ->getNumElements(),
12928 ExtMask2, UseMask::SecondArg);
12929 if (SV1->getOperand(0)->getType() ==
12930 SV2->getOperand(0)->getType() &&
12931 SV1->getOperand(0)->getType() != SV1->getType() &&
12932 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12933 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12934 Op1 = SV1->getOperand(0);
12935 Op2 = SV2->getOperand(0);
12936 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12937 int LocalVF = ShuffleMask1.size();
12938 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12939 LocalVF = FTy->getNumElements();
12940 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12941 CombinedMask1.swap(ShuffleMask1);
12942 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12943 LocalVF = ShuffleMask2.size();
12944 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12945 LocalVF = FTy->getNumElements();
12946 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12947 CombinedMask2.swap(ShuffleMask2);
12948 }
12949 }
12950 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12951 Builder.resizeToMatch(Op1, Op2);
12952 VF = std::max(cast<VectorType>(Op1->getType())
12953 ->getElementCount()
12954 .getKnownMinValue(),
12955 cast<VectorType>(Op2->getType())
12956 ->getElementCount()
12957 .getKnownMinValue());
12958 for (int I = 0, E = Mask.size(); I < E; ++I) {
12959 if (CombinedMask2[I] != PoisonMaskElem) {
12960 assert(CombinedMask1[I] == PoisonMaskElem &&
12961 "Expected undefined mask element");
12962 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12963 }
12964 }
12965 if (Op1 == Op2 &&
12966 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12967 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12968 isa<ShuffleVectorInst>(Op1) &&
12969 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12970 ArrayRef(CombinedMask1))))
12971 return Builder.createIdentity(Op1);
12972 return Builder.createShuffleVector(
12973 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12974 CombinedMask1);
12975 }
12976 if (isa<PoisonValue>(V1))
12977 return Builder.createPoison(
12978 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12979 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12980 assert(V1 && "Expected non-null value after looking through shuffles.");
12981
12982 if (!IsIdentity)
12983 return Builder.createShuffleVector(V1, NewMask);
12984 return Builder.createIdentity(V1);
12985 }
12986
12987 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12988 /// shuffle emission.
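/// E.g., after emitting a shuffle for Mask <2, poison, 0>, CommonMask
/// <2, poison, 0> becomes <0, poison, 2>: lanes written by the emitted
/// shuffle now refer to themselves in the newly produced vector.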
12989 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12990 ArrayRef<int> Mask) {
12991 for (unsigned I : seq<unsigned>(CommonMask.size()))
12992 if (Mask[I] != PoisonMaskElem)
12993 CommonMask[I] = I;
12994 }
12995};
12996} // namespace
12997
12998/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
12999static std::pair<InstructionCost, InstructionCost>
13000 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
13001 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
13002 Type *ScalarTy, VectorType *VecTy) {
13003 InstructionCost ScalarCost = 0;
13004 InstructionCost VecCost = 0;
13005 // Here we differentiate two cases: (1) when Ptrs represent a regular
13006 // vectorization tree node (as they are pointer arguments of scattered
13007 // loads) or (2) when Ptrs are the arguments of loads or stores being
13008 // vectorized as a plain wide unit-stride load/store since all the
13009 // loads/stores are known to be from/to adjacent locations.
13010 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13011 // Case 2: estimate costs for pointer related costs when vectorizing to
13012 // a wide load/store.
13013 // Scalar cost is estimated as a set of pointers with known relationship
13014 // between them.
13015 // For vector code we will use BasePtr as argument for the wide load/store
13016 // but we also need to account all the instructions which are going to
13017 // stay in vectorized code due to uses outside of these scalar
13018 // loads/stores.
13019 ScalarCost = TTI.getPointersChainCost(
13020 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13021 CostKind);
13022
13023 SmallVector<const Value *> PtrsRetainedInVecCode;
13024 for (Value *V : Ptrs) {
13025 if (V == BasePtr) {
13026 PtrsRetainedInVecCode.push_back(V);
13027 continue;
13028 }
13029 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13030 // For simplicity assume Ptr to stay in vectorized code if it's not a
13031 // GEP instruction. We don't care since its cost is considered free.
13032 // TODO: We should check for any uses outside of vectorizable tree
13033 // rather than just single use.
13034 if (!Ptr || !Ptr->hasOneUse())
13035 PtrsRetainedInVecCode.push_back(V);
13036 }
13037
13038 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
13039 // If all pointers stay in vectorized code then we don't have
13040 // any savings on that.
13041 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
13042 }
13043 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
13044 TTI::PointersChainInfo::getKnownStride(),
13045 VecTy, CostKind);
13046 } else {
13047 // Case 1: Ptrs are the arguments of loads that we are going to transform
13048 // into masked gather load intrinsic.
13049 // All the scalar GEPs will be removed as a result of vectorization.
13050 // For any external uses of some lanes extract element instructions will
13051 // be generated (which cost is estimated separately).
13052 TTI::PointersChainInfo PtrsInfo =
13053 all_of(Ptrs,
13054 [](const Value *V) {
13055 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13056 return Ptr && !Ptr->hasAllConstantIndices();
13057 })
13058 ? TTI::PointersChainInfo::getUnknownStride()
13059 : TTI::PointersChainInfo::getKnownStride();
13060
13061 ScalarCost =
13062 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
13063 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
13064 if (!BaseGEP) {
13065 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
13066 if (It != Ptrs.end())
13067 BaseGEP = cast<GEPOperator>(*It);
13068 }
13069 if (BaseGEP) {
13070 SmallVector<const Value *> Indices(BaseGEP->indices());
13071 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
13072 BaseGEP->getPointerOperand(), Indices, VecTy,
13073 CostKind);
13074 }
13075 }
13076
13077 return std::make_pair(ScalarCost, VecCost);
13078}
13079
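/// Tries to reorder the scalars of the gather node \p TE so that scalars
/// with the same key (e.g. loads from the same underlying pointer) become
/// adjacent and can later be matched as vectorizable sub-nodes; the new order
/// is kept only if it is estimated to be cheaper than building the vector in
/// the original order.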
13080void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
13081 assert(TE.isGather() && TE.ReorderIndices.empty() &&
13082 "Expected gather node without reordering.");
13083 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
13084 SmallSet<size_t, 2> LoadKeyUsed;
13085
13086 // Do not reorder nodes that are small (just 2 elements), all-constant, or
13087 // where all instructions already have the same opcode.
13088 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
13089 all_of(TE.Scalars, isConstant))
13090 return;
13091
13092 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
13093 return VectorizableTree[Idx]->isSame(TE.Scalars);
13094 }))
13095 return;
13096
13097 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
13098 Key = hash_combine(hash_value(LI->getParent()), Key);
13099 Value *Ptr =
13100 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
13101 if (LoadKeyUsed.contains(Key)) {
13102 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
13103 if (LIt != LoadsMap.end()) {
13104 for (LoadInst *RLI : LIt->second) {
13105 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
13106 LI->getType(), LI->getPointerOperand(), *DL, *SE,
13107 /*StrictCheck=*/true))
13108 return hash_value(RLI->getPointerOperand());
13109 }
13110 for (LoadInst *RLI : LIt->second) {
13111 if (arePointersCompatible(RLI->getPointerOperand(),
13112 LI->getPointerOperand(), *TLI)) {
13113 hash_code SubKey = hash_value(RLI->getPointerOperand());
13114 return SubKey;
13115 }
13116 }
13117 if (LIt->second.size() > 2) {
13118 hash_code SubKey =
13119 hash_value(LIt->second.back()->getPointerOperand());
13120 return SubKey;
13121 }
13122 }
13123 }
13124 LoadKeyUsed.insert(Key);
13125 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
13126 return hash_value(LI->getPointerOperand());
13127 };
13128 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
13129 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
13130 bool IsOrdered = true;
13131 unsigned NumInstructions = 0;
13132 // Try to "cluster" scalar instructions, to be able to build extra vectorized
13133 // nodes.
13134 for (auto [I, V] : enumerate(TE.Scalars)) {
13135 size_t Key = 1, Idx = 1;
13136 if (auto *Inst = dyn_cast<Instruction>(V);
13138 !isDeleted(Inst) && !isVectorized(V)) {
13139 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
13140 /*AllowAlternate=*/false);
13141 ++NumInstructions;
13142 }
13143 auto &Container = SortedValues[Key];
13144 if (IsOrdered && !KeyToIndex.contains(V) &&
13147 ((Container.contains(Idx) &&
13148 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
13149 (!Container.empty() && !Container.contains(Idx) &&
13150 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
13151 IsOrdered = false;
13152 auto &KTI = KeyToIndex[V];
13153 if (KTI.empty())
13154 Container[Idx].push_back(V);
13155 KTI.push_back(I);
13156 }
13158 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13159 if (!IsOrdered && NumInstructions > 1) {
13160 unsigned Cnt = 0;
13161 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
13162 for (const auto &D : SortedValues) {
13163 for (const auto &P : D.second) {
13164 unsigned Sz = 0;
13165 for (Value *V : P.second) {
13166 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
13167 for (auto [K, Idx] : enumerate(Indices)) {
13168 TE.ReorderIndices[Cnt + K] = Idx;
13169 TE.Scalars[Cnt + K] = V;
13170 }
13171 Sz += Indices.size();
13172 Cnt += Indices.size();
13173 }
13174 if (Sz > 1 && isa<Instruction>(P.second.front())) {
13175 const unsigned SubVF = getFloorFullVectorNumberOfElements(
13176 *TTI, TE.Scalars.front()->getType(), Sz);
13177 SubVectors.emplace_back(Cnt - Sz, SubVF);
13178 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
13179 DemandedElts.clearBit(I);
13180 } else if (!P.second.empty() && isConstant(P.second.front())) {
13181 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
13182 DemandedElts.clearBit(I);
13183 }
13184 }
13185 }
13186 }
13187 // Reuses always require shuffles, so consider it as profitable.
13188 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
13189 return;
13190 // Do simple cost estimation.
13191 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13192 InstructionCost Cost = 0;
13193 auto *ScalarTy = TE.Scalars.front()->getType();
13194 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
13195 for (auto [Idx, Sz] : SubVectors) {
13196 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
13197 Idx, getWidenedType(ScalarTy, Sz));
13198 }
13199 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13200 /*Insert=*/true,
13201 /*Extract=*/false, CostKind);
13202 int Sz = TE.Scalars.size();
13203 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
13204 TE.ReorderIndices.end());
13205 for (unsigned I : seq<unsigned>(Sz)) {
13206 Value *V = TE.getOrdered(I);
13207 if (isa<PoisonValue>(V)) {
13208 ReorderMask[I] = PoisonMaskElem;
13209 } else if (isConstant(V) || DemandedElts[I]) {
13210 ReorderMask[I] = I + TE.ReorderIndices.size();
13211 }
13212 }
13213 Cost += ::getShuffleCost(*TTI,
13214 any_of(ReorderMask, [&](int I) { return I >= Sz; })
13215 ? TTI::SK_PermuteTwoSrc
13216 : TTI::SK_PermuteSingleSrc,
13217 VecTy, ReorderMask);
13218 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13219 ReorderMask.assign(Sz, PoisonMaskElem);
13220 for (unsigned I : seq<unsigned>(Sz)) {
13221 Value *V = TE.getOrdered(I);
13222 if (isConstant(V)) {
13223 DemandedElts.clearBit(I);
13224 if (!isa<PoisonValue>(V))
13225 ReorderMask[I] = I;
13226 } else {
13227 ReorderMask[I] = I + Sz;
13228 }
13229 }
13230 InstructionCost BVCost =
13231 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13232 /*Insert=*/true, /*Extract=*/false, CostKind);
13233 if (!DemandedElts.isAllOnes())
13234 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
13235 if (Cost >= BVCost) {
13236 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
13237 reorderScalars(TE.Scalars, Mask);
13238 TE.ReorderIndices.clear();
13239 }
13240}
13241
13242/// Check if we can convert fadd/fsub sequence to FMAD.
13243/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
13245 const InstructionsState &S,
13246 DominatorTree &DT, const DataLayout &DL,
13247 const TargetTransformInfo &TTI,
13248 const TargetLibraryInfo &TLI) {
13249 assert(all_of(VL,
13250 [](Value *V) {
13251 return V->getType()->getScalarType()->isFloatingPointTy();
13252 }) &&
13253 "Can only convert to FMA for floating point types");
13254 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
13255
13256 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
13257 FastMathFlags FMF;
13258 FMF.set();
13259 for (Value *V : VL) {
13260 auto *I = dyn_cast<Instruction>(V);
13261 if (!I)
13262 continue;
13263 if (S.isCopyableElement(I))
13264 continue;
13265 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
13266 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
13267 continue;
13268 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13269 FMF &= FPCI->getFastMathFlags();
13270 }
13271 return FMF.allowContract();
13272 };
13273 if (!CheckForContractable(VL))
13274 return InstructionCost::getInvalid();
13275 // fmul also should be contractable
13276 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
13277 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
13278
13279 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
13280 if (!OpS.valid())
13281 return InstructionCost::getInvalid();
13282
13283 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13284 return InstructionCost::getInvalid();
13285 if (!CheckForContractable(Operands.front()))
13286 return InstructionCost::getInvalid();
13287 // Compare the costs.
13288 InstructionCost FMulPlusFAddCost = 0;
13289 InstructionCost FMACost = 0;
13290 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13291 FastMathFlags FMF;
13292 FMF.set();
13293 for (Value *V : VL) {
13294 auto *I = dyn_cast<Instruction>(V);
13295 if (!I)
13296 continue;
13297 if (!S.isCopyableElement(I))
13298 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13299 FMF &= FPCI->getFastMathFlags();
13300 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13301 }
13302 unsigned NumOps = 0;
13303 for (auto [V, Op] : zip(VL, Operands.front())) {
13304 if (S.isCopyableElement(V))
13305 continue;
13306 auto *I = dyn_cast<Instruction>(Op);
13307 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13308 if (auto *OpI = dyn_cast<Instruction>(V))
13309 FMACost += TTI.getInstructionCost(OpI, CostKind);
13310 if (I)
13311 FMACost += TTI.getInstructionCost(I, CostKind);
13312 continue;
13313 }
13314 ++NumOps;
13315 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13316 FMF &= FPCI->getFastMathFlags();
13317 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13318 }
13319 Type *Ty = VL.front()->getType();
13320 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13321 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13322 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13323}
13324
13325 void BoUpSLP::transformNodes() {
13326 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13327 BaseGraphSize = VectorizableTree.size();
13328 // Turn graph transforming mode on and off, when done.
13329 class GraphTransformModeRAAI {
13330 bool &SavedIsGraphTransformMode;
13331
13332 public:
13333 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13334 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13335 IsGraphTransformMode = true;
13336 }
13337 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13338 } TransformContext(IsGraphTransformMode);
13339 // Operands are profitable if they are:
13340 // 1. At least one constant
13341 // or
13342 // 2. Splats
13343 // or
13344 // 3. Results in good vectorization opportunity, i.e. may generate vector
13345 // nodes and reduce cost of the graph.
13346 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13347 const InstructionsState &S) {
13348 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13349 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13350 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13351 I2->getOperand(Op));
13352 return all_of(
13353 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13354 return all_of(Cand,
13355 [](const std::pair<Value *, Value *> &P) {
13356 return isa<Constant>(P.first) ||
13357 isa<Constant>(P.second) || P.first == P.second;
13358 }) ||
13359 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
13360 });
13361 };
13362
13363 // Try to reorder gather nodes for better vectorization opportunities.
13364 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13365 TreeEntry &E = *VectorizableTree[Idx];
13366 if (E.isGather())
13367 reorderGatherNode(E);
13368 }
13369
13370 // Better to use the full gathered-loads analysis, if there are only 2
13371 // gathered load nodes, each having fewer than 16 elements.
13372 constexpr unsigned VFLimit = 16;
13373 bool ForceLoadGather =
13374 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13375 return TE->isGather() && TE->hasState() &&
13376 TE->getOpcode() == Instruction::Load &&
13377 TE->getVectorFactor() < VFLimit;
13378 }) == 2;
13379
13380 // Checks if the scalars are used in other node.
13381 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13382 function_ref<bool(Value *)> CheckContainer) {
13383 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13384 if (isa<PoisonValue>(V))
13385 return true;
13386 auto *I = dyn_cast<Instruction>(V);
13387 if (!I)
13388 return false;
13389 return is_contained(TE->Scalars, I) || CheckContainer(I);
13390 });
13391 };
13392 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13393 if (E.hasState()) {
13394 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13395 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13396 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13397 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13398 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13399 return is_contained(TEs, TE);
13400 });
13401 });
13402 }))
13403 return true;
13404 ;
13405 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13406 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13407 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13408 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13409 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13410 return is_contained(TEs, TE);
13411 });
13412 });
13413 }))
13414 return true;
13415 } else {
13416 // Check if the gather node is a full copy of a split node.
13417 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13418 if (It != E.Scalars.end()) {
13419 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13420 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13421 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13422 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13423 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13424 return is_contained(TEs, TE);
13425 });
13426 });
13427 }))
13428 return true;
13429 }
13430 }
13431 return false;
13432 };
13433 // The tree may grow here, so iterate over the nodes built before.
13434 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13435 TreeEntry &E = *VectorizableTree[Idx];
13436 if (E.isGather()) {
13437 ArrayRef<Value *> VL = E.Scalars;
13438 const unsigned Sz = getVectorElementSize(VL.front());
13439 unsigned MinVF = getMinVF(2 * Sz);
13440 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13441 // same opcode and same parent block or all constants.
13442 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13443 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13444 // We use allSameOpcode instead of isAltShuffle because we don't
13445 // want to use interchangeable instruction here.
13446 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13447 allConstant(VL) || isSplat(VL))
13448 continue;
13449 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13450 continue;
13451 // Check if the node is a copy of other vector nodes.
13452 if (CheckForSameVectorNodes(E))
13453 continue;
13454 // Try to find vectorizable sequences and transform them into a series of
13455 // insertvector instructions.
13456 unsigned StartIdx = 0;
13457 unsigned End = VL.size();
13458 SmallBitVector Processed(End);
13459 for (unsigned VF = getFloorFullVectorNumberOfElements(
13460 *TTI, VL.front()->getType(), VL.size() - 1);
13461 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13462 *TTI, VL.front()->getType(), VF - 1)) {
13463 if (StartIdx + VF > End)
13464 continue;
13465 SmallVector<std::pair<unsigned, unsigned>> Slices;
13466 bool AllStrided = true;
13467 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13468 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13469 // If any instruction is vectorized already - do not try again.
13470 // Reuse the existing node, if it fully matches the slice.
13471 if ((Processed.test(Cnt) || isVectorized(Slice.front())) &&
13472 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13473 continue;
13474 // Constant already handled effectively - skip.
13475 if (allConstant(Slice))
13476 continue;
13477 // Do not try to vectorize small splats (smaller than a vector register
13478 // and with only a single non-undef element).
13479 bool IsSplat = isSplat(Slice);
13480 bool IsTwoRegisterSplat = true;
13481 if (IsSplat && VF == 2) {
13482 unsigned NumRegs2VF = ::getNumberOfParts(
13483 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13484 IsTwoRegisterSplat = NumRegs2VF == 2;
13485 }
13486 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13487 count(Slice, Slice.front()) ==
13488 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13489 : 1)) {
13490 if (IsSplat)
13491 continue;
13492 InstructionsState S = getSameOpcode(Slice, *TLI);
13493 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13494 (S.getOpcode() == Instruction::Load &&
13496 (S.getOpcode() != Instruction::Load &&
13497 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13498 continue;
13499 if (VF == 2) {
13500 // Try to vectorize reduced values or if all users are vectorized.
13501 // For expensive instructions extra extracts might be profitable.
13502 if ((!UserIgnoreList || E.Idx != 0) &&
13503 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13505 !all_of(Slice, [&](Value *V) {
13506 if (isa<PoisonValue>(V))
13507 return true;
13508 return areAllUsersVectorized(cast<Instruction>(V),
13509 UserIgnoreList);
13510 }))
13511 continue;
13512 if (S.getOpcode() == Instruction::Load) {
13513 OrdersType Order;
13514 SmallVector<Value *> PointerOps;
13515 StridedPtrInfo SPtrInfo;
13516 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13517 PointerOps, SPtrInfo);
13518 AllStrided &= Res == LoadsState::StridedVectorize ||
13520 Res == LoadsState::Gather;
13521 // Do not vectorize gathers.
13522 if (Res == LoadsState::ScatterVectorize ||
13523 Res == LoadsState::Gather) {
13524 if (Res == LoadsState::Gather) {
13526 // If reductions and the scalars from the root node are
13527 // analyzed - mark as non-vectorizable reduction.
13528 if (UserIgnoreList && E.Idx == 0)
13529 analyzedReductionVals(Slice);
13530 }
13531 continue;
13532 }
13533 } else if (S.getOpcode() == Instruction::ExtractElement ||
13534 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13536 !CheckOperandsProfitability(
13537 S.getMainOp(),
13540 S))) {
13541 // Do not vectorize extractelements (they are handled effectively
13542 // already). Do not vectorize non-profitable instructions (with
13543 // low cost and non-vectorizable operands).
13544 continue;
13545 }
13546 }
13547 }
13548 Slices.emplace_back(Cnt, Slice.size());
13549 }
13550 // Do not try to vectorize if all slices are strided or gathered with
13551 // vector factor 2 and there are more than 2 slices. It is better to handle
13552 // them in the gathered-loads analysis, which may result in better vectorization.
13553 if (VF == 2 && AllStrided && Slices.size() > 2)
13554 continue;
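// Records a newly built subtree as a combined entry of E, marks the covered
// slice as processed and shrinks the [StartIdx, End) range if the slice sits
// at either boundary.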
13555 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13556 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13557 Processed.set(Cnt, Cnt + Sz);
13558 if (StartIdx == Cnt)
13559 StartIdx = Cnt + Sz;
13560 if (End == Cnt + Sz)
13561 End = Cnt;
13562 };
13563 for (auto [Cnt, Sz] : Slices) {
13564 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13565 const TreeEntry *SameTE = nullptr;
13566 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13567 It != Slice.end()) {
13568 // If any instruction is vectorized already - do not try again.
13569 SameTE = getSameValuesTreeEntry(*It, Slice);
13570 }
13571 unsigned PrevSize = VectorizableTree.size();
13572 [[maybe_unused]] unsigned PrevEntriesSize =
13573 LoadEntriesToVectorize.size();
13574 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13575 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13576 VectorizableTree[PrevSize]->isGather() &&
13577 VectorizableTree[PrevSize]->hasState() &&
13578 VectorizableTree[PrevSize]->getOpcode() !=
13579 Instruction::ExtractElement &&
13580 !isSplat(Slice)) {
13581 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13582 analyzedReductionVals(Slice);
13583 VectorizableTree.pop_back();
13584 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13585 "LoadEntriesToVectorize expected to remain the same");
13586 continue;
13587 }
13588 AddCombinedNode(PrevSize, Cnt, Sz);
13589 }
13590 }
13591 // Restore ordering, if no extra vectorization happened.
13592 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13593 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13594 reorderScalars(E.Scalars, Mask);
13595 E.ReorderIndices.clear();
13596 }
13597 }
13598 if (!E.hasState())
13599 continue;
13600 switch (E.getOpcode()) {
13601 case Instruction::Load: {
13602 // No need to reorder masked gather loads, just reorder the scalar
13603 // operands.
13604 if (E.State != TreeEntry::Vectorize)
13605 break;
13606 Type *ScalarTy = E.getMainOp()->getType();
13607 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13608 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13609 // Check if profitable to represent consecutive load + reverse as strided
13610 // load with stride -1.
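// For example, a group of loads of a[3], a[2], a[1], a[0] followed by a
// reverse shuffle may be cheaper as a single strided load stepping backwards
// through memory, depending on the target.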
13611 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13612 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13613 SmallVector<int> Mask;
13614 inversePermutation(E.ReorderIndices, Mask);
13615 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13616 InstructionCost OriginalVecCost =
13617 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13618 BaseLI->getPointerAddressSpace(), CostKind,
13620 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13621 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13622 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13623 VecTy, BaseLI->getPointerOperand(),
13624 /*VariableMask=*/false, CommonAlignment,
13625 BaseLI),
13626 CostKind);
13627 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13628 // Strided load is more profitable than consecutive load + reverse -
13629 // transform the node to strided load.
13630 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13631 ->getPointerOperand()
13632 ->getType());
13633 StridedPtrInfo SPtrInfo;
13634 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13635 SPtrInfo.Ty = VecTy;
13636 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13637 E.State = TreeEntry::StridedVectorize;
13638 }
13639 }
13640 break;
13641 }
13642 case Instruction::Store: {
13643 Type *ScalarTy =
13644 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13645 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13646 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13647 // Check if profitable to represent reverse + consecutive store as a
13648 // strided store with stride -1.
13649 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13650 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13651 SmallVector<int> Mask;
13652 inversePermutation(E.ReorderIndices, Mask);
13653 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13654 InstructionCost OriginalVecCost =
13655 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13656 BaseSI->getPointerAddressSpace(), CostKind,
13658 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13659 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13660 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13661 VecTy, BaseSI->getPointerOperand(),
13662 /*VariableMask=*/false, CommonAlignment,
13663 BaseSI),
13664 CostKind);
13665 if (StridedCost < OriginalVecCost)
13666 // Strided store is more profitable than reverse + consecutive store -
13667 // transform the node to strided store.
13668 E.State = TreeEntry::StridedVectorize;
13669 } else if (!E.ReorderIndices.empty()) {
13670 // Check for interleaved stores.
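// E.g. a reorder mask of the form <0, 4, 1, 5, 2, 6, 3, 7> interleaves two
// contiguous halves (interleave factor 2), which some targets can lower as a
// single interleaved store.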
13671 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13672 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13673 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13674 if (Mask.size() < 4)
13675 return 0u;
13676 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13678 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13679 TTI.isLegalInterleavedAccessType(
13680 VecTy, Factor, BaseSI->getAlign(),
13681 BaseSI->getPointerAddressSpace()))
13682 return Factor;
13683 }
13684
13685 return 0u;
13686 };
13687 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13688 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13689 if (InterleaveFactor != 0)
13690 E.setInterleave(InterleaveFactor);
13691 }
13692 break;
13693 }
13694 case Instruction::Select: {
13695 if (E.State != TreeEntry::Vectorize)
13696 break;
13697 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13698 if (MinMaxID == Intrinsic::not_intrinsic)
13699 break;
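// E.g. a group of "select (icmp slt a, b), a, b" patterns may be costed and
// emitted as a single vector llvm.smin intrinsic.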
13700 // This node is a minmax node.
13701 E.CombinedOp = TreeEntry::MinMax;
13702 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13703 if (SelectOnly && CondEntry->UserTreeIndex &&
13704 CondEntry->State == TreeEntry::Vectorize) {
13705 // The condition node is part of the combined minmax node.
13706 CondEntry->State = TreeEntry::CombinedVectorize;
13707 }
13708 break;
13709 }
13710 case Instruction::FSub:
13711 case Instruction::FAdd: {
13712 // Check if possible to convert (a*b)+c to fma.
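// E.g. "fadd (fmul a, b), c" may be combined into a single llvm.fmuladd call
// when it is legal and profitable to do so.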
13713 if (E.State != TreeEntry::Vectorize ||
13714 !E.getOperations().isAddSubLikeOp())
13715 break;
13716 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13717 .isValid())
13718 break;
13719 // This node is an fmuladd node.
13720 E.CombinedOp = TreeEntry::FMulAdd;
13721 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13722 if (FMulEntry->UserTreeIndex &&
13723 FMulEntry->State == TreeEntry::Vectorize) {
13724 // The FMul node is part of the combined fmuladd node.
13725 FMulEntry->State = TreeEntry::CombinedVectorize;
13726 }
13727 break;
13728 }
13729 default:
13730 break;
13731 }
13732 }
13733
13734 if (LoadEntriesToVectorize.empty()) {
13735 // Single load node - exit.
13736 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13737 VectorizableTree.front()->getOpcode() == Instruction::Load)
13738 return;
13739 // Small graph with small VF - exit.
13740 constexpr unsigned SmallTree = 3;
13741 constexpr unsigned SmallVF = 2;
13742 if ((VectorizableTree.size() <= SmallTree &&
13743 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13744 (VectorizableTree.size() <= 2 && UserIgnoreList))
13745 return;
13746
13747 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13748 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13749 getCanonicalGraphSize() <= SmallTree &&
13750 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13751 [](const std::unique_ptr<TreeEntry> &TE) {
13752 return TE->isGather() && TE->hasState() &&
13753 TE->getOpcode() == Instruction::Load &&
13754 !allSameBlock(TE->Scalars);
13755 }) == 1)
13756 return;
13757 }
13758
13759 // A list of loads to be gathered during the vectorization process. We can
13760 // try to vectorize them at the end, if profitable.
13761 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13763 GatheredLoads;
13764
13765 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13766 TreeEntry &E = *TE;
13767 if (E.isGather() &&
13768 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13769 (!E.hasState() && any_of(E.Scalars,
13770 [&](Value *V) {
13771 return isa<LoadInst>(V) &&
13772 !isVectorized(V) &&
13773 !isDeleted(cast<Instruction>(V));
13774 }))) &&
13775 !isSplat(E.Scalars)) {
13776 for (Value *V : E.Scalars) {
13777 auto *LI = dyn_cast<LoadInst>(V);
13778 if (!LI)
13779 continue;
13780 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13781 continue;
13783 *this, V, *DL, *SE, *TTI,
13784 GatheredLoads[std::make_tuple(
13785 LI->getParent(),
13786 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13787 LI->getType())]);
13788 }
13789 }
13790 }
13791 // Try to vectorize gathered loads if this is not just a gather of loads.
13792 if (!GatheredLoads.empty())
13793 tryToVectorizeGatheredLoads(GatheredLoads);
13794}
13795
13796 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13797 /// It supports shuffling of 2 input vectors. Shuffle emission is lazy: the
13798 /// actual shuffle instruction is generated only if it is really required;
13799 /// otherwise its emission is delayed until the end of the process, to reduce
13800 /// the number of emitted instructions and to simplify further
13801 /// analysis/transformations.
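/// This is the cost-modeling counterpart of shuffle emission: it accumulates
/// the estimated TTI cost of the shuffles instead of creating instructions.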
13802class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13803 bool IsFinalized = false;
13804 SmallVector<int> CommonMask;
13806 const TargetTransformInfo &TTI;
13807 InstructionCost Cost = 0;
13808 SmallDenseSet<Value *> VectorizedVals;
13809 BoUpSLP &R;
13810 SmallPtrSetImpl<Value *> &CheckedExtracts;
13811 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13812 /// While set, we are still estimating the cost for the same nodes and can
13813 /// delay the actual cost estimation (virtual shuffle instruction emission).
13814 /// This may help to better estimate the cost if the same nodes must be
13815 /// permuted, and it allows moving most of the long shuffle cost estimation to TTI.
13816 bool SameNodesEstimated = true;
13817
13818 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13819 if (Ty->getScalarType()->isPointerTy()) {
13822 IntegerType::get(Ty->getContext(),
13823 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13824 Ty->getScalarType());
13825 if (auto *VTy = dyn_cast<VectorType>(Ty))
13826 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13827 return Res;
13828 }
13829 return Constant::getAllOnesValue(Ty);
13830 }
13831
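/// Estimates the cost of materializing the gathered scalars \p VL as a
/// vector: a broadcast cost for splats, otherwise a generic gather/buildvector
/// cost.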
13832 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13833 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13834 return TTI::TCC_Free;
13835 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13836 InstructionCost GatherCost = 0;
13837 SmallVector<Value *> Gathers(VL);
13838 if (!Root && isSplat(VL)) {
13839 // Found a broadcast of a single scalar - calculate the cost as the cost
13840 // of the broadcast.
13841 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13842 assert(It != VL.end() && "Expected at least one non-undef value.");
13843 // Add broadcast for non-identity shuffle only.
13844 bool NeedShuffle =
13845 count(VL, *It) > 1 &&
13846 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13847 if (!NeedShuffle) {
13848 if (isa<FixedVectorType>(ScalarTy)) {
13849 assert(SLPReVec && "FixedVectorType is not expected.");
13850 return TTI.getShuffleCost(
13851 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13852 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13853 cast<FixedVectorType>(ScalarTy));
13854 }
13855 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13856 CostKind, std::distance(VL.begin(), It),
13857 PoisonValue::get(VecTy), *It);
13858 }
13859
13860 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13861 transform(VL, ShuffleMask.begin(), [](Value *V) {
13862 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13863 });
13864 InstructionCost InsertCost =
13865 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13866 PoisonValue::get(VecTy), *It);
13867 return InsertCost + ::getShuffleCost(TTI,
13869 VecTy, ShuffleMask, CostKind,
13870 /*Index=*/0, /*SubTp=*/nullptr,
13871 /*Args=*/*It);
13872 }
13873 return GatherCost +
13874 (all_of(Gathers, IsaPred<UndefValue>)
13876 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13877 ScalarTy));
13878 };
13879
13880 /// Compute the cost of creating a vector containing the extracted values from
13881 /// \p VL.
13883 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13884 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13885 unsigned NumParts) {
13886 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13887 unsigned NumElts =
13888 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13889 auto *EE = dyn_cast<ExtractElementInst>(V);
13890 if (!EE)
13891 return Sz;
13892 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13893 if (!VecTy)
13894 return Sz;
13895 return std::max(Sz, VecTy->getNumElements());
13896 });
13897 // FIXME: this must be moved to TTI for better estimation.
13898 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
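// Checks whether the sub-mask covering a single destination register reads
// from at most two source registers; if so, returns the per-register shuffle
// kind and records the offsets/sizes of the used source sub-vectors.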
13899 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13901 SmallVectorImpl<unsigned> &SubVecSizes)
13902 -> std::optional<TTI::ShuffleKind> {
13903 if (NumElts <= EltsPerVector)
13904 return std::nullopt;
13905 int OffsetReg0 =
13906 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13907 [](int S, int I) {
13908 if (I == PoisonMaskElem)
13909 return S;
13910 return std::min(S, I);
13911 }),
13912 EltsPerVector);
13913 int OffsetReg1 = OffsetReg0;
13914 DenseSet<int> RegIndices;
13915 // Check if we are trying to permute the same single or 2 input vectors.
13917 int FirstRegId = -1;
13918 Indices.assign(1, OffsetReg0);
13919 for (auto [Pos, I] : enumerate(Mask)) {
13920 if (I == PoisonMaskElem)
13921 continue;
13922 int Idx = I - OffsetReg0;
13923 int RegId =
13924 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13925 if (FirstRegId < 0)
13926 FirstRegId = RegId;
13927 RegIndices.insert(RegId);
13928 if (RegIndices.size() > 2)
13929 return std::nullopt;
13930 if (RegIndices.size() == 2) {
13931 ShuffleKind = TTI::SK_PermuteTwoSrc;
13932 if (Indices.size() == 1) {
13933 OffsetReg1 = alignDown(
13934 std::accumulate(
13935 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13936 [&](int S, int I) {
13937 if (I == PoisonMaskElem)
13938 return S;
13939 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13940 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13941 if (RegId == FirstRegId)
13942 return S;
13943 return std::min(S, I);
13944 }),
13945 EltsPerVector);
13946 unsigned Index = OffsetReg1 % NumElts;
13947 Indices.push_back(Index);
13948 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13949 }
13950 Idx = I - OffsetReg1;
13951 }
13952 I = (Idx % NumElts) % EltsPerVector +
13953 (RegId == FirstRegId ? 0 : EltsPerVector);
13954 }
13955 return ShuffleKind;
13956 };
13957 InstructionCost Cost = 0;
13958
13959 // Process extracts in blocks of EltsPerVector to check if the source vector
13960 // operand can be re-used directly. If not, add the cost of creating a
13961 // shuffle to extract the values into a vector register.
13962 for (unsigned Part : seq<unsigned>(NumParts)) {
13963 if (!ShuffleKinds[Part])
13964 continue;
13965 ArrayRef<int> MaskSlice = Mask.slice(
13966 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13967 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13968 copy(MaskSlice, SubMask.begin());
13970 SmallVector<unsigned, 2> SubVecSizes;
13971 std::optional<TTI::ShuffleKind> RegShuffleKind =
13972 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13973 if (!RegShuffleKind) {
13974 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13976 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13977 Cost +=
13978 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13979 getWidenedType(ScalarTy, NumElts), MaskSlice);
13980 continue;
13981 }
13982 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13983 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13984 Cost +=
13985 ::getShuffleCost(TTI, *RegShuffleKind,
13986 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13987 }
13988 const unsigned BaseVF = getFullVectorNumberOfElements(
13989 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13990 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13991 assert((Idx + SubVecSize) <= BaseVF &&
13992 "SK_ExtractSubvector index out of range");
13994 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13995 Idx, getWidenedType(ScalarTy, SubVecSize));
13996 }
13997 // Second attempt to check if just a permute is estimated to be cheaper
13998 // than the subvector extracts.
13999 SubMask.assign(NumElts, PoisonMaskElem);
14000 copy(MaskSlice, SubMask.begin());
14001 InstructionCost OriginalCost = ::getShuffleCost(
14002 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
14003 if (OriginalCost < Cost)
14004 Cost = OriginalCost;
14005 }
14006 return Cost;
14007 }
14008 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
14009 /// given mask \p Mask for register number \p Part, which includes
14010 /// \p SliceSize elements.
14011 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
14012 ArrayRef<int> Mask, unsigned Part,
14013 unsigned SliceSize) {
14014 if (SameNodesEstimated) {
14015 // Delay the cost estimation if the same nodes are being reshuffled.
14016 // If we already requested the cost of reshuffling E1 and E2 before, there
14017 // is no need to estimate another cost with the sub-Mask; instead, include
14018 // this sub-Mask into the CommonMask to estimate it later and avoid double
14019 // cost estimation.
14020 if ((InVectors.size() == 2 &&
14021 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
14022 cast<const TreeEntry *>(InVectors.back()) == E2) ||
14023 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
14024 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
14025 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
14026 [](int Idx) { return Idx == PoisonMaskElem; }) &&
14027 "Expected all poisoned elements.");
14028 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
14029 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
14030 return;
14031 }
14032 // Found non-matching nodes - need to estimate the cost for the matched
14033 // nodes and transform the mask.
14034 Cost += createShuffle(InVectors.front(),
14035 InVectors.size() == 1 ? nullptr : InVectors.back(),
14036 CommonMask);
14037 transformMaskAfterShuffle(CommonMask, CommonMask);
14038 } else if (InVectors.size() == 2) {
14039 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14040 transformMaskAfterShuffle(CommonMask, CommonMask);
14041 }
14042 SameNodesEstimated = false;
14043 if (!E2 && InVectors.size() == 1) {
14044 unsigned VF = E1.getVectorFactor();
14045 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
14046 VF = std::max(VF, getVF(V1));
14047 } else {
14048 const auto *E = cast<const TreeEntry *>(InVectors.front());
14049 VF = std::max(VF, E->getVectorFactor());
14050 }
14051 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14052 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14053 CommonMask[Idx] = Mask[Idx] + VF;
14054 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
14055 transformMaskAfterShuffle(CommonMask, CommonMask);
14056 } else {
14057 auto P = InVectors.front();
14058 Cost += createShuffle(&E1, E2, Mask);
14059 unsigned VF = Mask.size();
14060 if (Value *V1 = dyn_cast<Value *>(P)) {
14061 VF = std::max(VF,
14062 getNumElements(V1->getType()));
14063 } else {
14064 const auto *E = cast<const TreeEntry *>(P);
14065 VF = std::max(VF, E->getVectorFactor());
14066 }
14067 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14068 if (Mask[Idx] != PoisonMaskElem)
14069 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
14070 Cost += createShuffle(P, InVectors.front(), CommonMask);
14071 transformMaskAfterShuffle(CommonMask, CommonMask);
14072 }
14073 }
14074
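/// Helper used with BaseShuffleAnalysis::createShuffle to compute shuffle
/// costs instead of emitting IR: empty/identity masks are free, other masks
/// are costed as single- or two-source permutes.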
14075 class ShuffleCostBuilder {
14076 const TargetTransformInfo &TTI;
14077
14078 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14079 int Index = -1;
14080 return Mask.empty() ||
14081 (VF == Mask.size() &&
14084 Index == 0);
14085 }
14086
14087 public:
14088 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14089 ~ShuffleCostBuilder() = default;
14090 InstructionCost createShuffleVector(Value *V1, Value *,
14091 ArrayRef<int> Mask) const {
14092 // Empty mask or identity mask are free.
14093 unsigned VF =
14094 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14095 if (isEmptyOrIdentity(Mask, VF))
14096 return TTI::TCC_Free;
14097 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14098 cast<VectorType>(V1->getType()), Mask);
14099 }
14100 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14101 // Empty mask or identity mask are free.
14102 unsigned VF =
14103 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14104 if (isEmptyOrIdentity(Mask, VF))
14105 return TTI::TCC_Free;
14106 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
14107 cast<VectorType>(V1->getType()), Mask);
14108 }
14109 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14110 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14111 return TTI::TCC_Free;
14112 }
14113 void resizeToMatch(Value *&, Value *&) const {}
14114 };
14115
14116 /// Smart shuffle instruction emission: walks through shuffle trees and
14117 /// tries to find the best matching vector for the actual shuffle
14118 /// instruction.
14120 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14122 ArrayRef<int> Mask) {
14123 ShuffleCostBuilder Builder(TTI);
14124 SmallVector<int> CommonMask(Mask);
14125 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14126 unsigned CommonVF = Mask.size();
14127 InstructionCost ExtraCost = 0;
14128 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14129 unsigned VF) -> InstructionCost {
14130 if (E.isGather() && allConstant(E.Scalars))
14131 return TTI::TCC_Free;
14132 Type *EScalarTy = E.Scalars.front()->getType();
14133 bool IsSigned = true;
14134 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
14135 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
14136 IsSigned = It->second.second;
14137 }
14138 if (EScalarTy != ScalarTy) {
14139 unsigned CastOpcode = Instruction::Trunc;
14140 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14141 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14142 if (DstSz > SrcSz)
14143 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14144 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
14145 getWidenedType(EScalarTy, VF),
14146 TTI::CastContextHint::None, CostKind);
14147 }
14148 return TTI::TCC_Free;
14149 };
14150 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14151 if (isa<Constant>(V))
14152 return TTI::TCC_Free;
14153 auto *VecTy = cast<VectorType>(V->getType());
14154 Type *EScalarTy = VecTy->getElementType();
14155 if (EScalarTy != ScalarTy) {
14156 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
14157 unsigned CastOpcode = Instruction::Trunc;
14158 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14159 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14160 if (DstSz > SrcSz)
14161 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14162 return TTI.getCastInstrCost(
14163 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
14164 VecTy, TTI::CastContextHint::None, CostKind);
14165 }
14166 return TTI::TCC_Free;
14167 };
14168 if (!V1 && !V2 && !P2.isNull()) {
14169 // Shuffle 2 entry nodes.
14170 const TreeEntry *E = cast<const TreeEntry *>(P1);
14171 unsigned VF = E->getVectorFactor();
14172 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14173 CommonVF = std::max(VF, E2->getVectorFactor());
14174 assert(all_of(Mask,
14175 [=](int Idx) {
14176 return Idx < 2 * static_cast<int>(CommonVF);
14177 }) &&
14178 "All elements in mask must be less than 2 * CommonVF.");
14179 if (E->Scalars.size() == E2->Scalars.size()) {
14180 SmallVector<int> EMask = E->getCommonMask();
14181 SmallVector<int> E2Mask = E2->getCommonMask();
14182 if (!EMask.empty() || !E2Mask.empty()) {
14183 for (int &Idx : CommonMask) {
14184 if (Idx == PoisonMaskElem)
14185 continue;
14186 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14187 Idx = EMask[Idx];
14188 else if (Idx >= static_cast<int>(CommonVF))
14189 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14190 E->Scalars.size();
14191 }
14192 }
14193 CommonVF = E->Scalars.size();
14194 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14195 GetNodeMinBWAffectedCost(*E2, CommonVF);
14196 } else {
14197 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14198 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14199 }
14200 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14201 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14202 } else if (!V1 && P2.isNull()) {
14203 // Shuffle single entry node.
14204 const TreeEntry *E = cast<const TreeEntry *>(P1);
14205 unsigned VF = E->getVectorFactor();
14206 CommonVF = VF;
14207 assert(
14208 all_of(Mask,
14209 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14210 "All elements in mask must be less than CommonVF.");
14211 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14212 SmallVector<int> EMask = E->getCommonMask();
14213 assert(!EMask.empty() && "Expected non-empty common mask.");
14214 for (int &Idx : CommonMask) {
14215 if (Idx != PoisonMaskElem)
14216 Idx = EMask[Idx];
14217 }
14218 CommonVF = E->Scalars.size();
14219 } else if (unsigned Factor = E->getInterleaveFactor();
14220 Factor > 0 && E->Scalars.size() != Mask.size() &&
14222 Factor)) {
14223 // Deinterleaved nodes are free.
14224 std::iota(CommonMask.begin(), CommonMask.end(), 0);
14225 }
14226 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14227 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14228 // Not identity/broadcast? Try to see if the original vector is better.
14229 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14230 CommonVF == CommonMask.size() &&
14231 any_of(enumerate(CommonMask),
14232 [](const auto &&P) {
14233 return P.value() != PoisonMaskElem &&
14234 static_cast<unsigned>(P.value()) != P.index();
14235 }) &&
14236 any_of(CommonMask,
14237 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14238 SmallVector<int> ReorderMask;
14239 inversePermutation(E->ReorderIndices, ReorderMask);
14240 ::addMask(CommonMask, ReorderMask);
14241 }
14242 } else if (V1 && P2.isNull()) {
14243 // Shuffle single vector.
14244 ExtraCost += GetValueMinBWAffectedCost(V1);
14245 CommonVF = getVF(V1);
14246 assert(
14247 all_of(Mask,
14248 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14249 "All elements in mask must be less than CommonVF.");
14250 } else if (V1 && !V2) {
14251 // Shuffle vector and tree node.
14252 unsigned VF = getVF(V1);
14253 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14254 CommonVF = std::max(VF, E2->getVectorFactor());
14255 assert(all_of(Mask,
14256 [=](int Idx) {
14257 return Idx < 2 * static_cast<int>(CommonVF);
14258 }) &&
14259 "All elements in mask must be less than 2 * CommonVF.");
14260 if (E2->Scalars.size() == VF && VF != CommonVF) {
14261 SmallVector<int> E2Mask = E2->getCommonMask();
14262 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14263 for (int &Idx : CommonMask) {
14264 if (Idx == PoisonMaskElem)
14265 continue;
14266 if (Idx >= static_cast<int>(CommonVF))
14267 Idx = E2Mask[Idx - CommonVF] + VF;
14268 }
14269 CommonVF = VF;
14270 }
14271 ExtraCost += GetValueMinBWAffectedCost(V1);
14272 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14273 ExtraCost += GetNodeMinBWAffectedCost(
14274 *E2, std::min(CommonVF, E2->getVectorFactor()));
14275 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14276 } else if (!V1 && V2) {
14277 // Shuffle vector and tree node.
14278 unsigned VF = getVF(V2);
14279 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
14280 CommonVF = std::max(VF, E1->getVectorFactor());
14281 assert(all_of(Mask,
14282 [=](int Idx) {
14283 return Idx < 2 * static_cast<int>(CommonVF);
14284 }) &&
14285 "All elements in mask must be less than 2 * CommonVF.");
14286 if (E1->Scalars.size() == VF && VF != CommonVF) {
14287 SmallVector<int> E1Mask = E1->getCommonMask();
14288 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14289 for (int &Idx : CommonMask) {
14290 if (Idx == PoisonMaskElem)
14291 continue;
14292 if (Idx >= static_cast<int>(CommonVF))
14293 Idx = E1Mask[Idx - CommonVF] + VF;
14294 else
14295 Idx = E1Mask[Idx];
14296 }
14297 CommonVF = VF;
14298 }
14299 ExtraCost += GetNodeMinBWAffectedCost(
14300 *E1, std::min(CommonVF, E1->getVectorFactor()));
14301 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14302 ExtraCost += GetValueMinBWAffectedCost(V2);
14303 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14304 } else {
14305 assert(V1 && V2 && "Expected both vectors.");
14306 unsigned VF = getVF(V1);
14307 CommonVF = std::max(VF, getVF(V2));
14308 assert(all_of(Mask,
14309 [=](int Idx) {
14310 return Idx < 2 * static_cast<int>(CommonVF);
14311 }) &&
14312 "All elements in mask must be less than 2 * CommonVF.");
14313 ExtraCost +=
14314 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14315 if (V1->getType() != V2->getType()) {
14316 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14317 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14318 } else {
14319 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14320 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14321 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14322 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14323 }
14324 }
14325 InVectors.front() =
14326 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14327 if (InVectors.size() == 2)
14328 InVectors.pop_back();
14329 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14330 V1, V2, CommonMask, Builder, ScalarTy);
14331 }
14332
14333public:
14335 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14336 SmallPtrSetImpl<Value *> &CheckedExtracts)
14337 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14338 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14339 CheckedExtracts(CheckedExtracts) {}
14340 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14341 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14342 unsigned NumParts, bool &UseVecBaseAsInput) {
14343 UseVecBaseAsInput = false;
14344 if (Mask.empty())
14345 return nullptr;
14346 Value *VecBase = nullptr;
14347 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14348 if (!E->ReorderIndices.empty()) {
14349 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14350 E->ReorderIndices.end());
14351 reorderScalars(VL, ReorderMask);
14352 }
14353 // Check if the extracts can be considered reused, i.e. the same
14354 // extractelements were already vectorized in a previous node.
14355 bool PrevNodeFound = any_of(
14356 ArrayRef(R.VectorizableTree).take_front(E->Idx),
14357 [&](const std::unique_ptr<TreeEntry> &TE) {
14358 return ((TE->hasState() && !TE->isAltShuffle() &&
14359 TE->getOpcode() == Instruction::ExtractElement) ||
14360 TE->isGather()) &&
14361 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
14362 return VL.size() > Data.index() &&
14363 (Mask[Data.index()] == PoisonMaskElem ||
14364 isa<UndefValue>(VL[Data.index()]) ||
14365 Data.value() == VL[Data.index()]);
14366 });
14367 });
14368 SmallPtrSet<Value *, 4> UniqueBases;
14369 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14370 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14371 for (unsigned Part : seq<unsigned>(NumParts)) {
14372 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14373 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14374 for (auto [I, V] :
14375 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
14376 // Ignore non-extractelement scalars.
14377 if (isa<UndefValue>(V) ||
14378 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14379 continue;
14380 // If all users of the instruction are going to be vectorized and this
14381 // instruction itself is not going to be vectorized, consider this
14382 // instruction as dead and remove its cost from the final cost of the
14383 // vectorized tree.
14384 // Also, avoid adjusting the cost for extractelements with multiple uses
14385 // in different graph entries.
14386 auto *EE = cast<ExtractElementInst>(V);
14387 VecBase = EE->getVectorOperand();
14388 UniqueBases.insert(VecBase);
14389 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14390 if (!CheckedExtracts.insert(V).second ||
14391 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
14392 any_of(VEs,
14393 [&](const TreeEntry *TE) {
14394 return R.DeletedNodes.contains(TE) ||
14395 R.TransformedToGatherNodes.contains(TE);
14396 }) ||
14397 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
14398 !R.isVectorized(EE) &&
14399 count_if(E->Scalars, [&](Value *V) { return V == EE; }) !=
14400 count_if(E->UserTreeIndex.UserTE->Scalars,
14401 [&](Value *V) { return V == EE; })) ||
14402 any_of(EE->users(),
14403 [&](User *U) {
14404 return isa<GetElementPtrInst>(U) &&
14405 !R.areAllUsersVectorized(cast<Instruction>(U),
14406 &VectorizedVals);
14407 }) ||
14408 (!VEs.empty() && !is_contained(VEs, E)))
14409 continue;
14410 std::optional<unsigned> EEIdx = getExtractIndex(EE);
14411 if (!EEIdx)
14412 continue;
14413 unsigned Idx = *EEIdx;
14414 // Take credit for instruction that will become dead.
14415 if (EE->hasOneUse() || !PrevNodeFound) {
14416 Instruction *Ext = EE->user_back();
14417 if (isa<SExtInst, ZExtInst>(Ext) &&
14419 // Use getExtractWithExtendCost() to calculate the cost of
14420 // extractelement/ext pair.
14421 Cost -= TTI.getExtractWithExtendCost(
14422 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
14423 Idx, CostKind);
14424 // Add back the cost of s|zext which is subtracted separately.
14425 Cost += TTI.getCastInstrCost(
14426 Ext->getOpcode(), Ext->getType(), EE->getType(),
14428 continue;
14429 }
14430 }
14431 APInt &DemandedElts =
14432 VectorOpsToExtracts
14433 .try_emplace(VecBase,
14434 APInt::getZero(getNumElements(VecBase->getType())))
14435 .first->getSecond();
14436 DemandedElts.setBit(Idx);
14437 }
14438 }
14439 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14441 DemandedElts, /*Insert=*/false,
14442 /*Extract=*/true, CostKind);
14443 // Check that the gather of extractelements can be represented as just a
14444 // shuffle of a single vector or of the 2 vectors the scalars are extracted
14445 // from, i.e. we found a bunch of extractelement instructions that must be
14446 // gathered into a vector and can be represented as a permutation of the
14447 // elements of a single input vector or of 2 input vectors.
14448 // Already done for the reused case, if the same extractelements were vectorized already.
14449 if (!PrevNodeFound)
14450 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14451 InVectors.assign(1, E);
14452 CommonMask.assign(Mask.begin(), Mask.end());
14453 transformMaskAfterShuffle(CommonMask, CommonMask);
14454 SameNodesEstimated = false;
14455 if (NumParts != 1 && UniqueBases.size() != 1) {
14456 UseVecBaseAsInput = true;
14457 VecBase =
14458 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14459 }
14460 return VecBase;
14461 }
14462 /// Checks if the specified entry \p E needs to be delayed because of its
14463 /// dependency nodes.
14464 std::optional<InstructionCost>
14465 needToDelay(const TreeEntry *,
14467 // No need to delay the cost estimation during analysis.
14468 return std::nullopt;
14469 }
14470 /// Reset the builder to handle perfect diamond match.
14472 IsFinalized = false;
14473 CommonMask.clear();
14474 InVectors.clear();
14475 Cost = 0;
14476 VectorizedVals.clear();
14477 SameNodesEstimated = true;
14478 }
14479 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14480 if (&E1 == &E2) {
14481 assert(all_of(Mask,
14482 [&](int Idx) {
14483 return Idx < static_cast<int>(E1.getVectorFactor());
14484 }) &&
14485 "Expected single vector shuffle mask.");
14486 add(E1, Mask);
14487 return;
14488 }
14489 if (InVectors.empty()) {
14490 CommonMask.assign(Mask.begin(), Mask.end());
14491 InVectors.assign({&E1, &E2});
14492 return;
14493 }
14494 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14495 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14496 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14497 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14498 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14499 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14500 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14501 }
14502 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14503 if (InVectors.empty()) {
14504 CommonMask.assign(Mask.begin(), Mask.end());
14505 InVectors.assign(1, &E1);
14506 return;
14507 }
14508 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14509 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14510 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14511 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14512 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14513 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14514 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14515 if (!SameNodesEstimated && InVectors.size() == 1)
14516 InVectors.emplace_back(&E1);
14517 }
14518 /// Adds 2 input vectors and the mask for their shuffling.
14519 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14520 // May come only for shuffling of 2 vectors with extractelements, already
14521 // handled in adjustExtracts.
14522 assert(InVectors.size() == 1 &&
14523 all_of(enumerate(CommonMask),
14524 [&](auto P) {
14525 if (P.value() == PoisonMaskElem)
14526 return Mask[P.index()] == PoisonMaskElem;
14527 auto *EI = cast<ExtractElementInst>(
14528 cast<const TreeEntry *>(InVectors.front())
14529 ->getOrdered(P.index()));
14530 return EI->getVectorOperand() == V1 ||
14531 EI->getVectorOperand() == V2;
14532 }) &&
14533 "Expected extractelement vectors.");
14534 }
14535 /// Adds another one input vector and the mask for the shuffling.
14536 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14537 if (InVectors.empty()) {
14538 assert(CommonMask.empty() && !ForExtracts &&
14539 "Expected empty input mask/vectors.");
14540 CommonMask.assign(Mask.begin(), Mask.end());
14541 InVectors.assign(1, V1);
14542 return;
14543 }
14544 if (ForExtracts) {
14545 // No need to add vectors here, already handled them in adjustExtracts.
14546 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14547 !CommonMask.empty() &&
14548 all_of(enumerate(CommonMask),
14549 [&](auto P) {
14550 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14551 ->getOrdered(P.index());
14552 if (P.value() == PoisonMaskElem)
14553 return P.value() == Mask[P.index()] ||
14554 isa<UndefValue>(Scalar);
14555 if (isa<Constant>(V1))
14556 return true;
14557 auto *EI = cast<ExtractElementInst>(Scalar);
14558 return EI->getVectorOperand() == V1;
14559 }) &&
14560 "Expected only tree entry for extractelement vectors.");
14561 return;
14562 }
14563 assert(!InVectors.empty() && !CommonMask.empty() &&
14564 "Expected only tree entries from extracts/reused buildvectors.");
14565 unsigned VF = getVF(V1);
14566 if (InVectors.size() == 2) {
14567 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14568 transformMaskAfterShuffle(CommonMask, CommonMask);
14569 VF = std::max<unsigned>(VF, CommonMask.size());
14570 } else if (const auto *InTE =
14571 InVectors.front().dyn_cast<const TreeEntry *>()) {
14572 VF = std::max(VF, InTE->getVectorFactor());
14573 } else {
14574 VF = std::max(
14575 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14576 ->getNumElements());
14577 }
14578 InVectors.push_back(V1);
14579 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14580 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14581 CommonMask[Idx] = Mask[Idx] + VF;
14582 }
14583 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14584 Value *Root = nullptr) {
14585 Cost += getBuildVectorCost(VL, Root);
14586 if (!Root) {
14587 // FIXME: Need to find a way to avoid use of getNullValue here.
14589 unsigned VF = VL.size();
14590 if (MaskVF != 0)
14591 VF = std::min(VF, MaskVF);
14592 Type *VLScalarTy = VL.front()->getType();
14593 for (Value *V : VL.take_front(VF)) {
14594 Type *ScalarTy = VLScalarTy->getScalarType();
14595 if (isa<PoisonValue>(V)) {
14596 Vals.push_back(PoisonValue::get(ScalarTy));
14597 continue;
14598 }
14599 if (isa<UndefValue>(V)) {
14600 Vals.push_back(UndefValue::get(ScalarTy));
14601 continue;
14602 }
14603 Vals.push_back(Constant::getNullValue(ScalarTy));
14604 }
14605 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14606 assert(SLPReVec && "FixedVectorType is not expected.");
14607 // When REVEC is enabled, we need to expand vector types into scalar
14608 // types.
14609 Vals = replicateMask(Vals, VecTy->getNumElements());
14610 }
14611 return ConstantVector::get(Vals);
14612 }
14615 cast<FixedVectorType>(Root->getType())->getNumElements()),
14616 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14617 }
14619 /// Finalize emission of the shuffles.
14621 ArrayRef<int> ExtMask,
14622 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14623 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14626 Action = {}) {
14627 IsFinalized = true;
14628 if (Action) {
14629 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14630 if (InVectors.size() == 2)
14631 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14632 else
14633 Cost += createShuffle(Vec, nullptr, CommonMask);
14634 transformMaskAfterShuffle(CommonMask, CommonMask);
14635 assert(VF > 0 &&
14636 "Expected vector length for the final value before action.");
14637 Value *V = cast<Value *>(Vec);
14638 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14639 Cost += createShuffle(V1, V2, Mask);
14640 return V1;
14641 });
14642 InVectors.front() = V;
14643 }
14644 if (!SubVectors.empty()) {
14645 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14646 if (InVectors.size() == 2)
14647 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14648 else
14649 Cost += createShuffle(Vec, nullptr, CommonMask);
14650 transformMaskAfterShuffle(CommonMask, CommonMask);
14651 // Add subvectors permutation cost.
14652 if (!SubVectorsMask.empty()) {
14653 assert(SubVectorsMask.size() <= CommonMask.size() &&
14654 "Expected same size of masks for subvectors and common mask.");
14655 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14656 copy(SubVectorsMask, SVMask.begin());
14657 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14658 if (I2 != PoisonMaskElem) {
14659 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14660 I1 = I2 + CommonMask.size();
14661 }
14662 }
14664 getWidenedType(ScalarTy, CommonMask.size()),
14665 SVMask, CostKind);
14666 }
14667 for (auto [E, Idx] : SubVectors) {
14668 Type *EScalarTy = E->Scalars.front()->getType();
14669 bool IsSigned = true;
14670 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14671 EScalarTy =
14672 IntegerType::get(EScalarTy->getContext(), It->second.first);
14673 IsSigned = It->second.second;
14674 }
14675 if (ScalarTy != EScalarTy) {
14676 unsigned CastOpcode = Instruction::Trunc;
14677 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14678 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14679 if (DstSz > SrcSz)
14680 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14681 Cost += TTI.getCastInstrCost(
14682 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14683 getWidenedType(EScalarTy, E->getVectorFactor()),
14685 }
14688 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14689 getWidenedType(ScalarTy, E->getVectorFactor()));
14690 if (!CommonMask.empty()) {
14691 std::iota(std::next(CommonMask.begin(), Idx),
14692 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14693 Idx);
14694 }
14695 }
14696 }
14697
14698 if (!ExtMask.empty()) {
14699 if (CommonMask.empty()) {
14700 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14701 } else {
14702 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14703 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14704 if (ExtMask[I] == PoisonMaskElem)
14705 continue;
14706 NewMask[I] = CommonMask[ExtMask[I]];
14707 }
14708 CommonMask.swap(NewMask);
14709 }
14710 }
14711 if (CommonMask.empty()) {
14712 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14713 return Cost;
14714 }
14715 return Cost +
14716 createShuffle(InVectors.front(),
14717 InVectors.size() == 2 ? InVectors.back() : nullptr,
14718 CommonMask);
14719 }
14720
14722 assert((IsFinalized || CommonMask.empty()) &&
14723 "Shuffle construction must be finalized.");
14724 }
14725};
14726
14727const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14728 unsigned Idx) const {
14729 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14730 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14731 return Op;
14732}
14733
14734TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14735 if (TE.State == TreeEntry::ScatterVectorize ||
14736 TE.State == TreeEntry::StridedVectorize)
14738 if (TE.State == TreeEntry::CompressVectorize)
14740 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14741 !TE.isAltShuffle()) {
14742 if (TE.ReorderIndices.empty())
14744 SmallVector<int> Mask;
14745 inversePermutation(TE.ReorderIndices, Mask);
14746 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14748 }
14750}
14751
14753BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14754 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14755 ArrayRef<Value *> VL = E->Scalars;
14756
14757 Type *ScalarTy = getValueType(VL[0]);
14758 if (!isValidElementType(ScalarTy))
14759 return InstructionCost::getInvalid();
14761
14762 // If we have computed a smaller type for the expression, update VecTy so
14763 // that the costs will be accurate.
14764 auto It = MinBWs.find(E);
14765 Type *OrigScalarTy = ScalarTy;
14766 if (It != MinBWs.end()) {
14767 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14768 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14769 if (VecTy)
14770 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14771 }
14772 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14773 unsigned EntryVF = E->getVectorFactor();
14774 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14775
14776 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
14777 if (allConstant(VL))
14778 return 0;
14779 if (isa<InsertElementInst>(VL[0]))
14780 return InstructionCost::getInvalid();
14781 if (isa<CmpInst>(VL.front()))
14782 ScalarTy = VL.front()->getType();
14783 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14784 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14785 }
14786 if (E->State == TreeEntry::SplitVectorize) {
14787 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14788 "Expected exactly 2 combined entries.");
14789 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14790 InstructionCost VectorCost = 0;
14791 if (E->ReorderIndices.empty()) {
14792 VectorCost = ::getShuffleCost(
14793 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14794 E->CombinedEntriesWithIndices.back().second,
14796 ScalarTy,
14797 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14798 ->getVectorFactor()));
14799 } else {
14800 unsigned CommonVF =
14801 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14802 ->getVectorFactor(),
14803 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14804 ->getVectorFactor());
14805 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14806 getWidenedType(ScalarTy, CommonVF),
14807 E->getSplitMask(), CostKind);
14808 }
14809 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14810 return VectorCost;
14811 }
14812 InstructionCost CommonCost = 0;
14813 SmallVector<int> Mask;
14814 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14815 (E->State != TreeEntry::StridedVectorize ||
14816 !isReverseOrder(E->ReorderIndices))) {
14817 SmallVector<int> NewMask;
14818 if (E->getOpcode() == Instruction::Store) {
14819 // For stores the order is actually a mask.
14820 NewMask.resize(E->ReorderIndices.size());
14821 copy(E->ReorderIndices, NewMask.begin());
14822 } else {
14823 inversePermutation(E->ReorderIndices, NewMask);
14824 }
14825 ::addMask(Mask, NewMask);
14826 }
14827 if (!E->ReuseShuffleIndices.empty())
14828 ::addMask(Mask, E->ReuseShuffleIndices);
14829 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14830 CommonCost =
14831 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14832 assert((E->State == TreeEntry::Vectorize ||
14833 E->State == TreeEntry::ScatterVectorize ||
14834 E->State == TreeEntry::StridedVectorize ||
14835 E->State == TreeEntry::CompressVectorize) &&
14836 "Unhandled state");
14837 assert(E->getOpcode() &&
14838 ((allSameType(VL) && allSameBlock(VL)) ||
14839 (E->getOpcode() == Instruction::GetElementPtr &&
14840 E->getMainOp()->getType()->isPointerTy()) ||
14841 E->hasCopyableElements()) &&
14842 "Invalid VL");
14843 Instruction *VL0 = E->getMainOp();
14844 unsigned ShuffleOrOp =
14845 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14846 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14847 ShuffleOrOp = E->CombinedOp;
14848 SmallSetVector<Value *, 16> UniqueValues;
14849 SmallVector<unsigned, 16> UniqueIndexes;
14850 for (auto [Idx, V] : enumerate(VL))
14851 if (UniqueValues.insert(V))
14852 UniqueIndexes.push_back(Idx);
14853 const unsigned Sz = UniqueValues.size();
14854 SmallBitVector UsedScalars(Sz, false);
14855 for (unsigned I = 0; I < Sz; ++I) {
14856 if (isa<Instruction>(UniqueValues[I]) &&
14857 !E->isCopyableElement(UniqueValues[I]) &&
14858 getTreeEntries(UniqueValues[I]).front() == E)
14859 continue;
14860 UsedScalars.set(I);
14861 }
14862 auto GetCastContextHint = [&](Value *V) {
14863 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14864 return getCastContextHint(*OpTEs.front());
14865 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14866 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14867 !SrcState.isAltShuffle())
14870 };
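// Computes the vector-minus-scalar cost delta for this node: the scalar cost
// sums the per-lane costs of scalars not reused elsewhere, while the vector
// cost is supplied by the caller on top of the common shuffle cost and is
// adjusted for min-bitwidth casts if the node must be resized for its user.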
14871 auto GetCostDiff =
14872 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14873 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14874 // Calculate the cost of this instruction.
14875 InstructionCost ScalarCost = 0;
14876 if (isa<CastInst, CallInst>(VL0)) {
14877 // For some of the instructions there is no need to calculate the cost
14878 // for each particular instruction; we can use the cost of a single
14879 // instruction multiplied by the total number of scalar instructions.
14880 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14881 } else {
14882 for (unsigned I = 0; I < Sz; ++I) {
14883 if (UsedScalars.test(I))
14884 continue;
14885 ScalarCost += ScalarEltCost(I);
14886 }
14887 }
14888
14889 InstructionCost VecCost = VectorCost(CommonCost);
14890 // Check if the current node must be resized, if the parent node is not
14891 // resized.
14892 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14893 E->Idx != 0 &&
14894 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14895 const EdgeInfo &EI = E->UserTreeIndex;
14896 if (!EI.UserTE->hasState() ||
14897 EI.UserTE->getOpcode() != Instruction::Select ||
14898 EI.EdgeIdx != 0) {
14899 auto UserBWIt = MinBWs.find(EI.UserTE);
14900 Type *UserScalarTy =
14901 (EI.UserTE->isGather() ||
14902 EI.UserTE->State == TreeEntry::SplitVectorize)
14903 ? EI.UserTE->Scalars.front()->getType()
14904 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14905 if (UserBWIt != MinBWs.end())
14906 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14907 UserBWIt->second.first);
14908 if (ScalarTy != UserScalarTy) {
14909 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14910 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14911 unsigned VecOpcode;
14912 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14913 if (BWSz > SrcBWSz)
14914 VecOpcode = Instruction::Trunc;
14915 else
14916 VecOpcode =
14917 It->second.second ? Instruction::SExt : Instruction::ZExt;
14918 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14919 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14920 CostKind);
14921 }
14922 }
14923 }
14924 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14925 ScalarCost, "Calculated costs for Tree"));
14926 return VecCost - ScalarCost;
14927 };
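// Note that GetCostDiff returns VecCost - ScalarCost, so a negative result
// means the vectorized form is expected to be cheaper; e.g. (hypothetical
// numbers) a vector cost of 2 against a scalar cost of 5 yields -3.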
14928 // Calculate the cost difference from vectorizing a set of GEPs.
14929 // A negative value means vectorizing is profitable.
14930 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14931 assert((E->State == TreeEntry::Vectorize ||
14932 E->State == TreeEntry::StridedVectorize ||
14933 E->State == TreeEntry::CompressVectorize) &&
14934 "Entry state expected to be Vectorize, StridedVectorize or "
14935 "MaskedLoadCompressVectorize here.");
14936 InstructionCost ScalarCost = 0;
14937 InstructionCost VecCost = 0;
14938 std::tie(ScalarCost, VecCost) = getGEPCosts(
14939 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14940 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14941 "Calculated GEPs cost for Tree"));
14942
14943 return VecCost - ScalarCost;
14944 };
14945
14946 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14947 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14948 if (MinMaxID == Intrinsic::not_intrinsic)
14949 return InstructionCost::getInvalid();
14950 Type *CanonicalType = Ty;
14951 if (CanonicalType->isPtrOrPtrVectorTy())
14952 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14953 CanonicalType->getContext(),
14954 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14955
14956 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14957 {CanonicalType, CanonicalType});
14958 InstructionCost IntrinsicCost =
14959 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14960 // If the selects are the only uses of the compares, the compares will be
14961 // dead, so we can subtract their cost.
14962 if (VI && SelectOnly) {
14963 assert((!Ty->isVectorTy() || SLPReVec) &&
14964 "Expected only for scalar type.");
14965 auto *CI = cast<CmpInst>(VI->getOperand(0));
14966 IntrinsicCost -= TTI->getCmpSelInstrCost(
14967 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14968 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14969 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14970 }
14971 return IntrinsicCost;
14972 };
14973 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14974 Instruction *VI) {
14975 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14976 return Cost;
14977 };
14978 switch (ShuffleOrOp) {
14979 case Instruction::PHI: {
14980 // Count reused scalars.
14981 InstructionCost ScalarCost = 0;
14982 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14983 for (Value *V : UniqueValues) {
14984 auto *PHI = dyn_cast<PHINode>(V);
14985 if (!PHI)
14986 continue;
14987
14988 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14989 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14990 Value *Op = PHI->getIncomingValue(I);
14991 Operands[I] = Op;
14992 }
14993 if (const TreeEntry *OpTE =
14994 getSameValuesTreeEntry(Operands.front(), Operands))
14995 if (CountedOps.insert(OpTE).second &&
14996 !OpTE->ReuseShuffleIndices.empty())
14997 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14998 OpTE->Scalars.size());
14999 }
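// Example (hypothetical sizes): an operand entry with
// ReuseShuffleIndices.size() == 8 and Scalars.size() == 4 adds
// 4 * TCC_Basic to the scalar-side cost here, making the vectorized PHIs
// look comparatively cheaper.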
15000
15001 return CommonCost - ScalarCost;
15002 }
15003 case Instruction::ExtractValue:
15004 case Instruction::ExtractElement: {
15005 APInt DemandedElts;
15006 VectorType *SrcVecTy = nullptr;
15007 auto GetScalarCost = [&](unsigned Idx) {
15008 if (isa<PoisonValue>(UniqueValues[Idx]))
15010
15011 auto *I = cast<Instruction>(UniqueValues[Idx]);
15012 if (!SrcVecTy) {
15013 if (ShuffleOrOp == Instruction::ExtractElement) {
15014 auto *EE = cast<ExtractElementInst>(I);
15015 SrcVecTy = EE->getVectorOperandType();
15016 } else {
15017 auto *EV = cast<ExtractValueInst>(I);
15018 Type *AggregateTy = EV->getAggregateOperand()->getType();
15019 unsigned NumElts;
15020 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
15021 NumElts = ATy->getNumElements();
15022 else
15023 NumElts = AggregateTy->getStructNumElements();
15024 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
15025 }
15026 }
15027 if (I->hasOneUse()) {
15028 Instruction *Ext = I->user_back();
15029 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
15031 // Use getExtractWithExtendCost() to calculate the cost of
15032 // extractelement/ext pair.
15033 InstructionCost Cost = TTI->getExtractWithExtendCost(
15034 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
15035 CostKind);
15036 // Subtract the cost of s|zext which is subtracted separately.
15037 Cost -= TTI->getCastInstrCost(
15038 Ext->getOpcode(), Ext->getType(), I->getType(),
15040 return Cost;
15041 }
15042 }
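// For example, a chain such as
//   %e = extractelement <4 x i32> %v, i32 1
//   %z = zext i32 %e to i64
// where the zext is the only user is costed above as a single
// extract-with-extend via getExtractWithExtendCost.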
15043 if (DemandedElts.isZero())
15044 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
15045 DemandedElts.setBit(*getExtractIndex(I));
15047 };
15048 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15049 return CommonCost - (DemandedElts.isZero()
15051 : TTI.getScalarizationOverhead(
15052 SrcVecTy, DemandedElts, /*Insert=*/false,
15053 /*Extract=*/true, CostKind));
15054 };
15055 return GetCostDiff(GetScalarCost, GetVectorCost);
15056 }
15057 case Instruction::InsertElement: {
15058 assert(E->ReuseShuffleIndices.empty() &&
15059 "Unique insertelements only are expected.");
15060 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
15061 unsigned const NumElts = SrcVecTy->getNumElements();
15062 unsigned const NumScalars = VL.size();
15063
15064 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
15065
15066 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15067 unsigned OffsetBeg = *getElementIndex(VL.front());
15068 unsigned OffsetEnd = OffsetBeg;
15069 InsertMask[OffsetBeg] = 0;
15070 for (auto [I, V] : enumerate(VL.drop_front())) {
15071 unsigned Idx = *getElementIndex(V);
15072 if (OffsetBeg > Idx)
15073 OffsetBeg = Idx;
15074 else if (OffsetEnd < Idx)
15075 OffsetEnd = Idx;
15076 InsertMask[Idx] = I + 1;
15077 }
15078 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
15079 if (NumOfParts > 0 && NumOfParts < NumElts)
15080 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
15081 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15082 VecScalarsSz;
15083 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15084 unsigned InsertVecSz = std::min<unsigned>(
15085 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
15086 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15087 bool IsWholeSubvector =
15088 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
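// Illustrative example (hypothetical sizes): NumElts == 16 with
// VecScalarsSz == 8 and inserts at indices 2..5 give OffsetBeg == 2,
// OffsetEnd == 5, Offset == 0, VecSz == 8 and InsertVecSz == 4; the inserts
// neither start nor end on a register boundary, so IsWholeSubvector is false.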
15089 // Check if we can safely insert a subvector. If it is not possible, just
15090 // generate a whole-sized vector and shuffle the source vector and the new
15091 // subvector.
15092 if (OffsetBeg + InsertVecSz > VecSz) {
15093 // Align OffsetBeg to generate correct mask.
15094 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
15095 InsertVecSz = VecSz;
15096 }
15097
15098 APInt DemandedElts = APInt::getZero(NumElts);
15099 // TODO: Add support for Instruction::InsertValue.
15100 SmallVector<int> Mask;
15101 if (!E->ReorderIndices.empty()) {
15102 inversePermutation(E->ReorderIndices, Mask);
15103 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
15104 } else {
15105 Mask.assign(VecSz, PoisonMaskElem);
15106 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
15107 }
15108 bool IsIdentity = true;
15109 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
15110 Mask.swap(PrevMask);
15111 for (unsigned I = 0; I < NumScalars; ++I) {
15112 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
15113 DemandedElts.setBit(InsertIdx);
15114 IsIdentity &= InsertIdx - OffsetBeg == I;
15115 Mask[InsertIdx - OffsetBeg] = I;
15116 }
15117 assert(Offset < NumElts && "Failed to find vector index offset");
15118
15120 Cost -=
15121 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
15122 /*Insert*/ true, /*Extract*/ false, CostKind);
15123
15124 // First cost - resize to actual vector size if not identity shuffle or
15125 // need to shift the vector.
15126 // Do not calculate the cost if the actual size is the register size and
15127 // we can merge this shuffle with the following SK_Select.
15128 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
15129 if (!IsIdentity)
15131 InsertVecTy, Mask);
15132 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15133 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15134 }));
15135 // Second cost - permutation with subvector, if some elements are from the
15136 // initial vector or inserting a subvector.
15137 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
15138 // subvector of ActualVecTy.
15139 SmallBitVector InMask =
15140 isUndefVector(FirstInsert->getOperand(0),
15141 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
15142 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
15143 if (InsertVecSz != VecSz) {
15144 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
15145 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
15146 CostKind, OffsetBeg - Offset, InsertVecTy);
15147 } else {
15148 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
15149 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
15150 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
15151 I <= End; ++I)
15152 if (Mask[I] != PoisonMaskElem)
15153 Mask[I] = I + VecSz;
15154 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
15155 Mask[I] =
15156 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
15157 Cost +=
15158 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
15159 }
15160 }
15161 return Cost;
15162 }
15163 case Instruction::ZExt:
15164 case Instruction::SExt:
15165 case Instruction::FPToUI:
15166 case Instruction::FPToSI:
15167 case Instruction::FPExt:
15168 case Instruction::PtrToInt:
15169 case Instruction::IntToPtr:
15170 case Instruction::SIToFP:
15171 case Instruction::UIToFP:
15172 case Instruction::Trunc:
15173 case Instruction::FPTrunc:
15174 case Instruction::BitCast: {
15175 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15176 Type *SrcScalarTy = VL0->getOperand(0)->getType();
15177 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
15178 unsigned Opcode = ShuffleOrOp;
15179 unsigned VecOpcode = Opcode;
15180 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15181 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15182 // Check if the values are candidates to demote.
15183 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
15184 if (SrcIt != MinBWs.end()) {
15185 SrcBWSz = SrcIt->second.first;
15186 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
15187 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
15188 SrcVecTy =
15189 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
15190 }
15191 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15192 if (BWSz == SrcBWSz) {
15193 VecOpcode = Instruction::BitCast;
15194 } else if (BWSz < SrcBWSz) {
15195 VecOpcode = Instruction::Trunc;
15196 } else if (It != MinBWs.end()) {
15197 assert(BWSz > SrcBWSz && "Invalid cast!");
15198 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15199 } else if (SrcIt != MinBWs.end()) {
15200 assert(BWSz > SrcBWSz && "Invalid cast!");
15201 VecOpcode =
15202 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15203 }
15204 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15205 !SrcIt->second.second) {
15206 VecOpcode = Instruction::UIToFP;
15207 }
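// For instance (hypothetical widths): if this entry was demoted to i16 and
// its source operand to i8, the vector cast becomes an i8 -> i16 sext/zext
// (or a bitcast/trunc when the widths match or shrink) instead of the
// original scalar cast opcode.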
15208 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
15209 assert(Idx == 0 && "Expected 0 index only");
15210 return TTI->getCastInstrCost(Opcode, VL0->getType(),
15211 VL0->getOperand(0)->getType(),
15213 };
15214 auto GetVectorCost = [=](InstructionCost CommonCost) {
15215 // Do not count cost here if minimum bitwidth is in effect and it is just
15216 // a bitcast (here it is just a noop).
15217 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15218 return CommonCost;
15219 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
15220 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
15221
15222 bool IsArithmeticExtendedReduction =
15223 E->Idx == 0 && UserIgnoreList &&
15224 all_of(*UserIgnoreList, [](Value *V) {
15225 auto *I = cast<Instruction>(V);
15226 return is_contained({Instruction::Add, Instruction::FAdd,
15227 Instruction::Mul, Instruction::FMul,
15228 Instruction::And, Instruction::Or,
15229 Instruction::Xor},
15230 I->getOpcode());
15231 });
15232 if (IsArithmeticExtendedReduction &&
15233 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15234 return CommonCost;
15235 return CommonCost +
15236 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
15237 VecOpcode == Opcode ? VI : nullptr);
15238 };
15239 return GetCostDiff(GetScalarCost, GetVectorCost);
15240 }
15241 case Instruction::FCmp:
15242 case Instruction::ICmp:
15243 case Instruction::Select: {
15244 CmpPredicate VecPred, SwappedVecPred;
15245 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
15246 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
15247 match(VL0, MatchCmp))
15248 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
15249 else
15250 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
15253 auto GetScalarCost = [&](unsigned Idx) {
15254 if (isa<PoisonValue>(UniqueValues[Idx]))
15256
15257 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15258 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
15261 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
15262 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
15263 !match(VI, MatchCmp)) ||
15264 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
15265 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
15266 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
15269
15270 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
15271 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
15272 CostKind, getOperandInfo(VI->getOperand(0)),
15273 getOperandInfo(VI->getOperand(1)), VI);
15274 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
15275 if (IntrinsicCost.isValid())
15276 ScalarCost = IntrinsicCost;
15277
15278 return ScalarCost;
15279 };
15280 auto GetVectorCost = [&](InstructionCost CommonCost) {
15281 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15282
15283 InstructionCost VecCost =
15284 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
15285 CostKind, getOperandInfo(E->getOperand(0)),
15286 getOperandInfo(E->getOperand(1)), VL0);
15287 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
15288 auto *CondType =
15289 getWidenedType(SI->getCondition()->getType(), VL.size());
15290 unsigned CondNumElements = CondType->getNumElements();
15291 unsigned VecTyNumElements = getNumElements(VecTy);
15292 assert(VecTyNumElements >= CondNumElements &&
15293 VecTyNumElements % CondNumElements == 0 &&
15294 "Cannot vectorize Instruction::Select");
15295 if (CondNumElements != VecTyNumElements) {
15296 // When the return type is i1 but the source is fixed vector type, we
15297 // need to duplicate the condition value.
15298 VecCost += ::getShuffleCost(
15299 *TTI, TTI::SK_PermuteSingleSrc, CondType,
15300 createReplicatedMask(VecTyNumElements / CondNumElements,
15301 CondNumElements));
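// E.g. (revec, hypothetical types): selects on <2 x float> values with a
// scalar i1 condition each give CondType == <2 x i1> for two selects; once
// the values are widened to <4 x float>, the condition is replicated with
// mask <0,0,1,1>, which is the extra shuffle costed here.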
15302 }
15303 }
15304 return VecCost + CommonCost;
15305 };
15306 return GetCostDiff(GetScalarCost, GetVectorCost);
15307 }
15308 case TreeEntry::MinMax: {
15309 auto GetScalarCost = [&](unsigned Idx) {
15310 return GetMinMaxCost(OrigScalarTy);
15311 };
15312 auto GetVectorCost = [&](InstructionCost CommonCost) {
15313 InstructionCost VecCost = GetMinMaxCost(VecTy);
15314 return VecCost + CommonCost;
15315 };
15316 return GetCostDiff(GetScalarCost, GetVectorCost);
15317 }
15318 case TreeEntry::FMulAdd: {
15319 auto GetScalarCost = [&](unsigned Idx) {
15320 if (isa<PoisonValue>(UniqueValues[Idx]))
15322 return GetFMulAddCost(E->getOperations(),
15323 cast<Instruction>(UniqueValues[Idx]));
15324 };
15325 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15326 FastMathFlags FMF;
15327 FMF.set();
15328 for (Value *V : E->Scalars) {
15329 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
15330 FMF &= FPCI->getFastMathFlags();
15331 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
15332 FMF &= FPCIOp->getFastMathFlags();
15333 }
15334 }
15335 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15336 {VecTy, VecTy, VecTy}, FMF);
15337 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15338 return VecCost + CommonCost;
15339 };
15340 return GetCostDiff(GetScalarCost, GetVectorCost);
15341 }
15342 case Instruction::FNeg:
15343 case Instruction::Add:
15344 case Instruction::FAdd:
15345 case Instruction::Sub:
15346 case Instruction::FSub:
15347 case Instruction::Mul:
15348 case Instruction::FMul:
15349 case Instruction::UDiv:
15350 case Instruction::SDiv:
15351 case Instruction::FDiv:
15352 case Instruction::URem:
15353 case Instruction::SRem:
15354 case Instruction::FRem:
15355 case Instruction::Shl:
15356 case Instruction::LShr:
15357 case Instruction::AShr:
15358 case Instruction::And:
15359 case Instruction::Or:
15360 case Instruction::Xor: {
15361 auto GetScalarCost = [&](unsigned Idx) {
15362 if (isa<PoisonValue>(UniqueValues[Idx]))
15364
15365 // We cannot retrieve the operand from UniqueValues[Idx] because an
15366 // interchangeable instruction may be used. The order and the actual
15367 // operand might differ from what is retrieved from UniqueValues[Idx].
15368 unsigned Lane = UniqueIndexes[Idx];
15369 Value *Op1 = E->getOperand(0)[Lane];
15370 Value *Op2;
15371 SmallVector<const Value *, 2> Operands(1, Op1);
15372 if (isa<UnaryOperator>(UniqueValues[Idx])) {
15373 Op2 = Op1;
15374 } else {
15375 Op2 = E->getOperand(1)[Lane];
15376 Operands.push_back(Op2);
15377 }
15378 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
15379 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
15380 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15381 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
15382 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
15383 I && (ShuffleOrOp == Instruction::FAdd ||
15384 ShuffleOrOp == Instruction::FSub)) {
15385 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15386 if (IntrinsicCost.isValid())
15387 ScalarCost = IntrinsicCost;
15388 }
15389 return ScalarCost;
15390 };
15391 auto GetVectorCost = [=](InstructionCost CommonCost) {
15392 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15393 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15394 ArrayRef<Value *> Ops = E->getOperand(I);
15395 if (all_of(Ops, [&](Value *Op) {
15396 auto *CI = dyn_cast<ConstantInt>(Op);
15397 return CI && CI->getValue().countr_one() >= It->second.first;
15398 }))
15399 return CommonCost;
15400 }
15401 }
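// Example of the check above (hypothetical): if this entry was demoted to
// i8 and one operand is all constants with at least 8 trailing one bits
// (e.g. 255), the 'and' becomes a no-op after demotion, so only CommonCost
// is returned.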
15402 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
15403 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
15404 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
15405 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
15406 Op2Info, {}, nullptr, TLI) +
15407 CommonCost;
15408 };
15409 return GetCostDiff(GetScalarCost, GetVectorCost);
15410 }
15411 case Instruction::GetElementPtr: {
15412 return CommonCost + GetGEPCostDiff(VL, VL0);
15413 }
15414 case Instruction::Load: {
15415 auto GetScalarCost = [&](unsigned Idx) {
15416 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
15417 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15418 VI->getAlign(), VI->getPointerAddressSpace(),
15420 };
15421 auto *LI0 = cast<LoadInst>(VL0);
15422 auto GetVectorCost = [&](InstructionCost CommonCost) {
15423 InstructionCost VecLdCost;
15424 switch (E->State) {
15425 case TreeEntry::Vectorize:
15426 if (unsigned Factor = E->getInterleaveFactor()) {
15427 VecLdCost = TTI->getInterleavedMemoryOpCost(
15428 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15429 LI0->getPointerAddressSpace(), CostKind);
15430
15431 } else {
15432 VecLdCost = TTI->getMemoryOpCost(
15433 Instruction::Load, VecTy, LI0->getAlign(),
15434 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15435 }
15436 break;
15437 case TreeEntry::StridedVectorize: {
15438 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
15439 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15440 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15441 Align CommonAlignment =
15442 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15443 VecLdCost = TTI->getMemIntrinsicInstrCost(
15444 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15445 StridedLoadTy, LI0->getPointerOperand(),
15446 /*VariableMask=*/false, CommonAlignment),
15447 CostKind);
15448 if (StridedLoadTy != VecTy)
15449 VecLdCost +=
15450 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15451 getCastContextHint(*E), CostKind);
15452
15453 break;
15454 }
15455 case TreeEntry::CompressVectorize: {
15456 bool IsMasked;
15457 unsigned InterleaveFactor;
15458 SmallVector<int> CompressMask;
15459 VectorType *LoadVecTy;
15460 SmallVector<Value *> Scalars(VL);
15461 if (!E->ReorderIndices.empty()) {
15462 SmallVector<int> Mask(E->ReorderIndices.begin(),
15463 E->ReorderIndices.end());
15464 reorderScalars(Scalars, Mask);
15465 }
15466 SmallVector<Value *> PointerOps(Scalars.size());
15467 for (auto [I, V] : enumerate(Scalars))
15468 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15469 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15470 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15471 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15472 CompressMask, LoadVecTy);
15473 assert(IsVectorized && "Failed to vectorize load");
15474 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15475 InterleaveFactor, IsMasked);
15476 Align CommonAlignment = LI0->getAlign();
15477 if (InterleaveFactor) {
15478 VecLdCost = TTI->getInterleavedMemoryOpCost(
15479 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15480 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15481 } else if (IsMasked) {
15482 VecLdCost = TTI->getMemIntrinsicInstrCost(
15483 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15484 CommonAlignment,
15485 LI0->getPointerAddressSpace()),
15486 CostKind);
15487 // TODO: include this cost into CommonCost.
15488 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15489 LoadVecTy, CompressMask, CostKind);
15490 } else {
15491 VecLdCost = TTI->getMemoryOpCost(
15492 Instruction::Load, LoadVecTy, CommonAlignment,
15493 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15494 // TODO: include this cost into CommonCost.
15495 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15496 LoadVecTy, CompressMask, CostKind);
15497 }
15498 break;
15499 }
15500 case TreeEntry::ScatterVectorize: {
15501 Align CommonAlignment =
15502 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15503 VecLdCost = TTI->getMemIntrinsicInstrCost(
15504 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15505 LI0->getPointerOperand(),
15506 /*VariableMask=*/false, CommonAlignment),
15507 CostKind);
15508 break;
15509 }
15510 case TreeEntry::CombinedVectorize:
15511 case TreeEntry::SplitVectorize:
15512 case TreeEntry::NeedToGather:
15513 llvm_unreachable("Unexpected vectorization state.");
15514 }
15515 return VecLdCost + CommonCost;
15516 };
15517
15518 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15519 // If this node generates a masked gather load, it is not a terminal node.
15520 // Hence the address operand cost is estimated separately.
15521 if (E->State == TreeEntry::ScatterVectorize)
15522 return Cost;
15523
15524 // Estimate cost of GEPs since this tree node is a terminator.
15525 SmallVector<Value *> PointerOps(VL.size());
15526 for (auto [I, V] : enumerate(VL))
15527 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15528 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15529 }
15530 case Instruction::Store: {
15531 bool IsReorder = !E->ReorderIndices.empty();
15532 auto GetScalarCost = [=](unsigned Idx) {
15533 auto *VI = cast<StoreInst>(VL[Idx]);
15534 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15535 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15536 VI->getAlign(), VI->getPointerAddressSpace(),
15537 CostKind, OpInfo, VI);
15538 };
15539 auto *BaseSI =
15540 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15541 auto GetVectorCost = [=](InstructionCost CommonCost) {
15542 // We know that we can merge the stores. Calculate the cost.
15543 InstructionCost VecStCost;
15544 if (E->State == TreeEntry::StridedVectorize) {
15545 Align CommonAlignment =
15546 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15547 VecStCost = TTI->getMemIntrinsicInstrCost(
15548 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15549 VecTy, BaseSI->getPointerOperand(),
15550 /*VariableMask=*/false, CommonAlignment),
15551 CostKind);
15552 } else {
15553 assert(E->State == TreeEntry::Vectorize &&
15554 "Expected either strided or consecutive stores.");
15555 if (unsigned Factor = E->getInterleaveFactor()) {
15556 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15557 "No reused shuffles expected");
15558 CommonCost = 0;
15559 VecStCost = TTI->getInterleavedMemoryOpCost(
15560 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15561 BaseSI->getPointerAddressSpace(), CostKind);
15562 } else {
15563 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15564 VecStCost = TTI->getMemoryOpCost(
15565 Instruction::Store, VecTy, BaseSI->getAlign(),
15566 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15567 }
15568 }
15569 return VecStCost + CommonCost;
15570 };
15571 SmallVector<Value *> PointerOps(VL.size());
15572 for (auto [I, V] : enumerate(VL)) {
15573 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15574 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15575 }
15576
15577 return GetCostDiff(GetScalarCost, GetVectorCost) +
15578 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15579 }
15580 case Instruction::Call: {
15581 auto GetScalarCost = [&](unsigned Idx) {
15582 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15585 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15586 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15587 }
15588 return TTI->getCallInstrCost(CI->getCalledFunction(),
15590 CI->getFunctionType()->params(), CostKind);
15591 };
15592 auto GetVectorCost = [=](InstructionCost CommonCost) {
15593 auto *CI = cast<CallInst>(VL0);
15596 CI, ID, VecTy->getNumElements(),
15597 It != MinBWs.end() ? It->second.first : 0, TTI);
15598 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15599 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15600 };
15601 return GetCostDiff(GetScalarCost, GetVectorCost);
15602 }
15603 case Instruction::ShuffleVector: {
15604 if (!SLPReVec || E->isAltShuffle())
15605 assert(E->isAltShuffle() &&
15606 ((Instruction::isBinaryOp(E->getOpcode()) &&
15607 Instruction::isBinaryOp(E->getAltOpcode())) ||
15608 (Instruction::isCast(E->getOpcode()) &&
15609 Instruction::isCast(E->getAltOpcode())) ||
15610 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15611 "Invalid Shuffle Vector Operand");
15612 // Try to find the previous shuffle node with the same operands and same
15613 // main/alternate ops.
15614 auto TryFindNodeWithEqualOperands = [=]() {
15615 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15616 if (TE.get() == E)
15617 break;
15618 if (TE->hasState() && TE->isAltShuffle() &&
15619 ((TE->getOpcode() == E->getOpcode() &&
15620 TE->getAltOpcode() == E->getAltOpcode()) ||
15621 (TE->getOpcode() == E->getAltOpcode() &&
15622 TE->getAltOpcode() == E->getOpcode())) &&
15623 TE->hasEqualOperands(*E))
15624 return true;
15625 }
15626 return false;
15627 };
15628 auto GetScalarCost = [&](unsigned Idx) {
15629 if (isa<PoisonValue>(UniqueValues[Idx]))
15631
15632 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15633 assert(E->getMatchingMainOpOrAltOp(VI) &&
15634 "Unexpected main/alternate opcode");
15635 (void)E;
15636 return TTI->getInstructionCost(VI, CostKind);
15637 };
15638 // Need to clear CommonCost since the final shuffle cost is included into
15639 // vector cost.
15640 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15641 // VecCost is equal to sum of the cost of creating 2 vectors
15642 // and the cost of creating shuffle.
15643 InstructionCost VecCost = 0;
15644 if (TryFindNodeWithEqualOperands()) {
15645 LLVM_DEBUG({
15646 dbgs() << "SLP: diamond match for alternate node found.\n";
15647 E->dump();
15648 });
15649 // No need to add new vector costs here since we're going to reuse the
15650 // same main/alternate vector ops, just do different shuffling.
15651 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15652 VecCost =
15653 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15654 VecCost +=
15655 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15656 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15657 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15658 VecCost = TTIRef.getCmpSelInstrCost(
15659 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15660 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15661 VL0);
15662 VecCost += TTIRef.getCmpSelInstrCost(
15663 E->getOpcode(), VecTy, MaskTy,
15664 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15665 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15666 E->getAltOp());
15667 } else {
15668 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15669 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15670 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15671 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15672 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15673 unsigned SrcBWSz =
15674 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15675 if (SrcIt != MinBWs.end()) {
15676 SrcBWSz = SrcIt->second.first;
15677 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15678 SrcTy = getWidenedType(SrcSclTy, VL.size());
15679 }
15680 if (BWSz <= SrcBWSz) {
15681 if (BWSz < SrcBWSz)
15682 VecCost =
15683 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15685 LLVM_DEBUG({
15686 dbgs()
15687 << "SLP: alternate extension, which should be truncated.\n";
15688 E->dump();
15689 });
15690 return VecCost;
15691 }
15692 }
15693 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15695 VecCost +=
15696 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15698 }
15699 SmallVector<int> Mask;
15700 E->buildAltOpShuffleMask(
15701 [&](Instruction *I) {
15702 assert(E->getMatchingMainOpOrAltOp(I) &&
15703 "Unexpected main/alternate opcode");
15704 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15705 *TLI);
15706 },
15707 Mask);
15709 FinalVecTy, Mask, CostKind);
15710 // Patterns like [fadd,fsub] can be combined into a single instruction
15711 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15712 // need to take into account their order when looking for the most used
15713 // order.
15714 unsigned Opcode0 = E->getOpcode();
15715 unsigned Opcode1 = E->getAltOpcode();
15716 SmallBitVector OpcodeMask(
15717 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
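// For example, scalars <fadd, fsub, fadd, fsub> produce an OpcodeMask with
// bits set for the lanes that use the alternate opcode; targets with a
// combined add/sub instruction can match such an alternating pattern
// directly, which is what isLegalAltInstr checks below.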
15718 // If this pattern is supported by the target then we consider the
15719 // order.
15720 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15721 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15722 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15723 return AltVecCost < VecCost ? AltVecCost : VecCost;
15724 }
15725 // TODO: Check the reverse order too.
15726 return VecCost;
15727 };
15728 if (SLPReVec && !E->isAltShuffle())
15729 return GetCostDiff(
15730 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15731 // If a group uses mask in order, the shufflevector can be
15732 // eliminated by instcombine. Then the cost is 0.
15734 "Not supported shufflevector usage.");
15735 auto *SV = cast<ShuffleVectorInst>(VL.front());
15736 unsigned SVNumElements =
15737 cast<FixedVectorType>(SV->getOperand(0)->getType())
15738 ->getNumElements();
15739 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15740 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15741 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15742 int NextIndex = 0;
15743 if (!all_of(Group, [&](Value *V) {
15745 "Not supported shufflevector usage.");
15746 auto *SV = cast<ShuffleVectorInst>(V);
15747 int Index;
15748 [[maybe_unused]] bool IsExtractSubvectorMask =
15749 SV->isExtractSubvectorMask(Index);
15750 assert(IsExtractSubvectorMask &&
15751 "Not supported shufflevector usage.");
15752 if (NextIndex != Index)
15753 return false;
15754 NextIndex += SV->getShuffleMask().size();
15755 return true;
15756 }))
15757 return ::getShuffleCost(
15759 calculateShufflevectorMask(E->Scalars));
15760 }
15761 return TTI::TCC_Free;
15762 });
15763 return GetCostDiff(GetScalarCost, GetVectorCost);
15764 }
15765 case Instruction::Freeze:
15766 return CommonCost;
15767 default:
15768 llvm_unreachable("Unknown instruction");
15769 }
15770}
15771
15772bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15773 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15774 << VectorizableTree.size() << " is fully vectorizable.\n");
15775
15776 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15777 SmallVector<int> Mask;
15778 return TE->isGather() &&
15779 !any_of(TE->Scalars,
15780 [this](Value *V) { return EphValues.contains(V); }) &&
15781 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15782 TE->Scalars.size() < Limit ||
15783 (((TE->hasState() &&
15784 TE->getOpcode() == Instruction::ExtractElement) ||
15786 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15787 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15788 !TE->isAltShuffle()) ||
15789 any_of(TE->Scalars, IsaPred<LoadInst>));
15790 };
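// In other words, a gather operand is acceptable here if it contains no
// ephemeral values and is all constants, a splat, small enough, a shuffle
// of extractelements, or a plain (non-alternate) load sequence; e.g. a
// two-node tree of a vectorizable store fed by a gather of extractelements
// still counts as fully vectorizable.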
15791
15792 // We only handle trees of heights 1 and 2.
15793 if (VectorizableTree.size() == 1 &&
15794 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15795 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15796 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15797 (ForReduction &&
15798 AreVectorizableGathers(VectorizableTree[0].get(),
15799 VectorizableTree[0]->Scalars.size()) &&
15800 VectorizableTree[0]->getVectorFactor() > 2)))
15801 return true;
15802
15803 if (VectorizableTree.size() != 2)
15804 return false;
15805
15806 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15807 // whose second node is a gather with fewer scalar operands than the initial
15808 // tree element (it may be profitable to shuffle the second gather) or whose
15809 // scalars are extractelements, which form a shuffle.
15810 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15811 AreVectorizableGathers(VectorizableTree[1].get(),
15812 VectorizableTree[0]->Scalars.size()))
15813 return true;
15814
15815 // Gathering cost would be too much for tiny trees.
15816 if (VectorizableTree[0]->isGather() ||
15817 (VectorizableTree[1]->isGather() &&
15818 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15819 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15820 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15821 return false;
15822
15823 return true;
15824}
15825
15826static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15828 bool MustMatchOrInst) {
15829 // Look past the root to find a source value. Arbitrarily follow the
15830 // path through operand 0 of any 'or'. Also, peek through optional
15831 // shift-left-by-multiple-of-8-bits.
15832 Value *ZextLoad = Root;
15833 const APInt *ShAmtC;
15834 bool FoundOr = false;
15835 while (!isa<ConstantExpr>(ZextLoad) &&
15836 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15837 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15838 ShAmtC->urem(8) == 0))) {
15839 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15840 ZextLoad = BinOp->getOperand(0);
15841 if (BinOp->getOpcode() == Instruction::Or)
15842 FoundOr = true;
15843 }
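// A typical candidate looks like (sketch):
//   (zext i8 %b0 to i32) | ((zext i8 %b1 to i32) << 8) |
//   ((zext i8 %b2 to i32) << 16) | ((zext i8 %b3 to i32) << 24)
// where walking operand 0 of each 'or'/'shl' eventually reaches one of the
// zext(load) leaves checked below.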
15844 // Check if the input is an extended load of the required or/shift expression.
15845 Value *Load;
15846 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15847 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15848 return false;
15849
15850 // Require that the total load bit width is a legal integer type.
15851 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15852 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15853 Type *SrcTy = Load->getType();
15854 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15855 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15856 return false;
15857
15858 // Everything matched - assume that we can fold the whole sequence using
15859 // load combining.
15860 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15861 << *(cast<Instruction>(Root)) << "\n");
15862
15863 return true;
15864}
15865
15867 if (RdxKind != RecurKind::Or)
15868 return false;
15869
15870 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15871 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15872 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15873 /* MatchOr */ false);
15874}
15875
15877 // Peek through a final sequence of stores and check if all operations are
15878 // likely to be load-combined.
15879 unsigned NumElts = Stores.size();
15880 for (Value *Scalar : Stores) {
15881 Value *X;
15882 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15883 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15884 return false;
15885 }
15886 return true;
15887}
15888
15889bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15890 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15891 return true;
15892
15893 // Graph is empty - do nothing.
15894 if (VectorizableTree.empty()) {
15895 assert(ExternalUses.empty() && "We shouldn't have any external users");
15896
15897 return true;
15898 }
15899
15900 // No need to vectorize inserts of gathered values.
15901 if (VectorizableTree.size() == 2 &&
15902 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15903 VectorizableTree[1]->isGather() &&
15904 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15905 !(isSplat(VectorizableTree[1]->Scalars) ||
15906 allConstant(VectorizableTree[1]->Scalars))))
15907 return true;
15908
15909 // If the graph includes only PHI nodes and gathers, it is definitely not
15910 // profitable for vectorization; we can skip it if the cost threshold is the
15911 // default. The cost of vectorized PHI nodes is almost always 0 plus the cost
15912 // of gathers/buildvectors.
15913 constexpr int Limit = 4;
15914 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15915 !VectorizableTree.empty() &&
15916 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15917 return (TE->isGather() &&
15918 (!TE->hasState() ||
15919 TE->getOpcode() != Instruction::ExtractElement) &&
15920 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15921 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15922 }))
15923 return true;
15924
15925 // Do not vectorize small tree of phis only, if all vector phis are also
15926 // gathered.
15927 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15928 VectorizableTree.size() <= Limit &&
15929 all_of(VectorizableTree,
15930 [&](const std::unique_ptr<TreeEntry> &TE) {
15931 return (TE->isGather() &&
15932 (!TE->hasState() ||
15933 TE->getOpcode() != Instruction::ExtractElement) &&
15934 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15935 Limit) ||
15936 (TE->hasState() &&
15937 (TE->getOpcode() == Instruction::InsertElement ||
15938 (TE->getOpcode() == Instruction::PHI &&
15939 all_of(TE->Scalars, [&](Value *V) {
15940 return isa<PoisonValue>(V) || MustGather.contains(V);
15941 }))));
15942 }) &&
15943 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15944 return TE->State == TreeEntry::Vectorize &&
15945 TE->getOpcode() == Instruction::PHI;
15946 }))
15947 return true;
15948
15949 // If the tree contains only phis, buildvectors, split nodes and
15950 // small nodes with reuses, we can skip it.
15951 SmallVector<const TreeEntry *> StoreLoadNodes;
15952 unsigned NumGathers = 0;
15953 constexpr int LimitTreeSize = 36;
15954 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15955 all_of(VectorizableTree,
15956 [&](const std::unique_ptr<TreeEntry> &TE) {
15957 if (!TE->isGather() && TE->hasState() &&
15958 (TE->getOpcode() == Instruction::Load ||
15959 TE->getOpcode() == Instruction::Store)) {
15960 StoreLoadNodes.push_back(TE.get());
15961 return true;
15962 }
15963 if (TE->isGather())
15964 ++NumGathers;
15965 return TE->State == TreeEntry::SplitVectorize ||
15966 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15967 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15968 VectorizableTree.size() > LimitTreeSize) ||
15969 (TE->isGather() &&
15970 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15971 (TE->hasState() &&
15972 (TE->getOpcode() == Instruction::PHI ||
15973 (TE->hasCopyableElements() &&
15974 static_cast<unsigned>(count_if(
15975 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15976 TE->Scalars.size() / 2) ||
15977 ((!TE->ReuseShuffleIndices.empty() ||
15978 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15979 TE->Scalars.size() == 2)));
15980 }) &&
15981 (StoreLoadNodes.empty() ||
15982 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15983 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15984 return TE->getOpcode() == Instruction::Store ||
15985 all_of(TE->Scalars, [&](Value *V) {
15986 return !isa<LoadInst>(V) ||
15987 areAllUsersVectorized(cast<Instruction>(V));
15988 });
15989 })))))
15990 return true;
15991
15992 // If the tree contains only the split root node, 2 non-gather nodes (whose
15993 // user is the root node) and buildvectors, we can skip it.
15994 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15995 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15996 VectorizableTree.size() >= Limit &&
15997 count_if(ArrayRef(VectorizableTree).drop_front(),
15998 [&](const std::unique_ptr<TreeEntry> &TE) {
15999 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
16000 TE->UserTreeIndex.UserTE->Idx == 0;
16001 }) == 2)
16002 return true;
16003
16004 // If the tree contains only the vectorized buildvector fed by a phi node,
16005 // with everything else gathered - skip it.
16006 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16007 VectorizableTree.size() > 2 &&
16008 VectorizableTree.front()->State == TreeEntry::Vectorize &&
16009 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16010 VectorizableTree[1]->State == TreeEntry::Vectorize &&
16011 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
16012 all_of(
16013 ArrayRef(VectorizableTree).drop_front(2),
16014 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
16015 return true;
16016
16017 // We can vectorize the tree if its size is greater than or equal to the
16018 // minimum size specified by the MinTreeSize command line option.
16019 if (VectorizableTree.size() >= MinTreeSize)
16020 return false;
16021
16022 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
16023 // can vectorize it if we can prove it fully vectorizable.
16024 if (isFullyVectorizableTinyTree(ForReduction))
16025 return false;
16026
16027 // Check if any of the gather nodes forms an insertelement buildvector
16028 // somewhere.
16029 bool IsAllowedSingleBVNode =
16030 VectorizableTree.size() > 1 ||
16031 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16032 !VectorizableTree.front()->isAltShuffle() &&
16033 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16034 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16035 allSameBlock(VectorizableTree.front()->Scalars));
16036 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16037 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
16038 return isa<ExtractElementInst, Constant>(V) ||
16039 (IsAllowedSingleBVNode &&
16040 !V->hasNUsesOrMore(UsesLimit) &&
16041 any_of(V->users(), IsaPred<InsertElementInst>));
16042 });
16043 }))
16044 return false;
16045
16046 if (VectorizableTree.back()->isGather() &&
16047 VectorizableTree.back()->hasState() &&
16048 VectorizableTree.back()->isAltShuffle() &&
16049 VectorizableTree.back()->getVectorFactor() > 2 &&
16050 allSameBlock(VectorizableTree.back()->Scalars) &&
16051 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16052 TTI->getScalarizationOverhead(
16053 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
16054 VectorizableTree.back()->getVectorFactor()),
16055 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
16056 /*Insert=*/true, /*Extract=*/false,
16058 return false;
16059
16060 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
16061 // vectorizable.
16062 return true;
16063}
16064
16067 constexpr unsigned SmallTree = 3;
16068 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16069 getCanonicalGraphSize() <= SmallTree &&
16070 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
16071 [](const std::unique_ptr<TreeEntry> &TE) {
16072 return TE->isGather() && TE->hasState() &&
16073 TE->getOpcode() == Instruction::Load &&
16074 !allSameBlock(TE->Scalars);
16075 }) == 1)
16076 return true;
16077 return false;
16078 }
16079 bool Res = false;
16080 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
16081 TreeEntry &E = *VectorizableTree[Idx];
16082 if (E.State == TreeEntry::SplitVectorize)
16083 return false;
16084 if (!E.isGather())
16085 continue;
16086 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16087 (!E.hasState() &&
16089 (isa<ExtractElementInst>(E.Scalars.front()) &&
16090 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
16091 return false;
16092 if (isSplat(E.Scalars) || allConstant(E.Scalars))
16093 continue;
16094 Res = true;
16095 }
16096 return Res;
16097}
16098
16100 // Walk from the bottom of the tree to the top, tracking which values are
16101 // live. When we see a call instruction that is not part of our tree,
16102 // query TTI to see if there is a cost to keeping values live over it
16103 // (for example, if spills and fills are required).
16104
16105 const TreeEntry *Root = VectorizableTree.front().get();
16106 if (Root->isGather())
16107 return 0;
16108
16109 InstructionCost Cost = 0;
16111 EntriesToOperands;
16112 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
16113 SmallPtrSet<const Instruction *, 8> LastInstructions;
16114 for (const auto &TEPtr : VectorizableTree) {
16115 if (!TEPtr->isGather()) {
16116 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
16117 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
16118 LastInstructions.insert(LastInst);
16119 }
16120 if (TEPtr->UserTreeIndex)
16121 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
16122 }
16123
16124 auto NoCallIntrinsic = [this](const Instruction *I) {
16125 const auto *II = dyn_cast<IntrinsicInst>(I);
16126 if (!II)
16127 return false;
16128 if (II->isAssumeLikeIntrinsic())
16129 return true;
16130 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
16131 InstructionCost IntrCost =
16132 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
16133 InstructionCost CallCost = TTI->getCallInstrCost(
16134 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
16135 return IntrCost < CallCost;
16136 };
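// E.g. an assume-like intrinsic, or an intrinsic that the cost model says
// is cheaper than an equivalent call (typically one lowered inline rather
// than as a libcall), is not treated as a call here and so does not force
// values to be spilled across it.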
16137
16138 // Maps the last instruction of an entry to the last instruction of one of
16139 // its operand entries plus a flag. If the flag is true, there are no calls
16140 // in between these instructions.
16142 CheckedInstructions;
16143 unsigned Budget = 0;
16144 const unsigned BudgetLimit =
16145 ScheduleRegionSizeBudget / VectorizableTree.size();
16146 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
16147 const Instruction *Last) {
16148 assert(First->getParent() == Last->getParent() &&
16149 "Expected instructions in same block.");
16150 if (auto It = CheckedInstructions.find(Last);
16151 It != CheckedInstructions.end()) {
16152 const Instruction *Checked = It->second.getPointer();
16153 if (Checked == First || Checked->comesBefore(First))
16154 return It->second.getInt() != 0;
16155 Last = Checked;
16156 } else if (Last == First || Last->comesBefore(First)) {
16157 return true;
16158 }
16160 ++First->getIterator().getReverse(),
16161 PrevInstIt =
16162 Last->getIterator().getReverse();
16163 SmallVector<const Instruction *> LastInstsInRange;
16164 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
16165 // Debug information does not impact spill cost.
16166 // Vectorized calls, represented as vector intrinsics, do not impact spill
16167 // cost.
16168 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
16169 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
16170 for (const Instruction *LastInst : LastInstsInRange)
16171 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
16172 return false;
16173 }
16174 if (LastInstructions.contains(&*PrevInstIt))
16175 LastInstsInRange.push_back(&*PrevInstIt);
16176
16177 ++PrevInstIt;
16178 ++Budget;
16179 }
16180 for (const Instruction *LastInst : LastInstsInRange)
16181 CheckedInstructions.try_emplace(
16182 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
16183 Budget <= BudgetLimit ? 1 : 0);
16184 return Budget <= BudgetLimit;
16185 };
16186 auto AddCosts = [&](const TreeEntry *Op) {
16187 Type *ScalarTy = Op->Scalars.front()->getType();
16188 auto It = MinBWs.find(Op);
16189 if (It != MinBWs.end())
16190 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
16191 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
16192 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
16193 if (ScalarTy->isVectorTy()) {
16194 // Handle revec dead vector instructions.
16195 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
16196 }
16197 };
16198 // Memoize the relationship between blocks, i.e. whether there is (at least
16199 // one) non-vectorized call between the blocks. This allows skipping the
16200 // analysis of the same block paths multiple times.
16202 ParentOpParentToPreds;
16203 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
16204 BasicBlock *OpParent) {
16205 auto Key = std::make_pair(Root, OpParent);
16206 if (auto It = ParentOpParentToPreds.find(Key);
16207 It != ParentOpParentToPreds.end())
16208 return It->second;
16210 if (Pred)
16211 Worklist.push_back(Pred);
16212 else
16213 Worklist.append(pred_begin(Root), pred_end(Root));
16216 ParentsPairsToAdd;
16217 bool Res = false;
16219 for (const auto &KeyPair : ParentsPairsToAdd) {
16220 assert(!ParentOpParentToPreds.contains(KeyPair) &&
16221 "Should not have been added before.");
16222 ParentOpParentToPreds.try_emplace(KeyPair, Res);
16223 }
16224 });
16225 while (!Worklist.empty()) {
16226 BasicBlock *BB = Worklist.pop_back_val();
16227 if (BB == OpParent || !Visited.insert(BB).second)
16228 continue;
16229 auto Pair = std::make_pair(BB, OpParent);
16230 if (auto It = ParentOpParentToPreds.find(Pair);
16231 It != ParentOpParentToPreds.end()) {
16232 Res = It->second;
16233 return Res;
16234 }
16235 ParentsPairsToAdd.insert(Pair);
16236 unsigned BlockSize = BB->size();
16237 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
16238 return Res;
16239 Budget += BlockSize;
16240 if (Budget > BudgetLimit)
16241 return Res;
16242 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
16243 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
16244 BB->getTerminator()))
16245 return Res;
16246 Worklist.append(pred_begin(BB), pred_end(BB));
16247 }
16248 Res = true;
16249 return Res;
16250 };
16251 SmallVector<const TreeEntry *> LiveEntries(1, Root);
16252 while (!LiveEntries.empty()) {
16253 const TreeEntry *Entry = LiveEntries.pop_back_val();
16254 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
16255 if (Operands.empty())
16256 continue;
16257 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
16258 BasicBlock *Parent = LastInst->getParent();
16259 for (const TreeEntry *Op : Operands) {
16260 if (!Op->isGather())
16261 LiveEntries.push_back(Op);
16262 if (Entry->State == TreeEntry::SplitVectorize ||
16263 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
16264 (Op->isGather() && allConstant(Op->Scalars)))
16265 continue;
16266 Budget = 0;
16267 BasicBlock *Pred = nullptr;
16268 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
16269 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16270 BasicBlock *OpParent;
16271 Instruction *OpLastInst;
16272 if (Op->isGather()) {
16273 assert(Entry->getOpcode() == Instruction::PHI &&
16274 "Expected phi node only.");
16275 OpParent = cast<PHINode>(Entry->getMainOp())
16276 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16277 OpLastInst = OpParent->getTerminator();
16278 for (Value *V : Op->Scalars) {
16279 auto *Inst = dyn_cast<Instruction>(V);
16280 if (!Inst)
16281 continue;
16282 if (isVectorized(V)) {
16283 OpParent = Inst->getParent();
16284 OpLastInst = Inst;
16285 break;
16286 }
16287 }
16288 } else {
16289 OpLastInst = EntriesToLastInstruction.at(Op);
16290 OpParent = OpLastInst->getParent();
16291 }
16292 // Check the call instructions within the same basic blocks.
16293 if (OpParent == Parent) {
16294 if (Entry->getOpcode() == Instruction::PHI) {
16295 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16296 AddCosts(Op);
16297 continue;
16298 }
16299 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16300 AddCosts(Op);
16301 continue;
16302 }
16303 // Check for call instruction in between blocks.
16304 // 1. Check entry's block to the head.
16305 if (Entry->getOpcode() != Instruction::PHI &&
16306 !CheckForNonVecCallsInSameBlock(
16307 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
16308 LastInst)) {
16309 AddCosts(Op);
16310 continue;
16311 }
16312 // 2. Check op's block from the end.
16313 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16314 OpParent->getTerminator())) {
16315 AddCosts(Op);
16316 continue;
16317 }
16318 // 3. Check the predecessors of entry's block till op's block.
16319 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16320 AddCosts(Op);
16321 continue;
16322 }
16323 }
16324 }
16325
16326 return Cost;
16327}
16328
16329 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
16330 /// the buildvector sequence.
16332 const InsertElementInst *IE2) {
16333 if (IE1 == IE2)
16334 return false;
16335 const auto *I1 = IE1;
16336 const auto *I2 = IE2;
16337 const InsertElementInst *PrevI1;
16338 const InsertElementInst *PrevI2;
16339 unsigned Idx1 = *getElementIndex(IE1);
16340 unsigned Idx2 = *getElementIndex(IE2);
16341 do {
16342 if (I2 == IE1)
16343 return true;
16344 if (I1 == IE2)
16345 return false;
16346 PrevI1 = I1;
16347 PrevI2 = I2;
16348 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16349 getElementIndex(I1).value_or(Idx2) != Idx2)
16350 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
16351 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16352 getElementIndex(I2).value_or(Idx1) != Idx1)
16353 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
16354 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16355 llvm_unreachable("Two different buildvectors not expected.");
16356}
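// For illustration, assuming a typical buildvector chain where each
// insertelement feeds the next one through operand 0:
//
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i64 1
//   %v2 = insertelement <4 x float> %v1,    float %c, i64 2
//
// Walking I2 backwards through operand 0 reaches %v0 (== IE1) before I1
// reaches IE2, so isFirstInsertElement(%v0, %v2) returns true, i.e. %v0 is
// followed by %v2 in the buildvector sequence.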
16357
16358namespace {
16359/// Returns the incoming Value * if the requested type is Value * too, or a
16360/// default value otherwise.
16361struct ValueSelect {
16362 template <typename U>
16363 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16364 return V;
16365 }
16366 template <typename U>
16367 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16368 return U();
16369 }
16370};
16371} // namespace
16372
16373/// Analyzes the provided shuffle masks and performs the requested actions on
16374/// the vectors with the given shuffle masks. It tries to do it in several
16375/// steps.
16376/// 1. If the Base vector is not an undef vector, resize the very first mask to
16377/// have a common VF and perform the action for 2 input vectors (including the
16378/// non-undef Base). Other shuffle masks are combined with the result of the
16379/// first stage and processed as a shuffle of 2 elements.
16380/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
16381/// the action only for a single vector with the given mask, if it is not the
16382/// identity mask.
16383/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
16384/// vectors, combining the masks properly between the steps.
16385template <typename T>
16386static T *performExtractsShuffleAction(
16387 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16388 function_ref<unsigned(T *)> GetVF,
16389 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16390 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16391 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16392 SmallVector<int> Mask(ShuffleMask.begin()->second);
16393 auto VMIt = std::next(ShuffleMask.begin());
16394 T *Prev = nullptr;
16395 SmallBitVector UseMask =
16396 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16397 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
16398 if (!IsBaseUndef.all()) {
16399 // Base is not undef, need to combine it with the next subvectors.
16400 std::pair<T *, bool> Res =
16401 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16402 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
16403 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16404 if (Mask[Idx] == PoisonMaskElem)
16405 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16406 else
16407 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16408 }
16409 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16410 assert((!V || GetVF(V) == Mask.size()) &&
16411 "Expected base vector of VF number of elements.");
16412 Prev = Action(Mask, {nullptr, Res.first});
16413 } else if (ShuffleMask.size() == 1) {
16414 // Base is undef and only 1 vector is shuffled - perform the action only for
16415 // a single vector, if the mask is not the identity mask.
16416 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16417 /*ForSingleMask=*/true);
16418 if (Res.second)
16419 // Identity mask is found.
16420 Prev = Res.first;
16421 else
16422 Prev = Action(Mask, {ShuffleMask.begin()->first});
16423 } else {
16424 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
16425 // shuffles step by step, combining the shuffles between the steps.
16426 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16427 unsigned Vec2VF = GetVF(VMIt->first);
16428 if (Vec1VF == Vec2VF) {
16429 // No need to resize the input vectors since they are of the same size, we
16430 // can shuffle them directly.
16431 ArrayRef<int> SecMask = VMIt->second;
16432 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16433 if (SecMask[I] != PoisonMaskElem) {
16434 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16435 Mask[I] = SecMask[I] + Vec1VF;
16436 }
16437 }
16438 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16439 } else {
16440 // Vectors of different sizes - resize and reshuffle.
16441 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16442 /*ForSingleMask=*/false);
16443 std::pair<T *, bool> Res2 =
16444 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16445 ArrayRef<int> SecMask = VMIt->second;
16446 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16447 if (Mask[I] != PoisonMaskElem) {
16448 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16449 if (Res1.second)
16450 Mask[I] = I;
16451 } else if (SecMask[I] != PoisonMaskElem) {
16452 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16453 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16454 }
16455 }
16456 Prev = Action(Mask, {Res1.first, Res2.first});
16457 }
16458 VMIt = std::next(VMIt);
16459 }
16460 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16461 // Perform requested actions for the remaining masks/vectors.
16462 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16463 // Shuffle other input vectors, if any.
16464 std::pair<T *, bool> Res =
16465 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16466 ArrayRef<int> SecMask = VMIt->second;
16467 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16468 if (SecMask[I] != PoisonMaskElem) {
16469 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16470 "Multiple uses of scalars.");
16471 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16472 } else if (Mask[I] != PoisonMaskElem) {
16473 Mask[I] = I;
16474 }
16475 }
16476 Prev = Action(Mask, {Prev, Res.first});
16477 }
16478 return Prev;
16479}
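// A minimal worked example of the mask combining above, assuming an undef
// Base and two input vectors of the same VF = 4:
//
//   Mask (first shuffle mask)  : {0, 1, P, P}   (P = PoisonMaskElem)
//   SecMask (second one)       : {P, P, 2, 3}
//
// Since Vec1VF == Vec2VF, the second mask is folded into the first one with
// its indices shifted by Vec1VF, giving {0, 1, 6, 7}, and Action() is invoked
// once for the pair of input vectors.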
16480
16482 ArrayRef<Value *> VectorizedVals) {
16484 SmallPtrSet<Value *, 4> CheckedExtracts;
16485 SmallPtrSet<const TreeEntry *, 4> GatheredLoadsNodes;
16486 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16487 << VectorizableTree.size() << ".\n");
16488 InstructionCost Cost = 0;
16489 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16490 TreeEntry &TE = *Ptr;
16491 // No need to count the cost for combined entries - they are combined with
16492 // other entries, so just skip their cost.
16493 if (TE.State == TreeEntry::CombinedVectorize) {
16494 LLVM_DEBUG(
16495 dbgs() << "SLP: Skipping cost for combined node that starts with "
16496 << *TE.Scalars[0] << ".\n";
16497 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16498 NodesCosts.try_emplace(&TE);
16499 continue;
16500 }
16501 if (TE.hasState() &&
16502 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16503 if (const TreeEntry *E =
16504 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16505 E && E->getVectorFactor() == TE.getVectorFactor()) {
16506 // Some gather nodes might be absolutely the same as some vectorizable
16507 // nodes after reordering; this needs to be handled here.
16508 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16509 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16510 << "SLP: Current total cost = " << Cost << "\n");
16511 NodesCosts.try_emplace(&TE);
16512 continue;
16513 }
16514 }
16515
16516 // Exclude the cost of gathered-load nodes which are not used. These nodes
16517 // were built as part of the final attempt to vectorize gathered loads.
16518 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16519 "Expected gather nodes with users only.");
16520
16521 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16522 Cost += C;
16523 NodesCosts.try_emplace(&TE, C);
16524 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16525 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16526 << "SLP: Current total cost = " << Cost << "\n");
16527 // Add gathered loads nodes to the set for later processing.
16528 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
16529 TE.getOpcode() == Instruction::Load)
16530 GatheredLoadsNodes.insert(&TE);
16531 }
16532 // Bail out if the cost threshold is negative and cost already below it.
16533 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
16534 Cost < -SLPCostThreshold)
16535 return Cost;
16536 // Bail out if gathered-load nodes are found.
16537 // TODO: add analysis for gathered load to include their cost correctly into
16538 // the related subtrees.
16539 if (!GatheredLoadsNodes.empty())
16540 return Cost;
16541 // A narrow, non-profitable tree in a loop? Skip it, as it may cause regressions.
16542 constexpr unsigned PartLimit = 2;
16543 const unsigned Sz =
16544 getVectorElementSize(VectorizableTree.front()->Scalars.front());
16545 const unsigned MinVF = getMinVF(Sz);
16546 if (Cost >= -SLPCostThreshold &&
16547 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
16548 (!VectorizableTree.front()->hasState() ||
16549 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
16550 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
16551 return Cost;
16552 SmallVector<std::pair<InstructionCost, SmallVector<unsigned>>> SubtreeCosts(
16553 VectorizableTree.size());
16554 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16555 TreeEntry &TE = *Ptr;
16556 InstructionCost C = NodesCosts.at(&TE);
16557 SubtreeCosts[TE.Idx].first += C;
16558 const TreeEntry *UserTE = TE.UserTreeIndex.UserTE;
16559 while (UserTE) {
16560 SubtreeCosts[UserTE->Idx].first += C;
16561 SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx);
16562 UserTE = UserTE->UserTreeIndex.UserTE;
16563 }
16564 }
16565 using CostIndicesTy =
16566 std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
16567 struct FirstGreater {
16568 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
16569 return LHS.second.first < RHS.second.first ||
16570 (LHS.second.first == RHS.second.first &&
16571 LHS.first->Idx < RHS.first->Idx);
16572 }
16573 };
16575 Worklist;
16576 for (const auto [Idx, P] : enumerate(SubtreeCosts))
16577 Worklist.emplace(VectorizableTree[Idx].get(), P);
16578
16579 // Narrow store trees with non-profitable immediate values - exit.
16580 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
16581 VectorizableTree.front()->hasState() &&
16582 VectorizableTree.front()->getOpcode() == Instruction::Store &&
16583 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
16584 return Cost;
16585
16586 bool Changed = false;
16587 while (!Worklist.empty() && Worklist.top().second.first > 0) {
16588 TreeEntry *TE = Worklist.top().first;
16589 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
16590 // Exit early if the parent node is a split node and any of the scalars
16591 // is used in other split nodes.
16592 (TE->UserTreeIndex &&
16593 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
16594 any_of(TE->Scalars, [&](Value *V) {
16595 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
16596 return Entries.size() > 1;
16597 }))) {
16598 Worklist.pop();
16599 continue;
16600 }
16601
16602 // Calculate the gather cost of the root node.
16603 InstructionCost SubtreeCost = Worklist.top().second.first;
16604 if (SubtreeCost < TE->Scalars.size()) {
16605 Worklist.pop();
16606 continue;
16607 }
16608 if (!TransformedToGatherNodes.empty()) {
16609 for (unsigned Idx : Worklist.top().second.second) {
16610 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
16611 if (It != TransformedToGatherNodes.end()) {
16612 SubtreeCost -= SubtreeCosts[Idx].first;
16613 SubtreeCost += It->second;
16614 }
16615 }
16616 }
16617 if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
16618 Worklist.pop();
16619 continue;
16620 }
16621 const unsigned Sz = TE->Scalars.size();
16622 APInt DemandedElts = APInt::getAllOnes(Sz);
16623 for (auto [Idx, V] : enumerate(TE->Scalars)) {
16624 if (isConstant(V))
16625 DemandedElts.clearBit(Idx);
16626 }
16628
16629 Type *ScalarTy = getValueType(TE->Scalars.front());
16630 auto *VecTy = getWidenedType(ScalarTy, Sz);
16631 const unsigned EntryVF = TE->getVectorFactor();
16632 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
16633 InstructionCost GatherCost = getScalarizationOverhead(
16634 *TTI, ScalarTy, VecTy, DemandedElts,
16635 /*Insert=*/true, /*Extract=*/false, CostKind);
16636 SmallVector<int> Mask;
16637 if (!TE->ReorderIndices.empty() &&
16638 TE->State != TreeEntry::CompressVectorize &&
16639 (TE->State != TreeEntry::StridedVectorize ||
16640 !isReverseOrder(TE->ReorderIndices))) {
16641 SmallVector<int> NewMask;
16642 if (TE->getOpcode() == Instruction::Store) {
16643 // For stores the order is actually a mask.
16644 NewMask.resize(TE->ReorderIndices.size());
16645 copy(TE->ReorderIndices, NewMask.begin());
16646 } else {
16647 inversePermutation(TE->ReorderIndices, NewMask);
16648 }
16649 ::addMask(Mask, NewMask);
16650 }
16651 if (!TE->ReuseShuffleIndices.empty())
16652 ::addMask(Mask, TE->ReuseShuffleIndices);
16653 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
16654 GatherCost +=
16655 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
16656 // If all scalars are reused in gather node(s) or other vector nodes, there
16657 // might be extra cost for inserting them.
16658 if (all_of(TE->Scalars, [&](Value *V) {
16659 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
16660 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
16661 }))
16662 GatherCost *= 2;
16663 // Erase subtree if it is non-profitable.
16664 if (SubtreeCost > GatherCost) {
16665 // If the remaining tree is just a buildvector - exit, as it will cause
16666 // endless attempts to vectorize.
16667 if (VectorizableTree.front()->hasState() &&
16668 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16669 TE->Idx == 1)
16671
16672 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
16673 << TE->Idx << " with cost "
16674 << Worklist.top().second.first << " and gather cost "
16675 << GatherCost << ".\n");
16676 if (TE->UserTreeIndex) {
16677 TransformedToGatherNodes.try_emplace(TE, GatherCost);
16678 NodesCosts.erase(TE);
16679 } else {
16680 DeletedNodes.insert(TE);
16681 TransformedToGatherNodes.erase(TE);
16682 NodesCosts.erase(TE);
16683 }
16684 for (unsigned Idx : Worklist.top().second.second) {
16685 TreeEntry &ChildTE = *VectorizableTree[Idx];
16686 DeletedNodes.insert(&ChildTE);
16687 TransformedToGatherNodes.erase(&ChildTE);
16688 NodesCosts.erase(&ChildTE);
16689 }
16690 Changed = true;
16691 }
16692 Worklist.pop();
16693 }
16694 if (!Changed)
16695 return SubtreeCosts.front().first;
16696
16697 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16698 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
16699 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
16700 continue;
16701 }
16702 if (DeletedNodes.contains(TE.get()))
16703 continue;
16704 if (!NodesCosts.contains(TE.get())) {
16705 InstructionCost C =
16706 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
16707 NodesCosts.try_emplace(TE.get(), C);
16708 }
16709 }
16710
16711 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
16712 InstructionCost NewCost = 0;
16713 for (const auto &P : NodesCosts) {
16714 NewCost += P.second;
16715 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
16716 << shortBundleName(P.first->Scalars, P.first->Idx)
16717 << ".\n"
16718 << "SLP: Current total cost = " << Cost << "\n");
16719 }
16720 if (NewCost >= Cost) {
16721 DeletedNodes.clear();
16722 TransformedToGatherNodes.clear();
16723 NewCost = Cost;
16724 }
16725 return NewCost;
16726}
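// An illustration of the trimming decision above, with made-up numbers:
// assume a subtree rooted at node N with 4 scalars accumulates a cost of +6,
// while gathering N directly (insertion of its non-constant scalars plus an
// optional permute) is estimated at +3. Then SubtreeCost (6) > GatherCost (3),
// so the whole subtree is turned into a gather node (or deleted if it has no
// user), the per-node costs are recomputed for the remaining entries, and the
// smaller of the old and new totals is returned.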
16727
16728namespace {
16729/// Data type for handling buildvector sequences with the reused scalars from
16730/// other tree entries.
16731template <typename T> struct ShuffledInsertData {
16732 /// List of insertelements to be replaced by shuffles.
16733 SmallVector<InsertElementInst *> InsertElements;
16734 /// The parent vectors and shuffle mask for the given list of inserts.
16735 MapVector<T, SmallVector<int>> ValueMasks;
16736};
16737} // namespace
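// For example (hypothetical values), a buildvector of 4 lanes fed by two tree
// entries TE0 and TE1 could be described as:
//   InsertElements = {%ins0, %ins1, %ins2, %ins3}
//   ValueMasks[TE0] = {0, 1, P, P}, ValueMasks[TE1] = {P, P, 0, 1}
// i.e. buildvector lanes 0-1 come from lanes 0-1 of TE0 and buildvector lanes
// 2-3 come from lanes 0-1 of TE1.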
16738
16740 ArrayRef<Value *> VectorizedVals,
16741 InstructionCost ReductionCost) {
16742 InstructionCost Cost = TreeCost + ReductionCost;
16743
16744 if (Cost >= -SLPCostThreshold &&
16745 none_of(ExternalUses, [](const ExternalUser &EU) {
16746 return isa_and_nonnull<InsertElementInst>(EU.User);
16747 }))
16748 return Cost;
16749
16750 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16751 InstructionCost ExtractCost = 0;
16752 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16753 SmallVector<APInt> DemandedElts;
16754 SmallDenseSet<Value *, 4> UsedInserts;
16756 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16758 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16759 // Keep track {Scalar, Index, User} tuple.
16760 // On AArch64, this helps in fusing a mov instruction, associated with
16761 // extractelement, with fmul in the backend so that extractelement is free.
16763 for (ExternalUser &EU : ExternalUses) {
16764 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16765 }
16766 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16767 for (ExternalUser &EU : ExternalUses) {
16768 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16769 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16770 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16771 else dbgs() << " User: nullptr\n");
16772 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16773
16774 // Uses by ephemeral values are free (because the ephemeral value will be
16775 // removed prior to code generation, and so the extraction will be
16776 // removed as well).
16777 if (EphValues.count(EU.User))
16778 continue;
16779
16780 // Check if the scalar for the given user, or for all users, is already accounted for.
16781 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16782 (EU.User &&
16783 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16784 continue;
16785
16786 // Skip if the user is in an unreachable block or an EH pad (rarely
16787 // executed), or its block is terminated with an unreachable instruction.
16788 if (BasicBlock *UserParent =
16789 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16790 UserParent &&
16791 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16792 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16793 continue;
16794
16795 // We only add extract cost once for the same scalar.
16796 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16797 !ExtractCostCalculated.insert(EU.Scalar).second)
16798 continue;
16799
16800 // No extract cost for a vector "scalar" if REVEC is disabled.
16801 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16802 continue;
16803
16804 // If found user is an insertelement, do not calculate extract cost but try
16805 // to detect it as a final shuffled/identity match.
16806 // TODO: what if a user is insertvalue when REVEC is enabled?
16807 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16808 VU && VU->getOperand(1) == EU.Scalar) {
16809 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16810 if (!UsedInserts.insert(VU).second)
16811 continue;
16812 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16813 if (InsertIdx) {
16814 const TreeEntry *ScalarTE = &EU.E;
16815 auto *It = find_if(
16816 ShuffledInserts,
16817 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16818 // Checks if 2 insertelements are from the same buildvector.
16819 InsertElementInst *VecInsert = Data.InsertElements.front();
16820 return areTwoInsertFromSameBuildVector(
16821 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16822 Value *Op0 = II->getOperand(0);
16823 if (isVectorized(II) && !isVectorized(Op0))
16824 return nullptr;
16825 return Op0;
16826 });
16827 });
16828 int VecId = -1;
16829 if (It == ShuffledInserts.end()) {
16830 auto &Data = ShuffledInserts.emplace_back();
16831 Data.InsertElements.emplace_back(VU);
16832 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16833 VecId = ShuffledInserts.size() - 1;
16834 auto It = MinBWs.find(ScalarTE);
16835 if (It != MinBWs.end() &&
16836 VectorCasts
16837 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16838 .second) {
16839 unsigned BWSz = It->second.first;
16840 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16841 unsigned VecOpcode;
16842 if (DstBWSz < BWSz)
16843 VecOpcode = Instruction::Trunc;
16844 else
16845 VecOpcode =
16846 It->second.second ? Instruction::SExt : Instruction::ZExt;
16848 InstructionCost C = TTI->getCastInstrCost(
16849 VecOpcode, FTy,
16850 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16851 FTy->getNumElements()),
16853 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16854 << " for extending externally used vector with "
16855 "non-equal minimum bitwidth.\n");
16856 Cost += C;
16857 }
16858 } else {
16859 if (isFirstInsertElement(VU, It->InsertElements.front()))
16860 It->InsertElements.front() = VU;
16861 VecId = std::distance(ShuffledInserts.begin(), It);
16862 }
16863 int InIdx = *InsertIdx;
16864 SmallVectorImpl<int> &Mask =
16865 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16866 if (Mask.empty())
16867 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16868 Mask[InIdx] = EU.Lane;
16869 DemandedElts[VecId].setBit(InIdx);
16870 continue;
16871 }
16872 }
16873 }
16874
16876 // If we plan to rewrite the tree in a smaller type, we will need to sign
16877 // extend the extracted value back to the original type. Here, we account
16878 // for the extract and the added cost of the sign extend if needed.
16879 InstructionCost ExtraCost = TTI::TCC_Free;
16880 auto *ScalarTy = EU.Scalar->getType();
16881 const unsigned BundleWidth = EU.E.getVectorFactor();
16882 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16883 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16884 const TreeEntry *Entry = &EU.E;
16885 auto It = MinBWs.find(Entry);
16886 if (It != MinBWs.end()) {
16887 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16888 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16889 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16890 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16891 ? Instruction::ZExt
16892 : Instruction::SExt;
16893 VecTy = getWidenedType(MinTy, BundleWidth);
16894 ExtraCost =
16895 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16896 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16897 << ExtraCost << "\n");
16898 } else {
16899 ExtraCost =
16900 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16901 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16902 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16903 << *VecTy << ": " << ExtraCost << "\n");
16904 }
16905 // Leave the scalar instructions as is if they are cheaper than extracts.
16906 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16907 Entry->getOpcode() == Instruction::Load) {
16908 // Checks if the user of the external scalar is phi in loop body.
16909 auto IsPhiInLoop = [&](const ExternalUser &U) {
16910 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16911 auto *I = cast<Instruction>(U.Scalar);
16912 const Loop *L = LI->getLoopFor(Phi->getParent());
16913 return L && (Phi->getParent() == I->getParent() ||
16914 L == LI->getLoopFor(I->getParent()));
16915 }
16916 return false;
16917 };
16918 if (!ValueToExtUses) {
16919 ValueToExtUses.emplace();
16920 for (const auto &P : enumerate(ExternalUses)) {
16921 // Ignore phis in loops.
16922 if (IsPhiInLoop(P.value()))
16923 continue;
16924
16925 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16926 }
16927 }
16928 // We can use the original instruction if no operands are vectorized or
16929 // they are already marked as externally used.
16930 auto *Inst = cast<Instruction>(EU.Scalar);
16931 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16932 auto OperandIsScalar = [&](Value *V) {
16933 if (!isVectorized(V)) {
16934 // Some extractelements might not be vectorized but instead
16935 // transformed into a shuffle and removed from the function;
16936 // consider that here.
16937 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16938 return !EE->hasOneUse() || !MustGather.contains(EE);
16939 return true;
16940 }
16941 return ValueToExtUses->contains(V);
16942 };
16943 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16944 bool CanBeUsedAsScalarCast = false;
16945 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16946 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16947 Op && all_of(Op->operands(), OperandIsScalar)) {
16948 InstructionCost OpCost =
16949 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16950 ? TTI->getInstructionCost(Op, CostKind)
16951 : 0;
16952 if (ScalarCost + OpCost <= ExtraCost) {
16953 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16954 ScalarCost += OpCost;
16955 }
16956 }
16957 }
16958 if (CanBeUsedAsScalar) {
16959 bool KeepScalar = ScalarCost <= ExtraCost;
16960 // Try to keep the original scalar if the user is a phi node from the same
16961 // block as the root phis currently being vectorized. This preserves better
16962 // ordering info for the PHIs being vectorized.
16963 bool IsProfitablePHIUser =
16964 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16965 VectorizableTree.front()->Scalars.size() > 2)) &&
16966 VectorizableTree.front()->hasState() &&
16967 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16968 !Inst->hasNUsesOrMore(UsesLimit) &&
16969 none_of(Inst->users(),
16970 [&](User *U) {
16971 auto *PHIUser = dyn_cast<PHINode>(U);
16972 return (!PHIUser ||
16973 PHIUser->getParent() !=
16974 cast<Instruction>(
16975 VectorizableTree.front()->getMainOp())
16976 ->getParent()) &&
16977 !isVectorized(U);
16978 }) &&
16979 count_if(Entry->Scalars, [&](Value *V) {
16980 return ValueToExtUses->contains(V);
16981 }) <= 2;
16982 if (IsProfitablePHIUser) {
16983 KeepScalar = true;
16984 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16985 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16986 (!GatheredLoadsEntriesFirst.has_value() ||
16987 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16988 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16989 return ValueToExtUses->contains(V);
16990 });
16991 auto It = ExtractsCount.find(Entry);
16992 if (It != ExtractsCount.end()) {
16993 assert(ScalarUsesCount >= It->getSecond().size() &&
16994 "Expected total number of external uses not less than "
16995 "number of scalar uses.");
16996 ScalarUsesCount -= It->getSecond().size();
16997 }
16998 // Keep the original scalar if the number of externally used instructions
16999 // in the same entry is not a power of 2. It may help to do some extra
17000 // vectorization for now.
17001 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
17002 }
17003 if (KeepScalar) {
17004 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
17005 for (Value *V : Inst->operands()) {
17006 auto It = ValueToExtUses->find(V);
17007 if (It != ValueToExtUses->end()) {
17008 // Replace all uses to avoid compiler crash.
17009 ExternalUses[It->second].User = nullptr;
17010 }
17011 }
17012 ExtraCost = ScalarCost;
17013 if (!IsPhiInLoop(EU))
17014 ExtractsCount[Entry].insert(Inst);
17015 if (CanBeUsedAsScalarCast) {
17016 ScalarOpsFromCasts.insert(Inst->getOperand(0));
17017 // Update the users of the operands of the cast operand to avoid
17018 // compiler crash.
17019 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
17020 for (Value *V : IOp->operands()) {
17021 auto It = ValueToExtUses->find(V);
17022 if (It != ValueToExtUses->end()) {
17023 // Replace all uses to avoid compiler crash.
17024 ExternalUses[It->second].User = nullptr;
17025 }
17026 }
17027 }
17028 }
17029 }
17030 }
17031 }
17032
17033 ExtractCost += ExtraCost;
17034 }
17035 // Insert externals for extract of operands of casts to be emitted as scalars
17036 // instead of extractelement.
17037 for (Value *V : ScalarOpsFromCasts) {
17038 ExternalUsesAsOriginalScalar.insert(V);
17039 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
17040 const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
17041 return TransformedToGatherNodes.contains(TE) ||
17042 DeletedNodes.contains(TE);
17043 });
17044 if (It != TEs.end()) {
17045 const TreeEntry *UserTE = *It;
17046 ExternalUses.emplace_back(V, nullptr, *UserTE,
17047 UserTE->findLaneForValue(V));
17048 }
17049 }
17050 }
17051 // Add reduced value cost, if resized.
17052 if (!VectorizedVals.empty()) {
17053 const TreeEntry &Root = *VectorizableTree.front();
17054 auto BWIt = MinBWs.find(&Root);
17055 if (BWIt != MinBWs.end()) {
17056 Type *DstTy = Root.Scalars.front()->getType();
17057 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
17058 unsigned SrcSz =
17059 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17060 if (OriginalSz != SrcSz) {
17061 unsigned Opcode = Instruction::Trunc;
17062 if (OriginalSz > SrcSz)
17063 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17064 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
17065 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
17066 assert(SLPReVec && "Only supported by REVEC.");
17067 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
17068 }
17069 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
17072 }
17073 }
17074 }
17075
17076 // A buildvector with externally used scalars, which should remain scalars,
17077 // should not be vectorized; otherwise the compiler may hang.
17078 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
17079 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
17080 VectorizableTree[1]->hasState() &&
17081 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17082 all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
17083 return ExternalUsesAsOriginalScalar.contains(V);
17084 }))
17086
17087 Cost += ExtractCost;
17088 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
17089 bool ForSingleMask) {
17090 InstructionCost C = 0;
17091 unsigned VF = Mask.size();
17092 unsigned VecVF = TE->getVectorFactor();
17093 bool HasLargeIndex =
17094 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
17095 if ((VF != VecVF && HasLargeIndex) ||
17097
17098 if (HasLargeIndex) {
17099 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
17100 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
17101 OrigMask.begin());
17103 getWidenedType(TE->getMainOp()->getType(), VecVF),
17104 OrigMask);
17105 LLVM_DEBUG(
17106 dbgs() << "SLP: Adding cost " << C
17107 << " for final shuffle of insertelement external users.\n";
17108 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17109 Cost += C;
17110 return std::make_pair(TE, true);
17111 }
17112
17113 if (!ForSingleMask) {
17114 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
17115 for (unsigned I = 0; I < VF; ++I) {
17116 if (Mask[I] != PoisonMaskElem)
17117 ResizeMask[Mask[I]] = Mask[I];
17118 }
17119 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
17122 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
17123 LLVM_DEBUG(
17124 dbgs() << "SLP: Adding cost " << C
17125 << " for final shuffle of insertelement external users.\n";
17126 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17127
17128 Cost += C;
17129 }
17130 }
17131 return std::make_pair(TE, false);
17132 };
17133 // Calculate the cost of the reshuffled vectors, if any.
17134 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
17135 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
17136 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
17137 unsigned VF = 0;
17138 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
17140 assert((TEs.size() == 1 || TEs.size() == 2) &&
17141 "Expected exactly 1 or 2 tree entries.");
17142 if (TEs.size() == 1) {
17143 if (VF == 0)
17144 VF = TEs.front()->getVectorFactor();
17145 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17146 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
17147 !all_of(enumerate(Mask), [=](const auto &Data) {
17148 return Data.value() == PoisonMaskElem ||
17149 (Data.index() < VF &&
17150 static_cast<int>(Data.index()) == Data.value());
17151 })) {
17154 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17155 << " for final shuffle of insertelement "
17156 "external users.\n";
17157 TEs.front()->dump();
17158 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17159 Cost += C;
17160 }
17161 } else {
17162 if (VF == 0) {
17163 if (TEs.front() &&
17164 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17165 VF = TEs.front()->getVectorFactor();
17166 else
17167 VF = Mask.size();
17168 }
17169 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17171 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
17172 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17173 << " for final shuffle of vector node and external "
17174 "insertelement users.\n";
17175 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17176 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17177 Cost += C;
17178 }
17179 VF = Mask.size();
17180 return TEs.back();
17181 };
17183 MutableArrayRef(Vector.data(), Vector.size()), Base,
17184 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
17185 EstimateShufflesCost);
17186 InstructionCost InsertCost = TTI->getScalarizationOverhead(
17188 ShuffledInserts[I].InsertElements.front()->getType()),
17189 DemandedElts[I],
17190 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
17191 Cost -= InsertCost;
17192 }
17193
17194 // Add the cost for reduced value resize (if required).
17195 if (ReductionBitWidth != 0) {
17196 assert(UserIgnoreList && "Expected reduction tree.");
17197 const TreeEntry &E = *VectorizableTree.front();
17198 auto It = MinBWs.find(&E);
17199 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
17200 unsigned SrcSize = It->second.first;
17201 unsigned DstSize = ReductionBitWidth;
17202 unsigned Opcode = Instruction::Trunc;
17203 if (SrcSize < DstSize) {
17204 bool IsArithmeticExtendedReduction =
17205 all_of(*UserIgnoreList, [](Value *V) {
17206 auto *I = cast<Instruction>(V);
17207 return is_contained({Instruction::Add, Instruction::FAdd,
17208 Instruction::Mul, Instruction::FMul,
17209 Instruction::And, Instruction::Or,
17210 Instruction::Xor},
17211 I->getOpcode());
17212 });
17213 if (IsArithmeticExtendedReduction)
17214 Opcode =
17215 Instruction::BitCast; // Handle it by getExtendedReductionCost
17216 else
17217 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17218 }
17219 if (Opcode != Instruction::BitCast) {
17220 auto *SrcVecTy =
17221 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
17222 auto *DstVecTy =
17223 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
17224 TTI::CastContextHint CCH = getCastContextHint(E);
17225 InstructionCost CastCost;
17226 switch (E.getOpcode()) {
17227 case Instruction::SExt:
17228 case Instruction::ZExt:
17229 case Instruction::Trunc: {
17230 const TreeEntry *OpTE = getOperandEntry(&E, 0);
17231 CCH = getCastContextHint(*OpTE);
17232 break;
17233 }
17234 default:
17235 break;
17236 }
17237 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
17239 Cost += CastCost;
17240 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
17241 << " for final resize for reduction from " << SrcVecTy
17242 << " to " << DstVecTy << "\n";
17243 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17244 }
17245 }
17246 }
17247
17248 std::optional<InstructionCost> SpillCost;
17249 if (Cost < -SLPCostThreshold) {
17250 SpillCost = getSpillCost();
17251 Cost += *SpillCost;
17252 }
17253#ifndef NDEBUG
17254 SmallString<256> Str;
17255 {
17256 raw_svector_ostream OS(Str);
17257 OS << "SLP: Spill Cost = ";
17258 if (SpillCost)
17259 OS << *SpillCost;
17260 else
17261 OS << "<skipped>";
17262 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
17263 << "SLP: Total Cost = " << Cost << ".\n";
17264 }
17265 LLVM_DEBUG(dbgs() << Str);
17266 if (ViewSLPTree)
17267 ViewGraph(this, "SLP" + F->getName(), false, Str);
17268#endif
17269
17270 return Cost;
17271}
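// For illustration (hypothetical costs): if extracting a scalar from lane 2 of
// its vector costs 2 and re-materializing the original scalar instruction
// costs 1, the scalar is normally kept (subject to the PHI and use-count
// heuristics above): ExtraCost becomes ScalarCost and the operands' entries in
// ExternalUses get their User reset. If the extract were free instead, the
// extractelement would typically be preferred.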
17272
17273/// Tries to find extractelement instructions with constant indices from a
17274/// fixed vector type and gather such instructions into a bunch, which is
17275/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
17276/// attempt was successful, the matched scalars are replaced by poison values
17277/// in \p VL for future analysis.
17278std::optional<TTI::ShuffleKind>
17279BoUpSLP::tryToGatherSingleRegisterExtractElements(
17280 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
17281 // Scan list of gathered scalars for extractelements that can be represented
17282 // as shuffles.
17284 SmallVector<int> UndefVectorExtracts;
17285 for (int I = 0, E = VL.size(); I < E; ++I) {
17286 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17287 if (!EI) {
17288 if (isa<UndefValue>(VL[I]))
17289 UndefVectorExtracts.push_back(I);
17290 continue;
17291 }
17292 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
17293 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
17294 continue;
17295 std::optional<unsigned> Idx = getExtractIndex(EI);
17296 // Undefined index.
17297 if (!Idx) {
17298 UndefVectorExtracts.push_back(I);
17299 continue;
17300 }
17301 if (Idx >= VecTy->getNumElements()) {
17302 UndefVectorExtracts.push_back(I);
17303 continue;
17304 }
17305 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
17306 ExtractMask.reset(*Idx);
17307 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
17308 UndefVectorExtracts.push_back(I);
17309 continue;
17310 }
17311 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
17312 }
17313 // Sort the vector operands by the maximum number of uses in extractelements.
17315 VectorOpToIdx.takeVector();
17316 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
17317 return P1.second.size() > P2.second.size();
17318 });
17319 // Find the best pair of the vectors or a single vector.
17320 const int UndefSz = UndefVectorExtracts.size();
17321 unsigned SingleMax = 0;
17322 unsigned PairMax = 0;
17323 if (!Vectors.empty()) {
17324 SingleMax = Vectors.front().second.size() + UndefSz;
17325 if (Vectors.size() > 1) {
17326 auto *ItNext = std::next(Vectors.begin());
17327 PairMax = SingleMax + ItNext->second.size();
17328 }
17329 }
17330 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
17331 return std::nullopt;
17332 // Check if it is better to perform a shuffle of 2 vectors or just of a
17333 // single vector.
17334 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
17335 SmallVector<Value *> GatheredExtracts(
17336 VL.size(), PoisonValue::get(VL.front()->getType()));
17337 if (SingleMax >= PairMax && SingleMax) {
17338 for (int Idx : Vectors.front().second)
17339 std::swap(GatheredExtracts[Idx], VL[Idx]);
17340 } else if (!Vectors.empty()) {
17341 for (unsigned Idx : {0, 1})
17342 for (int Idx : Vectors[Idx].second)
17343 std::swap(GatheredExtracts[Idx], VL[Idx]);
17344 }
17345 // Add extracts from undefs too.
17346 for (int Idx : UndefVectorExtracts)
17347 std::swap(GatheredExtracts[Idx], VL[Idx]);
17348 // Check that the gather of extractelements can be represented as just a
17349 // shuffle of the one or two vectors the scalars are extracted from.
17350 std::optional<TTI::ShuffleKind> Res =
17351 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
17352 if (!Res || all_of(Mask, equal_to(PoisonMaskElem))) {
17353 // TODO: try to check other subsets if possible.
17354 // Restore the original VL if attempt was not successful.
17355 copy(SavedVL, VL.begin());
17356 return std::nullopt;
17357 }
17358 // Restore unused scalars from mask, if some of the extractelements were not
17359 // selected for shuffle.
17360 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
17361 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
17362 isa<UndefValue>(GatheredExtracts[I])) {
17363 std::swap(VL[I], GatheredExtracts[I]);
17364 continue;
17365 }
17366 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17367 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
17368 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
17369 is_contained(UndefVectorExtracts, I))
17370 continue;
17371 }
17372 return Res;
17373}
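// A sketch of the intended effect, assuming VL = {e0, e1, %x, e3}, where
// eK = extractelement <4 x i32> %v, i64 K and %x is an unrelated scalar: the
// three extracts from %v are moved into GatheredExtracts and replaced by
// poison in VL, Mask becomes {0, 1, P, 3}, and a single-source shuffle kind is
// returned; %x stays in VL and is gathered separately.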
17374
17375/// Tries to find extractelement instructions with constant indices from a
17376/// fixed vector type and gather such instructions into a bunch, which is
17377/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
17378/// attempt was successful, the matched scalars are replaced by poison values
17379/// in \p VL for future analysis.
17380SmallVector<std::optional<TTI::ShuffleKind>>
17381BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
17382 SmallVectorImpl<int> &Mask,
17383 unsigned NumParts) const {
17384 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
17385 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
17386 Mask.assign(VL.size(), PoisonMaskElem);
17387 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17388 for (unsigned Part : seq<unsigned>(NumParts)) {
17389 // Scan list of gathered scalars for extractelements that can be represented
17390 // as shuffles.
17391 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
17392 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17393 SmallVector<int> SubMask;
17394 std::optional<TTI::ShuffleKind> Res =
17395 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
17396 ShufflesRes[Part] = Res;
17397 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
17398 }
17399 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
17400 return Res.has_value();
17401 }))
17402 ShufflesRes.clear();
17403 return ShufflesRes;
17404}
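// E.g., with VL.size() == 8 and NumParts == 2, each 4-element slice is matched
// independently by tryToGatherSingleRegisterExtractElements and the two
// sub-masks are concatenated into Mask; if neither slice matches, the result
// list is cleared.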
17405
17406std::optional<TargetTransformInfo::ShuffleKind>
17407BoUpSLP::isGatherShuffledSingleRegisterEntry(
17408 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
17409 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
17410 Entries.clear();
17411 if (TE->Idx == 0)
17412 return std::nullopt;
17413 // TODO: currently checking only for Scalars in the tree entry, need to count
17414 // reused elements too for better cost estimation.
17415 auto GetUserEntry = [&](const TreeEntry *TE) {
17416 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17417 TE = TE->UserTreeIndex.UserTE;
17418 if (TE == VectorizableTree.front().get())
17419 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
17420 return TE->UserTreeIndex;
17421 };
17422 auto HasGatherUser = [&](const TreeEntry *TE) {
17423 while (TE->Idx != 0 && TE->UserTreeIndex) {
17424 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17425 return true;
17426 TE = TE->UserTreeIndex.UserTE;
17427 }
17428 return false;
17429 };
17430 const EdgeInfo TEUseEI = GetUserEntry(TE);
17431 if (!TEUseEI)
17432 return std::nullopt;
17433 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
17434 const BasicBlock *TEInsertBlock = nullptr;
17435 // Main node of PHI entries keeps the correct order of operands/incoming
17436 // blocks.
17437 if (auto *PHI = dyn_cast_or_null<PHINode>(
17438 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
17439 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
17440 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
17441 TEInsertPt = TEInsertBlock->getTerminator();
17442 } else {
17443 TEInsertBlock = TEInsertPt->getParent();
17444 }
17445 if (!DT->isReachableFromEntry(TEInsertBlock))
17446 return std::nullopt;
17447 auto *NodeUI = DT->getNode(TEInsertBlock);
17448 assert(NodeUI && "Should only process reachable instructions");
17449 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
17450 auto CheckOrdering = [&](const Instruction *InsertPt) {
17451 // Argument InsertPt is an instruction where vector code for some other
17452 // tree entry (one that shares one or more scalars with TE) is going to be
17453 // generated. This lambda returns true if insertion point of vector code
17454 // for the TE dominates that point (otherwise dependency is the other way
17455 // around). The other node is not limited to be of a gather kind. Gather
17456 // nodes are not scheduled and their vector code is inserted before their
17457 // first user. If user is PHI, that is supposed to be at the end of a
17458 // predecessor block. Otherwise it is the last instruction among scalars of
17459 // the user node. So, instead of checking dependency between instructions
17460 // themselves, we check dependency between their insertion points for vector
17461 // code (since each scalar instruction ends up as a lane of a vector
17462 // instruction).
17463 const BasicBlock *InsertBlock = InsertPt->getParent();
17464 auto *NodeEUI = DT->getNode(InsertBlock);
17465 if (!NodeEUI)
17466 return false;
17467 assert((NodeUI == NodeEUI) ==
17468 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
17469 "Different nodes should have different DFS numbers");
17470 // Check the order of the gather nodes users.
17471 if (TEInsertPt->getParent() != InsertBlock &&
17472 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
17473 return false;
17474 if (TEInsertPt->getParent() == InsertBlock &&
17475 TEInsertPt->comesBefore(InsertPt))
17476 return false;
17477 return true;
17478 };
17479 // Find all tree entries used by the gathered values. If no common entries
17480 // are found - this is not a shuffle.
17481 // Here we build a set of tree nodes for each gathered value and try to
17482 // find the intersection between these sets. If we have at least one common
17483 // tree node for each gathered value - we have just a permutation of a
17484 // single vector. If we have 2 different sets, we're in a situation where we
17485 // have a permutation of 2 input vectors.
17486 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
17487 SmallDenseMap<Value *, int> UsedValuesEntry;
17488 SmallPtrSet<const Value *, 16> VisitedValue;
17489 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
17490 // The node is reused - exit.
17491 if ((TEPtr->getVectorFactor() != VL.size() &&
17492 TEPtr->Scalars.size() != VL.size()) ||
17493 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
17494 return false;
17495 UsedTEs.clear();
17496 UsedTEs.emplace_back().insert(TEPtr);
17497 for (Value *V : VL) {
17498 if (isConstant(V))
17499 continue;
17500 UsedValuesEntry.try_emplace(V, 0);
17501 }
17502 return true;
17503 };
17504 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
17505 unsigned EdgeIdx) {
17506 const TreeEntry *Ptr1 = User1;
17507 const TreeEntry *Ptr2 = User2;
17508 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
17509 while (Ptr2) {
17510 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
17511 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
17512 Ptr2 = Ptr2->UserTreeIndex.UserTE;
17513 }
17514 while (Ptr1) {
17515 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
17516 Ptr1 = Ptr1->UserTreeIndex.UserTE;
17517 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
17518 return Idx < It->second;
17519 }
17520 return false;
17521 };
17522 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
17523 Instruction *InsertPt) {
17524 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
17525 !TEUseEI.UserTE->isCopyableElement(
17526 const_cast<Instruction *>(TEInsertPt)) &&
17527 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17528 InsertPt->getNextNode() == TEInsertPt &&
17529 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
17530 !isUsedOutsideBlock(InsertPt));
17531 };
17532 for (Value *V : VL) {
17533 if (isConstant(V) || !VisitedValue.insert(V).second)
17534 continue;
17535 // Build a list of tree entries where V is used.
17536 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17538 ValueToGatherNodes.lookup(V).takeVector());
17539 if (TransformedToGatherNodes.contains(TE)) {
17540 for (TreeEntry *E : getSplitTreeEntries(V)) {
17541 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17542 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17543 continue;
17544 GatherNodes.push_back(E);
17545 }
17546 for (TreeEntry *E : getTreeEntries(V)) {
17547 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17548 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17549 continue;
17550 GatherNodes.push_back(E);
17551 }
17552 }
17553 for (const TreeEntry *TEPtr : GatherNodes) {
17554 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
17555 continue;
17556 assert(any_of(TEPtr->Scalars,
17557 [&](Value *V) { return GatheredScalars.contains(V); }) &&
17558 "Must contain at least single gathered value.");
17559 assert(TEPtr->UserTreeIndex &&
17560 "Expected only single user of a gather node.");
17561 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17562
17563 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17564 UseEI.UserTE->hasState())
17565 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
17566 : nullptr;
17567 Instruction *InsertPt =
17568 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
17569 : &getLastInstructionInBundle(UseEI.UserTE);
17570 if (TEInsertPt == InsertPt) {
17571 // Check nodes, which might be emitted first.
17572 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17573 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17574 TEUseEI.UserTE->isAltShuffle()) &&
17575 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
17576 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17577 (UseEI.UserTE->hasState() &&
17578 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17579 !UseEI.UserTE->isAltShuffle()) ||
17580 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
17581 continue;
17582 }
17583
17584 // If the schedulable insertion point is used in multiple entries - just
17585 // exit, no known ordering at this point, available only after real
17586 // scheduling.
17587 if (!doesNotNeedToBeScheduled(InsertPt) &&
17588 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17589 continue;
17590 // If the users are the PHI nodes with the same incoming blocks - skip.
17591 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17592 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17593 UseEI.UserTE->State == TreeEntry::Vectorize &&
17594 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17595 TEUseEI.UserTE != UseEI.UserTE)
17596 continue;
17597 // If 2 gathers are operands of the same entry (regardless of whether
17598 // the user is a PHI or not), compare operand indices and use the earlier
17599 // one as the base.
17600 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17601 continue;
17602 // If the user instruction is used for some reason in different
17603 // vectorized nodes - make it depend on index.
17604 if (TEUseEI.UserTE != UseEI.UserTE &&
17605 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17606 HasGatherUser(TEUseEI.UserTE)))
17607 continue;
17608 // If the user node is the operand of the other user node - skip.
17609 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17610 continue;
17611 }
17612
17613 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17614 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17615 UseEI.UserTE->doesNotNeedToSchedule() &&
17616 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
17617 continue;
17618 // Check if the user node of the TE comes after user node of TEPtr,
17619 // otherwise TEPtr depends on TE.
17620 if ((TEInsertBlock != InsertPt->getParent() ||
17621 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17622 (!CheckOrdering(InsertPt) ||
17623 (UseEI.UserTE->hasCopyableElements() &&
17624 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17625 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
17626 continue;
17627 // The node is reused - exit.
17628 if (CheckAndUseSameNode(TEPtr))
17629 break;
17630 // Is the parent node copyable with its last instruction used outside the
17631 // block, and is that last instruction the next instruction after TEPtr's
17632 // last instruction? If so, skip it to preserve the def-use chain.
17633 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17634 continue;
17635 VToTEs.insert(TEPtr);
17636 }
17637 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17638 const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
17639 return MTE != TE && MTE != TEUseEI.UserTE &&
17640 !DeletedNodes.contains(MTE) &&
17641 !TransformedToGatherNodes.contains(MTE);
17642 });
17643 if (It != VTEs.end()) {
17644 const TreeEntry *VTE = *It;
17645 if (none_of(TE->CombinedEntriesWithIndices,
17646 [&](const auto &P) { return P.first == VTE->Idx; })) {
17647 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17648 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17649 continue;
17650 }
17651 // The node is reused - exit.
17652 if (CheckAndUseSameNode(VTE))
17653 break;
17654 VToTEs.insert(VTE);
17655 }
17656 }
17657 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17658 const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
17659 return TE != MainTE && !DeletedNodes.contains(TE) &&
17660 !TransformedToGatherNodes.contains(TE);
17661 });
17662 if (It != VTEs.end()) {
17663 const TreeEntry *VTE = *It;
17664 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17665 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17666 VTEs = VTEs.drop_front();
17667 // Iterate through all vectorized nodes.
17668 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
17669 return MTE->State == TreeEntry::Vectorize;
17670 });
17671 if (MIt == VTEs.end())
17672 continue;
17673 VTE = *MIt;
17674 }
17675 if (none_of(TE->CombinedEntriesWithIndices,
17676 [&](const auto &P) { return P.first == VTE->Idx; })) {
17677 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17678 if (&LastBundleInst == TEInsertPt ||
17679 !CheckOrdering(&LastBundleInst) ||
17680 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17681 continue;
17682 }
17683 // The node is reused - exit.
17684 if (CheckAndUseSameNode(VTE))
17685 break;
17686 VToTEs.insert(VTE);
17687 }
17688 }
17689 if (VToTEs.empty())
17690 continue;
17691 if (UsedTEs.empty()) {
17692 // The first iteration, just insert the list of nodes to vector.
17693 UsedTEs.push_back(VToTEs);
17694 UsedValuesEntry.try_emplace(V, 0);
17695 } else {
17696 // Need to check if there are any previously used tree nodes which use V.
17697 // If there are no such nodes, consider that we have another input
17698 // vector.
17699 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17700 unsigned Idx = 0;
17701 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17702 // Do we have a non-empty intersection of previously listed tree entries
17703 // and tree entries using current V?
17704 set_intersect(VToTEs, Set);
17705 if (!VToTEs.empty()) {
17706 // Yes, write the new subset and continue analysis for the next
17707 // scalar.
17708 Set.swap(VToTEs);
17709 break;
17710 }
17711 VToTEs = SavedVToTEs;
17712 ++Idx;
17713 }
17714 // No non-empty intersection found - need to add a second set of possible
17715 // source vectors.
17716 if (Idx == UsedTEs.size()) {
17717 // If the number of input vectors is greater than 2 - not a permutation,
17718 // fall back to the regular gather.
17719 // TODO: support multiple reshuffled nodes.
17720 if (UsedTEs.size() == 2)
17721 continue;
17722 UsedTEs.push_back(SavedVToTEs);
17723 Idx = UsedTEs.size() - 1;
17724 }
17725 UsedValuesEntry.try_emplace(V, Idx);
17726 }
17727 }
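// To illustrate the intersection logic above with a hypothetical gather
// VL = {a, b, c, d}: if a and b are vectorized in entry TE1 and c and d in
// entry TE2, the loop ends with UsedTEs = {{TE1}, {TE2}} and UsedValuesEntry
// mapping a/b to set 0 and c/d to set 1, i.e. the gather is a permutation of
// 2 input vectors; a value that appears only in a third, unrelated entry is
// simply skipped and handled as part of the regular gather.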
17728
17729 if (UsedTEs.empty()) {
17730 Entries.clear();
17731 return std::nullopt;
17732 }
17733
17734 unsigned VF = 0;
17735 if (UsedTEs.size() == 1) {
17736 // Keep the order to avoid non-determinism.
17737 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17738 UsedTEs.front().end());
17739 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17740 return TE1->Idx < TE2->Idx;
17741 });
17742 // Try to find the perfect match in another gather node at first.
17743 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17744 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17745 });
17746 if (It != FirstEntries.end() &&
17747 ((*It)->getVectorFactor() == VL.size() ||
17748 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17749 TE->ReuseShuffleIndices.size() == VL.size() &&
17750 (*It)->isSame(TE->Scalars)))) {
17751 Entries.push_back(*It);
17752 if ((*It)->getVectorFactor() == VL.size()) {
17753 std::iota(std::next(Mask.begin(), Part * VL.size()),
17754 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17755 } else {
17756 SmallVector<int> CommonMask = TE->getCommonMask();
17757 copy(CommonMask, Mask.begin());
17758 }
17759 // Clear undef scalars.
17760 for (unsigned I : seq<unsigned>(VL.size()))
17761 if (isa<PoisonValue>(VL[I]))
17762 Mask[Part * VL.size() + I] = PoisonMaskElem;
17763 return TargetTransformInfo::SK_PermuteSingleSrc;
17764 }
17765 // No perfect match, just shuffle, so choose the first tree node from the
17766 // tree.
17767 Entries.push_back(FirstEntries.front());
17768 // Update mapping between values and corresponding tree entries.
17769 for (auto &P : UsedValuesEntry)
17770 P.second = 0;
17771 VF = FirstEntries.front()->getVectorFactor();
17772 } else {
17773 // Try to find nodes with the same vector factor.
17774 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17775 // Keep the order of tree nodes to avoid non-determinism.
17776 DenseMap<int, const TreeEntry *> VFToTE;
17777 for (const TreeEntry *TE : UsedTEs.front()) {
17778 unsigned VF = TE->getVectorFactor();
17779 auto It = VFToTE.find(VF);
17780 if (It != VFToTE.end()) {
17781 if (It->second->Idx > TE->Idx)
17782 It->getSecond() = TE;
17783 continue;
17784 }
17785 VFToTE.try_emplace(VF, TE);
17786 }
17787 // Same, keep the order to avoid non-determinism.
17788 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17789 UsedTEs.back().end());
17790 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17791 return TE1->Idx < TE2->Idx;
17792 });
17793 for (const TreeEntry *TE : SecondEntries) {
17794 auto It = VFToTE.find(TE->getVectorFactor());
17795 if (It != VFToTE.end()) {
17796 VF = It->first;
17797 Entries.push_back(It->second);
17798 Entries.push_back(TE);
17799 break;
17800 }
17801 }
17802 // No 2 source vectors with the same vector factor - just choose 2 with max
17803 // index.
17804 if (Entries.empty()) {
17805 Entries.push_back(*llvm::max_element(
17806 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17807 return TE1->Idx < TE2->Idx;
17808 }));
17809 Entries.push_back(SecondEntries.front());
17810 VF = std::max(Entries.front()->getVectorFactor(),
17811 Entries.back()->getVectorFactor());
17812 } else {
17813 VF = Entries.front()->getVectorFactor();
17814 }
17815 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17816 for (const TreeEntry *E : Entries)
17817 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17818 E->Scalars.end());
17819 // Update mapping between values and corresponding tree entries.
17820 for (auto &P : UsedValuesEntry) {
17821 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17822 if (ValuesToEntries[Idx].contains(P.first)) {
17823 P.second = Idx;
17824 break;
17825 }
17826 }
17827 }
17828
17829 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17830 // Checks whether the 2 PHIs are compatible, i.e. have a high chance of
17831 // being vectorized together.
17832 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17833 auto *PHI = cast<PHINode>(V);
17834 auto *PHI1 = cast<PHINode>(V1);
17835 // Check that all incoming values are compatible/from same parent (if they
17836 // are instructions).
17837 // The incoming values are compatible if they are all constants, or
17838 // instructions with the same/alternate opcodes from the same basic block.
17839 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17840 Value *In = PHI->getIncomingValue(I);
17841 Value *In1 = PHI1->getIncomingValue(I);
17842 if (isConstant(In) && isConstant(In1))
17843 continue;
17844 if (!getSameOpcode({In, In1}, *TLI))
17845 return false;
17846 if (cast<Instruction>(In)->getParent() !=
17847 cast<Instruction>(In1)->getParent())
17848 return false;
17849 }
17850 return true;
17851 };
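// Illustrative example (not from the source): the PHIs
//   %p0 = phi i32 [ %a, %bb0 ], [ 7, %bb1 ]
//   %p1 = phi i32 [ %b, %bb0 ], [ 9, %bb1 ]
// are treated as compatible when %a and %b have the same (or alternate)
// opcode and live in the same basic block; the constant pair 7/9 is always
// accepted.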
17852 // Check if the value can be ignored during analysis for shuffled gathers.
17853 // It is better to ignore instructions that do not form splats, are not
17854 // vectorized and are not extractelements (those are handled by the
17855 // extractelement processing), or that may form a vector node in the future.
17856 auto MightBeIgnored = [=](Value *V) {
17857 auto *I = dyn_cast<Instruction>(V);
17858 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17859 !isVectorLikeInstWithConstOps(I) &&
17860 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17861 };
17862 // Check that the neighbor instruction may form a full vector node with the
17863 // current instruction V. It is possible, if they have same/alternate opcode
17864 // and same parent basic block.
17865 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17866 Value *V1 = VL[Idx];
17867 bool UsedInSameVTE = false;
17868 auto It = UsedValuesEntry.find(V1);
17869 if (It != UsedValuesEntry.end())
17870 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17871 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17872 getSameOpcode({V, V1}, *TLI) &&
17873 cast<Instruction>(V)->getParent() ==
17874 cast<Instruction>(V1)->getParent() &&
17875 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17876 };
17877 // Build a shuffle mask for better cost estimation and vector emission.
17878 SmallBitVector UsedIdxs(Entries.size());
17879 SmallVector<std::pair<unsigned, int>> EntryLanes;
17880 for (int I = 0, E = VL.size(); I < E; ++I) {
17881 Value *V = VL[I];
17882 auto It = UsedValuesEntry.find(V);
17883 if (It == UsedValuesEntry.end())
17884 continue;
17885 // Do not try to shuffle scalars if they are constants or instructions
17886 // that may be vectorized as part of the subsequent buildvector
17887 // vectorization.
17888 if (isConstant(V) || (MightBeIgnored(V) &&
17889 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17890 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17891 continue;
17892 unsigned Idx = It->second;
17893 EntryLanes.emplace_back(Idx, I);
17894 UsedIdxs.set(Idx);
17895 }
17896 // Iterate through all shuffled scalars and select entries, which can be used
17897 // for final shuffle.
17898 SmallVector<const TreeEntry *> TempEntries;
17899 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17900 if (!UsedIdxs.test(I))
17901 continue;
17902 // Fix the entry number for the given scalar. If it is the first entry, set
17903 // Pair.first to 0, otherwise to 1 (currently at most 2 nodes are selected).
17904 // These indices are used when calculating final shuffle mask as the vector
17905 // offset.
17906 for (std::pair<unsigned, int> &Pair : EntryLanes)
17907 if (Pair.first == I)
17908 Pair.first = TempEntries.size();
17909 TempEntries.push_back(Entries[I]);
17910 }
17911 Entries.swap(TempEntries);
17912 if (EntryLanes.size() == Entries.size() &&
17913 !VL.equals(ArrayRef(TE->Scalars)
17914 .slice(Part * VL.size(),
17915 std::min<int>(VL.size(), TE->Scalars.size())))) {
17916 // We may have only 1 or 2 entries here. If the number of scalars is equal
17917 // to the number of entries, there is no need to do the analysis, it is not
17918 // very profitable. Since VL is not the same as TE->Scalars, it means we
17919 // already have some shuffles before. Cut off the unprofitable case.
17920 Entries.clear();
17921 return std::nullopt;
17922 }
17923 // Build the final mask, check for the identity shuffle, if possible.
17924 bool IsIdentity = Entries.size() == 1;
17925 // Pair.first is the offset to the vector, while Pair.second is the index of
17926 // scalar in the list.
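// For example (illustrative): with two selected entries and VF == 4, a scalar
// taken from lane 2 of the second entry (Pair.first == 1) receives the mask
// value 1 * 4 + 2 == 6.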
17927 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17928 unsigned Idx = Part * VL.size() + Pair.second;
17929 Mask[Idx] =
17930 Pair.first * VF +
17931 (ForOrder ? std::distance(
17932 Entries[Pair.first]->Scalars.begin(),
17933 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17934 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17935 IsIdentity &= Mask[Idx] == Pair.second;
17936 }
17937 if (ForOrder || IsIdentity || Entries.empty()) {
17938 switch (Entries.size()) {
17939 case 1:
17940 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17941 return TargetTransformInfo::SK_PermuteSingleSrc;
17942 break;
17943 case 2:
17944 if (EntryLanes.size() > 2 || VL.size() <= 2)
17945 return TargetTransformInfo::SK_PermuteTwoSrc;
17946 break;
17947 default:
17948 break;
17949 }
17950 } else if (!isa<VectorType>(VL.front()->getType()) &&
17951 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17952 // Do the cost estimation only if the shuffle is potentially more beneficial than a buildvector.
17953 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17954 std::next(Mask.begin(), (Part + 1) * VL.size()));
17955 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17956 for (int Idx : SubMask) {
17957 if (Idx == PoisonMaskElem)
17958 continue;
17959 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17960 MinElement = Idx;
17961 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17962 MaxElement = Idx;
17963 }
17964 assert(MaxElement >= 0 && MinElement >= 0 &&
17965 MaxElement % VF >= MinElement % VF &&
17966 "Expected at least single element.");
17967 unsigned NewVF = std::max<unsigned>(
17968 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17969 (MaxElement % VF) -
17970 (MinElement % VF) + 1));
17971 if (NewVF < VF) {
17972 for (int &Idx : SubMask) {
17973 if (Idx == PoisonMaskElem)
17974 continue;
17975 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17976 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17977 }
17978 } else {
17979 NewVF = VF;
17980 }
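// Illustrative example, assuming the target rounds 3 elements up to a full
// 4-element vector and VL.size() <= 4: with VF == 8 and only lanes 4..6 of
// each source used, NewVF becomes 4; an index 5 from the first source is
// remapped to 1 and an index 13 (lane 5 of the second source) to 4 + 1 == 5.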
17981
17982 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17983 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17984 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17985 auto GetShuffleCost = [&,
17986 &TTI = *TTI](ArrayRef<int> Mask,
17987 ArrayRef<const TreeEntry *> Entries,
17988 VectorType *VecTy) -> InstructionCost {
17989 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17990 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17991 Mask, Entries.front()->getInterleaveFactor()))
17992 return TTI::TCC_Free;
17993 return ::getShuffleCost(TTI,
17994 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17995 : TTI::SK_PermuteSingleSrc,
17996 VecTy, Mask, CostKind);
17997 };
17998 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17999 InstructionCost FirstShuffleCost = 0;
18000 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18001 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18002 FirstShuffleCost = ShuffleCost;
18003 } else {
18004 // Transform the mask to include only the first entry.
18005 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18006 bool IsIdentity = true;
18007 for (auto [I, Idx] : enumerate(FirstMask)) {
18008 if (Idx >= static_cast<int>(NewVF)) {
18009 Idx = PoisonMaskElem;
18010 } else {
18011 DemandedElts.clearBit(I);
18012 if (Idx != PoisonMaskElem)
18013 IsIdentity &= static_cast<int>(I) == Idx;
18014 }
18015 }
18016 if (!IsIdentity)
18017 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18018 FirstShuffleCost += getScalarizationOverhead(
18019 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18020 /*Extract=*/false, CostKind);
18021 }
18022 InstructionCost SecondShuffleCost = 0;
18023 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18024 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18025 SecondShuffleCost = ShuffleCost;
18026 } else {
18027 // Transform the mask to include only the second entry.
18028 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18029 bool IsIdentity = true;
18030 for (auto [I, Idx] : enumerate(SecondMask)) {
18031 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18032 Idx = PoisonMaskElem;
18033 } else {
18034 DemandedElts.clearBit(I);
18035 if (Idx != PoisonMaskElem) {
18036 Idx -= NewVF;
18037 IsIdentity &= static_cast<int>(I) == Idx;
18038 }
18039 }
18040 }
18041 if (!IsIdentity)
18042 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18043 SecondShuffleCost += getScalarizationOverhead(
18044 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18045 /*Extract=*/false, CostKind);
18046 }
18047 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18048 for (auto [I, Idx] : enumerate(SubMask))
18049 if (Idx == PoisonMaskElem)
18050 DemandedElts.clearBit(I);
18051 InstructionCost BuildVectorCost = getScalarizationOverhead(
18052 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18053 /*Extract=*/false, CostKind);
18054 const TreeEntry *BestEntry = nullptr;
18055 if (FirstShuffleCost < ShuffleCost) {
18056 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18057 std::next(Mask.begin(), (Part + 1) * VL.size()),
18058 [&](int &Idx) {
18059 if (Idx >= static_cast<int>(VF))
18060 Idx = PoisonMaskElem;
18061 });
18062 BestEntry = Entries.front();
18063 ShuffleCost = FirstShuffleCost;
18064 }
18065 if (SecondShuffleCost < ShuffleCost) {
18066 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18067 std::next(Mask.begin(), (Part + 1) * VL.size()),
18068 [&](int &Idx) {
18069 if (Idx < static_cast<int>(VF))
18070 Idx = PoisonMaskElem;
18071 else
18072 Idx -= VF;
18073 });
18074 BestEntry = Entries[1];
18075 ShuffleCost = SecondShuffleCost;
18076 }
18077 if (BuildVectorCost >= ShuffleCost) {
18078 if (BestEntry) {
18079 Entries.clear();
18080 Entries.push_back(BestEntry);
18081 }
18082 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
18083 : TargetTransformInfo::SK_PermuteSingleSrc;
18084 }
18085 }
18086 Entries.clear();
18087 // Clear the corresponding mask elements.
18088 std::fill(std::next(Mask.begin(), Part * VL.size()),
18089 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
18090 return std::nullopt;
18091}
18092
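// Checks whether the gathered scalars of \p TE can be reused from already
// built tree entries: \p VL is split into \p NumParts register-sized slices
// and each slice is handed to isGatherShuffledSingleRegisterEntry, which
// reports the shuffle kind and source entries for that slice (or std::nullopt
// if the slice still has to be gathered from scratch).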
18093 SmallVector<std::optional<TTI::ShuffleKind>>
18094 BoUpSLP::isGatherShuffledEntry(
18095 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
18096 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
18097 bool ForOrder) {
18098 assert(NumParts > 0 && NumParts < VL.size() &&
18099 "Expected positive number of registers.");
18100 Entries.clear();
18101 // No need to check for the topmost gather node.
18102 if (TE == VectorizableTree.front().get() &&
18103 (!GatheredLoadsEntriesFirst.has_value() ||
18104 none_of(ArrayRef(VectorizableTree).drop_front(),
18105 [](const std::unique_ptr<TreeEntry> &TE) {
18106 return !TE->isGather();
18107 })))
18108 return {};
18109 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
18110 // implemented yet.
18111 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
18112 return {};
18113 Mask.assign(VL.size(), PoisonMaskElem);
18114 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
18115 "Expected only single user of the gather node.");
18116 assert(VL.size() % NumParts == 0 &&
18117 "Number of scalars must be divisible by NumParts.");
18118 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
18119 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18120 (TE->Idx == 0 ||
18121 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
18122 isSplat(TE->Scalars) ||
18123 (TE->hasState() &&
18124 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
18125 return {};
18126 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18127 SmallVector<std::optional<TTI::ShuffleKind>> Res;
18128 for (unsigned Part : seq<unsigned>(NumParts)) {
18129 ArrayRef<Value *> SubVL =
18130 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
18131 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18132 std::optional<TTI::ShuffleKind> SubRes =
18133 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
18134 ForOrder);
18135 if (!SubRes)
18136 SubEntries.clear();
18137 Res.push_back(SubRes);
18138 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
18139 SubEntries.front()->getVectorFactor() == VL.size() &&
18140 (SubEntries.front()->isSame(TE->Scalars) ||
18141 SubEntries.front()->isSame(VL))) {
18142 SmallVector<const TreeEntry *> LocalSubEntries;
18143 LocalSubEntries.swap(SubEntries);
18144 Entries.clear();
18145 Res.clear();
18146 std::iota(Mask.begin(), Mask.end(), 0);
18147 // Clear undef scalars.
18148 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
18149 if (isa<PoisonValue>(VL[I]))
18150 Mask[I] = PoisonMaskElem;
18151 Entries.emplace_back(1, LocalSubEntries.front());
18152 Res.push_back(TTI::SK_PermuteSingleSrc);
18153 return Res;
18154 }
18155 }
18156 if (all_of(Res,
18157 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
18158 Entries.clear();
18159 return {};
18160 }
18161 return Res;
18162}
18163
18164InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
18165 Type *ScalarTy) const {
18166 const unsigned VF = VL.size();
18167 auto *VecTy = getWidenedType(ScalarTy, VF);
18168 // Find the cost of inserting/extracting values from the vector.
18169 // Check if the same elements are inserted several times and count them as
18170 // shuffle candidates.
18171 APInt DemandedElements = APInt::getZero(VF);
18172 InstructionCost Cost;
18173 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18174 auto EstimateInsertCost = [&](unsigned I, Value *V) {
18175 DemandedElements.setBit(I);
18176 if (V->getType() != ScalarTy)
18177 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
18178 TTI::CastContextHint::None, CostKind);
18179 };
18180 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
18181 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
18182 for (auto [I, V] : enumerate(VL)) {
18183 // No need to shuffle duplicates for constants.
18184 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
18185 continue;
18186
18187 if (isConstant(V)) {
18188 ConstantShuffleMask[I] = I + VF;
18189 continue;
18190 }
18191 EstimateInsertCost(I, V);
18192 }
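// Illustrative example: for VL = {%a, 3, %b, poison} with ForPoisonSrc ==
// false, elements 0 and 2 are recorded as demanded insertions, the constant 3
// is taken from a materialized constant vector via ConstantShuffleMask =
// {0, 5, 2, 3}, and the poison element is ignored, so the total cost below is
// one two-source shuffle plus the insertion overhead for %a and %b.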
18193 // FIXME: add a cost for constant vector materialization.
18194 bool IsAnyNonUndefConst =
18195 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
18196 // 1. Shuffle input source vector and constant vector.
18197 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18198 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
18199 ConstantShuffleMask);
18200 }
18201
18202 // 2. Insert unique non-constants.
18203 if (!DemandedElements.isZero())
18204 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
18205 /*Insert=*/true,
18206 /*Extract=*/false, CostKind,
18207 ForPoisonSrc && !IsAnyNonUndefConst, VL);
18208 return Cost;
18209}
18210
18211Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18212 auto It = EntryToLastInstruction.find(E);
18213 if (It != EntryToLastInstruction.end())
18214 return *cast<Instruction>(It->second);
18215 Instruction *Res = nullptr;
18216 // Get the basic block this bundle is in. All instructions in the bundle
18217 // should be in this block (except for extractelement-like instructions with
18218 // constant indices or gathered loads or copyables).
18219 Instruction *Front;
18220 unsigned Opcode;
18221 if (E->hasState()) {
18222 Front = E->getMainOp();
18223 Opcode = E->getOpcode();
18224 } else {
18225 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
18226 Opcode = Front->getOpcode();
18227 }
18228 auto *BB = Front->getParent();
18229 assert(
18230 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18231 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18232 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18233 all_of(E->Scalars,
18234 [=](Value *V) -> bool {
18235 if (Opcode == Instruction::GetElementPtr &&
18236 !isa<GetElementPtrInst>(V))
18237 return true;
18238 auto *I = dyn_cast<Instruction>(V);
18239 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18240 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18241 })) &&
18242 "Expected gathered loads or GEPs or instructions from same basic "
18243 "block.");
18244
18245 auto FindLastInst = [&]() {
18246 Instruction *LastInst = Front;
18247 for (Value *V : E->Scalars) {
18248 auto *I = dyn_cast<Instruction>(V);
18249 if (!I)
18250 continue;
18251 if (E->isCopyableElement(I))
18252 continue;
18253 if (LastInst->getParent() == I->getParent()) {
18254 if (LastInst->comesBefore(I))
18255 LastInst = I;
18256 continue;
18257 }
18258 assert(((Opcode == Instruction::GetElementPtr &&
18259 !isa<GetElementPtrInst>(I)) ||
18260 E->State == TreeEntry::SplitVectorize ||
18261 (isVectorLikeInstWithConstOps(LastInst) &&
18262 isVectorLikeInstWithConstOps(I)) ||
18263 (GatheredLoadsEntriesFirst.has_value() &&
18264 Opcode == Instruction::Load && E->isGather() &&
18265 E->Idx < *GatheredLoadsEntriesFirst)) &&
18266 "Expected vector-like or non-GEP in GEP node insts only.");
18267 if (!DT->isReachableFromEntry(LastInst->getParent())) {
18268 LastInst = I;
18269 continue;
18270 }
18271 if (!DT->isReachableFromEntry(I->getParent()))
18272 continue;
18273 auto *NodeA = DT->getNode(LastInst->getParent());
18274 auto *NodeB = DT->getNode(I->getParent());
18275 assert(NodeA && "Should only process reachable instructions");
18276 assert(NodeB && "Should only process reachable instructions");
18277 assert((NodeA == NodeB) ==
18278 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18279 "Different nodes should have different DFS numbers");
18280 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18281 LastInst = I;
18282 }
18283 BB = LastInst->getParent();
18284 return LastInst;
18285 };
18286
18287 auto FindFirstInst = [&]() {
18288 Instruction *FirstInst = Front;
18289 for (Value *V : E->Scalars) {
18290 auto *I = dyn_cast<Instruction>(V);
18291 if (!I)
18292 continue;
18293 if (E->isCopyableElement(I))
18294 continue;
18295 if (FirstInst->getParent() == I->getParent()) {
18296 if (I->comesBefore(FirstInst))
18297 FirstInst = I;
18298 continue;
18299 }
18300 assert(((Opcode == Instruction::GetElementPtr &&
18301 !isa<GetElementPtrInst>(I)) ||
18302 (isVectorLikeInstWithConstOps(FirstInst) &&
18303 isVectorLikeInstWithConstOps(I))) &&
18304 "Expected vector-like or non-GEP in GEP node insts only.");
18305 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
18306 FirstInst = I;
18307 continue;
18308 }
18309 if (!DT->isReachableFromEntry(I->getParent()))
18310 continue;
18311 auto *NodeA = DT->getNode(FirstInst->getParent());
18312 auto *NodeB = DT->getNode(I->getParent());
18313 assert(NodeA && "Should only process reachable instructions");
18314 assert(NodeB && "Should only process reachable instructions");
18315 assert((NodeA == NodeB) ==
18316 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18317 "Different nodes should have different DFS numbers");
18318 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18319 FirstInst = I;
18320 }
18321 return FirstInst;
18322 };
18323
18324 if (E->State == TreeEntry::SplitVectorize) {
18325 Res = FindLastInst();
18326 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
18327 for (auto *E : Entries) {
18328 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
18329 if (!I)
18330 I = &getLastInstructionInBundle(E);
18331 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
18332 Res = I;
18333 }
18334 }
18335 EntryToLastInstruction.try_emplace(E, Res);
18336 return *Res;
18337 }
18338
18339 // Set insertpoint for gathered loads to the very first load.
18340 if (GatheredLoadsEntriesFirst.has_value() &&
18341 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18342 Opcode == Instruction::Load) {
18343 Res = FindFirstInst();
18344 EntryToLastInstruction.try_emplace(E, Res);
18345 return *Res;
18346 }
18347
18348 // Set the insert point to the beginning of the basic block if the entry
18349 // should not be scheduled.
18350 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18351 if (E->isGather())
18352 return nullptr;
18353 // It was found previously that the instructions do not need to be scheduled.
18354 const auto *It = BlocksSchedules.find(BB);
18355 if (It == BlocksSchedules.end())
18356 return nullptr;
18357 for (Value *V : E->Scalars) {
18358 auto *I = dyn_cast<Instruction>(V);
18359 if (!I || isa<PHINode>(I) ||
18360 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
18361 continue;
18362 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
18363 if (Bundles.empty())
18364 continue;
18365 const auto *It = find_if(
18366 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18367 if (It != Bundles.end())
18368 return *It;
18369 }
18370 return nullptr;
18371 };
18372 const ScheduleBundle *Bundle = FindScheduleBundle(E);
18373 if (!E->isGather() && !Bundle) {
18374 if ((Opcode == Instruction::GetElementPtr &&
18375 any_of(E->Scalars,
18376 [](Value *V) {
18377 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
18378 })) ||
18379 (all_of(E->Scalars,
18380 [&](Value *V) {
18381 return isa<PoisonValue>(V) ||
18382 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
18383 E->isCopyableElement(V) ||
18384 (!isVectorLikeInstWithConstOps(V) &&
18385 isUsedOutsideBlock(V));
18386 }) &&
18387 (!E->doesNotNeedToSchedule() ||
18388 any_of(E->Scalars,
18389 [&](Value *V) {
18390 if (!isa<Instruction>(V) ||
18391 (E->hasCopyableElements() && E->isCopyableElement(V)))
18392 return false;
18393 return !areAllOperandsNonInsts(V);
18394 }) ||
18395 none_of(E->Scalars, [&](Value *V) {
18396 if (!isa<Instruction>(V) ||
18397 (E->hasCopyableElements() && E->isCopyableElement(V)))
18398 return false;
18399 return MustGather.contains(V);
18400 }))))
18401 Res = FindLastInst();
18402 else
18403 Res = FindFirstInst();
18404 EntryToLastInstruction.try_emplace(E, Res);
18405 return *Res;
18406 }
18407
18408 // Find the last instruction. The common case should be that BB has been
18409 // scheduled, and the last instruction is VL.back(). So we start with
18410 // VL.back() and iterate over schedule data until we reach the end of the
18411 // bundle. The end of the bundle is marked by null ScheduleData.
18412 if (Bundle) {
18413 assert(!E->isGather() && "Gathered instructions should not be scheduled");
18414 Res = Bundle->getBundle().back()->getInst();
18415 EntryToLastInstruction.try_emplace(E, Res);
18416 return *Res;
18417 }
18418
18419 // LastInst can still be null at this point if there's either not an entry
18420 // for BB in BlocksSchedules or there's no ScheduleData available for
18421 // VL.back(). This can be the case if buildTreeRec aborts for various
18422 // reasons (e.g., the maximum recursion depth is reached, the maximum region
18423 // size is reached, etc.). ScheduleData is initialized in the scheduling
18424 // "dry-run".
18425 //
18426 // If this happens, we can still find the last instruction by brute force. We
18427 // iterate forwards from Front (inclusive) until we either see all
18428 // instructions in the bundle or reach the end of the block. If Front is the
18429 // last instruction in program order, LastInst will be set to Front, and we
18430 // will visit all the remaining instructions in the block.
18431 //
18432 // One of the reasons we exit early from buildTreeRec is to place an upper
18433 // bound on compile-time. Thus, taking an additional compile-time hit here is
18434 // not ideal. However, this should be exceedingly rare since it requires that
18435 // we both exit early from buildTreeRec and that the bundle be out-of-order
18436 // (causing us to iterate all the way to the end of the block).
18437 if (!Res)
18438 Res = FindLastInst();
18439 assert(Res && "Failed to find last instruction in bundle");
18440 EntryToLastInstruction.try_emplace(E, Res);
18441 return *Res;
18442}
18443
18444void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
18445 auto *Front = E->getMainOp();
18446 Instruction *LastInst = &getLastInstructionInBundle(E);
18447 assert(LastInst && "Failed to find last instruction in bundle");
18448 BasicBlock::iterator LastInstIt = LastInst->getIterator();
18449 // If the instruction is a PHI, set the insert point after all the PHIs.
18450 bool IsPHI = isa<PHINode>(LastInst);
18451 if (IsPHI) {
18452 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
18453 if (LastInstIt != LastInst->getParent()->end() &&
18454 LastInstIt->getParent()->isLandingPad())
18455 LastInstIt = std::next(LastInstIt);
18456 }
18457 if (IsPHI ||
18458 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
18459 (E->doesNotNeedToSchedule() ||
18460 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
18461 isUsedOutsideBlock(LastInst)))) ||
18462 (GatheredLoadsEntriesFirst.has_value() &&
18463 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18464 E->getOpcode() == Instruction::Load)) {
18465 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
18466 } else {
18467 // Set the insertion point after the last instruction in the bundle. Set the
18468 // debug location to Front.
18469 Builder.SetInsertPoint(
18470 LastInst->getParent(),
18471 LastInst->getNextNode()->getIterator());
18472 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
18473 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18474 } else {
18475 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
18476 PoisonValue::get(Builder.getPtrTy()),
18477 MaybeAlign());
18478 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18479 eraseInstruction(Res);
18480 LastInstructionToPos.try_emplace(LastInst, Res);
18481 }
18482 }
18483 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
18484}
18485
18486Value *BoUpSLP::gather(
18487 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
18488 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
18489 // List of instructions/lanes from current block and/or the blocks which are
18490 // part of the current loop. These instructions will be inserted at the end to
18491 // make it possible to optimize loops and hoist invariant instructions out of
18492 // the loop body with better chances for success.
18493 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
18494 SmallSet<int, 4> PostponedIndices;
18495 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
18496 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
18497 SmallPtrSet<BasicBlock *, 4> Visited;
18498 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
18499 InsertBB = InsertBB->getSinglePredecessor();
18500 return InsertBB && InsertBB == InstBB;
18501 };
18502 for (int I = 0, E = VL.size(); I < E; ++I) {
18503 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
18504 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
18505 isVectorized(Inst) ||
18506 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
18507 PostponedIndices.insert(I).second)
18508 PostponedInsts.emplace_back(Inst, I);
18509 }
18510
18511 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
18512 Type *Ty) {
18513 Value *Scalar = V;
18514 if (Scalar->getType() != Ty) {
18515 assert(Scalar->getType()->isIntOrIntVectorTy() &&
18516 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
18517 Value *V = Scalar;
18518 if (auto *CI = dyn_cast<CastInst>(Scalar);
18519 isa_and_present<SExtInst, ZExtInst>(CI)) {
18520 Value *Op = CI->getOperand(0);
18521 if (auto *IOp = dyn_cast<Instruction>(Op);
18522 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
18523 V = Op;
18524 }
18525 Scalar = Builder.CreateIntCast(
18526 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
18527 }
18528
18529 Instruction *InsElt;
18530 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
18531 assert(SLPReVec && "FixedVectorType is not expected.");
18532 Vec =
18533 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
18534 auto *II = dyn_cast<Instruction>(Vec);
18535 if (!II)
18536 return Vec;
18537 InsElt = II;
18538 } else {
18539 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
18540 InsElt = dyn_cast<InsertElementInst>(Vec);
18541 if (!InsElt)
18542 return Vec;
18543 }
18544 GatherShuffleExtractSeq.insert(InsElt);
18545 CSEBlocks.insert(InsElt->getParent());
18546 // Add to our 'need-to-extract' list.
18547 if (isa<Instruction>(V)) {
18548 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
18549 const auto *It = find_if(Entries, [&](const TreeEntry *E) {
18550 return !TransformedToGatherNodes.contains(E) &&
18551 !DeletedNodes.contains(E);
18552 });
18553 if (It != Entries.end()) {
18554 // Find which lane we need to extract.
18555 User *UserOp = nullptr;
18556 if (Scalar != V) {
18557 if (auto *SI = dyn_cast<Instruction>(Scalar))
18558 UserOp = SI;
18559 } else {
18560 if (V->getType()->isVectorTy()) {
18561 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
18562 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18563 // Find shufflevector, caused by resize.
18564 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18565 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
18566 if (SV->getOperand(0) == V)
18567 return SV;
18568 if (SV->getOperand(1) == V)
18569 return SV;
18570 }
18571 return nullptr;
18572 };
18573 InsElt = nullptr;
18574 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18575 InsElt = User;
18576 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18577 InsElt = User;
18578 assert(InsElt &&
18579 "Failed to find shufflevector, caused by resize.");
18580 }
18581 }
18582 UserOp = InsElt;
18583 }
18584 if (UserOp) {
18585 unsigned FoundLane = (*It)->findLaneForValue(V);
18586 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
18587 }
18588 }
18589 }
18590 return Vec;
18591 };
18592 auto *VecTy = getWidenedType(ScalarTy, VL.size());
18593 Value *Vec = PoisonValue::get(VecTy);
18594 SmallVector<int> NonConsts;
18595 SmallVector<int> Mask(VL.size());
18596 std::iota(Mask.begin(), Mask.end(), 0);
18597 Value *OriginalRoot = Root;
18598 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
18599 SV && isa<PoisonValue>(SV->getOperand(1)) &&
18600 SV->getOperand(0)->getType() == VecTy) {
18601 Root = SV->getOperand(0);
18602 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18603 }
18604 // Insert constant values at first.
18605 for (int I = 0, E = VL.size(); I < E; ++I) {
18606 if (PostponedIndices.contains(I))
18607 continue;
18608 if (!isConstant(VL[I])) {
18609 NonConsts.push_back(I);
18610 continue;
18611 }
18612 if (isa<PoisonValue>(VL[I]))
18613 continue;
18614 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18615 Mask[I] = I + E;
18616 }
18617 if (Root) {
18618 if (isa<PoisonValue>(Vec)) {
18619 Vec = OriginalRoot;
18620 } else {
18621 Vec = CreateShuffle(Root, Vec, Mask);
18622 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
18623 OI && OI->use_empty() &&
18624 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18625 return TE->VectorizedValue == OI;
18626 }))
18627 eraseInstruction(OI);
18628 }
18629 }
18630 // Insert non-constant values.
18631 for (int I : NonConsts)
18632 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18633 // Append instructions that are (or may be) part of the loop at the end, to
18634 // make it possible to hoist non-loop-based instructions.
18635 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18636 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18637
18638 return Vec;
18639}
18640
18641/// Merges shuffle masks and emits final shuffle instruction, if required. It
18642/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
18643/// when the actual shuffle instruction is generated only if this is actually
18644/// required. Otherwise, the shuffle instruction emission is delayed till the
18645/// end of the process, to reduce the number of emitted instructions and further
18646/// analysis/transformations.
18647/// The class also will look through the previously emitted shuffle instructions
18648/// and properly mark indices in mask as undef.
18649/// For example, given the code
18650/// \code
18651/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
18652/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
18653/// \endcode
18654/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
18655/// look through %s1 and %s2 and emit
18656/// \code
18657/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18658/// \endcode
18659/// instead.
18660/// If 2 operands are of different size, the smallest one will be resized and
18661/// the mask recalculated properly.
18662/// For example, given the code
18663/// \code
18664/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
18665/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
18666/// \endcode
18667/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
18668/// look through %s1 and %s2 and emit
18669/// \code
18670/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18671/// \endcode
18672/// instead.
18673class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
18674 bool IsFinalized = false;
18675 /// Combined mask for all applied operands and masks. It is built during
18676 /// analysis and actual emission of shuffle vector instructions.
18677 SmallVector<int> CommonMask;
18678 /// List of operands for the shuffle vector instruction. It holds at most 2
18679 /// operands; if a 3rd one is going to be added, the first 2 are combined into
18680 /// a shuffle with the \p CommonMask mask, the first operand is set to the
18681 /// resulting shuffle and the second operand is set to the newly added
18682 /// operand. The \p CommonMask is transformed in the proper way after that.
18683 SmallVector<Value *, 2> InVectors;
18684 IRBuilderBase &Builder;
18685 BoUpSLP &R;
18686
18687 class ShuffleIRBuilder {
18688 IRBuilderBase &Builder;
18689 /// Holds all of the instructions that we gathered.
18690 SetVector<Instruction *> &GatherShuffleExtractSeq;
18691 /// A list of blocks that we are going to CSE.
18692 DenseSet<BasicBlock *> &CSEBlocks;
18693 /// Data layout.
18694 const DataLayout &DL;
18695
18696 public:
18697 ShuffleIRBuilder(IRBuilderBase &Builder,
18698 SetVector<Instruction *> &GatherShuffleExtractSeq,
18699 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
18700 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18701 CSEBlocks(CSEBlocks), DL(DL) {}
18702 ~ShuffleIRBuilder() = default;
18703 /// Creates shufflevector for the 2 operands with the given mask.
18704 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
18705 if (V1->getType() != V2->getType()) {
18706 assert(V1->getType()->isIntOrIntVectorTy() &&
18707 V2->getType()->isIntOrIntVectorTy() &&
18708 "Expected integer vector types only.");
18709 if (V1->getType() != V2->getType()) {
18710 if (cast<VectorType>(V2->getType())
18711 ->getElementType()
18712 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
18713 ->getElementType()
18714 ->getIntegerBitWidth())
18715 V2 = Builder.CreateIntCast(
18716 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
18717 else
18718 V1 = Builder.CreateIntCast(
18719 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
18720 }
18721 }
18722 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18723 if (auto *I = dyn_cast<Instruction>(Vec)) {
18724 GatherShuffleExtractSeq.insert(I);
18725 CSEBlocks.insert(I->getParent());
18726 }
18727 return Vec;
18728 }
18729 /// Creates a permutation of the single vector operand with the given mask,
18730 /// if it is not an identity mask.
18731 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18732 if (Mask.empty())
18733 return V1;
18734 unsigned VF = Mask.size();
18735 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18736 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18737 return V1;
18738 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18739 if (auto *I = dyn_cast<Instruction>(Vec)) {
18740 GatherShuffleExtractSeq.insert(I);
18741 CSEBlocks.insert(I->getParent());
18742 }
18743 return Vec;
18744 }
18745 Value *createIdentity(Value *V) { return V; }
18746 Value *createPoison(Type *Ty, unsigned VF) {
18747 return PoisonValue::get(getWidenedType(Ty, VF));
18748 }
18749 /// Resizes the 2 input vectors to matching sizes, if they are not equal
18750 /// yet. The smallest vector is resized to the size of the larger vector.
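/// For example (illustrative), a <2 x i32> operand paired with a <4 x i32>
/// one is widened with the shuffle mask <0, 1, poison, poison>.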
18751 void resizeToMatch(Value *&V1, Value *&V2) {
18752 if (V1->getType() == V2->getType())
18753 return;
18754 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18755 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18756 int VF = std::max(V1VF, V2VF);
18757 int MinVF = std::min(V1VF, V2VF);
18758 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18759 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18760 0);
18761 Value *&Op = MinVF == V1VF ? V1 : V2;
18762 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18763 if (auto *I = dyn_cast<Instruction>(Op)) {
18764 GatherShuffleExtractSeq.insert(I);
18765 CSEBlocks.insert(I->getParent());
18766 }
18767 if (MinVF == V1VF)
18768 V1 = Op;
18769 else
18770 V2 = Op;
18771 }
18772 };
18773
18774 /// Smart shuffle instruction emission, walks through shuffles trees and
18775 /// tries to find the best matching vector for the actual shuffle
18776 /// instruction.
18777 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18778 assert(V1 && "Expected at least one vector value.");
18779 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18780 R.CSEBlocks, *R.DL);
18781 return BaseShuffleAnalysis::createShuffle<Value *>(
18782 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18783 }
18784
18785 /// Cast value \p V to the vector type with the same number of elements, but
18786 /// the base type \p ScalarTy.
18787 Value *castToScalarTyElem(Value *V,
18788 std::optional<bool> IsSigned = std::nullopt) {
18789 auto *VecTy = cast<VectorType>(V->getType());
18790 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18791 if (VecTy->getElementType() == ScalarTy->getScalarType())
18792 return V;
18793 return Builder.CreateIntCast(
18794 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18795 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18796 }
18797
18798 Value *getVectorizedValue(const TreeEntry &E) {
18799 Value *Vec = E.VectorizedValue;
18800 if (!Vec->getType()->isIntOrIntVectorTy())
18801 return Vec;
18802 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18803 return !isa<PoisonValue>(V) &&
18804 !isKnownNonNegative(
18805 V, SimplifyQuery(*R.DL));
18806 }));
18807 }
18808
18809public:
18810 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18811 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18812
18813 /// Adjusts extractelements after reusing them.
18814 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18815 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18816 unsigned NumParts, bool &UseVecBaseAsInput) {
18817 UseVecBaseAsInput = false;
18818 SmallPtrSet<Value *, 4> UniqueBases;
18819 Value *VecBase = nullptr;
18820 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18821 if (!E->ReorderIndices.empty()) {
18822 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18823 E->ReorderIndices.end());
18824 reorderScalars(VL, ReorderMask);
18825 }
18826 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18827 int Idx = Mask[I];
18828 if (Idx == PoisonMaskElem)
18829 continue;
18830 auto *EI = cast<ExtractElementInst>(VL[I]);
18831 VecBase = EI->getVectorOperand();
18832 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18833 VecBase = TEs.front()->VectorizedValue;
18834 assert(VecBase && "Expected vectorized value.");
18835 UniqueBases.insert(VecBase);
18836 // If the only use is vectorized, the extractelement itself can be
18837 // deleted.
18838 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18839 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
18840 !R.isVectorized(EI) &&
18841 count_if(E->Scalars, [&](Value *V) { return V == EI; }) !=
18842 count_if(E->UserTreeIndex.UserTE->Scalars,
18843 [&](Value *V) { return V == EI; })) ||
18844 (NumParts != 1 && count(VL, EI) > 1) ||
18845 any_of(EI->users(), [&](User *U) {
18846 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18847 return UTEs.empty() || UTEs.size() > 1 ||
18848 any_of(UTEs,
18849 [&](const TreeEntry *TE) {
18850 return R.DeletedNodes.contains(TE) ||
18851 R.TransformedToGatherNodes.contains(TE);
18852 }) ||
18853 (isa<GetElementPtrInst>(U) &&
18854 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18855 (!UTEs.empty() &&
18856 count_if(R.VectorizableTree,
18857 [&](const std::unique_ptr<TreeEntry> &TE) {
18858 return TE->UserTreeIndex.UserTE ==
18859 UTEs.front() &&
18860 is_contained(VL, EI);
18861 }) != 1);
18862 }))
18863 continue;
18864 R.eraseInstruction(EI);
18865 }
18866 if (NumParts == 1 || UniqueBases.size() == 1) {
18867 assert(VecBase && "Expected vectorized value.");
18868 return castToScalarTyElem(VecBase);
18869 }
18870 UseVecBaseAsInput = true;
18871 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18872 for (auto [I, Idx] : enumerate(Mask))
18873 if (Idx != PoisonMaskElem)
18874 Idx = I;
18875 };
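// For example (illustrative), TransformToIdentity turns the sub-mask
// {poison, 5, 2, poison} into {poison, 1, 2, poison}: every element already
// placed by the freshly built sub-vector now refers to its own lane.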
18876 // Perform a multi-register vector shuffle, joining the parts into a single
18877 // virtual long vector.
18878 // Need to shuffle each part independently and then insert all these parts
18879 // into a long virtual vector register, forming the original vector.
18880 Value *Vec = nullptr;
18881 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18882 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18883 for (unsigned Part : seq<unsigned>(NumParts)) {
18884 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18885 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18886 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18887 constexpr int MaxBases = 2;
18888 SmallVector<Value *, MaxBases> Bases(MaxBases);
18889 auto VLMask = zip(SubVL, SubMask);
18890 const unsigned VF = std::accumulate(
18891 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18892 if (std::get<1>(D) == PoisonMaskElem)
18893 return S;
18894 Value *VecOp =
18895 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18896 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18897 !TEs.empty())
18898 VecOp = TEs.front()->VectorizedValue;
18899 assert(VecOp && "Expected vectorized value.");
18900 const unsigned Size =
18901 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18902 return std::max(S, Size);
18903 });
18904 for (const auto [V, I] : VLMask) {
18905 if (I == PoisonMaskElem)
18906 continue;
18907 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18908 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18909 VecOp = TEs.front()->VectorizedValue;
18910 assert(VecOp && "Expected vectorized value.");
18911 VecOp = castToScalarTyElem(VecOp);
18912 Bases[I / VF] = VecOp;
18913 }
18914 if (!Bases.front())
18915 continue;
18916 Value *SubVec;
18917 if (Bases.back()) {
18918 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18919 TransformToIdentity(SubMask);
18920 } else {
18921 SubVec = Bases.front();
18922 }
18923 if (!Vec) {
18924 Vec = SubVec;
18925 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18926 [&](unsigned P) {
18927 ArrayRef<int> SubMask =
18928 Mask.slice(P * SliceSize,
18929 getNumElems(Mask.size(),
18930 SliceSize, P));
18931 return all_of(SubMask, [](int Idx) {
18932 return Idx == PoisonMaskElem;
18933 });
18934 })) &&
18935 "Expected first part or all previous parts masked.");
18936 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18937 } else {
18938 unsigned NewVF =
18939 cast<FixedVectorType>(Vec->getType())->getNumElements();
18940 if (Vec->getType() != SubVec->getType()) {
18941 unsigned SubVecVF =
18942 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18943 NewVF = std::max(NewVF, SubVecVF);
18944 }
18945 // Adjust SubMask.
18946 for (int &Idx : SubMask)
18947 if (Idx != PoisonMaskElem)
18948 Idx += NewVF;
18949 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18950 Vec = createShuffle(Vec, SubVec, VecMask);
18951 TransformToIdentity(VecMask);
18952 }
18953 }
18954 copy(VecMask, Mask.begin());
18955 return Vec;
18956 }
18957 /// Checks if the specified entry \p E needs to be delayed because of its
18958 /// dependency nodes.
18959 std::optional<Value *>
18960 needToDelay(const TreeEntry *E,
18961 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18962 // No need to delay emission if all deps are ready.
18963 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18964 return all_of(
18965 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18966 }))
18967 return std::nullopt;
18968 // Postpone the gather emission; it will be emitted after the end of the
18969 // process to keep the correct order.
18970 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18971 return Builder.CreateAlignedLoad(
18972 ResVecTy,
18973 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18974 MaybeAlign());
18975 }
18976 /// Reset the builder to handle a perfect diamond match.
18977 void resetForSameNode() {
18978 IsFinalized = false;
18979 CommonMask.clear();
18980 InVectors.clear();
18981 }
18982 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
18983 /// shuffling.
18984 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18985 Value *V1 = getVectorizedValue(E1);
18986 Value *V2 = getVectorizedValue(E2);
18987 add(V1, V2, Mask);
18988 }
18989 /// Adds a single input vector (in the form of a tree entry) and the mask for
18990 /// its shuffling.
18991 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18992 Value *V1 = getVectorizedValue(E1);
18993 add(V1, Mask);
18994 }
18995 /// Adds 2 input vectors and the mask for their shuffling.
18996 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18997 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18998 assert(isa<FixedVectorType>(V1->getType()) &&
18999 isa<FixedVectorType>(V2->getType()) &&
19000 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
19001 V1 = castToScalarTyElem(V1);
19002 V2 = castToScalarTyElem(V2);
19003 if (InVectors.empty()) {
19004 InVectors.push_back(V1);
19005 InVectors.push_back(V2);
19006 CommonMask.assign(Mask.begin(), Mask.end());
19007 return;
19008 }
19009 Value *Vec = InVectors.front();
19010 if (InVectors.size() == 2) {
19011 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19012 transformMaskAfterShuffle(CommonMask, CommonMask);
19013 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
19014 Mask.size()) {
19015 Vec = createShuffle(Vec, nullptr, CommonMask);
19016 transformMaskAfterShuffle(CommonMask, CommonMask);
19017 }
19018 V1 = createShuffle(V1, V2, Mask);
19019 unsigned VF = std::max(getVF(V1), getVF(Vec));
19020 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19021 if (Mask[Idx] != PoisonMaskElem)
19022 CommonMask[Idx] = Idx + VF;
19023 InVectors.front() = Vec;
19024 if (InVectors.size() == 2)
19025 InVectors.back() = V1;
19026 else
19027 InVectors.push_back(V1);
19028 }
19029 /// Adds one more input vector and the mask for its shuffling.
19030 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
19031 assert(isa<FixedVectorType>(V1->getType()) &&
19032 "castToScalarTyElem expects V1 to be FixedVectorType");
19033 V1 = castToScalarTyElem(V1);
19034 if (InVectors.empty()) {
19035 InVectors.push_back(V1);
19036 CommonMask.assign(Mask.begin(), Mask.end());
19037 return;
19038 }
19039 const auto *It = find(InVectors, V1);
19040 if (It == InVectors.end()) {
19041 if (InVectors.size() == 2 ||
19042 InVectors.front()->getType() != V1->getType()) {
19043 Value *V = InVectors.front();
19044 if (InVectors.size() == 2) {
19045 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19046 transformMaskAfterShuffle(CommonMask, CommonMask);
19047 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
19048 CommonMask.size()) {
19049 V = createShuffle(InVectors.front(), nullptr, CommonMask);
19050 transformMaskAfterShuffle(CommonMask, CommonMask);
19051 }
19052 unsigned VF = std::max(CommonMask.size(), Mask.size());
19053 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19054 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
19055 CommonMask[Idx] = V->getType() != V1->getType()
19056 ? Idx + VF
19057 : Mask[Idx] + getVF(V1);
19058 if (V->getType() != V1->getType())
19059 V1 = createShuffle(V1, nullptr, Mask);
19060 InVectors.front() = V;
19061 if (InVectors.size() == 2)
19062 InVectors.back() = V1;
19063 else
19064 InVectors.push_back(V1);
19065 return;
19066 }
19067 // Check if the second vector is required, i.e. whether it provides
19068 // elements that are not already used from the first one.
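// E.g. (illustrative) with CommonMask == {0, 1, poison, poison} and
// Mask == {poison, poison, 0, 1}, lanes 2 and 3 are covered only by the new
// vector, so it is recorded as the second input.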
19069 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19070 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
19071 InVectors.push_back(V1);
19072 break;
19073 }
19074 }
19075 unsigned VF = 0;
19076 for (Value *V : InVectors)
19077 VF = std::max(VF, getVF(V));
19078 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19079 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
19080 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19081 }
19082 /// Adds one more input vector and the element order that defines its shuffling.
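/// For example (illustrative), an order of {2, 0, 1} is inverted into the
/// shuffle mask {1, 2, 0} before being added.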
19083 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
19084 SmallVector<int> NewMask;
19085 inversePermutation(Order, NewMask);
19086 add(V1, NewMask);
19087 }
19088 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
19089 Value *Root = nullptr) {
19090 return R.gather(VL, Root, ScalarTy,
19091 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
19092 return createShuffle(V1, V2, Mask);
19093 });
19094 }
19095 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
19096 /// Finalize emission of the shuffles.
19097 /// \param Action the action (if any) to be performed before final applying of
19098 /// the \p ExtMask mask.
19099 Value *finalize(
19100 ArrayRef<int> ExtMask,
19101 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19102 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
19103 function_ref<void(Value *&, SmallVectorImpl<int> &,
19104 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
19105 Action = {}) {
19106 IsFinalized = true;
19107 if (Action) {
19108 Value *Vec = InVectors.front();
19109 if (InVectors.size() == 2) {
19110 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19111 InVectors.pop_back();
19112 } else {
19113 Vec = createShuffle(Vec, nullptr, CommonMask);
19114 }
19115 transformMaskAfterShuffle(CommonMask, CommonMask);
19116 assert(VF > 0 &&
19117 "Expected vector length for the final value before action.");
19118 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
19119 if (VecVF < VF) {
19120 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
19121 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
19122 Vec = createShuffle(Vec, nullptr, ResizeMask);
19123 }
19124 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
19125 return createShuffle(V1, V2, Mask);
19126 });
19127 InVectors.front() = Vec;
19128 }
19129 if (!SubVectors.empty()) {
19130 Value *Vec = InVectors.front();
19131 if (InVectors.size() == 2) {
19132 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19133 InVectors.pop_back();
19134 } else {
19135 Vec = createShuffle(Vec, nullptr, CommonMask);
19136 }
19137 transformMaskAfterShuffle(CommonMask, CommonMask);
19138 auto CreateSubVectors = [&](Value *Vec,
19139 SmallVectorImpl<int> &CommonMask) {
19140 for (auto [E, Idx] : SubVectors) {
19141 Value *V = getVectorizedValue(*E);
19142 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
19143 // Use the scalar version of ScalarTy to correctly handle shuffles
19144 // for revectorization. The revectorization mode operates on vectors,
19145 // but here we need to operate on the scalars, because the masks were
19146 // already transformed for the vector elements and we don't need to do
19147 // this transformation again.
19148 Type *OrigScalarTy = ScalarTy;
19149 ScalarTy = ScalarTy->getScalarType();
19150 Vec = createInsertVector(
19151 Builder, Vec, V, InsertionIndex,
19152 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
19153 _3));
19154 ScalarTy = OrigScalarTy;
19155 if (!CommonMask.empty()) {
19156 std::iota(std::next(CommonMask.begin(), Idx),
19157 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
19158 Idx);
19159 }
19160 }
19161 return Vec;
19162 };
19163 if (SubVectorsMask.empty()) {
19164 Vec = CreateSubVectors(Vec, CommonMask);
19165 } else {
19166 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
19167 copy(SubVectorsMask, SVMask.begin());
19168 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
19169 if (I2 != PoisonMaskElem) {
19170 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
19171 I1 = I2 + CommonMask.size();
19172 }
19173 }
19174 Value *InsertVec =
19175 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
19176 Vec = createShuffle(InsertVec, Vec, SVMask);
19177 transformMaskAfterShuffle(CommonMask, SVMask);
19178 }
19179 InVectors.front() = Vec;
19180 }
19181
19182 if (!ExtMask.empty()) {
19183 if (CommonMask.empty()) {
19184 CommonMask.assign(ExtMask.begin(), ExtMask.end());
19185 } else {
19186 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
19187 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
19188 if (ExtMask[I] == PoisonMaskElem)
19189 continue;
19190 NewMask[I] = CommonMask[ExtMask[I]];
19191 }
19192 CommonMask.swap(NewMask);
19193 }
19194 }
19195 if (CommonMask.empty()) {
19196 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
19197 return InVectors.front();
19198 }
19199 if (InVectors.size() == 2)
19200 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19201 return createShuffle(InVectors.front(), nullptr, CommonMask);
19202 }
19203
19204 ~ShuffleInstructionBuilder() {
19205 assert((IsFinalized || CommonMask.empty()) &&
19206 "Shuffle construction must be finalized.");
19207 }
19208};
19209
19210Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
19211 return vectorizeTree(getOperandEntry(E, NodeIdx));
19212}
19213
19214template <typename BVTy, typename ResTy, typename... Args>
19215ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
19216 Args &...Params) {
19217 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
19218 "Expected gather node.");
19219 unsigned VF = E->getVectorFactor();
19220
19221 bool NeedFreeze = false;
19222 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
19223 // Clear values, to be replaced by insertvector instructions.
19224 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19225 for_each(MutableArrayRef(GatheredScalars)
19226 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
19227 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
19228 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19229 E->CombinedEntriesWithIndices.size());
19230 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
19231 [&](const auto &P) {
19232 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19233 });
19234 // Build a mask out of the reorder indices and reorder scalars per this
19235 // mask.
19236 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
19237 E->ReorderIndices.end());
19238 if (!ReorderMask.empty())
19239 reorderScalars(GatheredScalars, ReorderMask);
19240 SmallVector<int> SubVectorsMask;
19241 inversePermutation(E->ReorderIndices, SubVectorsMask);
19242 // Transform non-clustered elements in the mask to poison (-1).
19243 // "Clustered" operations will be reordered using this mask later.
19244 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
19245 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
19246 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
19247 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
19248 } else {
19249 SubVectorsMask.clear();
19250 }
19251 SmallVector<Value *> StoredGS(GatheredScalars);
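// Detects a splat (with non-poison undefs) whose value already feeds the same
// user node through its other operand; in that case the relevant slice of the
// mask is rewritten in place so the existing vector is reused instead of
// emitting a separate broadcast.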
19252 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
19253 unsigned I, unsigned SliceSize,
19254 bool IsNotPoisonous) {
19255 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
19256 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19257 }))
19258 return false;
19259 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19260 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19261 if (UserTE->getNumOperands() != 2)
19262 return false;
19263 if (!IsNotPoisonous) {
19264 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
19265 [=](const std::unique_ptr<TreeEntry> &TE) {
19266 return TE->UserTreeIndex.UserTE == UserTE &&
19267 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19268 });
19269 if (It == VectorizableTree.end())
19270 return false;
19271 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
19272 if (!(*It)->ReorderIndices.empty()) {
19273 inversePermutation((*It)->ReorderIndices, ReorderMask);
19274 reorderScalars(GS, ReorderMask);
19275 }
19276 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
19277 Value *V0 = std::get<0>(P);
19278 Value *V1 = std::get<1>(P);
19279 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
19280 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
19281 is_contained(E->Scalars, V1));
19282 }))
19283 return false;
19284 }
19285 int Idx;
19286 if ((Mask.size() < InputVF &&
19287 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
19288 Idx == 0) ||
19289 (Mask.size() == InputVF &&
19290 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
19291 std::iota(
19292 std::next(Mask.begin(), I * SliceSize),
19293 std::next(Mask.begin(),
19294 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19295 0);
19296 } else {
19297 unsigned IVal =
19298 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
19299 std::fill(
19300 std::next(Mask.begin(), I * SliceSize),
19301 std::next(Mask.begin(),
19302 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19303 IVal);
19304 }
19305 return true;
19306 };
19307 BVTy ShuffleBuilder(ScalarTy, Params...);
19308 ResTy Res = ResTy();
19309 SmallVector<int> Mask;
19310 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
19311 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
19312 Value *ExtractVecBase = nullptr;
19313 bool UseVecBaseAsInput = false;
19314 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
19315 SmallVector<SmallVector<const TreeEntry *>> Entries;
19316 Type *OrigScalarTy = GatheredScalars.front()->getType();
19317 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
19318 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
19319 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
19320 // Check for gathered extracts.
19321 bool Resized = false;
19322 ExtractShuffles =
19323 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
19324 if (!ExtractShuffles.empty()) {
19325 SmallVector<const TreeEntry *> ExtractEntries;
19326 for (auto [Idx, I] : enumerate(ExtractMask)) {
19327 if (I == PoisonMaskElem)
19328 continue;
19329 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
19330 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
19331 !TEs.empty())
19332 ExtractEntries.append(TEs.begin(), TEs.end());
19333 }
19334 if (std::optional<ResTy> Delayed =
19335 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19336 // Delay emission of gathers which are not ready yet.
19337 PostponedGathers.insert(E);
19338 // Postpone gather emission, will be emitted after the end of the
19339 // process to keep correct order.
19340 return *Delayed;
19341 }
19342 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
19343 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19344 ExtractVecBase = VecBase;
19345 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
19346 if (VF == VecBaseTy->getNumElements() &&
19347 GatheredScalars.size() != VF) {
19348 Resized = true;
19349 GatheredScalars.append(VF - GatheredScalars.size(),
19350 PoisonValue::get(OrigScalarTy));
19351 NumParts =
19352 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
19353 }
19354 }
19355 }
19356 // Gather extracts after we check for full matched gathers only.
19357 if (!ExtractShuffles.empty() || !E->hasState() ||
19358 E->getOpcode() != Instruction::Load ||
19359 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19360 any_of(E->Scalars, IsaPred<LoadInst>)) &&
19361 any_of(E->Scalars,
19362 [this](Value *V) {
19363 return isa<LoadInst>(V) && isVectorized(V);
19364 })) ||
19365 (E->hasState() && E->isAltShuffle()) ||
19366 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
19367 isSplat(E->Scalars) ||
19368 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
19369 GatherShuffles =
19370 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19371 }
19372 if (!GatherShuffles.empty()) {
19373 if (std::optional<ResTy> Delayed =
19374 ShuffleBuilder.needToDelay(E, Entries)) {
19375 // Delay emission of gathers which are not ready yet.
19376 PostponedGathers.insert(E);
19377 // Postpone gather emission, will be emitted after the end of the
19378 // process to keep correct order.
19379 return *Delayed;
19380 }
19381 if (GatherShuffles.size() == 1 &&
19382 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
19383 Entries.front().front()->isSame(E->Scalars)) {
19384 // Perfect match in the graph, will reuse the previously vectorized
19385 // node. Cost is 0.
19386 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
19387 << shortBundleName(E->Scalars, E->Idx) << ".\n");
19388 // Restore the mask for previous partially matched values.
19389 Mask.resize(E->Scalars.size());
19390 const TreeEntry *FrontTE = Entries.front().front();
19391 if (FrontTE->ReorderIndices.empty() &&
19392 ((FrontTE->ReuseShuffleIndices.empty() &&
19393 E->Scalars.size() == FrontTE->Scalars.size()) ||
19394 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19395 std::iota(Mask.begin(), Mask.end(), 0);
19396 } else {
19397 for (auto [I, V] : enumerate(E->Scalars)) {
19398 if (isa<PoisonValue>(V)) {
19399 Mask[I] = PoisonMaskElem;
19400 continue;
19401 }
19402 Mask[I] = FrontTE->findLaneForValue(V);
19403 }
19404 }
19405 // Reset the builder(s) to correctly handle perfect diamond matched
19406 // nodes.
19407 ShuffleBuilder.resetForSameNode();
19408 ShuffleBuilder.add(*FrontTE, Mask);
19409 // Full matched entry found, no need to insert subvectors.
19410 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19411 return Res;
19412 }
19413 if (!Resized) {
19414 if (GatheredScalars.size() != VF &&
19415 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
19416 return any_of(TEs, [&](const TreeEntry *TE) {
19417 return TE->getVectorFactor() == VF;
19418 });
19419 }))
19420 GatheredScalars.append(VF - GatheredScalars.size(),
19421 PoisonValue::get(OrigScalarTy));
19422 }
19423 // Remove shuffled elements from list of gathers.
19424 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19425 if (Mask[I] != PoisonMaskElem)
19426 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19427 }
19428 }
19429 }
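// Packs the scalars of one buildvector: repeated values are deduplicated via
// ReuseMask, undefs become poison (or reuse a broadcast lane of a non-poison
// scalar), and NeedFreeze is set when a freeze of the result is required.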
19430 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
19431 SmallVectorImpl<int> &ReuseMask,
19432 bool IsRootPoison) {
19433 // For splats we can emit broadcasts instead of gathers, so try to find
19434 // such sequences.
19435 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
19436 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
19437 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
19438 SmallVector<int> UndefPos;
19439 DenseMap<Value *, unsigned> UniquePositions;
19440 // Gather unique non-const values and all constant values.
19441 // For repeated values, just shuffle them.
19442 int NumNonConsts = 0;
19443 int SinglePos = 0;
19444 for (auto [I, V] : enumerate(Scalars)) {
19445 if (isa<UndefValue>(V)) {
19446 if (!isa<PoisonValue>(V)) {
19447 ReuseMask[I] = I;
19448 UndefPos.push_back(I);
19449 }
19450 continue;
19451 }
19452 if (isConstant(V)) {
19453 ReuseMask[I] = I;
19454 continue;
19455 }
19456 ++NumNonConsts;
19457 SinglePos = I;
19458 Value *OrigV = V;
19459 Scalars[I] = PoisonValue::get(OrigScalarTy);
19460 if (IsSplat) {
19461 Scalars.front() = OrigV;
19462 ReuseMask[I] = 0;
19463 } else {
19464 const auto Res = UniquePositions.try_emplace(OrigV, I);
19465 Scalars[Res.first->second] = OrigV;
19466 ReuseMask[I] = Res.first->second;
19467 }
19468 }
19469 if (NumNonConsts == 1) {
19470 // Restore single insert element.
19471 if (IsSplat) {
19472 ReuseMask.assign(VF, PoisonMaskElem);
19473 std::swap(Scalars.front(), Scalars[SinglePos]);
19474 if (!UndefPos.empty() && UndefPos.front() == 0)
19475 Scalars.front() = UndefValue::get(OrigScalarTy);
19476 }
19477 ReuseMask[SinglePos] = SinglePos;
19478 } else if (!UndefPos.empty() && IsSplat) {
19479 // For undef values, try to replace them with the simple broadcast.
19480 // We can do it if the broadcasted value is guaranteed to be
19481 // non-poisonous, or by freezing the incoming scalar value first.
19482 auto *It = find_if(Scalars, [this, E](Value *V) {
19483 return !isa<UndefValue>(V) &&
19484 (isGuaranteedNotToBePoison(V, AC) ||
19485 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
19486 // Check if the value already used in the same operation in
19487 // one of the nodes already.
19488 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19489 is_contained(E->UserTreeIndex.UserTE->Scalars,
19490 U.getUser());
19491 })));
19492 });
19493 if (It != Scalars.end()) {
19494 // Replace undefs by the non-poisoned scalars and emit broadcast.
19495 int Pos = std::distance(Scalars.begin(), It);
19496 for (int I : UndefPos) {
19497 // Set the undef position to the non-poisoned scalar.
19498 ReuseMask[I] = Pos;
19499 // Replace the undef by poison; in the mask it has already been
19500 // replaced by the non-poisoned scalar.
19501 if (I != Pos)
19502 Scalars[I] = PoisonValue::get(OrigScalarTy);
19503 }
19504 } else {
19505 // Replace undefs by the poisons, emit broadcast and then emit
19506 // freeze.
19507 for (int I : UndefPos) {
19508 ReuseMask[I] = PoisonMaskElem;
19509 if (isa<UndefValue>(Scalars[I]))
19510 Scalars[I] = PoisonValue::get(OrigScalarTy);
19511 }
19512 NeedFreeze = true;
19513 }
19514 }
19515 };
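// Combine the three possible sources of the gathered vector: shuffles of
// extractelement inputs, shuffles of already-vectorized tree entries, and a
// buildvector of the remaining constant/non-constant scalars.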
19516 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
19517 bool IsNonPoisoned = true;
19518 bool IsUsedInExpr = true;
19519 Value *Vec1 = nullptr;
19520 if (!ExtractShuffles.empty()) {
19521 // Gather of extractelements can be represented as just a shuffle of
19522 // a single/two vectors the scalars are extracted from.
19523 // Find input vectors.
19524 Value *Vec2 = nullptr;
19525 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19526 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
19527 ExtractMask[I] = PoisonMaskElem;
19528 }
19529 if (UseVecBaseAsInput) {
19530 Vec1 = ExtractVecBase;
19531 } else {
19532 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19533 if (ExtractMask[I] == PoisonMaskElem)
19534 continue;
19535 if (isa<UndefValue>(StoredGS[I]))
19536 continue;
19537 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
19538 Value *VecOp = EI->getVectorOperand();
19539 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
19540 !TEs.empty() && TEs.front()->VectorizedValue)
19541 VecOp = TEs.front()->VectorizedValue;
19542 if (!Vec1) {
19543 Vec1 = VecOp;
19544 } else if (Vec1 != VecOp) {
19545 assert((!Vec2 || Vec2 == VecOp) &&
19546 "Expected only 1 or 2 vectors shuffle.");
19547 Vec2 = VecOp;
19548 }
19549 }
19550 }
19551 if (Vec2) {
19552 IsUsedInExpr = false;
19553 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
19554 isGuaranteedNotToBePoison(Vec2, AC);
19555 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19556 } else if (Vec1) {
19557 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
19558 IsUsedInExpr &= FindReusedSplat(
19559 ExtractMask,
19560 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
19561 ExtractMask.size(), IsNotPoisonedVec);
19562 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
19563 IsNonPoisoned &= IsNotPoisonedVec;
19564 } else {
19565 IsUsedInExpr = false;
19566 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
19567 /*ForExtracts=*/true);
19568 }
19569 }
19570 if (!GatherShuffles.empty()) {
19571 unsigned SliceSize =
19572 getPartNumElems(E->Scalars.size(),
19573 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
19574 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19575 for (const auto [I, TEs] : enumerate(Entries)) {
19576 if (TEs.empty()) {
19577 assert(!GatherShuffles[I] &&
19578 "No shuffles with empty entries list expected.");
19579 continue;
19580 }
19581 assert((TEs.size() == 1 || TEs.size() == 2) &&
19582 "Expected shuffle of 1 or 2 entries.");
19583 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
19584 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
19585 VecMask.assign(VecMask.size(), PoisonMaskElem);
19586 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19587 if (TEs.size() == 1) {
19588 bool IsNotPoisonedVec =
19589 TEs.front()->VectorizedValue
19590 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
19591 : true;
19592 IsUsedInExpr &=
19593 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19594 SliceSize, IsNotPoisonedVec);
19595 ShuffleBuilder.add(*TEs.front(), VecMask);
19596 IsNonPoisoned &= IsNotPoisonedVec;
19597 } else {
19598 IsUsedInExpr = false;
19599 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19600 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19601 IsNonPoisoned &=
19602 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
19603 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
19604 }
19605 }
19606 }
19607 // Try to figure out best way to combine values: build a shuffle and insert
19608 // elements or just build several shuffles.
19609 // Insert non-constant scalars.
19610 SmallVector<Value *> NonConstants(GatheredScalars);
19611 int EMSz = ExtractMask.size();
19612 int MSz = Mask.size();
19613 // Try to build constant vector and shuffle with it only if currently we
19614 // have a single permutation and more than 1 scalar constants.
19615 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19616 bool IsIdentityShuffle =
19617 ((UseVecBaseAsInput ||
19618 all_of(ExtractShuffles,
19619 [](const std::optional<TTI::ShuffleKind> &SK) {
19620 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19621 TTI::SK_PermuteSingleSrc;
19622 })) &&
19623 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19624 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
19625 (!GatherShuffles.empty() &&
19626 all_of(GatherShuffles,
19627 [](const std::optional<TTI::ShuffleKind> &SK) {
19628 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19629 TTI::SK_PermuteSingleSrc;
19630 }) &&
19631 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19632 ShuffleVectorInst::isIdentityMask(Mask, MSz));
19633 bool EnoughConstsForShuffle =
19634 IsSingleShuffle &&
19635 (none_of(GatheredScalars,
19636 [](Value *V) {
19637 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19638 }) ||
19639 any_of(GatheredScalars,
19640 [](Value *V) {
19641 return isa<Constant>(V) && !isa<UndefValue>(V);
19642 })) &&
19643 (!IsIdentityShuffle ||
19644 (GatheredScalars.size() == 2 &&
19645 any_of(GatheredScalars,
19646 [](Value *V) { return !isa<UndefValue>(V); })) ||
19647 count_if(GatheredScalars, [](Value *V) {
19648 return isa<Constant>(V) && !isa<PoisonValue>(V);
19649 }) > 1);
19650 // The NonConstants array contains just the non-constant values; GatheredScalars
19651 // contains only constants, used to build the final vector that is then shuffled.
19652 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19653 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19654 NonConstants[I] = PoisonValue::get(OrigScalarTy);
19655 else
19656 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19657 }
19658 // Generate constants for final shuffle and build a mask for them.
19659 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
19660 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19661 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19662 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
19663 ShuffleBuilder.add(BV, BVMask);
19664 }
19665 if (all_of(NonConstants, [=](Value *V) {
19666 return isa<PoisonValue>(V) ||
19667 (IsSingleShuffle && ((IsIdentityShuffle &&
19668 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
19669 }))
19670 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19671 SubVectorsMask);
19672 else
19673 Res = ShuffleBuilder.finalize(
19674 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
19675 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
19676 bool IsSplat = isSplat(NonConstants);
19677 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19678 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
19679 auto CheckIfSplatIsProfitable = [&]() {
19680 // Estimate the cost of splatting + shuffle and compare with
19681 // insert + shuffle.
19682 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19683 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19684 if (isa<ExtractElementInst>(V) || isVectorized(V))
19685 return false;
19686 InstructionCost SplatCost = TTI->getVectorInstrCost(
19687 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
19688 PoisonValue::get(VecTy), V);
19689 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19690 for (auto [Idx, I] : enumerate(BVMask))
19691 if (I != PoisonMaskElem)
19692 NewMask[Idx] = Mask.size();
19693 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19694 NewMask, CostKind);
19695 InstructionCost BVCost = TTI->getVectorInstrCost(
19696 Instruction::InsertElement, VecTy, CostKind,
19697 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
19698 // Shuffle required?
19699 if (count(BVMask, PoisonMaskElem) <
19700 static_cast<int>(BVMask.size() - 1)) {
19701 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19702 for (auto [Idx, I] : enumerate(BVMask))
19703 if (I != PoisonMaskElem)
19704 NewMask[Idx] = I;
19705 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19706 VecTy, NewMask, CostKind);
19707 }
19708 return SplatCost <= BVCost;
19709 };
19710 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19711 for (auto [Idx, I] : enumerate(BVMask))
19712 if (I != PoisonMaskElem)
19713 Mask[Idx] = I;
19714 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19715 } else {
19716 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19717 SmallVector<Value *> Values(NonConstants.size(),
19718 PoisonValue::get(ScalarTy));
19719 Values[0] = V;
19720 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
19721 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
19722 transform(BVMask, SplatMask.begin(), [](int I) {
19723 return I == PoisonMaskElem ? PoisonMaskElem : 0;
19724 });
19725 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
19726 BV = CreateShuffle(BV, nullptr, SplatMask);
19727 for (auto [Idx, I] : enumerate(BVMask))
19728 if (I != PoisonMaskElem)
19729 Mask[Idx] = BVMask.size() + Idx;
19730 Vec = CreateShuffle(Vec, BV, Mask);
19731 for (auto [Idx, I] : enumerate(Mask))
19732 if (I != PoisonMaskElem)
19733 Mask[Idx] = Idx;
19734 }
19735 });
19736 } else if (!allConstant(GatheredScalars)) {
19737 // Gather unique scalars and all constants.
19738 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
19739 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
19740 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19741 ShuffleBuilder.add(BV, ReuseMask);
19742 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19743 SubVectorsMask);
19744 } else {
19745 // Gather all constants.
19746 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19747 for (auto [I, V] : enumerate(GatheredScalars)) {
19748 if (!isa<PoisonValue>(V))
19749 Mask[I] = I;
19750 }
19751 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19752 ShuffleBuilder.add(BV, Mask);
19753 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19754 SubVectorsMask);
19755 }
19756
19757 if (NeedFreeze)
19758 Res = ShuffleBuilder.createFreeze(Res);
19759 return Res;
19760}
19761
19762Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19763 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19764 (void)vectorizeTree(VectorizableTree[EIdx].get());
19765 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19766 Builder, *this);
19767}
19768
19769/// \returns \p I after propagating metadata from \p VL only for instructions in
19770/// \p VL.
19771 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19772 SmallVector<Value *> Insts;
19773 for (Value *V : VL)
19774 if (isa<Instruction>(V))
19775 Insts.push_back(V);
19776 return llvm::propagateMetadata(Inst, Insts);
19777}
19778
19779 static DebugLoc getDebugLocFromPHI(PHINode &PN) {
19780 if (DebugLoc DL = PN.getDebugLoc())
19781 return DL;
19782 return DebugLoc::getUnknown();
19783}
19784
19785Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19786 IRBuilderBase::InsertPointGuard Guard(Builder);
19787
19788 Value *V = E->Scalars.front();
19789 Type *ScalarTy = V->getType();
19790 if (!isa<CmpInst>(V))
19791 ScalarTy = getValueType(V);
19792 auto It = MinBWs.find(E);
19793 if (It != MinBWs.end()) {
19794 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19795 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19796 if (VecTy)
19797 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19798 }
19799 if (E->VectorizedValue)
19800 return E->VectorizedValue;
19801 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19802 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
19803 // Set insert point for non-reduction initial nodes.
19804 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19805 setInsertPointAfterBundle(E);
19806 Value *Vec = createBuildVector(E, ScalarTy);
19807 E->VectorizedValue = Vec;
19808 return Vec;
19809 }
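// A SplitVectorize node is emitted as two independently vectorized halves
// that are recombined below with an insertvector or a shuffle.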
19810 if (E->State == TreeEntry::SplitVectorize) {
19811 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19812 "Expected exactly 2 combined entries.");
19813 setInsertPointAfterBundle(E);
19814 TreeEntry &OpTE1 =
19815 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19816 assert(OpTE1.isSame(
19817 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19818 "Expected same first part of scalars.");
19819 Value *Op1 = vectorizeTree(&OpTE1);
19820 TreeEntry &OpTE2 =
19821 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19822 assert(
19823 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19824 "Expected same second part of scalars.");
19825 Value *Op2 = vectorizeTree(&OpTE2);
19826 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19827 bool IsSigned = false;
19828 auto It = MinBWs.find(OpE);
19829 if (It != MinBWs.end())
19830 IsSigned = It->second.second;
19831 else
19832 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19833 if (isa<PoisonValue>(V))
19834 return false;
19835 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19836 });
19837 return IsSigned;
19838 };
19839 if (cast<VectorType>(Op1->getType())->getElementType() !=
19840 ScalarTy->getScalarType()) {
19841 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19842 Op1 = Builder.CreateIntCast(
19843 Op1,
19844 getWidenedType(
19845 ScalarTy,
19846 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19847 GetOperandSignedness(&OpTE1));
19848 }
19849 if (cast<VectorType>(Op2->getType())->getElementType() !=
19850 ScalarTy->getScalarType()) {
19851 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19852 Op2 = Builder.CreateIntCast(
19853 Op2,
19854 getWidenedType(
19855 ScalarTy,
19856 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19857 GetOperandSignedness(&OpTE2));
19858 }
19859 if (E->ReorderIndices.empty()) {
19860 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19861 std::iota(
19862 Mask.begin(),
19863 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19864 0);
19865 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19866 if (ScalarTyNumElements != 1) {
19867 assert(SLPReVec && "Only supported by REVEC.");
19868 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19869 }
19870 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19871 Vec = createInsertVector(Builder, Vec, Op2,
19872 E->CombinedEntriesWithIndices.back().second *
19873 ScalarTyNumElements);
19874 E->VectorizedValue = Vec;
19875 return Vec;
19876 }
19877 unsigned CommonVF =
19878 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19879 if (getNumElements(Op1->getType()) != CommonVF) {
19880 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19881 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19882 0);
19883 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19884 }
19885 if (getNumElements(Op2->getType()) != CommonVF) {
19886 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19887 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19888 0);
19889 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19890 }
19891 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19892 E->VectorizedValue = Vec;
19893 return Vec;
19894 }
19895
19896 bool IsReverseOrder =
19897 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
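// Applies the node's reordering and reuse masks (plus any combined
// subvectors) to the freshly created vector value.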
19898 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19899 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19900 if (E->getOpcode() == Instruction::Store &&
19901 E->State == TreeEntry::Vectorize) {
19902 ArrayRef<int> Mask =
19903 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19904 E->ReorderIndices.size());
19905 ShuffleBuilder.add(V, Mask);
19906 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19907 E->State == TreeEntry::CompressVectorize) {
19908 ShuffleBuilder.addOrdered(V, {});
19909 } else {
19910 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19911 }
19912 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19913 E->CombinedEntriesWithIndices.size());
19914 transform(
19915 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19916 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19917 });
19918 assert(
19919 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19920 "Expected either combined subnodes or reordering");
19921 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19922 };
19923
19924 assert(!E->isGather() && "Unhandled state");
19925 unsigned ShuffleOrOp =
19926 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19927 Instruction *VL0 = E->getMainOp();
19928 auto GetOperandSignedness = [&](unsigned Idx) {
19929 const TreeEntry *OpE = getOperandEntry(E, Idx);
19930 bool IsSigned = false;
19931 auto It = MinBWs.find(OpE);
19932 if (It != MinBWs.end())
19933 IsSigned = It->second.second;
19934 else
19935 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19936 if (isa<PoisonValue>(V))
19937 return false;
19938 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19939 });
19940 return IsSigned;
19941 };
19942 switch (ShuffleOrOp) {
19943 case Instruction::PHI: {
19944 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19945 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19946 "PHI reordering is free.");
19947 auto *PH = cast<PHINode>(VL0);
19948 Builder.SetInsertPoint(PH->getParent(),
19949 PH->getParent()->getFirstNonPHIIt());
19950 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19951 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19952 Value *V = NewPhi;
19953
19954 // Adjust insertion point once all PHI's have been generated.
19955 Builder.SetInsertPoint(PH->getParent(),
19956 PH->getParent()->getFirstInsertionPt());
19957 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19958
19959 V = FinalShuffle(V, E);
19960
19961 E->VectorizedValue = V;
19962 // If phi node is fully emitted - exit.
19963 if (NewPhi->getNumIncomingValues() != 0)
19964 return NewPhi;
19965
19966 // PHINodes may have multiple entries from the same block. We want to
19967 // visit every block once.
19968 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19969
19970 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19971 BasicBlock *IBB = PH->getIncomingBlock(I);
19972
19973 // Stop emission if all incoming values are generated.
19974 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19975 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19976 return NewPhi;
19977 }
19978
19979 if (!VisitedBBs.insert(IBB).second) {
19980 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19981 NewPhi->addIncoming(VecOp, IBB);
19982 TreeEntry *OpTE = getOperandEntry(E, I);
19983 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19984 OpTE->VectorizedValue = VecOp;
19985 continue;
19986 }
19987
19988 Builder.SetInsertPoint(IBB->getTerminator());
19989 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19990 Value *Vec = vectorizeOperand(E, I);
19991 if (VecTy != Vec->getType()) {
19992 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19993 MinBWs.contains(getOperandEntry(E, I))) &&
19994 "Expected item in MinBWs.");
19995 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19996 }
19997 NewPhi->addIncoming(Vec, IBB);
19998 }
19999
20000 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
20001 "Invalid number of incoming values");
20002 assert(E->VectorizedValue && "Expected vectorized value.");
20003 return E->VectorizedValue;
20004 }
20005
20006 case Instruction::ExtractElement: {
20007 Value *V = E->getSingleOperand(0);
20008 setInsertPointAfterBundle(E);
20009 V = FinalShuffle(V, E);
20010 E->VectorizedValue = V;
20011 return V;
20012 }
20013 case Instruction::ExtractValue: {
20014 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
20015 Builder.SetInsertPoint(LI);
20016 Value *Ptr = LI->getPointerOperand();
20017 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
20018 Value *NewV = ::propagateMetadata(V, E->Scalars);
20019 NewV = FinalShuffle(NewV, E);
20020 E->VectorizedValue = NewV;
20021 return NewV;
20022 }
20023 case Instruction::InsertElement: {
20024 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20025 if (const TreeEntry *OpE = getOperandEntry(E, 1);
20026 OpE && !OpE->isGather() && OpE->hasState() &&
20027 !OpE->hasCopyableElements())
20028 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
20029 else
20030 setInsertPointAfterBundle(E);
20031 Value *V = vectorizeOperand(E, 1);
20032 ArrayRef<Value *> Op = E->getOperand(1);
20033 Type *ScalarTy = Op.front()->getType();
20034 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
20035 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20036 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
20037 assert(Res.first > 0 && "Expected item in MinBWs.");
20038 V = Builder.CreateIntCast(
20039 V,
20040 getWidenedType(
20041 ScalarTy,
20042 cast<FixedVectorType>(V->getType())->getNumElements()),
20043 Res.second);
20044 }
20045
20046 // Create InsertVector shuffle if necessary
20047 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
20048 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
20049 }));
20050 const unsigned NumElts =
20051 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
20052 const unsigned NumScalars = E->Scalars.size();
20053
20054 unsigned Offset = *getElementIndex(VL0);
20055 assert(Offset < NumElts && "Failed to find vector index offset");
20056
20057 // Create shuffle to resize vector
20058 SmallVector<int> Mask;
20059 if (!E->ReorderIndices.empty()) {
20060 inversePermutation(E->ReorderIndices, Mask);
20061 Mask.append(NumElts - NumScalars, PoisonMaskElem);
20062 } else {
20063 Mask.assign(NumElts, PoisonMaskElem);
20064 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
20065 }
20066 // Create InsertVector shuffle if necessary
20067 bool IsIdentity = true;
20068 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
20069 Mask.swap(PrevMask);
20070 for (unsigned I = 0; I < NumScalars; ++I) {
20071 Value *Scalar = E->Scalars[PrevMask[I]];
20072 unsigned InsertIdx = *getElementIndex(Scalar);
20073 IsIdentity &= InsertIdx - Offset == I;
20074 Mask[InsertIdx - Offset] = I;
20075 }
20076 if (!IsIdentity || NumElts != NumScalars) {
20077 Value *V2 = nullptr;
20078 bool IsVNonPoisonous =
20079 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
20080 SmallVector<int> InsertMask(Mask);
20081 if (NumElts != NumScalars && Offset == 0) {
20082 // Follow all insert element instructions from the current buildvector
20083 // sequence.
20084 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
20085 do {
20086 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
20087 if (!InsertIdx)
20088 break;
20089 if (InsertMask[*InsertIdx] == PoisonMaskElem)
20090 InsertMask[*InsertIdx] = *InsertIdx;
20091 if (!Ins->hasOneUse())
20092 break;
20093 Ins = dyn_cast_or_null<InsertElementInst>(
20094 Ins->getUniqueUndroppableUser());
20095 } while (Ins);
20096 SmallBitVector UseMask =
20097 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20098 SmallBitVector IsFirstPoison =
20099 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20100 SmallBitVector IsFirstUndef =
20101 isUndefVector(FirstInsert->getOperand(0), UseMask);
20102 if (!IsFirstPoison.all()) {
20103 unsigned Idx = 0;
20104 for (unsigned I = 0; I < NumElts; I++) {
20105 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
20106 IsFirstUndef.test(I)) {
20107 if (IsVNonPoisonous) {
20108 InsertMask[I] = I < NumScalars ? I : 0;
20109 continue;
20110 }
20111 if (!V2)
20112 V2 = UndefValue::get(V->getType());
20113 if (Idx >= NumScalars)
20114 Idx = NumScalars - 1;
20115 InsertMask[I] = NumScalars + Idx;
20116 ++Idx;
20117 } else if (InsertMask[I] != PoisonMaskElem &&
20118 Mask[I] == PoisonMaskElem) {
20119 InsertMask[I] = PoisonMaskElem;
20120 }
20121 }
20122 } else {
20123 InsertMask = Mask;
20124 }
20125 }
20126 if (!V2)
20127 V2 = PoisonValue::get(V->getType());
20128 V = Builder.CreateShuffleVector(V, V2, InsertMask);
20129 if (auto *I = dyn_cast<Instruction>(V)) {
20130 GatherShuffleExtractSeq.insert(I);
20131 CSEBlocks.insert(I->getParent());
20132 }
20133 }
20134
20135 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
20136 for (unsigned I = 0; I < NumElts; I++) {
20137 if (Mask[I] != PoisonMaskElem)
20138 InsertMask[Offset + I] = I;
20139 }
20140 SmallBitVector UseMask =
20141 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20142 SmallBitVector IsFirstUndef =
20143 isUndefVector(FirstInsert->getOperand(0), UseMask);
20144 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20145 NumElts != NumScalars) {
20146 if (IsFirstUndef.all()) {
20147 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
20148 SmallBitVector IsFirstPoison =
20149 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20150 if (!IsFirstPoison.all()) {
20151 for (unsigned I = 0; I < NumElts; I++) {
20152 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
20153 InsertMask[I] = I + NumElts;
20154 }
20155 }
20156 V = Builder.CreateShuffleVector(
20157 V,
20158 IsFirstPoison.all() ? PoisonValue::get(V->getType())
20159 : FirstInsert->getOperand(0),
20160 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
20161 if (auto *I = dyn_cast<Instruction>(V)) {
20162 GatherShuffleExtractSeq.insert(I);
20163 CSEBlocks.insert(I->getParent());
20164 }
20165 }
20166 } else {
20167 SmallBitVector IsFirstPoison =
20168 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20169 for (unsigned I = 0; I < NumElts; I++) {
20170 if (InsertMask[I] == PoisonMaskElem)
20171 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
20172 else
20173 InsertMask[I] += NumElts;
20174 }
20175 V = Builder.CreateShuffleVector(
20176 FirstInsert->getOperand(0), V, InsertMask,
20177 cast<Instruction>(E->Scalars.back())->getName());
20178 if (auto *I = dyn_cast<Instruction>(V)) {
20179 GatherShuffleExtractSeq.insert(I);
20180 CSEBlocks.insert(I->getParent());
20181 }
20182 }
20183 }
20184
20185 ++NumVectorInstructions;
20186 E->VectorizedValue = V;
20187 return V;
20188 }
20189 case Instruction::ZExt:
20190 case Instruction::SExt:
20191 case Instruction::FPToUI:
20192 case Instruction::FPToSI:
20193 case Instruction::FPExt:
20194 case Instruction::PtrToInt:
20195 case Instruction::IntToPtr:
20196 case Instruction::SIToFP:
20197 case Instruction::UIToFP:
20198 case Instruction::Trunc:
20199 case Instruction::FPTrunc:
20200 case Instruction::BitCast: {
20201 setInsertPointAfterBundle(E);
20202
20203 Value *InVec = vectorizeOperand(E, 0);
20204
20205 auto *CI = cast<CastInst>(VL0);
20206 Instruction::CastOps VecOpcode = CI->getOpcode();
20207 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
20208 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
20209 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
20210 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20211 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
20212 // Check if the values are candidates to demote.
20213 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
20214 if (SrcIt != MinBWs.end())
20215 SrcBWSz = SrcIt->second.first;
20216 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
20217 if (BWSz == SrcBWSz) {
20218 VecOpcode = Instruction::BitCast;
20219 } else if (BWSz < SrcBWSz) {
20220 VecOpcode = Instruction::Trunc;
20221 } else if (It != MinBWs.end()) {
20222 assert(BWSz > SrcBWSz && "Invalid cast!");
20223 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20224 } else if (SrcIt != MinBWs.end()) {
20225 assert(BWSz > SrcBWSz && "Invalid cast!");
20226 VecOpcode =
20227 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20228 }
20229 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20230 !SrcIt->second.second) {
20231 VecOpcode = Instruction::UIToFP;
20232 }
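// If bitwidth demotion turned the cast into a bitcast between identical
// demoted types, reuse the operand directly instead of emitting a cast.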
20233 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20234 ? InVec
20235 : Builder.CreateCast(VecOpcode, InVec, VecTy);
20236 V = FinalShuffle(V, E);
20237
20238 E->VectorizedValue = V;
20239 ++NumVectorInstructions;
20240 return V;
20241 }
20242 case Instruction::FCmp:
20243 case Instruction::ICmp: {
20244 setInsertPointAfterBundle(E);
20245
20246 Value *L = vectorizeOperand(E, 0);
20247 Value *R = vectorizeOperand(E, 1);
20248 if (L->getType() != R->getType()) {
20249 assert((getOperandEntry(E, 0)->isGather() ||
20250 getOperandEntry(E, 1)->isGather() ||
20251 MinBWs.contains(getOperandEntry(E, 0)) ||
20252 MinBWs.contains(getOperandEntry(E, 1))) &&
20253 "Expected item in MinBWs.");
20254 if (cast<VectorType>(L->getType())
20255 ->getElementType()
20256 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
20257 ->getElementType()
20258 ->getIntegerBitWidth()) {
20259 Type *CastTy = R->getType();
20260 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
20261 } else {
20262 Type *CastTy = L->getType();
20263 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
20264 }
20265 }
20266
20267 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
20268 Value *V = Builder.CreateCmp(P0, L, R);
20269 propagateIRFlags(V, E->Scalars, VL0);
20270 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
20271 ICmp->setSameSign(/*B=*/false);
20272 // Do not cast for cmps.
20273 VecTy = cast<FixedVectorType>(V->getType());
20274 V = FinalShuffle(V, E);
20275
20276 E->VectorizedValue = V;
20277 ++NumVectorInstructions;
20278 return V;
20279 }
20280 case Instruction::Select: {
20281 setInsertPointAfterBundle(E);
20282
20283 Value *Cond = vectorizeOperand(E, 0);
20284 Value *True = vectorizeOperand(E, 1);
20285 Value *False = vectorizeOperand(E, 2);
20286 if (True->getType() != VecTy || False->getType() != VecTy) {
20287 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
20288 getOperandEntry(E, 2)->isGather() ||
20289 MinBWs.contains(getOperandEntry(E, 1)) ||
20290 MinBWs.contains(getOperandEntry(E, 2))) &&
20291 "Expected item in MinBWs.");
20292 if (True->getType() != VecTy)
20293 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
20294 if (False->getType() != VecTy)
20295 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
20296 }
20297
20298 unsigned CondNumElements = getNumElements(Cond->getType());
20299 unsigned TrueNumElements = getNumElements(True->getType());
20300 assert(TrueNumElements >= CondNumElements &&
20301 TrueNumElements % CondNumElements == 0 &&
20302 "Cannot vectorize Instruction::Select");
20303 assert(TrueNumElements == getNumElements(False->getType()) &&
20304 "Cannot vectorize Instruction::Select");
20305 if (CondNumElements != TrueNumElements) {
20306 // When the return type is i1 but the source is fixed vector type, we
20307 // need to duplicate the condition value.
20308 Cond = Builder.CreateShuffleVector(
20309 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
20310 CondNumElements));
20311 }
20312 assert(getNumElements(Cond->getType()) == TrueNumElements &&
20313 "Cannot vectorize Instruction::Select");
20314 Value *V =
20315 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
20316 V = FinalShuffle(V, E);
20317
20318 E->VectorizedValue = V;
20319 ++NumVectorInstructions;
20320 return V;
20321 }
20322 case Instruction::FNeg: {
20323 setInsertPointAfterBundle(E);
20324
20325 Value *Op = vectorizeOperand(E, 0);
20326
20327 Value *V = Builder.CreateUnOp(
20328 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
20329 propagateIRFlags(V, E->Scalars, VL0);
20330 if (auto *I = dyn_cast<Instruction>(V))
20331 V = ::propagateMetadata(I, E->Scalars);
20332
20333 V = FinalShuffle(V, E);
20334
20335 E->VectorizedValue = V;
20336 ++NumVectorInstructions;
20337
20338 return V;
20339 }
20340 case Instruction::Freeze: {
20341 setInsertPointAfterBundle(E);
20342
20343 Value *Op = vectorizeOperand(E, 0);
20344
20345 if (Op->getType() != VecTy) {
20346 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20347 MinBWs.contains(getOperandEntry(E, 0))) &&
20348 "Expected item in MinBWs.");
20349 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
20350 }
20351 Value *V = Builder.CreateFreeze(Op);
20352 V = FinalShuffle(V, E);
20353
20354 E->VectorizedValue = V;
20355 ++NumVectorInstructions;
20356
20357 return V;
20358 }
20359 case Instruction::Add:
20360 case Instruction::FAdd:
20361 case Instruction::Sub:
20362 case Instruction::FSub:
20363 case Instruction::Mul:
20364 case Instruction::FMul:
20365 case Instruction::UDiv:
20366 case Instruction::SDiv:
20367 case Instruction::FDiv:
20368 case Instruction::URem:
20369 case Instruction::SRem:
20370 case Instruction::FRem:
20371 case Instruction::Shl:
20372 case Instruction::LShr:
20373 case Instruction::AShr:
20374 case Instruction::And:
20375 case Instruction::Or:
20376 case Instruction::Xor: {
20377 setInsertPointAfterBundle(E);
20378
20379 Value *LHS = vectorizeOperand(E, 0);
20380 Value *RHS = vectorizeOperand(E, 1);
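// With a demoted bitwidth, an 'and' with constants whose low It->second.first
// bits are all ones is a no-op, so the other operand can be used directly.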
20381 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20382 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
20383 ArrayRef<Value *> Ops = E->getOperand(I);
20384 if (all_of(Ops, [&](Value *Op) {
20385 auto *CI = dyn_cast<ConstantInt>(Op);
20386 return CI && CI->getValue().countr_one() >= It->second.first;
20387 })) {
20388 V = FinalShuffle(I == 0 ? RHS : LHS, E);
20389 E->VectorizedValue = V;
20390 ++NumVectorInstructions;
20391 return V;
20392 }
20393 }
20394 }
20395 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
20396 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20397 getOperandEntry(E, 1)->isGather() ||
20398 MinBWs.contains(getOperandEntry(E, 0)) ||
20399 MinBWs.contains(getOperandEntry(E, 1))) &&
20400 "Expected item in MinBWs.");
20401 if (LHS->getType() != VecTy)
20402 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
20403 if (RHS->getType() != VecTy)
20404 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
20405 }
20406
20407 Value *V = Builder.CreateBinOp(
20408 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
20409 RHS);
20410 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
20411 if (auto *I = dyn_cast<Instruction>(V)) {
20412 V = ::propagateMetadata(I, E->Scalars);
20413 // Drop nuw flags for abs(sub(commutative), true).
20414 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
20415 any_of(E->Scalars, [E](Value *V) {
20416 return isa<PoisonValue>(V) ||
20417 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20418 isCommutative(cast<Instruction>(V));
20419 }))
20420 I->setHasNoUnsignedWrap(/*b=*/false);
20421 }
20422
20423 V = FinalShuffle(V, E);
20424
20425 E->VectorizedValue = V;
20426 ++NumVectorInstructions;
20427
20428 return V;
20429 }
20430 case Instruction::Load: {
20431 // Loads are inserted at the head of the tree because we don't want to
20432 // sink them all the way down past store instructions.
20433 setInsertPointAfterBundle(E);
20434
20435 LoadInst *LI = cast<LoadInst>(VL0);
20436 Instruction *NewLI;
20437 FixedVectorType *StridedLoadTy = nullptr;
20438 Value *PO = LI->getPointerOperand();
20439 if (E->State == TreeEntry::Vectorize) {
20440 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
20441 } else if (E->State == TreeEntry::CompressVectorize) {
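// Compressed loads: emit one wide (possibly masked) contiguous load and then
// shuffle the needed lanes into their positions using CompressMask.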
20442 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
20443 CompressEntryToData.at(E);
20444 Align CommonAlignment = LI->getAlign();
20445 if (IsMasked) {
20446 unsigned VF = getNumElements(LoadVecTy);
20447 SmallVector<Constant *> MaskValues(
20448 VF / getNumElements(LI->getType()),
20449 ConstantInt::getFalse(VecTy->getContext()));
20450 for (int I : CompressMask)
20451 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
20452 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20453 assert(SLPReVec && "Only supported by REVEC.");
20454 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
20455 }
20456 Constant *MaskValue = ConstantVector::get(MaskValues);
20457 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
20458 MaskValue);
20459 } else {
20460 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
20461 }
20462 NewLI = ::propagateMetadata(NewLI, E->Scalars);
20463 // TODO: include this cost into CommonCost.
20464 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20465 assert(SLPReVec && "FixedVectorType is not expected.");
20466 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
20467 CompressMask);
20468 }
20469 NewLI =
20470 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
20471 } else if (E->State == TreeEntry::StridedVectorize) {
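// Strided loads are emitted as llvm.experimental.vp.strided.load with a byte
// stride; a reversed order is handled by starting at the last pointer and
// using a negative stride.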
20472 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
20473 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
20474 PO = IsReverseOrder ? PtrN : Ptr0;
20475 Type *StrideTy = DL->getIndexType(PO->getType());
20476 Value *StrideVal;
20477 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
20478 StridedLoadTy = SPtrInfo.Ty;
20479 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
20480 unsigned StridedLoadEC =
20481 StridedLoadTy->getElementCount().getKnownMinValue();
20482
20483 Value *Stride = SPtrInfo.StrideVal;
20484 if (!Stride) {
20485 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
20486 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
20487 SCEVExpander Expander(*SE, "strided-load-vec");
20488 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
20489 &*Builder.GetInsertPoint());
20490 }
20491 Value *NewStride =
20492 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
20493 StrideVal = Builder.CreateMul(
20494 NewStride, ConstantInt::getSigned(
20495 StrideTy, (IsReverseOrder ? -1 : 1) *
20496 static_cast<int>(
20497 DL->getTypeAllocSize(ScalarTy))));
20498 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20499 auto *Inst = Builder.CreateIntrinsic(
20500 Intrinsic::experimental_vp_strided_load,
20501 {StridedLoadTy, PO->getType(), StrideTy},
20502 {PO, StrideVal,
20503 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
20504 Builder.getInt32(StridedLoadEC)});
20505 Inst->addParamAttr(
20506 /*ArgNo=*/0,
20507 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20508 NewLI = Inst;
20509 } else {
20510 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
20511 Value *VecPtr = vectorizeOperand(E, 0);
20512 if (isa<FixedVectorType>(ScalarTy)) {
20513 assert(SLPReVec && "FixedVectorType is not expected.");
20514 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We need
20515 // to expand VecPtr if ScalarTy is a vector type.
20516 unsigned ScalarTyNumElements =
20517 cast<FixedVectorType>(ScalarTy)->getNumElements();
20518 unsigned VecTyNumElements =
20519 cast<FixedVectorType>(VecTy)->getNumElements();
20520 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
20521 "Cannot expand getelementptr.");
20522 unsigned VF = VecTyNumElements / ScalarTyNumElements;
20523 SmallVector<Constant *> Indices(VecTyNumElements);
20524 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
20525 return Builder.getInt64(I % ScalarTyNumElements);
20526 });
20527 VecPtr = Builder.CreateGEP(
20528 VecTy->getElementType(),
20529 Builder.CreateShuffleVector(
20530 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
20531 ConstantVector::get(Indices));
20532 }
20533 // Use the minimum alignment of the gathered loads.
20534 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20535 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
20536 }
20537 Value *V = E->State == TreeEntry::CompressVectorize
20538 ? NewLI
20539 : ::propagateMetadata(NewLI, E->Scalars);
20540
20541 if (StridedLoadTy != VecTy)
20542 V = Builder.CreateBitOrPointerCast(V, VecTy);
20543 V = FinalShuffle(V, E);
20544 E->VectorizedValue = V;
20545 ++NumVectorInstructions;
20546 return V;
20547 }
20548 case Instruction::Store: {
20549 auto *SI = cast<StoreInst>(VL0);
20550
20551 setInsertPointAfterBundle(E);
20552
20553 Value *VecValue = vectorizeOperand(E, 0);
20554 if (VecValue->getType() != VecTy)
20555 VecValue =
20556 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
20557 VecValue = FinalShuffle(VecValue, E);
20558
20559 Value *Ptr = SI->getPointerOperand();
20560 Instruction *ST;
20561 if (E->State == TreeEntry::Vectorize) {
20562 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
20563 } else {
20564 assert(E->State == TreeEntry::StridedVectorize &&
20565 "Expected either strided or consecutive stores.");
20566 if (!E->ReorderIndices.empty()) {
20567 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
20568 Ptr = SI->getPointerOperand();
20569 }
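// Emit llvm.experimental.vp.strided.store with a constant negative
// one-element stride, i.e. the lanes are written at decreasing addresses
// starting from the reordered first pointer.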
20570 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
20571 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
20572 auto *Inst = Builder.CreateIntrinsic(
20573 Intrinsic::experimental_vp_strided_store,
20574 {VecTy, Ptr->getType(), StrideTy},
20575 {VecValue, Ptr,
20576 ConstantInt::getSigned(
20577 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20578 Builder.getAllOnesMask(VecTy->getElementCount()),
20579 Builder.getInt32(E->Scalars.size())});
20580 Inst->addParamAttr(
20581 /*ArgNo=*/1,
20582 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20583 ST = Inst;
20584 }
20585
20586 Value *V = ::propagateMetadata(ST, E->Scalars);
20587
20588 E->VectorizedValue = V;
20589 ++NumVectorInstructions;
20590 return V;
20591 }
20592 case Instruction::GetElementPtr: {
20593 auto *GEP0 = cast<GetElementPtrInst>(VL0);
20594 setInsertPointAfterBundle(E);
20595
20596 Value *Op0 = vectorizeOperand(E, 0);
20597
20598 SmallVector<Value *> OpVecs;
20599 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20600 Value *OpVec = vectorizeOperand(E, J);
20601 OpVecs.push_back(OpVec);
20602 }
20603
20604 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20605 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
20606 SmallVector<Value *> GEPs;
20607 for (Value *V : E->Scalars) {
20608 if (isa<GetElementPtrInst>(V))
20609 GEPs.push_back(V);
20610 }
20611 V = ::propagateMetadata(I, GEPs);
20612 }
20613
20614 V = FinalShuffle(V, E);
20615
20616 E->VectorizedValue = V;
20617 ++NumVectorInstructions;
20618
20619 return V;
20620 }
20621 case Instruction::Call: {
20622 CallInst *CI = cast<CallInst>(VL0);
20623 setInsertPointAfterBundle(E);
20624
20625 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20626
20627 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20628 CI, ID, VecTy->getNumElements(),
20629 It != MinBWs.end() ? It->second.first : 0, TTI);
20630 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
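// UseIntrinsic picks the intrinsic form when it exists and the cost model
// does not favor a vector library function for this call.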
20631 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20632 VecCallCosts.first <= VecCallCosts.second;
20633
20634 Value *ScalarArg = nullptr;
20635 SmallVector<Value *> OpVecs;
20636 SmallVector<Type *, 2> TysForDecl;
20637 // Add return type if intrinsic is overloaded on it.
20638 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
20639 TysForDecl.push_back(VecTy);
20640 auto *CEI = cast<CallInst>(VL0);
20641 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
20642 // Some intrinsics have scalar arguments. This argument should not be
20643 // vectorized.
20644 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
20645 ScalarArg = CEI->getArgOperand(I);
20646 // If we decided to reduce the bitwidth of the abs intrinsic, its second
20647 // argument must be set to false (do not return poison if the value is the signed minimum).
20648 if (ID == Intrinsic::abs && It != MinBWs.end() &&
20649 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
20650 ScalarArg = Builder.getFalse();
20651 OpVecs.push_back(ScalarArg);
20652 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20653 TysForDecl.push_back(ScalarArg->getType());
20654 continue;
20655 }
20656
20657 Value *OpVec = vectorizeOperand(E, I);
20658 ScalarArg = CEI->getArgOperand(I);
20659 if (cast<VectorType>(OpVec->getType())->getElementType() !=
20660 ScalarArg->getType()->getScalarType() &&
20661 It == MinBWs.end()) {
20662 auto *CastTy =
20663 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
20664 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
20665 } else if (It != MinBWs.end()) {
20666 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
20667 }
20668 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
20669 OpVecs.push_back(OpVec);
20670 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20671 TysForDecl.push_back(OpVec->getType());
20672 }
20673
20674 Function *CF;
20675 if (!UseIntrinsic) {
20676 VFShape Shape =
20677 VFShape::get(CI->getFunctionType(),
20678 ElementCount::getFixed(VecTy->getNumElements()),
20679 false /*HasGlobalPred*/);
20680 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
20681 } else {
20682 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
20683 }
20684
20685 SmallVector<OperandBundleDef, 1> OpBundles;
20686 CI->getOperandBundlesAsDefs(OpBundles);
20687 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
20688
20689 propagateIRFlags(V, E->Scalars, VL0);
20690 V = FinalShuffle(V, E);
20691
20692 E->VectorizedValue = V;
20693 ++NumVectorInstructions;
20694 return V;
20695 }
20696 case Instruction::ShuffleVector: {
20697 Value *V;
20698 if (SLPReVec && !E->isAltShuffle()) {
20699 setInsertPointAfterBundle(E);
20700 Value *Src = vectorizeOperand(E, 0);
20701 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
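 // If the source is itself a shufflevector, fold the two shuffles into one by
 // composing the masks: NewMask[I] = SrcMask[ThisMask[I]]. E.g. a source shuffle
 // with mask <2, 3, 0, 1> followed by this mask <0, 0, 2, 2> becomes a single
 // shuffle with mask <2, 2, 0, 0>.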
20702 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
20703 SmallVector<int> NewMask(ThisMask.size());
20704 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
20705 return SVSrc->getShuffleMask()[Mask];
20706 });
20707 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
20708 SVSrc->getOperand(1), NewMask);
20709 } else {
20710 V = Builder.CreateShuffleVector(Src, ThisMask);
20711 }
20712 propagateIRFlags(V, E->Scalars, VL0);
20713 if (auto *I = dyn_cast<Instruction>(V))
20714 V = ::propagateMetadata(I, E->Scalars);
20715 V = FinalShuffle(V, E);
20716 } else {
20717 assert(E->isAltShuffle() &&
20718 ((Instruction::isBinaryOp(E->getOpcode()) &&
20719 Instruction::isBinaryOp(E->getAltOpcode())) ||
20720 (Instruction::isCast(E->getOpcode()) &&
20721 Instruction::isCast(E->getAltOpcode())) ||
20722 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
20723 "Invalid Shuffle Vector Operand");
20724
20725 Value *LHS = nullptr, *RHS = nullptr;
20726 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
20727 setInsertPointAfterBundle(E);
20728 LHS = vectorizeOperand(E, 0);
20729 RHS = vectorizeOperand(E, 1);
20730 } else {
20731 setInsertPointAfterBundle(E);
20732 LHS = vectorizeOperand(E, 0);
20733 }
20734 if (LHS && RHS &&
20735 ((Instruction::isBinaryOp(E->getOpcode()) &&
20736 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
20737 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
20738 assert((It != MinBWs.end() ||
20739 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
20740 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
20741 MinBWs.contains(getOperandEntry(E, 0)) ||
20742 MinBWs.contains(getOperandEntry(E, 1))) &&
20743 "Expected item in MinBWs.");
20744 Type *CastTy = VecTy;
20745 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20746   if (cast<VectorType>(LHS->getType())
20747           ->getElementType()
20748 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20749 ->getElementType()
20750 ->getIntegerBitWidth())
20751 CastTy = RHS->getType();
20752 else
20753 CastTy = LHS->getType();
20754 }
20755 if (LHS->getType() != CastTy)
20756 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20757 if (RHS->getType() != CastTy)
20758 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20759 }
20760
20761 Value *V0, *V1;
20762 if (Instruction::isBinaryOp(E->getOpcode())) {
20763 V0 = Builder.CreateBinOp(
20764 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20765 V1 = Builder.CreateBinOp(
20766 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20767 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20768 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20769 auto *AltCI = cast<CmpInst>(E->getAltOp());
20770 CmpInst::Predicate AltPred = AltCI->getPredicate();
20771 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20772 } else {
20773 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20774 unsigned SrcBWSz = DL->getTypeSizeInBits(
20775 cast<VectorType>(LHS->getType())->getElementType());
20776 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20777 if (BWSz <= SrcBWSz) {
20778 if (BWSz < SrcBWSz)
20779 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20780 assert(LHS->getType() == VecTy &&
20781 "Expected same type as operand.");
20782 if (auto *I = dyn_cast<Instruction>(LHS))
20783 LHS = ::propagateMetadata(I, E->Scalars);
20784 LHS = FinalShuffle(LHS, E);
20785 E->VectorizedValue = LHS;
20786 ++NumVectorInstructions;
20787 return LHS;
20788 }
20789 }
20790 V0 = Builder.CreateCast(
20791 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20792 V1 = Builder.CreateCast(
20793 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20794 }
20795 // Add V0 and V1 to later analysis to try to find and remove matching
20796 // instruction, if any.
20797 for (Value *V : {V0, V1}) {
20798 if (auto *I = dyn_cast<Instruction>(V)) {
20799 GatherShuffleExtractSeq.insert(I);
20800 CSEBlocks.insert(I->getParent());
20801 }
20802 }
20803
20804 // Create shuffle to take alternate operations from the vector.
20805 // Also, gather up main and alt scalar ops to propagate IR flags to
20806 // each vector operation.
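 // In the simplest case, e.g. for scalars {add, sub, add, sub} with main opcode
 // 'add' and alternate opcode 'sub', the resulting mask is <0, 5, 2, 7>: lanes
 // holding main-opcode scalars come from V0, lanes holding alternate-opcode
 // scalars come from V1 (their indices offset by the vector width).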
20807 ValueList OpScalars, AltScalars;
20808 SmallVector<int> Mask;
20809 E->buildAltOpShuffleMask(
20810 [E, this](Instruction *I) {
20811 assert(E->getMatchingMainOpOrAltOp(I) &&
20812 "Unexpected main/alternate opcode");
20813 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20814 *TLI);
20815 },
20816 Mask, &OpScalars, &AltScalars);
20817
20818 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20819 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20820 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20821 // Drop nuw flags for abs(sub(commutative), true).
20822 if (auto *I = dyn_cast<Instruction>(Vec);
20823 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20824 any_of(E->Scalars, [E](Value *V) {
20825 if (isa<PoisonValue>(V))
20826 return false;
20827 if (E->hasCopyableElements() && E->isCopyableElement(V))
20828 return false;
20829 auto *IV = cast<Instruction>(V);
20830 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20831 }))
20832 I->setHasNoUnsignedWrap(/*b=*/false);
20833 };
20834 DropNuwFlag(V0, E->getOpcode());
20835 DropNuwFlag(V1, E->getAltOpcode());
20836
20837 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20838 assert(SLPReVec && "FixedVectorType is not expected.");
20839 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20840 }
20841 V = Builder.CreateShuffleVector(V0, V1, Mask);
20842 if (auto *I = dyn_cast<Instruction>(V)) {
20843 V = ::propagateMetadata(I, E->Scalars);
20844 GatherShuffleExtractSeq.insert(I);
20845 CSEBlocks.insert(I->getParent());
20846 }
20847 }
20848
20849 E->VectorizedValue = V;
20850 ++NumVectorInstructions;
20851
20852 return V;
20853 }
20854 default:
20855 llvm_unreachable("unknown inst");
20856 }
20857 return nullptr;
20858}
20859
20860 Value *BoUpSLP::vectorizeTree() {
20861   ExtraValueToDebugLocsMap ExternallyUsedValues;
20862 return vectorizeTree(ExternallyUsedValues);
20863}
20864
20865 Value *BoUpSLP::vectorizeTree(
20866     const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20867 Instruction *ReductionRoot,
20868 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20869   // Clear the Entry-to-LastInstruction table. It can be invalidated by
20870   // scheduling, so it needs to be rebuilt.
20871 EntryToLastInstruction.clear();
20872 // All blocks must be scheduled before any instructions are inserted.
20873 for (auto &BSIter : BlocksSchedules)
20874 scheduleBlock(*this, BSIter.second.get());
20875 // Cache last instructions for the nodes to avoid side effects, which may
20876 // appear during vectorization, like extra uses, etc.
20877 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20878 if (TE->isGather() || DeletedNodes.contains(TE.get()))
20879 continue;
20880 (void)getLastInstructionInBundle(TE.get());
20881 }
20882
20883 if (ReductionRoot)
20884 Builder.SetInsertPoint(ReductionRoot->getParent(),
20885 ReductionRoot->getIterator());
20886 else
20887 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20888
20889 // Vectorize gather operands of the nodes with the external uses only.
20890   SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20891   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20892 if (DeletedNodes.contains(TE.get()))
20893 continue;
20894 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20895 TE->UserTreeIndex.UserTE->hasState() &&
20896 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20897 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20898 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20899 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20900 all_of(TE->UserTreeIndex.UserTE->Scalars,
20901 [](Value *V) { return isUsedOutsideBlock(V); })) {
20902 Instruction &LastInst =
20903 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20904 GatherEntries.emplace_back(TE.get(), &LastInst);
20905 }
20906 }
20907 for (auto &Entry : GatherEntries) {
20908 IRBuilderBase::InsertPointGuard Guard(Builder);
20909 Builder.SetInsertPoint(Entry.second);
20910 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20911 (void)vectorizeTree(Entry.first);
20912 }
20913 // Emit gathered loads first to emit better code for the users of those
20914 // gathered loads.
20915 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20916 if (DeletedNodes.contains(TE.get()))
20917 continue;
20918 if (GatheredLoadsEntriesFirst.has_value() &&
20919 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20920 (!TE->isGather() || TE->UserTreeIndex)) {
20921 assert((TE->UserTreeIndex ||
20922 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20923 "Expected gathered load node.");
20924 (void)vectorizeTree(TE.get());
20925 }
20926 }
20927 (void)vectorizeTree(VectorizableTree[0].get());
20928 // Run through the list of postponed gathers and emit them, replacing the temp
20929 // emitted allocas with actual vector instructions.
20930 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20931   DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20932   for (const TreeEntry *E : PostponedNodes) {
20933 auto *TE = const_cast<TreeEntry *>(E);
20934 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20935 TE->VectorizedValue = nullptr;
20936 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20937     // If the user is a PHI node, its vector code has to be inserted right
20938     // before the block terminator. Since the node was delayed, there were some
20939     // unresolved dependencies at the moment the stub instruction was emitted.
20940     // If any of these dependencies turns out to be an operand of another PHI
20941     // coming from this same block, the position of the stub instruction becomes
20942     // invalid, because the source vector that is supposed to feed this gather
20943     // node was inserted at the end of the block [after the stub instruction].
20944     // So we need to adjust the insertion point to the end of the block again.
20945 if (isa<PHINode>(UserI) ||
20946 (TE->UserTreeIndex.UserTE->hasState() &&
20947 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20948 // Insert before all users.
20949 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20950 for (User *U : PrevVec->users()) {
20951 if (U == UserI)
20952 continue;
20953 auto *UI = dyn_cast<Instruction>(U);
20954 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20955 continue;
20956 if (UI->comesBefore(InsertPt))
20957 InsertPt = UI;
20958 }
20959 Builder.SetInsertPoint(InsertPt);
20960 } else {
20961 Builder.SetInsertPoint(PrevVec);
20962 }
20963 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20964 Value *Vec = vectorizeTree(TE);
20965 if (auto *VecI = dyn_cast<Instruction>(Vec);
20966 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20967 Builder.GetInsertPoint()->comesBefore(VecI))
20968 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20969 Builder.GetInsertPoint());
20970 if (Vec->getType() != PrevVec->getType()) {
20971 assert(Vec->getType()->isIntOrIntVectorTy() &&
20972 PrevVec->getType()->isIntOrIntVectorTy() &&
20973 "Expected integer vector types only.");
20974 std::optional<bool> IsSigned;
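      // Figure out whether the narrowed value is treated as signed by any tree
      // entry that uses it, so the cast back to the stub's wider type picks the
      // right (sign or zero) extension.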
20975 for (Value *V : TE->Scalars) {
20976 if (isVectorized(V)) {
20977 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20978 auto It = MinBWs.find(MNTE);
20979 if (It != MinBWs.end()) {
20980 IsSigned = IsSigned.value_or(false) || It->second.second;
20981 if (*IsSigned)
20982 break;
20983 }
20984 }
20985 if (IsSigned.value_or(false))
20986 break;
20987 // Scan through gather nodes.
20988 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20989 auto It = MinBWs.find(BVE);
20990 if (It != MinBWs.end()) {
20991 IsSigned = IsSigned.value_or(false) || It->second.second;
20992 if (*IsSigned)
20993 break;
20994 }
20995 }
20996 if (IsSigned.value_or(false))
20997 break;
20998 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20999 IsSigned =
21000 IsSigned.value_or(false) ||
21001 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
21002 continue;
21003 }
21004 if (IsSigned.value_or(false))
21005 break;
21006 }
21007 }
21008 if (IsSigned.value_or(false)) {
21009 // Final attempt - check user node.
21010 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
21011 if (It != MinBWs.end())
21012 IsSigned = It->second.second;
21013 }
21014 assert(IsSigned &&
21015 "Expected user node or perfect diamond match in MinBWs.");
21016 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
21017 }
21018 PrevVec->replaceAllUsesWith(Vec);
21019 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
21020 // Replace the stub vector node, if it was used before for one of the
21021 // buildvector nodes already.
21022 auto It = PostponedValues.find(PrevVec);
21023 if (It != PostponedValues.end()) {
21024 for (TreeEntry *VTE : It->getSecond())
21025 VTE->VectorizedValue = Vec;
21026 }
21027 eraseInstruction(PrevVec);
21028 }
21029
21030 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
21031 << " values .\n");
21032
21033   SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
21034   // Maps vector instruction to original insertelement instruction
21035   DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
21036   // Maps extract Scalar to the corresponding extractelement instruction in the
21037   // basic block. Only one extractelement per block should be emitted.
21038   SmallDenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
21039       ScalarToEEs;
21040   SmallDenseSet<Value *, 4> UsedInserts;
21041   DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
21042   SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
21043   SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
21044 // Extract all of the elements with the external uses.
21045 for (const auto &ExternalUse : ExternalUses) {
21046 Value *Scalar = ExternalUse.Scalar;
21047 llvm::User *User = ExternalUse.User;
21048
21049 // Skip users that we already RAUW. This happens when one instruction
21050 // has multiple uses of the same value.
21051 if (User && !is_contained(Scalar->users(), User))
21052 continue;
21053 const TreeEntry *E = &ExternalUse.E;
21054 assert(E && "Invalid scalar");
21055 assert(!E->isGather() && "Extracting from a gather list");
21056 // Non-instruction pointers are not deleted, just skip them.
21057 if (E->getOpcode() == Instruction::GetElementPtr &&
21058 !isa<GetElementPtrInst>(Scalar))
21059 continue;
21060
21061 Value *Vec = E->VectorizedValue;
21062 assert(Vec && "Can't find vectorizable value");
21063
21064 Value *Lane = Builder.getInt32(ExternalUse.Lane);
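    // Extracts the scalar's lane from the vectorized value, reusing an already
    // emitted extractelement in the current block when possible, and
    // sign-/zero-extends the result back to the original scalar type if the
    // vector was narrowed via MinBWs.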
21065 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21066 if (Scalar->getType() != Vec->getType()) {
21067 Value *Ex = nullptr;
21068 Value *ExV = nullptr;
21069 auto *Inst = dyn_cast<Instruction>(Scalar);
21070 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
21071 auto It = ScalarToEEs.find(Scalar);
21072 if (It != ScalarToEEs.end()) {
21073 // No need to emit many extracts, just move the only one in the
21074 // current block.
21075 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
21076 : Builder.GetInsertBlock());
21077 if (EEIt != It->second.end()) {
21078 Value *PrevV = EEIt->second.first;
21079 if (auto *I = dyn_cast<Instruction>(PrevV);
21080 I && !ReplaceInst &&
21081 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21082 Builder.GetInsertPoint()->comesBefore(I)) {
21083 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
21084 Builder.GetInsertPoint());
21085 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
21086 CI->moveAfter(I);
21087 }
21088 Ex = PrevV;
21089 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21090 }
21091 }
21092 if (!Ex) {
21093 // "Reuse" the existing extract to improve final codegen.
21094 if (ReplaceInst) {
21095           // Leave the instruction as is if it is a cheap extract and all
21096           // of its operands are scalar.
21097 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
21098 IgnoredExtracts.insert(EE);
21099 Ex = EE;
21100 } else {
21101 auto *CloneInst = Inst->clone();
21102 CloneInst->insertBefore(Inst->getIterator());
21103 if (Inst->hasName())
21104 CloneInst->takeName(Inst);
21105 Ex = CloneInst;
21106 }
21107 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
21108 ES && isa<Instruction>(Vec)) {
21109 Value *V = ES->getVectorOperand();
21110 auto *IVec = cast<Instruction>(Vec);
21111 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
21112 V = ETEs.front()->VectorizedValue;
21113 if (auto *IV = dyn_cast<Instruction>(V);
21114 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21115 IV->comesBefore(IVec))
21116 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
21117 else
21118 Ex = Builder.CreateExtractElement(Vec, Lane);
21119 } else if (auto *VecTy =
21120 dyn_cast<FixedVectorType>(Scalar->getType())) {
21121 assert(SLPReVec && "FixedVectorType is not expected.");
21122 unsigned VecTyNumElements = VecTy->getNumElements();
21123 // When REVEC is enabled, we need to extract a vector.
21124 // Note: The element size of Scalar may be different from the
21125 // element size of Vec.
21126 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
21127 ExternalUse.Lane * VecTyNumElements);
21128 } else {
21129 Ex = Builder.CreateExtractElement(Vec, Lane);
21130 }
21131 // If necessary, sign-extend or zero-extend ScalarRoot
21132 // to the larger type.
21133 ExV = Ex;
21134 if (Scalar->getType() != Ex->getType())
21135 ExV = Builder.CreateIntCast(
21136 Ex, Scalar->getType(),
21137 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
21138 auto *I = dyn_cast<Instruction>(Ex);
21139 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
21140 : &F->getEntryBlock(),
21141 std::make_pair(Ex, ExV));
21142 }
21143 // The then branch of the previous if may produce constants, since 0
21144 // operand might be a constant.
21145 if (auto *ExI = dyn_cast<Instruction>(Ex);
21146 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
21147 GatherShuffleExtractSeq.insert(ExI);
21148 CSEBlocks.insert(ExI->getParent());
21149 }
21150 return ExV;
21151 }
21152 assert(isa<FixedVectorType>(Scalar->getType()) &&
21153 isa<InsertElementInst>(Scalar) &&
21154 "In-tree scalar of vector type is not insertelement?");
21155 auto *IE = cast<InsertElementInst>(Scalar);
21156 VectorToInsertElement.try_emplace(Vec, IE);
21157 return Vec;
21158 };
21159 // If User == nullptr, the Scalar remains as scalar in vectorized
21160 // instructions or is used as extra arg. Generate ExtractElement instruction
21161 // and update the record for this scalar in ExternallyUsedValues.
21162 if (!User) {
21163 if (!ScalarsWithNullptrUser.insert(Scalar).second)
21164 continue;
21165 assert(
21166 (ExternallyUsedValues.count(Scalar) ||
21167 ExternalUsesWithNonUsers.count(Scalar) ||
21168 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21169 any_of(
21170 Scalar->users(),
21171 [&, TTI = TTI](llvm::User *U) {
21172 if (ExternalUsesAsOriginalScalar.contains(U))
21173 return true;
21174 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21175 return !UseEntries.empty() &&
21176 (E->State == TreeEntry::Vectorize ||
21177 E->State == TreeEntry::StridedVectorize ||
21178 E->State == TreeEntry::CompressVectorize) &&
21179 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21180 return (UseEntry->State == TreeEntry::Vectorize ||
21181 UseEntry->State ==
21182 TreeEntry::StridedVectorize ||
21183 UseEntry->State ==
21184 TreeEntry::CompressVectorize) &&
21185 doesInTreeUserNeedToExtract(
21186 Scalar, getRootEntryInstruction(*UseEntry),
21187 TLI, TTI);
21188 });
21189 })) &&
21190 "Scalar with nullptr User must be registered in "
21191 "ExternallyUsedValues map or remain as scalar in vectorized "
21192 "instructions");
21193 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21194 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
21195 if (PHI->getParent()->isLandingPad())
21196 Builder.SetInsertPoint(
21197 PHI->getParent(),
21198 std::next(
21199 PHI->getParent()->getLandingPadInst()->getIterator()));
21200 else
21201 Builder.SetInsertPoint(PHI->getParent(),
21202 PHI->getParent()->getFirstNonPHIIt());
21203 } else {
21204 Builder.SetInsertPoint(VecI->getParent(),
21205 std::next(VecI->getIterator()));
21206 }
21207 } else {
21208 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21209 }
21210 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21211 // Required to update internally referenced instructions.
21212 if (Scalar != NewInst) {
21213 assert((!isa<ExtractElementInst>(Scalar) ||
21214 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
21215 "Extractelements should not be replaced.");
21216 Scalar->replaceAllUsesWith(NewInst);
21217 }
21218 continue;
21219 }
21220
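    // If the external user is an insertelement building a vector from this
    // scalar, do not emit an extract; instead record the lane in ShuffledInserts
    // so the whole buildvector chain can be rewritten later as shuffles of the
    // vectorized value.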
21221 if (auto *VU = dyn_cast<InsertElementInst>(User);
21222 VU && VU->getOperand(1) == Scalar) {
21223 // Skip if the scalar is another vector op or Vec is not an instruction.
21224 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
21225 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
21226 if (!UsedInserts.insert(VU).second)
21227 continue;
21228 // Need to use original vector, if the root is truncated.
21229 auto BWIt = MinBWs.find(E);
21230 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
21231 auto *ScalarTy = FTy->getElementType();
21232 auto Key = std::make_pair(Vec, ScalarTy);
21233 auto VecIt = VectorCasts.find(Key);
21234 if (VecIt == VectorCasts.end()) {
21235 IRBuilderBase::InsertPointGuard Guard(Builder);
21236 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
21237 if (IVec->getParent()->isLandingPad())
21238 Builder.SetInsertPoint(IVec->getParent(),
21239 std::next(IVec->getParent()
21240 ->getLandingPadInst()
21241 ->getIterator()));
21242 else
21243 Builder.SetInsertPoint(
21244 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
21245 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
21246 Builder.SetInsertPoint(IVec->getNextNode());
21247 }
21248 Vec = Builder.CreateIntCast(
21249 Vec,
21250                 getWidenedType(
21251                     ScalarTy,
21252 cast<FixedVectorType>(Vec->getType())->getNumElements()),
21253 BWIt->second.second);
21254 VectorCasts.try_emplace(Key, Vec);
21255 } else {
21256 Vec = VecIt->second;
21257 }
21258 }
21259
21260 std::optional<unsigned> InsertIdx = getElementIndex(VU);
21261 if (InsertIdx) {
21262 auto *It = find_if(
21263 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
21264 // Checks if 2 insertelements are from the same buildvector.
21265 InsertElementInst *VecInsert = Data.InsertElements.front();
21266                   return areTwoInsertFromSameBuildVector(
21267                       VU, VecInsert,
21268 [](InsertElementInst *II) { return II->getOperand(0); });
21269 });
21270 unsigned Idx = *InsertIdx;
21271 if (It == ShuffledInserts.end()) {
21272 (void)ShuffledInserts.emplace_back();
21273 It = std::next(ShuffledInserts.begin(),
21274 ShuffledInserts.size() - 1);
21275 }
21276 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
21277 if (Mask.empty())
21278 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
21279 Mask[Idx] = ExternalUse.Lane;
21280 It->InsertElements.push_back(cast<InsertElementInst>(User));
21281 continue;
21282 }
21283 }
21284 }
21285 }
21286
21287 // Generate extracts for out-of-tree users.
21288 // Find the insertion point for the extractelement lane.
21289 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21290 if (PHINode *PH = dyn_cast<PHINode>(User)) {
21291 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
21292 if (PH->getIncomingValue(I) == Scalar) {
21293 Instruction *IncomingTerminator =
21294 PH->getIncomingBlock(I)->getTerminator();
21295 if (isa<CatchSwitchInst>(IncomingTerminator)) {
21296 Builder.SetInsertPoint(VecI->getParent(),
21297 std::next(VecI->getIterator()));
21298 } else {
21299 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
21300 }
21301 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21302 PH->setOperand(I, NewInst);
21303 }
21304 }
21305 } else {
21306 Builder.SetInsertPoint(cast<Instruction>(User));
21307 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21308 User->replaceUsesOfWith(Scalar, NewInst);
21309 }
21310 } else {
21311 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21312 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21313 User->replaceUsesOfWith(Scalar, NewInst);
21314 }
21315
21316 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
21317 }
21318
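  // Emits a shuffle for a combined two-source mask: indices smaller than the
  // width of V1 select from V1, larger indices select from V2 (after rebasing
  // by that width).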
21319 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
21320 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
21321 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
21322 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
21323 for (int I = 0, E = Mask.size(); I < E; ++I) {
21324 if (Mask[I] < VF)
21325 CombinedMask1[I] = Mask[I];
21326 else
21327 CombinedMask2[I] = Mask[I] - VF;
21328 }
21329 ShuffleInstructionBuilder ShuffleBuilder(
21330 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
21331 ShuffleBuilder.add(V1, CombinedMask1);
21332 if (V2)
21333 ShuffleBuilder.add(V2, CombinedMask2);
21334 return ShuffleBuilder.finalize({}, {}, {});
21335 };
21336
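  // Resizes Vec when its width differs from the mask size; the returned flag is
  // true if the whole mask has already been applied to the result.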
21337 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
21338 bool ForSingleMask) {
21339 unsigned VF = Mask.size();
21340 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
21341 if (VF != VecVF) {
21342 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
21343 Vec = CreateShuffle(Vec, nullptr, Mask);
21344 return std::make_pair(Vec, true);
21345 }
21346 if (!ForSingleMask) {
21347 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
21348 for (unsigned I = 0; I < VF; ++I) {
21349 if (Mask[I] != PoisonMaskElem)
21350 ResizeMask[Mask[I]] = Mask[I];
21351 }
21352 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
21353 }
21354 }
21355
21356 return std::make_pair(Vec, false);
21357 };
21358 // Perform shuffling of the vectorize tree entries for better handling of
21359 // external extracts.
21360 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
21361 // Find the first and the last instruction in the list of insertelements.
21362 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
21363 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
21364 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
21365 Builder.SetInsertPoint(LastInsert);
21366 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
21367     Value *NewInst = performExtractsShuffleAction<Value>(
21368         MutableArrayRef(Vector.data(), Vector.size()),
21369 FirstInsert->getOperand(0),
21370 [](Value *Vec) {
21371 return cast<VectorType>(Vec->getType())
21372 ->getElementCount()
21373 .getKnownMinValue();
21374 },
21375 ResizeToVF,
21376 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
21377 ArrayRef<Value *> Vals) {
21378 assert((Vals.size() == 1 || Vals.size() == 2) &&
21379 "Expected exactly 1 or 2 input values.");
21380 if (Vals.size() == 1) {
21381 // Do not create shuffle if the mask is a simple identity
21382 // non-resizing mask.
21383 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
21384 ->getNumElements() ||
21385 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
21386 return CreateShuffle(Vals.front(), nullptr, Mask);
21387 return Vals.front();
21388 }
21389 return CreateShuffle(Vals.front() ? Vals.front()
21390 : FirstInsert->getOperand(0),
21391 Vals.back(), Mask);
21392 });
21393 auto It = ShuffledInserts[I].InsertElements.rbegin();
21394 // Rebuild buildvector chain.
21395 InsertElementInst *II = nullptr;
21396 if (It != ShuffledInserts[I].InsertElements.rend())
21397 II = *It;
21398     SmallVector<Instruction *> Inserts;
21399     while (It != ShuffledInserts[I].InsertElements.rend()) {
21400 assert(II && "Must be an insertelement instruction.");
21401 if (*It == II)
21402 ++It;
21403 else
21404 Inserts.push_back(cast<Instruction>(II));
21405 II = dyn_cast<InsertElementInst>(II->getOperand(0));
21406 }
21407 for (Instruction *II : reverse(Inserts)) {
21408 II->replaceUsesOfWith(II->getOperand(0), NewInst);
21409 if (auto *NewI = dyn_cast<Instruction>(NewInst))
21410 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
21411 II->moveAfter(NewI);
21412 NewInst = II;
21413 }
21414 LastInsert->replaceAllUsesWith(NewInst);
21415 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
21416 IE->replaceUsesOfWith(IE->getOperand(0),
21417 PoisonValue::get(IE->getOperand(0)->getType()));
21418 IE->replaceUsesOfWith(IE->getOperand(1),
21419 PoisonValue::get(IE->getOperand(1)->getType()));
21420 eraseInstruction(IE);
21421 }
21422 CSEBlocks.insert(LastInsert->getParent());
21423 }
21424
21425 SmallVector<Instruction *> RemovedInsts;
21426 // For each vectorized value:
21427 for (auto &TEPtr : VectorizableTree) {
21428 TreeEntry *Entry = TEPtr.get();
21429
21430 // No need to handle users of gathered values.
21431 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
21432 DeletedNodes.contains(Entry) ||
21433 TransformedToGatherNodes.contains(Entry))
21434 continue;
21435
21436 assert(Entry->VectorizedValue && "Can't find vectorizable value");
21437
21438 // For each lane:
21439 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
21440 Value *Scalar = Entry->Scalars[Lane];
21441
21442 if (Entry->getOpcode() == Instruction::GetElementPtr &&
21443 !isa<GetElementPtrInst>(Scalar))
21444 continue;
21445 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
21446 EE && IgnoredExtracts.contains(EE))
21447 continue;
21448 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
21449 continue;
21450#ifndef NDEBUG
21451 Type *Ty = Scalar->getType();
21452 if (!Ty->isVoidTy()) {
21453 for (User *U : Scalar->users()) {
21454 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
21455
21456 // It is legal to delete users in the ignorelist.
21457 assert((isVectorized(U) ||
21458                 (UserIgnoreList && UserIgnoreList->contains(U)) ||
21459                 (isa_and_present<Instruction>(U) &&
21460                  isDeleted(cast<Instruction>(U)))) &&
21461                "Deleting out-of-tree value");
21462 }
21463 }
21464#endif
21465 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
21466 auto *I = cast<Instruction>(Scalar);
21467 RemovedInsts.push_back(I);
21468 }
21469 }
21470
21471 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
21472 // new vector instruction.
21473 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
21474 V->mergeDIAssignID(RemovedInsts);
21475
21476 // Clear up reduction references, if any.
21477 if (UserIgnoreList) {
21478 for (Instruction *I : RemovedInsts) {
21479 const TreeEntry *IE = getTreeEntries(I).front();
21480 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(I);
21481 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
21482 IE = SplitEntries.front();
21483 if (IE->Idx != 0 &&
21484 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
21485 (ValueToGatherNodes.lookup(I).contains(
21486 VectorizableTree.front().get()) ||
21487 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
21488 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
21489 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
21490 IE->UserTreeIndex &&
21491 is_contained(VectorizableTree.front()->Scalars, I)) &&
21492 !(GatheredLoadsEntriesFirst.has_value() &&
21493 IE->Idx >= *GatheredLoadsEntriesFirst &&
21494 VectorizableTree.front()->isGather() &&
21495 is_contained(VectorizableTree.front()->Scalars, I)) &&
21496 !(!VectorizableTree.front()->isGather() &&
21497 VectorizableTree.front()->isCopyableElement(I)))
21498 continue;
21499 SmallVector<SelectInst *> LogicalOpSelects;
21500 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
21501 // Do not replace condition of the logical op in form select <cond>.
21502 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
21503 (match(U.getUser(), m_LogicalAnd()) ||
21504 match(U.getUser(), m_LogicalOr())) &&
21505 U.getOperandNo() == 0;
21506 if (IsPoisoningLogicalOp) {
21507 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
21508 return false;
21509 }
21510 return UserIgnoreList->contains(U.getUser());
21511 });
21512 // Replace conditions of the poisoning logical ops with the non-poison
21513 // constant value.
21514 for (SelectInst *SI : LogicalOpSelects)
21515 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
21516 }
21517 }
21518 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
21519 // cache correctness.
21520 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
21521 // - instructions are not deleted until later.
21522 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
21523
21524 Builder.ClearInsertionPoint();
21525 InstrElementSize.clear();
21526
21527 const TreeEntry &RootTE = *VectorizableTree.front();
21528 Value *Vec = RootTE.VectorizedValue;
21529 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
21530 It != MinBWs.end() &&
21531 ReductionBitWidth != It->second.first) {
21532 IRBuilder<>::InsertPointGuard Guard(Builder);
21533 Builder.SetInsertPoint(ReductionRoot->getParent(),
21534 ReductionRoot->getIterator());
21535 Vec = Builder.CreateIntCast(
21536 Vec,
21537 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
21538 cast<VectorType>(Vec->getType())->getElementCount()),
21539 It->second.second);
21540 }
21541 return Vec;
21542}
21543
21544 void BoUpSLP::optimizeGatherSequence() {
21545   LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
21546 << " gather sequences instructions.\n");
21547 // LICM InsertElementInst sequences.
21548 for (Instruction *I : GatherShuffleExtractSeq) {
21549 if (isDeleted(I))
21550 continue;
21551
21552 // Check if this block is inside a loop.
21553 Loop *L = LI->getLoopFor(I->getParent());
21554 if (!L)
21555 continue;
21556
21557 // Check if it has a preheader.
21558 BasicBlock *PreHeader = L->getLoopPreheader();
21559 if (!PreHeader)
21560 continue;
21561
21562 // If the vector or the element that we insert into it are
21563 // instructions that are defined in this basic block then we can't
21564 // hoist this instruction.
21565 if (any_of(I->operands(), [L](Value *V) {
21566 auto *OpI = dyn_cast<Instruction>(V);
21567 return OpI && L->contains(OpI);
21568 }))
21569 continue;
21570
21571 // We can hoist this instruction. Move it to the pre-header.
21572 I->moveBefore(PreHeader->getTerminator()->getIterator());
21573 CSEBlocks.insert(PreHeader);
21574 }
21575
21576 // Make a list of all reachable blocks in our CSE queue.
21577   SmallVector<const DomTreeNode *, 8> CSEWorkList;
21578   CSEWorkList.reserve(CSEBlocks.size());
21579 for (BasicBlock *BB : CSEBlocks)
21580 if (DomTreeNode *N = DT->getNode(BB)) {
21581 assert(DT->isReachableFromEntry(N));
21582 CSEWorkList.push_back(N);
21583 }
21584
21585 // Sort blocks by domination. This ensures we visit a block after all blocks
21586 // dominating it are visited.
21587 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
21588 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21589 "Different nodes should have different DFS numbers");
21590 return A->getDFSNumIn() < B->getDFSNumIn();
21591 });
21592
21593   // Less defined shuffles can be replaced by the more defined copies.
21594   // Of two shuffles with the same vector operands, one is less defined if each
21595   // of its mask indices is either undef or equal to the corresponding index of
21596   // the other. E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than
21597   // shuffle %0, poison, <0, 0, 0, 0>.
21598 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
21599 Instruction *I2,
21600 SmallVectorImpl<int> &NewMask) {
21601 if (I1->getType() != I2->getType())
21602 return false;
21603 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
21604 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
21605 if (!SI1 || !SI2)
21606 return I1->isIdenticalTo(I2);
21607 if (SI1->isIdenticalTo(SI2))
21608 return true;
21609 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
21610 if (SI1->getOperand(I) != SI2->getOperand(I))
21611 return false;
21612 // Check if the second instruction is more defined than the first one.
21613 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
21614 ArrayRef<int> SM1 = SI1->getShuffleMask();
21615 // Count trailing undefs in the mask to check the final number of used
21616 // registers.
21617 unsigned LastUndefsCnt = 0;
21618 for (int I = 0, E = NewMask.size(); I < E; ++I) {
21619 if (SM1[I] == PoisonMaskElem)
21620 ++LastUndefsCnt;
21621 else
21622 LastUndefsCnt = 0;
21623 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
21624 NewMask[I] != SM1[I])
21625 return false;
21626 if (NewMask[I] == PoisonMaskElem)
21627 NewMask[I] = SM1[I];
21628 }
21629 // Check if the last undefs actually change the final number of used vector
21630 // registers.
21631 return SM1.size() - LastUndefsCnt > 1 &&
21632 ::getNumberOfParts(*TTI, SI1->getType()) ==
21633            ::getNumberOfParts(
21634                *TTI, getWidenedType(SI1->getType()->getElementType(),
21635 SM1.size() - LastUndefsCnt));
21636 };
21637 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
21638 // instructions. TODO: We can further optimize this scan if we split the
21639 // instructions into different buckets based on the insert lane.
21640   SmallVector<Instruction *, 16> Visited;
21641   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21642 assert(*I &&
21643 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
21644 "Worklist not sorted properly!");
21645 BasicBlock *BB = (*I)->getBlock();
21646 // For all instructions in blocks containing gather sequences:
21647 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
21648 if (isDeleted(&In))
21649 continue;
21650       if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
21651           !GatherShuffleExtractSeq.contains(&In))
21652 continue;
21653
21654 // Check if we can replace this instruction with any of the
21655 // visited instructions.
21656 bool Replaced = false;
21657 for (Instruction *&V : Visited) {
21658 SmallVector<int> NewMask;
21659 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
21660 DT->dominates(V->getParent(), In.getParent())) {
21661 In.replaceAllUsesWith(V);
21662 eraseInstruction(&In);
21663 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
21664 if (!NewMask.empty())
21665 SI->setShuffleMask(NewMask);
21666 Replaced = true;
21667 break;
21668 }
21669         if (isa<ShuffleVectorInst>(In) &&
21670             GatherShuffleExtractSeq.contains(V) &&
21671 IsIdenticalOrLessDefined(V, &In, NewMask) &&
21672 DT->dominates(In.getParent(), V->getParent())) {
21673 In.moveAfter(V);
21674 V->replaceAllUsesWith(&In);
21675           eraseInstruction(V);
21676           if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
21677 if (!NewMask.empty())
21678 SI->setShuffleMask(NewMask);
21679 V = &In;
21680 Replaced = true;
21681 break;
21682 }
21683 }
21684 if (!Replaced) {
21685 assert(!is_contained(Visited, &In));
21686 Visited.push_back(&In);
21687 }
21688 }
21689 }
21690 CSEBlocks.clear();
21691 GatherShuffleExtractSeq.clear();
21692}
21693
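// Builds a ScheduleBundle for VL: copyable elements are modeled with
// ScheduleCopyableData, all other schedulable instructions contribute their
// ScheduleData, and the per-instruction list of bundles is updated.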
21694BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
21695 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
21696 auto &BundlePtr =
21697 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
21698 for (Value *V : VL) {
21699 if (S.isNonSchedulable(V))
21700 continue;
21701 auto *I = cast<Instruction>(V);
21702 if (S.isCopyableElement(V)) {
21703 // Add a copyable element model.
21704 ScheduleCopyableData &SD =
21705 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
21706 // Group the instructions to a bundle.
21707 BundlePtr->add(&SD);
21708 continue;
21709 }
21710 ScheduleData *BundleMember = getScheduleData(V);
21711 assert(BundleMember && "no ScheduleData for bundle member "
21712 "(maybe not in same basic block)");
21713 // Group the instructions to a bundle.
21714 BundlePtr->add(BundleMember);
21715 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
21716 BundlePtr.get());
21717 }
21718 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
21719 return *BundlePtr;
21720}
21721
21722 // Groups the instructions into a bundle (which then becomes a single
21723 // scheduling entity) and schedules instructions until the bundle gets ready.
21724std::optional<BoUpSLP::ScheduleBundle *>
21725BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
21726 const InstructionsState &S,
21727 const EdgeInfo &EI) {
21728 // No need to schedule PHIs, insertelement, extractelement and extractvalue
21729 // instructions.
21730 if (isa<PHINode>(S.getMainOp()) ||
21731 isVectorLikeInstWithConstOps(S.getMainOp()))
21732 return nullptr;
21733   // If the parent node is non-schedulable, the current node is copyable, and
21734   // any of the parent instructions is used outside several basic blocks or in
21735   // a bin-op node - cancel scheduling, as it may cause wrong def-use deps in
21736   // the analysis, leading to a crash.
21737   // Non-scheduled nodes may not have an associated ScheduleData model, which
21738   // may lead to a skipped dep analysis.
21739 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21740 EI.UserTE->doesNotNeedToSchedule() &&
21741 EI.UserTE->getOpcode() != Instruction::PHI &&
21742 any_of(EI.UserTE->Scalars, [](Value *V) {
21743 auto *I = dyn_cast<Instruction>(V);
21744 if (!I || I->hasOneUser())
21745 return false;
21746 for (User *U : I->users()) {
21747 auto *UI = cast<Instruction>(U);
21748 if (isa<BinaryOperator>(UI))
21749 return true;
21750 }
21751 return false;
21752 }))
21753 return std::nullopt;
21754 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21755 EI.UserTE->hasCopyableElements() &&
21756 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21757 all_of(VL, [&](Value *V) {
21758 if (S.isCopyableElement(V))
21759 return true;
21760 return isUsedOutsideBlock(V);
21761 }))
21762 return std::nullopt;
21763   // If any instruction is used only outside the block and its operand is placed
21764   // immediately before it, do not schedule; it may cause a wrong def-use chain.
21765 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
21766 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
21767 return false;
21768 if (isUsedOutsideBlock(V)) {
21769 for (Value *Op : cast<Instruction>(V)->operands()) {
21770 auto *I = dyn_cast<Instruction>(Op);
21771 if (!I)
21772 continue;
21773 return SLP->isVectorized(I) && I->getNextNode() == V;
21774 }
21775 }
21776 return false;
21777 }))
21778 return std::nullopt;
21779 if (S.areInstructionsWithCopyableElements() && EI) {
21780 bool IsNonSchedulableWithParentPhiNode =
21781 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
21782 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
21783 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21784 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21785 if (IsNonSchedulableWithParentPhiNode) {
21786 SmallSet<std::pair<Value *, Value *>, 4> Values;
21787 for (const auto [Idx, V] :
21788 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
21789 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
21790 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
21791 auto *I = dyn_cast<Instruction>(Op);
21792 if (!I || !isCommutative(I))
21793 continue;
21794 if (!Values.insert(std::make_pair(V, Op)).second)
21795 return std::nullopt;
21796 }
21797 } else {
21798 // If any of the parent requires scheduling - exit, complex dep between
21799 // schedulable/non-schedulable parents.
21800 if (any_of(EI.UserTE->Scalars, [&](Value *V) {
21801 if (EI.UserTE->hasCopyableElements() &&
21802 EI.UserTE->isCopyableElement(V))
21803 return false;
21804 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
21805 return any_of(Entries, [](const TreeEntry *TE) {
21806 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
21807 TE->UserTreeIndex.UserTE->hasState() &&
21808 TE->UserTreeIndex.UserTE->State !=
21809 TreeEntry::SplitVectorize &&
21810 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21811 });
21812 }))
21813 return std::nullopt;
21814 }
21815 }
21816 bool HasCopyables = S.areInstructionsWithCopyableElements();
21817 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21818 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21819     // If all operands were replaced by copyables, the operands of this node
21820     // might not be, so we need to recalculate dependencies for the schedule
21821     // data that was replaced by copyable schedule data.
21822 SmallVector<ScheduleData *> ControlDependentMembers;
21823 for (Value *V : VL) {
21824 auto *I = dyn_cast<Instruction>(V);
21825 if (!I || (HasCopyables && S.isCopyableElement(V)))
21826 continue;
21827 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21828 for (const Use &U : I->operands()) {
21829 unsigned &NumOps =
21830 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21831 .first->getSecond();
21832 ++NumOps;
21833 if (auto *Op = dyn_cast<Instruction>(U.get());
21834 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21835 if (ScheduleData *OpSD = getScheduleData(Op);
21836 OpSD && OpSD->hasValidDependencies())
21837 // TODO: investigate how to improve it instead of early exiting.
21838 return std::nullopt;
21839 }
21840 }
21841 }
21842 return nullptr;
21843 }
21844
21845 // Initialize the instruction bundle.
21846 Instruction *OldScheduleEnd = ScheduleEnd;
21847 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21848
21849 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21850 // Clear deps or recalculate the region, if the memory instruction is a
21851 // copyable. It may have memory deps, which must be recalculated.
21852 SmallVector<ScheduleData *> ControlDependentMembers;
21853 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21854 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21855 for (ScheduleEntity *SE : Bundle.getBundle()) {
21856 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21857 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21858 BundleMember && BundleMember->hasValidDependencies()) {
21859 BundleMember->clearDirectDependencies();
21860 if (RegionHasStackSave ||
21861                 !isGuaranteedToTransferExecutionToSuccessor(
21862                     BundleMember->getInst()))
21863 ControlDependentMembers.push_back(BundleMember);
21864 }
21865 continue;
21866 }
21867 auto *SD = cast<ScheduleData>(SE);
21868 if (SD->hasValidDependencies() &&
21869 (!S.areInstructionsWithCopyableElements() ||
21870 !S.isCopyableElement(SD->getInst())) &&
21871 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21872 EI.UserTE->hasState() &&
21873 (!EI.UserTE->hasCopyableElements() ||
21874 !EI.UserTE->isCopyableElement(SD->getInst())))
21875 SD->clearDirectDependencies();
21876 for (const Use &U : SD->getInst()->operands()) {
21877 unsigned &NumOps =
21878 UserOpToNumOps
21879 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21880 .first->getSecond();
21881 ++NumOps;
21882 if (auto *Op = dyn_cast<Instruction>(U.get());
21883 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21884 *SLP, NumOps)) {
21885 if (ScheduleData *OpSD = getScheduleData(Op);
21886 OpSD && OpSD->hasValidDependencies()) {
21887 OpSD->clearDirectDependencies();
21888 if (RegionHasStackSave ||
21889                     !isGuaranteedToTransferExecutionToSuccessor(Op))
21890                   ControlDependentMembers.push_back(OpSD);
21891 }
21892 }
21893 }
21894 }
21895 };
21896 // The scheduling region got new instructions at the lower end (or it is a
21897 // new region for the first bundle). This makes it necessary to
21898 // recalculate all dependencies.
21899 // It is seldom that this needs to be done a second time after adding the
21900 // initial bundle to the region.
21901 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21902 for_each(ScheduleDataMap, [&](auto &P) {
21903 if (BB != P.first->getParent())
21904 return;
21905 ScheduleData *SD = P.second;
21906 if (isInSchedulingRegion(*SD))
21907 SD->clearDependencies();
21908 });
21909 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21910 for_each(P.second, [&](ScheduleCopyableData *SD) {
21911 if (isInSchedulingRegion(*SD))
21912 SD->clearDependencies();
21913 });
21914 });
21915 ReSchedule = true;
21916 }
21917     // Check if the bundle data already has deps for copyable elements. In
21918     // that case we need to reset the deps and recalculate them.
21919 if (Bundle && !Bundle.getBundle().empty()) {
21920 if (S.areInstructionsWithCopyableElements() ||
21921 !ScheduleCopyableDataMap.empty())
21922 CheckIfNeedToClearDeps(Bundle);
21923 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21924 << BB->getName() << "\n");
21925 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21926 ControlDependentMembers);
21927 } else if (!ControlDependentMembers.empty()) {
21928 ScheduleBundle Invalid = ScheduleBundle::invalid();
21929 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21930 ControlDependentMembers);
21931 }
21932
21933 if (ReSchedule) {
21934 resetSchedule();
21935 initialFillReadyList(ReadyInsts);
21936 }
21937
21938 // Now try to schedule the new bundle or (if no bundle) just calculate
21939 // dependencies. As soon as the bundle is "ready" it means that there are no
21940     // cyclic dependencies and we can schedule it. Note that it's important
21941     // that we don't "schedule" the bundle yet.
21942 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21943 !ReadyInsts.empty()) {
21944 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21945 assert(Picked->isReady() && "must be ready to schedule");
21946 schedule(*SLP, S, EI, Picked, ReadyInsts);
21947 if (Picked == &Bundle)
21948 break;
21949 }
21950 };
21951
21952 // Make sure that the scheduling region contains all
21953 // instructions of the bundle.
21954 for (Value *V : VL) {
21955 if (S.isNonSchedulable(V))
21956 continue;
21957 if (!extendSchedulingRegion(V, S)) {
21958       // The scheduling region got new instructions at the lower end (or it
21959       // is a new region for the first bundle), which makes it necessary to
21960       // recalculate all dependencies.
21961       // Otherwise the compiler may crash trying to calculate dependencies
21962       // incorrectly and emit instructions in the wrong order during the
21963       // actual scheduling.
21964 ScheduleBundle Invalid = ScheduleBundle::invalid();
21965 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21966 return std::nullopt;
21967 }
21968 }
21969
21970 bool ReSchedule = false;
21971 for (Value *V : VL) {
21972 if (S.isNonSchedulable(V))
21973 continue;
21974     SmallVector<ScheduleCopyableData *> CopyableData =
21975         getScheduleCopyableData(cast<Instruction>(V));
21976 if (!CopyableData.empty()) {
21977 for (ScheduleCopyableData *SD : CopyableData)
21978 ReadyInsts.remove(SD);
21979 }
21980 ScheduleData *BundleMember = getScheduleData(V);
21981 assert((BundleMember || S.isCopyableElement(V)) &&
21982 "no ScheduleData for bundle member (maybe not in same basic block)");
21983 if (!BundleMember)
21984 continue;
21985
21986     // Make sure we don't leave the pieces of the bundle in the ready list
21987     // when the whole bundle might not be ready.
21988 ReadyInsts.remove(BundleMember);
21989 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21990 !Bundles.empty()) {
21991 for (ScheduleBundle *B : Bundles)
21992 ReadyInsts.remove(B);
21993 }
21994
21995 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21996 continue;
21997     // A bundle member was scheduled as a single instruction before and now
21998     // needs to be scheduled as part of the bundle. We just get rid of the
21999     // existing schedule.
22000     // A bundle member may also have had its deps calculated before it became
22001     // a copyable element - in that case we need to reschedule as well.
22002 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
22003 << " was already scheduled\n");
22004 ReSchedule = true;
22005 }
22006
22007 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
22008 TryScheduleBundleImpl(ReSchedule, Bundle);
22009 if (!Bundle.isReady()) {
22010 for (ScheduleEntity *BD : Bundle.getBundle()) {
22011 // Copyable data scheduling is just removed.
22012       if (isa<ScheduleCopyableData>(BD))
22013         continue;
22014 if (BD->isReady()) {
22015 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
22016 if (Bundles.empty()) {
22017 ReadyInsts.insert(BD);
22018 continue;
22019 }
22020 for (ScheduleBundle *B : Bundles)
22021 if (B->isReady())
22022 ReadyInsts.insert(B);
22023 }
22024 }
22025 ScheduledBundlesList.pop_back();
22026 SmallVector<ScheduleData *> ControlDependentMembers;
22027 for (Value *V : VL) {
22028 if (S.isNonSchedulable(V))
22029 continue;
22030 auto *I = cast<Instruction>(V);
22031 if (S.isCopyableElement(I)) {
22032 // Remove the copyable data from the scheduling region and restore
22033 // previous mappings.
22034 auto KV = std::make_pair(EI, I);
22035 assert(ScheduleCopyableDataMap.contains(KV) &&
22036 "no ScheduleCopyableData for copyable element");
22037 ScheduleCopyableData *SD =
22038 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
22039 ScheduleCopyableDataMapByUsers[I].remove(SD);
22040 if (EI.UserTE) {
22041 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22042 const auto *It = find(Op, I);
22043 assert(It != Op.end() && "Lane not set");
22044 SmallPtrSet<Instruction *, 4> Visited;
22045 do {
22046 int Lane = std::distance(Op.begin(), It);
22047 assert(Lane >= 0 && "Lane not set");
22048 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22049 !EI.UserTE->ReorderIndices.empty())
22050 Lane = EI.UserTE->ReorderIndices[Lane];
22051 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22052 "Couldn't find extract lane");
22053 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22054 if (!Visited.insert(In).second) {
22055 It = find(make_range(std::next(It), Op.end()), I);
22056 break;
22057 }
22058 ScheduleCopyableDataMapByInstUser
22059 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
22060 .pop_back();
22061 It = find(make_range(std::next(It), Op.end()), I);
22062 } while (It != Op.end());
22063 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
22064 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
22065 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
22066 }
22067 if (ScheduleCopyableDataMapByUsers[I].empty())
22068 ScheduleCopyableDataMapByUsers.erase(I);
22069 ScheduleCopyableDataMap.erase(KV);
22070 // Need to recalculate dependencies for the actual schedule data.
22071 if (ScheduleData *OpSD = getScheduleData(I);
22072 OpSD && OpSD->hasValidDependencies()) {
22073 OpSD->clearDirectDependencies();
22074 if (RegionHasStackSave ||
22075               !isGuaranteedToTransferExecutionToSuccessor(I))
22076             ControlDependentMembers.push_back(OpSD);
22077 }
22078 continue;
22079 }
22080 ScheduledBundles.find(I)->getSecond().pop_back();
22081 }
22082 if (!ControlDependentMembers.empty()) {
22083 ScheduleBundle Invalid = ScheduleBundle::invalid();
22084 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
22085 ControlDependentMembers);
22086 }
22087 return std::nullopt;
22088 }
22089 return &Bundle;
22090}
22091
22092BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22093 // Allocate a new ScheduleData for the instruction.
22094 if (ChunkPos >= ChunkSize) {
22095 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
22096 ChunkPos = 0;
22097 }
22098 return &(ScheduleDataChunks.back()[ChunkPos++]);
22099}
22100
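// Tries to grow the scheduling region so that it contains V, walking up and
// down from the current region bounds and giving up once
// ScheduleRegionSizeLimit would be exceeded.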
22101bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
22102 Value *V, const InstructionsState &S) {
22103   auto *I = dyn_cast<Instruction>(V);
22104   assert(I && "bundle member must be an instruction");
22105 if (getScheduleData(I))
22106 return true;
22107 if (!ScheduleStart) {
22108 // It's the first instruction in the new region.
22109 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
22110 ScheduleStart = I;
22111 ScheduleEnd = I->getNextNode();
22112 assert(ScheduleEnd && "tried to vectorize a terminator?");
22113 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
22114 return true;
22115 }
22116 // Search up and down at the same time, because we don't know if the new
22117 // instruction is above or below the existing scheduling region.
22118   // Ignore debug info (and other "AssumeLike" intrinsics) so it's not counted
22119   // against the budget. Otherwise debug info could affect codegen.
22120   BasicBlock::reverse_iterator UpIter =
22121       ++ScheduleStart->getIterator().getReverse();
22122 BasicBlock::reverse_iterator UpperEnd = BB->rend();
22123 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
22124 BasicBlock::iterator LowerEnd = BB->end();
22125 auto IsAssumeLikeIntr = [](const Instruction &I) {
22126 if (auto *II = dyn_cast<IntrinsicInst>(&I))
22127 return II->isAssumeLikeIntrinsic();
22128 return false;
22129 };
22130 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22131 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22132 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
22133 &*DownIter != I) {
22134 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
22135 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
22136 return false;
22137 }
22138
22139 ++UpIter;
22140 ++DownIter;
22141
22142 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22143 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22144 }
22145 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
22146 assert(I->getParent() == ScheduleStart->getParent() &&
22147 "Instruction is in wrong basic block.");
22148 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
22149 ScheduleStart = I;
22150 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
22151 << "\n");
22152 return true;
22153 }
22154 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
22155 "Expected to reach top of the basic block or instruction down the "
22156 "lower end.");
22157 assert(I->getParent() == ScheduleEnd->getParent() &&
22158 "Instruction is in wrong basic block.");
22159 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
22160 nullptr);
22161 ScheduleEnd = I->getNextNode();
22162 assert(ScheduleEnd && "tried to vectorize a terminator?");
22163 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
22164 return true;
22165}
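// The search above grows the region towards whichever side the new
// instruction is actually on: each loop iteration advances one step upwards
// and one step downwards and charges one unit against ScheduleRegionSizeLimit,
// while assume-like intrinsics are skipped for free. For illustration
// (hypothetical IR, not taken from a test):
//
//   %a = add i32 %x, 1     ; new bundle member I
//   %b = mul i32 %a, 2
//   %c = load i32, ptr %p  ; current ScheduleStart
//   %d = load i32, ptr %q
//
// UpIter starts at %b; after one charged step it reaches %a, the loop exits,
// and ScheduleStart is moved up to %a. Had I been below the region instead,
// DownIter would have reached it first and ScheduleEnd would have been pushed
// down past it.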
22166
22167void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
22168 Instruction *ToI,
22169 ScheduleData *PrevLoadStore,
22170 ScheduleData *NextLoadStore) {
22171 ScheduleData *CurrentLoadStore = PrevLoadStore;
22172 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
22173 // No need to allocate data for non-schedulable instructions.
22174 if (isa<PHINode>(I))
22175 continue;
22176 ScheduleData *SD = ScheduleDataMap.lookup(I);
22177 if (!SD) {
22178 SD = allocateScheduleDataChunks();
22179 ScheduleDataMap[I] = SD;
22180 }
22181 assert(!isInSchedulingRegion(*SD) &&
22182 "new ScheduleData already in scheduling region");
22183 SD->init(SchedulingRegionID, I);
22184
22185 auto CanIgnoreLoad = [](const Instruction *I) {
22186 const auto *LI = dyn_cast<LoadInst>(I);
22187 // If there is a simple load marked as invariant, we can ignore it.
22188 // But, in the (unlikely) case of non-simple invariant load,
22189 // we should not ignore it.
22190 return LI && LI->isSimple() &&
22191 LI->getMetadata(LLVMContext::MD_invariant_load);
22192 };
22193
22194 if (I->mayReadOrWriteMemory() &&
22195 // Simple InvariantLoad does not depend on other memory accesses.
22196 !CanIgnoreLoad(I) &&
22197 (!isa<IntrinsicInst>(I) ||
22198 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
22199 cast<IntrinsicInst>(I)->getIntrinsicID() !=
22200 Intrinsic::pseudoprobe))) {
22201 // Update the linked list of memory accessing instructions.
22202 if (CurrentLoadStore) {
22203 CurrentLoadStore->setNextLoadStore(SD);
22204 } else {
22205 FirstLoadStoreInRegion = SD;
22206 }
22207 CurrentLoadStore = SD;
22208 }
22209
22210 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22211 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22212 RegionHasStackSave = true;
22213 }
22214 if (NextLoadStore) {
22215 if (CurrentLoadStore)
22216 CurrentLoadStore->setNextLoadStore(NextLoadStore);
22217 } else {
22218 LastLoadStoreInRegion = CurrentLoadStore;
22219 }
22220}
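// The net effect of initScheduleData is that every non-PHI instruction of the
// new sub-range gets a ScheduleData entry and all memory-reading/writing
// instructions are threaded into a single singly linked list, e.g.
// (illustrative):
//
//   FirstLoadStoreInRegion -> store %a -> load %b -> call @f -> store %c
//   (each arrow is a setNextLoadStore() link; the last store becomes
//    LastLoadStoreInRegion)
//
// so the memory-dependency walk in calculateDependencies below only has to
// follow getNextLoadStore() instead of re-scanning every instruction in the
// region.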
22221
22222void BoUpSLP::BlockScheduling::calculateDependencies(
22223 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
22224 ArrayRef<ScheduleData *> ControlDeps) {
22225 SmallVector<ScheduleEntity *> WorkList;
22226 auto ProcessNode = [&](ScheduleEntity *SE) {
22227 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
22228 if (CD->hasValidDependencies())
22229 return;
22230 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
22231 CD->initDependencies();
22232 CD->resetUnscheduledDeps();
22233 const EdgeInfo &EI = CD->getEdgeInfo();
22234 if (EI.UserTE) {
22235 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22236 const auto *It = find(Op, CD->getInst());
22237 assert(It != Op.end() && "Lane not set");
22238 SmallPtrSet<Instruction *, 4> Visited;
22239 do {
22240 int Lane = std::distance(Op.begin(), It);
22241 assert(Lane >= 0 && "Lane not set");
22242 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22243 !EI.UserTE->ReorderIndices.empty())
22244 Lane = EI.UserTE->ReorderIndices[Lane];
22245 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22246 "Couldn't find extract lane");
22247 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22248 if (EI.UserTE->isCopyableElement(In)) {
22249 // We may not have related copyable scheduling data if the
22250 // instruction is non-schedulable.
22251 if (ScheduleCopyableData *UseSD =
22252 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
22253 CD->incDependencies();
22254 if (!UseSD->isScheduled())
22255 CD->incrementUnscheduledDeps(1);
22256 if (!UseSD->hasValidDependencies() ||
22257 (InsertInReadyList && UseSD->isReady()))
22258 WorkList.push_back(UseSD);
22259 }
22260 } else if (Visited.insert(In).second) {
22261 if (ScheduleData *UseSD = getScheduleData(In)) {
22262 CD->incDependencies();
22263 if (!UseSD->isScheduled())
22264 CD->incrementUnscheduledDeps(1);
22265 if (!UseSD->hasValidDependencies() ||
22266 (InsertInReadyList && UseSD->isReady()))
22267 WorkList.push_back(UseSD);
22268 }
22269 }
22270 It = find(make_range(std::next(It), Op.end()), CD->getInst());
22271 } while (It != Op.end());
22272 if (CD->isReady() && CD->getDependencies() == 0 &&
22273 (EI.UserTE->hasState() &&
22274 (EI.UserTE->getMainOp()->getParent() !=
22275 CD->getInst()->getParent() ||
22276 (isa<PHINode>(EI.UserTE->getMainOp()) &&
22277 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
22278 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
22279 auto *IU = dyn_cast<Instruction>(U);
22280 if (!IU)
22281 return true;
22282 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
22283 })))))) {
22284 // If there are no uses in the block, mark the node as having a
22285 // pseudo-use, which cannot be scheduled.
22286 // This prevents incorrect def-use tracking between an external user
22287 // and the actual instruction.
22288 CD->incDependencies();
22289 CD->incrementUnscheduledDeps(1);
22290 }
22291 }
22292 return;
22293 }
22294 auto *BundleMember = cast<ScheduleData>(SE);
22295 if (BundleMember->hasValidDependencies())
22296 return;
22297 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
22298 BundleMember->initDependencies();
22299 BundleMember->resetUnscheduledDeps();
22300 // Handle def-use chain dependencies.
22301 SmallDenseMap<Value *, unsigned> UserToNumOps;
22302 for (User *U : BundleMember->getInst()->users()) {
22303 if (isa<PHINode>(U))
22304 continue;
22305 if (ScheduleData *UseSD = getScheduleData(U)) {
22306 // The operand is a copyable element - skip.
22307 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
22308 ++NumOps;
22309 if (areAllOperandsReplacedByCopyableData(
22310 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
22311 continue;
22312 BundleMember->incDependencies();
22313 if (!UseSD->isScheduled())
22314 BundleMember->incrementUnscheduledDeps(1);
22315 if (!UseSD->hasValidDependencies() ||
22316 (InsertInReadyList && UseSD->isReady()))
22317 WorkList.push_back(UseSD);
22318 }
22319 }
22320 for (ScheduleCopyableData *UseSD :
22321 getScheduleCopyableDataUsers(BundleMember->getInst())) {
22322 BundleMember->incDependencies();
22323 if (!UseSD->isScheduled())
22324 BundleMember->incrementUnscheduledDeps(1);
22325 if (!UseSD->hasValidDependencies() ||
22326 (InsertInReadyList && UseSD->isReady()))
22327 WorkList.push_back(UseSD);
22328 }
22329
22330 SmallPtrSet<const Instruction *, 4> Visited;
22331 auto MakeControlDependent = [&](Instruction *I) {
22332 // Do not mark control dependent twice.
22333 if (!Visited.insert(I).second)
22334 return;
22335 auto *DepDest = getScheduleData(I);
22336 assert(DepDest && "must be in schedule window");
22337 DepDest->addControlDependency(BundleMember);
22338 BundleMember->incDependencies();
22339 if (!DepDest->isScheduled())
22340 BundleMember->incrementUnscheduledDeps(1);
22341 if (!DepDest->hasValidDependencies() ||
22342 (InsertInReadyList && DepDest->isReady()))
22343 WorkList.push_back(DepDest);
22344 };
22345
22346 // Any instruction which isn't safe to speculate at the beginning of the
22347 // block is control dependent on any early exit or non-willreturn call
22348 // which precedes it.
22349 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
22350 for (Instruction *I = BundleMember->getInst()->getNextNode();
22351 I != ScheduleEnd; I = I->getNextNode()) {
22352 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
22353 continue;
22354
22355 // Add the dependency
22356 MakeControlDependent(I);
22357
22358 if (!isGuaranteedToTransferExecutionToSuccessor(I))
22359 // Everything past here must be control dependent on I.
22360 break;
22361 }
22362 }
22363
22364 if (RegionHasStackSave) {
22365 // If we have an inalloca alloca instruction, it needs to be scheduled
22366 // after any preceding stacksave. We also need to prevent any alloca
22367 // from reordering above a preceding stackrestore.
22368 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
22369 match(BundleMember->getInst(),
22370 m_Intrinsic<Intrinsic::stackrestore>())) {
22371 for (Instruction *I = BundleMember->getInst()->getNextNode();
22372 I != ScheduleEnd; I = I->getNextNode()) {
22373 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22374 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22375 // Any allocas past here must be control dependent on I, and I
22376 // must be memory dependent on BundleMember->Inst.
22377 break;
22378
22379 if (!isa<AllocaInst>(I))
22380 continue;
22381
22382 // Add the dependency
22383 MakeControlDependent(I);
22384 }
22385 }
22386
22387 // In addition to the cases handled just above, we need to prevent
22388 // allocas and loads/stores from moving below a stacksave or a
22389 // stackrestore. Avoiding moving allocas below a stackrestore is currently
22390 // thought to be conservative. Moving loads/stores below a stackrestore
22391 // can lead to incorrect code.
22392 if (isa<AllocaInst>(BundleMember->getInst()) ||
22393 BundleMember->getInst()->mayReadOrWriteMemory()) {
22394 for (Instruction *I = BundleMember->getInst()->getNextNode();
22395 I != ScheduleEnd; I = I->getNextNode()) {
22396 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
22397 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22398 continue;
22399
22400 // Add the dependency
22401 MakeControlDependent(I);
22402 break;
22403 }
22404 }
22405 }
22406
22407 // Handle the memory dependencies (if any).
22408 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
22409 if (!NextLoadStore)
22410 return;
22411 Instruction *SrcInst = BundleMember->getInst();
22412 assert(SrcInst->mayReadOrWriteMemory() &&
22413 "NextLoadStore list for non memory effecting bundle?");
22414 MemoryLocation SrcLoc = getLocation(SrcInst);
22415 bool SrcMayWrite = SrcInst->mayWriteToMemory();
22416 unsigned NumAliased = 0;
22417 unsigned DistToSrc = 1;
22418 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
22419
22420 for (ScheduleData *DepDest = NextLoadStore; DepDest;
22421 DepDest = DepDest->getNextLoadStore()) {
22422 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
22423
22424 // We have two limits to reduce the complexity:
22425 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
22426 // SLP->isAliased (which is the expensive part in this loop).
22427 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
22428 // the whole loop (even if the loop is fast, it's quadratic).
22429 // It's important for the loop break condition (see below) to
22430 // check this limit even between two read-only instructions.
22431 if (DistToSrc >= MaxMemDepDistance ||
22432 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
22433 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
22434 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
22435
22436 // We increment the counter only if the locations are aliased
22437 // (instead of counting all alias checks). This gives a better
22438 // balance between reduced runtime and accurate dependencies.
22439 NumAliased++;
22440
22441 DepDest->addMemoryDependency(BundleMember);
22442 BundleMember->incDependencies();
22443 if (!DepDest->isScheduled())
22444 BundleMember->incrementUnscheduledDeps(1);
22445 if (!DepDest->hasValidDependencies() ||
22446 (InsertInReadyList && DepDest->isReady()))
22447 WorkList.push_back(DepDest);
22448 }
22449
22450 // Example, explaining the loop break condition: Let's assume our
22451 // starting instruction is i0 and MaxMemDepDistance = 3.
22452 //
22453 // +--------v--v--v
22454 // i0,i1,i2,i3,i4,i5,i6,i7,i8
22455 // +--------^--^--^
22456 //
22457 // MaxMemDepDistance let us stop alias-checking at i3 and we add
22458 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
22459 // Previously we already added dependencies from i3 to i6,i7,i8
22460 // (because of MaxMemDepDistance). As we added a dependency from
22461 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
22462 // and we can abort this loop at i6.
22463 if (DistToSrc >= 2 * MaxMemDepDistance)
22464 break;
22465 DistToSrc++;
22466 }
22467 };
22468
22469 assert((Bundle || !ControlDeps.empty()) &&
22470 "expected at least one instruction to schedule");
22471 if (Bundle)
22472 WorkList.push_back(Bundle.getBundle().front());
22473 WorkList.append(ControlDeps.begin(), ControlDeps.end());
22474 SmallPtrSet<ScheduleBundle *, 16> Visited;
22475 while (!WorkList.empty()) {
22476 ScheduleEntity *SD = WorkList.pop_back_val();
22477 SmallVector<ScheduleBundle *, 1> CopyableBundle;
22478 ArrayRef<ScheduleBundle *> Bundles;
22479 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
22480 CopyableBundle.push_back(&CD->getBundle());
22481 Bundles = CopyableBundle;
22482 } else {
22483 Bundles = getScheduleBundles(SD->getInst());
22484 }
22485 if (Bundles.empty()) {
22486 if (!SD->hasValidDependencies())
22487 ProcessNode(SD);
22488 if (InsertInReadyList && SD->isReady()) {
22489 ReadyInsts.insert(SD);
22490 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
22491 }
22492 continue;
22493 }
22494 for (ScheduleBundle *Bundle : Bundles) {
22495 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
22496 continue;
22497 assert(isInSchedulingRegion(*Bundle) &&
22498 "ScheduleData not in scheduling region");
22499 for_each(Bundle->getBundle(), ProcessNode);
22500 }
22501 if (InsertInReadyList && SD->isReady()) {
22502 for (ScheduleBundle *Bundle : Bundles) {
22503 assert(isInSchedulingRegion(*Bundle) &&
22504 "ScheduleData not in scheduling region");
22505 if (!Bundle->isReady())
22506 continue;
22507 ReadyInsts.insert(Bundle);
22508 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
22509 << "\n");
22510 }
22511 }
22512 }
22513}
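// calculateDependencies implements the counting half of a classic list
// scheduler: every entity records how many def-use, memory and control
// successors still have to be placed before it becomes ready, and those
// counters are decremented as entities get scheduled. A stripped-down sketch
// of the same bookkeeping over a generic DAG (hypothetical Node/listSchedule,
// not the real ScheduleData):
//
//   struct Node {
//     SmallVector<Node *> Succs; // nodes released once this one is placed
//     int UnscheduledDeps = 0;   // not-yet-placed predecessors
//   };
//
//   void listSchedule(ArrayRef<Node *> Nodes) {
//     // Assumes UnscheduledDeps was pre-initialized to the predecessor count,
//     // in the same way calculateDependencies uses incDependencies().
//     SmallVector<Node *> Ready;
//     for (Node *N : Nodes)
//       if (N->UnscheduledDeps == 0)
//         Ready.push_back(N);
//     while (!Ready.empty()) {
//       Node *N = Ready.pop_back_val();
//       // "Place" N here, then release its successors.
//       for (Node *S : N->Succs)
//         if (--S->UnscheduledDeps == 0)
//           Ready.push_back(S);
//     }
//   }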
22514
22515void BoUpSLP::BlockScheduling::resetSchedule() {
22516 assert(ScheduleStart &&
22517 "tried to reset schedule on block which has not been scheduled");
22518 for_each(ScheduleDataMap, [&](auto &P) {
22519 if (BB != P.first->getParent())
22520 return;
22521 ScheduleData *SD = P.second;
22522 if (isInSchedulingRegion(*SD)) {
22523 SD->setScheduled(/*Scheduled=*/false);
22524 SD->resetUnscheduledDeps();
22525 }
22526 });
22527 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
22528 for_each(P.second, [&](ScheduleCopyableData *SD) {
22529 if (isInSchedulingRegion(*SD)) {
22530 SD->setScheduled(/*Scheduled=*/false);
22531 SD->resetUnscheduledDeps();
22532 }
22533 });
22534 });
22535 for_each(ScheduledBundles, [&](auto &P) {
22536 for_each(P.second, [&](ScheduleBundle *Bundle) {
22537 if (isInSchedulingRegion(*Bundle))
22538 Bundle->setScheduled(/*Scheduled=*/false);
22539 });
22540 });
22541 // Reset schedule data for copyable elements.
22542 for (auto &P : ScheduleCopyableDataMap) {
22543 if (isInSchedulingRegion(*P.second)) {
22544 P.second->setScheduled(/*Scheduled=*/false);
22545 P.second->resetUnscheduledDeps();
22546 }
22547 }
22548 ReadyInsts.clear();
22549}
22550
22551void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
22552 if (!BS->ScheduleStart)
22553 return;
22554
22555 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
22556
22557 // A key point - if we got here, pre-scheduling was able to find a valid
22558 // scheduling of the sub-graph of the scheduling window which consists
22559 // of all vector bundles and their transitive users. As such, we do not
22560 // need to reschedule anything *outside of* that subgraph.
22561
22562 BS->resetSchedule();
22563
22564 // For the real scheduling we use a more sophisticated ready-list: it is
22565 // sorted by the original instruction location. This lets the final schedule
22566 // be as close as possible to the original instruction order.
22567 // WARNING: If changing this order causes a correctness issue, that means
22568 // there is some missing dependence edge in the schedule data graph.
22569 struct ScheduleDataCompare {
22570 bool operator()(const ScheduleEntity *SD1,
22571 const ScheduleEntity *SD2) const {
22572 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
22573 }
22574 };
22575 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
22576
22577 // Ensure that all dependency data is updated (for nodes in the sub-graph)
22578 // and fill the ready-list with initial instructions.
22579 int Idx = 0;
22580 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22581 I = I->getNextNode()) {
22582 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22583 if (!Bundles.empty()) {
22584 for (ScheduleBundle *Bundle : Bundles) {
22585 Bundle->setSchedulingPriority(Idx++);
22586 if (!Bundle->hasValidDependencies())
22587 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
22588 }
22589 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
22590 for (ScheduleCopyableData *SD : reverse(SDs)) {
22591 ScheduleBundle &Bundle = SD->getBundle();
22592 Bundle.setSchedulingPriority(Idx++);
22593 if (!Bundle.hasValidDependencies())
22594 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22595 }
22596 continue;
22597 }
22598 SmallVector<ScheduleCopyableData *> CopyableData =
22599 BS->getScheduleCopyableDataUsers(I);
22600 if (ScheduleData *SD = BS->getScheduleData(I)) {
22601 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
22602 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
22603 SDTEs.front()->doesNotNeedToSchedule() ||
22605 "scheduler and vectorizer bundle mismatch");
22606 SD->setSchedulingPriority(Idx++);
22607 if (!SD->hasValidDependencies() &&
22608 (!CopyableData.empty() ||
22609 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
22610 assert(TE->isGather() && "expected gather node");
22611 return TE->hasState() && TE->hasCopyableElements() &&
22612 TE->isCopyableElement(I);
22613 }))) {
22614 // Need to calculate deps for these nodes to correctly handle copyable
22615 // dependencies, even if they were cancelled.
22616 // If the copyables bundle was cancelled, the deps were cleared and need
22617 // to be recalculated.
22618 ScheduleBundle Bundle;
22619 Bundle.add(SD);
22620 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22621 }
22622 }
22623 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
22624 ScheduleBundle &Bundle = SD->getBundle();
22625 Bundle.setSchedulingPriority(Idx++);
22626 if (!Bundle.hasValidDependencies())
22627 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22628 }
22629 }
22630 BS->initialFillReadyList(ReadyInsts);
22631
22632 Instruction *LastScheduledInst = BS->ScheduleEnd;
22633
22634 // Do the "real" scheduling.
22635 SmallPtrSet<Instruction *, 16> Scheduled;
22636 while (!ReadyInsts.empty()) {
22637 auto *Picked = *ReadyInsts.begin();
22638 ReadyInsts.erase(ReadyInsts.begin());
22639
22640 // Move the scheduled instruction(s) to their dedicated places, if not
22641 // there yet.
22642 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
22643 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
22644 Instruction *PickedInst = BundleMember->getInst();
22645 // If a copyable must be scheduled as part of something else, skip it.
22646 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
22647 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
22648 (!IsCopyable && !Scheduled.insert(PickedInst).second))
22649 continue;
22650 if (PickedInst->getNextNode() != LastScheduledInst)
22651 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22652 LastScheduledInst = PickedInst;
22653 }
22654 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
22655 LastScheduledInst);
22656 } else {
22657 auto *SD = cast<ScheduleData>(Picked);
22658 Instruction *PickedInst = SD->getInst();
22659 if (PickedInst->getNextNode() != LastScheduledInst)
22660 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22661 LastScheduledInst = PickedInst;
22662 }
22663 auto Invalid = InstructionsState::invalid();
22664 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
22665 }
22666
22667 // Check that we didn't break any of our invariants.
22668#ifdef EXPENSIVE_CHECKS
22669 BS->verify();
22670#endif
22671
22672#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
22673 // Check that all schedulable entities got scheduled
22674 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22675 I = I->getNextNode()) {
22676 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22677 assert(all_of(Bundles,
22678 [](const ScheduleBundle *Bundle) {
22679 return Bundle->isScheduled();
22680 }) &&
22681 "must be scheduled at this point");
22682 }
22683#endif
22684
22685 // Avoid duplicate scheduling of the block.
22686 BS->ScheduleStart = nullptr;
22687}
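// ReadyInsts above is an ordered std::set used as a priority queue: the key is
// getSchedulingPriority(), i.e. the index assigned in original program order,
// and ScheduleDataCompare ranks the highest index first, so
// *ReadyInsts.begin() is always the ready entity that sat lowest in the
// original block, which keeps the emitted schedule as close as possible to the
// original order (see the WARNING above). A minimal model of the pattern
// (hypothetical Item/ByOrigIdx, not the real ScheduleEntity):
//
//   struct Item { unsigned OrigIdx; };
//   struct ByOrigIdx {
//     bool operator()(const Item *A, const Item *B) const {
//       return A->OrigIdx > B->OrigIdx; // largest original index first
//     }
//   };
//   std::set<Item *, ByOrigIdx> Ready; // *Ready.begin() is picked next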
22688
22689unsigned BoUpSLP::getVectorElementSize(Value *V) {
22690 // If V is a store, just return the width of the stored value (or value
22691 // truncated just before storing) without traversing the expression tree.
22692 // This is the common case.
22693 if (auto *Store = dyn_cast<StoreInst>(V))
22694 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
22695
22696 if (auto *IEI = dyn_cast<InsertElementInst>(V))
22697 return getVectorElementSize(IEI->getOperand(1));
22698
22699 auto E = InstrElementSize.find(V);
22700 if (E != InstrElementSize.end())
22701 return E->second;
22702
22703 // If V is not a store, we can traverse the expression tree to find loads
22704 // that feed it. The type of the loaded value may indicate a more suitable
22705 // width than V's type. We want to base the vector element size on the width
22706 // of memory operations where possible.
22707 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
22708 SmallPtrSet<Instruction *, 16> Visited;
22709 if (auto *I = dyn_cast<Instruction>(V)) {
22710 Worklist.emplace_back(I, I->getParent(), 0);
22711 Visited.insert(I);
22712 }
22713
22714 // Traverse the expression tree in bottom-up order looking for loads. If we
22715 // encounter an instruction we don't yet handle, we give up.
22716 auto Width = 0u;
22717 Value *FirstNonBool = nullptr;
22718 while (!Worklist.empty()) {
22719 auto [I, Parent, Level] = Worklist.pop_back_val();
22720
22721 // We should only be looking at scalar instructions here. If the current
22722 // instruction has a vector type, skip.
22723 auto *Ty = I->getType();
22724 if (isa<VectorType>(Ty))
22725 continue;
22726 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
22727 FirstNonBool = I;
22728 if (Level > RecursionMaxDepth)
22729 continue;
22730
22731 // If the current instruction is a load, update MaxWidth to reflect the
22732 // width of the loaded value.
22733 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
22734 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
22735
22736 // Otherwise, we need to visit the operands of the instruction. We only
22737 // handle the interesting cases from buildTree here. If an operand is an
22738 // instruction we haven't yet visited and from the same basic block as the
22739 // user or the use is a PHI node, we add it to the worklist.
22740 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
22741 BinaryOperator, UnaryOperator>(I)) {
22742 for (Use &U : I->operands()) {
22743 if (auto *J = dyn_cast<Instruction>(U.get()))
22744 if (Visited.insert(J).second &&
22745 (isa<PHINode>(I) || J->getParent() == Parent)) {
22746 Worklist.emplace_back(J, J->getParent(), Level + 1);
22747 continue;
22748 }
22749 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
22750 FirstNonBool = U.get();
22751 }
22752 } else {
22753 break;
22754 }
22755 }
22756
22757 // If we didn't encounter a memory access in the expression tree, or if we
22758 // gave up for some reason, just return the width of V. Otherwise, return the
22759 // maximum width we found.
22760 if (!Width) {
22761 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
22762 V = FirstNonBool;
22763 Width = DL->getTypeSizeInBits(V->getType());
22764 }
22765
22766 for (Instruction *I : Visited)
22767 InstrElementSize[I] = Width;
22768
22769 return Width;
22770}
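// For example (illustrative IR, not taken from a test):
//
//   %a = load i16, ptr %p
//   %b = sext i16 %a to i64
//   %c = add i64 %b, 7
//   store i64 %c, ptr %q
//
// getVectorElementSize(%c) walks up from the add through the sext, finds the
// i16 load and returns 16, while calling it on the store returns 64 (the width
// of the stored value). Basing the element size on the narrower memory access
// lets the caller pick a larger vectorization factor.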
22771
22772bool BoUpSLP::collectValuesToDemote(
22773 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
22774 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
22775 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
22776 bool &IsProfitableToDemote, bool IsTruncRoot) const {
22777 // We can always demote constants.
22778 if (all_of(E.Scalars, IsaPred<Constant>))
22779 return true;
22780
22781 unsigned OrigBitWidth =
22782 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
22783 if (OrigBitWidth == BitWidth) {
22784 MaxDepthLevel = 1;
22785 return true;
22786 }
22787
22788 // Check if the node was analyzed already and must keep its original bitwidth.
22789 if (NodesToKeepBWs.contains(E.Idx))
22790 return false;
22791
22792 // If the value is not a vectorized instruction in the expression and not used
22793 // by the insertelement instruction and not used in multiple vector nodes, it
22794 // cannot be demoted.
22795 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
22796 if (isa<PoisonValue>(R))
22797 return false;
22798 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22799 });
22800 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
22801 if (isa<PoisonValue>(V))
22802 return true;
22803 if (getTreeEntries(V).size() > 1)
22804 return false;
22805 // For the last shuffle of sext/zext with many uses, we need to check the
22806 // extra bit for unsigned values; otherwise we may have incorrect casting
22807 // for reused scalars.
22808 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
22809 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22810 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22811 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22812 return true;
22813 }
22814 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22815 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22816 if (IsSignedNode)
22817 ++BitWidth1;
22818 if (auto *I = dyn_cast<Instruction>(V)) {
22819 APInt Mask = DB->getDemandedBits(I);
22820 unsigned BitWidth2 =
22821 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22822 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22823 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22824 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22825 break;
22826 BitWidth2 *= 2;
22827 }
22828 BitWidth1 = std::min(BitWidth1, BitWidth2);
22829 }
22830 BitWidth = std::max(BitWidth, BitWidth1);
22831 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22832 };
22833 auto FinalAnalysis = [&, TTI = TTI]() {
22834 if (!IsProfitableToDemote)
22835 return false;
22836 bool Res = all_of(
22837 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22838 // Demote gathers.
22839 if (Res && E.isGather()) {
22840 if (E.hasState()) {
22841 if (const TreeEntry *SameTE =
22842 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22843 SameTE)
22844 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22845 ToDemote, Visited, NodesToKeepBWs,
22846 MaxDepthLevel, IsProfitableToDemote,
22847 IsTruncRoot)) {
22848 ToDemote.push_back(E.Idx);
22849 return true;
22850 }
22851 }
22852 // Check possible extractelement instructions bases and final vector
22853 // length.
22854 SmallPtrSet<Value *, 4> UniqueBases;
22855 for (Value *V : E.Scalars) {
22856 auto *EE = dyn_cast<ExtractElementInst>(V);
22857 if (!EE)
22858 continue;
22859 UniqueBases.insert(EE->getVectorOperand());
22860 }
22861 const unsigned VF = E.Scalars.size();
22862 Type *OrigScalarTy = E.Scalars.front()->getType();
22863 if (UniqueBases.size() <= 2 ||
22864 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22865 ::getNumberOfParts(
22866 *TTI,
22867 getWidenedType(
22868 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22869 VF))) {
22870 ToDemote.push_back(E.Idx);
22871 return true;
22872 }
22873 }
22874 return Res;
22875 };
22876 if (E.isGather() || !Visited.insert(&E).second ||
22877 any_of(E.Scalars, [&](Value *V) {
22878 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22879 return isa<InsertElementInst>(U) && !isVectorized(U);
22880 });
22881 }))
22882 return FinalAnalysis();
22883
22884 if (any_of(E.Scalars, [&](Value *V) {
22885 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22886 return isVectorized(U) ||
22887 (E.Idx == 0 && UserIgnoreList &&
22888 UserIgnoreList->contains(U)) ||
22889 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22890 !U->getType()->isScalableTy() &&
22891 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22892 }) && !IsPotentiallyTruncated(V, BitWidth);
22893 }))
22894 return false;
22895
22896 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22897 bool &NeedToExit) {
22898 NeedToExit = false;
22899 unsigned InitLevel = MaxDepthLevel;
22900 for (const TreeEntry *Op : Operands) {
22901 unsigned Level = InitLevel;
22902 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22903 ToDemote, Visited, NodesToKeepBWs, Level,
22904 IsProfitableToDemote, IsTruncRoot)) {
22905 if (!IsProfitableToDemote)
22906 return false;
22907 NeedToExit = true;
22908 if (!FinalAnalysis())
22909 return false;
22910 continue;
22911 }
22912 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22913 }
22914 return true;
22915 };
22916 auto AttemptCheckBitwidth =
22917 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22918 // Try all bitwidth < OrigBitWidth.
22919 NeedToExit = false;
22920 unsigned BestFailBitwidth = 0;
22921 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22922 if (Checker(BitWidth, OrigBitWidth))
22923 return true;
22924 if (BestFailBitwidth == 0 && FinalAnalysis())
22925 BestFailBitwidth = BitWidth;
22926 }
22927 if (BitWidth >= OrigBitWidth) {
22928 if (BestFailBitwidth == 0) {
22929 BitWidth = OrigBitWidth;
22930 return false;
22931 }
22932 MaxDepthLevel = 1;
22933 BitWidth = BestFailBitwidth;
22934 NeedToExit = true;
22935 return true;
22936 }
22937 return false;
22938 };
22939 auto TryProcessInstruction =
22940 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22941 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22942 if (Operands.empty()) {
22943 if (!IsTruncRoot)
22944 MaxDepthLevel = 1;
22945 for (Value *V : E.Scalars)
22946 (void)IsPotentiallyTruncated(V, BitWidth);
22947 } else {
22948 // If the value has several vectorized uses, check whether we can truncate
22949 // it; otherwise - exit.
22950 if (any_of(E.Scalars, [&](Value *V) {
22951 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22952 }))
22953 return false;
22954 bool NeedToExit = false;
22955 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22956 return false;
22957 if (NeedToExit)
22958 return true;
22959 if (!ProcessOperands(Operands, NeedToExit))
22960 return false;
22961 if (NeedToExit)
22962 return true;
22963 }
22964
22965 ++MaxDepthLevel;
22966 // Record the entry that we can demote.
22967 ToDemote.push_back(E.Idx);
22968 return IsProfitableToDemote;
22969 };
22970
22971 if (E.State == TreeEntry::SplitVectorize)
22972 return TryProcessInstruction(
22973 BitWidth,
22974 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22975 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22976
22977 if (E.isAltShuffle()) {
22978 // Combining these opcodes may lead to incorrect analysis, skip for now.
22979 auto IsDangerousOpcode = [](unsigned Opcode) {
22980 switch (Opcode) {
22981 case Instruction::Shl:
22982 case Instruction::AShr:
22983 case Instruction::LShr:
22984 case Instruction::UDiv:
22985 case Instruction::SDiv:
22986 case Instruction::URem:
22987 case Instruction::SRem:
22988 return true;
22989 default:
22990 break;
22991 }
22992 return false;
22993 };
22994 if (IsDangerousOpcode(E.getAltOpcode()))
22995 return FinalAnalysis();
22996 }
22997
22998 switch (E.getOpcode()) {
22999
23000 // We can always demote truncations and extensions. Since truncations can
23001 // seed additional demotion, we save the truncated value.
23002 case Instruction::Trunc:
23003 if (IsProfitableToDemoteRoot)
23004 IsProfitableToDemote = true;
23005 return TryProcessInstruction(BitWidth);
23006 case Instruction::ZExt:
23007 case Instruction::SExt:
23008 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
23009 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
23010 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
23011 return false;
23012 IsProfitableToDemote = true;
23013 return TryProcessInstruction(BitWidth);
23014
23015 // We can demote certain binary operations if we can demote both of their
23016 // operands.
23017 case Instruction::Add:
23018 case Instruction::Sub:
23019 case Instruction::Mul:
23020 case Instruction::And:
23021 case Instruction::Or:
23022 case Instruction::Xor: {
23023 return TryProcessInstruction(
23024 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
23025 }
23026 case Instruction::Freeze:
23027 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
23028 case Instruction::Shl: {
23029 // If we are truncating the result of this SHL, and if it's a shift of an
23030 // in-range amount, we can always perform a SHL in a smaller type.
23031 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
23032 return all_of(E.Scalars, [&](Value *V) {
23033 if (isa<PoisonValue>(V))
23034 return true;
23035 if (E.isCopyableElement(V))
23036 return true;
23037 auto *I = cast<Instruction>(V);
23038 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23039 return AmtKnownBits.getMaxValue().ult(BitWidth);
23040 });
23041 };
23042 return TryProcessInstruction(
23043 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
23044 }
23045 case Instruction::LShr: {
23046 // If this is a truncate of a logical shr, we can truncate it to a smaller
23047 // lshr iff we know that the bits we would otherwise be shifting in are
23048 // already zeros.
23049 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23050 return all_of(E.Scalars, [&](Value *V) {
23051 if (isa<PoisonValue>(V))
23052 return true;
23053 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23054 if (E.isCopyableElement(V))
23055 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
23056 auto *I = cast<Instruction>(V);
23057 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23058 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23059 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
23060 SimplifyQuery(*DL));
23061 });
23062 };
23063 return TryProcessInstruction(
23064 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23065 LShrChecker);
23066 }
23067 case Instruction::AShr: {
23068 // If this is a truncate of an arithmetic shr, we can truncate it to a
23069 // smaller ashr iff we know that all the bits from the sign bit of the
23070 // original type and the sign bit of the truncate type are similar.
23071 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23072 return all_of(E.Scalars, [&](Value *V) {
23073 if (isa<PoisonValue>(V))
23074 return true;
23075 auto *I = cast<Instruction>(V);
23076 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23077 unsigned ShiftedBits = OrigBitWidth - BitWidth;
23078 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23079 ShiftedBits <
23080 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23081 });
23082 };
23083 return TryProcessInstruction(
23084 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23085 AShrChecker);
23086 }
23087 case Instruction::UDiv:
23088 case Instruction::URem: {
23089 // UDiv and URem can be truncated if all the truncated bits are zero.
23090 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23091 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23092 return all_of(E.Scalars, [&](Value *V) {
23093 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23094 if (E.hasCopyableElements() && E.isCopyableElement(V))
23095 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23096 auto *I = cast<Instruction>(V);
23097 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
23098 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23099 });
23100 };
23101 return TryProcessInstruction(
23102 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
23103 }
23104
23105 // We can demote selects if we can demote their true and false values.
23106 case Instruction::Select: {
23107 return TryProcessInstruction(
23108 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
23109 }
23110
23111 // We can demote phis if we can demote all their incoming operands.
23112 case Instruction::PHI: {
23113 const unsigned NumOps = E.getNumOperands();
23114 SmallVector<const TreeEntry *> Ops(NumOps);
23115 transform(seq<unsigned>(0, NumOps), Ops.begin(),
23116 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
23117
23118 return TryProcessInstruction(BitWidth, Ops);
23119 }
23120
23121 case Instruction::Call: {
23122 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
23123 if (!IC)
23124 break;
23125 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
23126 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
23127 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
23128 break;
23129 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
23130 function_ref<bool(unsigned, unsigned)> CallChecker;
23131 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23132 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23133 return all_of(E.Scalars, [&](Value *V) {
23134 auto *I = cast<Instruction>(V);
23135 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
23136 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23137 return MaskedValueIsZero(I->getOperand(0), Mask,
23138 SimplifyQuery(*DL)) &&
23139 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23140 }
23141 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
23142 "Expected min/max intrinsics only.");
23143 unsigned SignBits = OrigBitWidth - BitWidth;
23144 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23145 unsigned Op0SignBits =
23146 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23147 unsigned Op1SignBits =
23148 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
23149 return SignBits <= Op0SignBits &&
23150 ((SignBits != Op0SignBits &&
23151 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23152 MaskedValueIsZero(I->getOperand(0), Mask,
23153 SimplifyQuery(*DL))) &&
23154 SignBits <= Op1SignBits &&
23155 ((SignBits != Op1SignBits &&
23156 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
23157 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
23158 });
23159 };
23160 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23161 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23162 return all_of(E.Scalars, [&](Value *V) {
23163 auto *I = cast<Instruction>(V);
23164 unsigned SignBits = OrigBitWidth - BitWidth;
23165 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23166 unsigned Op0SignBits =
23167 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23168 return SignBits <= Op0SignBits &&
23169 ((SignBits != Op0SignBits &&
23170 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23171 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
23172 });
23173 };
23174 if (ID != Intrinsic::abs) {
23175 Operands.push_back(getOperandEntry(&E, 1));
23176 CallChecker = CompChecker;
23177 } else {
23178 CallChecker = AbsChecker;
23179 }
23180 InstructionCost BestCost =
23181 std::numeric_limits<InstructionCost::CostType>::max();
23182 unsigned BestBitWidth = BitWidth;
23183 unsigned VF = E.Scalars.size();
23184 // Choose the best bitwidth based on cost estimations.
23185 auto Checker = [&](unsigned BitWidth, unsigned) {
23186 unsigned MinBW = PowerOf2Ceil(BitWidth);
23187 SmallVector<Type *> ArgTys =
23188 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
23189 auto VecCallCosts = getVectorCallCosts(
23190 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
23191 TTI, TLI, ArgTys);
23192 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
23193 if (Cost < BestCost) {
23194 BestCost = Cost;
23195 BestBitWidth = BitWidth;
23196 }
23197 return false;
23198 };
23199 [[maybe_unused]] bool NeedToExit;
23200 (void)AttemptCheckBitwidth(Checker, NeedToExit);
23201 BitWidth = BestBitWidth;
23202 return TryProcessInstruction(BitWidth, Operands, CallChecker);
23203 }
23204
23205 // Otherwise, conservatively give up.
23206 default:
23207 break;
23208 }
23209 MaxDepthLevel = 1;
23210 return FinalAnalysis();
23211}
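// A worked example of the sign-bit arithmetic in IsPotentiallyTruncated above
// (illustrative): for V = add i32 (zext i8 %x to i32), (zext i8 %y to i32) the
// result fits in 9 bits, so ComputeNumSignBits reports at least 23 sign bits
// and BitWidth1 = 32 - 23 = 9; both operands are known non-negative, so no
// extra sign bit is added. Since 32 >= 2 * 9 the value is accepted as
// truncatable, and after bit_ceil in computeMinimumValueSizes the chain can be
// emitted in i16 instead of i32 (or even i8 if the demanded bits of the users
// shrink BitWidth2 down to 8).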
23212
23213static RecurKind getRdxKind(Value *V);
23214
23215void BoUpSLP::computeMinimumValueSizes() {
23216 // We only attempt to truncate integer expressions.
23217 bool IsStoreOrInsertElt =
23218 VectorizableTree.front()->hasState() &&
23219 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
23220 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
23221 if ((IsStoreOrInsertElt || UserIgnoreList) &&
23222 ExtraBitWidthNodes.size() <= 1 &&
23223 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
23224 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
23225 return;
23226
23227 unsigned NodeIdx = 0;
23228 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
23229 NodeIdx = 1;
23230
23231 // Ensure the roots of the vectorizable tree don't form a cycle.
23232 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
23233 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
23234 "Unexpected tree is graph.");
23235
23236 // If the first value node for a store/insertelement is sext/zext/trunc, skip
23237 // it and resize to the final type.
23238 bool IsTruncRoot = false;
23239 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
23240 SmallVector<unsigned> RootDemotes;
23241 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
23242 if (NodeIdx != 0 &&
23243 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23244 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23245 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
23246 IsTruncRoot = true;
23247 RootDemotes.push_back(NodeIdx);
23248 IsProfitableToDemoteRoot = true;
23249 ++NodeIdx;
23250 }
23251
23252 // The reduction was already analyzed and found not profitable - exit.
23253 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
23254 return;
23255
23256 SmallVector<unsigned> ToDemote;
23257 auto ComputeMaxBitWidth =
23258 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
23259 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
23260 ToDemote.clear();
23261 // If the root is a trunc and the next node is a gather/buildvector, keep
23262 // the trunc in scalars, which is free in most cases.
23263 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
23264 !NodesToKeepBWs.contains(E.Idx) &&
23265 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
23266 all_of(E.Scalars, [&](Value *V) {
23267 return V->hasOneUse() || isa<Constant>(V) ||
23268 (!V->hasNUsesOrMore(UsesLimit) &&
23269 none_of(V->users(), [&](User *U) {
23270 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
23271 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23272 if (TEs.empty() || is_contained(TEs, UserTE))
23273 return false;
23274 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23275 SelectInst>(U) ||
23276 isa<SIToFPInst, UIToFPInst>(U) ||
23277 (UserTE->hasState() &&
23278 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23279 SelectInst>(UserTE->getMainOp()) ||
23280 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
23281 return true;
23282 unsigned UserTESz = DL->getTypeSizeInBits(
23283 UserTE->Scalars.front()->getType());
23284 if (all_of(TEs, [&](const TreeEntry *TE) {
23285 auto It = MinBWs.find(TE);
23286 return It != MinBWs.end() &&
23287 It->second.first > UserTESz;
23288 }))
23289 return true;
23290 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
23291 }));
23292 })) {
23293 ToDemote.push_back(E.Idx);
23294 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23295 auto It = MinBWs.find(UserTE);
23296 if (It != MinBWs.end())
23297 return It->second.first;
23298 unsigned MaxBitWidth =
23299 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
23300 MaxBitWidth = bit_ceil(MaxBitWidth);
23301 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23302 MaxBitWidth = 8;
23303 return MaxBitWidth;
23304 }
23305
23306 if (!E.hasState())
23307 return 0u;
23308
23309 unsigned VF = E.getVectorFactor();
23310 Type *ScalarTy = E.Scalars.front()->getType();
23311 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
23312 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
23313 if (!TreeRootIT)
23314 return 0u;
23315
23316 if (any_of(E.Scalars,
23317 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
23318 return 0u;
23319
23320 unsigned NumParts = ::getNumberOfParts(
23321 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
23322
23323 // The maximum bit width required to represent all the values that can be
23324 // demoted without loss of precision. It would be safe to truncate the roots
23325 // of the expression to this width.
23326 unsigned MaxBitWidth = 1u;
23327
23328 // True if the roots can be zero-extended back to their original type,
23329 // rather than sign-extended. We know that if the leading bits are not
23330 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
23331 // True.
23332 // Determine if the sign bit of all the roots is known to be zero. If not,
23333 // IsKnownPositive is set to False.
23334 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
23335 if (isa<PoisonValue>(R))
23336 return true;
23337 KnownBits Known = computeKnownBits(R, *DL);
23338 return Known.isNonNegative();
23339 });
23340
23341 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
23342 E.UserTreeIndex.UserTE->hasState() &&
23343 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
23344 MaxBitWidth =
23345 std::min(DL->getTypeSizeInBits(
23346 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
23347 DL->getTypeSizeInBits(ScalarTy));
23348
23349 // We first check if all the bits of the roots are demanded. If they're not,
23350 // we can truncate the roots to this narrower type.
23351 for (Value *Root : E.Scalars) {
23352 if (isa<PoisonValue>(Root))
23353 continue;
23354 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
23355 TypeSize NumTypeBits =
23356 DL->getTypeSizeInBits(Root->getType()->getScalarType());
23357 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23358 // If we can't prove that the sign bit is zero, we must add one to the
23359 // maximum bit width to account for the unknown sign bit. This preserves
23360 // the existing sign bit so we can safely sign-extend the root back to the
23361 // original type. Otherwise, if we know the sign bit is zero, we will
23362 // zero-extend the root instead.
23363 //
23364 // FIXME: This is somewhat suboptimal, as there will be cases where adding
23365 // one to the maximum bit width will yield a larger-than-necessary
23366 // type. In general, we need to add an extra bit only if we can't
23367 // prove that the upper bit of the original type is equal to the
23368 // upper bit of the proposed smaller type. If these two bits are
23369 // the same (either zero or one) we know that sign-extending from
23370 // the smaller type will result in the same value. Here, since we
23371 // can't yet prove this, we are just making the proposed smaller
23372 // type larger to ensure correctness.
23373 if (!IsKnownPositive)
23374 ++BitWidth1;
23375
23376 auto *I = dyn_cast<Instruction>(Root);
23377 if (!I) {
23378 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
23379 continue;
23380 }
23381 APInt Mask = DB->getDemandedBits(I);
23382 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23383 MaxBitWidth =
23384 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
23385 }
23386
23387 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23388 MaxBitWidth = 8;
23389
23390 // If the original type is large but the reduced type does not improve
23391 // register usage - ignore it.
23392 if (NumParts > 1 &&
23393 NumParts ==
23394 ::getNumberOfParts(
23395 *TTI, getWidenedType(IntegerType::get(F->getContext(),
23396 bit_ceil(MaxBitWidth)),
23397 VF)))
23398 return 0u;
23399
23400 unsigned Opcode = E.getOpcode();
23401 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
23402 Opcode == Instruction::SExt ||
23403 Opcode == Instruction::ZExt || NumParts > 1;
23404 // Conservatively determine if we can actually truncate the roots of the
23405 // expression. Collect the values that can be demoted in ToDemote and
23406 // additional roots that require investigating in Roots.
23407 DenseSet<const TreeEntry *> Visited;
23408 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
23409 bool NeedToDemote = IsProfitableToDemote;
23410
23411 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
23412 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
23413 NeedToDemote, IsTruncRoot) ||
23414 (MaxDepthLevel <= Limit &&
23415 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
23416 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
23417 DL->getTypeSizeInBits(TreeRootIT) /
23418 DL->getTypeSizeInBits(
23419 E.getMainOp()->getOperand(0)->getType()) >
23420 2)))))
23421 return 0u;
23422 // Round MaxBitWidth up to the next power-of-two.
23423 MaxBitWidth = bit_ceil(MaxBitWidth);
23424
23425 return MaxBitWidth;
23426 };
23427
23428 // If we can truncate the root, we must collect additional values that might
23429 // be demoted as a result. That is, those seeded by truncations we will
23430 // modify.
23431 // Add reduction ops sizes, if any.
23432 if (UserIgnoreList &&
23433 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
23434 // Convert vector_reduce_add(ZExt(<n x i1>)) to
23435 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
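// For instance (illustrative IR):
//   %e = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
// counts the set lanes of %m and can instead be emitted as
//   %b = bitcast <8 x i1> %m to i8
//   %p = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %p to i32
// which is why ReductionBitWidth is forced to 1 on this path.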
23436 if (all_of(*UserIgnoreList,
23437 [](Value *V) {
23438 return isa<PoisonValue>(V) ||
23439 cast<Instruction>(V)->getOpcode() == Instruction::Add;
23440 }) &&
23441 VectorizableTree.front()->State == TreeEntry::Vectorize &&
23442 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
23443 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
23444 Builder.getInt1Ty()) {
23445 ReductionBitWidth = 1;
23446 } else {
23447 for (Value *V : *UserIgnoreList) {
23448 if (isa<PoisonValue>(V))
23449 continue;
23450 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
23451 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
23452 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23453 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
23454 ++BitWidth1;
23455 unsigned BitWidth2 = BitWidth1;
23457 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
23458 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23459 }
23460 ReductionBitWidth =
23461 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
23462 }
23463 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
23464 ReductionBitWidth = 8;
23465
23466 ReductionBitWidth = bit_ceil(ReductionBitWidth);
23467 }
23468 }
23469 bool IsTopRoot = NodeIdx == 0;
23470 while (NodeIdx < VectorizableTree.size() &&
23471 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23472 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23473 RootDemotes.push_back(NodeIdx);
23474 ++NodeIdx;
23475 IsTruncRoot = true;
23476 }
23477 bool IsSignedCmp = false;
23478 if (UserIgnoreList &&
23479 all_of(*UserIgnoreList,
23481 m_SMax(m_Value(), m_Value())))))
23482 IsSignedCmp = true;
23483 while (NodeIdx < VectorizableTree.size()) {
23484 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
23485 unsigned Limit = 2;
23486 if (IsTopRoot &&
23487 ReductionBitWidth ==
23488 DL->getTypeSizeInBits(
23489 VectorizableTree.front()->Scalars.front()->getType()))
23490 Limit = 3;
23491 unsigned MaxBitWidth = ComputeMaxBitWidth(
23492 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23493 IsTruncRoot, IsSignedCmp);
23494 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23495 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23496 ReductionBitWidth = bit_ceil(MaxBitWidth);
23497 else if (MaxBitWidth == 0)
23498 ReductionBitWidth = 0;
23499 }
23500
23501 for (unsigned Idx : RootDemotes) {
23502 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
23503 uint32_t OrigBitWidth =
23504 DL->getTypeSizeInBits(V->getType()->getScalarType());
23505 if (OrigBitWidth > MaxBitWidth) {
23506 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
23507 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23508 }
23509 return false;
23510 }))
23511 ToDemote.push_back(Idx);
23512 }
23513 RootDemotes.clear();
23514 IsTopRoot = false;
23515 IsProfitableToDemoteRoot = true;
23516
23517 if (ExtraBitWidthNodes.empty()) {
23518 NodeIdx = VectorizableTree.size();
23519 } else {
23520 unsigned NewIdx = 0;
23521 do {
23522 NewIdx = *ExtraBitWidthNodes.begin();
23523 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
23524 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23525 NodeIdx = NewIdx;
23526 IsTruncRoot =
23527 NodeIdx < VectorizableTree.size() &&
23528 VectorizableTree[NodeIdx]->UserTreeIndex &&
23529 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23530 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23531 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23532 Instruction::Trunc &&
23533 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23534 IsSignedCmp =
23535 NodeIdx < VectorizableTree.size() &&
23536 VectorizableTree[NodeIdx]->UserTreeIndex &&
23537 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23538 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23539 Instruction::ICmp &&
23540 any_of(
23541 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23542 [&](Value *V) {
23543 auto *IC = dyn_cast<ICmpInst>(V);
23544 return IC && (IC->isSigned() ||
23545 !isKnownNonNegative(IC->getOperand(0),
23546 SimplifyQuery(*DL)) ||
23547 !isKnownNonNegative(IC->getOperand(1),
23548 SimplifyQuery(*DL)));
23549 });
23550 }
23551
23552 // If the maximum bit width we compute is less than the width of the roots'
23553 // type, we can proceed with the narrowing. Otherwise, do nothing.
23554 if (MaxBitWidth == 0 ||
23555 MaxBitWidth >=
23556 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
23557 ->getBitWidth()) {
23558 if (UserIgnoreList)
23559 AnalyzedMinBWVals.insert_range(TreeRoot);
23560 NodesToKeepBWs.insert_range(ToDemote);
23561 continue;
23562 }
23563
23564 // Finally, map the values we can demote to the maximum bit width we
23565 // computed.
23566 for (unsigned Idx : ToDemote) {
23567 TreeEntry *TE = VectorizableTree[Idx].get();
23568 if (MinBWs.contains(TE))
23569 continue;
23570 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
23571 if (isa<PoisonValue>(R))
23572 return false;
23573 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23574 });
23575 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
23576 }
23577 }
23578}
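// End-to-end example of the minimum-bitwidth analysis (illustrative IR):
//
//   %l = load i8, ptr %p
//   %z = zext i8 %l to i32
//   %a = add i32 %z, 1
//   %t = trunc i32 %a to i8
//   store i8 %t, ptr %q
//
// For a store-seeded tree built from several such lanes, the trunc node is
// recorded as a root demote, the demanded bits of the trunc users limit
// MaxBitWidth for the add/zext chain to 8, and MinBWs maps those tree entries
// to (8, /*IsSigned=*/false), so the vectorized code can be emitted on
// <VF x i8> instead of <VF x i32>.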
23579
23580PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
23581 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
23582 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
23583 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
23584 auto *AA = &AM.getResult<AAManager>(F);
23585 auto *LI = &AM.getResult<LoopAnalysis>(F);
23586 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
23587 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
23588 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
23589 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
23590
23591 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
23592 if (!Changed)
23593 return PreservedAnalyses::all();
23594
23595 PreservedAnalyses PA;
23596 PA.preserveSet<CFGAnalyses>();
23597 return PA;
23598}
23599
23600bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
23601 TargetTransformInfo *TTI_,
23602 TargetLibraryInfo *TLI_, AAResults *AA_,
23603 LoopInfo *LI_, DominatorTree *DT_,
23604 AssumptionCache *AC_, DemandedBits *DB_,
23605 OptimizationRemarkEmitter *ORE_) {
23606 if (!RunSLPVectorization)
23607 return false;
23608 SE = SE_;
23609 TTI = TTI_;
23610 TLI = TLI_;
23611 AA = AA_;
23612 LI = LI_;
23613 DT = DT_;
23614 AC = AC_;
23615 DB = DB_;
23616 DL = &F.getDataLayout();
23617
23618 Stores.clear();
23619 GEPs.clear();
23620 bool Changed = false;
23621
23622 // If the target claims to have no vector registers don't attempt
23623 // vectorization.
23624 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
23625 LLVM_DEBUG(
23626 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
23627 return false;
23628 }
23629
23630 // Don't vectorize when the attribute NoImplicitFloat is used.
23631 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
23632 return false;
23633
23634 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
23635
23636 // Use the bottom up slp vectorizer to construct chains that start with
23637 // store instructions.
23638 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
23639
23640 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
23641 // delete instructions.
23642
23643 // Update DFS numbers now so that we can use them for ordering.
23644 DT->updateDFSNumbers();
23645
23646 // Scan the blocks in the function in post order.
23647 for (auto *BB : post_order(&F.getEntryBlock())) {
23648 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
23649 continue;
23650
23651 // Start new block - clear the list of reduction roots.
23652 R.clearReductionData();
23653 collectSeedInstructions(BB);
23654
23655 // Vectorize trees that end at stores.
23656 if (!Stores.empty()) {
23657 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
23658 << " underlying objects.\n");
23659 Changed |= vectorizeStoreChains(R);
23660 }
23661
23662 // Vectorize trees that end at reductions.
23663 Changed |= vectorizeChainsInBlock(BB, R);
23664
23665 // Vectorize the index computations of getelementptr instructions. This
23666 // is primarily intended to catch gather-like idioms ending at
23667 // non-consecutive loads.
23668 if (!GEPs.empty()) {
23669 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
23670 << " underlying objects.\n");
23671 Changed |= vectorizeGEPIndices(BB, R);
23672 }
23673 }
23674
23675 if (Changed) {
23676 R.optimizeGatherSequence();
23677 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
23678 }
23679 return Changed;
23680}
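// Note that Stores and GEPs are keyed by the underlying object of the pointer
// operand (see collectSeedInstructions below), so each vectorization attempt
// only mixes accesses that plausibly belong to the same object.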
23681
23682std::optional<bool>
23683SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
23684 unsigned Idx, unsigned MinVF,
23685 unsigned &Size) {
23686 Size = 0;
23687 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
23688 << "\n");
23689 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23690 unsigned VF = Chain.size();
23691
23692 if (!has_single_bit(Sz) ||
23693 !hasFullVectorsOrPowerOf2(
23694 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
23695 VF) ||
23696 VF < 2 || VF < MinVF) {
23697 // Check if vectorizing with a non-power-of-2 VF should be considered. At
23698 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
23699 // all vector lanes are used.
23700 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
23701 return false;
23702 }
23703
23704 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
23705 << "\n");
23706
23707 SetVector<Value *> ValOps;
23708 for (Value *V : Chain)
23709 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
23710 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
23711 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
23712 InstructionsState S = Analysis.buildInstructionsState(
23713 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
23714 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
23715 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
23716 bool IsAllowedSize =
23717 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
23718 ValOps.size()) ||
23719 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
23720 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23721 (!S.getMainOp()->isSafeToRemove() ||
23722 any_of(ValOps.getArrayRef(),
23723 [&](Value *V) {
23724 return !isa<ExtractElementInst>(V) &&
23725 (V->getNumUses() > Chain.size() ||
23726 any_of(V->users(), [&](User *U) {
23727 return !Stores.contains(U);
23728 }));
23729 }))) ||
23730 (ValOps.size() > Chain.size() / 2 && !S)) {
23731 Size = (!IsAllowedSize && S) ? 1 : 2;
23732 return false;
23733 }
23734 }
23735 if (R.isLoadCombineCandidate(Chain))
23736 return true;
23737 R.buildTree(Chain);
23738 // Check if the tree is tiny and the store itself or its value is not vectorized.
23739 if (R.isTreeTinyAndNotFullyVectorizable()) {
23740 if (R.isGathered(Chain.front()) ||
23741 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
23742 return std::nullopt;
23743 Size = R.getCanonicalGraphSize();
23744 return false;
23745 }
23746 if (R.isProfitableToReorder()) {
23747 R.reorderTopToBottom();
23748 R.reorderBottomToTop();
23749 }
23750 R.transformNodes();
23751 R.computeMinimumValueSizes();
23752
23753 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
23754 R.buildExternalUses();
23755
23756 Size = R.getCanonicalGraphSize();
23757 if (S && S.getOpcode() == Instruction::Load)
23758 Size = 2; // cut off masked gather small trees
23759 InstructionCost Cost = R.getTreeCost(TreeCost);
23760
23761 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
23762 if (Cost < -SLPCostThreshold) {
23763 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
23764
23765 using namespace ore;
23766
23767 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
23768 cast<StoreInst>(Chain[0]))
23769 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
23770 << " and with tree size "
23771 << NV("TreeSize", R.getTreeSize()));
23772
23773 R.vectorizeTree();
23774 return true;
23775 }
23776
23777 return false;
23778}
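// Return contract, roughly: std::nullopt means the chain's root was gathered
// or its value could not be scheduled (the caller records a non-schedulable VF
// range for it), false means the attempt was rejected or not profitable (with
// Size left as a hint for later attempts), and true means the chain was either
// vectorized or deliberately left to the load-combine patterns in the backend.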
23779
23780/// Checks if the quadratic mean (RMS) deviation of the tree sizes is small relative to the mean size (Dev * 96 < Mean * Mean, i.e. roughly within 10% of the mean).
23781static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
23782 bool First) {
23783 unsigned Num = 0;
23784 uint64_t Sum = std::accumulate(
23785 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23786 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23787 unsigned Size = First ? Val.first : Val.second;
23788 if (Size == 1)
23789 return V;
23790 ++Num;
23791 return V + Size;
23792 });
23793 if (Num == 0)
23794 return true;
23795 uint64_t Mean = Sum / Num;
23796 if (Mean == 0)
23797 return true;
23798 uint64_t Dev = std::accumulate(
23799 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23800 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23801 unsigned P = First ? Val.first : Val.second;
23802 if (P == 1)
23803 return V;
23804 return V + (P - Mean) * (P - Mean);
23805 }) /
23806 Num;
23807 return Dev * 96 / (Mean * Mean) == 0;
23808}
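// Worked example (illustrative values; sizes equal to 1 are ignored): tree
// sizes {3, 3, 3, 4} give Mean = 3 and Dev = 0, so the check passes; sizes
// {2, 8} give Mean = 5 and Dev = 9, and 9 * 96 / 25 != 0, so the sizes are
// considered too spread out.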
23809
23810namespace {
23811
23812/// A group of stores that we'll try to bundle together using vector ops.
23813/// They are ordered using the signed distance of their address operand to the
23814/// address of this group's BaseInstr.
23815class RelatedStoreInsts {
23816public:
23817 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23818 : AllStores(AllStores) {
23819 reset(BaseInstrIdx);
23820 }
23821
23822 void reset(unsigned NewBaseInstr) {
23823 assert(NewBaseInstr < AllStores.size() &&
23824 "Instruction index out of bounds");
23825 BaseInstrIdx = NewBaseInstr;
23826 Instrs.clear();
23827 insertOrLookup(NewBaseInstr, 0);
23828 }
23829
23830 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23831 /// \p PtrDist.
23832 /// Does nothing if there is already a store with that \p PtrDist.
23833 /// \returns The previously associated Instruction index, or std::nullopt
23834 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23835 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23836 return Inserted ? std::nullopt : std::make_optional(It->second);
23837 }
23838
23839 using DistToInstMap = std::map<int64_t, unsigned>;
23840 const DistToInstMap &getStores() const { return Instrs; }
23841
23842 /// If \p SI is related to this group of stores, return the distance of its
24843 /// pointer operand to that of the group's BaseInstr.
23844 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23845 ScalarEvolution &SE) const {
23846 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23847 return getPointersDiff(
23848 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23849 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23850 /*StrictCheck=*/true);
23851 }
23852
23853 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23854 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23855 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23856 int64_t DistFromCurBase) {
23857 DistToInstMap PrevSet = std::move(Instrs);
23858 reset(NewBaseInstIdx);
23859
23860 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23861 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23862 // reference.
23863 for (auto [Dist, InstIdx] : PrevSet) {
23864 if (InstIdx >= MinSafeIdx)
23865 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23866 }
23867 }
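// For example, a store previously recorded at distance D from the old base is
// re-inserted at distance D - DistFromCurBase from the new base, while stores
// whose instruction index is below MinSafeIdx are dropped.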
23868
23869 /// Remove all stores that have been vectorized from this group.
23870 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23871 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23872 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23873 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23874 });
23875
23876 // Get a forward iterator pointing after the last vectorized store and erase
23877 // all stores before it so we don't try to vectorize them again.
23878 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23879 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23880 }
23881
23882private:
23883 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23884 unsigned BaseInstrIdx;
23885
23886 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23887 DistToInstMap Instrs;
23888
23889 /// Reference to all the stores in the BB being analyzed.
23890 ArrayRef<StoreInst *> AllStores;
23891};
23892
23893} // end anonymous namespace
23894
23895bool SLPVectorizerPass::vectorizeStores(
23896 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23897 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23898 &Visited) {
23899 // We may run into multiple chains that merge into a single chain. We mark the
23900 // stores that we vectorized so that we don't visit the same store twice.
23901 BoUpSLP::ValueSet VectorizedStores;
23902 bool Changed = false;
23903
23904 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23905 int64_t PrevDist = -1;
23906 BoUpSLP::ValueList Operands;
23907 // Collect the chain into a list.
23908 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23909 auto &[Dist, InstIdx] = Data;
23910 if (Operands.empty() || Dist - PrevDist == 1) {
23911 Operands.push_back(Stores[InstIdx]);
23912 PrevDist = Dist;
23913 if (Idx != StoreSeq.size() - 1)
23914 continue;
23915 }
23916 llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
23917 Operands.clear();
23918 Operands.push_back(Stores[InstIdx]);
23919 PrevDist = Dist;
23920 });
23921
23922 if (Operands.size() <= 1 ||
23923 !Visited
23924 .insert({Operands.front(),
23925 cast<StoreInst>(Operands.front())->getValueOperand(),
23926 Operands.back(),
23927 cast<StoreInst>(Operands.back())->getValueOperand(),
23928 Operands.size()})
23929 .second)
23930 continue;
23931
23932 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23933 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23934 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23935
23936 unsigned MaxVF =
23937 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23938 auto *Store = cast<StoreInst>(Operands[0]);
23939 Type *StoreTy = Store->getValueOperand()->getType();
23940 Type *ValueTy = StoreTy;
23941 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23942 ValueTy = Trunc->getSrcTy();
23943 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23944 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23945 // we need to use the element types of StoreTy and ValueTy to retrieve the
23946 // VF and then transform it back.
23947 // Remember: VF is defined as the number of values we want to vectorize,
23948 // not the number of elements in the final vector.
23949 Type *StoreScalarTy = StoreTy->getScalarType();
23950 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23951 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23952 ValueTy->getScalarType()));
23953 MinVF /= getNumElements(StoreTy);
23954 MinVF = std::max<unsigned>(2, MinVF);
23955
23956 if (MaxVF < MinVF) {
23957 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23958 << ") < "
23959 << "MinVF (" << MinVF << ")\n");
23960 continue;
23961 }
23962
23963 unsigned NonPowerOf2VF = 0;
23964 if (VectorizeNonPowerOf2) {
23965 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23966 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23967 // lanes are used.
23968 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23969 if (has_single_bit(CandVF + 1)) {
23970 NonPowerOf2VF = CandVF;
23971 assert(NonPowerOf2VF != MaxVF &&
23972 "Non-power-of-2 VF should not be equal to MaxVF");
23973 }
23974 }
23975
23976 // MaxRegVF represents the number of instructions (scalar, or vector in
23977 // case of revec) that can be vectorized to naturally fit in a vector
23978 // register.
23979 unsigned MaxRegVF = MaxVF;
23980
23981 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23982 if (MaxVF < MinVF) {
23983 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23984 << ") < "
23985 << "MinVF (" << MinVF << ")\n");
23986 continue;
23987 }
23988
23989 SmallVector<unsigned> CandidateVFs;
23990 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23991 VF = divideCeil(VF, 2))
23992 CandidateVFs.push_back(VF);
23993
23994 unsigned End = Operands.size();
23995 unsigned Repeat = 0;
23996 constexpr unsigned MaxAttempts = 4;
23997 SmallVector<std::pair<unsigned, unsigned>> RangeSizesStorage(
23998 Operands.size(), {1, 1});
23999 // The `slice` and `drop_front` interfaces are convenient
24000 const auto RangeSizes = MutableArrayRef(RangeSizesStorage);
24001 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
24002 auto IsNotVectorized = [](bool First,
24003 const std::pair<unsigned, unsigned> &P) {
24004 return First ? P.first > 0 : P.second > 0;
24005 };
24006 auto IsVectorized = [](bool First,
24007 const std::pair<unsigned, unsigned> &P) {
24008 return First ? P.first == 0 : P.second == 0;
24009 };
24010 auto VFIsProfitable = [](bool First, unsigned Size,
24011 const std::pair<unsigned, unsigned> &P) {
24012 return First ? Size >= P.first : Size >= P.second;
24013 };
24014 auto FirstSizeSame = [](unsigned Size,
24015 const std::pair<unsigned, unsigned> &P) {
24016 return Size == P.first;
24017 };
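// Bookkeeping, roughly: each RangeSizes entry starts as {1, 1} and is reset to
// {0, 0} once the corresponding store has been vectorized; otherwise one of
// its two fields grows to the largest canonical graph size seen so far, which
// later attempts use for pruning via VFIsProfitable and FirstSizeSame above.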
24018 while (true) {
24019 ++Repeat;
24020 bool RepeatChanged = false;
24021 bool AnyProfitableGraph = false;
24022 for (unsigned VF : CandidateVFs) {
24023 AnyProfitableGraph = false;
24024 unsigned FirstUnvecStore =
24025 std::distance(RangeSizes.begin(),
24026 find_if(RangeSizes, std::bind(IsNotVectorized,
24027 VF >= MaxRegVF, _1)));
24028
24029 // Form slices of size VF starting from FirstUnvecStore and try to
24030 // vectorize them.
24031 while (FirstUnvecStore < End) {
24032 unsigned FirstVecStore = std::distance(
24033 RangeSizes.begin(),
24034 find_if(RangeSizes.drop_front(FirstUnvecStore),
24035 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
24036 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
24037 for (unsigned SliceStartIdx = FirstUnvecStore;
24038 SliceStartIdx + VF <= MaxSliceEnd;) {
24039 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
24040 VF >= MaxRegVF)) {
24041 ++SliceStartIdx;
24042 continue;
24043 }
24044 ArrayRef<Value *> Slice =
24045 ArrayRef(Operands).slice(SliceStartIdx, VF);
24046 assert(all_of(Slice,
24047 [&](Value *V) {
24048 return cast<StoreInst>(V)
24049 ->getValueOperand()
24050 ->getType() ==
24051 cast<StoreInst>(Slice.front())
24052 ->getValueOperand()
24053 ->getType();
24054 }) &&
24055 "Expected all operands of same type.");
24056 if (!NonSchedulable.empty()) {
24057 auto [NonSchedSizeMax, NonSchedSizeMin] =
24058 NonSchedulable.lookup(Slice.front());
24059 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24060 // VF is too ambitious. Try to vectorize another slice before
24061 // trying a smaller VF.
24062 SliceStartIdx += NonSchedSizeMax;
24063 continue;
24064 }
24065 }
24066 unsigned TreeSize;
24067 std::optional<bool> Res =
24068 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
24069 if (!Res) {
24070 // Update the range of non schedulable VFs for slices starting
24071 // at SliceStartIdx.
24072 NonSchedulable
24073 .try_emplace(Slice.front(), std::make_pair(VF, VF))
24074 .first->getSecond()
24075 .second = VF;
24076 } else if (*Res) {
24077 // Mark the vectorized stores so that we don't vectorize them
24078 // again.
24079 VectorizedStores.insert_range(Slice);
24080 // Record that this attempt both changed the IR and found a profitable
24081 // graph.
24082 AnyProfitableGraph = RepeatChanged = Changed = true;
24083 // If we vectorized initial block, no need to try to vectorize
24084 // it again.
24085 for (std::pair<unsigned, unsigned> &P :
24086 RangeSizes.slice(SliceStartIdx, VF))
24087 P.first = P.second = 0;
24088 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24089 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
24090 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
24091 P.first = P.second = 0;
24092 FirstUnvecStore = SliceStartIdx + VF;
24093 }
24094 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24095 for (std::pair<unsigned, unsigned> &P :
24096 RangeSizes.slice(SliceStartIdx + VF,
24097 MaxSliceEnd - (SliceStartIdx + VF)))
24098 P.first = P.second = 0;
24099 if (MaxSliceEnd == End)
24100 End = SliceStartIdx;
24101 MaxSliceEnd = SliceStartIdx;
24102 }
24103 SliceStartIdx += VF;
24104 continue;
24105 }
24106 if (VF > 2 && Res &&
24107 !all_of(RangeSizes.slice(SliceStartIdx, VF),
24108 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
24109 _1))) {
24110 SliceStartIdx += VF;
24111 continue;
24112 }
24113 // For very big VFs, check that we are not rebuilding the same
24114 // trees, just with a larger number of elements.
24115 if (VF > MaxRegVF && TreeSize > 1 &&
24116 all_of(RangeSizes.slice(SliceStartIdx, VF),
24117 std::bind(FirstSizeSame, TreeSize, _1))) {
24118 SliceStartIdx += VF;
24119 while (SliceStartIdx != MaxSliceEnd &&
24120 RangeSizes[SliceStartIdx].first == TreeSize)
24121 ++SliceStartIdx;
24122 continue;
24123 }
24124 if (TreeSize > 1) {
24125 for (std::pair<unsigned, unsigned> &P :
24126 RangeSizes.slice(SliceStartIdx, VF)) {
24127 if (VF >= MaxRegVF)
24128 P.second = std::max(P.second, TreeSize);
24129 else
24130 P.first = std::max(P.first, TreeSize);
24131 }
24132 }
24133 ++SliceStartIdx;
24134 AnyProfitableGraph = true;
24135 }
24136 if (FirstUnvecStore >= End)
24137 break;
24138 if (MaxSliceEnd - FirstUnvecStore < VF &&
24139 MaxSliceEnd - FirstUnvecStore >= MinVF)
24140 AnyProfitableGraph = true;
24141 FirstUnvecStore = std::distance(
24142 RangeSizes.begin(),
24143 find_if(RangeSizes.drop_front(MaxSliceEnd),
24144 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
24145 }
24146 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
24147 break;
24148 }
24149 // All values vectorized - exit.
24150 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
24151 return P.first == 0 && P.second == 0;
24152 }))
24153 break;
24154 // Check if tried all attempts or no need for the last attempts at all.
24155 if (Repeat >= MaxAttempts ||
24156 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24157 break;
24158 constexpr unsigned StoresLimit = 64;
24159 const unsigned MaxTotalNum = std::min<unsigned>(
24160 Operands.size(),
24161 static_cast<unsigned>(
24162 End -
24163 std::distance(
24164 RangeSizes.begin(),
24165 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
24166 1));
24167 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
24168 unsigned Limit =
24169 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
24170 CandidateVFs.clear();
24171 if (bit_floor(Limit) == VF)
24172 CandidateVFs.push_back(Limit);
24173 if (VF > MaxTotalNum || VF >= StoresLimit)
24174 break;
24175 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
24176 if (P.first != 0)
24177 P.first = std::max(P.second, P.first);
24178 }
24179 // Last attempt to vectorize max number of elements, if all previous
24180 // attempts were unsuccessful because of the cost issues.
24181 CandidateVFs.push_back(VF);
24182 }
24183 }
24184 };
24185
24186 /// Groups of stores to vectorize
24187 SmallVector<RelatedStoreInsts> SortedStores;
24188
24189 // Inserts the specified store SI with the given index Idx into the set of the
24190 // stores. If a store with the same distance was already recorded, stop the
24191 // insertion and try to vectorize the stores found so far. If some stores from
24192 // this sequence were not vectorized, try to vectorize them together with the
24193 // new store later. This logic is applied only to the stores that come before
24194 // the previous store with the same distance.
24195 // Example:
24196 // 1. store x, %p
24197 // 2. store y, %p+1
24198 // 3. store z, %p+2
24199 // 4. store a, %p
24200 // 5. store b, %p+3
24201 // - Scan this from the last to first store. The very first bunch of stores is
24202 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
24203 // vector).
24204 // - The next store in the list - #1 - has the same distance from store #5 as
24205 // the store #4.
24206 // - Try to vectorize sequence of stores 4,2,3,5.
24207 // - If all these stores are vectorized - just drop them.
24208 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
24209 // - Start new stores sequence.
24210 // The new bunch of stores is {1, {1, 0}}.
24211 // - Add the stores from previous sequence, that were not vectorized.
24212 // Here we consider the stores in reversed order relative to how they appear
24213 // in the IR (Stores are reversed already, see vectorizeStoreChains()).
24214 // Store #3 can be added -> comes after store #4 with the same distance as
24215 // store #1.
24216 // Store #5 cannot be added - comes before store #4.
24217 // This logic helps compile time: we assume that the stores after a previous
24218 // store with the same distance most likely have memory dependencies, so there
24219 // is no need to waste compile time trying to vectorize them.
24220 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
24221 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
24222 std::optional<int64_t> PtrDist;
24223 auto *RelatedStores = find_if(
24224 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
24225 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
24226 return PtrDist.has_value();
24227 });
24228
24229 // We did not find a comparable store, start a new group.
24230 if (RelatedStores == SortedStores.end()) {
24231 SortedStores.emplace_back(Idx, Stores);
24232 return;
24233 }
24234
24235 // If there is already a store in the group with the same PtrDiff, try to
24236 // vectorize the existing instructions before adding the current store.
24237 // Otherwise, insert this store and keep collecting.
24238 if (std::optional<unsigned> PrevInst =
24239 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
24240 TryToVectorize(RelatedStores->getStores());
24241 RelatedStores->clearVectorizedStores(VectorizedStores);
24242 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
24243 /*NewBaseInstIdx=*/Idx,
24244 /*DistFromCurBase=*/*PtrDist);
24245 }
24246 };
24247 Type *PrevValTy = nullptr;
24248 for (auto [I, SI] : enumerate(Stores)) {
24249 if (R.isDeleted(SI))
24250 continue;
24251 if (!PrevValTy)
24252 PrevValTy = SI->getValueOperand()->getType();
24253 // Check that we do not try to vectorize stores of different types.
24254 if (PrevValTy != SI->getValueOperand()->getType()) {
24255 for (RelatedStoreInsts &StoreSeq : SortedStores)
24256 TryToVectorize(StoreSeq.getStores());
24257 SortedStores.clear();
24258 PrevValTy = SI->getValueOperand()->getType();
24259 }
24260 FillStoresSet(I, SI);
24261 }
24262
24263 // Final vectorization attempt.
24264 for (RelatedStoreInsts &StoreSeq : SortedStores)
24265 TryToVectorize(StoreSeq.getStores());
24266
24267 return Changed;
24268}
24269
24270void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24271 // Initialize the collections. We will make a single pass over the block.
24272 Stores.clear();
24273 GEPs.clear();
24274
24275 // Visit the store and getelementptr instructions in BB and organize them in
24276 // Stores and GEPs according to the underlying objects of their pointer
24277 // operands.
24278 for (Instruction &I : *BB) {
24279 // Ignore store instructions that are volatile or have a pointer operand
24280 // that doesn't point to a scalar type.
24281 if (auto *SI = dyn_cast<StoreInst>(&I)) {
24282 if (!SI->isSimple())
24283 continue;
24284 if (!isValidElementType(SI->getValueOperand()->getType()))
24285 continue;
24286 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
24287 }
24288
24289 // Ignore getelementptr instructions that have more than one index, a
24290 // constant index, or a pointer operand that doesn't point to a scalar
24291 // type.
24292 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
24293 if (GEP->getNumIndices() != 1)
24294 continue;
24295 Value *Idx = GEP->idx_begin()->get();
24296 if (isa<Constant>(Idx))
24297 continue;
24298 if (!isValidElementType(Idx->getType()))
24299 continue;
24300 if (GEP->getType()->isVectorTy())
24301 continue;
24302 GEPs[GEP->getPointerOperand()].push_back(GEP);
24303 }
24304 }
24305}
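// For example, a simple `store i32 %v, ptr %p` seeds the Stores map under the
// underlying object of %p, while a `getelementptr i32, ptr %base, i64 %i`
// (single, non-constant index and a scalar result) seeds the GEPs map.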
24306
24307bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
24308 bool MaxVFOnly) {
24309 if (VL.size() < 2)
24310 return false;
24311
24312 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
24313 << VL.size() << ".\n");
24314
24315 // Check that all of the parts are instructions of the same type,
24316 // we permit an alternate opcode via InstructionsState.
24317 InstructionsState S = getSameOpcode(VL, *TLI);
24318 if (!S)
24319 return false;
24320
24321 Instruction *I0 = S.getMainOp();
24322 // Make sure invalid types (including vector type) are rejected before
24323 // determining vectorization factor for scalar instructions.
24324 for (Value *V : VL) {
24325 Type *Ty = V->getType();
24326 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
24327 // NOTE: the following will give user internal llvm type name, which may
24328 // not be useful.
24329 R.getORE()->emit([&]() {
24330 std::string TypeStr;
24331 llvm::raw_string_ostream OS(TypeStr);
24332 Ty->print(OS);
24333 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
24334 << "Cannot SLP vectorize list: type "
24335 << TypeStr + " is unsupported by vectorizer";
24336 });
24337 return false;
24338 }
24339 }
24340
24341 Type *ScalarTy = getValueType(VL[0]);
24342 unsigned Sz = R.getVectorElementSize(I0);
24343 unsigned MinVF = R.getMinVF(Sz);
24344 unsigned MaxVF = std::max<unsigned>(
24345 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
24346 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
24347 if (MaxVF < 2) {
24348 R.getORE()->emit([&]() {
24349 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
24350 << "Cannot SLP vectorize list: vectorization factor "
24351 << "less than 2 is not supported";
24352 });
24353 return false;
24354 }
24355
24356 bool Changed = false;
24357 bool CandidateFound = false;
24358 InstructionCost MinCost = SLPCostThreshold.getValue();
24359
24360 unsigned NextInst = 0, MaxInst = VL.size();
24361 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24362 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
24363 // No actual vectorization should happen, if number of parts is the same as
24364 // provided vectorization factor (i.e. the scalar type is used for vector
24365 // code during codegen).
24366 auto *VecTy = getWidenedType(ScalarTy, VF);
24367 if (TTI->getNumberOfParts(VecTy) == VF)
24368 continue;
24369 for (unsigned I = NextInst; I < MaxInst; ++I) {
24370 unsigned ActualVF = std::min(MaxInst - I, VF);
24371
24372 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
24373 continue;
24374
24375 if (MaxVFOnly && ActualVF < MaxVF)
24376 break;
24377 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24378 break;
24379
24380 SmallVector<Value *> Ops(ActualVF, nullptr);
24381 unsigned Idx = 0;
24382 for (Value *V : VL.drop_front(I)) {
24383 // Check that a previous iteration of this loop did not delete the
24384 // Value.
24385 if (auto *Inst = dyn_cast<Instruction>(V);
24386 !Inst || !R.isDeleted(Inst)) {
24387 Ops[Idx] = V;
24388 ++Idx;
24389 if (Idx == ActualVF)
24390 break;
24391 }
24392 }
24393 // Not enough vectorizable instructions - exit.
24394 if (Idx != ActualVF)
24395 break;
24396
24397 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
24398 << "\n");
24399
24400 R.buildTree(Ops);
24401 if (R.isTreeTinyAndNotFullyVectorizable())
24402 continue;
24403 if (R.isProfitableToReorder()) {
24404 R.reorderTopToBottom();
24405 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
24406 }
24407 R.transformNodes();
24408 R.computeMinimumValueSizes();
24409 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
24410 R.buildExternalUses();
24411
24412 InstructionCost Cost = R.getTreeCost(TreeCost);
24413 CandidateFound = true;
24414 MinCost = std::min(MinCost, Cost);
24415
24416 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24417 << " for VF=" << ActualVF << "\n");
24418 if (Cost < -SLPCostThreshold) {
24419 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
24420 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
24421 cast<Instruction>(Ops[0]))
24422 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
24423 << " and with tree size "
24424 << ore::NV("TreeSize", R.getTreeSize()));
24425
24426 R.vectorizeTree();
24427 // Move to the next bundle.
24428 I += VF - 1;
24429 NextInst = I + 1;
24430 Changed = true;
24431 }
24432 }
24433 }
24434
24435 if (!Changed && CandidateFound) {
24436 R.getORE()->emit([&]() {
24437 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
24438 << "List vectorization was possible but not beneficial with cost "
24439 << ore::NV("Cost", MinCost) << " >= "
24440 << ore::NV("Threshold", -SLPCostThreshold);
24441 });
24442 } else if (!Changed) {
24443 R.getORE()->emit([&]() {
24444 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
24445 << "Cannot SLP vectorize list: vectorization was impossible"
24446 << " with available vectorization factors";
24447 });
24448 }
24449 return Changed;
24450}
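// In short, tryToVectorizeList starts from the largest full-register VF, keeps
// shrinking it via getFloorFullVectorNumberOfElements, slides a window of
// ActualVF still-live values across VL, vectorizes any window whose cost beats
// -SLPCostThreshold, and otherwise reports a missed-optimization remark when a
// candidate was found but never profitable.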
24451
24452namespace {
24453
24454/// Model horizontal reductions.
24455///
24456/// A horizontal reduction is a tree of reduction instructions that has values
24457/// that can be put into a vector as its leaves. For example:
24458///
24459/// mul mul mul mul
24460/// \ / \ /
24461/// + +
24462/// \ /
24463/// +
24464/// This tree has "mul" as its leaf values and "+" as its reduction
24465/// instructions. A reduction can feed into a store or a binary operation
24466/// feeding a phi.
24467/// ...
24468/// \ /
24469/// +
24470/// |
24471/// phi +=
24472///
24473/// Or:
24474/// ...
24475/// \ /
24476/// +
24477/// |
24478/// *p =
24479///
24480class HorizontalReduction {
24481 using ReductionOpsType = SmallVector<Value *, 16>;
24482 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24483 ReductionOpsListType ReductionOps;
24484 /// List of possibly reduced values.
24485 SmallVector<SmallVector<Value *>> ReducedVals;
24486 /// Maps reduced value to the corresponding reduction operation.
24487 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24488 WeakTrackingVH ReductionRoot;
24489 /// The type of reduction operation.
24490 RecurKind RdxKind;
24491 /// Checks if the optimization of original scalar identity operations on
24492 /// matched horizontal reductions is enabled and allowed.
24493 bool IsSupportedHorRdxIdentityOp = false;
24494 /// The minimum number of the reduced values.
24495 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
24496 /// Contains vector values for reduction including their scale factor and
24497 /// signedness.
24498 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
24499
24500 static bool isCmpSelMinMax(Instruction *I) {
24501 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
24502 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
24503 }
24504
24505 // And/or are potentially poison-safe logical patterns like:
24506 // select x, y, false
24507 // select x, true, y
24508 static bool isBoolLogicOp(Instruction *I) {
24509 return isa<SelectInst>(I) &&
24510 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
24511 }
24512
24513 /// Checks if instruction is associative and can be vectorized.
24514 static bool isVectorizable(RecurKind Kind, Instruction *I,
24515 bool TwoElementReduction = false) {
24516 if (Kind == RecurKind::None)
24517 return false;
24518
24519 // Integer ops that map to select instructions or intrinsics are fine.
24520 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
24521 isBoolLogicOp(I))
24522 return true;
24523
24524 // No need to check for associativity, if 2 reduced values.
24525 if (TwoElementReduction)
24526 return true;
24527
24528 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24529 // FP min/max are associative except for NaN and -0.0. We do not
24530 // have to rule out -0.0 here because the intrinsic semantics do not
24531 // specify a fixed result for it.
24532 return I->getFastMathFlags().noNaNs();
24533 }
24534
24535 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24536 return true;
24537
24538 return I->isAssociative();
24539 }
24540
24541 static Value *getRdxOperand(Instruction *I, unsigned Index) {
24542 // Poison-safe 'or' takes the form: select X, true, Y
24543 // To make that work with the normal operand processing, we skip the
24544 // true value operand.
24545 // TODO: Change the code and data structures to handle this without a hack.
24546 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
24547 return I->getOperand(2);
24548 return I->getOperand(Index);
24549 }
24550
24551 /// Creates reduction operation with the current opcode.
24552 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
24553 Value *RHS, const Twine &Name, bool UseSelect) {
24554 Type *OpTy = LHS->getType();
24555 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
24556 switch (Kind) {
24557 case RecurKind::Or: {
24558 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24559 return Builder.CreateSelectWithUnknownProfile(
24560 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
24561 RHS, DEBUG_TYPE, Name);
24562 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24563 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24564 Name);
24565 }
24566 case RecurKind::And: {
24567 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24568 return Builder.CreateSelectWithUnknownProfile(
24569 LHS, RHS,
24570 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
24571 DEBUG_TYPE, Name);
24572 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24573 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24574 Name);
24575 }
24576 case RecurKind::Add:
24577 case RecurKind::Mul:
24578 case RecurKind::Xor:
24579 case RecurKind::FAdd:
24580 case RecurKind::FMul: {
24581 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24582 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24583 Name);
24584 }
24585 case RecurKind::SMax:
24586 case RecurKind::SMin:
24587 case RecurKind::UMax:
24588 case RecurKind::UMin:
24589 if (UseSelect) {
24590 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
24591 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
24592 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
24593 Name);
24594 }
24595 [[fallthrough]];
24596 case RecurKind::FMax:
24597 case RecurKind::FMin:
24598 case RecurKind::FMaximum:
24599 case RecurKind::FMinimum:
24600 case RecurKind::FMaximumNum:
24601 case RecurKind::FMinimumNum: {
24602 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
24603 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
24604 }
24605 default:
24606 llvm_unreachable("Unknown reduction operation.");
24607 }
24608 }
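// For example, for RecurKind::Or over i1 values with UseSelect set, the code
// above emits `select i1 %lhs, i1 true, i1 %rhs`, the poison-safe form of
// `or`, instead of a plain binary operator.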
24609
24610 /// Creates reduction operation with the current opcode with the IR flags
24611 /// from \p ReductionOps, dropping nuw/nsw flags.
24612 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
24613 Value *RHS, const Twine &Name,
24614 const ReductionOpsListType &ReductionOps) {
24615 bool UseSelect = ReductionOps.size() == 2 ||
24616 // Logical or/and.
24617 (ReductionOps.size() == 1 &&
24618 any_of(ReductionOps.front(), IsaPred<SelectInst>));
24619 assert((!UseSelect || ReductionOps.size() != 2 ||
24620 isa<SelectInst>(ReductionOps[1][0])) &&
24621 "Expected cmp + select pairs for reduction");
24622 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
24623 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
24624 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
24625 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
24626 /*IncludeWrapFlags=*/false);
24627 propagateIRFlags(Op, ReductionOps[1], nullptr,
24628 /*IncludeWrapFlags=*/false);
24629 return Op;
24630 }
24631 }
24632 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
24633 return Op;
24634 }
24635
24636public:
24637 static RecurKind getRdxKind(Value *V) {
24638 auto *I = dyn_cast<Instruction>(V);
24639 if (!I)
24640 return RecurKind::None;
24641 if (match(I, m_Add(m_Value(), m_Value())))
24642 return RecurKind::Add;
24643 if (match(I, m_Mul(m_Value(), m_Value())))
24644 return RecurKind::Mul;
24645 if (match(I, m_And(m_Value(), m_Value())) ||
24646 match(I, m_LogicalAnd(m_Value(), m_Value())))
24647 return RecurKind::And;
24648 if (match(I, m_Or(m_Value(), m_Value())) ||
24649 match(I, m_LogicalOr(m_Value(), m_Value())))
24650 return RecurKind::Or;
24651 if (match(I, m_Xor(m_Value(), m_Value())))
24652 return RecurKind::Xor;
24653 if (match(I, m_FAdd(m_Value(), m_Value())))
24654 return RecurKind::FAdd;
24655 if (match(I, m_FMul(m_Value(), m_Value())))
24656 return RecurKind::FMul;
24657
24658 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
24659 return RecurKind::FMax;
24660 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
24661 return RecurKind::FMin;
24662
24663 if (match(I, m_FMaximum(m_Value(), m_Value())))
24664 return RecurKind::FMaximum;
24665 if (match(I, m_FMinimum(m_Value(), m_Value())))
24666 return RecurKind::FMinimum;
24667 // This matches either cmp+select or intrinsics. SLP is expected to handle
24668 // either form.
24669 // TODO: If we are canonicalizing to intrinsics, we can remove several
24670 // special-case paths that deal with selects.
24671 if (match(I, m_SMax(m_Value(), m_Value())))
24672 return RecurKind::SMax;
24673 if (match(I, m_SMin(m_Value(), m_Value())))
24674 return RecurKind::SMin;
24675 if (match(I, m_UMax(m_Value(), m_Value())))
24676 return RecurKind::UMax;
24677 if (match(I, m_UMin(m_Value(), m_Value())))
24678 return RecurKind::UMin;
24679
24680 if (auto *Select = dyn_cast<SelectInst>(I)) {
24681 // Try harder: look for min/max pattern based on instructions producing
24682 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
24683 // During the intermediate stages of SLP, it's very common to have
24684 // pattern like this (since optimizeGatherSequence is run only once
24685 // at the end):
24686 // %1 = extractelement <2 x i32> %a, i32 0
24687 // %2 = extractelement <2 x i32> %a, i32 1
24688 // %cond = icmp sgt i32 %1, %2
24689 // %3 = extractelement <2 x i32> %a, i32 0
24690 // %4 = extractelement <2 x i32> %a, i32 1
24691 // %select = select i1 %cond, i32 %3, i32 %4
24692 CmpPredicate Pred;
24693 Instruction *L1;
24694 Instruction *L2;
24695
24696 Value *LHS = Select->getTrueValue();
24697 Value *RHS = Select->getFalseValue();
24698 Value *Cond = Select->getCondition();
24699
24700 // TODO: Support inverse predicates.
24701 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
24702 if (!isa<ExtractElementInst>(RHS) ||
24703 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24704 return RecurKind::None;
24705 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
24706 if (!isa<ExtractElementInst>(LHS) ||
24707 !L1->isIdenticalTo(cast<Instruction>(LHS)))
24708 return RecurKind::None;
24709 } else {
24710 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
24711 return RecurKind::None;
24712 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
24713 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
24714 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24715 return RecurKind::None;
24716 }
24717
24718 switch (Pred) {
24719 default:
24720 return RecurKind::None;
24721 case CmpInst::ICMP_SGT:
24722 case CmpInst::ICMP_SGE:
24723 return RecurKind::SMax;
24724 case CmpInst::ICMP_SLT:
24725 case CmpInst::ICMP_SLE:
24726 return RecurKind::SMin;
24727 case CmpInst::ICMP_UGT:
24728 case CmpInst::ICMP_UGE:
24729 return RecurKind::UMax;
24730 case CmpInst::ICMP_ULT:
24731 case CmpInst::ICMP_ULE:
24732 return RecurKind::UMin;
24733 }
24734 }
24735 return RecurKind::None;
24736 }
24737
24738 /// Get the index of the first operand.
24739 static unsigned getFirstOperandIndex(Instruction *I) {
24740 return isCmpSelMinMax(I) ? 1 : 0;
24741 }
24742
24743private:
24744 /// Total number of operands in the reduction operation.
24745 static unsigned getNumberOfOperands(Instruction *I) {
24746 return isCmpSelMinMax(I) ? 3 : 2;
24747 }
24748
24749 /// Checks if the instruction is in basic block \p BB.
24750 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
24751 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24752 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24753 auto *Sel = cast<SelectInst>(I);
24754 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
24755 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24756 }
24757 return I->getParent() == BB;
24758 }
24759
24760 /// Expected number of uses for reduction operations/reduced values.
24761 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24762 if (IsCmpSelMinMax) {
24763 // SelectInst must be used twice while the condition op must have single
24764 // use only.
24765 if (auto *Sel = dyn_cast<SelectInst>(I))
24766 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24767 return I->hasNUses(2);
24768 }
24769
24770 // Arithmetic reduction operation must be used once only.
24771 return I->hasOneUse();
24772 }
24773
24774 /// Initializes the list of reduction operations.
24775 void initReductionOps(Instruction *I) {
24776 if (isCmpSelMinMax(I))
24777 ReductionOps.assign(2, ReductionOpsType());
24778 else
24779 ReductionOps.assign(1, ReductionOpsType());
24780 }
24781
24782 /// Add all reduction operations for the reduction instruction \p I.
24783 void addReductionOps(Instruction *I) {
24784 if (isCmpSelMinMax(I)) {
24785 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
24786 ReductionOps[1].emplace_back(I);
24787 } else {
24788 ReductionOps[0].emplace_back(I);
24789 }
24790 }
24791
24792 static bool isGoodForReduction(ArrayRef<Value *> Data) {
24793 int Sz = Data.size();
24794 auto *I = dyn_cast<Instruction>(Data.front());
24795 return Sz > 1 || isConstant(Data.front()) ||
24796 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
24797 }
24798
24799public:
24800 HorizontalReduction() = default;
24801 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
24802 : ReductionRoot(I), ReductionLimit(2) {
24803 RdxKind = HorizontalReduction::getRdxKind(I);
24804 ReductionOps.emplace_back().push_back(I);
24805 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
24806 for (Value *V : Ops)
24807 ReducedValsToOps[V].push_back(I);
24808 }
24809
24810 bool matchReductionForOperands() const {
24811 // Analyze "regular" integer/FP types for reductions - no target-specific
24812 // types or pointers.
24813 assert(ReductionRoot && "Reduction root is not set!");
24814 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
24815 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24816 return Ops.size() == 2;
24817 })))
24818 return false;
24819
24820 return true;
24821 }
24822
24823 /// Try to find a reduction tree.
24824 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24825 ScalarEvolution &SE, const DataLayout &DL,
24826 const TargetLibraryInfo &TLI) {
24827 RdxKind = HorizontalReduction::getRdxKind(Root);
24828 if (!isVectorizable(RdxKind, Root))
24829 return false;
24830
24831 // Analyze "regular" integer/FP types for reductions - no target-specific
24832 // types or pointers.
24833 Type *Ty = Root->getType();
24834 if (!isValidElementType(Ty) || Ty->isPointerTy())
24835 return false;
24836
24837 // Though the ultimate reduction may have multiple uses, its condition must
24838 // have only a single use.
24839 if (auto *Sel = dyn_cast<SelectInst>(Root))
24840 if (!Sel->getCondition()->hasOneUse())
24841 return false;
24842
24843 ReductionRoot = Root;
24844
24845 // Iterate through all the operands of the possible reduction tree and
24846 // gather all the reduced values, sorting them by their value id.
24847 BasicBlock *BB = Root->getParent();
24848 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24849 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24850 1, std::make_pair(Root, 0));
24851 // Checks if the operands of the \p TreeN instruction are also reduction
24852 // operations or should be treated as reduced values or an extra argument,
24853 // which is not part of the reduction.
24854 auto CheckOperands = [&](Instruction *TreeN,
24855 SmallVectorImpl<Value *> &PossibleReducedVals,
24856 SmallVectorImpl<Instruction *> &ReductionOps,
24857 unsigned Level) {
24858 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24859 getNumberOfOperands(TreeN)))) {
24860 Value *EdgeVal = getRdxOperand(TreeN, I);
24861 ReducedValsToOps[EdgeVal].push_back(TreeN);
24862 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24863 // If the edge is not an instruction, or it is different from the main
24864 // reduction opcode or has too many uses - possible reduced value.
24865 // Also, do not try to reduce const values, if the operation is not
24866 // foldable.
24867 if (!EdgeInst || Level > RecursionMaxDepth ||
24868 getRdxKind(EdgeInst) != RdxKind ||
24869 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24870 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24871 !isVectorizable(RdxKind, EdgeInst) ||
24872 (R.isAnalyzedReductionRoot(EdgeInst) &&
24873 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24874 PossibleReducedVals.push_back(EdgeVal);
24875 continue;
24876 }
24877 ReductionOps.push_back(EdgeInst);
24878 }
24879 };
24880 // Try to regroup reduced values so that it gets more profitable to try to
24881 // reduce them. Values are grouped by their value ids, instructions - by
24882 // instruction op id and/or alternate op id, plus do extra analysis for
24883 // loads (grouping them by the distance between pointers) and cmp
24884 // instructions (grouping them by the predicate).
24885 SmallMapVector<
24886 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24887 8>
24888 PossibleReducedVals;
24889 initReductionOps(Root);
24890 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24891 SmallSet<size_t, 2> LoadKeyUsed;
24892
24893 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24894 Key = hash_combine(hash_value(LI->getParent()), Key);
24895 Value *Ptr =
24896 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
24897 if (!LoadKeyUsed.insert(Key).second) {
24898 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24899 if (LIt != LoadsMap.end()) {
24900 for (LoadInst *RLI : LIt->second) {
24901 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24902 LI->getType(), LI->getPointerOperand(), DL, SE,
24903 /*StrictCheck=*/true))
24904 return hash_value(RLI->getPointerOperand());
24905 }
24906 for (LoadInst *RLI : LIt->second) {
24907 if (arePointersCompatible(RLI->getPointerOperand(),
24908 LI->getPointerOperand(), TLI)) {
24909 hash_code SubKey = hash_value(RLI->getPointerOperand());
24910 return SubKey;
24911 }
24912 }
24913 if (LIt->second.size() > 2) {
24914 hash_code SubKey =
24915 hash_value(LIt->second.back()->getPointerOperand());
24916 return SubKey;
24917 }
24918 }
24919 }
24920 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24921 .first->second.push_back(LI);
24922 return hash_value(LI->getPointerOperand());
24923 };
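// Roughly: loads whose pointers can be proven adjacent (getPointersDiff) or
// otherwise compatible with an already-seen load share that load's sub-key,
// so they end up in the same group of candidate reduced values.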
24924
24925 while (!Worklist.empty()) {
24926 auto [TreeN, Level] = Worklist.pop_back_val();
24927 SmallVector<Value *> PossibleRedVals;
24928 SmallVector<Instruction *> PossibleReductionOps;
24929 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24930 addReductionOps(TreeN);
24931 // Add reduction values. The values are sorted for better vectorization
24932 // results.
24933 for (Value *V : PossibleRedVals) {
24934 size_t Key, Idx;
24935 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24936 /*AllowAlternate=*/false);
24937 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24938 }
24939 for (Instruction *I : reverse(PossibleReductionOps))
24940 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24941 }
24942 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24943 // Sort values by the total number of values of each kind to start the
24944 // reduction from the longest possible sequences of reduced values.
24945 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24946 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24947 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24948 for (auto &Slice : PossibleRedVals) {
24949 PossibleRedValsVect.emplace_back();
24950 auto RedValsVect = Slice.second.takeVector();
24951 stable_sort(RedValsVect, llvm::less_second());
24952 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24953 PossibleRedValsVect.back().append(Data.second, Data.first);
24954 }
24955 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24956 return P1.size() > P2.size();
24957 });
24958 bool First = true;
24959 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24960 if (First) {
24961 First = false;
24962 ReducedVals.emplace_back();
24963 } else if (!isGoodForReduction(Data)) {
24964 auto *LI = dyn_cast<LoadInst>(Data.front());
24965 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24966 if (!LI || !LastLI ||
24967 getUnderlyingObject(LI->getPointerOperand()) !=
24968 getUnderlyingObject(LastLI->getPointerOperand()))
24969 ReducedVals.emplace_back();
24970 }
24971 ReducedVals.back().append(Data.rbegin(), Data.rend());
24972 }
24973 }
24974 // Sort the reduced values by number of same/alternate opcode and/or pointer
24975 // operand.
24976 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24977 return P1.size() > P2.size();
24978 });
24979 return true;
24980 }
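// The net effect is that ReducedVals holds groups of compatible values
// (grouped by value id, opcode, load pointer base or compare predicate),
// longest groups first, which is the order tryToReduce consumes below.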
24981
24982 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24983 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24984 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24985 DominatorTree &DT) {
24986 constexpr unsigned RegMaxNumber = 4;
24987 constexpr unsigned RedValsMaxNumber = 128;
24988 // If there are a sufficient number of reduction values, reduce
24989 // to a nearby power-of-2. We can safely generate oversized
24990 // vectors and rely on the backend to split them to legal sizes.
24991 if (unsigned NumReducedVals = std::accumulate(
24992 ReducedVals.begin(), ReducedVals.end(), 0,
24993 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24994 if (!isGoodForReduction(Vals))
24995 return Num;
24996 return Num + Vals.size();
24997 });
24998 NumReducedVals < ReductionLimit &&
24999 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
25000 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
25001 })) {
25002 for (ReductionOpsType &RdxOps : ReductionOps)
25003 for (Value *RdxOp : RdxOps)
25004 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
25005 return nullptr;
25006 }
25007
25008 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
25009 TargetFolder(DL));
25010 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
25011
25012 // Track the reduced values in case they are replaced by extractelement
25013 // because of the vectorization.
25014 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
25015 ReducedVals.front().size());
25016
25017 // The compare instruction of a min/max is the insertion point for new
25018 // instructions and may be replaced with a new compare instruction.
25019 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
25020 assert(isa<SelectInst>(RdxRootInst) &&
25021 "Expected min/max reduction to have select root instruction");
25022 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
25023 assert(isa<Instruction>(ScalarCond) &&
25024 "Expected min/max reduction to have compare condition");
25025 return cast<Instruction>(ScalarCond);
25026 };
25027
25028 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
25029 return isBoolLogicOp(cast<Instruction>(V));
25030 });
25031 // Return new VectorizedTree, based on previous value.
25032 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
25033 if (VectorizedTree) {
25034 // Update the final value in the reduction.
25035 Builder.SetCurrentDebugLocation(
25036 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
25037 if (AnyBoolLogicOp) {
25038 auto It = ReducedValsToOps.find(VectorizedTree);
25039 auto It1 = ReducedValsToOps.find(Res);
25040 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
25041 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
25042 (It != ReducedValsToOps.end() &&
25043 any_of(It->getSecond(), [&](Instruction *I) {
25044 return isBoolLogicOp(I) &&
25045 getRdxOperand(I, 0) == VectorizedTree;
25046 }))) {
25047 ;
25048 } else if (isGuaranteedNotToBePoison(Res, AC) ||
25049 (It1 != ReducedValsToOps.end() &&
25050 any_of(It1->getSecond(), [&](Instruction *I) {
25051 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
25052 }))) {
25053 std::swap(VectorizedTree, Res);
25054 } else {
25055 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
25056 }
25057 }
25058
25059 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
25060 ReductionOps);
25061 }
25062 // Initialize the final value in the reduction.
25063 return Res;
25064 };
25065 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25066 ReductionOps.front().size());
25067 for (ReductionOpsType &RdxOps : ReductionOps)
25068 for (Value *RdxOp : RdxOps) {
25069 if (!RdxOp)
25070 continue;
25071 IgnoreList.insert(RdxOp);
25072 }
25073 // Intersect the fast-math-flags from all reduction operations.
25074 FastMathFlags RdxFMF;
25075 RdxFMF.set();
25076 for (Value *U : IgnoreList)
25077 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
25078 RdxFMF &= FPMO->getFastMathFlags();
25079 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
25080
25081 // Need to track reduced vals, they may be changed during vectorization of
25082 // subvectors.
25083 for (ArrayRef<Value *> Candidates : ReducedVals)
25084 for (Value *V : Candidates)
25085 TrackedVals.try_emplace(V, V);
25086
25087 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25088 Value *V) -> unsigned & {
25089 auto *It = MV.find(V);
25090 assert(It != MV.end() && "Unable to find given key.");
25091 return It->second;
25092 };
25093
25094 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
25095 // List of the values that were reduced in other trees as part of gather
25096 // nodes and thus requiring extract if fully vectorized in other trees.
25097 SmallPtrSet<Value *, 4> RequiredExtract;
25098 WeakTrackingVH VectorizedTree = nullptr;
25099 bool CheckForReusedReductionOps = false;
25100 // Try to vectorize elements based on their type.
25101 SmallVector<InstructionsState> States;
25102 SmallVector<SmallVector<Value *>> LocalReducedVals;
25103 // Try to merge consecutive reduced values into a single vectorizable group
25104 // and check if they can be vectorized as copyables.
25105 for (ArrayRef<Value *> RV : ReducedVals) {
25106 // Loads are not very compatible with undefs.
25107 if (isa<UndefValue>(RV.front()) &&
25108 (States.empty() || !States.back() ||
25109 States.back().getOpcode() == Instruction::Load)) {
25110 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25111 States.push_back(InstructionsState::invalid());
25112 continue;
25113 }
25114 if (!LocalReducedVals.empty() &&
25115 isa<UndefValue>(LocalReducedVals.back().front()) &&
25116 isa<LoadInst>(RV.front())) {
25117 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25118 States.push_back(getSameOpcode(RV, TLI));
25119 continue;
25120 }
25121 SmallVector<Value *> Ops;
25122 if (!LocalReducedVals.empty())
25123 Ops = LocalReducedVals.back();
25124 Ops.append(RV.begin(), RV.end());
25125 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
25126 InstructionsState OpS =
25127 Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
25128 if (LocalReducedVals.empty()) {
25129 LocalReducedVals.push_back(Ops);
25130 States.push_back(OpS);
25131 continue;
25132 }
25133 if (OpS) {
25134 LocalReducedVals.back().swap(Ops);
25135 States.back() = OpS;
25136 continue;
25137 }
25138 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25139 States.push_back(getSameOpcode(RV, TLI));
25140 }
25141 ReducedVals.swap(LocalReducedVals);
25142 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
25143 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
25144 InstructionsState S = States[I];
25145 SmallVector<Value *> Candidates;
25146 Candidates.reserve(2 * OrigReducedVals.size());
25147 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
25148 for (Value *ReducedVal : OrigReducedVals) {
25149 Value *RdxVal = TrackedVals.at(ReducedVal);
25150 // Check if the reduction value was not overridden by the extractelement
25151 // instruction because of the vectorization and exclude it, if it is not
25152 // compatible with other values.
25153 // Also check if the instruction was folded to constant/other value.
25154 auto *Inst = dyn_cast<Instruction>(RdxVal);
25155 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
25156 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
25157 !S.isCopyableElement(Inst)))) ||
25158 (S && !Inst && !isa<PoisonValue>(RdxVal) &&
25159 !S.isCopyableElement(RdxVal)))
25160 continue;
25161 Candidates.push_back(RdxVal);
25162 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
25163 }
25164 bool ShuffledExtracts = false;
25165 // Try to handle shuffled extractelements.
25166 if (S && S.getOpcode() == Instruction::ExtractElement &&
25167 !S.isAltShuffle() && I + 1 < E) {
25168 SmallVector<Value *> CommonCandidates(Candidates);
25169 for (Value *RV : ReducedVals[I + 1]) {
25170 Value *RdxVal = TrackedVals.at(RV);
25171 // Check if the reduction value was not overridden by the
25172 // extractelement instruction because of the vectorization and
25173 // exclude it, if it is not compatible with other values.
25174 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
25175 if (!Inst)
25176 continue;
25177 CommonCandidates.push_back(RdxVal);
25178 TrackedToOrig.try_emplace(RdxVal, RV);
25179 }
25180 SmallVector<int> Mask;
25181 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
25182 ++I;
25183 Candidates.swap(CommonCandidates);
25184 ShuffledExtracts = true;
25185 }
25186 }
25187
25188 // Emit code for constant values.
25189 if (Candidates.size() > 1 && allConstant(Candidates)) {
25190 Value *Res = Candidates.front();
25191 Value *OrigV = TrackedToOrig.at(Candidates.front());
25192 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25193 for (Value *VC : ArrayRef(Candidates).drop_front()) {
25194 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
25195 Value *OrigV = TrackedToOrig.at(VC);
25196 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25197 if (auto *ResI = dyn_cast<Instruction>(Res))
25198 V.analyzedReductionRoot(ResI);
25199 }
25200 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25201 continue;
25202 }
25203
25204 unsigned NumReducedVals = Candidates.size();
25205 if (NumReducedVals < ReductionLimit &&
25206 (NumReducedVals < 2 || !isSplat(Candidates)))
25207 continue;
25208
25209 // Check if we support repeated scalar values processing (optimization of
25210 // original scalar identity operations on matched horizontal reductions).
25211 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25212 RdxKind != RecurKind::FMul &&
25213 RdxKind != RecurKind::FMulAdd;
25214 // Gather same values.
25215 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25216 if (IsSupportedHorRdxIdentityOp)
25217 for (Value *V : Candidates) {
25218 Value *OrigV = TrackedToOrig.at(V);
25219 ++SameValuesCounter.try_emplace(OrigV).first->second;
25220 }
25221 // Used to check if the reduced values used same number of times. In this
25222 // case the compiler may produce better code. E.g. if reduced values are
25223 // aabbccdd (8 x values), then the first node of the tree will have a node
25224 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
25225 // Plus, the final reduction will be performed on <8 x aabbccdd>.
25226 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
25227 // x abcd) * 2.
25228 // Currently it only handles add/fadd/xor. and/or/min/max do not require
25229 // this analysis, other operations may require an extra estimation of
25230 // the profitability.
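// Illustration (derived from the comment above, add reduction assumed): for
// aabbccdd, reduce.add(<8 x aabbccdd>) == 2 * reduce.add(<4 x abcd>), so a
// 4-wide tree plus one scalar multiply can replace the 8-wide tree whenever
// every value repeats the same number of times.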
25231 bool SameScaleFactor = false;
25232 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25233 SameValuesCounter.size() != Candidates.size();
25234 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
25235 if (OptReusedScalars) {
25236 SameScaleFactor =
25237 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25238 RdxKind == RecurKind::Xor) &&
25239 all_of(drop_begin(SameValuesCounter),
25240 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
25241 return P.second == SameValuesCounter.front().second;
25242 });
25243 Candidates.resize(SameValuesCounter.size());
25244 transform(SameValuesCounter, Candidates.begin(),
25245 [&](const auto &P) { return TrackedVals.at(P.first); });
25246 NumReducedVals = Candidates.size();
25247 // Have a reduction of the same element.
25248 if (NumReducedVals == 1) {
25249 Value *OrigV = TrackedToOrig.at(Candidates.front());
25250 unsigned Cnt = At(SameValuesCounter, OrigV);
25251 Value *RedVal =
25252 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
25253 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25254 VectorizedVals.try_emplace(OrigV, Cnt);
25255 ExternallyUsedValues.insert(OrigV);
25256 continue;
25257 }
25258 }
25259
25260 unsigned MaxVecRegSize = V.getMaxVecRegSize();
25261 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
25262 const unsigned MaxElts = std::clamp<unsigned>(
25263 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
25264 RegMaxNumber * RedValsMaxNumber);
25265
25266 unsigned ReduxWidth = NumReducedVals;
25267 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
25268 unsigned NumParts, NumRegs;
25269 Type *ScalarTy = Candidates.front()->getType();
25270 ReduxWidth =
25271 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
25272 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25273 NumParts = ::getNumberOfParts(TTI, Tp);
25274 NumRegs =
25275 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25276 while (NumParts > NumRegs) {
25277 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
25278 ReduxWidth = bit_floor(ReduxWidth - 1);
25279 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25280 NumParts = ::getNumberOfParts(TTI, Tp);
25281 NumRegs =
25282 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25283 }
25284 if (NumParts > NumRegs / 2)
25285 ReduxWidth = bit_floor(ReduxWidth);
25286 return ReduxWidth;
25287 };
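// Note (illustrative, numbers are assumed, not from the source): GetVectorFactor
// keeps shrinking the requested width until the widened type no longer needs
// more register parts than the target provides. E.g., if an assumed target
// splits the widened type for ReduxWidth == 32 into 8 parts but has only 4
// suitable registers, the width is lowered to 16, then 8, until it fits.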
25288 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
25289 ReduxWidth = GetVectorFactor(ReduxWidth);
25290 ReduxWidth = std::min(ReduxWidth, MaxElts);
25291
25292 unsigned Start = 0;
25293 unsigned Pos = Start;
25294 // Restarts vectorization attempt with lower vector factor.
25295 unsigned PrevReduxWidth = ReduxWidth;
25296 bool CheckForReusedReductionOpsLocal = false;
25297 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
25298 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
25299 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25300 // Check if any of the reduction ops are gathered. If so, it is worth
25301 // trying again with a smaller number of reduction ops.
25302 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25303 }
25304 ++Pos;
25305 if (Pos < NumReducedVals - ReduxWidth + 1)
25306 return IsAnyRedOpGathered;
25307 Pos = Start;
25308 --ReduxWidth;
25309 if (ReduxWidth > 1)
25310 ReduxWidth = GetVectorFactor(ReduxWidth);
25311 return IsAnyRedOpGathered;
25312 };
25313 bool AnyVectorized = false;
25314 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25315 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25316 ReduxWidth >= ReductionLimit) {
25317 // Dependency in tree of the reduction ops - drop this attempt, try
25318 // later.
25319 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25320 Start == 0) {
25321 CheckForReusedReductionOps = true;
25322 break;
25323 }
25324 PrevReduxWidth = ReduxWidth;
25325 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
25326 // Been analyzed already - skip.
25327 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
25328 (!has_single_bit(ReduxWidth) &&
25329 (IgnoredCandidates.contains(
25330 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
25331 IgnoredCandidates.contains(
25332 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
25333 bit_floor(ReduxWidth))))) ||
25334 V.areAnalyzedReductionVals(VL)) {
25335 (void)AdjustReducedVals(/*IgnoreVL=*/true);
25336 continue;
25337 }
25338 // Early exit if any of the reduction values were deleted during
25339 // previous vectorization attempts.
25340 if (any_of(VL, [&V](Value *RedVal) {
25341 auto *RedValI = dyn_cast<Instruction>(RedVal);
25342 return RedValI && V.isDeleted(RedValI);
25343 }))
25344 break;
25345 V.buildTree(VL, IgnoreList);
25346 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
25347 if (!AdjustReducedVals())
25348 V.analyzedReductionVals(VL);
25349 continue;
25350 }
25351 if (V.isLoadCombineReductionCandidate(RdxKind)) {
25352 if (!AdjustReducedVals())
25353 V.analyzedReductionVals(VL);
25354 continue;
25355 }
25356 V.reorderTopToBottom();
25357 // No need to reorder the root node at all for reassociative reduction.
25358 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
25359 VL.front()->getType()->isIntOrIntVectorTy() ||
25360 ReductionLimit > 2);
25361 // Keep extracted other reduction values, if they are used in the
25362 // vectorization trees.
25363 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
25364 ExternallyUsedValues);
25365 // The reduction root is used as the insertion point for new
25366 // instructions, so set it as externally used to prevent it from being
25367 // deleted.
25368 LocalExternallyUsedValues.insert(ReductionRoot);
25369 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
25370 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
25371 continue;
25372 for (Value *V : ReducedVals[Cnt])
25373 if (isa<Instruction>(V))
25374 LocalExternallyUsedValues.insert(TrackedVals[V]);
25375 }
25376 if (!IsSupportedHorRdxIdentityOp) {
25377 // Number of uses of the candidates in the vector of values.
25378 assert(SameValuesCounter.empty() &&
25379 "Reused values counter map is not empty");
25380 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25381 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25382 continue;
25383 Value *V = Candidates[Cnt];
25384 Value *OrigV = TrackedToOrig.at(V);
25385 ++SameValuesCounter.try_emplace(OrigV).first->second;
25386 }
25387 }
25388 V.transformNodes();
25389 V.computeMinimumValueSizes();
25390 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
25391
25392 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
25393 // Gather externally used values.
25394 SmallPtrSet<Value *, 4> Visited;
25395 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25396 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25397 continue;
25398 Value *RdxVal = Candidates[Cnt];
25399 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
25400 RdxVal = It->second;
25401 if (!Visited.insert(RdxVal).second)
25402 continue;
25403 // Check if the scalar was vectorized as part of the vectorization
25404 // tree but not the top node.
25405 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
25406 LocalExternallyUsedValues.insert(RdxVal);
25407 continue;
25408 }
25409 Value *OrigV = TrackedToOrig.at(RdxVal);
25410 unsigned NumOps =
25411 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
25412 if (NumOps != ReducedValsToOps.at(OrigV).size())
25413 LocalExternallyUsedValues.insert(RdxVal);
25414 }
25415 // Do not need the list of reused scalars in regular mode anymore.
25416 if (!IsSupportedHorRdxIdentityOp)
25417 SameValuesCounter.clear();
25418 for (Value *RdxVal : VL)
25419 if (RequiredExtract.contains(RdxVal))
25420 LocalExternallyUsedValues.insert(RdxVal);
25421 V.buildExternalUses(LocalExternallyUsedValues);
25422
25423 // Estimate cost.
25424 InstructionCost ReductionCost =
25425 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
25426 InstructionCost Cost = V.getTreeCost(TreeCost, VL, ReductionCost);
25427 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
25428 << " for reduction\n");
25429 if (!Cost.isValid())
25430 break;
25431 if (Cost >= -SLPCostThreshold) {
25432 V.getORE()->emit([&]() {
25433 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
25434 ReducedValsToOps.at(VL[0]).front())
25435 << "Vectorizing horizontal reduction is possible "
25436 << "but not beneficial with cost " << ore::NV("Cost", Cost)
25437 << " and threshold "
25438 << ore::NV("Threshold", -SLPCostThreshold);
25439 });
25440 if (!AdjustReducedVals()) {
25441 V.analyzedReductionVals(VL);
25442 unsigned Offset = Pos == Start ? Pos : Pos - 1;
25443 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
25444 // Add subvectors of VL to the list of the analyzed values.
25445 for (unsigned VF = getFloorFullVectorNumberOfElements(
25446 *TTI, VL.front()->getType(), ReduxWidth - 1);
25447 VF >= ReductionLimit;
25448 VF = getFloorFullVectorNumberOfElements(
25449 *TTI, VL.front()->getType(), VF - 1)) {
25450 if (has_single_bit(VF) &&
25451 V.getCanonicalGraphSize() != V.getTreeSize())
25452 continue;
25453 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
25454 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
25455 }
25456 }
25457 }
25458 continue;
25459 }
25460
25461 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
25462 << Cost << ". (HorRdx)\n");
25463 V.getORE()->emit([&]() {
25464 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
25465 ReducedValsToOps.at(VL[0]).front())
25466 << "Vectorized horizontal reduction with cost "
25467 << ore::NV("Cost", Cost) << " and with tree size "
25468 << ore::NV("TreeSize", V.getTreeSize());
25469 });
25470
25471 Builder.setFastMathFlags(RdxFMF);
25472
25473 // Emit a reduction. If the root is a select (min/max idiom), the insert
25474 // point is the compare condition of that select.
25475 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
25476 Instruction *InsertPt = RdxRootInst;
25477 if (IsCmpSelMinMax)
25478 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25479
25480 // Vectorize a tree.
25481 Value *VectorizedRoot = V.vectorizeTree(
25482 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
25483 // Update TrackedToOrig mapping, since the tracked values might be
25484 // updated.
25485 for (Value *RdxVal : Candidates) {
25486 Value *OrigVal = TrackedToOrig.at(RdxVal);
25487 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
25488 if (TransformedRdxVal != RdxVal)
25489 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
25490 }
25491
25492 Builder.SetInsertPoint(InsertPt);
25493
25494 // To prevent poison from leaking across what used to be sequential,
25495 // safe, scalar boolean logic operations, the reduction operand must be
25496 // frozen.
25497 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
25498 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
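// E.g. (illustrative IR, not from the source): scalar 'select i1 %a, i1 %b,
// i1 false' never lets poison in %b escape while %a is false, but the
// vectorized 'and' has no such guard, so the vector operand is frozen.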
25499
25500 // Emit code to correctly handle reused reduced values, if required.
25501 if (OptReusedScalars && !SameScaleFactor) {
25502 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
25503 SameValuesCounter, TrackedToOrig);
25504 }
25505
25506 Type *ScalarTy = VL.front()->getType();
25507 Type *VecTy = VectorizedRoot->getType();
25508 Type *RedScalarTy = VecTy->getScalarType();
25509 VectorValuesAndScales.emplace_back(
25510 VectorizedRoot,
25511 OptReusedScalars && SameScaleFactor
25512 ? SameValuesCounter.front().second
25513 : 1,
25514 RedScalarTy != ScalarTy->getScalarType()
25515 ? V.isSignedMinBitwidthRootNode()
25516 : true);
25517
25518 // Count vectorized reduced values to exclude them from final reduction.
25519 for (Value *RdxVal : VL) {
25520 Value *OrigV = TrackedToOrig.at(RdxVal);
25521 if (IsSupportedHorRdxIdentityOp) {
25522 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
25523 continue;
25524 }
25525 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25526 if (!V.isVectorized(RdxVal))
25527 RequiredExtract.insert(RdxVal);
25528 }
25529 Pos += ReduxWidth;
25530 Start = Pos;
25531 ReduxWidth = NumReducedVals - Pos;
25532 if (ReduxWidth > 1)
25533 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25534 AnyVectorized = true;
25535 }
25536 if (OptReusedScalars && !AnyVectorized) {
25537 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
25538 Value *RdxVal = TrackedVals.at(P.first);
25539 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
25540 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25541 VectorizedVals.try_emplace(P.first, P.second);
25542 }
25543 continue;
25544 }
25545 }
25546 if (!VectorValuesAndScales.empty())
25547 VectorizedTree = GetNewVectorizedTree(
25548 VectorizedTree,
25549 emitReduction(Builder, *TTI, ReductionRoot->getType()));
25550
25551 if (!VectorizedTree) {
25552 if (!CheckForReusedReductionOps) {
25553 for (ReductionOpsType &RdxOps : ReductionOps)
25554 for (Value *RdxOp : RdxOps)
25555 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
25556 }
25557 return nullptr;
25558 }
25559
25560 // Reorder operands of bool logical op in the natural order to avoid
25561 // possible problem with poison propagation. If not possible to reorder
25562 // (both operands are originally RHS), emit an extra freeze instruction
25563 // for the LHS operand.
25564 // I.e., if we have original code like this:
25565 // RedOp1 = select i1 ?, i1 LHS, i1 false
25566 // RedOp2 = select i1 RHS, i1 ?, i1 false
25567
25568 // Then, we swap LHS/RHS to create a new op that matches the poison
25569 // semantics of the original code.
25570
25571 // If we have original code like this and both values could be poison:
25572 // RedOp1 = select i1 ?, i1 LHS, i1 false
25573 // RedOp2 = select i1 ?, i1 RHS, i1 false
25574
25575 // Then, we must freeze LHS in the new op.
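// In other words: an operand that already sat in the unguarded condition
// position of its original select was never shielded from poison there, so it
// may take the condition position of the new op; an operand that was only
// ever in the guarded position must be swapped back into it or be frozen.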
25576 auto FixBoolLogicalOps =
25577 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
25578 Instruction *RedOp2, bool InitStep) {
25579 if (!AnyBoolLogicOp)
25580 return;
25581 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
25582 getRdxOperand(RedOp1, 0) == LHS ||
25583 isGuaranteedNotToBePoison(LHS, AC)))
25584 return;
25585 bool NeedFreeze = LHS != VectorizedTree;
25586 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
25587 getRdxOperand(RedOp2, 0) == RHS ||
25588 isGuaranteedNotToBePoison(RHS, AC))) {
25589 // If RedOp2 was used as a second operand - do not swap.
25590 if ((InitStep || RHS != VectorizedTree) &&
25591 getRdxOperand(RedOp2, 0) == RHS &&
25592 ((isBoolLogicOp(RedOp1) &&
25593 getRdxOperand(RedOp1, 1) == RedOp2) ||
25594 any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
25595 return any_of(Ops, [&](Value *Op) {
25596 auto *OpI = dyn_cast<Instruction>(Op);
25597 return OpI && isBoolLogicOp(OpI) &&
25598 getRdxOperand(OpI, 1) == RedOp2;
25599 });
25600 }))) {
25601 NeedFreeze = false;
25602 } else {
25603 std::swap(LHS, RHS);
25604 return;
25605 }
25606 }
25607 if (NeedFreeze)
25608 LHS = Builder.CreateFreeze(LHS);
25609 };
25610 // Finish the reduction.
25611 // Need to add extra arguments and not vectorized possible reduction values.
25612 // Try to avoid dependencies between the scalar remainders after reductions.
25613 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
25614 bool InitStep) {
25615 unsigned Sz = InstVals.size();
25616 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
25617 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
25618 Instruction *RedOp = InstVals[I + 1].first;
25619 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
25620 Value *RdxVal1 = InstVals[I].second;
25621 Value *StableRdxVal1 = RdxVal1;
25622 auto It1 = TrackedVals.find(RdxVal1);
25623 if (It1 != TrackedVals.end())
25624 StableRdxVal1 = It1->second;
25625 Value *RdxVal2 = InstVals[I + 1].second;
25626 Value *StableRdxVal2 = RdxVal2;
25627 auto It2 = TrackedVals.find(RdxVal2);
25628 if (It2 != TrackedVals.end())
25629 StableRdxVal2 = It2->second;
25630 // To prevent poison from leaking across what used to be sequential,
25631 // safe, scalar boolean logic operations, the reduction operand must be
25632 // frozen.
25633 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
25634 RedOp, InitStep);
25635 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
25636 StableRdxVal2, "op.rdx", ReductionOps);
25637 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
25638 }
25639 if (Sz % 2 == 1)
25640 ExtraReds[Sz / 2] = InstVals.back();
25641 return ExtraReds;
25642 };
25643 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
25644 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
25645 VectorizedTree);
25646 SmallPtrSet<Value *, 8> Visited;
25647 for (ArrayRef<Value *> Candidates : ReducedVals) {
25648 for (Value *RdxVal : Candidates) {
25649 if (!Visited.insert(RdxVal).second)
25650 continue;
25651 unsigned NumOps = VectorizedVals.lookup(RdxVal);
25652 for (Instruction *RedOp :
25653 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
25654 ExtraReductions.emplace_back(RedOp, RdxVal);
25655 }
25656 }
25657 // Iterate through all not-vectorized reduction values/extra arguments.
25658 bool InitStep = true;
25659 while (ExtraReductions.size() > 1) {
25660 SmallVector<std::pair<Instruction *, Value *>> NewReds =
25661 FinalGen(ExtraReductions, InitStep);
25662 ExtraReductions.swap(NewReds);
25663 InitStep = false;
25664 }
25665 VectorizedTree = ExtraReductions.front().second;
25666
25667 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25668
25669 // The original scalar reduction is expected to have no remaining
25670 // uses outside the reduction tree itself. Assert that we got this
25671 // correct, replace internal uses with undef, and mark for eventual
25672 // deletion.
25673#ifndef NDEBUG
25674 SmallPtrSet<Value *, 4> IgnoreSet;
25675 for (ArrayRef<Value *> RdxOps : ReductionOps)
25676 IgnoreSet.insert_range(RdxOps);
25677#endif
25678 for (ArrayRef<Value *> RdxOps : ReductionOps) {
25679 for (Value *Ignore : RdxOps) {
25680 if (!Ignore)
25681 continue;
25682#ifndef NDEBUG
25683 for (auto *U : Ignore->users()) {
25684 assert(IgnoreSet.count(U) &&
25685 "All users must be either in the reduction ops list.");
25686 }
25687#endif
25688 if (!Ignore->use_empty()) {
25689 Value *P = PoisonValue::get(Ignore->getType());
25690 Ignore->replaceAllUsesWith(P);
25691 }
25692 }
25693 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25694 }
25695 return VectorizedTree;
25696 }
25697
25698private:
25699 /// Creates the reduction from the given \p Vec vector value with the given
25700 /// scale \p Scale and signedness \p IsSigned.
25701 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25702 Value *Vec, unsigned Scale, bool IsSigned,
25703 Type *DestTy) {
25704 Value *Rdx;
25705 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
25706 unsigned DestTyNumElements = getNumElements(VecTy);
25707 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
25708 Rdx = PoisonValue::get(
25709 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
25710 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
25711 // Do reduction for each lane.
25712 // e.g., do reduce add for
25713 // VL[0] = <4 x Ty> <a, b, c, d>
25714 // VL[1] = <4 x Ty> <e, f, g, h>
25715 // Lane[0] = <2 x Ty> <a, e>
25716 // Lane[1] = <2 x Ty> <b, f>
25717 // Lane[2] = <2 x Ty> <c, g>
25718 // Lane[3] = <2 x Ty> <d, h>
25719 // result[0] = reduce add Lane[0]
25720 // result[1] = reduce add Lane[1]
25721 // result[2] = reduce add Lane[2]
25722 // result[3] = reduce add Lane[3]
25723 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
25724 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
25725 Rdx = Builder.CreateInsertElement(
25726 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
25727 }
25728 } else {
25729 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
25730 }
25731 if (Rdx->getType() != DestTy)
25732 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
25733 // Improved analysis for add/fadd/xor reductions with same scale
25734 // factor for all operands of reductions. We can emit scalar ops for
25735 // them instead.
25736 if (Scale > 1)
25737 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25738 return Rdx;
25739 }
25740
25741 /// Calculate the cost of a reduction.
25742 InstructionCost getReductionCost(TargetTransformInfo *TTI,
25743 ArrayRef<Value *> ReducedVals,
25744 bool IsCmpSelMinMax, FastMathFlags FMF,
25745 const BoUpSLP &R, DominatorTree &DT,
25746 const DataLayout &DL,
25747 const TargetLibraryInfo &TLI) {
25748 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25749 Type *ScalarTy = ReducedVals.front()->getType();
25750 unsigned ReduxWidth = ReducedVals.size();
25751 FixedVectorType *VectorTy = R.getReductionType();
25752 InstructionCost VectorCost = 0, ScalarCost;
25753 // If all of the reduced values are constant, the vector cost is 0, since
25754 // the reduction value can be calculated at the compile time.
25755 bool AllConsts = allConstant(ReducedVals);
25756 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
25757 InstructionCost Cost = 0;
25758 // Scalar cost is repeated for N-1 elements.
25759 int Cnt = ReducedVals.size();
25760 for (Value *RdxVal : ReducedVals) {
25761 if (!isa<Instruction>(RdxVal))
25762 continue;
25763 if (Cnt == 1)
25764 break;
25765 --Cnt;
25766 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
25767 Cost += GenCostFn();
25768 continue;
25769 }
25770 InstructionCost ScalarCost = 0;
25771 for (User *U : RdxVal->users()) {
25772 auto *RdxOp = cast<Instruction>(U);
25773 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25774 if (RdxKind == RecurKind::FAdd) {
25775 InstructionCost FMACost = canConvertToFMA(
25776 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
25777 if (FMACost.isValid()) {
25778 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
25779 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
25780 // Also, exclude scalar fmul cost.
25781 InstructionCost FMulCost =
25782 TTI->getInstructionCost(I, CostKind);
25783 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
25784 FMACost -= FMulCost;
25785 }
25786 ScalarCost += FMACost;
25787 continue;
25788 }
25789 }
25790 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
25791 continue;
25792 }
25793 ScalarCost = InstructionCost::getInvalid();
25794 break;
25795 }
25796 if (ScalarCost.isValid())
25797 Cost += ScalarCost;
25798 else
25799 Cost += GenCostFn();
25800 }
25801 return Cost;
25802 };
25803 // Require reduction cost if:
25804 // 1. This type is not a full register type and there is no other vector
25805 // with the same type in the storage (first vector with a small type).
25806 // 2. The storage does not yet have any vector with full vector use
25807 // (first vector with full register use).
25808 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
25809 switch (RdxKind) {
25810 case RecurKind::Add:
25811 case RecurKind::Mul:
25812 case RecurKind::Or:
25813 case RecurKind::And:
25814 case RecurKind::Xor:
25815 case RecurKind::FAdd:
25816 case RecurKind::FMul: {
25817 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
25818 if (!AllConsts) {
25819 if (DoesRequireReductionOp) {
25820 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
25821 assert(SLPReVec && "FixedVectorType is not expected.");
25822 unsigned ScalarTyNumElements = VecTy->getNumElements();
25823 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
25824 VectorCost += TTI->getShuffleCost(
25825 TTI::SK_PermuteSingleSrc,
25826 getWidenedType(VecTy->getScalarType(),
25827 ReducedVals.size()),
25828 VectorTy,
25829 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
25830 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
25831 FMF, CostKind);
25832 }
25833 VectorCost += TTI->getScalarizationOverhead(
25834 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
25835 /*Extract*/ false, TTI::TCK_RecipThroughput);
25836 } else {
25837 Type *RedTy = VectorTy->getElementType();
25838 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25839 std::make_pair(RedTy, true));
25840 if (RType == RedTy) {
25841 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
25842 FMF, CostKind);
25843 } else {
25844 VectorCost = TTI->getExtendedReductionCost(
25845 RdxOpcode, !IsSigned, RedTy,
25846 getWidenedType(RType, ReduxWidth), FMF, CostKind);
25847 }
25848 }
25849 } else {
25850 Type *RedTy = VectorTy->getElementType();
25851 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25852 std::make_pair(RedTy, true));
25853 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25854 InstructionCost FMACost = InstructionCost::getInvalid();
25855 if (RdxKind == RecurKind::FAdd) {
25856 // Check if the reduction operands can be converted to FMA.
25857 SmallVector<Value *> Ops;
25858 FastMathFlags FMF;
25859 FMF.set();
25860 for (Value *RdxVal : ReducedVals) {
25861 if (!RdxVal->hasOneUse()) {
25862 Ops.clear();
25863 break;
25864 }
25865 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
25866 FMF &= FPCI->getFastMathFlags();
25867 Ops.push_back(RdxVal->user_back());
25868 }
25869 if (!Ops.empty()) {
25870 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
25871 *TTI, TLI);
25872 if (FMACost.isValid()) {
25873 // Calculate actual FMAD cost.
25874 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25875 {RVecTy, RVecTy, RVecTy}, FMF);
25876 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
25877
25878 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
25879 // Also, exclude vector fmul cost.
25880 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
25881 Instruction::FMul, RVecTy, CostKind);
25882 LLVM_DEBUG(dbgs()
25883 << "Minus vector FMul cost: " << FMulCost << "\n");
25884 FMACost -= FMulCost;
25885 }
25886 }
25887 }
25888 if (FMACost.isValid())
25889 VectorCost += FMACost;
25890 else
25891 VectorCost +=
25892 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
25893 if (RType != RedTy) {
25894 unsigned Opcode = Instruction::Trunc;
25895 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25896 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25897 VectorCost += TTI->getCastInstrCost(
25898 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25899 }
25900 }
25901 }
25902 ScalarCost = EvaluateScalarCost([&]() {
25903 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
25904 });
25905 break;
25906 }
25907 case RecurKind::FMax:
25908 case RecurKind::FMin:
25909 case RecurKind::FMaximum:
25910 case RecurKind::FMinimum:
25911 case RecurKind::SMax:
25912 case RecurKind::SMin:
25913 case RecurKind::UMax:
25914 case RecurKind::UMin: {
25915 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
25916 if (!AllConsts) {
25917 if (DoesRequireReductionOp) {
25918 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
25919 } else {
25920 // Check if the previous reduction already exists and account it as
25921 // series of operations + single reduction.
25922 Type *RedTy = VectorTy->getElementType();
25923 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25924 std::make_pair(RedTy, true));
25925 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25926 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25927 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
25928 if (RType != RedTy) {
25929 unsigned Opcode = Instruction::Trunc;
25930 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25931 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25932 VectorCost += TTI->getCastInstrCost(
25933 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25934 }
25935 }
25936 }
25937 ScalarCost = EvaluateScalarCost([&]() {
25938 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25939 return TTI->getIntrinsicInstrCost(ICA, CostKind);
25940 });
25941 break;
25942 }
25943 default:
25944 llvm_unreachable("Expected arithmetic or min/max reduction operation");
25945 }
25946
25947 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25948 << " for reduction of " << shortBundleName(ReducedVals)
25949 << " (It is a splitting reduction)\n");
25950 return VectorCost - ScalarCost;
25951 }
25952
25953 /// Splits the values, stored in VectorValuesAndScales, into registers/free
25954 /// sub-registers, combines them with the given reduction operation as a
25955 /// vector operation and then performs single (small enough) reduction.
25956 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25957 Type *DestTy) {
25958 Value *ReducedSubTree = nullptr;
25959 // Creates reduction and combines with the previous reduction.
25960 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25961 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25962 if (ReducedSubTree)
25963 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25964 "op.rdx", ReductionOps);
25965 else
25966 ReducedSubTree = Rdx;
25967 };
25968 if (VectorValuesAndScales.size() == 1) {
25969 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25970 CreateSingleOp(Vec, Scale, IsSigned);
25971 return ReducedSubTree;
25972 }
25973 // Scales Vec using given Cnt scale factor and then performs vector combine
25974 // with previous value of VecOp.
25975 Value *VecRes = nullptr;
25976 bool VecResSignedness = false;
25977 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25978 Type *ScalarTy = Vec->getType()->getScalarType();
25979 // Scale Vec using given Cnt scale factor.
25980 if (Cnt > 1) {
25981 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25982 switch (RdxKind) {
25983 case RecurKind::Add: {
25984 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25985 unsigned VF = getNumElements(Vec->getType());
25986 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
25987 << ". (HorRdx)\n");
25988 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25989 for (unsigned I : seq<unsigned>(Cnt))
25990 std::iota(std::next(Mask.begin(), VF * I),
25991 std::next(Mask.begin(), VF * (I + 1)), 0);
25992 ++NumVectorInstructions;
25993 Vec = Builder.CreateShuffleVector(Vec, Mask);
25994 break;
25995 }
25996 // res = mul vv, n
25997 if (ScalarTy != DestTy->getScalarType())
25998 Vec = Builder.CreateIntCast(
25999 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
26000 IsSigned);
26001 Value *Scale = ConstantVector::getSplat(
26002 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
26003 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
26004 << ". (HorRdx)\n");
26005 ++NumVectorInstructions;
26006 Vec = Builder.CreateMul(Vec, Scale);
26007 break;
26008 }
26009 case RecurKind::Xor: {
26010 // res = n % 2 ? 0 : vv
26011 LLVM_DEBUG(dbgs()
26012 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
26013 if (Cnt % 2 == 0)
26014 Vec = Constant::getNullValue(Vec->getType());
26015 break;
26016 }
26017 case RecurKind::FAdd: {
26018 // res = fmul v, n
26019 Value *Scale =
26020 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
26021 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
26022 << ". (HorRdx)\n");
26023 ++NumVectorInstructions;
26024 Vec = Builder.CreateFMul(Vec, Scale);
26025 break;
26026 }
26027 case RecurKind::And:
26028 case RecurKind::Or:
26029 case RecurKind::SMax:
26030 case RecurKind::SMin:
26031 case RecurKind::UMax:
26032 case RecurKind::UMin:
26033 case RecurKind::FMax:
26034 case RecurKind::FMin:
26035 case RecurKind::FMaximum:
26036 case RecurKind::FMinimum:
26037 // res = vv
26038 break;
26039 case RecurKind::Sub:
26040 case RecurKind::AddChainWithSubs:
26041 case RecurKind::Mul:
26042 case RecurKind::FMul:
26043 case RecurKind::FMulAdd:
26044 case RecurKind::AnyOf:
26045 case RecurKind::FindFirstIVSMin:
26046 case RecurKind::FindFirstIVUMin:
26047 case RecurKind::FindLastIVSMax:
26048 case RecurKind::FindLastIVUMax:
26049 case RecurKind::FindLast:
26050 case RecurKind::FMaxNum:
26051 case RecurKind::FMinNum:
26052 case RecurKind::FMaximumNum:
26053 case RecurKind::FMinimumNum:
26054 case RecurKind::None:
26055 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26056 }
26057 }
26058 // Combine Vec with the previous VecOp.
26059 if (!VecRes) {
26060 VecRes = Vec;
26061 VecResSignedness = IsSigned;
26062 } else {
26063 ++NumVectorInstructions;
26064 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
26065 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
26066 // Handle ctpop.
26067 unsigned VecResVF = getNumElements(VecRes->getType());
26068 unsigned VecVF = getNumElements(Vec->getType());
26069 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
26070 std::iota(Mask.begin(), Mask.end(), 0);
26071 // Ensure that VecRes is always larger than Vec
26072 if (VecResVF < VecVF) {
26073 std::swap(VecRes, Vec);
26074 std::swap(VecResVF, VecVF);
26075 }
26076 if (VecResVF != VecVF) {
26077 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
26078 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
26079 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
26080 }
26081 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
26082 return;
26083 }
26084 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
26085 VecRes = Builder.CreateIntCast(
26086 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
26087 VecResSignedness);
26088 if (ScalarTy != DestTy->getScalarType())
26089 Vec = Builder.CreateIntCast(
26090 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
26091 IsSigned);
26092 unsigned VecResVF = getNumElements(VecRes->getType());
26093 unsigned VecVF = getNumElements(Vec->getType());
26094 // Ensure that VecRes is always larger than Vec
26095 if (VecResVF < VecVF) {
26096 std::swap(VecRes, Vec);
26097 std::swap(VecResVF, VecVF);
26098 }
26099 // extract + op + insert
26100 Value *Op = VecRes;
26101 if (VecResVF != VecVF)
26102 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
26103 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
26104 if (VecResVF != VecVF)
26105 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
26106 VecRes = Op;
26107 }
26108 };
26109 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26110 CreateVecOp(Vec, Scale, IsSigned);
26111 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
26112
26113 return ReducedSubTree;
26114 }
26115
26116 /// Emit a horizontal reduction of the vectorized value.
26117 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26118 const TargetTransformInfo *TTI, Type *DestTy) {
26119 assert(VectorizedValue && "Need to have a vectorized tree node");
26120 assert(RdxKind != RecurKind::FMulAdd &&
26121 "A call to the llvm.fmuladd intrinsic is not handled yet");
26122
26123 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
26124 if (FTy->getScalarType() == Builder.getInt1Ty() &&
26125 RdxKind == RecurKind::Add &&
26126 DestTy->getScalarType() != FTy->getScalarType()) {
26127 // Convert vector_reduce_add(ZExt(<n x i1>)) to
26128 // ZExtOrTrunc(ctpop(bitcast <n x i1> to i<n>)).
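// E.g. (illustrative): for an <8 x i1> mask the bitcast yields an i8, and
// ctpop of that i8 equals the sum of the eight i1 lanes.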
26129 Value *V = Builder.CreateBitCast(
26130 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
26131 ++NumVectorInstructions;
26132 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
26133 }
26134 ++NumVectorInstructions;
26135 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
26136 }
26137
26138 /// Emits optimized code for unique scalar value reused \p Cnt times.
26139 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
26140 unsigned Cnt) {
26141 assert(IsSupportedHorRdxIdentityOp &&
26142 "The optimization of matched scalar identity horizontal reductions "
26143 "must be supported.");
26144 if (Cnt == 1)
26145 return VectorizedValue;
26146 switch (RdxKind) {
26147 case RecurKind::Add: {
26148 // res = mul vv, n
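// (Reducing the same value Cnt times with add equals multiplying it by Cnt,
// e.g. x + x + x == x * 3.)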
26149 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
26150 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
26151 << VectorizedValue << ". (HorRdx)\n");
26152 return Builder.CreateMul(VectorizedValue, Scale);
26153 }
26154 case RecurKind::Xor: {
26155 // res = n % 2 ? 0 : vv
26156 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
26157 << ". (HorRdx)\n");
26158 if (Cnt % 2 == 0)
26159 return Constant::getNullValue(VectorizedValue->getType());
26160 return VectorizedValue;
26161 }
26162 case RecurKind::FAdd: {
26163 // res = fmul v, n
26164 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
26165 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
26166 << VectorizedValue << ". (HorRdx)\n");
26167 return Builder.CreateFMul(VectorizedValue, Scale);
26168 }
26169 case RecurKind::And:
26170 case RecurKind::Or:
26171 case RecurKind::SMax:
26172 case RecurKind::SMin:
26173 case RecurKind::UMax:
26174 case RecurKind::UMin:
26175 case RecurKind::FMax:
26176 case RecurKind::FMin:
26177 case RecurKind::FMaximum:
26178 case RecurKind::FMinimum:
26179 // res = vv
26180 return VectorizedValue;
26181 case RecurKind::Sub:
26182 case RecurKind::AddChainWithSubs:
26183 case RecurKind::Mul:
26184 case RecurKind::FMul:
26185 case RecurKind::FMulAdd:
26186 case RecurKind::AnyOf:
26187 case RecurKind::FindFirstIVSMin:
26188 case RecurKind::FindFirstIVUMin:
26189 case RecurKind::FindLastIVSMax:
26190 case RecurKind::FindLastIVUMax:
26191 case RecurKind::FindLast:
26192 case RecurKind::FMaxNum:
26193 case RecurKind::FMinNum:
26194 case RecurKind::FMaximumNum:
26195 case RecurKind::FMinimumNum:
26196 case RecurKind::None:
26197 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26198 }
26199 return nullptr;
26200 }
26201
26202 /// Emits actual operation for the scalar identity values, found during
26203 /// horizontal reduction analysis.
26204 Value *
26205 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26206 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26207 const DenseMap<Value *, Value *> &TrackedToOrig) {
26208 assert(IsSupportedHorRdxIdentityOp &&
26209 "The optimization of matched scalar identity horizontal reductions "
26210 "must be supported.");
26211 ArrayRef<Value *> VL = R.getRootNodeScalars();
26212 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
26213 if (VTy->getElementType() != VL.front()->getType()) {
26214 VectorizedValue = Builder.CreateIntCast(
26215 VectorizedValue,
26216 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
26217 R.isSignedMinBitwidthRootNode());
26218 }
26219 switch (RdxKind) {
26220 case RecurKind::Add: {
26221 // root = mul prev_root, <1, 1, n, 1>
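// (Each lane is multiplied by the repeat count of its original scalar; lanes
// that appeared only once keep a factor of 1.)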
26222 SmallVector<Constant *> Vals;
26223 for (Value *V : VL) {
26224 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26225 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
26226 }
26227 auto *Scale = ConstantVector::get(Vals);
26228 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
26229 << VectorizedValue << ". (HorRdx)\n");
26230 return Builder.CreateMul(VectorizedValue, Scale);
26231 }
26232 case RecurKind::And:
26233 case RecurKind::Or:
26234 // No need for multiple or/and(s).
26235 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
26236 << ". (HorRdx)\n");
26237 return VectorizedValue;
26238 case RecurKind::SMax:
26239 case RecurKind::SMin:
26240 case RecurKind::UMax:
26241 case RecurKind::UMin:
26242 case RecurKind::FMax:
26243 case RecurKind::FMin:
26244 case RecurKind::FMaximum:
26245 case RecurKind::FMinimum:
26246 // No need for multiple min/max(s) of the same value.
26247 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
26248 << ". (HorRdx)\n");
26249 return VectorizedValue;
26250 case RecurKind::Xor: {
26251 // Replace values with even number of repeats with 0, since
26252 // x xor x = 0.
26253 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
26254 // 7>, if the 4th and 6th elements have an even number of repeats.
26255 SmallVector<int> Mask(
26256 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
26257 PoisonMaskElem);
26258 std::iota(Mask.begin(), Mask.end(), 0);
26259 bool NeedShuffle = false;
26260 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
26261 Value *V = VL[I];
26262 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26263 if (Cnt % 2 == 0) {
26264 Mask[I] = VF;
26265 NeedShuffle = true;
26266 }
26267 }
26268 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
26269 : Mask) dbgs()
26270 << I << " ";
26271 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
26272 if (NeedShuffle)
26273 VectorizedValue = Builder.CreateShuffleVector(
26274 VectorizedValue,
26275 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
26276 return VectorizedValue;
26277 }
26278 case RecurKind::FAdd: {
26279 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
26280 SmallVector<Constant *> Vals;
26281 for (Value *V : VL) {
26282 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26283 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
26284 }
26285 auto *Scale = ConstantVector::get(Vals);
26286 return Builder.CreateFMul(VectorizedValue, Scale);
26287 }
26288 case RecurKind::Sub:
26289 case RecurKind::AddChainWithSubs:
26290 case RecurKind::Mul:
26291 case RecurKind::FMul:
26292 case RecurKind::FMulAdd:
26293 case RecurKind::AnyOf:
26294 case RecurKind::FindFirstIVSMin:
26295 case RecurKind::FindFirstIVUMin:
26296 case RecurKind::FindLastIVSMax:
26297 case RecurKind::FindLastIVUMax:
26298 case RecurKind::FindLast:
26299 case RecurKind::FMaxNum:
26300 case RecurKind::FMinNum:
26301 case RecurKind::FMaximumNum:
26302 case RecurKind::FMinimumNum:
26303 case RecurKind::None:
26304 llvm_unreachable("Unexpected reduction kind for reused scalars.");
26305 }
26306 return nullptr;
26307 }
26308};
26309} // end anonymous namespace
26310
26311/// Gets recurrence kind from the specified value.
26312 static RecurKind getRdxKind(Value *V) {
26313 return HorizontalReduction::getRdxKind(V);
26314}
26315static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
26316 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
26317 return cast<FixedVectorType>(IE->getType())->getNumElements();
26318
26319 unsigned AggregateSize = 1;
26320 auto *IV = cast<InsertValueInst>(InsertInst);
26321 Type *CurrentType = IV->getType();
26322 do {
26323 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
26324 for (auto *Elt : ST->elements())
26325 if (Elt != ST->getElementType(0)) // check homogeneity
26326 return std::nullopt;
26327 AggregateSize *= ST->getNumElements();
26328 CurrentType = ST->getElementType(0);
26329 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
26330 AggregateSize *= AT->getNumElements();
26331 CurrentType = AT->getElementType();
26332 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
26333 AggregateSize *= VT->getNumElements();
26334 return AggregateSize;
26335 } else if (CurrentType->isSingleValueType()) {
26336 return AggregateSize;
26337 } else {
26338 return std::nullopt;
26339 }
26340 } while (true);
26341}
26342
26343static void findBuildAggregateRec(Instruction *LastInsertInst,
26344 TargetTransformInfo *TTI,
26345 SmallVectorImpl<Value *> &BuildVectorOpds,
26346 SmallVectorImpl<Value *> &InsertElts,
26347 unsigned OperandOffset, const BoUpSLP &R) {
26348 do {
26349 Value *InsertedOperand = LastInsertInst->getOperand(1);
26350 std::optional<unsigned> OperandIndex =
26351 getElementIndex(LastInsertInst, OperandOffset);
26352 if (!OperandIndex || R.isDeleted(LastInsertInst))
26353 return;
26354 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
26355 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
26356 BuildVectorOpds, InsertElts, *OperandIndex, R);
26357
26358 } else {
26359 BuildVectorOpds[*OperandIndex] = InsertedOperand;
26360 InsertElts[*OperandIndex] = LastInsertInst;
26361 }
26362 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
26363 } while (LastInsertInst != nullptr &&
26364 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
26365 LastInsertInst->hasOneUse());
26366}
26367
26368/// Recognize construction of vectors like
26369/// %ra = insertelement <4 x float> poison, float %s0, i32 0
26370/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
26371/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
26372/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
26373/// starting from the last insertelement or insertvalue instruction.
26374///
26375/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
26376/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
26377/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
26378///
26379/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
26380///
26381/// \return true if it matches.
26382static bool findBuildAggregate(Instruction *LastInsertInst,
26383 TargetTransformInfo *TTI,
26384 SmallVectorImpl<Value *> &BuildVectorOpds,
26385 SmallVectorImpl<Value *> &InsertElts,
26386 const BoUpSLP &R) {
26387
26388 assert((isa<InsertElementInst>(LastInsertInst) ||
26389 isa<InsertValueInst>(LastInsertInst)) &&
26390 "Expected insertelement or insertvalue instruction!");
26391
26392 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
26393 "Expected empty result vectors!");
26394
26395 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
26396 if (!AggregateSize)
26397 return false;
26398 BuildVectorOpds.resize(*AggregateSize);
26399 InsertElts.resize(*AggregateSize);
26400
26401 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
26402 llvm::erase(BuildVectorOpds, nullptr);
26403 llvm::erase(InsertElts, nullptr);
26404 if (BuildVectorOpds.size() >= 2)
26405 return true;
26406
26407 return false;
26408}
26409
26410/// Try and get a reduction instruction from a phi node.
26411///
26412/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
26413/// if they come from either \p ParentBB or a containing loop latch.
26414///
26415/// \returns A candidate reduction value if possible, or \code nullptr \endcode
26416/// if not possible.
26417 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
26418 BasicBlock *ParentBB, LoopInfo *LI) {
26419 // There are situations where the reduction value is not dominated by the
26420 // reduction phi. Vectorizing such cases has been reported to cause
26421 // miscompiles. See PR25787.
26422 auto DominatedReduxValue = [&](Value *R) {
26423 return isa<Instruction>(R) &&
26424 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
26425 };
26426
26427 Instruction *Rdx = nullptr;
26428
26429 // Return the incoming value if it comes from the same BB as the phi node.
26430 if (P->getIncomingBlock(0) == ParentBB) {
26431 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26432 } else if (P->getIncomingBlock(1) == ParentBB) {
26433 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26434 }
26435
26436 if (Rdx && DominatedReduxValue(Rdx))
26437 return Rdx;
26438
26439 // Otherwise, check whether we have a loop latch to look at.
26440 Loop *BBL = LI->getLoopFor(ParentBB);
26441 if (!BBL)
26442 return nullptr;
26443 BasicBlock *BBLatch = BBL->getLoopLatch();
26444 if (!BBLatch)
26445 return nullptr;
26446
26447 // There is a loop latch, return the incoming value if it comes from
26448 // that. This reduction pattern occasionally turns up.
26449 if (P->getIncomingBlock(0) == BBLatch) {
26450 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26451 } else if (P->getIncomingBlock(1) == BBLatch) {
26452 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26453 }
26454
26455 if (Rdx && DominatedReduxValue(Rdx))
26456 return Rdx;
26457
26458 return nullptr;
26459}
26460
26461static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
26462 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
26463 return true;
26464 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
26465 return true;
26466 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
26467 return true;
26468 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
26469 return true;
26470 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
26471 return true;
26472 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
26473 return true;
26474 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
26475 return true;
26476 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
26477 return true;
26478 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
26479 return true;
26480 return false;
26481}
26482
26483/// We could have an initial reduction that is not an add.
26484/// r *= v1 + v2 + v3 + v4
26485/// In such a case start looking for a tree rooted in the first '+'.
26486/// \Returns the new root if found, which may be nullptr if not an instruction.
26487 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
26488 Instruction *Root) {
26489 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
26490 isa<IntrinsicInst>(Root)) &&
26491 "Expected binop, select, or intrinsic for reduction matching");
26492 Value *LHS =
26493 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
26494 Value *RHS =
26495 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
26496 if (LHS == Phi)
26497 return dyn_cast<Instruction>(RHS);
26498 if (RHS == Phi)
26499 return dyn_cast<Instruction>(LHS);
26500 return nullptr;
26501}
26502
26503 /// \returns the first operand of \p I that does not match \p Phi. If the
26504 /// operand is not an instruction, it returns nullptr.
26505 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
26506 Value *Op0 = nullptr;
26507 Value *Op1 = nullptr;
26508 if (!matchRdxBop(I, Op0, Op1))
26509 return nullptr;
26510 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
26511}
26512
26513/// \Returns true if \p I is a candidate instruction for reduction vectorization.
26514 static bool isReductionCandidate(Instruction *I) {
26515 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
26516 Value *B0 = nullptr, *B1 = nullptr;
26517 bool IsBinop = matchRdxBop(I, B0, B1);
26518 return IsBinop || IsSelect;
26519}
26520
26521bool SLPVectorizerPass::vectorizeHorReduction(
26522 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
26523 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26524 if (!ShouldVectorizeHor)
26525 return false;
26526 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
26527
26528 if (Root->getParent() != BB || isa<PHINode>(Root))
26529 return false;
26530
26531 // If we can find a secondary reduction root, use that instead.
26532 auto SelectRoot = [&]() {
26533 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
26534 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
26535 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
26536 return NewRoot;
26537 return Root;
26538 };
26539
26540 // Start analysis starting from Root instruction. If horizontal reduction is
26541 // found, try to vectorize it. If it is not a horizontal reduction or
26542 // vectorization is not possible or not effective, and currently analyzed
26543 // instruction is a binary operation, try to vectorize the operands, using
26544 // pre-order DFS traversal order. If the operands were not vectorized, repeat
26545 // the same procedure considering each operand as a possible root of the
26546 // horizontal reduction.
26547 // Interrupt the process if the Root instruction itself was vectorized or all
26548 // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
26549 // If a horizontal reduction was not matched or vectorized, we collect
26550 // instructions for possible later vectorization attempts.
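// Illustration (hypothetical IR): for %r = fadd (fmul %a, %b), (fmul %c, %d),
// the fadd is tried first as a reduction root; if that fails, its fmul
// operands are pushed onto the work list and later tried as roots themselves
// or collected as postponed seeds.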
26551 std::queue<std::pair<Instruction *, unsigned>> Stack;
26552 Stack.emplace(SelectRoot(), 0);
26553 SmallPtrSet<Value *, 8> VisitedInstrs;
26554 bool Res = false;
26555 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
26556 if (R.isAnalyzedReductionRoot(Inst))
26557 return nullptr;
26558 if (!isReductionCandidate(Inst))
26559 return nullptr;
26560 HorizontalReduction HorRdx;
26561 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
26562 return nullptr;
26563 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
26564 };
26565 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
26566 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26567 FutureSeed = getNonPhiOperand(Root, P);
26568 if (!FutureSeed)
26569 return false;
26570 }
26571 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
26572 // analysis is done separately.
26573 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
26574 PostponedInsts.push_back(FutureSeed);
26575 return true;
26576 };
26577
26578 while (!Stack.empty()) {
26579 Instruction *Inst;
26580 unsigned Level;
26581 std::tie(Inst, Level) = Stack.front();
26582 Stack.pop();
26583 // Do not try to analyze instruction that has already been vectorized.
26584 // This may happen when we vectorize instruction operands on a previous
26585 // iteration while stack was populated before that happened.
26586 if (R.isDeleted(Inst))
26587 continue;
26588 if (Value *VectorizedV = TryToReduce(Inst)) {
26589 Res = true;
26590 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
26591 // Try to find another reduction.
26592 Stack.emplace(I, Level);
26593 continue;
26594 }
26595 if (R.isDeleted(Inst))
26596 continue;
26597 } else {
26598 // We could not vectorize `Inst` so try to use it as a future seed.
26599 if (!TryAppendToPostponedInsts(Inst)) {
26600 assert(Stack.empty() && "Expected empty stack");
26601 break;
26602 }
26603 }
26604
26605 // Try to vectorize operands.
26606 // Continue analysis for the instruction from the same basic block only to
26607 // save compile time.
26608 if (++Level < RecursionMaxDepth)
26609 for (auto *Op : Inst->operand_values())
26610 if (VisitedInstrs.insert(Op).second)
26611 if (auto *I = dyn_cast<Instruction>(Op))
26612 // Do not try to vectorize CmpInst operands, this is done
26613 // separately.
26614 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
26615 !R.isDeleted(I) && I->getParent() == BB)
26616 Stack.emplace(I, Level);
26617 }
26618 return Res;
26619}
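// Illustrative sketch (not taken from the code above; names are hypothetical):
// the DFS seeded by SelectRoot() targets scalar chains such as
//   %s0 = fadd fast float %a0, %a1
//   %s1 = fadd fast float %s0, %a2
//   %r  = fadd fast float %s1, %a3
// Starting from %r, TryToReduce attempts to match the whole chain as a
// horizontal reduction; if that fails, the operands are pushed onto the work
// queue (up to RecursionMaxDepth) as new candidate roots, and unmatched roots
// are postponed for the later tryToVectorize() attempts.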
26620
26621bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
26622 if (!I)
26623 return false;
26624
26625 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
26626 return false;
26627 // Skip potential FMA candidates.
26628 if ((I->getOpcode() == Instruction::FAdd ||
26629 I->getOpcode() == Instruction::FSub) &&
26630 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
26631 .isValid())
26632 return false;
26633
26634 Value *P = I->getParent();
26635
26636 // Vectorize in current basic block only.
26637 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
26638 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
26639 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
26640 R.isDeleted(Op0) || R.isDeleted(Op1))
26641 return false;
26642
26643 // First collect all possible candidates
26644 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
26645 Candidates.emplace_back(Op0, Op1);
26646
26647 auto *A = dyn_cast<BinaryOperator>(Op0);
26648 auto *B = dyn_cast<BinaryOperator>(Op1);
26649 // Try to skip B.
26650 if (A && B && B->hasOneUse()) {
26651 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
26652 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
26653 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
26654 Candidates.emplace_back(A, B0);
26655 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
26656 Candidates.emplace_back(A, B1);
26657 }
26658 // Try to skip A.
26659 if (B && A && A->hasOneUse()) {
26660 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
26661 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
26662 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
26663 Candidates.emplace_back(A0, B);
26664 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
26665 Candidates.emplace_back(A1, B);
26666 }
26667
26668 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
26669 ArrayRef<Value *> Ops) {
26670 if (!isReductionCandidate(Inst))
26671 return false;
26672 Type *Ty = Inst->getType();
26673 if (!isValidElementType(Ty) || Ty->isPointerTy())
26674 return false;
26675 HorizontalReduction HorRdx(Inst, Ops);
26676 if (!HorRdx.matchReductionForOperands())
26677 return false;
26678 // Check the cost of operations.
26679 VectorType *VecTy = getWidenedType(Ty, Ops.size());
26680 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
26681 InstructionCost ScalarCost =
26682 TTI.getScalarizationOverhead(
26683 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
26684 /*Extract=*/true, CostKind) +
26685 TTI.getInstructionCost(Inst, CostKind);
26686 InstructionCost RedCost;
26687 switch (::getRdxKind(Inst)) {
26688 case RecurKind::Add:
26689 case RecurKind::Mul:
26690 case RecurKind::Or:
26691 case RecurKind::And:
26692 case RecurKind::Xor:
26693 case RecurKind::FAdd:
26694 case RecurKind::FMul: {
26695 FastMathFlags FMF;
26696 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
26697 FMF = FPCI->getFastMathFlags();
26698 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
26699 CostKind);
26700 break;
26701 }
26702 default:
26703 return false;
26704 }
26705 if (RedCost >= ScalarCost)
26706 return false;
26707
26708 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
26709 };
26710 if (Candidates.size() == 1)
26711 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
26712
26713 // We have multiple options. Try to pick the single best.
26714 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
26715 if (!BestCandidate)
26716 return false;
26717 return (*BestCandidate == 0 &&
26718 TryToReduce(I, {Candidates[*BestCandidate].first,
26719 Candidates[*BestCandidate].second})) ||
26720 tryToVectorizeList({Candidates[*BestCandidate].first,
26721 Candidates[*BestCandidate].second},
26722 R);
26723}
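// Worked example (sketch, operand names are hypothetical): for
//   I = (x*y + z*w) + (p*q + r*s), with A = x*y + z*w and B = p*q + r*s,
// the Candidates list built above holds the pairs {A,B}, {A,p*q}, {A,r*s},
// {x*y,B} and {z*w,B}. findBestRootPair picks the most promising pair; only
// if that pair is the original {A,B} is a reduction attempted first, otherwise
// the chosen pair is simply tried as a 2-element bundle via tryToVectorizeList.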
26724
26725bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
26726 BasicBlock *BB, BoUpSLP &R) {
26727 SmallVector<WeakTrackingVH> PostponedInsts;
26728 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
26729 Res |= tryToVectorize(PostponedInsts, R);
26730 return Res;
26731}
26732
26733bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
26734 BoUpSLP &R) {
26735 bool Res = false;
26736 for (Value *V : Insts)
26737 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
26738 Res |= tryToVectorize(Inst, R);
26739 return Res;
26740}
26741
26742bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
26743 BasicBlock *BB, BoUpSLP &R,
26744 bool MaxVFOnly) {
26745 if (!R.canMapToVector(IVI->getType()))
26746 return false;
26747
26748 SmallVector<Value *, 16> BuildVectorOpds;
26749 SmallVector<Value *, 16> BuildVectorInsts;
26750 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
26751 return false;
26752
26753 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
26754 R.getORE()->emit([&]() {
26755 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
26756 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
26757 "trying reduction first.";
26758 });
26759 return false;
26760 }
26761 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
26762 // Aggregate value is unlikely to be processed in vector register.
26763 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
26764}
26765
26766bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
26767 BasicBlock *BB, BoUpSLP &R,
26768 bool MaxVFOnly) {
26769 SmallVector<Value *, 16> BuildVectorInsts;
26770 SmallVector<Value *, 16> BuildVectorOpds;
26771 SmallVector<int> Mask;
26772 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
26773 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
26774 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
26775 return false;
26776
26777 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
26778 R.getORE()->emit([&]() {
26779 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
26780 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
26781 "trying reduction first.";
26782 });
26783 return false;
26784 }
26785 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
26786 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
26787}
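// For reference, a buildvector sequence recognized by findBuildAggregate above
// typically looks like the following (sketch; 4 elements chosen arbitrarily):
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i32 1
//   %v2 = insertelement <4 x float> %v1, float %s2, i32 2
//   %v3 = insertelement <4 x float> %v2, float %s3, i32 3
// The insertelement instructions become BuildVectorInsts and the scalars
// %s0..%s3 become BuildVectorOpds; the list handed to tryToVectorizeList here
// is the insert chain itself, while vectorizeInsertValueInst above passes the
// aggregate operands instead.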
26788
26789template <typename T>
26790static bool tryToVectorizeSequence(
26791 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
26792 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
26793 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
26794 bool MaxVFOnly, BoUpSLP &R) {
26795 bool Changed = false;
26796 // Sort by type, parent, operands.
26797 stable_sort(Incoming, Comparator);
26798
26799 // Try to vectorize elements based on their type.
26800 SmallVector<T *> Candidates;
26801 SmallVector<T *> VL;
26802 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
26803 VL.clear()) {
26804 // Look for the next elements with the same type, parent and operand
26805 // kinds.
26806 auto *I = dyn_cast<Instruction>(*IncIt);
26807 if (!I || R.isDeleted(I)) {
26808 ++IncIt;
26809 continue;
26810 }
26811 auto *SameTypeIt = IncIt;
26812 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
26813 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26814 AreCompatible(VL, *SameTypeIt))) {
26815 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26816 ++SameTypeIt;
26817 if (I && !R.isDeleted(I))
26818 VL.push_back(cast<T>(I));
26819 }
26820
26821 // Try to vectorize them.
26822 unsigned NumElts = VL.size();
26823 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
26824 << NumElts << ")\n");
26825 // The vectorization is a 3-state attempt:
26826 // 1. Try to vectorize instructions with the same/alternate opcodes with the
26827 // size of maximal register at first.
26828 // 2. Try to vectorize the remaining instructions with the same type, if
26829 // possible. This may give better results than vectorizing only the
26830 // instructions with the same/alternate opcodes.
26831 // 3. Final attempt to try to vectorize all instructions with the
26832 // same/alternate ops only, this may result in some extra final
26833 // vectorization.
26834 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
26835 // Success; start over because instructions might have been changed.
26836 Changed = true;
26837 VL.swap(Candidates);
26838 Candidates.clear();
26839 for (T *V : VL) {
26840 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26841 Candidates.push_back(V);
26842 }
26843 } else {
26844 /// \Returns the minimum number of elements that we will attempt to
26845 /// vectorize.
26846 auto GetMinNumElements = [&R](Value *V) {
26847 unsigned EltSize = R.getVectorElementSize(V);
26848 return std::max(2U, R.getMaxVecRegSize() / EltSize);
26849 };
26850 if (NumElts < GetMinNumElements(*IncIt) &&
26851 (Candidates.empty() ||
26852 Candidates.front()->getType() == (*IncIt)->getType())) {
26853 for (T *V : VL) {
26854 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26855 Candidates.push_back(V);
26856 }
26857 }
26858 }
26859 // Final attempt to vectorize instructions with the same types.
26860 if (Candidates.size() > 1 &&
26861 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
26862 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
26863 // Success; start over because instructions might have been changed.
26864 Changed = true;
26865 } else if (MaxVFOnly) {
26866 // Try to vectorize using small vectors.
26867 SmallVector<T *> VL;
26868 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
26869 VL.clear()) {
26870 auto *I = dyn_cast<Instruction>(*It);
26871 if (!I || R.isDeleted(I)) {
26872 ++It;
26873 continue;
26874 }
26875 auto *SameTypeIt = It;
26876 while (SameTypeIt != End &&
26877 (!isa<Instruction>(*SameTypeIt) ||
26878 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26879 AreCompatible(*SameTypeIt, *It))) {
26880 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26881 ++SameTypeIt;
26882 if (I && !R.isDeleted(I))
26883 VL.push_back(cast<T>(I));
26884 }
26885 unsigned NumElts = VL.size();
26886 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26887 /*MaxVFOnly=*/false))
26888 Changed = true;
26889 It = SameTypeIt;
26890 }
26891 }
26892 Candidates.clear();
26893 }
26894
26895 // Start over at the next instruction of a different type (or the end).
26896 IncIt = SameTypeIt;
26897 }
26898 return Changed;
26899}
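// Worked example (sketch, assuming a hypothetical mix of nodes): with a sorted
// Incoming list {A0, A1, A2, A3, B0, B1}, where the A* nodes are mutually
// compatible and the B* nodes have a different type, the loop above first
// tries {A0..A3} with MaxVFOnly; if that fails and the group is smaller than
// the minimum VF, the leftovers are accumulated into Candidates, which is then
// retried (including with smaller vector factors) once the type changes at B0.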
26900
26901/// Compare two cmp instructions. If IsCompatibility is true, the function
26902/// returns true if the 2 cmps have same/swapped predicates and the most
26903/// compatible corresponding operands. If IsCompatibility is false, the function
26904/// implements a strict weak ordering relation between the two cmp instructions,
26905/// returning true if the first instruction is "less" than the second, i.e. its
26906/// predicate is less than the predicate of the second or the operand IDs are
26907/// less than the operand IDs of the second cmp instruction.
26908template <bool IsCompatibility>
26909static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
26910 const DominatorTree &DT) {
26911 assert(isValidElementType(V->getType()) &&
26912 isValidElementType(V2->getType()) &&
26913 "Expected valid element types only.");
26914 if (V == V2)
26915 return IsCompatibility;
26916 auto *CI1 = cast<CmpInst>(V);
26917 auto *CI2 = cast<CmpInst>(V2);
26918 if (CI1->getOperand(0)->getType()->getTypeID() <
26919 CI2->getOperand(0)->getType()->getTypeID())
26920 return !IsCompatibility;
26921 if (CI1->getOperand(0)->getType()->getTypeID() >
26922 CI2->getOperand(0)->getType()->getTypeID())
26923 return false;
26924 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26925 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26926 return !IsCompatibility;
26927 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26928 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26929 return false;
26930 CmpInst::Predicate Pred1 = CI1->getPredicate();
26931 CmpInst::Predicate Pred2 = CI2->getPredicate();
26932 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
26933 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
26934 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
26935 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
26936 if (BasePred1 < BasePred2)
26937 return !IsCompatibility;
26938 if (BasePred1 > BasePred2)
26939 return false;
26940 // Compare operands.
26941 bool CI1Preds = Pred1 == BasePred1;
26942 bool CI2Preds = Pred2 == BasePred1;
26943 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26944 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26945 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
26946 if (Op1 == Op2)
26947 continue;
26948 if (Op1->getValueID() < Op2->getValueID())
26949 return !IsCompatibility;
26950 if (Op1->getValueID() > Op2->getValueID())
26951 return false;
26952 if (auto *I1 = dyn_cast<Instruction>(Op1))
26953 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
26954 if (IsCompatibility) {
26955 if (I1->getParent() != I2->getParent())
26956 return false;
26957 } else {
26958 // Try to compare nodes with same parent.
26959 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
26960 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
26961 if (!NodeI1)
26962 return NodeI2 != nullptr;
26963 if (!NodeI2)
26964 return false;
26965 assert((NodeI1 == NodeI2) ==
26966 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26967 "Different nodes should have different DFS numbers");
26968 if (NodeI1 != NodeI2)
26969 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26970 }
26971 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26972 if (S && (IsCompatibility || !S.isAltShuffle()))
26973 continue;
26974 if (IsCompatibility)
26975 return false;
26976 if (I1->getOpcode() != I2->getOpcode())
26977 return I1->getOpcode() < I2->getOpcode();
26978 }
26979 }
26980 return IsCompatibility;
26981}
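// Usage sketch: the two instantiations serve different purposes. With
// IsCompatibility = false the function is a strict weak ordering suitable for
// stable_sort; with IsCompatibility = true it answers whether two compares may
// share a bundle. This mirrors how the helpers below are built, e.g.:
//   auto CompareSorter = [&](Value *V, Value *V2) {
//     return compareCmp<false>(V, V2, *TLI, *DT);
//   };
// (the real CompareSorter in vectorizeCmpInsts below also short-circuits the
// V == V2 case before calling compareCmp).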
26982
26983template <typename ItT>
26984bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26985 BasicBlock *BB, BoUpSLP &R) {
26986 bool Changed = false;
26987 // Try to find reductions first.
26988 for (CmpInst *I : CmpInsts) {
26989 if (R.isDeleted(I))
26990 continue;
26991 for (Value *Op : I->operands())
26992 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26993 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26994 if (R.isDeleted(I))
26995 break;
26996 }
26997 }
26998 // Try to vectorize operands as vector bundles.
26999 for (CmpInst *I : CmpInsts) {
27000 if (R.isDeleted(I))
27001 continue;
27002 Changed |= tryToVectorize(I, R);
27003 }
27004 // Try to vectorize list of compares.
27005 // Sort by type, compare predicate, etc.
27006 auto CompareSorter = [&](Value *V, Value *V2) {
27007 if (V == V2)
27008 return false;
27009 return compareCmp<false>(V, V2, *TLI, *DT);
27010 };
27011
27012 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
27013 if (VL.empty() || VL.back() == V1)
27014 return true;
27015 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
27016 };
27017
27018 SmallVector<Value *> Vals;
27019 for (Instruction *V : CmpInsts)
27020 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
27021 Vals.push_back(V);
27022 if (Vals.size() <= 1)
27023 return Changed;
27024 Changed |= tryToVectorizeSequence<Value>(
27025 Vals, CompareSorter, AreCompatibleCompares,
27026 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27027 // Exclude possible reductions from other blocks.
27028 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
27029 return any_of(V->users(), [V](User *U) {
27030 auto *Select = dyn_cast<SelectInst>(U);
27031 return Select &&
27032 Select->getParent() != cast<Instruction>(V)->getParent();
27033 });
27034 });
27035 if (ArePossiblyReducedInOtherBlock)
27036 return false;
27037 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27038 },
27039 /*MaxVFOnly=*/true, R);
27040 return Changed;
27041}
27042
27043bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
27044 BasicBlock *BB, BoUpSLP &R) {
27046 "This function only accepts Insert instructions");
27047 bool OpsChanged = false;
27048 SmallVector<WeakTrackingVH> PostponedInsts;
27049 for (auto *I : reverse(Instructions)) {
27050 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
27051 if (R.isDeleted(I) || isa<CmpInst>(I))
27052 continue;
27053 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27054 OpsChanged |=
27055 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
27056 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27057 OpsChanged |=
27058 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
27059 }
27060 // pass2 - try to vectorize reductions only
27061 if (R.isDeleted(I))
27062 continue;
27063 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
27064 if (R.isDeleted(I) || isa<CmpInst>(I))
27065 continue;
27066 // pass3 - try to match and vectorize a buildvector sequence.
27067 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27068 OpsChanged |=
27069 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
27070 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27071 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
27072 /*MaxVFOnly=*/false);
27073 }
27074 }
27075 // Now try to vectorize postponed instructions.
27076 OpsChanged |= tryToVectorize(PostponedInsts, R);
27077
27078 Instructions.clear();
27079 return OpsChanged;
27080}
27081
27082bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
27083 bool Changed = false;
27084 SmallVector<Value *, 4> Incoming;
27085 SmallPtrSet<Value *, 16> VisitedInstrs;
27086 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
27087 // node. This makes it easier to identify the chains that can be
27088 // vectorized in a better way.
27089 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
27090 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
27091 assert(isValidElementType(V1->getType()) &&
27092 isValidElementType(V2->getType()) &&
27093 "Expected vectorizable types only.");
27094 if (V1 == V2)
27095 return false;
27096 // It is fine to compare type IDs here, since we expect only vectorizable
27097 // types, like ints, floats and pointers; we don't care about other types.
27098 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
27099 return true;
27100 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
27101 return false;
27102 if (V1->getType()->getScalarSizeInBits() <
27103 V2->getType()->getScalarSizeInBits())
27104 return true;
27105 if (V1->getType()->getScalarSizeInBits() >
27106 V2->getType()->getScalarSizeInBits())
27107 return false;
27108 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27109 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27110 if (Opcodes1.size() < Opcodes2.size())
27111 return true;
27112 if (Opcodes1.size() > Opcodes2.size())
27113 return false;
27114 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27115 {
27116 // Instructions come first.
27117 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
27118 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
27119 if (I1 && I2) {
27120 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27121 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27122 if (!NodeI1)
27123 return NodeI2 != nullptr;
27124 if (!NodeI2)
27125 return false;
27126 assert((NodeI1 == NodeI2) ==
27127 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27128 "Different nodes should have different DFS numbers");
27129 if (NodeI1 != NodeI2)
27130 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27131 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
27132 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27133 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
27134 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
27135 if (!E1 || !E2)
27136 continue;
27137
27138 // Sort on ExtractElementInsts primarily by vector operands. Prefer
27139 // program order of the vector operands.
27140 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
27141 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
27142 if (V1 != V2) {
27143 if (V1 && !V2)
27144 return true;
27145 if (!V1 && V2)
27146 return false;
27147 DomTreeNodeBase<BasicBlock> *NodeI1 =
27148 DT->getNode(V1->getParent());
27149 DomTreeNodeBase<BasicBlock> *NodeI2 =
27150 DT->getNode(V2->getParent());
27151 if (!NodeI1)
27152 return NodeI2 != nullptr;
27153 if (!NodeI2)
27154 return false;
27155 assert((NodeI1 == NodeI2) ==
27156 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27157 "Different nodes should have different DFS numbers");
27158 if (NodeI1 != NodeI2)
27159 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27160 return V1->comesBefore(V2);
27161 }
27162 // If we have the same vector operand, try to sort by constant
27163 // index.
27164 std::optional<unsigned> Id1 = getExtractIndex(E1);
27165 std::optional<unsigned> Id2 = getExtractIndex(E2);
27166 // Bring constants to the top
27167 if (Id1 && !Id2)
27168 return true;
27169 if (!Id1 && Id2)
27170 return false;
27171 // First elements come first.
27172 if (Id1 && Id2)
27173 return *Id1 < *Id2;
27174
27175 continue;
27176 }
27177 if (I1->getOpcode() == I2->getOpcode())
27178 continue;
27179 return I1->getOpcode() < I2->getOpcode();
27180 }
27181 if (I1)
27182 return true;
27183 if (I2)
27184 return false;
27185 }
27186 {
27187 // Non-undef constants come next.
27188 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
27189 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
27190 if (C1 && C2)
27191 continue;
27192 if (C1)
27193 return true;
27194 if (C2)
27195 return false;
27196 }
27197 bool U1 = isa<UndefValue>(Opcodes1[I]);
27198 bool U2 = isa<UndefValue>(Opcodes2[I]);
27199 {
27200 // Non-constant non-instructions come next.
27201 if (!U1 && !U2) {
27202 auto ValID1 = Opcodes1[I]->getValueID();
27203 auto ValID2 = Opcodes2[I]->getValueID();
27204 if (ValID1 == ValID2)
27205 continue;
27206 if (ValID1 < ValID2)
27207 return true;
27208 if (ValID1 > ValID2)
27209 return false;
27210 }
27211 if (!U1)
27212 return true;
27213 if (!U2)
27214 return false;
27215 }
27216 // Undefs come last.
27217 assert(U1 && U2 && "The only thing left should be undef & undef.");
27218 }
27219 return false;
27220 };
27221 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
27222 Value *V1) {
27223 if (VL.empty() || V1 == VL.back())
27224 return true;
27225 Value *V2 = VL.back();
27226 if (V1->getType() != V2->getType())
27227 return false;
27228 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27229 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27230 if (Opcodes1.size() != Opcodes2.size())
27231 return false;
27232 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27233 // Undefs are compatible with any other value.
27234 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
27235 continue;
27236 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
27237 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
27238 if (R.isDeleted(I1) || R.isDeleted(I2))
27239 return false;
27240 if (I1->getParent() != I2->getParent())
27241 return false;
27242 if (getSameOpcode({I1, I2}, *TLI))
27243 continue;
27244 return false;
27245 }
27246 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
27247 continue;
27248 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
27249 return false;
27250 }
27251 return true;
27252 };
27253
27254 bool HaveVectorizedPhiNodes = false;
27255 do {
27256 // Collect the incoming values from the PHIs.
27257 Incoming.clear();
27258 for (Instruction &I : *BB) {
27259 auto *P = dyn_cast<PHINode>(&I);
27260 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
27261 break;
27262
27263 // No need to analyze deleted, vectorized and non-vectorizable
27264 // instructions.
27265 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
27266 isValidElementType(P->getType()))
27267 Incoming.push_back(P);
27268 }
27269
27270 if (Incoming.size() <= 1)
27271 break;
27272
27273 // Find the corresponding non-phi nodes for better matching when trying to
27274 // build the tree.
27275 for (Value *V : Incoming) {
27276 SmallVectorImpl<Value *> &Opcodes =
27277 PHIToOpcodes.try_emplace(V).first->getSecond();
27278 if (!Opcodes.empty())
27279 continue;
27280 SmallVector<Value *, 4> Nodes(1, V);
27281 SmallPtrSet<Value *, 4> Visited;
27282 while (!Nodes.empty()) {
27283 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
27284 if (!Visited.insert(PHI).second)
27285 continue;
27286 for (Value *V : PHI->incoming_values()) {
27287 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
27288 Nodes.push_back(PHI1);
27289 continue;
27290 }
27291 Opcodes.emplace_back(V);
27292 }
27293 }
27294 }
27295
27296 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
27297 Incoming, PHICompare, AreCompatiblePHIs,
27298 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27299 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27300 },
27301 /*MaxVFOnly=*/true, R);
27302 Changed |= HaveVectorizedPhiNodes;
27303 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
27304 auto *PHI = dyn_cast<PHINode>(P.first);
27305 return !PHI || R.isDeleted(PHI);
27306 }))
27307 PHIToOpcodes.clear();
27308 VisitedInstrs.insert_range(Incoming);
27309 } while (HaveVectorizedPhiNodes);
27310
27311 VisitedInstrs.clear();
27312
27313 InstSetVector PostProcessInserts;
27314 SmallSetVector<CmpInst *, 8> PostProcessCmps;
27315 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
27316 // also vectorizes `PostProcessCmps`.
27317 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
27318 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
27319 if (VectorizeCmps) {
27320 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
27321 PostProcessCmps.clear();
27322 }
27323 PostProcessInserts.clear();
27324 return Changed;
27325 };
27326 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
27327 auto IsInPostProcessInstrs = [&](Instruction *I) {
27328 if (auto *Cmp = dyn_cast<CmpInst>(I))
27329 return PostProcessCmps.contains(Cmp);
27330 return isa<InsertElementInst, InsertValueInst>(I) &&
27331 PostProcessInserts.contains(I);
27332 };
27333 // Returns true if `I` is an instruction without users, like a terminator or
27334 // a function call with an ignored return value, or a store. Ignore unused
27335 // instructions (based on instruction type, except for CallInst and InvokeInst).
27336 auto HasNoUsers = [](Instruction *I) {
27337 return I->use_empty() &&
27338 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
27339 };
27340 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
27341 // Skip instructions with scalable types. The number of elements is unknown
27342 // at compile time for scalable types.
27343 if (isa<ScalableVectorType>(It->getType()))
27344 continue;
27345
27346 // Skip instructions marked for deletion.
27347 if (R.isDeleted(&*It))
27348 continue;
27349 // We may go through BB multiple times, so skip the ones we have already checked.
27350 if (!VisitedInstrs.insert(&*It).second) {
27351 if (HasNoUsers(&*It) &&
27352 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
27353 // We would like to start over since some instructions are deleted
27354 // and the iterator may become invalid.
27355 Changed = true;
27356 It = BB->begin();
27357 E = BB->end();
27358 }
27359 continue;
27360 }
27361
27362 // Try to vectorize reductions that use PHINodes.
27363 if (PHINode *P = dyn_cast<PHINode>(It)) {
27364 // Check that the PHI is a reduction PHI.
27365 if (P->getNumIncomingValues() == 2) {
27366 // Try to match and vectorize a horizontal reduction.
27367 Instruction *Root = getReductionInstr(DT, P, BB, LI);
27368 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
27369 Changed = true;
27370 It = BB->begin();
27371 E = BB->end();
27372 continue;
27373 }
27374 }
27375 // Try to vectorize the incoming values of the PHI, to catch reductions
27376 // that feed into PHIs.
27377 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
27378 // Skip if the incoming block is the current BB for now. Also, bypass
27379 // unreachable IR for efficiency and to avoid crashing.
27380 // TODO: Collect the skipped incoming values and try to vectorize them
27381 // after processing BB.
27382 if (BB == P->getIncomingBlock(I) ||
27383 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
27384 continue;
27385
27386 // Postponed instructions should not be vectorized here, delay their
27387 // vectorization.
27388 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
27389 PI && !IsInPostProcessInstrs(PI)) {
27390 bool Res =
27391 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
27392 Changed |= Res;
27393 if (Res && R.isDeleted(P)) {
27394 It = BB->begin();
27395 E = BB->end();
27396 break;
27397 }
27398 }
27399 }
27400 continue;
27401 }
27402
27403 if (HasNoUsers(&*It)) {
27404 bool OpsChanged = false;
27405 auto *SI = dyn_cast<StoreInst>(It);
27406 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
27407 if (SI) {
27408 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
27409 // Try to vectorize the chain in the store, if this is the only store to
27410 // the address in the block.
27411 // TODO: This is just a temporary solution to save compile time. Need
27412 // to investigate if we can safely turn on slp-vectorize-hor-store
27413 // instead to allow lookup for reduction chains in all non-vectorized
27414 // stores (need to check side effects and compile time).
27415 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
27416 SI->getValueOperand()->hasOneUse();
27417 }
27418 if (TryToVectorizeRoot) {
27419 for (auto *V : It->operand_values()) {
27420 // Postponed instructions should not be vectorized here, delay their
27421 // vectorization.
27422 if (auto *VI = dyn_cast<Instruction>(V);
27423 VI && !IsInPostProcessInstrs(VI))
27424 // Try to match and vectorize a horizontal reduction.
27425 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
27426 }
27427 }
27428 // Start vectorization of post-process list of instructions from the
27429 // top-tree instructions to try to vectorize as many instructions as
27430 // possible.
27431 OpsChanged |=
27432 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
27433 if (OpsChanged) {
27434 // We would like to start over since some instructions are deleted
27435 // and the iterator may become invalid.
27436 Changed = true;
27437 It = BB->begin();
27438 E = BB->end();
27439 continue;
27440 }
27441 }
27442
27443 if (isa<InsertElementInst, InsertValueInst>(It))
27444 PostProcessInserts.insert(&*It);
27445 else if (isa<CmpInst>(It))
27446 PostProcessCmps.insert(cast<CmpInst>(&*It));
27447 }
27448
27449 return Changed;
27450}
27451
27452bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
27453 auto Changed = false;
27454 for (auto &Entry : GEPs) {
27455 // If the getelementptr list has fewer than two elements, there's nothing
27456 // to do.
27457 if (Entry.second.size() < 2)
27458 continue;
27459
27460 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
27461 << Entry.second.size() << ".\n");
27462
27463 // Process the GEP list in chunks suitable for the target's supported
27464 // vector size. If a vector register can't hold 1 element, we are done. We
27465 // are trying to vectorize the index computations, so the maximum number of
27466 // elements is based on the size of the index expression, rather than the
27467 // size of the GEP itself (the target's pointer size).
27468 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
27469 return !R.isDeleted(GEP);
27470 });
27471 if (It == Entry.second.end())
27472 continue;
27473 unsigned MaxVecRegSize = R.getMaxVecRegSize();
27474 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
27475 if (MaxVecRegSize < EltSize)
27476 continue;
27477
27478 unsigned MaxElts = MaxVecRegSize / EltSize;
27479 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
27480 auto Len = std::min<unsigned>(BE - BI, MaxElts);
27481 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
27482
27483 // Initialize a set of candidate getelementptrs. Note that we use a
27484 // SetVector here to preserve program order. If the index computations
27485 // are vectorizable and begin with loads, we want to minimize the chance
27486 // of having to reorder them later.
27487 SetVector<Value *> Candidates(llvm::from_range, GEPList);
27488
27489 // Some of the candidates may have already been vectorized after we
27490 // initially collected them, or their index was optimized to a constant value.
27491 // If so, they are marked as deleted, so remove them from the set of
27492 // candidates.
27493 Candidates.remove_if([&R](Value *I) {
27494 return R.isDeleted(cast<Instruction>(I)) ||
27495 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
27496 });
27497
27498 // Remove from the set of candidates all pairs of getelementptrs with
27499 // constant differences. Such getelementptrs are likely not good
27500 // candidates for vectorization in a bottom-up phase since one can be
27501 // computed from the other. We also ensure all candidate getelementptr
27502 // indices are unique.
27503 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
27504 auto *GEPI = GEPList[I];
27505 if (!Candidates.count(GEPI))
27506 continue;
27507 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
27508 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
27509 auto *GEPJ = GEPList[J];
27510 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
27511 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
27512 Candidates.remove(GEPI);
27513 Candidates.remove(GEPJ);
27514 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
27515 Candidates.remove(GEPJ);
27516 }
27517 }
27518 }
27519
27520 // We break out of the above computation as soon as we know there are
27521 // fewer than two candidates remaining.
27522 if (Candidates.size() < 2)
27523 continue;
27524
27525 // Add the single, non-constant index of each candidate to the bundle. We
27526 // ensured the indices met these constraints when we originally collected
27527 // the getelementptrs.
27528 SmallVector<Value *, 16> Bundle(Candidates.size());
27529 auto BundleIndex = 0u;
27530 for (auto *V : Candidates) {
27531 auto *GEP = cast<GetElementPtrInst>(V);
27532 auto *GEPIdx = GEP->idx_begin()->get();
27533 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
27534 Bundle[BundleIndex++] = GEPIdx;
27535 }
27536
27537 // Try and vectorize the indices. We are currently only interested in
27538 // gather-like cases of the form:
27539 //
27540 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
27541 //
27542 // where the loads of "a", the loads of "b", and the subtractions can be
27543 // performed in parallel. It's likely that detecting this pattern in a
27544 // bottom-up phase will be simpler and less costly than building a
27545 // full-blown top-down phase beginning at the consecutive loads.
27546 Changed |= tryToVectorizeList(Bundle, R);
27547 }
27548 }
27549 return Changed;
27550}
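// Concrete sketch of the bundle built above (values are hypothetical): for two
// gathers g[a[0] - b[0]] and g[a[1] - b[1]], Entry.second holds the two GEPs
// and Bundle ends up as { a[0] - b[0], a[1] - b[1] }, i.e. the scalar index
// computations. It is these subtractions (and the loads of a[] and b[] feeding
// them), not the GEPs themselves, that tryToVectorizeList attempts to widen.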
27551
27552bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
27553 bool Changed = false;
27554 // Sort by type, base pointers and value operands. Value operands must be
27555 // compatible (have the same opcode, same parent), otherwise it is
27556 // definitely not profitable to try to vectorize them.
27557 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
27558 if (V->getValueOperand()->getType()->getTypeID() <
27559 V2->getValueOperand()->getType()->getTypeID())
27560 return true;
27561 if (V->getValueOperand()->getType()->getTypeID() >
27562 V2->getValueOperand()->getType()->getTypeID())
27563 return false;
27564 if (V->getPointerOperandType()->getTypeID() <
27565 V2->getPointerOperandType()->getTypeID())
27566 return true;
27567 if (V->getPointerOperandType()->getTypeID() >
27568 V2->getPointerOperandType()->getTypeID())
27569 return false;
27570 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
27571 V2->getValueOperand()->getType()->getScalarSizeInBits())
27572 return true;
27573 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
27574 V2->getValueOperand()->getType()->getScalarSizeInBits())
27575 return false;
27576 // UndefValues are compatible with all other values.
27577 auto *I1 = dyn_cast<Instruction>(V->getValueOperand());
27578 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
27579 if (I1 && I2) {
27580 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27581 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27582 assert(NodeI1 && "Should only process reachable instructions");
27583 assert(NodeI2 && "Should only process reachable instructions");
27584 assert((NodeI1 == NodeI2) ==
27585 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27586 "Different nodes should have different DFS numbers");
27587 if (NodeI1 != NodeI2)
27588 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27589 return I1->getOpcode() < I2->getOpcode();
27590 }
27591 if (I1 && !I2)
27592 return true;
27593 if (!I1 && I2)
27594 return false;
27595 return V->getValueOperand()->getValueID() <
27596 V2->getValueOperand()->getValueID();
27597 };
27598
27599 bool SameParent = true;
27600 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
27601 if (VL.empty()) {
27602 SameParent = true;
27603 return true;
27604 }
27605 StoreInst *V2 = VL.back();
27606 if (V1 == V2)
27607 return true;
27608 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
27609 return false;
27610 if (V1->getPointerOperandType() != V2->getPointerOperandType())
27611 return false;
27612 // Undefs are compatible with any other value.
27613 if (isa<UndefValue>(V1->getValueOperand()) ||
27614 isa<UndefValue>(V2->getValueOperand()))
27615 return true;
27616 if (isa<Constant>(V1->getValueOperand()) &&
27617 isa<Constant>(V2->getValueOperand()))
27618 return true;
27619 // Check if the operands of the stores can be vectorized. They can be
27620 // vectorized, if they have compatible operands or have operands, which can
27621 // be vectorized as copyables.
27622 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
27623 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
27624 if (I1 || I2) {
27625 // Accept only tail-following non-compatible values for now.
27626 // TODO: investigate if it is possible to vectorize incompatible values,
27627 // if the copyables are first in the list.
27628 if (I1 && !I2)
27629 return false;
27630 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
27631 SmallVector<Value *> NewVL(VL.size() + 1);
27632 for (auto [SI, V] : zip(VL, NewVL))
27633 V = SI->getValueOperand();
27634 NewVL.back() = V1->getValueOperand();
27635 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
27636 InstructionsState S = Analysis.buildInstructionsState(
27637 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
27638 /*SkipSameCodeCheck=*/!SameParent);
27639 if (S)
27640 return true;
27641 if (!SameParent)
27642 return false;
27643 }
27644 return V1->getValueOperand()->getValueID() ==
27645 V2->getValueOperand()->getValueID();
27646 };
27647
27648 // Attempt to sort and vectorize each of the store-groups.
27649 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
27650 for (auto &Pair : Stores) {
27651 if (Pair.second.size() < 2)
27652 continue;
27653
27654 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
27655 << Pair.second.size() << ".\n");
27656
27657 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
27658 continue;
27659
27660 // Reverse the stores to do bottom-to-top analysis. This is important if the
27661 // same addresses are stored to several times; in this case we need to follow
27662 // the store order (reversed to meet the memory dependencies).
27663 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
27664 Pair.second.rend());
27665 Changed |= tryToVectorizeSequence<StoreInst>(
27666 ReversedStores, StoreSorter, AreCompatibleStores,
27667 [&](ArrayRef<StoreInst *> Candidates, bool) {
27668 return vectorizeStores(Candidates, R, Attempted);
27669 },
27670 /*MaxVFOnly=*/false, R);
27671 }
27672 return Changed;
27673}
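// Illustrative sketch (hypothetical IR): a store group collected in Stores and
// handed to vectorizeStores above typically looks like
//   store float %x0, ptr %p
//   %p1 = getelementptr inbounds float, ptr %p, i64 1
//   store float %x1, ptr %p1
//   ...
// i.e. consecutive stores off a common base pointer; StoreSorter and
// AreCompatibleStores only group stores whose value operands look profitable
// to vectorize together.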
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
static Type * getIndexType(Value *In)
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try to get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs of vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using Generator or a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1415
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1549
void setBit(unsigned BitPosition)
Set the bit whose position is given as "bitPosition" to 1.
Definition APInt.h:1339
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1677
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1405
void negate()
Negate this APInt in place.
Definition APInt.h:1477
unsigned logBase2() const
Definition APInt.h:1770
void setAllBits()
Set every bit to 1.
Definition APInt.h:1328
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1376
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
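As an illustration of the APInt bit-manipulation helpers listed above, a demanded-elements mask for a 4-lane vector might be built as in the sketch below. The helper function and the mask width are hypothetical, not code taken from this file.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;
// Hypothetical helper: build a demanded-elements mask selecting lanes 0 and 2
// of a 4-element vector.
static APInt buildDemandedLanes() {
  APInt DemandedElts = APInt::getZero(4); // start with no lanes demanded
  DemandedElts.setBit(0);
  DemandedElts.setBit(2);
  assert(!DemandedElts.isAllOnes() && "only two of four lanes are demanded");
  return DemandedElts;
}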
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:178
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:219
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:195
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:157
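A minimal sketch (with an invented element type and function name) of how the ArrayRef slicing helpers above are typically combined; ArrayRef only views memory owned elsewhere.
#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;
// Hypothetical helper: split a bundle view into a fixed-size head and the rest.
static void splitBundle(ArrayRef<int> VL) {
  if (VL.size() < 2)
    return;                              // drop_front asserts if N > size()
  ArrayRef<int> Head = VL.take_front(2); // first two elements
  ArrayRef<int> Tail = VL.drop_front(2); // everything after them
  assert(Head.size() + Tail.size() == VL.size());
  (void)Head;
  (void)Tail;
}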
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:488
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:491
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:718
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
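The predicate helpers above are the kind of machinery a check like isCmpSameOrSwapped builds on; below is a hedged, simplified sketch (not that function's actual body, and the helper name is invented).
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
// Returns true if P2 is either the same predicate as P1 or the predicate P1
// becomes when its operands are swapped (e.g. SGT vs. SLT).
static bool samePredicateOrSwapped(CmpInst::Predicate P1, CmpInst::Predicate P2) {
  return P1 == P2 || P2 == CmpInst::getSwappedPredicate(P1);
}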
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static bool shouldExecute(CounterInfo &Counter)
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2553
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2619
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2175
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2575
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2248
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2410
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
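A self-contained sketch (names invented, not code from this pass) of the IRBuilder calls listed above: pack two scalars into a two-lane vector and reverse it with a constant shuffle mask.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Hypothetical helper: pack A and B into a <2 x T> vector and swap the lanes.
static Value *buildReversedPair(IRBuilderBase &Builder, Value *A, Value *B) {
  auto *VecTy = FixedVectorType::get(A->getType(), 2);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, A, Builder.getInt64(0));
  Vec = Builder.CreateInsertElement(Vec, B, Builder.getInt64(1));
  return Builder.CreateShuffleVector(Vec, Vec, ArrayRef<int>{1, 0}, "rev");
}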
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:154
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
T & front() const
front - Get the first element.
Definition ArrayRef.h:349
iterator end() const
Definition ArrayRef.h:343
iterator begin() const
Definition ArrayRef.h:342
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:91
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:132
void insert_range(Range &&R)
Definition SetVector.h:176
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:94
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
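A small, hedged example of the static mask queries above, classifying a hand-written mask; the wrapper function is illustrative only.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Returns true if Mask reverses a single NumSrcElts-wide source and is not
// simply the identity (e.g. <3, 2, 1, 0> for NumSrcElts == 4).
static bool isPureReverse(ArrayRef<int> Mask, int NumSrcElts) {
  return ShuffleVectorInst::isReverseMask(Mask, NumSrcElts) &&
         !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts);
}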
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
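The find_first/find_next pair above supports the usual set-bit walk; a minimal sketch follows (the function name is invented, and the loop is equivalent to calling count()).
#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;
// Count the set bits by explicit iteration, illustrating the
// find_first/find_next pattern.
static unsigned countSetBits(const SmallBitVector &BV) {
  unsigned N = 0;
  for (int I = BV.find_first(); I != -1; I = BV.find_next(I))
    ++N;
  return N;
}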
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
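A hedged one-liner showing how a cost query like those above is typically issued; the wrapper function is hypothetical and uses the getShuffleCost signature as listed here.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
// Ask the target for the reciprocal-throughput cost of reversing VecTy.
static InstructionCost reverseShuffleCost(const TargetTransformInfo &TTI,
                                          VectorType *VecTy) {
  return TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy, VecTy);
}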
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:180
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:259
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
iterator_range< value_op_iterator > operand_values()
Definition User.h:291
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated with the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
bool hasUseList() const
Check if this Value has a use-list.
Definition Value.h:344
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being a possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was already checked for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed for being a possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is known to be not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones, and returns the final cost.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
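The PatternMatch helpers above compose into a single match() call. A small illustrative sketch follows; the matched shape and the helper name are made up for the example.

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Hypothetical helper: returns true if V has the shape "(X << C) | Y", where
// the shift has a single use and C is a constant integer (or splat).
// X, Y and C are bound on success.
static bool matchShlIntoOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(C))), m_Value(Y)));
}
```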
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
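As a usage sketch (the wrapper name and builder placement are assumptions), a horizontal integer-add reduction of an already widened value can be emitted like this; for RecurKind::Add it lowers to the llvm.vector.reduce.add intrinsic.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// Hypothetical helper: reduce the lanes of Vec with integer addition.
// The caller is assumed to have positioned Builder at the insertion point.
static llvm::Value *emitAddReduction(llvm::IRBuilderBase &Builder,
                                     llvm::Value *Vec) {
  return llvm::createSimpleReduction(Builder, Vec, llvm::RecurKind::Add);
}
```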
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2170
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
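A short illustration of the range wrappers above (all_of, enumerate); the containers and the predicate are placeholders for the example.

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/raw_ostream.h"

// Returns true if every element of VL is non-null.
static bool allNonNull(llvm::ArrayRef<llvm::Value *> VL) {
  return llvm::all_of(VL, [](const llvm::Value *V) { return V != nullptr; });
}

// Prints each element together with its position, using enumerate().
static void printIndexed(llvm::ArrayRef<int> Xs, llvm::raw_ostream &OS) {
  for (auto [Idx, X] : llvm::enumerate(Xs))
    OS << Idx << ": " << X << "\n";
}
```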
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1731
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2303
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:2029
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
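The rounding helpers nearby (bit_ceil, bit_floor, has_single_bit, divideCeil, PowerOf2Ceil) behave as in this small compile-time illustration; the operand values are arbitrary.

```cpp
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

// bit_ceil/bit_floor round to the nearest power of two; divideCeil is the
// integer ceiling division often used when computing vector factors.
static_assert(llvm::bit_ceil(5u) == 8u, "round up to a power of two");
static_assert(llvm::bit_floor(5u) == 4u, "round down to a power of two");
static_assert(llvm::has_single_bit(64u), "exactly one bit set");
static_assert(llvm::divideCeil(10u, 4u) == 3u, "ceil(10 / 4) == 3");
```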
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
Definition STLExtras.h:2190
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2016
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
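As a usage sketch combining this helper with make_early_inc_range (so that erasing does not invalidate the iteration), a block-local cleanup might look like the following; the function name is an assumption and TLI may be null.

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

// Erase instructions whose results are unused, walking bottom-up so that
// operands made dead by an erase are caught in the same sweep.
static void eraseTriviallyDead(llvm::BasicBlock &BB,
                               const llvm::TargetLibraryInfo *TLI) {
  for (llvm::Instruction &I :
       llvm::make_early_inc_range(llvm::reverse(BB)))
    if (llvm::isInstructionTriviallyDead(&I, TLI))
      I.eraseFromParent();
}
```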
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
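To make the two mask helpers above concrete, a small sketch showing the masks they produce; the element values in the comments follow from the documented parameters.

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"

static void demoShuffleMasks() {
  // Picks VF elements starting at Start, stepping by Stride: {0, 2, 4, 6}.
  llvm::SmallVector<int, 16> Strided =
      llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Repeats each of VF source lanes ReplicationFactor times: {0, 0, 0, 1, 1, 1}.
  llvm::SmallVector<int, 16> Replicated =
      llvm::createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
  (void)Strided;
  (void)Replicated;
}
```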
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1775
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:361
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1968
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
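A hedged sketch of how these two pointer helpers are typically used when checking whether a bundle of loads is consecutive; the wrapper names are assumptions, and the exact contract of SortedIndices should be taken from the header.

```cpp
#include <optional>

#include "llvm/Analysis/LoopAccessAnalysis.h"

// Distance between PtrB and PtrA in units of ElemTy elements, or std::nullopt
// if it cannot be computed; a result of 1 means "adjacent in memory".
static std::optional<int64_t>
elementDistance(llvm::Type *ElemTy, llvm::Value *PtrA, llvm::Value *PtrB,
                const llvm::DataLayout &DL, llvm::ScalarEvolution &SE) {
  return llvm::getPointersDiff(ElemTy, PtrA, ElemTy, PtrB, DL, SE);
}

// Tries to recover a permutation that orders Ptrs by increasing address.
static bool orderByAddress(llvm::ArrayRef<llvm::Value *> Ptrs,
                           llvm::Type *ElemTy, const llvm::DataLayout &DL,
                           llvm::ScalarEvolution &SE,
                           llvm::SmallVectorImpl<unsigned> &SortedIndices) {
  return llvm::sortPtrAccesses(Ptrs, ElemTy, DL, SE, SortedIndices);
}
```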
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
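A minimal sketch of how these two propagation helpers are commonly paired once a bundle of scalars VL has been replaced by a single vector instruction; the wrapper name is an assumption.

```cpp
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// Intersect the scalar bundle's IR flags (nuw/nsw, fast-math, ...) and
// metadata (tbaa, alias.scope, noalias, fpmath, nontemporal, ...) onto the
// replacement vector instruction VecI.
static void copyBundleInfo(llvm::Instruction *VecI,
                           llvm::ArrayRef<llvm::Value *> VL) {
  llvm::propagateIRFlags(VecI, VL);
  llvm::propagateMetadata(VecI, VL);
}
```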
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Add
Sum of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
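For instance, the min-bitwidth style question "does this value survive truncation to N bits followed by sign-extension?" can be phrased with this helper; the function name is an assumption for the example.

```cpp
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"

// True if V can be truncated to TargetBits bits and sign-extended back
// without changing its value.
static bool fitsInSignedBits(const llvm::Value *V, unsigned TargetBits,
                             const llvm::DataLayout &DL) {
  unsigned OrigBits = V->getType()->getScalarSizeInBits();
  if (TargetBits >= OrigBits)
    return true;
  unsigned SignBits = llvm::ComputeNumSignBits(V, DL);
  // SignBits copies of the sign bit exist; the round trip is lossless when at
  // least OrigBits - TargetBits + 1 of them are present.
  return SignBits >= OrigBits - TargetBits + 1;
}
```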
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1883
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
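The DenseMapInfo specialization summarized above follows the standard LLVM pattern. The generic sketch below uses a stand-in struct Edge (not the real BoUpSLP::EdgeInfo) to show the four required pieces: empty key, tombstone key, hash, and equality.

```cpp
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"

// Stand-in for an edge-like key: a user pointer plus an operand index.
struct Edge {
  void *UserTE;
  unsigned EdgeIdx;
  bool operator==(const Edge &O) const {
    return UserTE == O.UserTE && EdgeIdx == O.EdgeIdx;
  }
};

namespace llvm {
template <> struct DenseMapInfo<Edge> {
  static Edge getEmptyKey() {
    return {DenseMapInfo<void *>::getEmptyKey(), 0};
  }
  static Edge getTombstoneKey() {
    return {DenseMapInfo<void *>::getTombstoneKey(), 0};
  }
  static unsigned getHashValue(const Edge &E) {
    return hash_combine(E.UserTE, E.EdgeIdx);
  }
  static bool isEqual(const Edge &L, const Edge &R) { return L == R; }
};
} // namespace llvm

// With the specialization in place, Edge can key a DenseMap directly:
//   llvm::DenseMap<Edge, unsigned> OperandUses;
```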
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1437
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1446
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)