1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115
116DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
117 "Controls which SLP graphs should be vectorized.");
118
119static cl::opt<bool>
120 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
121 cl::desc("Run the SLP vectorization passes"));
122
123static cl::opt<bool>
124 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
125 cl::desc("Enable vectorization for wider vector utilization"));
126
127static cl::opt<int>
128 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
129 cl::desc("Only vectorize if you gain more than this "
130 "number "));
131
133 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
134 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
136
137static cl::opt<bool>
138ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
139 cl::desc("Attempt to vectorize horizontal reductions"));
140
142 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
143 cl::desc(
144 "Attempt to vectorize horizontal reductions feeding into a store"));
145
147 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
148 cl::desc("Improve the code quality by splitting alternate instructions"));
149
150static cl::opt<int>
151 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
152 cl::desc("Attempt to vectorize for this register size in bits"));
153
154static cl::opt<unsigned>
155 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
156 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
157
158/// Limits the size of scheduling regions in a block.
159/// It avoids long compile times for _very_ large blocks where vector
160/// instructions are spread over a wide range.
161/// This limit is way higher than needed by real-world functions.
162static cl::opt<int>
163ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
164 cl::desc("Limit the size of the SLP scheduling region per block"));
165
167 "slp-min-reg-size", cl::init(128), cl::Hidden,
168 cl::desc("Attempt to vectorize for this register size in bits"));
169
171 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
172 cl::desc("Limit the recursion depth when building a vectorizable tree"));
173
175 "slp-min-tree-size", cl::init(3), cl::Hidden,
176 cl::desc("Only vectorize small trees if they are fully vectorizable"));
177
178// The maximum depth that the look-ahead score heuristic will explore.
179// The higher this value, the higher the compilation time overhead.
181 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
182 cl::desc("The maximum look-ahead depth for operand reordering scores"));
183
184// The maximum depth that the look-ahead score heuristic will explore
185// when it is probing among candidates for vectorization tree roots.
186// The higher this value, the higher the compilation time overhead, but unlike
187// the similar limit for operand ordering this is used less frequently, so the
188// impact of a higher value is less noticeable.
190 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
191 cl::desc("The maximum look-ahead depth for searching best rooting option"));
192
194 "slp-min-strided-loads", cl::init(2), cl::Hidden,
195 cl::desc("The minimum number of loads that should be considered strided, "
196 "if the stride is > 1 or is a runtime value"));
197
199 "slp-max-stride", cl::init(8), cl::Hidden,
200 cl::desc("The maximum stride, considered to be profitable."));
201
202static cl::opt<bool>
203 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
204 cl::desc("Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
206
207static cl::opt<bool>
208 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
209 cl::desc("Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
211
212static cl::opt<bool>
213 ViewSLPTree("view-slp-tree", cl::Hidden,
214 cl::desc("Display the SLP trees with Graphviz"));
215
217 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
218 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
219
220/// Enables vectorization of copyable elements.
222 "slp-copyable-elements", cl::init(true), cl::Hidden,
223 cl::desc("Try to replace values with the idempotent instructions for "
224 "better vectorization."));
225
226// Limit the number of alias checks. The limit is chosen so that
227// it has no negative effect on the llvm benchmarks.
228static const unsigned AliasedCheckLimit = 10;
229
230// Limit of the number of uses for potentially transformed instructions/values,
231// used in checks to avoid compile-time explosion.
232static constexpr int UsesLimit = 64;
233
234// Another limit for the alias checks: The maximum distance between load/store
235// instructions where alias checks are done.
236// This limit is useful for very large basic blocks.
237static const unsigned MaxMemDepDistance = 160;
238
239/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
240/// regions to be handled.
241static const int MinScheduleRegionSize = 16;
242
243/// Maximum allowed number of operands in the PHI nodes.
244static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important thing to filter here are types which are invalid in LLVM
249/// vectors. We also filter target specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
255 if (SLPReVec && isa<FixedVectorType>(Ty))
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
260
261/// Returns the type of the given value/instruction \p V. If it is store,
262/// returns the type of its value operand, for Cmp - the types of the compare
263/// operands and for insertelement - the type of the inserted operand.
264/// Otherwise, just the type of the value is returned.
265static Type *getValueType(Value *V) {
266 if (auto *SI = dyn_cast<StoreInst>(V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(V))
269 return CI->getOperand(0)->getType();
270 if (auto *IE = dyn_cast<InsertElementInst>(V))
271 return IE->getOperand(1)->getType();
272 return V->getType();
273}
274
275/// \returns the number of elements for Ty.
276static unsigned getNumElements(Type *Ty) {
278 "ScalableVectorType is not supported.");
279 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
280 return VecTy->getNumElements();
281 return 1;
282}
283
284/// \returns the vector type of ScalarTy based on vectorization factor.
285static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
286 return FixedVectorType::get(ScalarTy->getScalarType(),
287 VF * getNumElements(ScalarTy));
288}
289
290/// Returns the number of elements of the given type \p Ty, not less than \p
291/// Sz, which forms a type that \p TTI splits into whole vector types during
292/// legalization.
293static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
294 Type *Ty, unsigned Sz) {
295 if (!isValidElementType(Ty))
296 return bit_ceil(Sz);
297 // Find the number of elements, which forms full vectors.
298 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
299 if (NumParts == 0 || NumParts >= Sz)
300 return bit_ceil(Sz);
301 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
302}
303
304/// Returns the number of elements of the given type \p Ty, not greater than
305/// \p Sz, which forms a type that \p TTI splits into whole vector types
306/// during legalization.
307static unsigned
308getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
309 unsigned Sz) {
310 if (!isValidElementType(Ty))
311 return bit_floor(Sz);
312 // Find the number of elements, which forms full vectors.
313 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
314 if (NumParts == 0 || NumParts >= Sz)
315 return bit_floor(Sz);
316 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
317 if (RegVF > Sz)
318 return bit_floor(Sz);
319 return (Sz / RegVF) * RegVF;
320}
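// Illustrative sketch (not part of the pass, excluded from the build): how the
// two helpers above round an element count to whole "registers". The NumParts
// value is an assumption standing in for TTI.getNumberOfParts().
#if 0
#include <cassert>

int main() {
  // Assume the target splits a 7-element vector into NumParts = 2 parts.
  unsigned Sz = 7, NumParts = 2;
  unsigned PerPart = 4; // bit_ceil(divideCeil(7, 2)) == bit_ceil(4) == 4.
  unsigned Full = PerPart * NumParts;        // Rounded up: 8 elements.
  unsigned Floor = (Sz / PerPart) * PerPart; // Rounded down: 4 elements.
  assert(Full == 8 && Floor == 4);
  return 0;
}
#endif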
321
322static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
323 SmallVectorImpl<int> &Mask) {
324 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
325 // But the element has a different meaning for SLP (scalar) and REVEC
326 // (vector). We need to expand Mask into masks which shufflevector can use
327 // directly.
328 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
329 for (unsigned I : seq<unsigned>(Mask.size()))
330 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
331 I * VecTyNumElements, VecTyNumElements)))
332 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
333 : Mask[I] * VecTyNumElements + J;
334 Mask.swap(NewMask);
335}
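// Illustrative sketch (not part of the pass, excluded from the build) of the
// mask expansion above: a scalar-level mask {1, PoisonMaskElem} over 2-wide
// sub-vectors becomes the shufflevector-level mask {2, 3, -1, -1}.
#if 0
#include <cassert>

int main() {
  const int Poison = -1; // Stands in for PoisonMaskElem.
  int Mask[2] = {1, Poison};
  unsigned VecTyNumElements = 2;
  int NewMask[4];
  for (unsigned I = 0; I < 2; ++I)
    for (unsigned J = 0; J < VecTyNumElements; ++J)
      NewMask[I * VecTyNumElements + J] =
          Mask[I] == Poison ? Poison : Mask[I] * (int)VecTyNumElements + (int)J;
  assert(NewMask[0] == 2 && NewMask[1] == 3 && NewMask[2] == Poison &&
         NewMask[3] == Poison);
  return 0;
}
#endif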
336
337/// \returns the number of groups of shufflevector
338/// A group has the following features:
339/// 1. All values in a group are shufflevectors.
340/// 2. The mask of each shufflevector is an extract-subvector mask.
341/// 3. Together, the masks of a group use all of the elements of the source.
342/// e.g., it is 1 group (%0)
343/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
344/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
345/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
346/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
347/// it is 2 groups (%3 and %4)
348/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
351/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
352/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
353/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
354/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
355/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
356/// it is 0 group
357/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
358/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
359/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
360/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
362 if (VL.empty())
363 return 0;
364 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
365 return 0;
366 auto *SV = cast<ShuffleVectorInst>(VL.front());
367 unsigned SVNumElements =
368 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
369 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
370 if (SVNumElements % ShuffleMaskSize != 0)
371 return 0;
372 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
373 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
374 return 0;
375 unsigned NumGroup = 0;
376 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
377 auto *SV = cast<ShuffleVectorInst>(VL[I]);
378 Value *Src = SV->getOperand(0);
379 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
380 SmallBitVector ExpectedIndex(GroupSize);
381 if (!all_of(Group, [&](Value *V) {
382 auto *SV = cast<ShuffleVectorInst>(V);
383 // From the same source.
384 if (SV->getOperand(0) != Src)
385 return false;
386 int Index;
387 if (!SV->isExtractSubvectorMask(Index))
388 return false;
389 ExpectedIndex.set(Index / ShuffleMaskSize);
390 return true;
391 }))
392 return 0;
393 if (!ExpectedIndex.all())
394 return 0;
395 ++NumGroup;
396 }
397 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
398 return NumGroup;
399}
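// Illustrative sketch (not part of the pass, excluded from the build) of the
// grouping arithmetic above, using the 2-group example from the comment:
// sources are <8 x i16>, and each shuffle mask extracts 4 elements.
#if 0
#include <cassert>

int main() {
  unsigned SVNumElements = 8;   // Elements in each shuffle's source vector.
  unsigned ShuffleMaskSize = 4; // Elements each shufflevector extracts.
  unsigned NumShuffles = 4;     // %5, %6, %7, %8 in the example.
  assert(SVNumElements % ShuffleMaskSize == 0);
  unsigned GroupSize = SVNumElements / ShuffleMaskSize; // 2 shuffles per source.
  assert(NumShuffles % GroupSize == 0);
  unsigned NumGroup = NumShuffles / GroupSize; // 2 groups (%3 and %4).
  assert(GroupSize == 2 && NumGroup == 2);
  return 0;
}
#endif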
400
401/// \returns a shufflevector mask which is used to vectorize shufflevectors
402/// e.g.,
403/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
408/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
409/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
410/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
411/// the result is
412/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
414 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
415 auto *SV = cast<ShuffleVectorInst>(VL.front());
416 unsigned SVNumElements =
417 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
418 SmallVector<int> Mask;
419 unsigned AccumulateLength = 0;
420 for (Value *V : VL) {
421 auto *SV = cast<ShuffleVectorInst>(V);
422 for (int M : SV->getShuffleMask())
423 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
424 : AccumulateLength + M);
425 AccumulateLength += SVNumElements;
426 }
427 return Mask;
428}
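// Illustrative sketch (not part of the pass, excluded from the build) that
// reproduces the mask from the comment above: four 4-element extracts from two
// 8-element sources concatenate into <0,1,2,3,12,13,14,15,16,...,28,29,30,31>.
#if 0
#include <cassert>

int main() {
  unsigned SVNumElements = 8;
  int ShuffleMasks[4][4] = {
      {0, 1, 2, 3}, {4, 5, 6, 7}, {0, 1, 2, 3}, {4, 5, 6, 7}};
  int Mask[16];
  unsigned AccumulateLength = 0, N = 0;
  for (auto &SVMask : ShuffleMasks) {
    for (int M : SVMask)
      Mask[N++] = AccumulateLength + M;
    AccumulateLength += SVNumElements;
  }
  assert(Mask[0] == 0 && Mask[4] == 12 && Mask[8] == 16 && Mask[12] == 28);
  return 0;
}
#endif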
429
430/// \returns True if the value is a constant (but not globals/constant
431/// expressions).
432static bool isConstant(Value *V) {
433 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
434}
435
436/// Checks if \p V is one of vector-like instructions, i.e. undef,
437/// insertelement/extractelement with constant indices for fixed vector type or
438/// extractvalue instruction.
439static bool isVectorLikeInstWithConstOps(Value *V) {
440 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
441 !isa<ExtractValueInst, UndefValue>(V))
442 return false;
443 auto *I = dyn_cast<Instruction>(V);
444 if (!I || isa<ExtractValueInst>(I))
445 return true;
446 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
447 return false;
448 if (isa<ExtractElementInst>(I))
449 return isConstant(I->getOperand(1));
450 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
451 return isConstant(I->getOperand(2));
452}
453
454/// Returns power-of-2 number of elements in a single register (part), given the
455/// total number of elements \p Size and number of registers (parts) \p
456/// NumParts.
457static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
458 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
459}
460
461/// Returns correct remaining number of elements, considering total amount \p
462/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
463/// and current register (part) \p Part.
464static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
465 unsigned Part) {
466 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
467}
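// Illustrative sketch (not part of the pass, excluded from the build) of how
// the two helpers above split Size elements across registers; the numbers are
// assumptions chosen for illustration.
#if 0
#include <algorithm>
#include <cassert>

int main() {
  unsigned Size = 6;
  // getPartNumElems for NumParts = 2: min(6, bit_ceil(divideCeil(6, 2))) == 4.
  unsigned PartNumElems = 4;
  // getNumElems: part 0 holds 4 elements, part 1 holds the remaining 2.
  assert(std::min(PartNumElems, Size - 0 * PartNumElems) == 4);
  assert(std::min(PartNumElems, Size - 1 * PartNumElems) == 2);
  return 0;
}
#endif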
468
469#if !defined(NDEBUG)
470/// Print a short descriptor of the instruction bundle suitable for debug output.
471static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
472 std::string Result;
473 raw_string_ostream OS(Result);
474 if (Idx >= 0)
475 OS << "Idx: " << Idx << ", ";
476 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
477 return Result;
478}
479#endif
480
481/// \returns true if all of the instructions in \p VL are in the same block or
482/// false otherwise.
483static bool allSameBlock(ArrayRef<Value *> VL) {
484 auto *It = find_if(VL, IsaPred<Instruction>);
485 if (It == VL.end())
486 return false;
487 Instruction *I0 = cast<Instruction>(*It);
488 if (all_of(VL, isVectorLikeInstWithConstOps))
489 return true;
490
491 BasicBlock *BB = I0->getParent();
492 for (Value *V : iterator_range(It, VL.end())) {
493 if (isa<PoisonValue>(V))
494 continue;
495 auto *II = dyn_cast<Instruction>(V);
496 if (!II)
497 return false;
498
499 if (BB != II->getParent())
500 return false;
501 }
502 return true;
503}
504
505/// \returns True if all of the values in \p VL are constants (but not
506/// globals/constant expressions).
507static bool allConstant(ArrayRef<Value *> VL) {
508 // Constant expressions and globals can't be vectorized like normal integer/FP
509 // constants.
510 return all_of(VL, isConstant);
511}
512
513/// \returns True if all of the values in \p VL are identical or some of them
514/// are UndefValue.
515static bool isSplat(ArrayRef<Value *> VL) {
516 Value *FirstNonUndef = nullptr;
517 for (Value *V : VL) {
518 if (isa<UndefValue>(V))
519 continue;
520 if (!FirstNonUndef) {
521 FirstNonUndef = V;
522 continue;
523 }
524 if (V != FirstNonUndef)
525 return false;
526 }
527 return FirstNonUndef != nullptr;
528}
529
530/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
531/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
532/// patterns that make it effectively commutative (like equality comparisons
533/// with zero).
534/// In most cases, users should not call this function directly (since \p I and
535/// \p ValWithUses are the same). However, when analyzing interchangeable
536/// instructions, we need to use the converted opcode along with the original
537/// uses.
538/// \param I The instruction to check for commutativity
539/// \param ValWithUses The value whose uses are analyzed for special
540/// patterns
541static bool isCommutative(Instruction *I, Value *ValWithUses,
542 bool IsCopyable = false) {
543 if (auto *Cmp = dyn_cast<CmpInst>(I))
544 return Cmp->isCommutative();
545 if (auto *BO = dyn_cast<BinaryOperator>(I))
546 return BO->isCommutative() ||
547 (BO->getOpcode() == Instruction::Sub &&
548 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
549 all_of(
550 ValWithUses->uses(),
551 [&](const Use &U) {
552 // Commutative, if icmp eq/ne sub, 0
553 CmpPredicate Pred;
554 if (match(U.getUser(),
555 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
556 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
557 return true;
558 // Commutative, if abs(sub nsw, true) or abs(sub, false).
559 ConstantInt *Flag;
560 auto *I = dyn_cast<BinaryOperator>(U.get());
561 return match(U.getUser(),
562 m_Intrinsic<Intrinsic::abs>(
563 m_Specific(U.get()), m_ConstantInt(Flag))) &&
564 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
565 Flag->isOne());
566 })) ||
567 (BO->getOpcode() == Instruction::FSub &&
568 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
569 all_of(ValWithUses->uses(), [](const Use &U) {
570 return match(U.getUser(),
571 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
572 }));
573 return I->isCommutative();
574}
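// Standalone sketch (not part of the pass, excluded from the build) of the
// arithmetic fact the special cases above rely on: swapping the operands of a
// subtraction changes neither an equality-with-zero test nor the absolute
// value of the difference.
#if 0
#include <cassert>
#include <cstdlib>

int main() {
  int X = 7, Y = 12;
  assert(((X - Y) == 0) == ((Y - X) == 0)); // icmp eq/ne (sub x, y), 0.
  assert(std::abs(X - Y) == std::abs(Y - X)); // abs(sub x, y).
  return 0;
}
#endif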
575
576/// This is a helper function to check whether \p I is commutative.
577/// This is a convenience wrapper that calls the two-parameter version of
578/// isCommutative with the same instruction for both parameters. This is
579/// the common case where the instruction being checked for commutativity
580/// is the same as the instruction whose uses are analyzed for special
581/// patterns (see the two-parameter version above for details).
582/// \param I The instruction to check for commutativity
583/// \returns true if the instruction is commutative, false otherwise
584static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
585
586/// \returns number of operands of \p I, considering commutativity. Returns 2
587/// for commutative intrinsics.
588/// \param I The instruction to check for commutativity
591 // IntrinsicInst::isCommutative returns true if swapping the first "two"
592 // arguments to the intrinsic produces the same result.
593 constexpr unsigned IntrinsicNumOperands = 2;
594 return IntrinsicNumOperands;
595 }
596 return I->getNumOperands();
597}
598
599template <typename T>
600static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
601 unsigned Offset) {
602 static_assert(std::is_same_v<T, InsertElementInst> ||
603 std::is_same_v<T, ExtractElementInst>,
604 "unsupported T");
605 int Index = Offset;
606 if (const auto *IE = dyn_cast<T>(Inst)) {
607 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
608 if (!VT)
609 return std::nullopt;
610 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
611 if (!CI)
612 return std::nullopt;
613 if (CI->getValue().uge(VT->getNumElements()))
614 return std::nullopt;
615 Index *= VT->getNumElements();
616 Index += CI->getZExtValue();
617 return Index;
618 }
619 return std::nullopt;
620}
621
622/// \returns inserting or extracting index of InsertElement, ExtractElement or
623/// InsertValue instruction, using Offset as base offset for index.
624/// \returns std::nullopt if the index is not an immediate.
625static std::optional<unsigned> getElementIndex(const Value *Inst,
626 unsigned Offset = 0) {
627 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
628 return Index;
629 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
630 return Index;
631
632 int Index = Offset;
633
634 const auto *IV = dyn_cast<InsertValueInst>(Inst);
635 if (!IV)
636 return std::nullopt;
637
638 Type *CurrentType = IV->getType();
639 for (unsigned I : IV->indices()) {
640 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
641 Index *= ST->getNumElements();
642 CurrentType = ST->getElementType(I);
643 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
644 Index *= AT->getNumElements();
645 CurrentType = AT->getElementType();
646 } else {
647 return std::nullopt;
648 }
649 Index += I;
650 }
651 return Index;
652}
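// Illustrative sketch (not part of the pass, excluded from the build) of the
// index flattening above for a hypothetical insertvalue into a struct of two
// 2-element arrays with indices {1, 0}: the flattened position is 2.
#if 0
#include <cassert>

int main() {
  unsigned Index = 0;
  // First index, 1, into a struct with 2 members: scale by 2, then add 1.
  Index = Index * 2 + 1;
  // Second index, 0, into an array with 2 elements: scale by 2, then add 0.
  Index = Index * 2 + 0;
  assert(Index == 2);
  return 0;
}
#endif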
653
654/// \returns true if all of the values in \p VL use the same opcode.
655/// For comparison instructions, also checks if predicates match.
656/// PoisonValues are considered matching.
657/// Interchangeable instructions are not considered.
659 auto *It = find_if(VL, IsaPred<Instruction>);
660 if (It == VL.end())
661 return true;
662 Instruction *MainOp = cast<Instruction>(*It);
663 unsigned Opcode = MainOp->getOpcode();
664 bool IsCmpOp = isa<CmpInst>(MainOp);
665 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
666 : CmpInst::BAD_ICMP_PREDICATE;
667 return std::all_of(It, VL.end(), [&](Value *V) {
668 if (auto *CI = dyn_cast<CmpInst>(V))
669 return BasePred == CI->getPredicate();
670 if (auto *I = dyn_cast<Instruction>(V))
671 return I->getOpcode() == Opcode;
672 return isa<PoisonValue>(V);
673 });
674}
675
676namespace {
677/// Specifies the way the mask should be analyzed for undefs/poisonous elements
678/// in the shuffle mask.
679enum class UseMask {
680 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
681 ///< check for the mask elements for the first argument (mask
682 ///< indices are in range [0:VF)).
683 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
684 ///< for the mask elements for the second argument (mask indices
685 ///< are in range [VF:2*VF))
686 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
687 ///< future shuffle elements and mark them as ones as being used
688 ///< in future. Non-undef elements are considered as unused since
689 ///< they're already marked as used in the mask.
690};
691} // namespace
692
693/// Prepares a use bitset for the given mask either for the first argument or
694/// for the second.
695static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
696 UseMask MaskArg) {
697 SmallBitVector UseMask(VF, true);
698 for (auto [Idx, Value] : enumerate(Mask)) {
699 if (Value == PoisonMaskElem) {
700 if (MaskArg == UseMask::UndefsAsMask)
701 UseMask.reset(Idx);
702 continue;
703 }
704 if (MaskArg == UseMask::FirstArg && Value < VF)
705 UseMask.reset(Value);
706 else if (MaskArg == UseMask::SecondArg && Value >= VF)
707 UseMask.reset(Value - VF);
708 }
709 return UseMask;
710}
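// Illustrative sketch (not part of the pass, excluded from the build) of the
// first-argument use mask for VF = 4 and mask {0, 5, -1, 1}: lanes 0 and 1 of
// the first vector get cleared (consumed), lanes 2 and 3 stay set.
#if 0
#include <cassert>

int main() {
  const int Poison = -1;
  int VF = 4;
  int Mask[4] = {0, 5, Poison, 1};
  bool Used[4] = {true, true, true, true}; // Starts all-true, like the bitset.
  for (int M : Mask)
    if (M != Poison && M < VF)
      Used[M] = false; // FirstArg: clear lanes referenced in the first vector.
  assert(!Used[0] && !Used[1] && Used[2] && Used[3]);
  return 0;
}
#endif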
711
712/// Checks if the given value is actually an undefined constant vector.
713/// Also, if the \p UseMask is not empty, tries to check if the non-masked
714/// elements actually mask the insertelement buildvector, if any.
715template <bool IsPoisonOnly = false>
716static SmallBitVector isUndefVector(const Value *V,
717 const SmallBitVector &UseMask = {}) {
718 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
719 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
720 if (isa<T>(V))
721 return Res;
722 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
723 if (!VecTy)
724 return Res.reset();
725 auto *C = dyn_cast<Constant>(V);
726 if (!C) {
727 if (!UseMask.empty()) {
728 const Value *Base = V;
729 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
730 Base = II->getOperand(0);
731 if (isa<T>(II->getOperand(1)))
732 continue;
733 std::optional<unsigned> Idx = getElementIndex(II);
734 if (!Idx) {
735 Res.reset();
736 return Res;
737 }
738 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
739 Res.reset(*Idx);
740 }
741 // TODO: Add analysis for shuffles here too.
742 if (V == Base) {
743 Res.reset();
744 } else {
745 SmallBitVector SubMask(UseMask.size(), false);
746 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
747 }
748 } else {
749 Res.reset();
750 }
751 return Res;
752 }
753 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
754 if (Constant *Elem = C->getAggregateElement(I))
755 if (!isa<T>(Elem) &&
756 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
757 Res.reset(I);
758 }
759 return Res;
760}
761
762/// Checks if the vector of instructions can be represented as a shuffle, like:
763/// %x0 = extractelement <4 x i8> %x, i32 0
764/// %x3 = extractelement <4 x i8> %x, i32 3
765/// %y1 = extractelement <4 x i8> %y, i32 1
766/// %y2 = extractelement <4 x i8> %y, i32 2
767/// %x0x0 = mul i8 %x0, %x0
768/// %x3x3 = mul i8 %x3, %x3
769/// %y1y1 = mul i8 %y1, %y1
770/// %y2y2 = mul i8 %y2, %y2
771/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
772/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
773/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
774/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
775/// ret <4 x i8> %ins4
776/// can be transformed into:
777/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
778/// i32 6>
779/// %2 = mul <4 x i8> %1, %1
780/// ret <4 x i8> %2
781/// Mask will return the Shuffle Mask equivalent to the extracted elements.
782/// TODO: Can we split off and reuse the shuffle mask detection from
783/// ShuffleVectorInst/getShuffleCost?
784static std::optional<TargetTransformInfo::ShuffleKind>
785isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
786 AssumptionCache *AC) {
787 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
788 if (It == VL.end())
789 return std::nullopt;
790 unsigned Size =
791 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
792 auto *EI = dyn_cast<ExtractElementInst>(V);
793 if (!EI)
794 return S;
795 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
796 if (!VTy)
797 return S;
798 return std::max(S, VTy->getNumElements());
799 });
800
801 Value *Vec1 = nullptr;
802 Value *Vec2 = nullptr;
803 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
804 auto *EE = dyn_cast<ExtractElementInst>(V);
805 if (!EE)
806 return false;
807 Value *Vec = EE->getVectorOperand();
808 if (isa<UndefValue>(Vec))
809 return false;
810 return isGuaranteedNotToBePoison(Vec, AC);
811 });
812 enum ShuffleMode { Unknown, Select, Permute };
813 ShuffleMode CommonShuffleMode = Unknown;
814 Mask.assign(VL.size(), PoisonMaskElem);
815 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
816 // Undef can be represented as an undef element in a vector.
817 if (isa<UndefValue>(VL[I]))
818 continue;
819 auto *EI = cast<ExtractElementInst>(VL[I]);
820 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
821 return std::nullopt;
822 auto *Vec = EI->getVectorOperand();
823 // We can extractelement from undef or poison vector.
825 continue;
826 // All vector operands must have the same number of vector elements.
827 if (isa<UndefValue>(Vec)) {
828 Mask[I] = I;
829 } else {
830 if (isa<UndefValue>(EI->getIndexOperand()))
831 continue;
832 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
833 if (!Idx)
834 return std::nullopt;
835 // Undefined behavior if Idx is negative or >= Size.
836 if (Idx->getValue().uge(Size))
837 continue;
838 unsigned IntIdx = Idx->getValue().getZExtValue();
839 Mask[I] = IntIdx;
840 }
841 if (isUndefVector(Vec).all() && HasNonUndefVec)
842 continue;
843 // For correct shuffling we have to have at most 2 different vector operands
844 // in all extractelement instructions.
845 if (!Vec1 || Vec1 == Vec) {
846 Vec1 = Vec;
847 } else if (!Vec2 || Vec2 == Vec) {
848 Vec2 = Vec;
849 Mask[I] += Size;
850 } else {
851 return std::nullopt;
852 }
853 if (CommonShuffleMode == Permute)
854 continue;
855 // If the extract index is not the same as the operation number, it is a
856 // permutation.
857 if (Mask[I] % Size != I) {
858 CommonShuffleMode = Permute;
859 continue;
860 }
861 CommonShuffleMode = Select;
862 }
863 // If we're not crossing lanes in different vectors, consider it as blending.
864 if (CommonShuffleMode == Select && Vec2)
865 return TargetTransformInfo::SK_Select;
866 // If Vec2 was never used, we have a permutation of a single vector, otherwise
867 // we have a permutation of 2 vectors.
868 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
869 : TargetTransformInfo::SK_PermuteSingleSrc;
870}
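// Illustrative sketch (not part of the pass, excluded from the build) of the
// mask built above for the example in the comment: extracting x[0], x[3],
// y[1], y[2] from two 4-element vectors yields the two-source mask {0, 3, 5, 6}.
#if 0
#include <cassert>

int main() {
  unsigned Size = 4; // Number of elements in each source vector.
  // (vector, lane) pairs for the four extracted scalars: 0 = %x, 1 = %y.
  unsigned Vec[4] = {0, 0, 1, 1};
  unsigned Lane[4] = {0, 3, 1, 2};
  int Mask[4];
  for (unsigned I = 0; I < 4; ++I)
    Mask[I] = Lane[I] + Vec[I] * Size; // Second-vector lanes are offset by Size.
  assert(Mask[0] == 0 && Mask[1] == 3 && Mask[2] == 5 && Mask[3] == 6);
  return 0;
}
#endif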
871
872/// \returns True if Extract{Value,Element} instruction extracts element Idx.
873static std::optional<unsigned> getExtractIndex(const Instruction *E) {
874 unsigned Opcode = E->getOpcode();
875 assert((Opcode == Instruction::ExtractElement ||
876 Opcode == Instruction::ExtractValue) &&
877 "Expected extractelement or extractvalue instruction.");
878 if (Opcode == Instruction::ExtractElement) {
879 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
880 if (!CI)
881 return std::nullopt;
882 return CI->getZExtValue();
883 }
884 auto *EI = cast<ExtractValueInst>(E);
885 if (EI->getNumIndices() != 1)
886 return std::nullopt;
887 return *EI->idx_begin();
888}
889
890/// Checks if the provided value does not require scheduling. It does not
891/// require scheduling if this is not an instruction or it is an instruction
892/// that does not read/write memory and all operands are either not instructions
893/// or phi nodes or instructions from different blocks.
894static bool areAllOperandsNonInsts(Value *V);
895/// Checks if the provided value does not require scheduling. It does not
896/// require scheduling if this is not an instruction or it is an instruction
897/// that does not read/write memory and all users are phi nodes or instructions
898/// from the different blocks.
899static bool isUsedOutsideBlock(Value *V);
900/// Checks if the specified value does not require scheduling. It does not
901/// require scheduling if all operands and all users do not need to be scheduled
902/// in the current basic block.
903static bool doesNotNeedToBeScheduled(Value *V);
904
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910static bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914namespace {
915
916/// Helper class that determines whether VL can use the same opcode.
917/// Alternate instructions are supported. In addition, it supports
918/// interchangeable instructions: an interchangeable instruction is one that
919/// can be converted to another instruction with the same semantics. For
920/// example, x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
921class BinOpSameOpcodeHelper {
922 using MaskType = std::uint_fast16_t;
923 /// Sort SupportedOp because it is used by binary_search.
924 constexpr static std::initializer_list<unsigned> SupportedOp = {
925 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
926 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
927 enum : MaskType {
928 ShlBIT = 0b1,
929 AShrBIT = 0b10,
930 MulBIT = 0b100,
931 AddBIT = 0b1000,
932 SubBIT = 0b10000,
933 AndBIT = 0b100000,
934 OrBIT = 0b1000000,
935 XorBIT = 0b10000000,
936 MainOpBIT = 0b100000000,
938 };
939 /// Return a non-nullptr if either operand of I is a ConstantInt.
940 /// The second return value represents the operand position. We check the
941 /// right-hand side first (1). If the right hand side is not a ConstantInt and
942 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
943 /// side (0).
944 static std::pair<ConstantInt *, unsigned>
945 isBinOpWithConstantInt(const Instruction *I) {
946 unsigned Opcode = I->getOpcode();
947 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
948 (void)SupportedOp;
949 auto *BinOp = cast<BinaryOperator>(I);
950 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
951 return {CI, 1};
952 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
953 Opcode == Instruction::AShr)
954 return {nullptr, 0};
955 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
956 return {CI, 0};
957 return {nullptr, 0};
958 }
959 struct InterchangeableInfo {
960 const Instruction *I = nullptr;
961 /// The bit it sets represents whether MainOp can be converted to.
962 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
963 MulBIT | AShrBIT | ShlBIT;
964 /// We cannot create an interchangeable instruction that does not exist in
965 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
966 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
967 /// 1]. SeenBefore is used to know what operations have been seen before.
968 MaskType SeenBefore = 0;
969 InterchangeableInfo(const Instruction *I) : I(I) {}
970 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
971 /// instruction. Directly setting the mask will destroy the mask state,
972 /// preventing us from determining which instruction it should convert to.
973 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
974 if (Mask & InterchangeableMask) {
975 SeenBefore |= OpcodeInMaskForm;
976 Mask &= InterchangeableMask;
977 return true;
978 }
979 return false;
980 }
981 bool equal(unsigned Opcode) {
982 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
983 }
984 unsigned getOpcode() const {
985 MaskType Candidate = Mask & SeenBefore;
986 if (Candidate & MainOpBIT)
987 return I->getOpcode();
988 if (Candidate & ShlBIT)
989 return Instruction::Shl;
990 if (Candidate & AShrBIT)
991 return Instruction::AShr;
992 if (Candidate & MulBIT)
993 return Instruction::Mul;
994 if (Candidate & AddBIT)
995 return Instruction::Add;
996 if (Candidate & SubBIT)
997 return Instruction::Sub;
998 if (Candidate & AndBIT)
999 return Instruction::And;
1000 if (Candidate & OrBIT)
1001 return Instruction::Or;
1002 if (Candidate & XorBIT)
1003 return Instruction::Xor;
1004 llvm_unreachable("Cannot find interchangeable instruction.");
1005 }
1006
1007 /// Return true if the instruction can be converted to \p Opcode.
1008 bool hasCandidateOpcode(unsigned Opcode) const {
1009 MaskType Candidate = Mask & SeenBefore;
1010 switch (Opcode) {
1011 case Instruction::Shl:
1012 return Candidate & ShlBIT;
1013 case Instruction::AShr:
1014 return Candidate & AShrBIT;
1015 case Instruction::Mul:
1016 return Candidate & MulBIT;
1017 case Instruction::Add:
1018 return Candidate & AddBIT;
1019 case Instruction::Sub:
1020 return Candidate & SubBIT;
1021 case Instruction::And:
1022 return Candidate & AndBIT;
1023 case Instruction::Or:
1024 return Candidate & OrBIT;
1025 case Instruction::Xor:
1026 return Candidate & XorBIT;
1027 case Instruction::LShr:
1028 case Instruction::FAdd:
1029 case Instruction::FSub:
1030 case Instruction::FMul:
1031 case Instruction::SDiv:
1032 case Instruction::UDiv:
1033 case Instruction::FDiv:
1034 case Instruction::SRem:
1035 case Instruction::URem:
1036 case Instruction::FRem:
1037 return false;
1038 default:
1039 break;
1040 }
1041 llvm_unreachable("Cannot find interchangeable instruction.");
1042 }
1043
1044 SmallVector<Value *> getOperand(const Instruction *To) const {
1045 unsigned ToOpcode = To->getOpcode();
1046 unsigned FromOpcode = I->getOpcode();
1047 if (FromOpcode == ToOpcode)
1048 return SmallVector<Value *>(I->operands());
1049 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1050 auto [CI, Pos] = isBinOpWithConstantInt(I);
1051 const APInt &FromCIValue = CI->getValue();
1052 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1053 APInt ToCIValue;
1054 switch (FromOpcode) {
1055 case Instruction::Shl:
1056 if (ToOpcode == Instruction::Mul) {
1057 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1058 FromCIValue.getZExtValue());
1059 } else {
1060 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1061 ToCIValue = ToOpcode == Instruction::And
1062 ? APInt::getAllOnes(FromCIValueBitWidth)
1063 : APInt::getZero(FromCIValueBitWidth);
1064 }
1065 break;
1066 case Instruction::Mul:
1067 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1068 if (ToOpcode == Instruction::Shl) {
1069 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1070 } else {
1071 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1072 ToCIValue = ToOpcode == Instruction::And
1073 ? APInt::getAllOnes(FromCIValueBitWidth)
1074 : APInt::getZero(FromCIValueBitWidth);
1075 }
1076 break;
1077 case Instruction::Add:
1078 case Instruction::Sub:
1079 if (FromCIValue.isZero()) {
1080 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1081 } else {
1082 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1083 "Cannot convert the instruction.");
1084 ToCIValue = FromCIValue;
1085 ToCIValue.negate();
1086 }
1087 break;
1088 case Instruction::And:
1089 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1090 ToCIValue = ToOpcode == Instruction::Mul
1091 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1092 : APInt::getZero(FromCIValueBitWidth);
1093 break;
1094 default:
1095 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1096 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1097 break;
1098 }
1099 Value *LHS = I->getOperand(1 - Pos);
1100 Constant *RHS =
1101 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1102 // constant + x cannot be -constant - x
1103 // instead, it should be x - -constant
1104 if (Pos == 1 ||
1105 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1106 FromOpcode == Instruction::Xor) &&
1107 ToOpcode == Instruction::Sub))
1108 return SmallVector<Value *>({LHS, RHS});
1109 return SmallVector<Value *>({RHS, LHS});
1110 }
1111 };
1112 InterchangeableInfo MainOp;
1113 InterchangeableInfo AltOp;
1114 bool isValidForAlternation(const Instruction *I) const {
1115 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1116 ::isValidForAlternation(I->getOpcode());
1117 }
1118 bool initializeAltOp(const Instruction *I) {
1119 if (AltOp.I)
1120 return true;
1121 if (!isValidForAlternation(I))
1122 return false;
1123 AltOp.I = I;
1124 return true;
1125 }
1126
1127public:
1128 BinOpSameOpcodeHelper(const Instruction *MainOp,
1129 const Instruction *AltOp = nullptr)
1130 : MainOp(MainOp), AltOp(AltOp) {
1131 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1132 }
1133 bool add(const Instruction *I) {
1135 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1136 unsigned Opcode = I->getOpcode();
1137 MaskType OpcodeInMaskForm;
1138 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1139 switch (Opcode) {
1140 case Instruction::Shl:
1141 OpcodeInMaskForm = ShlBIT;
1142 break;
1143 case Instruction::AShr:
1144 OpcodeInMaskForm = AShrBIT;
1145 break;
1146 case Instruction::Mul:
1147 OpcodeInMaskForm = MulBIT;
1148 break;
1149 case Instruction::Add:
1150 OpcodeInMaskForm = AddBIT;
1151 break;
1152 case Instruction::Sub:
1153 OpcodeInMaskForm = SubBIT;
1154 break;
1155 case Instruction::And:
1156 OpcodeInMaskForm = AndBIT;
1157 break;
1158 case Instruction::Or:
1159 OpcodeInMaskForm = OrBIT;
1160 break;
1161 case Instruction::Xor:
1162 OpcodeInMaskForm = XorBIT;
1163 break;
1164 default:
1165 return MainOp.equal(Opcode) ||
1166 (initializeAltOp(I) && AltOp.equal(Opcode));
1167 }
1168 MaskType InterchangeableMask = OpcodeInMaskForm;
1169 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1170 if (CI) {
1171 constexpr MaskType CanBeAll =
1172 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1173 const APInt &CIValue = CI->getValue();
1174 switch (Opcode) {
1175 case Instruction::Shl:
1176 if (CIValue.ult(CIValue.getBitWidth()))
1177 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1178 break;
1179 case Instruction::Mul:
1180 if (CIValue.isOne()) {
1181 InterchangeableMask = CanBeAll;
1182 break;
1183 }
1184 if (CIValue.isPowerOf2())
1185 InterchangeableMask = MulBIT | ShlBIT;
1186 break;
1187 case Instruction::Add:
1188 case Instruction::Sub:
1189 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1190 break;
1191 case Instruction::And:
1192 if (CIValue.isAllOnes())
1193 InterchangeableMask = CanBeAll;
1194 break;
1195 case Instruction::Xor:
1196 if (CIValue.isZero())
1197 InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
1198 break;
1199 default:
1200 if (CIValue.isZero())
1201 InterchangeableMask = CanBeAll;
1202 break;
1203 }
1204 }
1205 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1206 (initializeAltOp(I) &&
1207 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1208 }
1209 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1210 /// Checks if the list of potential opcodes includes \p Opcode.
1211 bool hasCandidateOpcode(unsigned Opcode) const {
1212 return MainOp.hasCandidateOpcode(Opcode);
1213 }
1214 bool hasAltOp() const { return AltOp.I; }
1215 unsigned getAltOpcode() const {
1216 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1217 }
1218 SmallVector<Value *> getOperand(const Instruction *I) const {
1219 return MainOp.getOperand(I);
1220 }
1221};
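// Standalone sketch (not part of the pass, excluded from the build) of the
// algebraic identities the helper above relies on when treating instructions
// as interchangeable: x << 1 == x * 2, and x * 1 == x | 0 == x + 0 == x.
#if 0
#include <cassert>

int main() {
  unsigned X = 13;
  assert((X << 1) == X * 2);                    // shl by 1 behaves like mul by 2.
  assert(X * 1 == (X | 0) && (X | 0) == X + 0); // mul 1, or 0, add 0 are no-ops.
  return 0;
}
#endif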
1222
1223/// Main data required for vectorization of instructions.
1224class InstructionsState {
1225 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1226 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1227 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1228 /// isAltShuffle).
1229 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1230 /// from getMainAltOpsNoStateVL.
1231 /// For those InstructionsState that use alternate instructions, the resulting
1232 /// vectorized output ultimately comes from a shufflevector. For example,
1233 /// given a vector list (VL):
1234 /// VL[0] = add i32 a, e
1235 /// VL[1] = sub i32 b, f
1236 /// VL[2] = add i32 c, g
1237 /// VL[3] = sub i32 d, h
1238 /// The vectorized result would be:
1239 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1240 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1241 /// result = shufflevector <4 x i32> intermediated_0,
1242 /// <4 x i32> intermediated_1,
1243 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1244 /// Since shufflevector is used in the final result, when calculating the cost
1245 /// (getEntryCost), we must account for the usage of shufflevector in
1246 /// GetVectorCost.
1247 Instruction *MainOp = nullptr;
1248 Instruction *AltOp = nullptr;
1249 /// Whether the instruction state represents copyable instructions.
1250 bool HasCopyables = false;
1251
1252public:
1253 Instruction *getMainOp() const {
1254 assert(valid() && "InstructionsState is invalid.");
1255 return MainOp;
1256 }
1257
1258 Instruction *getAltOp() const {
1259 assert(valid() && "InstructionsState is invalid.");
1260 return AltOp;
1261 }
1262
1263 /// The main/alternate opcodes for the list of instructions.
1264 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1265
1266 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1267
1268 /// Some of the instructions in the list have alternate opcodes.
1269 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1270
1271 /// Checks if the instruction matches either the main or alternate opcode.
1272 /// \returns
1273 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1274 /// to it
1275 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1276 /// it
1277 /// - nullptr if \param I cannot be matched or converted to either opcode
1278 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1279 assert(MainOp && "MainOp cannot be nullptr.");
1280 if (I->getOpcode() == MainOp->getOpcode())
1281 return MainOp;
1282 // Prefer AltOp instead of interchangeable instruction of MainOp.
1283 assert(AltOp && "AltOp cannot be nullptr.");
1284 if (I->getOpcode() == AltOp->getOpcode())
1285 return AltOp;
1286 if (!I->isBinaryOp())
1287 return nullptr;
1288 BinOpSameOpcodeHelper Converter(MainOp);
1289 if (!Converter.add(I) || !Converter.add(MainOp))
1290 return nullptr;
1291 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1292 BinOpSameOpcodeHelper AltConverter(AltOp);
1293 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1294 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1295 return AltOp;
1296 }
1297 if (Converter.hasAltOp() && !isAltShuffle())
1298 return nullptr;
1299 return Converter.hasAltOp() ? AltOp : MainOp;
1300 }
1301
1302 /// Checks if main/alt instructions are shift operations.
1303 bool isShiftOp() const {
1304 return getMainOp()->isShift() && getAltOp()->isShift();
1305 }
1306
1307 /// Checks if main/alt instructions are bitwise logic operations.
1308 bool isBitwiseLogicOp() const {
1309 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1310 }
1311
1312 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1313 bool isMulDivLikeOp() const {
1314 constexpr std::array<unsigned, 8> MulDiv = {
1315 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1316 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1317 Instruction::URem, Instruction::FRem};
1318 return is_contained(MulDiv, getOpcode()) &&
1319 is_contained(MulDiv, getAltOpcode());
1320 }
1321
1322 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1323 bool isAddSubLikeOp() const {
1324 constexpr std::array<unsigned, 4> AddSub = {
1325 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1326 Instruction::FSub};
1327 return is_contained(AddSub, getOpcode()) &&
1328 is_contained(AddSub, getAltOpcode());
1329 }
1330
1331 /// Checks if main/alt instructions are cmp operations.
1332 bool isCmpOp() const {
1333 return (getOpcode() == Instruction::ICmp ||
1334 getOpcode() == Instruction::FCmp) &&
1335 getAltOpcode() == getOpcode();
1336 }
1337
1338 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1339 bool valid() const { return MainOp && AltOp; }
1340
1341 explicit operator bool() const { return valid(); }
1342
1343 InstructionsState() = delete;
1344 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1345 bool HasCopyables = false)
1346 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1347 static InstructionsState invalid() { return {nullptr, nullptr}; }
1348
1349 /// Checks if the value is a copyable element.
1350 bool isCopyableElement(Value *V) const {
1351 assert(valid() && "InstructionsState is invalid.");
1352 if (!HasCopyables)
1353 return false;
1354 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1355 return false;
1356 auto *I = dyn_cast<Instruction>(V);
1357 if (!I)
1358 return !isa<PoisonValue>(V);
1359 if (I->getParent() != MainOp->getParent() &&
1362 return true;
1363 if (I->getOpcode() == MainOp->getOpcode())
1364 return false;
1365 if (!I->isBinaryOp())
1366 return true;
1367 BinOpSameOpcodeHelper Converter(MainOp);
1368 return !Converter.add(I) || !Converter.add(MainOp) ||
1369 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1370 }
1371
1372 /// Checks if the value is non-schedulable.
1373 bool isNonSchedulable(Value *V) const {
1374 assert(valid() && "InstructionsState is invalid.");
1375 auto *I = dyn_cast<Instruction>(V);
1376 if (!HasCopyables)
1377 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1379 // MainOp for copyables is always schedulable, to correctly identify
1380 // non-schedulable copyables.
1381 if (getMainOp() == V)
1382 return false;
1383 if (isCopyableElement(V)) {
1384 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1385 auto *I = dyn_cast<Instruction>(V);
1386 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1388 // If the copyable instruction comes after MainOp
1389 // (non-schedulable, but used in the block) - cannot vectorize
1390 // it, will possibly generate use before def.
1391 !MainOp->comesBefore(I));
1392 };
1393
1394 return IsNonSchedulableCopyableElement(V);
1395 }
1396 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1398 }
1399
1400 /// Checks if the state represents copyable instructions.
1401 bool areInstructionsWithCopyableElements() const {
1402 assert(valid() && "InstructionsState is invalid.");
1403 return HasCopyables;
1404 }
1405};
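// Standalone sketch (not part of the pass, excluded from the build) of the
// alternate-opcode blend from the comment in InstructionsState: both an add
// and a sub vector are computed, then lanes are picked with mask <0, 5, 2, 7>.
#if 0
#include <cassert>

int main() {
  int A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40};
  int AddV[4], SubV[4], Result[4];
  for (int I = 0; I < 4; ++I) {
    AddV[I] = A[I] + B[I];
    SubV[I] = A[I] - B[I];
  }
  int Mask[4] = {0, 5, 2, 7}; // Lanes 0..3 pick AddV, lanes 4..7 pick SubV.
  for (int I = 0; I < 4; ++I)
    Result[I] = Mask[I] < 4 ? AddV[Mask[I]] : SubV[Mask[I] - 4];
  // Lanes 0 and 2 are adds, lanes 1 and 3 are subs, matching the scalar VL.
  assert(Result[0] == 11 && Result[1] == -18 && Result[2] == 33 &&
         Result[3] == -36);
  return 0;
}
#endif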
1406
1407std::pair<Instruction *, SmallVector<Value *>>
1408convertTo(Instruction *I, const InstructionsState &S) {
1409 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1410 assert(SelectedOp && "Cannot convert the instruction.");
1411 if (I->isBinaryOp()) {
1412 BinOpSameOpcodeHelper Converter(I);
1413 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1414 }
1415 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1416}
1417
1418} // end anonymous namespace
1419
1420static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1421 const TargetLibraryInfo &TLI);
1422
1423/// Find an instruction with a specific opcode in VL.
1424/// \param VL Array of values to search through. Must contain only Instructions
1425/// and PoisonValues.
1426/// \param Opcode The instruction opcode to search for
1427/// \returns
1428/// - The first instruction found with matching opcode
1429/// - nullptr if no matching instruction is found
1430static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1431 unsigned Opcode) {
1432 for (Value *V : VL) {
1433 if (isa<PoisonValue>(V))
1434 continue;
1435 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1436 auto *Inst = cast<Instruction>(V);
1437 if (Inst->getOpcode() == Opcode)
1438 return Inst;
1439 }
1440 return nullptr;
1441}
1442
1443/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1444/// compatible instructions or constants, or just some other regular values.
1445static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1446 Value *Op1, const TargetLibraryInfo &TLI) {
1447 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1448 (isConstant(BaseOp1) && isConstant(Op1)) ||
1449 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1450 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1451 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1452 getSameOpcode({BaseOp0, Op0}, TLI) ||
1453 getSameOpcode({BaseOp1, Op1}, TLI);
1454}
1455
1456/// \returns true if a compare instruction \p CI has similar "look" and
1457/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1458/// swapped, false otherwise.
1459static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1460 const TargetLibraryInfo &TLI) {
1461 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1462 "Assessing comparisons of different types?");
1463 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1464 CmpInst::Predicate Pred = CI->getPredicate();
1465 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1466
1467 Value *BaseOp0 = BaseCI->getOperand(0);
1468 Value *BaseOp1 = BaseCI->getOperand(1);
1469 Value *Op0 = CI->getOperand(0);
1470 Value *Op1 = CI->getOperand(1);
1471
1472 return (BasePred == Pred &&
1473 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1474 (BasePred == SwappedPred &&
1475 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1476}
1477
1478/// \returns analysis of the Instructions in \p VL described in
1479/// InstructionsState, the Opcode that we suppose the whole list
1480/// could be vectorized even if its structure is diverse.
1481static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1482 const TargetLibraryInfo &TLI) {
1483 // Make sure these are all Instructions.
1485 return InstructionsState::invalid();
1486
1487 auto *It = find_if(VL, IsaPred<Instruction>);
1488 if (It == VL.end())
1489 return InstructionsState::invalid();
1490
1491 Instruction *MainOp = cast<Instruction>(*It);
1492 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1493 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1494 (VL.size() == 2 && InstCnt < 2))
1495 return InstructionsState::invalid();
1496
1497 bool IsCastOp = isa<CastInst>(MainOp);
1498 bool IsBinOp = isa<BinaryOperator>(MainOp);
1499 bool IsCmpOp = isa<CmpInst>(MainOp);
1500 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1501 : CmpInst::BAD_ICMP_PREDICATE;
1502 Instruction *AltOp = MainOp;
1503 unsigned Opcode = MainOp->getOpcode();
1504 unsigned AltOpcode = Opcode;
1505
1506 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1507 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1508 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1509 UniquePreds.insert(BasePred);
1510 UniqueNonSwappedPreds.insert(BasePred);
1511 for (Value *V : VL) {
1512 auto *I = dyn_cast<CmpInst>(V);
1513 if (!I)
1514 return false;
1515 CmpInst::Predicate CurrentPred = I->getPredicate();
1516 CmpInst::Predicate SwappedCurrentPred =
1517 CmpInst::getSwappedPredicate(CurrentPred);
1518 UniqueNonSwappedPreds.insert(CurrentPred);
1519 if (!UniquePreds.contains(CurrentPred) &&
1520 !UniquePreds.contains(SwappedCurrentPred))
1521 UniquePreds.insert(CurrentPred);
1522 }
1523 // If the total number of predicates is > 2, but only 2 remain once swapped
1524 // predicates are treated as compatible, treat the swappable predicates as
1525 // compatible opcodes rather than alternates.
1526 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1527 }();
1528 // Check for one alternate opcode from another BinaryOperator.
1529 // TODO - generalize to support all operators (types, calls etc.).
1530 Intrinsic::ID BaseID = 0;
1531 SmallVector<VFInfo> BaseMappings;
1532 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1533 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1534 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1535 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1536 return InstructionsState::invalid();
1537 }
1538 bool AnyPoison = InstCnt != VL.size();
1539 // Check MainOp too to be sure that it matches the requirements for the
1540 // instructions.
1541 for (Value *V : iterator_range(It, VL.end())) {
1542 auto *I = dyn_cast<Instruction>(V);
1543 if (!I)
1544 continue;
1545
1546 // Cannot combine poison and divisions.
1547 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1548 // intrinsics/functions only.
1549 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1550 return InstructionsState::invalid();
1551 unsigned InstOpcode = I->getOpcode();
1552 if (IsBinOp && isa<BinaryOperator>(I)) {
1553 if (BinOpHelper.add(I))
1554 continue;
1555 } else if (IsCastOp && isa<CastInst>(I)) {
1556 Value *Op0 = MainOp->getOperand(0);
1557 Type *Ty0 = Op0->getType();
1558 Value *Op1 = I->getOperand(0);
1559 Type *Ty1 = Op1->getType();
1560 if (Ty0 == Ty1) {
1561 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1562 continue;
1563 if (Opcode == AltOpcode) {
1564 assert(isValidForAlternation(Opcode) &&
1565 isValidForAlternation(InstOpcode) &&
1566 "Cast isn't safe for alternation, logic needs to be updated!");
1567 AltOpcode = InstOpcode;
1568 AltOp = I;
1569 continue;
1570 }
1571 }
1572 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1573 auto *BaseInst = cast<CmpInst>(MainOp);
1574 Type *Ty0 = BaseInst->getOperand(0)->getType();
1575 Type *Ty1 = Inst->getOperand(0)->getType();
1576 if (Ty0 == Ty1) {
1577 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1578 assert(InstOpcode == AltOpcode &&
1579 "Alternate instructions are only supported by BinaryOperator "
1580 "and CastInst.");
1581 // Check for compatible operands. If the corresponding operands are not
1582 // compatible - need to perform alternate vectorization.
1583 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1584 CmpInst::Predicate SwappedCurrentPred =
1585 CmpInst::getSwappedPredicate(CurrentPred);
1586
1587 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1588 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1589 continue;
1590
1591 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1592 continue;
1593 auto *AltInst = cast<CmpInst>(AltOp);
1594 if (MainOp != AltOp) {
1595 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1596 continue;
1597 } else if (BasePred != CurrentPred) {
1598 assert(
1599 isValidForAlternation(InstOpcode) &&
1600 "CmpInst isn't safe for alternation, logic needs to be updated!");
1601 AltOp = I;
1602 continue;
1603 }
1604 CmpInst::Predicate AltPred = AltInst->getPredicate();
1605 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1606 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1607 continue;
1608 }
1609 } else if (InstOpcode == Opcode) {
1610 assert(InstOpcode == AltOpcode &&
1611 "Alternate instructions are only supported by BinaryOperator and "
1612 "CastInst.");
1613 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1614 if (Gep->getNumOperands() != 2 ||
1615 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1616 return InstructionsState::invalid();
1617 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1619 return InstructionsState::invalid();
1620 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1621 auto *BaseLI = cast<LoadInst>(MainOp);
1622 if (!LI->isSimple() || !BaseLI->isSimple())
1623 return InstructionsState::invalid();
1624 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1625 auto *CallBase = cast<CallInst>(MainOp);
1626 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1627 return InstructionsState::invalid();
1628 if (Call->hasOperandBundles() &&
1629 (!CallBase->hasOperandBundles() ||
1630 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1631 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1632 CallBase->op_begin() +
1633 CallBase->getBundleOperandsStartIndex())))
1634 return InstructionsState::invalid();
1635 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1636 if (ID != BaseID)
1637 return InstructionsState::invalid();
1638 if (!ID) {
1639 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1640 if (Mappings.size() != BaseMappings.size() ||
1641 Mappings.front().ISA != BaseMappings.front().ISA ||
1642 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1643 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1644 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1645 Mappings.front().Shape.Parameters !=
1646 BaseMappings.front().Shape.Parameters)
1647 return InstructionsState::invalid();
1648 }
1649 }
1650 continue;
1651 }
1652 return InstructionsState::invalid();
1653 }
1654
1655 if (IsBinOp) {
1656 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1657 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1658 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1659 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1660 }
1661 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1662 "Incorrect implementation of allSameOpcode.");
1663 InstructionsState S(MainOp, AltOp);
1664 assert(all_of(VL,
1665 [&](Value *V) {
1666 return isa<PoisonValue>(V) ||
1667 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1668 }) &&
1669 "Invalid InstructionsState.");
1670 return S;
1671}
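// A small illustrative sketch: for a bundle such as VL = { a + b, c + d,
// e - f, g + h } the loop above accepts the mix via BinOpSameOpcodeHelper, and
// the returned InstructionsState typically describes an add/sub
// alternate-shuffle bundle (main opcode Add, alternate opcode Sub), while a
// bundle mixing, say, loads and stores falls through to
// InstructionsState::invalid().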
1672
1673/// \returns true if all of the values in \p VL have the same type or false
1674/// otherwise.
1675static bool allSameType(ArrayRef<Value *> VL) {
1676 Type *Ty = VL.consume_front()->getType();
1677 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1678}
1679
1680/// \returns True if in-tree use also needs extract. This refers to
1681/// possible scalar operand in vectorized instruction.
1682static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1683 TargetLibraryInfo *TLI,
1684 const TargetTransformInfo *TTI) {
1685 if (!UserInst)
1686 return false;
1687 unsigned Opcode = UserInst->getOpcode();
1688 switch (Opcode) {
1689 case Instruction::Load: {
1690 LoadInst *LI = cast<LoadInst>(UserInst);
1691 return (LI->getPointerOperand() == Scalar);
1692 }
1693 case Instruction::Store: {
1694 StoreInst *SI = cast<StoreInst>(UserInst);
1695 return (SI->getPointerOperand() == Scalar);
1696 }
1697 case Instruction::Call: {
1698 CallInst *CI = cast<CallInst>(UserInst);
1700 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1701 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1702 Arg.value().get() == Scalar;
1703 });
1704 }
1705 default:
1706 return false;
1707 }
1708}
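// For example (illustrative only): if the vectorized scalar %p is used by an
// in-tree user `store i32 %x, ptr %p` as the pointer operand, the scalar %p
// still has to be extracted from the vector, so this returns true; if %p were
// the stored value operand instead, it would return false here.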
1709
1710/// \returns the AA location that is being accessed by the instruction.
1711static MemoryLocation getLocation(Instruction *I) {
1712 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1713 return MemoryLocation::get(SI);
1714 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1715 return MemoryLocation::get(LI);
1716 return MemoryLocation();
1717}
1718
1719/// \returns True if the instruction is not a volatile or atomic load/store.
1720static bool isSimple(Instruction *I) {
1721 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1722 return LI->isSimple();
1723 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1724 return SI->isSimple();
1725 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1726 return !MI->isVolatile();
1727 return true;
1728}
1729
1730/// Shuffles \p Mask in accordance with the given \p SubMask.
1731/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1732/// one but two input vectors.
1733static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1734 bool ExtendingManyInputs = false) {
1735 if (SubMask.empty())
1736 return;
1737 assert(
1738 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1739 // Check if input scalars were extended to match the size of other node.
1740 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1741 "SubMask with many inputs support must be larger than the mask.");
1742 if (Mask.empty()) {
1743 Mask.append(SubMask.begin(), SubMask.end());
1744 return;
1745 }
1746 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1747 int TermValue = std::min(Mask.size(), SubMask.size());
1748 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1749 if (SubMask[I] == PoisonMaskElem ||
1750 (!ExtendingManyInputs &&
1751 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1752 continue;
1753 NewMask[I] = Mask[SubMask[I]];
1754 }
1755 Mask.swap(NewMask);
1756}
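// A minimal usage sketch of the composition above (values made up for
// illustration, ExtendingManyInputs left as false):
//   SmallVector<int> Mask = {1, 0, 3, 2};
//   addMask(Mask, {3, 1, 0, 2}); // NewMask[I] = Mask[SubMask[I]] -> {2, 0, 1, 3}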
1757
1758/// Order may have elements assigned special value (size) which is out of
1759/// bounds. Such indices only appear on places which correspond to undef values
1760/// (see canReuseExtract for details) and are used to prevent undef values
1761/// from affecting the operand ordering.
1762/// The first loop below simply finds all unused indices and then the next loop
1763/// nest assigns these indices for undef values positions.
1764/// As an example below Order has two undef positions and they have assigned
1765/// values 3 and 7 respectively:
1766/// before: 6 9 5 4 9 2 1 0
1767/// after: 6 3 5 4 7 2 1 0
1768static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1769 const size_t Sz = Order.size();
1770 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1771 SmallBitVector MaskedIndices(Sz);
1772 for (unsigned I = 0; I < Sz; ++I) {
1773 if (Order[I] < Sz)
1774 UnusedIndices.reset(Order[I]);
1775 else
1776 MaskedIndices.set(I);
1777 }
1778 if (MaskedIndices.none())
1779 return;
1780 assert(UnusedIndices.count() == MaskedIndices.count() &&
1781 "Non-synced masked/available indices.");
1782 int Idx = UnusedIndices.find_first();
1783 int MIdx = MaskedIndices.find_first();
1784 while (MIdx >= 0) {
1785 assert(Idx >= 0 && "Indices must be synced.");
1786 Order[MIdx] = Idx;
1787 Idx = UnusedIndices.find_next(Idx);
1788 MIdx = MaskedIndices.find_next(MIdx);
1789 }
1790}
1791
1792/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1793/// Opcode1.
1794static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1795 unsigned Opcode0, unsigned Opcode1) {
1796 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1797 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1798 for (unsigned Lane : seq<unsigned>(VL.size())) {
1799 if (isa<PoisonValue>(VL[Lane]))
1800 continue;
1801 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1802 OpcodeMask.set(Lane * ScalarTyNumElements,
1803 Lane * ScalarTyNumElements + ScalarTyNumElements);
1804 }
1805 return OpcodeMask;
1806}
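// Illustrative example: for VL = { add, sub, add, sub } with a scalar element
// type (one element per lane), Opcode0 = Add and Opcode1 = Sub, the returned
// bitset is {0, 1, 0, 1} - only the lanes holding the Opcode1 (Sub)
// instructions are set.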
1807
1808/// Replicates the given \p Val \p VF times.
1810 unsigned VF) {
1811 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1812 "Expected scalar constants.");
1813 SmallVector<Constant *> NewVal(Val.size() * VF);
1814 for (auto [I, V] : enumerate(Val))
1815 std::fill_n(NewVal.begin() + I * VF, VF, V);
1816 return NewVal;
1817}
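// Illustrative example: replicating Val = {C0, C1} with VF = 3 produces
// {C0, C0, C0, C1, C1, C1} - each scalar constant is repeated VF times in
// place, preserving the original order of Val.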
1818
1819static void inversePermutation(ArrayRef<unsigned> Indices,
1820 SmallVectorImpl<int> &Mask) {
1821 Mask.clear();
1822 const unsigned E = Indices.size();
1823 Mask.resize(E, PoisonMaskElem);
1824 for (unsigned I = 0; I < E; ++I)
1825 Mask[Indices[I]] = I;
1826}
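// Illustrative example: for Indices = {2, 0, 3, 1} the loop sets
// Mask[2] = 0, Mask[0] = 1, Mask[3] = 2, Mask[1] = 3, i.e. the inverse
// permutation Mask = {1, 3, 0, 2}.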
1827
1828/// Reorders the list of scalars in accordance with the given \p Mask.
1829static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1830 ArrayRef<int> Mask) {
1831 assert(!Mask.empty() && "Expected non-empty mask.");
1832 SmallVector<Value *> Prev(Scalars.size(),
1833 PoisonValue::get(Scalars.front()->getType()));
1834 Prev.swap(Scalars);
1835 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1836 if (Mask[I] != PoisonMaskElem)
1837 Scalars[Mask[I]] = Prev[I];
1838}
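// Illustrative example: for Scalars = {a, b, c, d} and Mask = {2, 0, 3, 1}
// each previous element Prev[I] is moved to position Mask[I], giving
// Scalars = {b, d, a, c}; PoisonMaskElem entries leave poison behind.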
1839
1840/// Checks if the provided value does not require scheduling. It does not
1841/// require scheduling if this is not an instruction or it is an instruction
1842/// that does not read/write memory and all operands are either not
1843/// instructions, or are phi nodes or instructions from other blocks.
1844static bool areAllOperandsNonInsts(Value *V) {
1845 auto *I = dyn_cast<Instruction>(V);
1846 if (!I)
1847 return true;
1848 return !mayHaveNonDefUseDependency(*I) &&
1849 all_of(I->operands(), [I](Value *V) {
1850 auto *IO = dyn_cast<Instruction>(V);
1851 if (!IO)
1852 return true;
1853 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1854 });
1855}
1856
1857/// Checks if the provided value does not require scheduling. It does not
1858/// require scheduling if this is not an instruction or it is an instruction
1859/// that does not read/write memory and all users are phi nodes or instructions
1860/// from other blocks.
1861static bool isUsedOutsideBlock(Value *V) {
1862 auto *I = dyn_cast<Instruction>(V);
1863 if (!I)
1864 return true;
1865 // Limits the number of uses to save compile time.
1866 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1867 all_of(I->users(), [I](User *U) {
1868 auto *IU = dyn_cast<Instruction>(U);
1869 if (!IU)
1870 return true;
1871 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1872 });
1873}
1874
1875/// Checks if the specified value does not require scheduling. It does not
1876/// require scheduling if all operands and all users do not need to be scheduled
1877/// in the current basic block.
1878static bool doesNotNeedToBeScheduled(Value *V) {
1879 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1880}
1881
1882/// Checks if the specified array of instructions does not require scheduling.
1883/// This is the case if either all instructions have operands that do not
1884/// require scheduling, or all their users do not require scheduling because
1885/// they are phis or live in other basic blocks.
1886static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1887 return !VL.empty() &&
1888 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1889}
1890
1891/// Returns true if the widened type of \p Ty elements with size \p Sz
1892/// represents a full vector type, i.e. adding an extra element results in
1893/// extra parts upon type legalization.
1894static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1895 unsigned Sz) {
1896 if (Sz <= 1)
1897 return false;
1898 if (!isValidElementType(Ty))
1899 return false;
1900 if (has_single_bit(Sz))
1901 return true;
1902 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1903 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1904 Sz % NumParts == 0;
1905}
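// Illustrative example (assuming a target with 128-bit vector registers):
// Sz = 8 is a power of two, so the helper returns true immediately; for
// Sz = 12 with i32 elements the widened type <12 x i32> legalizes to 3 parts,
// 12 % 3 == 0 and 12 / 3 == 4 is a power of two, so it also returns true,
// while Sz = 6 (2 parts of 3 elements each) returns false.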
1906
1907/// Returns the number of parts the type \p VecTy will be split into at the
1908/// codegen phase. If the type is going to be scalarized or does not use
1909/// whole registers, returns 1.
1910static unsigned
1911 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1912 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1913 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1914 if (NumParts == 0 || NumParts >= Limit)
1915 return 1;
1916 unsigned Sz = getNumElements(VecTy);
1917 if (NumParts >= Sz || Sz % NumParts != 0 ||
1918 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1919 return 1;
1920 return NumParts;
1921}
1922
1923/// Bottom Up SLP Vectorizer.
1924class BoUpSLP {
1925 class TreeEntry;
1926 class ScheduleEntity;
1927 class ScheduleData;
1928 class ScheduleCopyableData;
1929 class ScheduleBundle;
1932
1933 /// If we decide to generate strided load / store, this struct contains all
1934 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1935 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1936 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1937 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1938 /// the element size of the FixedVectorType.
1939 struct StridedPtrInfo {
1940 Value *StrideVal = nullptr;
1941 const SCEV *StrideSCEV = nullptr;
1942 FixedVectorType *Ty = nullptr;
1943 };
1944 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1945
1946public:
1947 /// Tracks the state we can represent the loads in the given sequence.
1955
1962
1964 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1966 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1967 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1968 AC(AC), DB(DB), DL(DL), ORE(ORE),
1969 Builder(Se->getContext(), TargetFolder(*DL)) {
1970 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1971 // Use the vector register size specified by the target unless overridden
1972 // by a command-line option.
1973 // TODO: It would be better to limit the vectorization factor based on
1974 // data type rather than just register size. For example, x86 AVX has
1975 // 256-bit registers, but it does not support integer operations
1976 // at that width (that requires AVX2).
1977 if (MaxVectorRegSizeOption.getNumOccurrences())
1978 MaxVecRegSize = MaxVectorRegSizeOption;
1979 else
1980 MaxVecRegSize =
1981 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1982 .getFixedValue();
1983
1984 if (MinVectorRegSizeOption.getNumOccurrences())
1985 MinVecRegSize = MinVectorRegSizeOption;
1986 else
1987 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1988 }
1989
1990 /// Vectorize the tree that starts with the elements in \p VL.
1991 /// Returns the vectorized root.
1993
1994 /// Vectorize the tree but with the list of externally used values \p
1995 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1996 /// generated extractvalue instructions.
1998 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1999 Instruction *ReductionRoot = nullptr,
2000 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2001
2002 /// \returns the cost incurred by unwanted spills and fills, caused by
2003 /// holding live values over call sites.
2005
2006 /// \returns the vectorization cost of the subtree that starts at \p VL.
2007 /// A negative number means that this is profitable.
2008 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2009 InstructionCost ReductionCost = TTI::TCC_Free);
2010
2011 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2012 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2013 void buildTree(ArrayRef<Value *> Roots,
2014 const SmallDenseSet<Value *> &UserIgnoreLst);
2015
2016 /// Construct a vectorizable tree that starts at \p Roots.
2017 void buildTree(ArrayRef<Value *> Roots);
2018
2019 /// Return the scalars of the root node.
2021 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2022 return VectorizableTree.front()->Scalars;
2023 }
2024
2025 /// Returns the type/is-signed info for the root node in the graph without
2026 /// casting.
2027 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2028 const TreeEntry &Root = *VectorizableTree.front();
2029 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2030 !Root.Scalars.front()->getType()->isIntegerTy())
2031 return std::nullopt;
2032 auto It = MinBWs.find(&Root);
2033 if (It != MinBWs.end())
2034 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2035 It->second.first),
2036 It->second.second);
2037 if (Root.getOpcode() == Instruction::ZExt ||
2038 Root.getOpcode() == Instruction::SExt)
2039 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2040 Root.getOpcode() == Instruction::SExt);
2041 return std::nullopt;
2042 }
2043
2044 /// Checks if the root graph node can be emitted with narrower bitwidth at
2045 /// codegen and returns it signedness, if so.
2047 return MinBWs.at(VectorizableTree.front().get()).second;
2048 }
2049
2050 /// Returns reduction type after minbitdth analysis.
2052 if (ReductionBitWidth == 0 ||
2053 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2054 ReductionBitWidth >=
2055 DL->getTypeSizeInBits(
2056 VectorizableTree.front()->Scalars.front()->getType()))
2057 return getWidenedType(
2058 VectorizableTree.front()->Scalars.front()->getType(),
2059 VectorizableTree.front()->getVectorFactor());
2060 return getWidenedType(
2062 VectorizableTree.front()->Scalars.front()->getContext(),
2063 ReductionBitWidth),
2064 VectorizableTree.front()->getVectorFactor());
2065 }
2066
2067 /// Builds external uses of the vectorized scalars, i.e. the list of
2068 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2069 /// ExternallyUsedValues contains additional list of external uses to handle
2070 /// vectorization of reductions.
2071 void
2072 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2073
2074 /// Transforms graph nodes to target specific representations, if profitable.
2075 void transformNodes();
2076
2077 /// Clear the internal data structures that are created by 'buildTree'.
2078 void deleteTree() {
2079 VectorizableTree.clear();
2080 ScalarToTreeEntries.clear();
2081 OperandsToTreeEntry.clear();
2082 ScalarsInSplitNodes.clear();
2083 MustGather.clear();
2084 NonScheduledFirst.clear();
2085 EntryToLastInstruction.clear();
2086 LastInstructionToPos.clear();
2087 LoadEntriesToVectorize.clear();
2088 IsGraphTransformMode = false;
2089 GatheredLoadsEntriesFirst.reset();
2090 CompressEntryToData.clear();
2091 ExternalUses.clear();
2092 ExternalUsesAsOriginalScalar.clear();
2093 ExternalUsesWithNonUsers.clear();
2094 for (auto &Iter : BlocksSchedules) {
2095 BlockScheduling *BS = Iter.second.get();
2096 BS->clear();
2097 }
2098 MinBWs.clear();
2099 ReductionBitWidth = 0;
2100 BaseGraphSize = 1;
2101 CastMaxMinBWSizes.reset();
2102 ExtraBitWidthNodes.clear();
2103 InstrElementSize.clear();
2104 UserIgnoreList = nullptr;
2105 PostponedGathers.clear();
2106 ValueToGatherNodes.clear();
2107 TreeEntryToStridedPtrInfoMap.clear();
2108 }
2109
2110 unsigned getTreeSize() const { return VectorizableTree.size(); }
2111
2112 /// Returns the base graph size, before any transformations.
2113 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2114
2115 /// Perform LICM and CSE on the newly generated gather sequences.
2117
2118 /// Does this non-empty order represent an identity order? Identity
2119 /// should be represented as an empty order, so this is used to
2120 /// decide if we can canonicalize a computed order. Undef elements
2121 /// (represented as size) are ignored.
2123 assert(!Order.empty() && "expected non-empty order");
2124 const unsigned Sz = Order.size();
2125 return all_of(enumerate(Order), [&](const auto &P) {
2126 return P.value() == P.index() || P.value() == Sz;
2127 });
2128 }
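 // Illustrative example: with Sz = 4, Order = {0, 1, 4, 3} is still treated as
 // an identity order because the out-of-bounds value 4 (== Sz) marks an undef
 // position and is ignored, whereas Order = {1, 0, 2, 3} is not.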
2129
2130 /// Checks if the specified gather tree entry \p TE can be represented as a
2131 /// shuffled vector entry + (possibly) permutation with other gathers. It
2132 /// implements the checks only for possibly ordered scalars (Loads,
2133 /// ExtractElement, ExtractValue), which can be part of the graph.
2134 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2135 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2136 /// node might be ignored.
2137 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2138 bool TopToBottom,
2139 bool IgnoreReorder);
2140
2141 /// Sort loads into increasing pointers offsets to allow greater clustering.
2142 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2143
2144 /// Gets reordering data for the given tree entry. If the entry is vectorized
2145 /// - just return ReorderIndices, otherwise check if the scalars can be
2146 /// reordered and return the most optimal order.
2147 /// \return std::nullopt if ordering is not important, empty order, if
2148 /// identity order is important, or the actual order.
2149 /// \param TopToBottom If true, include the order of vectorized stores and
2150 /// insertelement nodes, otherwise skip them.
2151 /// \param IgnoreReorder true, if the root node order can be ignored.
2152 std::optional<OrdersType>
2153 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2154
2155 /// Checks if it is profitable to reorder the current tree.
2156 /// If the tree does not contain many profitably reorderable nodes, it is
2157 /// better to skip it to save compile time.
2158 bool isProfitableToReorder() const;
2159
2160 /// Reorders the current graph to the most profitable order starting from the
2161 /// root node to the leaf nodes. The best order is chosen only from the nodes
2162 /// of the same size (vectorization factor). Smaller nodes are considered
2163 /// parts of subgraph with smaller VF and they are reordered independently. We
2164 /// can make it because we still need to extend smaller nodes to the wider VF
2165 /// and we can merge reordering shuffles with the widening shuffles.
2166 void reorderTopToBottom();
2167
2168 /// Reorders the current graph to the most profitable order starting from
2169 /// the leaves to the root. It allows rotating small subgraphs and reducing
2170 /// the number of reshuffles if the leaf nodes use the same order. In this
2171 /// case we can merge the orders and just shuffle the user node instead of
2172 /// shuffling its operands. Plus, even if the leaf nodes have different
2173 /// orders, it allows sinking the reordering in the graph closer to the root
2174 /// node and merging it later during analysis.
2175 void reorderBottomToTop(bool IgnoreReorder = false);
2176
2177 /// \return The vector element size in bits to use when vectorizing the
2178 /// expression tree ending at \p V. If V is a store, the size is the width of
2179 /// the stored value. Otherwise, the size is the width of the largest loaded
2180 /// value reaching V. This method is used by the vectorizer to calculate
2181 /// vectorization factors.
2182 unsigned getVectorElementSize(Value *V);
2183
2184 /// Compute the minimum type sizes required to represent the entries in a
2185 /// vectorizable tree.
2187
2188 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2189 unsigned getMaxVecRegSize() const {
2190 return MaxVecRegSize;
2191 }
2192
2193 // \returns minimum vector register size as set by cl::opt.
2194 unsigned getMinVecRegSize() const {
2195 return MinVecRegSize;
2196 }
2197
2198 unsigned getMinVF(unsigned Sz) const {
2199 return std::max(2U, getMinVecRegSize() / Sz);
2200 }
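 // For example, with a 128-bit minimum vector register size and 32-bit
 // elements (Sz = 32), getMinVF returns std::max(2U, 128 / 32) = 4.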
2201
2202 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2203 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2204 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2205 return MaxVF ? MaxVF : UINT_MAX;
2206 }
2207
2208 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2209 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2210 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2211 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2212 ///
2213 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2214 unsigned canMapToVector(Type *T) const;
2215
2216 /// \returns True if the VectorizableTree is both tiny and not fully
2217 /// vectorizable. We do not vectorize such trees.
2218 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2219
2220 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2221 /// It may happen, if all gather nodes are loads and they cannot be
2222 /// "clusterized". In this case even subgraphs cannot be vectorized more
2223 /// effectively than the base graph.
2224 bool isTreeNotExtendable() const;
2225
2226 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2227 /// can be load combined in the backend. Load combining may not be allowed in
2228 /// the IR optimizer, so we do not want to alter the pattern. For example,
2229 /// partially transforming a scalar bswap() pattern into vector code is
2230 /// effectively impossible for the backend to undo.
2231 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2232 /// may not be necessary.
2233 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2234
2235 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2236 /// can be load combined in the backend. Load combining may not be allowed in
2237 /// the IR optimizer, so we do not want to alter the pattern. For example,
2238 /// partially transforming a scalar bswap() pattern into vector code is
2239 /// effectively impossible for the backend to undo.
2240 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2241 /// may not be necessary.
2242 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2243 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2244 Align Alignment, const int64_t Diff,
2245 const size_t Sz) const;
2246
2247 /// Return true if an array of scalar loads can be replaced with a strided
2248 /// load (with constant stride).
2249 ///
2250 /// It is possible that the load gets "widened". Suppose that originally each
2251 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2252 /// constant):
 /// %b + 0 * %s + 0
 /// %b + 0 * %s + 1
 /// %b + 0 * %s + 2
2253 /// ...
2254 /// %b + 0 * %s + (w - 1)
2255 ///
2256 /// %b + 1 * %s + 0
2257 /// %b + 1 * %s + 1
2258 /// %b + 1 * %s + 2
2259 /// ...
2260 /// %b + 1 * %s + (w - 1)
2261 /// ...
2262 ///
2263 /// %b + (n - 1) * %s + 0
2264 /// %b + (n - 1) * %s + 1
2265 /// %b + (n - 1) * %s + 2
2266 /// ...
2267 /// %b + (n - 1) * %s + (w - 1)
2268 ///
2269 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2270 ///
2271 /// \param PointerOps list of pointer arguments of loads.
2272 /// \param ElemTy original scalar type of loads.
2273 /// \param Alignment alignment of the first load.
2274 /// \param SortedIndices is the order of PointerOps as returned by
2275 /// `sortPtrAccesses`
2276 /// \param Diff Pointer difference between the lowest and the highest pointer
2277 /// in `PointerOps` as returned by `getPointersDiff`.
2278 /// \param Ptr0 first pointer in `PointersOps`.
2279 /// \param PtrN last pointer in `PointersOps`.
2280 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2281 /// of `SPtrInfo` necessary to generate the strided load later.
2283 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2284 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2285 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
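 // A concrete instance of the widening described above (illustrative only):
 // with k = 4 (i32 loads), w = 2 and n = 4, the pointers
 //   %b, %b+4, %b+%s, %b+%s+4, %b+2*%s, %b+2*%s+4, %b+3*%s, %b+3*%s+4
 // can be covered by a single strided load of type <4 x i64> (n elements of
 // k * w = 8 bytes each) with a byte stride of %s.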
2286
2287 /// Return true if an array of scalar loads can be replaced with a strided
2288 /// load (with run-time stride).
2289 /// \param PointerOps list of pointer arguments of loads.
2290 /// \param ScalarTy type of loads.
2291 /// \param CommonAlignment common alignment of loads as computed by
2292 /// `computeCommonAlignment<LoadInst>`.
2293 /// \param SortedIndices is a list of indices computed by this function such
2294 /// that the sequence `PointerOps[SortedIndices[0]],
2295 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2296 /// ordered by the coefficient of the stride. For example, if PointerOps is
2297 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2298 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2299 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2300 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2301 /// of `SPtrInfo` necessary to generate the strided load later.
2302 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2303 Align CommonAlignment,
2304 SmallVectorImpl<unsigned> &SortedIndices,
2305 StridedPtrInfo &SPtrInfo) const;
2306
2307 /// Checks if the given array of loads can be represented as a vectorized,
2308 /// scatter or just simple gather.
2309 /// \param VL list of loads.
2310 /// \param VL0 main load value.
2311 /// \param Order returned order of load instructions.
2312 /// \param PointerOps returned list of pointer operands.
2313 /// \param BestVF return best vector factor, if recursive check found better
2314 /// vectorization sequences rather than masked gather.
2315 /// \param TryRecursiveCheck used to check if long masked gather can be
2316 /// represented as a series of loads/insert-subvector operations, if profitable.
2319 SmallVectorImpl<Value *> &PointerOps,
2320 StridedPtrInfo &SPtrInfo,
2321 unsigned *BestVF = nullptr,
2322 bool TryRecursiveCheck = true) const;
2323
2324 /// Registers a non-vectorizable sequence of loads
2325 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2326 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2327 }
2328
2329 /// Checks if the given load sequence is known to be non-vectorizable
2330 template <typename T>
2332 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2333 }
2334
2336
2337 /// This structure holds any data we need about the edges being traversed
2338 /// during buildTreeRec(). We keep track of:
2339 /// (i) the user TreeEntry index, and
2340 /// (ii) the index of the edge.
2341 struct EdgeInfo {
2342 EdgeInfo() = default;
2343 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2345 /// The user TreeEntry.
2346 TreeEntry *UserTE = nullptr;
2347 /// The operand index of the use.
2348 unsigned EdgeIdx = UINT_MAX;
2349#ifndef NDEBUG
2351 const BoUpSLP::EdgeInfo &EI) {
2352 EI.dump(OS);
2353 return OS;
2354 }
2355 /// Debug print.
2356 void dump(raw_ostream &OS) const {
2357 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2358 << " EdgeIdx:" << EdgeIdx << "}";
2359 }
2360 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2361#endif
2362 bool operator == (const EdgeInfo &Other) const {
2363 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2364 }
2365
2366 operator bool() const { return UserTE != nullptr; }
2367 };
2368 friend struct DenseMapInfo<EdgeInfo>;
2369
2370 /// A helper class used for scoring candidates for two consecutive lanes.
2372 const TargetLibraryInfo &TLI;
2373 const DataLayout &DL;
2374 ScalarEvolution &SE;
2375 const BoUpSLP &R;
2376 int NumLanes; // Total number of lanes (aka vectorization factor).
2377 int MaxLevel; // The maximum recursion depth for accumulating score.
2378
2379 public:
2381 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2382 int MaxLevel)
2383 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2384 MaxLevel(MaxLevel) {}
2385
2386 // The hard-coded scores listed here are not very important, though it shall
2387 // be higher for better matches to improve the resulting cost. When
2388 // computing the scores of matching one sub-tree with another, we are
2389 // basically counting the number of values that are matching. So even if all
2390 // scores are set to 1, we would still get a decent matching result.
2391 // However, sometimes we have to break ties. For example we may have to
2392 // choose between matching loads vs matching opcodes. This is what these
2393 // scores are helping us with: they provide the order of preference. Also,
2394 // this is important if the scalar is externally used or used in another
2395 // tree entry node in the different lane.
2396
2397 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2398 static const int ScoreConsecutiveLoads = 4;
2399 /// The same load multiple times. This should have a better score than
2400 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2401 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2402 /// a vector load and 1.0 for a broadcast.
2403 static const int ScoreSplatLoads = 3;
2404 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2405 static const int ScoreReversedLoads = 3;
2406 /// A load candidate for masked gather.
2407 static const int ScoreMaskedGatherCandidate = 1;
2408 /// ExtractElementInst from same vector and consecutive indexes.
2409 static const int ScoreConsecutiveExtracts = 4;
2410 /// ExtractElementInst from same vector and reversed indices.
2411 static const int ScoreReversedExtracts = 3;
2412 /// Constants.
2413 static const int ScoreConstants = 2;
2414 /// Instructions with the same opcode.
2415 static const int ScoreSameOpcode = 2;
2416 /// Instructions with alt opcodes (e.g, add + sub).
2417 static const int ScoreAltOpcodes = 1;
2418 /// Identical instructions (a.k.a. splat or broadcast).
2419 static const int ScoreSplat = 1;
2420 /// Matching with an undef is preferable to failing.
2421 static const int ScoreUndef = 1;
2422 /// Score for failing to find a decent match.
2423 static const int ScoreFail = 0;
2424 /// Score if all users are vectorized.
2425 static const int ScoreAllUserVectorized = 1;
2426
2427 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2428 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2429 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2430 /// MainAltOps.
2432 ArrayRef<Value *> MainAltOps) const {
2433 if (!isValidElementType(V1->getType()) ||
2436
2437 if (V1 == V2) {
2438 if (isa<LoadInst>(V1)) {
2439 // Returns true if the users of V1 and V2 won't need to be extracted.
2440 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2441 // Bail out if we have too many uses to save compilation time.
2442 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2443 return false;
2444
2445 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2446 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2447 return U == U1 || U == U2 || R.isVectorized(U);
2448 });
2449 };
2450 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2451 };
2452 // A broadcast of a load can be cheaper on some targets.
2453 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2454 ElementCount::getFixed(NumLanes)) &&
2455 ((int)V1->getNumUses() == NumLanes ||
2456 AllUsersAreInternal(V1, V2)))
2458 }
2460 }
2461
2462 auto CheckSameEntryOrFail = [&]() {
2463 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2465 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2466 !TEs2.empty() &&
2467 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2469 }
2471 };
2472
2473 auto *LI1 = dyn_cast<LoadInst>(V1);
2474 auto *LI2 = dyn_cast<LoadInst>(V2);
2475 if (LI1 && LI2) {
2476 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2477 !LI2->isSimple())
2478 return CheckSameEntryOrFail();
2479
2480 std::optional<int64_t> Dist = getPointersDiff(
2481 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2482 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2483 if (!Dist || *Dist == 0) {
2484 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2485 getUnderlyingObject(LI2->getPointerOperand()) &&
2486 R.TTI->isLegalMaskedGather(
2487 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2489 return CheckSameEntryOrFail();
2490 }
2491 // The distance is too large - still may be profitable to use masked
2492 // loads/gathers.
2493 if (std::abs(*Dist) > NumLanes / 2)
2495 // This still will detect consecutive loads, but we might have "holes"
2496 // in some cases. It is ok for non-power-2 vectorization and may produce
2497 // better results. It should not affect current vectorization.
2500 }
2501
2502 auto *C1 = dyn_cast<Constant>(V1);
2503 auto *C2 = dyn_cast<Constant>(V2);
2504 if (C1 && C2)
2506
2507 // Consider constants and buildvector compatible.
2508 if ((C1 && isa<InsertElementInst>(V2)) ||
2509 (C2 && isa<InsertElementInst>(V1)))
2511
2512 // Extracts from consecutive indexes of the same vector better score as
2513 // the extracts could be optimized away.
2514 Value *EV1;
2515 ConstantInt *Ex1Idx;
2516 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2517 // Undefs are always profitable for extractelements.
2518 // Compiler can easily combine poison and extractelement <non-poison> or
2519 // undef and extractelement <poison>. But combining undef +
2520 // extractelement <non-poison-but-may-produce-poison> requires some
2521 // extra operations.
2522 if (isa<UndefValue>(V2))
2523 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2526 Value *EV2 = nullptr;
2527 ConstantInt *Ex2Idx = nullptr;
2528 if (match(V2,
2530 m_Undef())))) {
2531 // Undefs are always profitable for extractelements.
2532 if (!Ex2Idx)
2534 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2536 if (EV2 == EV1) {
2537 int Idx1 = Ex1Idx->getZExtValue();
2538 int Idx2 = Ex2Idx->getZExtValue();
2539 int Dist = Idx2 - Idx1;
2540 // The distance is too large - still may be profitable to use
2541 // shuffles.
2542 if (std::abs(Dist) == 0)
2544 if (std::abs(Dist) > NumLanes / 2)
2548 }
2550 }
2551 return CheckSameEntryOrFail();
2552 }
2553
2554 auto *I1 = dyn_cast<Instruction>(V1);
2555 auto *I2 = dyn_cast<Instruction>(V2);
2556 if (I1 && I2) {
2557 if (I1->getParent() != I2->getParent())
2558 return CheckSameEntryOrFail();
2559 SmallVector<Value *, 4> Ops(MainAltOps);
2560 Ops.push_back(I1);
2561 Ops.push_back(I2);
2562 InstructionsState S = getSameOpcode(Ops, TLI);
2563 // Note: Only consider instructions with <= 2 operands to avoid
2564 // complexity explosion.
2565 if (S &&
2566 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2567 !S.isAltShuffle()) &&
2568 all_of(Ops, [&S](Value *V) {
2569 return isa<PoisonValue>(V) ||
2570 cast<Instruction>(V)->getNumOperands() ==
2571 S.getMainOp()->getNumOperands();
2572 }))
2573 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2575 }
2576
2577 if (I1 && isa<PoisonValue>(V2))
2579
2580 if (isa<UndefValue>(V2))
2582
2583 return CheckSameEntryOrFail();
2584 }
2585
2586 /// Go through the operands of \p LHS and \p RHS recursively until
2587 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2588 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2589 /// of \p U1 and \p U2), except at the beginning of the recursion where
2590 /// these are set to nullptr.
2591 ///
2592 /// For example:
2593 /// \verbatim
2594 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2595 /// \ / \ / \ / \ /
2596 /// + + + +
2597 /// G1 G2 G3 G4
2598 /// \endverbatim
2599 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2600 /// each level recursively, accumulating the score. It starts from matching
2601 /// the additions at level 0, then moves on to the loads (level 1). The
2602 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2603 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2604 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2605 /// Please note that the order of the operands does not matter, as we
2606 /// evaluate the score of all profitable combinations of operands. In
2607 /// other words the score of G1 and G4 is the same as G1 and G2. This
2608 /// heuristic is based on ideas described in:
2609 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2610 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2611 /// Luís F. W. Góes
2613 Instruction *U2, int CurrLevel,
2614 ArrayRef<Value *> MainAltOps) const {
2615
2616 // Get the shallow score of V1 and V2.
2617 int ShallowScoreAtThisLevel =
2618 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2619
2620 // If reached MaxLevel,
2621 // or if V1 and V2 are not instructions,
2622 // or if they are SPLAT,
2623 // or if they are not consecutive,
2624 // or if profitable to vectorize loads or extractelements, early return
2625 // the current cost.
2626 auto *I1 = dyn_cast<Instruction>(LHS);
2627 auto *I2 = dyn_cast<Instruction>(RHS);
2628 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2629 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2630 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2631 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2633 ShallowScoreAtThisLevel))
2634 return ShallowScoreAtThisLevel;
2635 assert(I1 && I2 && "Should have early exited.");
2636
2637 // Contains the I2 operand indexes that got matched with I1 operands.
2638 SmallSet<unsigned, 4> Op2Used;
2639
2640 // Recursion towards the operands of I1 and I2. We are trying all possible
2641 // operand pairs, and keeping track of the best score.
2642 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2643 OpIdx1 != NumOperands1; ++OpIdx1) {
2644 // Try to pair op1I with the best operand of I2.
2645 int MaxTmpScore = 0;
2646 unsigned MaxOpIdx2 = 0;
2647 bool FoundBest = false;
2648 // If I2 is commutative try all combinations.
2649 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2650 unsigned ToIdx = isCommutative(I2)
2651 ? I2->getNumOperands()
2652 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2653 assert(FromIdx <= ToIdx && "Bad index");
2654 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2655 // Skip operands already paired with OpIdx1.
2656 if (Op2Used.count(OpIdx2))
2657 continue;
2658 // Recursively calculate the cost at each level
2659 int TmpScore =
2660 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2661 I1, I2, CurrLevel + 1, {});
2662 // Look for the best score.
2663 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2664 TmpScore > MaxTmpScore) {
2665 MaxTmpScore = TmpScore;
2666 MaxOpIdx2 = OpIdx2;
2667 FoundBest = true;
2668 }
2669 }
2670 if (FoundBest) {
2671 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2672 Op2Used.insert(MaxOpIdx2);
2673 ShallowScoreAtThisLevel += MaxTmpScore;
2674 }
2675 }
2676 return ShallowScoreAtThisLevel;
2677 }
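 // Illustrative trace for the G1/G2 example above (assuming MaxLevel == 2 and
 // the default score constants): the two additions contribute
 // ScoreSameOpcode == 2 at level 1, and each matched operand pair
 // {A[0],A[1]} and {B[0],B[1]} adds ScoreConsecutiveLoads == 4 at level 2,
 // giving 2 + 4 + 4 = 10, whereas G1 vs. G3 stays at 2 because both operand
 // pairs score ScoreFail.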
2678 };
2679 /// A helper data structure to hold the operands of a vector of instructions.
2680 /// This supports a fixed vector length for all operand vectors.
2682 /// For each operand we need (i) the value, and (ii) the opcode that it
2683 /// would be attached to if the expression was in a left-linearized form.
2684 /// This is required to avoid illegal operand reordering.
2685 /// For example:
2686 /// \verbatim
2687 /// 0 Op1
2688 /// |/
2689 /// Op1 Op2 Linearized + Op2
2690 /// \ / ----------> |/
2691 /// - -
2692 ///
2693 /// Op1 - Op2 (0 + Op1) - Op2
2694 /// \endverbatim
2695 ///
2696 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2697 ///
2698 /// Another way to think of this is to track all the operations across the
2699 /// path from the operand all the way to the root of the tree and to
2700 /// calculate the operation that corresponds to this path. For example, the
2701 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2702 /// corresponding operation is a '-' (which matches the one in the
2703 /// linearized tree, as shown above).
2704 ///
2705 /// For lack of a better term, we refer to this operation as Accumulated
2706 /// Path Operation (APO).
2707 struct OperandData {
2708 OperandData() = default;
2709 OperandData(Value *V, bool APO, bool IsUsed)
2710 : V(V), APO(APO), IsUsed(IsUsed) {}
2711 /// The operand value.
2712 Value *V = nullptr;
2713 /// TreeEntries only allow a single opcode, or an alternate sequence of
2714 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2715 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2716 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2717 /// (e.g., Add/Mul)
2718 bool APO = false;
2719 /// Helper data for the reordering function.
2720 bool IsUsed = false;
2721 };
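 // Illustrative example of the APO above: for (A - B) - C the left-linearized
 // form is ((0 + A) - B) - C, so operand A gets APO == false ('+') while B and
 // C get APO == true ('-').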
2722
2723 /// During operand reordering, we are trying to select the operand at lane
2724 /// that matches best with the operand at the neighboring lane. Our
2725 /// selection is based on the type of value we are looking for. For example,
2726 /// if the neighboring lane has a load, we need to look for a load that is
2727 /// accessing a consecutive address. These strategies are summarized in the
2728 /// 'ReorderingMode' enumerator.
2729 enum class ReorderingMode {
2730 Load, ///< Matching loads to consecutive memory addresses
2731 Opcode, ///< Matching instructions based on opcode (same or alternate)
2732 Constant, ///< Matching constants
2733 Splat, ///< Matching the same instruction multiple times (broadcast)
2734 Failed, ///< We failed to create a vectorizable group
2735 };
2736
2737 using OperandDataVec = SmallVector<OperandData, 2>;
2738
2739 /// A vector of operand vectors.
2741 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2742 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2743 unsigned ArgSize = 0;
2744
2745 const TargetLibraryInfo &TLI;
2746 const DataLayout &DL;
2747 ScalarEvolution &SE;
2748 const BoUpSLP &R;
2749 const Loop *L = nullptr;
2750
2751 /// \returns the operand data at \p OpIdx and \p Lane.
2752 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2753 return OpsVec[OpIdx][Lane];
2754 }
2755
2756 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2757 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2758 return OpsVec[OpIdx][Lane];
2759 }
2760
2761 /// Clears the used flag for all entries.
2762 void clearUsed() {
2763 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2764 OpIdx != NumOperands; ++OpIdx)
2765 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2766 ++Lane)
2767 OpsVec[OpIdx][Lane].IsUsed = false;
2768 }
2769
2770 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2771 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2772 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2773 }
2774
2775 /// \param Lane lane of the operands under analysis.
2776 /// \param OpIdx operand index in \p Lane lane we're looking the best
2777 /// candidate for.
2778 /// \param Idx operand index of the current candidate value.
2779 /// \returns The additional score due to possible broadcasting of the
2780 /// elements in the lane. It is more profitable to have power-of-2 unique
2781 /// elements in the lane, it will be vectorized with higher probability
2782 /// after removing duplicates. Currently the SLP vectorizer supports only
2783 /// vectorization of the power-of-2 number of unique scalars.
2784 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2785 const SmallBitVector &UsedLanes) const {
2786 Value *IdxLaneV = getData(Idx, Lane).V;
2787 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2788 isa<ExtractElementInst>(IdxLaneV))
2789 return 0;
2791 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2792 if (Ln == Lane)
2793 continue;
2794 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2795 if (!isa<Instruction>(OpIdxLnV))
2796 return 0;
2797 Uniques.try_emplace(OpIdxLnV, Ln);
2798 }
2799 unsigned UniquesCount = Uniques.size();
2800 auto IdxIt = Uniques.find(IdxLaneV);
2801 unsigned UniquesCntWithIdxLaneV =
2802 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2803 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2804 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2805 unsigned UniquesCntWithOpIdxLaneV =
2806 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2807 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2808 return 0;
2809 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2810 UniquesCntWithOpIdxLaneV,
2811 UniquesCntWithOpIdxLaneV -
2812 bit_floor(UniquesCntWithOpIdxLaneV)) -
2813 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2814 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2815 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2816 }
2817
2818 /// \param Lane lane of the operands under analysis.
2819 /// \param OpIdx operand index in \p Lane lane we're looking the best
2820 /// candidate for.
2821 /// \param Idx operand index of the current candidate value.
2822 /// \returns The additional score for the scalar which users are all
2823 /// vectorized.
2824 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2825 Value *IdxLaneV = getData(Idx, Lane).V;
2826 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2827 // Do not care about number of uses for vector-like instructions
2828 // (extractelement/extractvalue with constant indices), they are extracts
2829 // themselves and already externally used. Vectorization of such
2830 // instructions does not add extra extractelement instruction, just may
2831 // remove it.
2832 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2833 isVectorLikeInstWithConstOps(OpIdxLaneV))
2835 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2836 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2837 return 0;
2838 return R.areAllUsersVectorized(IdxLaneI)
2840 : 0;
2841 }
2842
2843 /// Score scaling factor for fully compatible instructions but with
2844 /// different number of external uses. Allows better selection of the
2845 /// instructions with less external uses.
2846 static const int ScoreScaleFactor = 10;
2847
2848 /// \Returns the look-ahead score, which tells us how much the sub-trees
2849 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2850 /// score. This helps break ties in an informed way when we cannot decide on
2851 /// the order of the operands by just considering the immediate
2852 /// predecessors.
2853 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2854 int Lane, unsigned OpIdx, unsigned Idx,
2855 bool &IsUsed, const SmallBitVector &UsedLanes) {
2856 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2858 // Keep track of the instruction stack as we recurse into the operands
2859 // during the look-ahead score exploration.
2860 int Score =
2861 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2862 /*CurrLevel=*/1, MainAltOps);
2863 if (Score) {
2864 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2865 if (Score <= -SplatScore) {
2866 // Failed score.
2867 Score = 0;
2868 } else {
2869 Score += SplatScore;
2870 // Scale score to see the difference between different operands
2871 // and similar operands but all vectorized/not all vectorized
2872 // uses. It does not affect actual selection of the best
2873 // compatible operand in general, just allows to select the
2874 // operand with all vectorized uses.
2875 Score *= ScoreScaleFactor;
2876 Score += getExternalUseScore(Lane, OpIdx, Idx);
2877 IsUsed = true;
2878 }
2879 }
2880 return Score;
2881 }
2882
2883 /// Best defined scores per lanes between the passes. Used to choose the
2884 /// best operand (with the highest score) between the passes.
2885 /// The key - {Operand Index, Lane}.
2886 /// The value - the best score between the passes for the lane and the
2887 /// operand.
2889 BestScoresPerLanes;
2890
2891 // Search all operands in Ops[*][Lane] for the one that matches best
2892 // Ops[OpIdx][LastLane] and return its operand index.
2893 // If no good match can be found, return std::nullopt.
2894 std::optional<unsigned>
2895 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2896 ArrayRef<ReorderingMode> ReorderingModes,
2897 ArrayRef<Value *> MainAltOps,
2898 const SmallBitVector &UsedLanes) {
2899 unsigned NumOperands = getNumOperands();
2900
2901 // The operand of the previous lane at OpIdx.
2902 Value *OpLastLane = getData(OpIdx, LastLane).V;
2903
2904 // Our strategy mode for OpIdx.
2905 ReorderingMode RMode = ReorderingModes[OpIdx];
2906 if (RMode == ReorderingMode::Failed)
2907 return std::nullopt;
2908
2909 // The linearized opcode of the operand at OpIdx, Lane.
2910 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2911
2912 // The best operand index and its score.
2913 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2914 // are using the score to differentiate between the two.
2915 struct BestOpData {
2916 std::optional<unsigned> Idx;
2917 unsigned Score = 0;
2918 } BestOp;
2919 BestOp.Score =
2920 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2921 .first->second;
2922
2923 // Track if the operand must be marked as used. If the operand is set to
2924 // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
2925 // want to re-estimate the operands again on the following iterations.
2926 bool IsUsed = RMode == ReorderingMode::Splat ||
2927 RMode == ReorderingMode::Constant ||
2928 RMode == ReorderingMode::Load;
2929 // Iterate through all unused operands and look for the best.
2930 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2931 // Get the operand at Idx and Lane.
2932 OperandData &OpData = getData(Idx, Lane);
2933 Value *Op = OpData.V;
2934 bool OpAPO = OpData.APO;
2935
2936 // Skip already selected operands.
2937 if (OpData.IsUsed)
2938 continue;
2939
2940 // Skip if we are trying to move the operand to a position with a
2941 // different opcode in the linearized tree form. This would break the
2942 // semantics.
2943 if (OpAPO != OpIdxAPO)
2944 continue;
2945
2946 // Look for an operand that matches the current mode.
2947 switch (RMode) {
2948 case ReorderingMode::Load:
2949 case ReorderingMode::Opcode: {
2950 bool LeftToRight = Lane > LastLane;
2951 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2952 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2953 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2954 OpIdx, Idx, IsUsed, UsedLanes);
2955 if (Score > static_cast<int>(BestOp.Score) ||
2956 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2957 Idx == OpIdx)) {
2958 BestOp.Idx = Idx;
2959 BestOp.Score = Score;
2960 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2961 }
2962 break;
2963 }
2964 case ReorderingMode::Constant:
2965 if (isa<Constant>(Op) ||
2966 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2967 BestOp.Idx = Idx;
2968 if (isa<Constant>(Op)) {
2970 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2972 }
2974 IsUsed = false;
2975 }
2976 break;
2977 case ReorderingMode::Splat:
2978 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2979 IsUsed = Op == OpLastLane;
2980 if (Op == OpLastLane) {
2981 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2982 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2984 }
2985 BestOp.Idx = Idx;
2986 }
2987 break;
2988 case ReorderingMode::Failed:
2989 llvm_unreachable("Not expected Failed reordering mode.");
2990 }
2991 }
2992
2993 if (BestOp.Idx) {
2994 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2995 return BestOp.Idx;
2996 }
2997 // If we could not find a good match return std::nullopt.
2998 return std::nullopt;
2999 }
3000
3001 /// Helper for reorderOperandVecs.
3002 /// \returns the lane that we should start reordering from. This is the one
3003 /// which has the least number of operands that can freely move about or
3004 /// is less profitable because it already has the most optimal set of operands.
3005 unsigned getBestLaneToStartReordering() const {
3006 unsigned Min = UINT_MAX;
3007 unsigned SameOpNumber = 0;
3008 // std::pair<unsigned, unsigned> is used to implement a simple voting
3009 // algorithm and choose the lane with the least number of operands that
3010 // can freely move about or less profitable because it already has the
3011 // most optimal set of operands. The first unsigned is a counter for
3012 // voting, the second unsigned is the counter of lanes with instructions
3013 // with same/alternate opcodes and same parent basic block.
3014 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
3015 // Try to be closer to the original results, if we have multiple lanes
3016 // with same cost. If 2 lanes have the same cost, use the one with the
3017 // highest index.
3018 for (int I = getNumLanes(); I > 0; --I) {
3019 unsigned Lane = I - 1;
3020 OperandsOrderData NumFreeOpsHash =
3021 getMaxNumOperandsThatCanBeReordered(Lane);
3022 // Compare the number of operands that can move and choose the one with
3023 // the least number.
3024 if (NumFreeOpsHash.NumOfAPOs < Min) {
3025 Min = NumFreeOpsHash.NumOfAPOs;
3026 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3027 HashMap.clear();
3028 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3029 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3030 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3031 // Select the most optimal lane in terms of number of operands that
3032 // should be moved around.
3033 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3034 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3035 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3036 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3037 auto [It, Inserted] =
3038 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3039 if (!Inserted)
3040 ++It->second.first;
3041 }
3042 }
3043 // Select the lane with the minimum counter.
3044 unsigned BestLane = 0;
3045 unsigned CntMin = UINT_MAX;
3046 for (const auto &Data : reverse(HashMap)) {
3047 if (Data.second.first < CntMin) {
3048 CntMin = Data.second.first;
3049 BestLane = Data.second.second;
3050 }
3051 }
3052 return BestLane;
3053 }
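// A simplified sketch (not part of this pass) of the lane-selection idea
// above: prefer the lane whose operands are most constrained, i.e. the one
// with the fewest operands that may legally change position, breaking ties in
// favor of the higher lane index, just as the loop above scans from the last
// lane down to lane 0. The helper name and its input are hypothetical.
//
//   static unsigned pickStartLane(ArrayRef<unsigned> NumMovableOpsPerLane) {
//     unsigned Best = NumMovableOpsPerLane.size() - 1;
//     for (int Lane = NumMovableOpsPerLane.size() - 1; Lane >= 0; --Lane)
//       if (NumMovableOpsPerLane[Lane] < NumMovableOpsPerLane[Best])
//         Best = Lane;
//     return Best; // E.g. {2, 0, 2, 0} -> lane 3 (ties keep the higher index).
//   }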
3054
3055 /// Data structure that helps to reorder operands.
3056 struct OperandsOrderData {
3057 /// The best number of operands with the same APOs, which can be
3058 /// reordered.
3059 unsigned NumOfAPOs = UINT_MAX;
3060 /// Number of operands with the same/alternate instruction opcode and
3061 /// parent.
3062 unsigned NumOpsWithSameOpcodeParent = 0;
3063 /// Hash for the actual operands ordering.
3064 /// Used to count operands, actually their position id and opcode
3065 /// value. It is used in the voting mechanism to find the lane with the
3066 /// fewest operands that can freely move about, or which is the least profitable
3067 /// because it already has the most optimal set of operands. Could be
3068 /// replaced with a SmallVector<unsigned> instead, but a hash code is faster
3069 /// and requires less memory.
3070 unsigned Hash = 0;
3071 };
3072 /// \returns the maximum number of operands that are allowed to be reordered
3073 /// for \p Lane and the number of compatible instructions (with the same
3074 /// parent/opcode). This is used as a heuristic for selecting the first lane
3075 /// to start operand reordering.
3076 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3077 unsigned CntTrue = 0;
3078 unsigned NumOperands = getNumOperands();
3079 // Operands with the same APO can be reordered. We therefore need to count
3080 // how many of them we have for each APO, like this: Cnt[APO] = x.
3081 // Since we only have two APOs, namely true and false, we can avoid using
3082 // a map. Instead we can simply count the number of operands that
3083 // correspond to one of them (in this case the 'true' APO), and calculate
3084 // the other by subtracting it from the total number of operands.
3085 // Operands with the same instruction opcode and parent are more
3086 // profitable since we don't need to move them in many cases, with a high
3087 // probability such lane already can be vectorized effectively.
3088 bool AllUndefs = true;
3089 unsigned NumOpsWithSameOpcodeParent = 0;
3090 Instruction *OpcodeI = nullptr;
3091 BasicBlock *Parent = nullptr;
3092 unsigned Hash = 0;
3093 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3094 const OperandData &OpData = getData(OpIdx, Lane);
3095 if (OpData.APO)
3096 ++CntTrue;
3097 // Use Boyer-Moore majority voting for finding the majority opcode and
3098 // the number of times it occurs.
3099 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3100 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3101 I->getParent() != Parent) {
3102 if (NumOpsWithSameOpcodeParent == 0) {
3103 NumOpsWithSameOpcodeParent = 1;
3104 OpcodeI = I;
3105 Parent = I->getParent();
3106 } else {
3107 --NumOpsWithSameOpcodeParent;
3108 }
3109 } else {
3110 ++NumOpsWithSameOpcodeParent;
3111 }
3112 }
3113 Hash = hash_combine(
3114 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3115 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3116 }
3117 if (AllUndefs)
3118 return {};
3119 OperandsOrderData Data;
3120 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3121 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3122 Data.Hash = Hash;
3123 return Data;
3124 }
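// The opcode/parent counting above is the classic Boyer-Moore majority vote.
// A self-contained sketch of the same technique on plain integers
// (illustration only, with hypothetical names):
//
//   static int majorityCandidate(ArrayRef<int> Vals) {
//     int Candidate = 0, Count = 0;
//     for (int V : Vals) {
//       if (Count == 0) {
//         Candidate = V;
//         Count = 1;
//       } else if (V == Candidate) {
//         ++Count;
//       } else {
//         --Count;
//       }
//     }
//     // Candidate is the majority element only if one exists; callers that
//     // need certainty must re-count it in a second pass.
//     return Candidate;
//   }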
3125
3126 /// Go through the instructions in VL and append their operands.
3127 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3128 const InstructionsState &S) {
3129 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3130 assert((empty() || all_of(Operands,
3131 [this](const ValueList &VL) {
3132 return VL.size() == getNumLanes();
3133 })) &&
3134 "Expected same number of lanes");
3135 assert(S.valid() && "InstructionsState is invalid.");
3136 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3137 // arguments to the intrinsic produces the same result.
3138 Instruction *MainOp = S.getMainOp();
3139 unsigned NumOperands = MainOp->getNumOperands();
3140 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3141 OpsVec.resize(ArgSize);
3142 unsigned NumLanes = VL.size();
3143 for (OperandDataVec &Ops : OpsVec)
3144 Ops.resize(NumLanes);
3145 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3146 // Our tree has just 3 nodes: the root and two operands.
3147 // It is therefore trivial to get the APO. We only need to check the
3148 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3149 // operand. The LHS operand of both add and sub is never attached to an
3150 // inversese operation in the linearized form, therefore its APO is
3151 // false. The RHS is true only if V is an inverse operation.
3152
3153 // Since operand reordering is performed on groups of commutative
3154 // operations or alternating sequences (e.g., +, -), we can safely tell
3155 // the inverse operations by checking commutativity.
3156 auto *I = dyn_cast<Instruction>(VL[Lane]);
3157 if (!I && isa<PoisonValue>(VL[Lane])) {
3158 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3159 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3160 continue;
3161 }
3162 bool IsInverseOperation = false;
3163 if (S.isCopyableElement(VL[Lane])) {
3164 // The value is a copyable element.
3165 IsInverseOperation =
3166 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3167 } else {
3168 assert(I && "Expected instruction");
3169 auto [SelectedOp, Ops] = convertTo(I, S);
3170 // We cannot check commutativity by the converted instruction
3171 // (SelectedOp) because isCommutative also examines def-use
3172 // relationships.
3173 IsInverseOperation = !isCommutative(SelectedOp, I);
3174 }
3175 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3176 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3177 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3178 }
3179 }
3180 }
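// A small worked example of the APO flags assigned above (illustration only):
// for the two-lane bundle {a0 + b0, a1 - b1} the subtraction is linearized as
// a1 + (-b1), so only its right-hand operand is attached to an inverse
// operation:
//
//   // Lane 0 (add): operand 0 APO = false, operand 1 APO = false
//   // Lane 1 (sub): operand 0 APO = false, operand 1 APO = true
//   bool IsInverse = !I->isCommutative();         // false for add, true for sub
//   bool APO = (OpIdx == 0) ? false : IsInverse;  // mirrors the loop above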
3181
3182 /// \returns the number of operands.
3183 unsigned getNumOperands() const { return ArgSize; }
3184
3185 /// \returns the number of lanes.
3186 unsigned getNumLanes() const { return OpsVec[0].size(); }
3187
3188 /// \returns the operand value at \p OpIdx and \p Lane.
3189 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3190 return getData(OpIdx, Lane).V;
3191 }
3192
3193 /// \returns true if the data structure is empty.
3194 bool empty() const { return OpsVec.empty(); }
3195
3196 /// Clears the data.
3197 void clear() { OpsVec.clear(); }
3198
3199 /// \returns true if there are enough operands identical to \p Op to fill
3200 /// the whole vector (it is mixed with constants or loop invariant values).
3201 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3202 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3203 assert(Op == getValue(OpIdx, Lane) &&
3204 "Op is expected to be getValue(OpIdx, Lane).");
3205 // Small number of loads - try load matching.
3206 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3207 return false;
3208 bool OpAPO = getData(OpIdx, Lane).APO;
3209 bool IsInvariant = L && L->isLoopInvariant(Op);
3210 unsigned Cnt = 0;
3211 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3212 if (Ln == Lane)
3213 continue;
3214 // This is set to true if we found a candidate for broadcast at Lane.
3215 bool FoundCandidate = false;
3216 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3217 OperandData &Data = getData(OpI, Ln);
3218 if (Data.APO != OpAPO || Data.IsUsed)
3219 continue;
3220 Value *OpILane = getValue(OpI, Lane);
3221 bool IsConstantOp = isa<Constant>(OpILane);
3222 // Consider the broadcast candidate if:
3223 // 1. Same value is found in one of the operands.
3224 if (Data.V == Op ||
3225 // 2. The operand in the given lane is not constant but there is a
3226 // constant operand in another lane (which can be moved to the
3227 // given lane). In this case we can represent it as a simple
3228 // permutation of constant and broadcast.
3229 (!IsConstantOp &&
3230 ((Lns > 2 && isa<Constant>(Data.V)) ||
3231 // 2.1. If we have only 2 lanes, we need to check that the value in
3232 // the next lane does not build the same opcode sequence.
3233 (Lns == 2 &&
3234 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3235 isa<Constant>(Data.V)))) ||
3236 // 3. The operand in the current lane is loop invariant (can be
3237 // hoisted out) and another operand is also a loop invariant
3238 // (though not a constant). In this case the whole vector can be
3239 // hoisted out.
3240 // FIXME: need to teach the cost model about this case for better
3241 // estimation.
3242 (IsInvariant && !isa<Constant>(Data.V) &&
3243 !getSameOpcode({Op, Data.V}, TLI) &&
3244 L->isLoopInvariant(Data.V))) {
3245 FoundCandidate = true;
3246 Data.IsUsed = Data.V == Op;
3247 if (Data.V == Op)
3248 ++Cnt;
3249 break;
3250 }
3251 }
3252 if (!FoundCandidate)
3253 return false;
3254 }
3255 return getNumLanes() == 2 || Cnt > 1;
3256 }
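// A compact sketch (not part of this pass) of the simplest broadcast case
// checked above: an operand column is a splat candidate when every other lane
// either already holds the same value or holds a constant that can be permuted
// aside. The helper below is hypothetical and ignores the APO and look-ahead
// details that shouldBroadcast() handles.
//
//   static bool looksLikeSplatColumn(ArrayRef<Value *> Column, Value *Op) {
//     return all_of(Column,
//                   [&](Value *V) { return V == Op || isa<Constant>(V); });
//   }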
3257
3258 /// Checks if there is at least a single operand in lanes other than
3259 /// \p Lane that is compatible with the operand \p Op.
3260 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3261 assert(Op == getValue(OpIdx, Lane) &&
3262 "Op is expected to be getValue(OpIdx, Lane).");
3263 bool OpAPO = getData(OpIdx, Lane).APO;
3264 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3265 if (Ln == Lane)
3266 continue;
3267 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3268 const OperandData &Data = getData(OpI, Ln);
3269 if (Data.APO != OpAPO || Data.IsUsed)
3270 return true;
3271 Value *OpILn = getValue(OpI, Ln);
3272 return (L && L->isLoopInvariant(OpILn)) ||
3273 (getSameOpcode({Op, OpILn}, TLI) &&
3274 allSameBlock({Op, OpILn}));
3275 }))
3276 return true;
3277 }
3278 return false;
3279 }
3280
3281 public:
3282 /// Initialize with all the operands of the instruction vector \p RootVL.
3283 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3284 const InstructionsState &S, const BoUpSLP &R)
3285 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3286 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3287 // Append all the operands of RootVL.
3288 appendOperands(RootVL, Operands, S);
3289 }
3290
3291 /// \returns a value vector with the operands across all lanes for the
3292 /// operand at \p OpIdx.
3293 ValueList getVL(unsigned OpIdx) const {
3294 ValueList OpVL(OpsVec[OpIdx].size());
3295 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3296 "Expected same num of lanes across all operands");
3297 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3298 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3299 return OpVL;
3300 }
3301
3302 // Performs operand reordering for 2 or more operands.
3303 // The original operands are in OrigOps[OpIdx][Lane].
3304 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3305 void reorder() {
3306 unsigned NumOperands = getNumOperands();
3307 unsigned NumLanes = getNumLanes();
3308 // Each operand has its own mode. We are using this mode to help us select
3309 // the instructions for each lane, so that they match best with the ones
3310 // we have selected so far.
3311 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3312
3313 // This is a greedy single-pass algorithm. We are going over each lane
3314 // once and deciding on the best order right away with no back-tracking.
3315 // However, in order to increase its effectiveness, we start with the lane
3316 // that has operands that can move the least. For example, given the
3317 // following lanes:
3318 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3319 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3320 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3321 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3322 // we will start at Lane 1, since the operands of the subtraction cannot
3323 // be reordered. Then we will visit the rest of the lanes in a circular
3324 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3325
3326 // Find the first lane that we will start our search from.
3327 unsigned FirstLane = getBestLaneToStartReordering();
3328
3329 // Initialize the modes.
3330 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3331 Value *OpLane0 = getValue(OpIdx, FirstLane);
3332 // Keep track if we have instructions with all the same opcode on one
3333 // side.
3334 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3335 // Check if OpLane0 should be broadcast.
3336 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3337 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3338 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3339 else if (isa<LoadInst>(OpILane0))
3340 ReorderingModes[OpIdx] = ReorderingMode::Load;
3341 else
3342 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3343 } else if (isa<Constant>(OpLane0)) {
3344 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3345 } else if (isa<Argument>(OpLane0)) {
3346 // Our best hope is a Splat. It may save some cost in some cases.
3347 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3348 } else {
3349 llvm_unreachable("Unexpected value kind.");
3350 }
3351 }
3352
3353 // Check that we don't have the same operands. No need to reorder if the
3354 // operands are just a perfect or shuffled diamond match; however, still
3355 // reorder for possible broadcasts or a non-power-of-2 number of scalars
3356 // (just for now).
3357 auto &&SkipReordering = [this]() {
3358 SmallPtrSet<Value *, 4> UniqueValues;
3359 ArrayRef<OperandData> Op0 = OpsVec.front();
3360 for (const OperandData &Data : Op0)
3361 UniqueValues.insert(Data.V);
3362 for (ArrayRef<OperandData> Op :
3363 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3364 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3365 return !UniqueValues.contains(Data.V);
3366 }))
3367 return false;
3368 }
3369 // TODO: Check if we can remove a check for non-power-2 number of
3370 // scalars after full support of non-power-2 vectorization.
3371 return UniqueValues.size() != 2 &&
3372 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3373 UniqueValues.size());
3374 };
3375
3376 // If the initial strategy fails for any of the operand indexes, then we
3377 // perform reordering again in a second pass. This helps avoid assigning
3378 // high priority to the failed strategy, and should improve reordering for
3379 // the non-failed operand indexes.
3380 for (int Pass = 0; Pass != 2; ++Pass) {
3381 // Check if there is no need to reorder operands because they are a perfect
3382 // or shuffled diamond match.
3383 // Need to do it to avoid extra external use cost counting for
3384 // shuffled matches, which may cause regressions.
3385 if (SkipReordering())
3386 break;
3387 // Skip the second pass if the first pass did not fail.
3388 bool StrategyFailed = false;
3389 // Mark all operand data as free to use.
3390 clearUsed();
3391 // We keep the original operand order for the FirstLane, so reorder the
3392 // rest of the lanes. We are visiting the nodes in a circular fashion,
3393 // using FirstLane as the center point and increasing the radius
3394 // distance.
3395 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3396 for (unsigned I = 0; I < NumOperands; ++I)
3397 MainAltOps[I].push_back(getData(I, FirstLane).V);
3398
3399 SmallBitVector UsedLanes(NumLanes);
3400 UsedLanes.set(FirstLane);
3401 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3402 // Visit the lane on the right and then the lane on the left.
3403 for (int Direction : {+1, -1}) {
3404 int Lane = FirstLane + Direction * Distance;
3405 if (Lane < 0 || Lane >= (int)NumLanes)
3406 continue;
3407 UsedLanes.set(Lane);
3408 int LastLane = Lane - Direction;
3409 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3410 "Out of bounds");
3411 // Look for a good match for each operand.
3412 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3413 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3414 std::optional<unsigned> BestIdx =
3415 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3416 MainAltOps[OpIdx], UsedLanes);
3417 // By not selecting a value, we allow the operands that follow to
3418 // select a better matching value. We will get a non-null value in
3419 // the next run of getBestOperand().
3420 if (BestIdx) {
3421 // Swap the current operand with the one returned by
3422 // getBestOperand().
3423 swap(OpIdx, *BestIdx, Lane);
3424 } else {
3425 // Enable the second pass.
3426 StrategyFailed = true;
3427 }
3428 // Try to get the alternate opcode and follow it during analysis.
3429 if (MainAltOps[OpIdx].size() != 2) {
3430 OperandData &AltOp = getData(OpIdx, Lane);
3431 InstructionsState OpS =
3432 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3433 if (OpS && OpS.isAltShuffle())
3434 MainAltOps[OpIdx].push_back(AltOp.V);
3435 }
3436 }
3437 }
3438 }
3439 // Skip second pass if the strategy did not fail.
3440 if (!StrategyFailed)
3441 break;
3442 }
3443 }
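// The circular visiting order used above, on the 4-lane example from the
// comment with FirstLane == 1: Distance 1 visits lanes 2 and 0, Distance 2
// visits lane 3 (lane -1 is out of range), giving the order 1, 2, 0, 3.
// A standalone sketch of just the ordering (illustration only):
//
//   static SmallVector<int> circularOrder(int FirstLane, int NumLanes) {
//     SmallVector<int> Order{FirstLane};
//     for (int Distance = 1; Distance != NumLanes; ++Distance)
//       for (int Direction : {+1, -1}) {
//         int Lane = FirstLane + Direction * Distance;
//         if (Lane >= 0 && Lane < NumLanes)
//           Order.push_back(Lane);
//       }
//     return Order; // circularOrder(1, 4) == {1, 2, 0, 3}
//   }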
3444
3445#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3446 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3447 switch (RMode) {
3448 case ReorderingMode::Load:
3449 return "Load";
3450 case ReorderingMode::Opcode:
3451 return "Opcode";
3452 case ReorderingMode::Constant:
3453 return "Constant";
3454 case ReorderingMode::Splat:
3455 return "Splat";
3456 case ReorderingMode::Failed:
3457 return "Failed";
3458 }
3459 llvm_unreachable("Unimplemented Reordering Type");
3460 }
3461
3462 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3463 raw_ostream &OS) {
3464 return OS << getModeStr(RMode);
3465 }
3466
3467 /// Debug print.
3468 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3469 printMode(RMode, dbgs());
3470 }
3471
3472 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3473 return printMode(RMode, OS);
3474 }
3475
3476 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3477 const unsigned Indent = 2;
3478 unsigned Cnt = 0;
3479 for (const OperandDataVec &OpDataVec : OpsVec) {
3480 OS << "Operand " << Cnt++ << "\n";
3481 for (const OperandData &OpData : OpDataVec) {
3482 OS.indent(Indent) << "{";
3483 if (Value *V = OpData.V)
3484 OS << *V;
3485 else
3486 OS << "null";
3487 OS << ", APO:" << OpData.APO << "}\n";
3488 }
3489 OS << "\n";
3490 }
3491 return OS;
3492 }
3493
3494 /// Debug print.
3495 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3496#endif
3497 };
3498
3499 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3500 /// of the pair with the highest score, deemed to have the best chance to form
3501 /// the root of a profitable tree to vectorize. Return std::nullopt if no
3502 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
3503 /// limit of the cost, considered to be a good enough score.
3504 std::optional<int>
3505 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3506 int Limit = LookAheadHeuristics::ScoreFail) const {
3507 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3508 RootLookAheadMaxDepth);
3509 int BestScore = Limit;
3510 std::optional<int> Index;
3511 for (int I : seq<int>(0, Candidates.size())) {
3512 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3513 Candidates[I].second,
3514 /*U1=*/nullptr, /*U2=*/nullptr,
3515 /*CurrLevel=*/1, {});
3516 if (Score > BestScore) {
3517 BestScore = Score;
3518 Index = I;
3519 }
3520 }
3521 return Index;
3522 }
3523
3524 /// Checks if the instruction is marked for deletion.
3525 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3526
3527 /// Removes an instruction from its block and eventually deletes it.
3528 /// It's like Instruction::eraseFromParent() except that the actual deletion
3529 /// is delayed until BoUpSLP is destructed.
3530 void eraseInstruction(Instruction *I) {
3531 DeletedInstructions.insert(I);
3532 }
3533
3534 /// Remove instructions from the parent function and clear the operands of \p
3535 /// DeadVals instructions, marking for deletion trivially dead operands.
3536 template <typename T>
3537 void removeInstructionsAndOperands(
3538 ArrayRef<T *> DeadVals,
3539 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3540 SmallVector<WeakTrackingVH> DeadInsts;
3541 for (T *V : DeadVals) {
3542 auto *I = cast<Instruction>(V);
3543 DeletedInstructions.insert(I);
3544 }
3545 DenseSet<Value *> Processed;
3546 for (T *V : DeadVals) {
3547 if (!V || !Processed.insert(V).second)
3548 continue;
3549 auto *I = cast<Instruction>(V);
3550 salvageDebugInfo(*I);
3551 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3552 for (Use &U : I->operands()) {
3553 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3554 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3555 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3556 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3557 return Entry->VectorizedValue == OpI;
3558 })))
3559 DeadInsts.push_back(OpI);
3560 }
3561 I->dropAllReferences();
3562 }
3563 for (T *V : DeadVals) {
3564 auto *I = cast<Instruction>(V);
3565 if (!I->getParent())
3566 continue;
3567 assert((I->use_empty() || all_of(I->uses(),
3568 [&](Use &U) {
3569 return isDeleted(
3570 cast<Instruction>(U.getUser()));
3571 })) &&
3572 "trying to erase instruction with users.");
3573 I->removeFromParent();
3574 SE->forgetValue(I);
3575 }
3576 // Process the dead instruction list until empty.
3577 while (!DeadInsts.empty()) {
3578 Value *V = DeadInsts.pop_back_val();
3579 Instruction *VI = cast_or_null<Instruction>(V);
3580 if (!VI || !VI->getParent())
3581 continue;
3582 assert(isInstructionTriviallyDead(VI, TLI) &&
3583 "Live instruction found in dead worklist!");
3584 assert(VI->use_empty() && "Instructions with uses are not dead.");
3585
3586 // Don't lose the debug info while deleting the instructions.
3587 salvageDebugInfo(*VI);
3588
3589 // Null out all of the instruction's operands to see if any operand
3590 // becomes dead as we go.
3591 for (Use &OpU : VI->operands()) {
3592 Value *OpV = OpU.get();
3593 if (!OpV)
3594 continue;
3595 OpU.set(nullptr);
3596
3597 if (!OpV->use_empty())
3598 continue;
3599
3600 // If the operand is an instruction that became dead as we nulled out
3601 // the operand, and if it is 'trivially' dead, delete it in a future
3602 // loop iteration.
3603 if (auto *OpI = dyn_cast<Instruction>(OpV))
3604 if (!DeletedInstructions.contains(OpI) &&
3605 (!OpI->getType()->isVectorTy() ||
3606 none_of(VectorValuesAndScales,
3607 [&](const std::tuple<Value *, unsigned, bool> &V) {
3608 return std::get<0>(V) == OpI;
3609 })) &&
3610 wouldInstructionBeTriviallyDead(OpI, TLI))
3611 DeadInsts.push_back(OpI);
3612 }
3613
3614 VI->removeFromParent();
3615 eraseInstruction(VI);
3616 SE->forgetValue(VI);
3617 }
3618 }
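// The worklist loop above follows the usual "null out operands, then chase
// values that become dead" pattern (compare RecursivelyDeleteTriviallyDeadInstructions
// in Transforms/Utils/Local). A generic, simplified sketch of that pattern
// (illustration only; the real code additionally consults DeletedInstructions,
// the vectorized values and ScalarEvolution):
//
//   static void deleteDeadWorklist(SmallVectorImpl<WeakTrackingVH> &Worklist,
//                                  const TargetLibraryInfo *TLI) {
//     while (!Worklist.empty()) {
//       Value *V = Worklist.pop_back_val();
//       auto *I = dyn_cast_or_null<Instruction>(V);
//       if (!I || !I->getParent() || !isInstructionTriviallyDead(I, TLI))
//         continue;
//       for (Use &U : I->operands())
//         if (auto *OpI = dyn_cast_or_null<Instruction>(U.get())) {
//           U.set(nullptr);
//           if (OpI->use_empty())
//             Worklist.push_back(OpI); // May have just become dead.
//         }
//       I->eraseFromParent();
//     }
//   }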
3619
3620 /// Checks if the instruction was already analyzed for being possible
3621 /// reduction root.
3622 bool isAnalyzedReductionRoot(Instruction *I) const {
3623 return AnalyzedReductionsRoots.count(I);
3624 }
3625 /// Register given instruction as already analyzed for being possible
3626 /// reduction root.
3627 void analyzedReductionRoot(Instruction *I) {
3628 AnalyzedReductionsRoots.insert(I);
3629 }
3630 /// Checks if the provided list of reduced values was checked already for
3631 /// vectorization.
3632 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3633 return AnalyzedReductionVals.contains(hash_value(VL));
3634 }
3635 /// Adds the list of reduced values to list of already checked values for the
3636 /// vectorization.
3637 void analyzedReductionVals(ArrayRef<Value *> VL) {
3638 AnalyzedReductionVals.insert(hash_value(VL));
3639 }
3640 /// Clear the list of the analyzed reduction root instructions.
3641 void clearReductionData() {
3642 AnalyzedReductionsRoots.clear();
3643 AnalyzedReductionVals.clear();
3644 AnalyzedMinBWVals.clear();
3645 }
3646 /// Checks if the given value is gathered in one of the nodes.
3647 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3648 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3649 }
3650 /// Checks if the given value is gathered in one of the nodes.
3651 bool isGathered(const Value *V) const {
3652 return MustGather.contains(V);
3653 }
3654 /// Checks if the specified value was not scheduled.
3655 bool isNotScheduled(const Value *V) const {
3656 return NonScheduledFirst.contains(V);
3657 }
3658
3659 /// Check if the value is vectorized in the tree.
3660 bool isVectorized(const Value *V) const {
3661 assert(V && "V cannot be nullptr.");
3662 return ScalarToTreeEntries.contains(V);
3663 }
3664
3665 ~BoUpSLP();
3666
3667private:
3668 /// Determine if a node \p E can be demoted to a smaller type with a
3669 /// truncation. We collect the entries that will be demoted in ToDemote.
3670 /// \param E Node for analysis
3671 /// \param ToDemote indices of the nodes to be demoted.
3672 bool collectValuesToDemote(
3673 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3674 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3675 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3676 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3677
3678 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3679 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3680 /// they have only one user and are reorderable).
3681 /// \param ReorderableGathers List of all gather nodes that require reordering
3682 /// (e.g., gathers of extractelements or partially vectorizable loads).
3683 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3684 /// reordering, subset of \p NonVectorized.
3685 void buildReorderableOperands(
3686 TreeEntry *UserTE,
3687 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3688 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3689 SmallVectorImpl<TreeEntry *> &GatherOps);
3690
3691 /// Checks if the given \p TE is a gather node with clustered reused scalars
3692 /// and reorders it per given \p Mask.
3693 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3694
3695 /// Checks if all users of \p I are the part of the vectorization tree.
3696 bool areAllUsersVectorized(
3697 Instruction *I,
3698 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3699
3700 /// Return information about the vector formed for the specified index
3701 /// of a vector of (the same) instruction.
3702 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3703
3704 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3705 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3706 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3707 return const_cast<TreeEntry *>(
3708 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3709 }
3710
3711 /// Gets the root instruction for the given node. If the node is a strided
3712 /// load/store node with the reverse order, the root instruction is the last
3713 /// one.
3714 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3715
3716 /// \returns Cast context for the given graph node.
3717 TargetTransformInfo::CastContextHint
3718 getCastContextHint(const TreeEntry &TE) const;
3719
3720 /// \returns the cost of the vectorizable entry.
3721 InstructionCost getEntryCost(const TreeEntry *E,
3722 ArrayRef<Value *> VectorizedVals,
3723 SmallPtrSetImpl<Value *> &CheckedExtracts);
3724
3725 /// Checks if it is legal and profitable to build SplitVectorize node for the
3726 /// given \p VL.
3727 /// \param Op1 first homogeneous scalars.
3728 /// \param Op2 second homogeneous scalars.
3729 /// \param ReorderIndices indices to reorder the scalars.
3730 /// \returns true if the node was successfully built.
3731 bool canBuildSplitNode(ArrayRef<Value *> VL,
3732 const InstructionsState &LocalState,
3733 SmallVectorImpl<Value *> &Op1,
3734 SmallVectorImpl<Value *> &Op2,
3735 OrdersType &ReorderIndices) const;
3736
3737 /// This is the recursive part of buildTree.
3738 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3739 unsigned InterleaveFactor = 0);
3740
3741 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3742 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3743 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3744 /// returns false, setting \p CurrentOrder to either an empty vector or a
3745 /// non-identity permutation that allows to reuse extract instructions.
3746 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3747 /// extract order.
3748 bool canReuseExtract(ArrayRef<Value *> VL,
3749 SmallVectorImpl<unsigned> &CurrentOrder,
3750 bool ResizeAllowed = false) const;
3751
3752 /// Vectorize a single entry in the tree.
3753 Value *vectorizeTree(TreeEntry *E);
3754
3755 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3756 /// \p E.
3757 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3758
3759 /// Create a new vector from a list of scalar values. Produces a sequence
3760 /// which exploits values reused across lanes, and arranges the inserts
3761 /// for ease of later optimization.
3762 template <typename BVTy, typename ResTy, typename... Args>
3763 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3764
3765 /// Create a new vector from a list of scalar values. Produces a sequence
3766 /// which exploits values reused across lanes, and arranges the inserts
3767 /// for ease of later optimization.
3768 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3769
3770 /// Returns the instruction in the bundle, which can be used as a base point
3771 /// for scheduling. Usually it is the last instruction in the bundle, except
3772 /// for the case when all operands are external (in this case, it is the first
3773 /// instruction in the list).
3774 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3775
3776 /// Tries to find extractelement instructions with constant indices from fixed
3777 /// vector type and gather such instructions into a group, which is highly
3778 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3779 /// was successful, the matched scalars are replaced by poison values in \p VL
3780 /// for future analysis.
3781 std::optional<TargetTransformInfo::ShuffleKind>
3782 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3783 SmallVectorImpl<int> &Mask) const;
3784
3785 /// Tries to find extractelement instructions with constant indices from fixed
3786 /// vector type and gather such instructions into a group, which is highly
3787 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3788 /// was successful, the matched scalars are replaced by poison values in \p VL
3789 /// for future analysis.
3790 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3791 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3792 SmallVectorImpl<int> &Mask,
3793 unsigned NumParts) const;
3794
3795 /// Checks if the gathered \p VL can be represented as a single register
3796 /// shuffle(s) of previous tree entries.
3797 /// \param TE Tree entry checked for permutation.
3798 /// \param VL List of scalars (a subset of the TE scalars), checked for
3799 /// permutations. Must form single-register vector.
3800 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3801 /// commands to build the mask using the original vector value, without
3802 /// relying on the potential reordering.
3803 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3804 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3805 std::optional<TargetTransformInfo::ShuffleKind>
3806 isGatherShuffledSingleRegisterEntry(
3807 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3808 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3809 bool ForOrder);
3810
3811 /// Checks if the gathered \p VL can be represented as multi-register
3812 /// shuffle(s) of previous tree entries.
3813 /// \param TE Tree entry checked for permutation.
3814 /// \param VL List of scalars (a subset of the TE scalars), checked for
3815 /// permutations.
3816 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3817 /// commands to build the mask using the original vector value, without
3818 /// relying on the potential reordering.
3819 /// \returns per-register series of ShuffleKind, if gathered values can be
3820 /// represented as shuffles of previous tree entries. \p Mask is filled with
3821 /// the shuffle mask (also on per-register base).
3822 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3823 isGatherShuffledEntry(
3824 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3825 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3826 unsigned NumParts, bool ForOrder = false);
3827
3828 /// \returns the cost of gathering (inserting) the values in \p VL into a
3829 /// vector.
3830 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3831 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3832 Type *ScalarTy) const;
3833
3834 /// Set the Builder insert point to one after the last instruction in
3835 /// the bundle
3836 void setInsertPointAfterBundle(const TreeEntry *E);
3837
3838 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3839 /// specified, the starting vector value is poison.
3840 Value *
3841 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3842 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3843
3844 /// \returns whether the VectorizableTree is fully vectorizable and will
3845 /// be beneficial even if the tree height is tiny.
3846 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3847
3848 /// Run through the list of all gathered loads in the graph and try to find
3849 /// vector loads/masked gathers instead of regular gathers. Later these loads
3850 /// are reshuffled to build the final gathered nodes.
3851 void tryToVectorizeGatheredLoads(
3852 const SmallMapVector<
3853 std::tuple<BasicBlock *, Value *, Type *>,
3854 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3855 &GatheredLoads);
3856
3857 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3858 /// users of \p TE and collects the stores. It returns the map from the store
3859 /// pointers to the collected stores.
3861 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3862
3863 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3864 /// stores in \p StoresVec can form a vector instruction. If so it returns
3865 /// true and populates \p ReorderIndices with the shuffle indices of the
3866 /// stores when compared to the sorted vector.
3867 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3868 OrdersType &ReorderIndices) const;
3869
3870 /// Iterates through the users of \p TE, looking for scalar stores that can be
3871 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3872 /// their order and builds an order index vector for each store bundle. It
3873 /// returns all these order vectors found.
3874 /// We run this after the tree has formed, otherwise we may come across user
3875 /// instructions that are not yet in the tree.
3876 SmallVector<OrdersType, 1>
3877 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3878
3879 /// Tries to reorder the gathering node for better vectorization
3880 /// opportunities.
3881 void reorderGatherNode(TreeEntry &TE);
3882
3883 class TreeEntry {
3884 public:
3885 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3886 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3887
3888 /// \returns Common mask for reorder indices and reused scalars.
3889 SmallVector<int> getCommonMask() const {
3890 if (State == TreeEntry::SplitVectorize)
3891 return {};
3892 SmallVector<int> Mask;
3893 inversePermutation(ReorderIndices, Mask);
3894 ::addMask(Mask, ReuseShuffleIndices);
3895 return Mask;
3896 }
3897
3898 /// \returns The mask for split nodes.
3899 SmallVector<int> getSplitMask() const {
3900 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3901 "Expected only split vectorize node.");
3902 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3903 unsigned CommonVF = std::max<unsigned>(
3904 CombinedEntriesWithIndices.back().second,
3905 Scalars.size() - CombinedEntriesWithIndices.back().second);
3906 for (auto [Idx, I] : enumerate(ReorderIndices))
3907 Mask[I] =
3908 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3909 ? CommonVF - CombinedEntriesWithIndices.back().second
3910 : 0);
3911 return Mask;
3912 }
3913
3914 /// Updates (reorders) SplitVectorize node according to the given mask \p
3915 /// Mask and order \p MaskOrder.
3916 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3917 ArrayRef<int> MaskOrder);
3918
3919 /// \returns true if the scalars in VL are equal to this entry.
3920 bool isSame(ArrayRef<Value *> VL) const {
3921 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3922 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3923 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3924 return VL.size() == Mask.size() &&
3925 std::equal(VL.begin(), VL.end(), Mask.begin(),
3926 [Scalars](Value *V, int Idx) {
3927 return (isa<UndefValue>(V) &&
3928 Idx == PoisonMaskElem) ||
3929 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3930 });
3931 };
3932 if (!ReorderIndices.empty()) {
3933 // TODO: implement matching if the nodes are just reordered, still can
3934 // treat the vector as the same if the list of scalars matches VL
3935 // directly, without reordering.
3936 SmallVector<int> Mask;
3937 inversePermutation(ReorderIndices, Mask);
3938 if (VL.size() == Scalars.size())
3939 return IsSame(Scalars, Mask);
3940 if (VL.size() == ReuseShuffleIndices.size()) {
3941 ::addMask(Mask, ReuseShuffleIndices);
3942 return IsSame(Scalars, Mask);
3943 }
3944 return false;
3945 }
3946 return IsSame(Scalars, ReuseShuffleIndices);
3947 }
3948
3949 /// \returns true if current entry has same operands as \p TE.
3950 bool hasEqualOperands(const TreeEntry &TE) const {
3951 if (TE.getNumOperands() != getNumOperands())
3952 return false;
3953 SmallBitVector Used(getNumOperands());
3954 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3955 unsigned PrevCount = Used.count();
3956 for (unsigned K = 0; K < E; ++K) {
3957 if (Used.test(K))
3958 continue;
3959 if (getOperand(K) == TE.getOperand(I)) {
3960 Used.set(K);
3961 break;
3962 }
3963 }
3964 // Check if we actually found the matching operand.
3965 if (PrevCount == Used.count())
3966 return false;
3967 }
3968 return true;
3969 }
3970
3971 /// \return Final vectorization factor for the node. Defined by the total
3972 /// number of vectorized scalars, including those used several times in the
3973 /// entry and counted in the \a ReuseShuffleIndices, if any.
3974 unsigned getVectorFactor() const {
3975 if (!ReuseShuffleIndices.empty())
3976 return ReuseShuffleIndices.size();
3977 return Scalars.size();
3978 };
3979
3980 /// Checks if the current node is a gather node.
3981 bool isGather() const { return State == NeedToGather; }
3982
3983 /// A vector of scalars.
3984 ValueList Scalars;
3985
3986 /// The Scalars are vectorized into this value. It is initialized to Null.
3987 WeakTrackingVH VectorizedValue = nullptr;
3988
3989 /// Do we need to gather this sequence or vectorize it
3990 /// (either with vector instruction or with scatter/gather
3991 /// intrinsics for store/load)?
3992 enum EntryState {
3993 Vectorize, ///< The node is regularly vectorized.
3994 ScatterVectorize, ///< Masked scatter/gather node.
3995 StridedVectorize, ///< Strided loads (and stores)
3996 CompressVectorize, ///< (Masked) load with compress.
3997 NeedToGather, ///< Gather/buildvector node.
3998 CombinedVectorize, ///< Vectorized node, combined with its user into more
3999 ///< complex node like select/cmp to minmax, mul/add to
4000 ///< fma, etc. Must be used for the following nodes in
4001 ///< the pattern, not the very first one.
4002 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4003 ///< independently and then combines back.
4004 };
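  // As a rough guide (not a normative mapping), each state corresponds to a
  // different lowering strategy chosen later during vectorization; the sketch
  // below only restates the enum comments above:
  //
  //   static StringRef describe(EntryState S) {
  //     switch (S) {
  //     case Vectorize:         return "regular wide vector instruction";
  //     case ScatterVectorize:  return "masked scatter/gather";
  //     case StridedVectorize:  return "strided load/store";
  //     case CompressVectorize: return "(masked) load with compress";
  //     case NeedToGather:      return "gather/buildvector";
  //     case CombinedVectorize: return "combined node (e.g. minmax, fma)";
  //     case SplitVectorize:    return "two subnodes combined back";
  //     }
  //     llvm_unreachable("covered switch");
  //   }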
4005 EntryState State;
4006
4007 /// List of combined opcodes supported by the vectorizer.
4008 enum CombinedOpcode {
4009 NotCombinedOp = -1,
4010 MinMax = Instruction::OtherOpsEnd + 1,
4011 FMulAdd,
4012 };
4013 CombinedOpcode CombinedOp = NotCombinedOp;
4014
4015 /// Does this sequence require some shuffling?
4016 SmallVector<int, 4> ReuseShuffleIndices;
4017
4018 /// Does this entry require reordering?
4019 SmallVector<unsigned, 4> ReorderIndices;
4020
4021 /// Points back to the VectorizableTree.
4022 ///
4023 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4024 /// to be a pointer and needs to be able to initialize the child iterator.
4025 /// Thus we need a reference back to the container to translate the indices
4026 /// to entries.
4027 VecTreeTy &Container;
4028
4029 /// The TreeEntry index containing the user of this entry.
4030 EdgeInfo UserTreeIndex;
4031
4032 /// The index of this treeEntry in VectorizableTree.
4033 unsigned Idx = 0;
4034
4035 /// For gather/buildvector/alt opcode nodes, which are combined from
4036 /// other nodes as a series of insertvector instructions.
4037 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4038
4039 private:
4040 /// The operands of each instruction in each lane Operands[op_index][lane].
4041 /// Note: This helps avoid the replication of the code that performs the
4042 /// reordering of operands during buildTreeRec() and vectorizeTree().
4043 SmallVector<ValueList, 2> Operands;
4044
4045 /// Copyable elements of the entry node.
4046 SmallPtrSet<const Value *, 4> CopyableElements;
4047
4048 /// MainOp and AltOp are recorded inside. S should be obtained from
4049 /// newTreeEntry.
4050 InstructionsState S = InstructionsState::invalid();
4051
4052 /// Interleaving factor for interleaved loads Vectorize nodes.
4053 unsigned InterleaveFactor = 0;
4054
4055 /// True if the node does not require scheduling.
4056 bool DoesNotNeedToSchedule = false;
4057
4058 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4059 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4060 if (Operands.size() < OpIdx + 1)
4061 Operands.resize(OpIdx + 1);
4062 assert(Operands[OpIdx].empty() && "Already resized?");
4063 assert(OpVL.size() <= Scalars.size() &&
4064 "Number of operands is greater than the number of scalars.");
4065 Operands[OpIdx].resize(OpVL.size());
4066 copy(OpVL, Operands[OpIdx].begin());
4067 }
4068
4069 public:
4070 /// Returns interleave factor for interleave nodes.
4071 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4072 /// Sets interleaving factor for the interleaving nodes.
4073 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4074
4075 /// Marks the node as one that does not require scheduling.
4076 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4077 /// Returns true if the node is marked as one that does not require
4078 /// scheduling.
4079 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4080
4081 /// Set this bundle's operands from \p Operands.
4082 void setOperands(ArrayRef<ValueList> Operands) {
4083 for (unsigned I : seq<unsigned>(Operands.size()))
4084 setOperand(I, Operands[I]);
4085 }
4086
4087 /// Reorders operands of the node to the given mask \p Mask.
4088 void reorderOperands(ArrayRef<int> Mask) {
4089 for (ValueList &Operand : Operands)
4090 reorderScalars(Operand, Mask);
4091 }
4092
4093 /// \returns the \p OpIdx operand of this TreeEntry.
4094 ValueList &getOperand(unsigned OpIdx) {
4095 assert(OpIdx < Operands.size() && "Off bounds");
4096 return Operands[OpIdx];
4097 }
4098
4099 /// \returns the \p OpIdx operand of this TreeEntry.
4100 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4101 assert(OpIdx < Operands.size() && "Off bounds");
4102 return Operands[OpIdx];
4103 }
4104
4105 /// \returns the number of operands.
4106 unsigned getNumOperands() const { return Operands.size(); }
4107
4108 /// \return the single \p OpIdx operand.
4109 Value *getSingleOperand(unsigned OpIdx) const {
4110 assert(OpIdx < Operands.size() && "Off bounds");
4111 assert(!Operands[OpIdx].empty() && "No operand available");
4112 return Operands[OpIdx][0];
4113 }
4114
4115 /// Some of the instructions in the list have alternate opcodes.
4116 bool isAltShuffle() const { return S.isAltShuffle(); }
4117
4118 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4119 return S.getMatchingMainOpOrAltOp(I);
4120 }
4121
4122 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4123 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
4124 /// \p OpValue.
4125 Value *isOneOf(Value *Op) const {
4126 auto *I = dyn_cast<Instruction>(Op);
4127 if (I && getMatchingMainOpOrAltOp(I))
4128 return Op;
4129 return S.getMainOp();
4130 }
4131
4132 void setOperations(const InstructionsState &S) {
4133 assert(S && "InstructionsState is invalid.");
4134 this->S = S;
4135 }
4136
4137 Instruction *getMainOp() const { return S.getMainOp(); }
4138
4139 Instruction *getAltOp() const { return S.getAltOp(); }
4140
4141 /// The main/alternate opcodes for the list of instructions.
4142 unsigned getOpcode() const { return S.getOpcode(); }
4143
4144 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4145
4146 bool hasState() const { return S.valid(); }
4147
4148 /// Add \p V to the list of copyable elements.
4149 void addCopyableElement(Value *V) {
4150 assert(S.isCopyableElement(V) && "Not a copyable element.");
4151 CopyableElements.insert(V);
4152 }
4153
4154 /// Returns true if \p V is a copyable element.
4155 bool isCopyableElement(Value *V) const {
4156 return CopyableElements.contains(V);
4157 }
4158
4159 /// Returns true if any scalar in the list is a copyable element.
4160 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4161
4162 /// Returns the state of the operations.
4163 const InstructionsState &getOperations() const { return S; }
4164
4165 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
4166 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
4167 unsigned findLaneForValue(Value *V) const {
4168 unsigned FoundLane = getVectorFactor();
4169 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4170 std::advance(It, 1)) {
4171 if (*It != V)
4172 continue;
4173 FoundLane = std::distance(Scalars.begin(), It);
4174 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4175 if (!ReorderIndices.empty())
4176 FoundLane = ReorderIndices[FoundLane];
4177 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4178 if (ReuseShuffleIndices.empty())
4179 break;
4180 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4181 RIt != ReuseShuffleIndices.end()) {
4182 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4183 break;
4184 }
4185 }
4186 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4187 return FoundLane;
4188 }
4189
4190 /// Build a shuffle mask for graph entry which represents a merge of main
4191 /// and alternate operations.
4192 void
4193 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4194 SmallVectorImpl<int> &Mask,
4195 SmallVectorImpl<Value *> *OpScalars = nullptr,
4196 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4197
4198 /// Return true if this is a non-power-of-2 node.
4199 bool isNonPowOf2Vec() const {
4200 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4201 return IsNonPowerOf2;
4202 }
4203
4204 /// Return true if this is a node which tries to vectorize a number of
4205 /// elements forming whole vectors.
4206 bool
4207 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4208 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4209 TTI, getValueType(Scalars.front()), Scalars.size());
4210 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4211 "Reshuffling not supported with non-power-of-2 vectors yet.");
4212 return IsNonPowerOf2;
4213 }
4214
4215 Value *getOrdered(unsigned Idx) const {
4216 assert(isGather() && "Must be used only for buildvectors/gathers.");
4217 if (ReorderIndices.empty())
4218 return Scalars[Idx];
4219 SmallVector<int> Mask;
4220 inversePermutation(ReorderIndices, Mask);
4221 return Scalars[Mask[Idx]];
4222 }
4223
4224#ifndef NDEBUG
4225 /// Debug printer.
4226 LLVM_DUMP_METHOD void dump() const {
4227 dbgs() << Idx << ".\n";
4228 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4229 dbgs() << "Operand " << OpI << ":\n";
4230 for (const Value *V : Operands[OpI])
4231 dbgs().indent(2) << *V << "\n";
4232 }
4233 dbgs() << "Scalars: \n";
4234 for (Value *V : Scalars)
4235 dbgs().indent(2) << *V << "\n";
4236 dbgs() << "State: ";
4237 if (S && hasCopyableElements())
4238 dbgs() << "[[Copyable]] ";
4239 switch (State) {
4240 case Vectorize:
4241 if (InterleaveFactor > 0) {
4242 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4243 << "\n";
4244 } else {
4245 dbgs() << "Vectorize\n";
4246 }
4247 break;
4248 case ScatterVectorize:
4249 dbgs() << "ScatterVectorize\n";
4250 break;
4251 case StridedVectorize:
4252 dbgs() << "StridedVectorize\n";
4253 break;
4254 case CompressVectorize:
4255 dbgs() << "CompressVectorize\n";
4256 break;
4257 case NeedToGather:
4258 dbgs() << "NeedToGather\n";
4259 break;
4260 case CombinedVectorize:
4261 dbgs() << "CombinedVectorize\n";
4262 break;
4263 case SplitVectorize:
4264 dbgs() << "SplitVectorize\n";
4265 break;
4266 }
4267 if (S) {
4268 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4269 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4270 } else {
4271 dbgs() << "MainOp: NULL\n";
4272 dbgs() << "AltOp: NULL\n";
4273 }
4274 dbgs() << "VectorizedValue: ";
4275 if (VectorizedValue)
4276 dbgs() << *VectorizedValue << "\n";
4277 else
4278 dbgs() << "NULL\n";
4279 dbgs() << "ReuseShuffleIndices: ";
4280 if (ReuseShuffleIndices.empty())
4281 dbgs() << "Empty";
4282 else
4283 for (int ReuseIdx : ReuseShuffleIndices)
4284 dbgs() << ReuseIdx << ", ";
4285 dbgs() << "\n";
4286 dbgs() << "ReorderIndices: ";
4287 for (unsigned ReorderIdx : ReorderIndices)
4288 dbgs() << ReorderIdx << ", ";
4289 dbgs() << "\n";
4290 dbgs() << "UserTreeIndex: ";
4291 if (UserTreeIndex)
4292 dbgs() << UserTreeIndex;
4293 else
4294 dbgs() << "<invalid>";
4295 dbgs() << "\n";
4296 if (!CombinedEntriesWithIndices.empty()) {
4297 dbgs() << "Combined entries: ";
4298 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4299 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4300 });
4301 dbgs() << "\n";
4302 }
4303 }
4304#endif
4305 };
4306
4307#ifndef NDEBUG
4308 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4309 InstructionCost VecCost, InstructionCost ScalarCost,
4310 StringRef Banner) const {
4311 dbgs() << "SLP: " << Banner << ":\n";
4312 E->dump();
4313 dbgs() << "SLP: Costs:\n";
4314 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4315 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4316 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4317 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4318 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4319 }
4320#endif
4321
4322 /// Create a new gather TreeEntry
4323 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4324 const InstructionsState &S,
4325 const EdgeInfo &UserTreeIdx,
4326 ArrayRef<int> ReuseShuffleIndices = {}) {
4327 auto Invalid = ScheduleBundle::invalid();
4328 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4329 }
4330
4331 /// Create a new VectorizableTree entry.
4332 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4333 const InstructionsState &S,
4334 const EdgeInfo &UserTreeIdx,
4335 ArrayRef<int> ReuseShuffleIndices = {},
4336 ArrayRef<unsigned> ReorderIndices = {},
4337 unsigned InterleaveFactor = 0) {
4338 TreeEntry::EntryState EntryState =
4339 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4340 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4341 ReuseShuffleIndices, ReorderIndices);
4342 if (E && InterleaveFactor > 0)
4343 E->setInterleave(InterleaveFactor);
4344 return E;
4345 }
4346
4347 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4348 TreeEntry::EntryState EntryState,
4349 ScheduleBundle &Bundle, const InstructionsState &S,
4350 const EdgeInfo &UserTreeIdx,
4351 ArrayRef<int> ReuseShuffleIndices = {},
4352 ArrayRef<unsigned> ReorderIndices = {}) {
4353 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4354 EntryState == TreeEntry::SplitVectorize)) ||
4355 (Bundle && EntryState != TreeEntry::NeedToGather &&
4356 EntryState != TreeEntry::SplitVectorize)) &&
4357 "Need to vectorize gather entry?");
4358 // Gathered loads still gathered? Do not create entry, use the original one.
4359 if (GatheredLoadsEntriesFirst.has_value() &&
4360 EntryState == TreeEntry::NeedToGather && S &&
4361 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4362 !UserTreeIdx.UserTE)
4363 return nullptr;
4364 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4365 TreeEntry *Last = VectorizableTree.back().get();
4366 Last->Idx = VectorizableTree.size() - 1;
4367 Last->State = EntryState;
4368 if (UserTreeIdx.UserTE)
4369 OperandsToTreeEntry.try_emplace(
4370 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4371 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4372 // for non-power-of-two vectors.
4373 assert(
4374 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4375 ReuseShuffleIndices.empty()) &&
4376 "Reshuffling scalars not yet supported for nodes with padding");
4377 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4378 ReuseShuffleIndices.end());
4379 if (ReorderIndices.empty()) {
4380 Last->Scalars.assign(VL.begin(), VL.end());
4381 if (S)
4382 Last->setOperations(S);
4383 } else {
4384 // Reorder scalars and build final mask.
4385 Last->Scalars.assign(VL.size(), nullptr);
4386 transform(ReorderIndices, Last->Scalars.begin(),
4387 [VL](unsigned Idx) -> Value * {
4388 if (Idx >= VL.size())
4389 return UndefValue::get(VL.front()->getType());
4390 return VL[Idx];
4391 });
4392 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4393 if (S)
4394 Last->setOperations(S);
4395 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4396 }
4397 if (EntryState == TreeEntry::SplitVectorize) {
4398 assert(S && "Split nodes must have operations.");
4399 Last->setOperations(S);
4400 SmallPtrSet<Value *, 4> Processed;
4401 for (Value *V : VL) {
4402 auto *I = dyn_cast<Instruction>(V);
4403 if (!I)
4404 continue;
4405 auto It = ScalarsInSplitNodes.find(V);
4406 if (It == ScalarsInSplitNodes.end()) {
4407 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4408 (void)Processed.insert(V);
4409 } else if (Processed.insert(V).second) {
4410 assert(!is_contained(It->getSecond(), Last) &&
4411 "Value already associated with the node.");
4412 It->getSecond().push_back(Last);
4413 }
4414 }
4415 } else if (!Last->isGather()) {
4416 if (isa<PHINode>(S.getMainOp()) ||
4417 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4418 (!S.areInstructionsWithCopyableElements() &&
4419 doesNotNeedToSchedule(VL)) ||
4420 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4421 Last->setDoesNotNeedToSchedule();
4422 SmallPtrSet<Value *, 4> Processed;
4423 for (Value *V : VL) {
4424 if (isa<PoisonValue>(V))
4425 continue;
4426 if (S.isCopyableElement(V)) {
4427 Last->addCopyableElement(V);
4428 continue;
4429 }
4430 auto It = ScalarToTreeEntries.find(V);
4431 if (It == ScalarToTreeEntries.end()) {
4432 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4433 (void)Processed.insert(V);
4434 } else if (Processed.insert(V).second) {
4435 assert(!is_contained(It->getSecond(), Last) &&
4436 "Value already associated with the node.");
4437 It->getSecond().push_back(Last);
4438 }
4439 }
4440 // Update the scheduler bundle to point to this TreeEntry.
4441 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4442 "Bundle and VL out of sync");
4443 if (!Bundle.getBundle().empty()) {
4444#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4445 auto *BundleMember = Bundle.getBundle().begin();
4446 SmallPtrSet<Value *, 4> Processed;
4447 for (Value *V : VL) {
4448 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4449 continue;
4450 ++BundleMember;
4451 }
4452 assert(BundleMember == Bundle.getBundle().end() &&
4453 "Bundle and VL out of sync");
4454#endif
4455 Bundle.setTreeEntry(Last);
4456 }
4457 } else {
4458 // Build a map for gathered scalars to the nodes where they are used.
4459 bool AllConstsOrCasts = true;
4460 for (Value *V : VL) {
4461 if (S && S.areInstructionsWithCopyableElements() &&
4462 S.isCopyableElement(V))
4463 Last->addCopyableElement(V);
4464 if (!isConstant(V)) {
4465 auto *I = dyn_cast<CastInst>(V);
4466 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4467 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4468 !UserTreeIdx.UserTE->isGather())
4469 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4470 }
4471 }
4472 if (AllConstsOrCasts)
4473 CastMaxMinBWSizes =
4474 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4475 MustGather.insert_range(VL);
4476 }
4477
4478 if (UserTreeIdx.UserTE)
4479 Last->UserTreeIndex = UserTreeIdx;
4480 return Last;
4481 }
4482
4483 /// -- Vectorization State --
4484 /// Holds all of the tree entries.
4485 TreeEntry::VecTreeTy VectorizableTree;
4486
4487#ifndef NDEBUG
4488 /// Debug printer.
4489 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4490 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4491 VectorizableTree[Id]->dump();
4492 dbgs() << "\n";
4493 }
4494 }
4495#endif
4496
4497 /// Get list of vector entries, associated with the value \p V.
4498 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4499 assert(V && "V cannot be nullptr.");
4500 auto It = ScalarToTreeEntries.find(V);
4501 if (It == ScalarToTreeEntries.end())
4502 return {};
4503 return It->getSecond();
4504 }
4505
4506 /// Get list of split vector entries, associated with the value \p V.
4507 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4508 assert(V && "V cannot be nullptr.");
4509 auto It = ScalarsInSplitNodes.find(V);
4510 if (It == ScalarsInSplitNodes.end())
4511 return {};
4512 return It->getSecond();
4513 }
4514
4515 /// Returns first vector node for value \p V, matching values \p VL.
4516 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4517 bool SameVF = false) const {
4518 assert(V && "V cannot be nullptr.");
4519 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4520 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4521 return TE;
4522 return nullptr;
4523 }
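// Usage sketch (illustrative, not a quote from a caller): querying
//   getSameValuesTreeEntry(VL.front(), VL, /*SameVF=*/true)
// returns an existing node that already vectorizes exactly these scalars with
// the same vector factor, so the graph can reuse it instead of creating a
// duplicate entry.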
4524
4525 /// Checks that the operand node of an alternate node does not generate a
4526 /// buildvector sequence. If it does, it is probably not worth building an
4527 /// alternate shuffle when the number of buildvector operands plus the
4528 /// alternate instruction exceeds the number of buildvector instructions.
4529 /// \param S the instructions state of the analyzed values.
4530 /// \param VL list of the instructions with alternate opcodes.
4531 bool areAltOperandsProfitable(const InstructionsState &S,
4532 ArrayRef<Value *> VL) const;
4533
4534 /// Contains all the outputs of legality analysis for a list of values to
4535 /// vectorize.
4536 class ScalarsVectorizationLegality {
4537 InstructionsState S;
4538 bool IsLegal;
4539 bool TryToFindDuplicates;
4540 bool TrySplitVectorize;
4541
4542 public:
4543 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4544 bool TryToFindDuplicates = true,
4545 bool TrySplitVectorize = false)
4546 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4547 TrySplitVectorize(TrySplitVectorize) {
4548 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4549 "Inconsistent state");
4550 }
4551 const InstructionsState &getInstructionsState() const { return S; };
4552 bool isLegal() const { return IsLegal; }
4553 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4554 bool trySplitVectorize() const { return TrySplitVectorize; }
4555 };
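// Hedged usage sketch (illustrative only; getScalarsVectorizationLegality
// below is the real producer of this object):
//   ScalarsVectorizationLegality SVL = getScalarsVectorizationLegality(
//       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
//   if (!SVL.isLegal()) {
//     // Gather, retry with duplicates removed, or try a split node,
//     // depending on tryToFindDuplicates()/trySplitVectorize().
//   }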
4556
4557 /// Checks if the specified list of the instructions/values can be vectorized
4558 /// in general.
4559 ScalarsVectorizationLegality
4560 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4561 const EdgeInfo &UserTreeIdx,
4562 bool TryCopyableElementsVectorization) const;
4563
4564 /// Checks if the specified list of the instructions/values can be vectorized
4565 /// and fills required data before actual scheduling of the instructions.
4566 TreeEntry::EntryState getScalarsVectorizationState(
4567 const InstructionsState &S, ArrayRef<Value *> VL,
4568 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4569 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4570
4571 /// Maps a specific scalar to its tree entry(ies).
4572 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4573
4574 /// Maps the operand index and entry to the corresponding tree entry.
4575 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4576 OperandsToTreeEntry;
4577
4578 /// Scalars, used in split vectorize nodes.
4579 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4580
4581 /// Maps a value to the proposed vectorizable size.
4582 SmallDenseMap<Value *, unsigned> InstrElementSize;
4583
4584 /// A list of scalars that we found that we need to keep as scalars.
4585 ValueSet MustGather;
4586
4587 /// A set of first non-schedulable values.
4588 ValueSet NonScheduledFirst;
4589
4590 /// A map between the vectorized entries and the last instructions in the
4591 /// bundles. The bundles are built in use order, not in the def order of the
4592 /// instructions. So, we cannot rely directly on the last instruction in the
4593 /// bundle being the last instruction in program order during the
4594 /// vectorization process, since the basic blocks are affected; we need to
4595 /// pre-gather the last instructions beforehand.
4596 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4597
4598 /// Keeps the mapping between the last instructions and their insertion
4599 /// points, which is an instruction-after-the-last-instruction.
4600 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4601
4602 /// List of gather nodes that depend on other gather/vector nodes and should
4603 /// be emitted after the vector instruction emission process to correctly
4604 /// order the vector instructions and shuffles.
4605 SetVector<const TreeEntry *> PostponedGathers;
4606
4607 using ValueToGatherNodesMap =
4608 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4609 ValueToGatherNodesMap ValueToGatherNodes;
4610
4611 /// A list of the load entries (node indices) which could be vectorized using
4612 /// a strided or masked gather approach, but are first attempted to be
4613 /// represented as contiguous loads.
4614 SetVector<unsigned> LoadEntriesToVectorize;
4615
4616 /// true if graph nodes transforming mode is on.
4617 bool IsGraphTransformMode = false;
4618
4619 /// The index of the first gathered load entry in the VectorizeTree.
4620 std::optional<unsigned> GatheredLoadsEntriesFirst;
4621
4622 /// Maps compress entries to their mask data for the final codegen.
4623 SmallDenseMap<const TreeEntry *,
4624 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4625 CompressEntryToData;
4626
4627 /// This POD struct describes one external user in the vectorized tree.
4628 struct ExternalUser {
4629 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4630 : Scalar(S), User(U), E(E), Lane(L) {}
4631
4632 /// Which scalar in our function.
4633 Value *Scalar = nullptr;
4634
4635 /// The user that uses the scalar.
4636 llvm::User *User = nullptr;
4637
4638 /// The vector node this value is part of.
4639 const TreeEntry &E;
4640
4641 /// The lane the scalar belongs to.
4642 unsigned Lane;
4643 };
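// Illustrative example (names invented): if scalar %a is vectorized in lane 2
// of a tree entry but a call outside the tree still uses %a, an
// ExternalUser(%a, TheCall, ThatEntry, /*Lane=*/2) is recorded so that an
// extractelement can be emitted for it after vectorization (see ExternalUses
// below).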
4644 using UserList = SmallVector<ExternalUser, 16>;
4645
4646 /// Checks if two instructions may access the same memory.
4647 ///
4648 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4649 /// is invariant in the calling loop.
4650 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4651 Instruction *Inst2) {
4652 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4653 // First check if the result is already in the cache.
4654 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4655 auto Res = AliasCache.try_emplace(Key);
4656 if (!Res.second)
4657 return Res.first->second;
4658 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4659 // Store the result in the cache.
4660 Res.first->getSecond() = Aliased;
4661 return Aliased;
4662 }
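// Usage sketch (illustrative): while building memory dependencies the
// scheduler queries isAliased(Loc1, Inst1, Inst2) for one fixed memory
// instruction Inst1 against every later memory instruction Inst2 in the
// region; BatchAA plus the (Inst1, Inst2) cache keeps this quadratic scan
// affordable.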
4663
4664 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4665
4666 /// Cache for alias results.
4667 /// TODO: consider moving this to the AliasAnalysis itself.
4668 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4669
4670 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4671 // globally through SLP because we don't perform any action which
4672 // invalidates capture results.
4673 BatchAAResults BatchAA;
4674
4675 /// Temporary store for deleted instructions. Instructions will be deleted
4676 /// eventually when the BoUpSLP is destructed. The deferral is required to
4677 /// ensure that there are no incorrect collisions in the AliasCache, which
4678 /// can happen if a new instruction is allocated at the same address as a
4679 /// previously deleted instruction.
4680 DenseSet<Instruction *> DeletedInstructions;
4681
4682 /// Set of the instruction, being analyzed already for reductions.
4683 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4684
4685 /// Set of hashes for the list of reduction values already being analyzed.
4686 DenseSet<size_t> AnalyzedReductionVals;
4687
4688 /// Values already analyzed for minimal bitwidth and found to be
4689 /// non-profitable.
4690 DenseSet<Value *> AnalyzedMinBWVals;
4691
4692 /// A list of values that need to be extracted out of the tree.
4693 /// This list holds pairs of (Internal Scalar : External User). External User
4694 /// can be nullptr, which means that this Internal Scalar will be used later,
4695 /// after vectorization.
4696 UserList ExternalUses;
4697
4698 /// A list of GEPs which can be replaced by scalar GEPs instead of
4699 /// extractelement instructions.
4700 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4701
4702 /// A list of scalars to be extracted without a specific user because of too
4703 /// many uses.
4704 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4705
4706 /// Values used only by @llvm.assume calls.
4707 SmallPtrSet<const Value *, 32> EphValues;
4708
4709 /// Holds all of the instructions that we gathered, shuffle instructions and
4710 /// extractelements.
4711 SetVector<Instruction *> GatherShuffleExtractSeq;
4712
4713 /// A list of blocks that we are going to CSE.
4714 DenseSet<BasicBlock *> CSEBlocks;
4715
4716 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4717 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4718
4719 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4720 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4721 /// single instruction, while ScheduleBundle represents a batch of
4722 /// instructions that are going to be grouped together. ScheduleCopyableData
4723 /// models an extra user for "copyable" instructions.
4724 class ScheduleEntity {
4725 friend class ScheduleBundle;
4726 friend class ScheduleData;
4727 friend class ScheduleCopyableData;
4728
4729 protected:
4730 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4731 Kind getKind() const { return K; }
4732 ScheduleEntity(Kind K) : K(K) {}
4733
4734 private:
4735 /// Used for getting a "good" final ordering of instructions.
4736 int SchedulingPriority = 0;
4737 /// True if this instruction (or bundle) is scheduled (or considered as
4738 /// scheduled in the dry-run).
4739 bool IsScheduled = false;
4740 /// The kind of the ScheduleEntity.
4741 const Kind K = Kind::ScheduleData;
4742
4743 public:
4744 ScheduleEntity() = delete;
4745 /// Gets/sets the scheduling priority.
4746 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4747 int getSchedulingPriority() const { return SchedulingPriority; }
4748 bool isReady() const {
4749 if (const auto *SD = dyn_cast<ScheduleData>(this))
4750 return SD->isReady();
4751 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4752 return CD->isReady();
4753 return cast<ScheduleBundle>(this)->isReady();
4754 }
4755 /// Returns true if the dependency information has been calculated.
4756 /// Note that dependency validity can vary between instructions within
4757 /// a single bundle.
4758 bool hasValidDependencies() const {
4759 if (const auto *SD = dyn_cast<ScheduleData>(this))
4760 return SD->hasValidDependencies();
4761 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4762 return CD->hasValidDependencies();
4763 return cast<ScheduleBundle>(this)->hasValidDependencies();
4764 }
4765 /// Gets the number of unscheduled dependencies.
4766 int getUnscheduledDeps() const {
4767 if (const auto *SD = dyn_cast<ScheduleData>(this))
4768 return SD->getUnscheduledDeps();
4769 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4770 return CD->getUnscheduledDeps();
4771 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4772 }
4773 /// Increments the number of unscheduled dependencies.
4774 int incrementUnscheduledDeps(int Incr) {
4775 if (auto *SD = dyn_cast<ScheduleData>(this))
4776 return SD->incrementUnscheduledDeps(Incr);
4777 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4778 }
4779 /// Gets the number of dependencies.
4780 int getDependencies() const {
4781 if (const auto *SD = dyn_cast<ScheduleData>(this))
4782 return SD->getDependencies();
4783 return cast<ScheduleCopyableData>(this)->getDependencies();
4784 }
4785 /// Gets the instruction.
4786 Instruction *getInst() const {
4787 if (const auto *SD = dyn_cast<ScheduleData>(this))
4788 return SD->getInst();
4789 return cast<ScheduleCopyableData>(this)->getInst();
4790 }
4791
4792 /// Gets/sets if the bundle is scheduled.
4793 bool isScheduled() const { return IsScheduled; }
4794 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4795
4796 static bool classof(const ScheduleEntity *) { return true; }
4797
4798#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4799 void dump(raw_ostream &OS) const {
4800 if (const auto *SD = dyn_cast<ScheduleData>(this))
4801 return SD->dump(OS);
4802 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4803 return CD->dump(OS);
4804 return cast<ScheduleBundle>(this)->dump(OS);
4805 }
4806
4807 LLVM_DUMP_METHOD void dump() const {
4808 dump(dbgs());
4809 dbgs() << '\n';
4810 }
4811#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4812 };
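// Note: ScheduleEntity and its subclasses use LLVM-style RTTI (getKind() plus
// classof()), so callers dispatch with isa<>/dyn_cast<> exactly as the
// accessors above do, e.g. (sketch) if (auto *SD = dyn_cast<ScheduleData>(E)).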
4813
4814#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4815 friend inline raw_ostream &operator<<(raw_ostream &OS,
4816 const BoUpSLP::ScheduleEntity &SE) {
4817 SE.dump(OS);
4818 return OS;
4819 }
4820#endif
4821
4822 /// Contains all scheduling relevant data for an instruction.
4823 /// A ScheduleData either represents a single instruction or a member of an
4824 /// instruction bundle (= a group of instructions which is combined into a
4825 /// vector instruction).
4826 class ScheduleData final : public ScheduleEntity {
4827 public:
4828 // The initial value for the dependency counters. It means that the
4829 // dependencies are not calculated yet.
4830 enum { InvalidDeps = -1 };
4831
4832 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4833 static bool classof(const ScheduleEntity *Entity) {
4834 return Entity->getKind() == Kind::ScheduleData;
4835 }
4836
4837 void init(int BlockSchedulingRegionID, Instruction *I) {
4838 NextLoadStore = nullptr;
4839 IsScheduled = false;
4840 SchedulingRegionID = BlockSchedulingRegionID;
4841 clearDependencies();
4842 Inst = I;
4843 }
4844
4845 /// Verify basic self consistency properties
4846 void verify() {
4847 if (hasValidDependencies()) {
4848 assert(UnscheduledDeps <= Dependencies && "invariant");
4849 } else {
4850 assert(UnscheduledDeps == Dependencies && "invariant");
4851 }
4852
4853 if (IsScheduled) {
4854 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4855 "unexpected scheduled state");
4856 }
4857 }
4858
4859 /// Returns true if the dependency information has been calculated.
4860 /// Note that dependency validity can vary between instructions within
4861 /// a single bundle.
4862 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4863
4864 /// Returns true if it is ready for scheduling, i.e. it has no more
4865 /// unscheduled depending instructions/bundles.
4866 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4867
4868 /// Modifies the number of unscheduled dependencies for this instruction,
4869 /// and returns the number of remaining dependencies for the containing
4870 /// bundle.
4871 int incrementUnscheduledDeps(int Incr) {
4872 assert(hasValidDependencies() &&
4873 "increment of unscheduled deps would be meaningless");
4874 UnscheduledDeps += Incr;
4875 assert(UnscheduledDeps >= 0 &&
4876 "Expected valid number of unscheduled deps");
4877 return UnscheduledDeps;
4878 }
4879
4880 /// Sets the number of unscheduled dependencies to the number of
4881 /// dependencies.
4882 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4883
4884 /// Clears all dependency information.
4885 void clearDependencies() {
4886 clearDirectDependencies();
4887 MemoryDependencies.clear();
4888 ControlDependencies.clear();
4889 }
4890
4891 /// Clears all direct dependencies only, except for control and memory
4892 /// dependencies.
4893 /// Required for copyable elements to correctly handle control/memory deps
4894 /// and avoid extra recalculation of such deps.
4895 void clearDirectDependencies() {
4896 Dependencies = InvalidDeps;
4897 resetUnscheduledDeps();
4898 IsScheduled = false;
4899 }
4900
4901 /// Gets the number of unscheduled dependencies.
4902 int getUnscheduledDeps() const { return UnscheduledDeps; }
4903 /// Gets the number of dependencies.
4904 int getDependencies() const { return Dependencies; }
4905 /// Initializes the number of dependencies.
4906 void initDependencies() { Dependencies = 0; }
4907 /// Increments the number of dependencies.
4908 void incDependencies() { Dependencies++; }
4909
4910 /// Gets scheduling region ID.
4911 int getSchedulingRegionID() const { return SchedulingRegionID; }
4912
4913 /// Gets the instruction.
4914 Instruction *getInst() const { return Inst; }
4915
4916 /// Gets the list of memory dependencies.
4917 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4918 return MemoryDependencies;
4919 }
4920 /// Adds a memory dependency.
4921 void addMemoryDependency(ScheduleData *Dep) {
4922 MemoryDependencies.push_back(Dep);
4923 }
4924 /// Gets the list of control dependencies.
4925 ArrayRef<ScheduleData *> getControlDependencies() const {
4926 return ControlDependencies;
4927 }
4928 /// Adds a control dependency.
4929 void addControlDependency(ScheduleData *Dep) {
4930 ControlDependencies.push_back(Dep);
4931 }
4932 /// Gets/sets the next load/store instruction in the block.
4933 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4934 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4935
4936 void dump(raw_ostream &OS) const { OS << *Inst; }
4937
4938 LLVM_DUMP_METHOD void dump() const {
4939 dump(dbgs());
4940 dbgs() << '\n';
4941 }
4942
4943 private:
4944 Instruction *Inst = nullptr;
4945
4946 /// Single linked list of all memory instructions (e.g. load, store, call)
4947 /// in the block - until the end of the scheduling region.
4948 ScheduleData *NextLoadStore = nullptr;
4949
4950 /// The dependent memory instructions.
4951 /// This list is derived on demand in calculateDependencies().
4952 SmallVector<ScheduleData *> MemoryDependencies;
4953
4954 /// List of instructions which this instruction could be control dependent
4955 /// on. Allowing such nodes to be scheduled below this one could introduce
4956 /// a runtime fault which didn't exist in the original program.
4957 /// ex: this is a load or udiv following a readonly call which inf loops
4958 SmallVector<ScheduleData *> ControlDependencies;
4959
4960 /// This ScheduleData is in the current scheduling region if this matches
4961 /// the current SchedulingRegionID of BlockScheduling.
4962 int SchedulingRegionID = 0;
4963
4964 /// The number of dependencies. Consists of the number of users of the
4965 /// instruction plus the number of dependent memory instructions (if any).
4966 /// This value is calculated on demand.
4967 /// If InvalidDeps, the number of dependencies is not calculated yet.
4968 int Dependencies = InvalidDeps;
4969
4970 /// The number of dependencies minus the number of dependencies of scheduled
4971 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4972 /// for scheduling.
4973 /// Note that this is negative as long as Dependencies is not calculated.
4974 int UnscheduledDeps = InvalidDeps;
4975 };
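// Counter life cycle (illustrative, numbers invented): after dependency
// calculation an instruction with 3 users and 1 memory dependence has
// Dependencies == 4 and UnscheduledDeps == 4; each time one of those
// dependents is scheduled, incrementUnscheduledDeps(-1) is called, and the
// node becomes ready once UnscheduledDeps reaches 0.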
4976
4977#ifndef NDEBUG
4978 friend inline raw_ostream &operator<<(raw_ostream &OS,
4979 const BoUpSLP::ScheduleData &SD) {
4980 SD.dump(OS);
4981 return OS;
4982 }
4983#endif
4984
4985 class ScheduleBundle final : public ScheduleEntity {
4986 /// The schedule data for the instructions in the bundle.
4987 SmallVector<ScheduleEntity *> Bundle;
4988 /// True if this bundle is valid.
4989 bool IsValid = true;
4990 /// The TreeEntry that this instruction corresponds to.
4991 TreeEntry *TE = nullptr;
4992 ScheduleBundle(bool IsValid)
4993 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4994
4995 public:
4996 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4997 static bool classof(const ScheduleEntity *Entity) {
4998 return Entity->getKind() == Kind::ScheduleBundle;
4999 }
5000
5001 /// Verify basic self consistency properties
5002 void verify() const {
5003 for (const ScheduleEntity *SD : Bundle) {
5004 if (SD->hasValidDependencies()) {
5005 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5006 "invariant");
5007 } else {
5008 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5009 "invariant");
5010 }
5011
5012 if (isScheduled()) {
5013 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5014 "unexpected scheduled state");
5015 }
5016 }
5017 }
5018
5019 /// Returns the number of unscheduled dependencies in the bundle.
5020 int unscheduledDepsInBundle() const {
5021 assert(*this && "bundle must not be empty");
5022 int Sum = 0;
5023 for (const ScheduleEntity *BundleMember : Bundle) {
5024 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5025 return ScheduleData::InvalidDeps;
5026 Sum += BundleMember->getUnscheduledDeps();
5027 }
5028 return Sum;
5029 }
5030
5031 /// Returns true if the dependency information has been calculated.
5032 /// Note that dependency validity can vary between instructions within
5033 /// a single bundle.
5034 bool hasValidDependencies() const {
5035 return all_of(Bundle, [](const ScheduleEntity *SD) {
5036 return SD->hasValidDependencies();
5037 });
5038 }
5039
5040 /// Returns true if it is ready for scheduling, i.e. it has no more
5041 /// unscheduled depending instructions/bundles.
5042 bool isReady() const {
5043 assert(*this && "bundle must not be empty");
5044 return unscheduledDepsInBundle() == 0 && !isScheduled();
5045 }
5046
5047 /// Returns the bundle of scheduling data, associated with the current
5048 /// instruction.
5049 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5050 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5051 /// Adds an instruction to the bundle.
5052 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5053
5054 /// Gets/sets the associated tree entry.
5055 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5056 TreeEntry *getTreeEntry() const { return TE; }
5057
5058 static ScheduleBundle invalid() { return {false}; }
5059
5060 operator bool() const { return IsValid; }
5061
5062#ifndef NDEBUG
5063 void dump(raw_ostream &OS) const {
5064 if (!*this) {
5065 OS << "[]";
5066 return;
5067 }
5068 OS << '[';
5069 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5071 OS << "<Copyable>";
5072 OS << *SD->getInst();
5073 });
5074 OS << ']';
5075 }
5076
5077 LLVM_DUMP_METHOD void dump() const {
5078 dump(dbgs());
5079 dbgs() << '\n';
5080 }
5081#endif // NDEBUG
5082 };
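// Illustrative: a bundle is ready only when the sum of its members'
// unscheduled dependencies is zero; e.g. a two-member bundle whose members
// report 1 and 0 unscheduled deps is not ready yet, and any member still at
// InvalidDeps makes the whole bundle report InvalidDeps.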
5083
5084#ifndef NDEBUG
5085 friend inline raw_ostream &operator<<(raw_ostream &OS,
5086 const BoUpSLP::ScheduleBundle &Bundle) {
5087 Bundle.dump(OS);
5088 return OS;
5089 }
5090#endif
5091
5092 /// Contains all scheduling relevant data for the copyable instruction.
5093 /// It models the virtual instructions that are supposed to replace the
5094 /// original instructions. E.g., if instruction %0 = load is part of the
5095 /// bundle [%0, %1], where %1 = add, then the ScheduleCopyableData models the
5096 /// virtual instruction %virt = add %0, 0.
5097 class ScheduleCopyableData final : public ScheduleEntity {
5098 /// The source schedule data for the instruction.
5099 Instruction *Inst = nullptr;
5100 /// The edge information for the instruction.
5101 const EdgeInfo EI;
5102 /// This ScheduleData is in the current scheduling region if this matches
5103 /// the current SchedulingRegionID of BlockScheduling.
5104 int SchedulingRegionID = 0;
5105 /// Bundle, this data is part of.
5106 ScheduleBundle &Bundle;
5107
5108 public:
5109 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5110 const EdgeInfo &EI, ScheduleBundle &Bundle)
5111 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5112 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5113 static bool classof(const ScheduleEntity *Entity) {
5114 return Entity->getKind() == Kind::ScheduleCopyableData;
5115 }
5116
5117 /// Verify basic self consistency properties
5118 void verify() {
5119 if (hasValidDependencies()) {
5120 assert(UnscheduledDeps <= Dependencies && "invariant");
5121 } else {
5122 assert(UnscheduledDeps == Dependencies && "invariant");
5123 }
5124
5125 if (IsScheduled) {
5126 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5127 "unexpected scheduled state");
5128 }
5129 }
5130
5131 /// Returns true if the dependency information has been calculated.
5132 /// Note that dependency validity can vary between instructions within
5133 /// a single bundle.
5134 bool hasValidDependencies() const {
5135 return Dependencies != ScheduleData::InvalidDeps;
5136 }
5137
5138 /// Returns true if it is ready for scheduling, i.e. it has no more
5139 /// unscheduled depending instructions/bundles.
5140 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5141
5142 /// Modifies the number of unscheduled dependencies for this instruction,
5143 /// and returns the number of remaining dependencies for the containing
5144 /// bundle.
5145 int incrementUnscheduledDeps(int Incr) {
5146 assert(hasValidDependencies() &&
5147 "increment of unscheduled deps would be meaningless");
5148 UnscheduledDeps += Incr;
5149 assert(UnscheduledDeps >= 0 && "invariant");
5150 return UnscheduledDeps;
5151 }
5152
5153 /// Sets the number of unscheduled dependencies to the number of
5154 /// dependencies.
5155 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5156
5157 /// Gets the number of unscheduled dependencies.
5158 int getUnscheduledDeps() const { return UnscheduledDeps; }
5159 /// Gets the number of dependencies.
5160 int getDependencies() const { return Dependencies; }
5161 /// Initializes the number of dependencies.
5162 void initDependencies() { Dependencies = 0; }
5163 /// Increments the number of dependencies.
5164 void incDependencies() { Dependencies++; }
5165
5166 /// Gets scheduling region ID.
5167 int getSchedulingRegionID() const { return SchedulingRegionID; }
5168
5169 /// Gets the instruction.
5170 Instruction *getInst() const { return Inst; }
5171
5172 /// Clears all dependency information.
5173 void clearDependencies() {
5174 Dependencies = ScheduleData::InvalidDeps;
5175 UnscheduledDeps = ScheduleData::InvalidDeps;
5176 IsScheduled = false;
5177 }
5178
5179 /// Gets the edge information.
5180 const EdgeInfo &getEdgeInfo() const { return EI; }
5181
5182 /// Gets the bundle.
5183 ScheduleBundle &getBundle() { return Bundle; }
5184 const ScheduleBundle &getBundle() const { return Bundle; }
5185
5186#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5187 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5188
5189 LLVM_DUMP_METHOD void dump() const {
5190 dump(dbgs());
5191 dbgs() << '\n';
5192 }
5193#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5194
5195 private:
5196 /// The number of dependencies; InvalidDeps if not yet calculated. These
5197 /// nodes always have only a single dependency.
5198 int Dependencies = ScheduleData::InvalidDeps;
5199
5200 /// The number of dependencies minus the number of dependencies of scheduled
5201 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5202 /// for scheduling.
5203 /// Note that this is negative as long as Dependencies is not calculated.
5204 int UnscheduledDeps = ScheduleData::InvalidDeps;
5205 };
5206
5207#ifndef NDEBUG
5208 friend inline raw_ostream &
5209 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5210 SD.dump(OS);
5211 return OS;
5212 }
5213#endif
5214
5215 friend struct GraphTraits<BoUpSLP *>;
5216 friend struct DOTGraphTraits<BoUpSLP *>;
5217
5218 /// Contains all scheduling data for a basic block.
5219 /// It does not schedule instructions that are not memory read/write
5220 /// instructions and whose operands are either constants, or arguments, or
5221 /// phis, or instructions from other blocks, or whose users are phis or from
5222 /// other blocks. The resulting vector instructions can be placed at the
5223 /// beginning of the basic block without scheduling (if the operands do not
5224 /// need to be scheduled) or at the end of the block (if the users are outside
5225 /// of the block). This saves some compile time and memory used by the
5226 /// compiler.
5227 /// ScheduleData is assigned to each instruction in between the boundaries of
5228 /// the tree entry, even for those which are not part of the graph. It is
5229 /// required to correctly follow the dependencies between the instructions and
5230 /// to schedule them correctly. ScheduleData is not allocated for
5231 /// instructions which do not require scheduling, like phis, nodes with only
5232 /// extractelements/insertelements, or nodes whose instructions have
5233 /// uses/operands outside of the block.
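/// Illustrative example: if only a chain of adds feeding a store is
/// vectorized in a block, the loads and calls interleaved between those adds
/// still get ScheduleData in the region, so moving the bundled instructions
/// together cannot step over a conflicting memory or control dependence.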
5234 struct BlockScheduling {
5235 BlockScheduling(BasicBlock *BB)
5236 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5237
5238 void clear() {
5239 ScheduledBundles.clear();
5240 ScheduledBundlesList.clear();
5241 ScheduleCopyableDataMap.clear();
5242 ScheduleCopyableDataMapByInst.clear();
5243 ScheduleCopyableDataMapByInstUser.clear();
5244 ScheduleCopyableDataMapByUsers.clear();
5245 ReadyInsts.clear();
5246 ScheduleStart = nullptr;
5247 ScheduleEnd = nullptr;
5248 FirstLoadStoreInRegion = nullptr;
5249 LastLoadStoreInRegion = nullptr;
5250 RegionHasStackSave = false;
5251
5252 // Reduce the maximum schedule region size by the size of the
5253 // previous scheduling run.
5254 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5255 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5256 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5257 ScheduleRegionSize = 0;
5258
5259 // Make a new scheduling region, i.e. all existing ScheduleData is not
5260 // in the new region yet.
5261 ++SchedulingRegionID;
5262 }
5263
5264 ScheduleData *getScheduleData(Instruction *I) {
5265 if (!I)
5266 return nullptr;
5267 if (BB != I->getParent())
5268 // Avoid lookup if can't possibly be in map.
5269 return nullptr;
5270 ScheduleData *SD = ScheduleDataMap.lookup(I);
5271 if (SD && isInSchedulingRegion(*SD))
5272 return SD;
5273 return nullptr;
5274 }
5275
5276 ScheduleData *getScheduleData(Value *V) {
5277 return getScheduleData(dyn_cast<Instruction>(V));
5278 }
5279
5280 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5281 /// operand number) and value.
5282 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5283 const Value *V) const {
5284 if (ScheduleCopyableDataMap.empty())
5285 return nullptr;
5286 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5287 if (It == ScheduleCopyableDataMap.end())
5288 return nullptr;
5289 ScheduleCopyableData *SD = It->getSecond().get();
5290 if (!isInSchedulingRegion(*SD))
5291 return nullptr;
5292 return SD;
5293 }
5294
5295 /// Returns the ScheduleCopyableData for the given user \p User, operand
5296 /// number and operand \p V.
5297 SmallVector<ScheduleCopyableData *>
5298 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5299 const Value *V) {
5300 if (ScheduleCopyableDataMapByInstUser.empty())
5301 return {};
5302 const auto It = ScheduleCopyableDataMapByInstUser.find(
5303 std::make_pair(std::make_pair(User, OperandIdx), V));
5304 if (It == ScheduleCopyableDataMapByInstUser.end())
5305 return {};
5306 SmallVector<ScheduleCopyableData *> Res;
5307 for (ScheduleCopyableData *SD : It->getSecond()) {
5308 if (isInSchedulingRegion(*SD))
5309 Res.push_back(SD);
5310 }
5311 return Res;
5312 }
5313
5314 /// Returns true if all operands of the given instruction \p User are
5315 /// replaced by copyable data.
5316 /// \param User The user instruction.
5317 /// \param Op The operand, which might be replaced by the copyable data.
5318 /// \param SLP The SLP tree.
5319 /// \param NumOps The number of operands used. If the instruction uses the
5320 /// same operand several times, check for the first use, then the second,
5321 /// etc.
5322 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5323 Instruction *Op, BoUpSLP &SLP,
5324 unsigned NumOps) const {
5325 assert(NumOps > 0 && "No operands");
5326 if (ScheduleCopyableDataMap.empty())
5327 return false;
5328 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5329 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5330 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5331 if (Entries.empty())
5332 return false;
5333 for (const Use &U : User->operands()) {
5334 if (U.get() != Op)
5335 continue;
5336 // Check all tree entries, if they have operands replaced by copyable
5337 // data.
5338 for (TreeEntry *TE : Entries) {
5339 unsigned Inc = 0;
5340 bool IsNonSchedulableWithParentPhiNode =
5341 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5342 TE->UserTreeIndex.UserTE->hasState() &&
5343 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5344 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5345 // Count the number of unique phi nodes in the parent (user) entry
5346 // and exit once all the unique phis are processed.
5347 if (IsNonSchedulableWithParentPhiNode) {
5348 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5349 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5350 for (Value *V : ParentTE->Scalars) {
5351 auto *PHI = dyn_cast<PHINode>(V);
5352 if (!PHI)
5353 continue;
5354 if (ParentsUniqueUsers.insert(PHI).second &&
5355 is_contained(PHI->incoming_values(), User))
5356 ++Inc;
5357 }
5358 } else {
5359 Inc = 1;
5360 }
5361
5362 // Check if the user is commutative.
5363 // The commutatives are handled later, as their operands can be
5364 // reordered.
5365 // Same applies even for non-commutative cmps, because we can invert
5366 // their predicate potentially and, thus, reorder the operands.
5367 bool IsCommutativeUser =
5368 ::isCommutative(User) ||
5369 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5370 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5371 unsigned &OpCnt =
5372 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5373 EdgeInfo EI(TE, U.getOperandNo());
5374 if (!getScheduleCopyableData(EI, Op))
5375 continue;
5376 // Found copyable operand - continue.
5377 OpCnt += Inc;
5378 continue;
5379 }
5380 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5381 .first->getSecond() += Inc;
5382 }
5383 }
5384 if (PotentiallyReorderedEntriesCount.empty())
5385 return all_of(OrderedEntriesCount,
5386 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5387 return P.second == NumOps;
5388 });
5389 // Check the commutative/cmp entries.
5390 for (auto &P : PotentiallyReorderedEntriesCount) {
5391 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5392 bool IsNonSchedulableWithParentPhiNode =
5393 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5394 P.first->UserTreeIndex.UserTE->hasState() &&
5395 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5396 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5397 auto *It = find(P.first->Scalars, User);
5398 do {
5399 assert(It != P.first->Scalars.end() &&
5400 "User is not in the tree entry");
5401 int Lane = std::distance(P.first->Scalars.begin(), It);
5402 assert(Lane >= 0 && "Lane is not found");
5403 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5404 Lane = P.first->ReorderIndices[Lane];
5405 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5406 "Couldn't find extract lane");
5407 // Count the number of unique phi nodes in the parent (user) entry
5408 // and exit once all the unique phis are processed.
5409 if (IsNonSchedulableWithParentPhiNode) {
5410 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5411 Value *User = ParentTE->Scalars[Lane];
5412 if (!ParentsUniqueUsers.insert(User).second) {
5413 It =
5414 find(make_range(std::next(It), P.first->Scalars.end()), User);
5415 continue;
5416 }
5417 }
5418 for (unsigned OpIdx :
5420 P.first->getMainOp()))) {
5421 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5422 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5423 --P.getSecond();
5424 }
5425 // If parent node is schedulable, it will be handled correctly.
5426 if (!IsNonSchedulableWithParentPhiNode)
5427 break;
5428 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5429 } while (It != P.first->Scalars.end());
5430 }
5431 return all_of(PotentiallyReorderedEntriesCount,
5432 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5433 return P.second == NumOps - 1;
5434 }) &&
5435 all_of(OrderedEntriesCount,
5436 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5437 return P.second == NumOps;
5438 });
5439 }
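// Illustrative reading of the NumOps contract above (values invented): for
// User = add %x, %x the caller checks the first use of %x with NumOps == 1
// and the second with NumOps == 2; the per-entry counts collected above are
// then compared against NumOps to decide whether every use is covered by
// copyable data.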
5440
5441 SmallVector<ScheduleCopyableData *>
5442 getScheduleCopyableData(const Instruction *I) const {
5443 if (ScheduleCopyableDataMapByInst.empty())
5444 return {};
5445 const auto It = ScheduleCopyableDataMapByInst.find(I);
5446 if (It == ScheduleCopyableDataMapByInst.end())
5447 return {};
5448 SmallVector<ScheduleCopyableData *> Res;
5449 for (ScheduleCopyableData *SD : It->getSecond()) {
5450 if (isInSchedulingRegion(*SD))
5451 Res.push_back(SD);
5452 }
5453 return Res;
5454 }
5455
5456 SmallVector<ScheduleCopyableData *>
5457 getScheduleCopyableDataUsers(const Instruction *User) const {
5458 if (ScheduleCopyableDataMapByUsers.empty())
5459 return {};
5460 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5461 if (It == ScheduleCopyableDataMapByUsers.end())
5462 return {};
5463 SmallVector<ScheduleCopyableData *> Res;
5464 for (ScheduleCopyableData *SD : It->getSecond()) {
5465 if (isInSchedulingRegion(*SD))
5466 Res.push_back(SD);
5467 }
5468 return Res;
5469 }
5470
5471 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5472 Instruction *I,
5473 int SchedulingRegionID,
5474 ScheduleBundle &Bundle) {
5475 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5476 ScheduleCopyableData *CD =
5477 ScheduleCopyableDataMap
5478 .try_emplace(std::make_pair(EI, I),
5479 std::make_unique<ScheduleCopyableData>(
5480 SchedulingRegionID, I, EI, Bundle))
5481 .first->getSecond()
5482 .get();
5483 ScheduleCopyableDataMapByInst[I].push_back(CD);
5484 if (EI.UserTE) {
5485 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5486 const auto *It = find(Op, I);
5487 assert(It != Op.end() && "Lane not set");
5488 SmallPtrSet<Instruction *, 4> Visited;
5489 do {
5490 int Lane = std::distance(Op.begin(), It);
5491 assert(Lane >= 0 && "Lane not set");
5492 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5493 !EI.UserTE->ReorderIndices.empty())
5494 Lane = EI.UserTE->ReorderIndices[Lane];
5495 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5496 "Couldn't find extract lane");
5497 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5498 if (!Visited.insert(In).second) {
5499 It = find(make_range(std::next(It), Op.end()), I);
5500 continue;
5501 }
5502 ScheduleCopyableDataMapByInstUser
5503 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5504 .first->getSecond()
5505 .push_back(CD);
5506 ScheduleCopyableDataMapByUsers.try_emplace(I)
5507 .first->getSecond()
5508 .insert(CD);
5509 // Remove extra deps for users that become non-immediate users of the
5510 // instruction. This may happen if a chain of the same copyable elements
5511 // appears in the tree.
5512 if (In == I) {
5513 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5514 if (ScheduleCopyableData *UserCD =
5515 getScheduleCopyableData(UserEI, In))
5516 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5517 }
5518 It = find(make_range(std::next(It), Op.end()), I);
5519 } while (It != Op.end());
5520 } else {
5521 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5522 CD);
5523 }
5524 return *CD;
5525 }
5526
5527 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5528 auto *I = dyn_cast<Instruction>(V);
5529 if (!I)
5530 return {};
5531 auto It = ScheduledBundles.find(I);
5532 if (It == ScheduledBundles.end())
5533 return {};
5534 return It->getSecond();
5535 }
5536
5537 /// Returns true if the entity is in the scheduling region.
5538 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5539 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5540 return Data->getSchedulingRegionID() == SchedulingRegionID;
5541 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5542 return CD->getSchedulingRegionID() == SchedulingRegionID;
5543 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5544 [&](const ScheduleEntity *BundleMember) {
5545 return isInSchedulingRegion(*BundleMember);
5546 });
5547 }
5548
5549 /// Marks an instruction as scheduled and puts all dependent ready
5550 /// instructions into the ready-list.
5551 template <typename ReadyListType>
5552 void schedule(const BoUpSLP &R, const InstructionsState &S,
5553 const EdgeInfo &EI, ScheduleEntity *Data,
5554 ReadyListType &ReadyList) {
5555 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5556 ArrayRef<ScheduleBundle *> Bundles) {
5557 // Handle the def-use chain dependencies.
5558
5559 // Decrement the unscheduled counter and insert to ready list if ready.
5560 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5561 if ((IsControl || Data->hasValidDependencies()) &&
5562 Data->incrementUnscheduledDeps(-1) == 0) {
5563 // There are no more unscheduled dependencies after
5564 // decrementing, so we can put the dependent instruction
5565 // into the ready list.
5566 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5567 ArrayRef<ScheduleBundle *> Bundles;
5568 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5569 CopyableBundle.push_back(&CD->getBundle());
5570 Bundles = CopyableBundle;
5571 } else {
5572 Bundles = getScheduleBundles(Data->getInst());
5573 }
5574 if (!Bundles.empty()) {
5575 for (ScheduleBundle *Bundle : Bundles) {
5576 if (Bundle->unscheduledDepsInBundle() == 0) {
5577 assert(!Bundle->isScheduled() &&
5578 "already scheduled bundle gets ready");
5579 ReadyList.insert(Bundle);
5581 << "SLP: gets ready: " << *Bundle << "\n");
5582 }
5583 }
5584 return;
5585 }
5586 assert(!Data->isScheduled() &&
5587 "already scheduled bundle gets ready");
5589 "Expected non-copyable data");
5590 ReadyList.insert(Data);
5591 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5592 }
5593 };
5594
5595 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5596 Instruction *I) {
5597 if (!ScheduleCopyableDataMap.empty()) {
5598 SmallVector<ScheduleCopyableData *> CopyableData =
5599 getScheduleCopyableData(User, OpIdx, I);
5600 for (ScheduleCopyableData *CD : CopyableData)
5601 DecrUnsched(CD, /*IsControl=*/false);
5602 if (!CopyableData.empty())
5603 return;
5604 }
5605 if (ScheduleData *OpSD = getScheduleData(I))
5606 DecrUnsched(OpSD, /*IsControl=*/false);
5607 };
5608
5609 // If BundleMember is a vector bundle, its operands may have been
5610 // reordered during buildTree(). We therefore need to get its operands
5611 // through the TreeEntry.
5612 if (!Bundles.empty()) {
5613 auto *In = BundleMember->getInst();
5614 // Count uses of each instruction operand.
5615 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5616 unsigned TotalOpCount = 0;
5617 if (isa<ScheduleCopyableData>(BundleMember)) {
5618 // Copyable data is used only once (uses itself).
5619 TotalOpCount = OperandsUses[In] = 1;
5620 } else {
5621 for (const Use &U : In->operands()) {
5622 if (auto *I = dyn_cast<Instruction>(U.get())) {
5623 auto Res = OperandsUses.try_emplace(I, 0);
5624 ++Res.first->getSecond();
5625 ++TotalOpCount;
5626 }
5627 }
5628 }
5629 // Decrement the unscheduled counter and insert to ready list if
5630 // ready.
5631 auto DecrUnschedForInst =
5632 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5633 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5634 &Checked) {
5635 if (!ScheduleCopyableDataMap.empty()) {
5636 const EdgeInfo EI = {UserTE, OpIdx};
5637 if (ScheduleCopyableData *CD =
5638 getScheduleCopyableData(EI, I)) {
5639 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5640 return;
5641 DecrUnsched(CD, /*IsControl=*/false);
5642 return;
5643 }
5644 }
5645 auto It = OperandsUses.find(I);
5646 assert(It != OperandsUses.end() && "Operand not found");
5647 if (It->second > 0) {
5648 --It->getSecond();
5649 assert(TotalOpCount > 0 && "No more operands to decrement");
5650 --TotalOpCount;
5651 if (ScheduleData *OpSD = getScheduleData(I)) {
5652 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5653 return;
5654 DecrUnsched(OpSD, /*IsControl=*/false);
5655 }
5656 }
5657 };
5658
5659 for (ScheduleBundle *Bundle : Bundles) {
5660 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5661 break;
5662 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5663 // Need to search for the lane since the tree entry can be
5664 // reordered.
5665 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5666 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5667 bool IsNonSchedulableWithParentPhiNode =
5668 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5669 Bundle->getTreeEntry()->UserTreeIndex &&
5670 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5671 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5672 TreeEntry::SplitVectorize &&
5673 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5674 Instruction::PHI;
5675 do {
5676 int Lane =
5677 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5678 assert(Lane >= 0 && "Lane not set");
5679 if (isa<StoreInst>(In) &&
5680 !Bundle->getTreeEntry()->ReorderIndices.empty())
5681 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5682 assert(Lane < static_cast<int>(
5683 Bundle->getTreeEntry()->Scalars.size()) &&
5684 "Couldn't find extract lane");
5685
5686 // Since the vectorization tree is built recursively, this
5687 // assertion ensures that the tree entry has all operands set
5688 // before reaching this code. A couple of exceptions known at the
5689 // moment are extracts where their second (immediate) operand is
5690 // not added. Since immediates do not affect scheduler behavior,
5691 // this is considered okay.
5692 assert(In &&
5693 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
5694 In->getNumOperands() ==
5695 Bundle->getTreeEntry()->getNumOperands() ||
5696 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5697 "Missed TreeEntry operands?");
5698
5699 // Count the number of unique phi nodes in the parent (user) entry
5700 // and exit once all the unique phis are processed.
5701 if (IsNonSchedulableWithParentPhiNode) {
5702 const TreeEntry *ParentTE =
5703 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5704 Value *User = ParentTE->Scalars[Lane];
5705 if (!ParentsUniqueUsers.insert(User).second) {
5706 It = std::find(std::next(It),
5707 Bundle->getTreeEntry()->Scalars.end(), In);
5708 continue;
5709 }
5710 }
5711
5712 for (unsigned OpIdx :
5713 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5714 if (auto *I = dyn_cast<Instruction>(
5715 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5716 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5717 << *I << "\n");
5718 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5719 }
5720 // If parent node is schedulable, it will be handled correctly.
5721 if (!IsNonSchedulableWithParentPhiNode)
5722 break;
5723 It = std::find(std::next(It),
5724 Bundle->getTreeEntry()->Scalars.end(), In);
5725 } while (It != Bundle->getTreeEntry()->Scalars.end());
5726 }
5727 } else {
5728 // If BundleMember is a stand-alone instruction, no operand reordering
5729 // has taken place, so we directly access its operands.
5730 for (Use &U : BundleMember->getInst()->operands()) {
5731 if (auto *I = dyn_cast<Instruction>(U.get())) {
5733 << "SLP: check for readiness (def): " << *I << "\n");
5734 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5735 }
5736 }
5737 }
5738 // Handle the memory dependencies.
5739 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5740 if (!SD)
5741 return;
5742 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5743 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5744 if (!VisitedMemory.insert(MemoryDep).second)
5745 continue;
5746 // There are no more unscheduled dependencies after decrementing,
5747 // so we can put the dependent instruction into the ready list.
5748 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5749 << *MemoryDep << "\n");
5750 DecrUnsched(MemoryDep);
5751 }
5752 // Handle the control dependencies.
5753 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5754 for (ScheduleData *Dep : SD->getControlDependencies()) {
5755 if (!VisitedControl.insert(Dep).second)
5756 continue;
5757 // There are no more unscheduled dependencies after decrementing,
5758 // so we can put the dependent instruction into the ready list.
5760 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5761 DecrUnsched(Dep, /*IsControl=*/true);
5762 }
5763 };
5764 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5765 SD->setScheduled(/*Scheduled=*/true);
5766 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5767 SmallVector<ScheduleBundle *> Bundles;
5768 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5769 Instruction *In = SD->getInst();
5770 if (R.isVectorized(In)) {
5771 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5772 for (TreeEntry *TE : Entries) {
5774 In->getNumOperands() != TE->getNumOperands())
5775 continue;
5776 auto &BundlePtr =
5777 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5778 BundlePtr->setTreeEntry(TE);
5779 BundlePtr->add(SD);
5780 Bundles.push_back(BundlePtr.get());
5781 }
5782 }
5783 ProcessBundleMember(SD, Bundles);
5784 } else {
5785 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5786 Bundle.setScheduled(/*Scheduled=*/true);
5787 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5788 auto AreAllBundlesScheduled =
5789 [&](const ScheduleEntity *SD,
5790 ArrayRef<ScheduleBundle *> SDBundles) {
5791 if (isa<ScheduleCopyableData>(SD))
5792 return true;
5793 return !SDBundles.empty() &&
5794 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5795 return SDBundle->isScheduled();
5796 });
5797 };
5798 for (ScheduleEntity *SD : Bundle.getBundle()) {
5799 ArrayRef<ScheduleBundle *> SDBundles;
5800 if (!isa<ScheduleCopyableData>(SD))
5801 SDBundles = getScheduleBundles(SD->getInst());
5802 if (AreAllBundlesScheduled(SD, SDBundles)) {
5803 SD->setScheduled(/*Scheduled=*/true);
5804 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5805 : SDBundles);
5806 }
5807 }
5808 }
5809 }
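// Summary (descriptive, no new behavior): scheduling an entity marks it (and,
// for a bundle, each member whose other bundles are also fully scheduled) as
// scheduled, then walks its def-use operands, memory dependencies and control
// dependencies, decrementing their unscheduled-dependency counters; anything
// that reaches zero is inserted into the ready list for the caller to pick up.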
5810
5811 /// Verify basic self consistency properties of the data structure.
5812 void verify() {
5813 if (!ScheduleStart)
5814 return;
5815
5816 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5817 ScheduleStart->comesBefore(ScheduleEnd) &&
5818 "Not a valid scheduling region?");
5819
5820 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5821 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5822 if (!Bundles.empty()) {
5823 for (ScheduleBundle *Bundle : Bundles) {
5824 assert(isInSchedulingRegion(*Bundle) &&
5825 "primary schedule data not in window?");
5826 Bundle->verify();
5827 }
5828 continue;
5829 }
5830 auto *SD = getScheduleData(I);
5831 if (!SD)
5832 continue;
5833 assert(isInSchedulingRegion(*SD) &&
5834 "primary schedule data not in window?");
5835 SD->verify();
5836 }
5837
5838 assert(all_of(ReadyInsts,
5839 [](const ScheduleEntity *Bundle) {
5840 return Bundle->isReady();
5841 }) &&
5842 "item in ready list not ready?");
5843 }
5844
5845 /// Put all instructions into the ReadyList which are ready for scheduling.
5846 template <typename ReadyListType>
5847 void initialFillReadyList(ReadyListType &ReadyList) {
5848 SmallPtrSet<ScheduleBundle *, 16> Visited;
5849 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5850 ScheduleData *SD = getScheduleData(I);
5851 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5852 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5853 !Bundles.empty()) {
5854 for (ScheduleBundle *Bundle : Bundles) {
5855 if (!Visited.insert(Bundle).second)
5856 continue;
5857 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5858 ReadyList.insert(Bundle);
5859 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5860 << *Bundle << "\n");
5861 }
5862 }
5863 continue;
5864 }
5865 ReadyList.insert(SD);
5867 << "SLP: initially in ready list: " << *SD << "\n");
5868 }
5869 }
5870 }
5871
5872 /// Build a bundle from the ScheduleData nodes corresponding to the
5873 /// scalar instruction for each lane.
5874 /// \param VL The list of scalar instructions.
5875 /// \param S The state of the instructions.
5876 /// \param EI The edge in the SLP graph or the user node/operand number.
5877 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5878 const InstructionsState &S, const EdgeInfo &EI);
5879
5880 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5881 /// cyclic dependencies. This is only a dry-run, no instructions are
5882 /// actually moved at this stage.
5883 /// \returns the scheduling bundle. The returned Optional value is not
5884 /// std::nullopt if \p VL is allowed to be scheduled.
5885 std::optional<ScheduleBundle *>
5886 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5887 const InstructionsState &S, const EdgeInfo &EI);
5888
5889 /// Allocates schedule data chunk.
5890 ScheduleData *allocateScheduleDataChunks();
5891
5892 /// Extends the scheduling region so that V is inside the region.
5893 /// \returns true if the region size is within the limit.
5894 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5895
5896 /// Initialize the ScheduleData structures for new instructions in the
5897 /// scheduling region.
5898 void initScheduleData(Instruction *FromI, Instruction *ToI,
5899 ScheduleData *PrevLoadStore,
5900 ScheduleData *NextLoadStore);
5901
5902 /// Updates the dependency information of a bundle and of all instructions/
5903 /// bundles which depend on the original bundle.
5904 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5905 BoUpSLP *SLP,
5906 ArrayRef<ScheduleData *> ControlDeps = {});
5907
5908 /// Sets all instruction in the scheduling region to un-scheduled.
5909 void resetSchedule();
5910
5911 BasicBlock *BB;
5912
5913 /// Simple memory allocation for ScheduleData.
5914 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5915
5916 /// The size of a ScheduleData array in ScheduleDataChunks.
5917 int ChunkSize;
5918
5919 /// The allocator position in the current chunk, which is the last entry
5920 /// of ScheduleDataChunks.
5921 int ChunkPos;
5922
5923 /// Attaches ScheduleData to Instruction.
5924 /// Note that the mapping survives during all vectorization iterations, i.e.
5925 /// ScheduleData structures are recycled.
5926 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5927
5928 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5929 /// number) and the operand instruction, represented as copyable element.
5930 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5931 std::unique_ptr<ScheduleCopyableData>>
5932 ScheduleCopyableDataMap;
5933
5934 /// Represents the mapping between an instruction and all related
5935 /// ScheduleCopyableData (for all uses in the tree represented as a copyable
5936 /// element). The SLP tree may contain several representations of the same
5937 /// instruction.
5938 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5939 ScheduleCopyableDataMapByInst;
5940
5941 /// Represents mapping between user value and operand number, the operand
5942 /// value and all related ScheduleCopyableData. The relation is 1:n, because
5943   /// the same user may reference the same operand in different tree entries
5944   /// and the operand may be modeled by different copyable data elements.
5945 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5947 ScheduleCopyableDataMapByInstUser;
5948
5949 /// Represents mapping between instruction and all related
5950 /// ScheduleCopyableData. It represents the mapping between the actual
5951 /// instruction and the last copyable data element in the chain. E.g., if
5952 /// the graph models the following instructions:
5953 /// %0 = non-add instruction ...
5954 /// ...
5955 /// %4 = add %3, 1
5956 /// %5 = add %4, 1
5957 /// %6 = insertelement poison, %0, 0
5958 /// %7 = insertelement %6, %5, 1
5959 /// And the graph is modeled as:
5960 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5961 /// -> [1, 0] -> [%1, 0]
5962 ///
5963 /// this map will map %0 only to the copyable element <1>, which is the last
5964 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5965 /// keep the map to <0>, not the %0.
5966 SmallDenseMap<const Instruction *,
5967 SmallSetVector<ScheduleCopyableData *, 4>>
5968 ScheduleCopyableDataMapByUsers;
5969
5970 /// Attaches ScheduleBundle to Instruction.
5971 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5972 ScheduledBundles;
5973 /// The list of ScheduleBundles.
5974 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5975
5976 /// The ready-list for scheduling (only used for the dry-run).
5977 SetVector<ScheduleEntity *> ReadyInsts;
5978
5979 /// The first instruction of the scheduling region.
5980 Instruction *ScheduleStart = nullptr;
5981
5982 /// The first instruction _after_ the scheduling region.
5983 Instruction *ScheduleEnd = nullptr;
5984
5985 /// The first memory accessing instruction in the scheduling region
5986 /// (can be null).
5987 ScheduleData *FirstLoadStoreInRegion = nullptr;
5988
5989 /// The last memory accessing instruction in the scheduling region
5990 /// (can be null).
5991 ScheduleData *LastLoadStoreInRegion = nullptr;
5992
5993 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5994 /// region? Used to optimize the dependence calculation for the
5995 /// common case where there isn't.
5996 bool RegionHasStackSave = false;
5997
5998 /// The current size of the scheduling region.
5999 int ScheduleRegionSize = 0;
6000
6001 /// The maximum size allowed for the scheduling region.
6002 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6003
6004 /// The ID of the scheduling region. For a new vectorization iteration this
6005 /// is incremented which "removes" all ScheduleData from the region.
6006 /// Make sure that the initial SchedulingRegionID is greater than the
6007 /// initial SchedulingRegionID in ScheduleData (which is 0).
6008 int SchedulingRegionID = 1;
6009 };
6010
6011 /// Attaches the BlockScheduling structures to basic blocks.
6012 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6013
6014 /// Performs the "real" scheduling. Done before vectorization is actually
6015 /// performed in a basic block.
6016 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6017
6018 /// List of users to ignore during scheduling and that don't need extracting.
6019 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6020
6021 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6022 /// sorted SmallVectors of unsigned.
6023 struct OrdersTypeDenseMapInfo {
6024 static OrdersType getEmptyKey() {
6025 OrdersType V;
6026 V.push_back(~1U);
6027 return V;
6028 }
6029
6030 static OrdersType getTombstoneKey() {
6031 OrdersType V;
6032 V.push_back(~2U);
6033 return V;
6034 }
6035
6036 static unsigned getHashValue(const OrdersType &V) {
6037 return static_cast<unsigned>(hash_combine_range(V));
6038 }
6039
6040 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6041 return LHS == RHS;
6042 }
6043 };
6044
6045 // Analysis and block reference.
6046 Function *F;
6047 ScalarEvolution *SE;
6048 TargetTransformInfo *TTI;
6049 TargetLibraryInfo *TLI;
6050 LoopInfo *LI;
6051 DominatorTree *DT;
6052 AssumptionCache *AC;
6053 DemandedBits *DB;
6054 const DataLayout *DL;
6055 OptimizationRemarkEmitter *ORE;
6056
6057 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6058 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6059
6060 /// Instruction builder to construct the vectorized tree.
6061 IRBuilder<TargetFolder> Builder;
6062
6063 /// A map of scalar integer values to the smallest bit width with which they
6064 /// can legally be represented. The values map to (width, signed) pairs,
6065 /// where "width" indicates the minimum bit width and "signed" is True if the
6066 /// value must be signed-extended, rather than zero-extended, back to its
6067 /// original width.
6068 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6069
6070 /// Final size of the reduced vector, if the current graph represents the
6071 /// input for the reduction and it was possible to narrow the size of the
6072 /// reduction.
6073 unsigned ReductionBitWidth = 0;
6074
6075 /// Canonical graph size before the transformations.
6076 unsigned BaseGraphSize = 1;
6077
6078 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6079 /// type sizes, used in the tree.
6080 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6081
6082   /// Indices of the vectorized nodes, which are supposed to be the roots of the new
6083 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6084 DenseSet<unsigned> ExtraBitWidthNodes;
6085};
6086
6087template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6088   using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6089   using SecondInfo = DenseMapInfo<unsigned>;
6090   static BoUpSLP::EdgeInfo getEmptyKey() {
6091     return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6092 SecondInfo::getEmptyKey());
6093 }
6094
6095   static BoUpSLP::EdgeInfo getTombstoneKey() {
6096     return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6097 SecondInfo::getTombstoneKey());
6098 }
6099
6100 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6101 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6102 SecondInfo::getHashValue(Val.EdgeIdx));
6103 }
6104
6105 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6106 const BoUpSLP::EdgeInfo &RHS) {
6107 return LHS == RHS;
6108 }
6109};
6110
6111template <> struct llvm::GraphTraits<BoUpSLP *> {
6112 using TreeEntry = BoUpSLP::TreeEntry;
6113
6114 /// NodeRef has to be a pointer per the GraphWriter.
6115   using NodeRef = TreeEntry *;
6116 
6117 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6118
6119 /// Add the VectorizableTree to the index iterator to be able to return
6120 /// TreeEntry pointers.
6122 : public iterator_adaptor_base<
6123 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6125
6129
6130 NodeRef operator*() { return I->UserTE; }
6131 };
6132
6133   static NodeRef getEntryNode(BoUpSLP &R) {
6134     return R.VectorizableTree[0].get();
6135 }
6136
6137   static ChildIteratorType child_begin(NodeRef N) {
6138     return {&N->UserTreeIndex, N->Container};
6139 }
6140
6141   static ChildIteratorType child_end(NodeRef N) {
6142     return {&N->UserTreeIndex + 1, N->Container};
6143 }
6144
6145 /// For the node iterator we just need to turn the TreeEntry iterator into a
6146 /// TreeEntry* iterator so that it dereferences to NodeRef.
6147   class nodes_iterator {
6148     using ItTy = ContainerTy::iterator;
6149 ItTy It;
6150
6151 public:
6152 nodes_iterator(const ItTy &It2) : It(It2) {}
6153 NodeRef operator*() { return It->get(); }
6155 ++It;
6156 return *this;
6157 }
6158 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6159 };
6160
6161   static nodes_iterator nodes_begin(BoUpSLP *R) {
6162     return nodes_iterator(R->VectorizableTree.begin());
6163 }
6164
6165   static nodes_iterator nodes_end(BoUpSLP *R) {
6166     return nodes_iterator(R->VectorizableTree.end());
6167 }
6168
6169 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6170};
6171
6172template <>
6173 struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6174   using TreeEntry = BoUpSLP::TreeEntry;
6175
6176 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6177
6178 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6179 std::string Str;
6180 raw_string_ostream OS(Str);
6181 OS << Entry->Idx << ".\n";
6182 if (isSplat(Entry->Scalars))
6183 OS << "<splat> ";
6184 for (auto *V : Entry->Scalars) {
6185 OS << *V;
6186 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6187 return EU.Scalar == V;
6188 }))
6189 OS << " <extract>";
6190 OS << "\n";
6191 }
6192 return Str;
6193 }
6194
6195 static std::string getNodeAttributes(const TreeEntry *Entry,
6196 const BoUpSLP *) {
6197 if (Entry->isGather())
6198 return "color=red";
6199 if (Entry->State == TreeEntry::ScatterVectorize ||
6200 Entry->State == TreeEntry::StridedVectorize ||
6201 Entry->State == TreeEntry::CompressVectorize)
6202 return "color=blue";
6203 return "";
6204 }
6205};
6206
6207 BoUpSLP::~BoUpSLP() {
6208   SmallVector<WeakTrackingVH> DeadInsts;
6209   for (auto *I : DeletedInstructions) {
6210 if (!I->getParent()) {
6211       // Temporarily insert the instruction back so that it can be erased from
6212       // its parent and from memory later.
6213 if (isa<PHINode>(I))
6214 // Phi nodes must be the very first instructions in the block.
6215 I->insertBefore(F->getEntryBlock(),
6216 F->getEntryBlock().getFirstNonPHIIt());
6217 else
6218 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6219 continue;
6220 }
6221 for (Use &U : I->operands()) {
6222 auto *Op = dyn_cast<Instruction>(U.get());
6223 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6224           wouldInstructionBeTriviallyDead(Op, TLI))
6225         DeadInsts.emplace_back(Op);
6226 }
6227 I->dropAllReferences();
6228 }
6229 for (auto *I : DeletedInstructions) {
6230 assert(I->use_empty() &&
6231 "trying to erase instruction with users.");
6232 I->eraseFromParent();
6233 }
6234
6235 // Cleanup any dead scalar code feeding the vectorized instructions
6237
6238#ifdef EXPENSIVE_CHECKS
6239 // If we could guarantee that this call is not extremely slow, we could
6240 // remove the ifdef limitation (see PR47712).
6241 assert(!verifyFunction(*F, &dbgs()));
6242#endif
6243}
6244
6245/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6246 /// contains the original mask for the scalars reused in the node. The procedure
6247 /// transforms this mask in accordance with the given \p Mask.
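/// For example, Reuses = [0, 2, 1, 3] combined with Mask = [3, 2, 1, 0]
/// yields Reuses = [3, 1, 2, 0].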
6248 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6249   assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6250 "Expected non-empty mask.");
6251 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6252 Prev.swap(Reuses);
6253 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6254 if (Mask[I] != PoisonMaskElem)
6255 Reuses[Mask[I]] = Prev[I];
6256}
6257
6258 /// Reorders the given \p Order according to the given \p Mask. \p Order is
6259 /// the original order of the scalars. The procedure transforms the provided order
6260/// in accordance with the given \p Mask. If the resulting \p Order is just an
6261/// identity order, \p Order is cleared.
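/// For example, with BottomOrder == false, an initially empty \p Order
/// combined with Mask = [1, 0, 3, 2] yields Order = [1, 0, 3, 2].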
6262 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6263                          bool BottomOrder = false) {
6264 assert(!Mask.empty() && "Expected non-empty mask.");
6265 unsigned Sz = Mask.size();
6266 if (BottomOrder) {
6267 SmallVector<unsigned> PrevOrder;
6268 if (Order.empty()) {
6269 PrevOrder.resize(Sz);
6270 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6271 } else {
6272 PrevOrder.swap(Order);
6273 }
6274 Order.assign(Sz, Sz);
6275 for (unsigned I = 0; I < Sz; ++I)
6276 if (Mask[I] != PoisonMaskElem)
6277 Order[I] = PrevOrder[Mask[I]];
6278 if (all_of(enumerate(Order), [&](const auto &Data) {
6279 return Data.value() == Sz || Data.index() == Data.value();
6280 })) {
6281 Order.clear();
6282 return;
6283 }
6284 fixupOrderingIndices(Order);
6285 return;
6286 }
6287 SmallVector<int> MaskOrder;
6288 if (Order.empty()) {
6289 MaskOrder.resize(Sz);
6290 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6291 } else {
6292 inversePermutation(Order, MaskOrder);
6293 }
6294 reorderReuses(MaskOrder, Mask);
6295 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6296 Order.clear();
6297 return;
6298 }
6299 Order.assign(Sz, Sz);
6300 for (unsigned I = 0; I < Sz; ++I)
6301 if (MaskOrder[I] != PoisonMaskElem)
6302 Order[MaskOrder[I]] = I;
6303 fixupOrderingIndices(Order);
6304}
6305
6306std::optional<BoUpSLP::OrdersType>
6307BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6308 bool TopToBottom, bool IgnoreReorder) {
6309 assert(TE.isGather() && "Expected gather node only.");
6310 // Try to find subvector extract/insert patterns and reorder only such
6311 // patterns.
6312 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6313 Type *ScalarTy = GatheredScalars.front()->getType();
6314 size_t NumScalars = GatheredScalars.size();
6315 if (!isValidElementType(ScalarTy))
6316 return std::nullopt;
6317 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6318 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6319 SmallVector<int> ExtractMask;
6320 SmallVector<int> Mask;
6321   SmallVector<SmallVector<const TreeEntry *>> Entries;
6322   SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6323       tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6324   SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6325       isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6326 /*ForOrder=*/true);
6327 // No shuffled operands - ignore.
6328 if (GatherShuffles.empty() && ExtractShuffles.empty())
6329 return std::nullopt;
6330 OrdersType CurrentOrder(NumScalars, NumScalars);
6331 if (GatherShuffles.size() == 1 &&
6332 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6333 Entries.front().front()->isSame(TE.Scalars)) {
6334     // If the node is fully matched during whole-tree rotation - no need to
6335     // consider the matching order, the whole tree is rotated instead.
6336 if (TopToBottom)
6337 return std::nullopt;
6338 // No need to keep the order for the same user node.
6339 if (Entries.front().front()->UserTreeIndex.UserTE ==
6340 TE.UserTreeIndex.UserTE)
6341 return std::nullopt;
6342 // No need to keep the order for the matched root node, if it can be freely
6343 // reordered.
6344 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6345 return std::nullopt;
6346     // If only 2 elements are shuffled and the matching node has reversed
6347     // reuses - no need to count the order, both orders work fine.
6348 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6349 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6350 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6351 [](const auto &P) {
6352 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6353 }))
6354 return std::nullopt;
6355
6356 // Perfect match in the graph, will reuse the previously vectorized
6357 // node. Cost is 0.
6358 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6359 return CurrentOrder;
6360 }
6361 auto IsSplatMask = [](ArrayRef<int> Mask) {
6362 int SingleElt = PoisonMaskElem;
6363 return all_of(Mask, [&](int I) {
6364 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6365 SingleElt = I;
6366 return I == PoisonMaskElem || I == SingleElt;
6367 });
6368 };
6369 // Exclusive broadcast mask - ignore.
6370 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6371 (Entries.size() != 1 ||
6372 Entries.front().front()->ReorderIndices.empty())) ||
6373 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6374 return std::nullopt;
6375 SmallBitVector ShuffledSubMasks(NumParts);
6376 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6377 ArrayRef<int> Mask, int PartSz, int NumParts,
6378 function_ref<unsigned(unsigned)> GetVF) {
6379 for (int I : seq<int>(0, NumParts)) {
6380 if (ShuffledSubMasks.test(I))
6381 continue;
6382 const int VF = GetVF(I);
6383 if (VF == 0)
6384 continue;
6385 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6386 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6387 // Shuffle of at least 2 vectors - ignore.
6388 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6389 llvm::fill(Slice, NumScalars);
6390 ShuffledSubMasks.set(I);
6391 continue;
6392 }
6393       // Try to include as many elements from the mask as possible.
6394 int FirstMin = INT_MAX;
6395 int SecondVecFound = false;
6396 for (int K : seq<int>(Limit)) {
6397 int Idx = Mask[I * PartSz + K];
6398 if (Idx == PoisonMaskElem) {
6399 Value *V = GatheredScalars[I * PartSz + K];
6400 if (isConstant(V) && !isa<PoisonValue>(V)) {
6401 SecondVecFound = true;
6402 break;
6403 }
6404 continue;
6405 }
6406 if (Idx < VF) {
6407 if (FirstMin > Idx)
6408 FirstMin = Idx;
6409 } else {
6410 SecondVecFound = true;
6411 break;
6412 }
6413 }
6414 FirstMin = (FirstMin / PartSz) * PartSz;
6415 // Shuffle of at least 2 vectors - ignore.
6416 if (SecondVecFound) {
6417 llvm::fill(Slice, NumScalars);
6418 ShuffledSubMasks.set(I);
6419 continue;
6420 }
6421 for (int K : seq<int>(Limit)) {
6422 int Idx = Mask[I * PartSz + K];
6423 if (Idx == PoisonMaskElem)
6424 continue;
6425 Idx -= FirstMin;
6426 if (Idx >= PartSz) {
6427 SecondVecFound = true;
6428 break;
6429 }
6430 if (CurrentOrder[I * PartSz + Idx] >
6431 static_cast<unsigned>(I * PartSz + K) &&
6432 CurrentOrder[I * PartSz + Idx] !=
6433 static_cast<unsigned>(I * PartSz + Idx))
6434 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6435 }
6436 // Shuffle of at least 2 vectors - ignore.
6437 if (SecondVecFound) {
6438 llvm::fill(Slice, NumScalars);
6439 ShuffledSubMasks.set(I);
6440 continue;
6441 }
6442 }
6443 };
6444 int PartSz = getPartNumElems(NumScalars, NumParts);
6445 if (!ExtractShuffles.empty())
6446 TransformMaskToOrder(
6447 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6448 if (!ExtractShuffles[I])
6449 return 0U;
6450 unsigned VF = 0;
6451 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6452 for (unsigned Idx : seq<unsigned>(Sz)) {
6453 int K = I * PartSz + Idx;
6454 if (ExtractMask[K] == PoisonMaskElem)
6455 continue;
6456 if (!TE.ReuseShuffleIndices.empty())
6457 K = TE.ReuseShuffleIndices[K];
6458 if (K == PoisonMaskElem)
6459 continue;
6460 if (!TE.ReorderIndices.empty())
6461 K = std::distance(TE.ReorderIndices.begin(),
6462 find(TE.ReorderIndices, K));
6463 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6464 if (!EI)
6465 continue;
6466 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6467 ->getElementCount()
6468 .getKnownMinValue());
6469 }
6470 return VF;
6471 });
6472 // Check special corner case - single shuffle of the same entry.
6473 if (GatherShuffles.size() == 1 && NumParts != 1) {
6474 if (ShuffledSubMasks.any())
6475 return std::nullopt;
6476 PartSz = NumScalars;
6477 NumParts = 1;
6478 }
6479 if (!Entries.empty())
6480 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6481 if (!GatherShuffles[I])
6482 return 0U;
6483 return std::max(Entries[I].front()->getVectorFactor(),
6484 Entries[I].back()->getVectorFactor());
6485 });
6486 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6487 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6488 return std::nullopt;
6489 return std::move(CurrentOrder);
6490}
6491
6492static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6493 const TargetLibraryInfo &TLI,
6494 bool CompareOpcodes = true) {
6495   if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6496       getUnderlyingObject(Ptr2, RecursionMaxDepth))
6497     return false;
6498 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6499 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6500 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6501 (!GEP2 || GEP2->getNumOperands() == 2) &&
6502 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6503 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6504 !CompareOpcodes ||
6505 (GEP1 && GEP2 &&
6506 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6507}
6508
6509/// Calculates minimal alignment as a common alignment.
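/// I.e. the minimum of the alignments of all accesses in \p VL.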
6510template <typename T>
6511 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6512   Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6513 for (Value *V : VL)
6514 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6515 return CommonAlignment;
6516}
6517
6518/// Check if \p Order represents reverse order.
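/// For example, for Sz == 4, Order = [3, 2, 1, 0] is a reverse order; entries
/// equal to Sz are treated as undefined and accepted at any position.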
6519 static bool isReverseOrder(ArrayRef<unsigned> Order) {
6520   assert(!Order.empty() &&
6521 "Order is empty. Please check it before using isReverseOrder.");
6522 unsigned Sz = Order.size();
6523 return all_of(enumerate(Order), [&](const auto &Pair) {
6524 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6525 });
6526}
6527
6528 /// Checks if the provided list of pointers \p PointerOps represents the strided
6529/// pointers for type ElemTy. If they are not, nullptr is returned.
6530/// Otherwise, SCEV* of the stride value is returned.
6531static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6532 const DataLayout &DL, ScalarEvolution &SE,
6533 SmallVectorImpl<unsigned> &SortedIndices) {
6534   SmallVector<const SCEV *> SCEVs;
6535   const SCEV *PtrSCEVLowest = nullptr;
6536 const SCEV *PtrSCEVHighest = nullptr;
6537 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6538 // addresses).
6539 for (Value *Ptr : PointerOps) {
6540 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6541 if (!PtrSCEV)
6542 return nullptr;
6543 SCEVs.push_back(PtrSCEV);
6544 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6545 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6546 continue;
6547 }
6548 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6549 if (isa<SCEVCouldNotCompute>(Diff))
6550 return nullptr;
6551 if (Diff->isNonConstantNegative()) {
6552 PtrSCEVLowest = PtrSCEV;
6553 continue;
6554 }
6555 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6556 if (isa<SCEVCouldNotCompute>(Diff1))
6557 return nullptr;
6558 if (Diff1->isNonConstantNegative()) {
6559 PtrSCEVHighest = PtrSCEV;
6560 continue;
6561 }
6562 }
6563 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6564 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6565 if (isa<SCEVCouldNotCompute>(Dist))
6566 return nullptr;
6567 int Size = DL.getTypeStoreSize(ElemTy);
6568 auto TryGetStride = [&](const SCEV *Dist,
6569 const SCEV *Multiplier) -> const SCEV * {
6570 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6571 if (M->getOperand(0) == Multiplier)
6572 return M->getOperand(1);
6573 if (M->getOperand(1) == Multiplier)
6574 return M->getOperand(0);
6575 return nullptr;
6576 }
6577 if (Multiplier == Dist)
6578 return SE.getConstant(Dist->getType(), 1);
6579 return SE.getUDivExactExpr(Dist, Multiplier);
6580 };
6581   // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6582 const SCEV *Stride = nullptr;
6583 if (Size != 1 || SCEVs.size() > 2) {
6584 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6585 Stride = TryGetStride(Dist, Sz);
6586 if (!Stride)
6587 return nullptr;
6588 }
6589 if (!Stride || isa<SCEVConstant>(Stride))
6590 return nullptr;
6591 // Iterate through all pointers and check if all distances are
6592   // unique multiples of Stride.
6593 using DistOrdPair = std::pair<int64_t, int>;
6594 auto Compare = llvm::less_first();
6595 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6596 int Cnt = 0;
6597 bool IsConsecutive = true;
6598 for (const SCEV *PtrSCEV : SCEVs) {
6599 unsigned Dist = 0;
6600 if (PtrSCEV != PtrSCEVLowest) {
6601 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6602 const SCEV *Coeff = TryGetStride(Diff, Stride);
6603 if (!Coeff)
6604 return nullptr;
6605 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6606 if (!SC || isa<SCEVCouldNotCompute>(SC))
6607 return nullptr;
6608 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6609 SE.getMulExpr(Stride, SC)))
6610 ->isZero())
6611 return nullptr;
6612 Dist = SC->getAPInt().getZExtValue();
6613 }
6614 // If the strides are not the same or repeated, we can't vectorize.
6615 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6616 return nullptr;
6617 auto Res = Offsets.emplace(Dist, Cnt);
6618 if (!Res.second)
6619 return nullptr;
6620 // Consecutive order if the inserted element is the last one.
6621 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6622 ++Cnt;
6623 }
6624 if (Offsets.size() != SCEVs.size())
6625 return nullptr;
6626 SortedIndices.clear();
6627 if (!IsConsecutive) {
6628 // Fill SortedIndices array only if it is non-consecutive.
6629 SortedIndices.resize(PointerOps.size());
6630 Cnt = 0;
6631 for (const std::pair<int64_t, int> &Pair : Offsets) {
6632 SortedIndices[Cnt] = Pair.second;
6633 ++Cnt;
6634 }
6635 }
6636 return Stride;
6637}
6638
6639static std::pair<InstructionCost, InstructionCost>
6640 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6641             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6642 Type *ScalarTy, VectorType *VecTy);
6643
6644/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6645 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6646/// subvector pattern.
6647static InstructionCost
6648 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6649                VectorType *Tp, ArrayRef<int> Mask = {},
6650                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6651                int Index = 0, VectorType *SubTp = nullptr,
6652                ArrayRef<Value *> Args = {}) {
6653   VectorType *DstTy = Tp;
6654 if (!Mask.empty())
6655 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6656
6657 if (Kind != TTI::SK_PermuteTwoSrc)
6658 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6659 Args);
6660 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6661 int NumSubElts;
6662   if (ShuffleVectorInst::isInsertSubvectorMask(
6663           Mask, NumSrcElts, NumSubElts, Index)) {
6664 if (Index + NumSubElts > NumSrcElts &&
6665 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6666 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6667 TTI::TCK_RecipThroughput, Index, Tp);
6668 }
6669 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6670 Args);
6671}
6672
6673/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6674/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6675/// instead of a scalar.
6676static InstructionCost
6677 getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6678                          VectorType *Ty, const APInt &DemandedElts, bool Insert,
6679 bool Extract, TTI::TargetCostKind CostKind,
6680 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6681   assert(!isa<ScalableVectorType>(Ty) &&
6682          "ScalableVectorType is not supported.");
6683 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6684 getNumElements(Ty) &&
6685 "Incorrect usage.");
6686 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6687 assert(SLPReVec && "Only supported by REVEC.");
6688 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6689 // of CreateInsertElement.
6690 unsigned ScalarTyNumElements = VecTy->getNumElements();
6691 InstructionCost Cost = 0;
6692 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6693 if (!DemandedElts[I])
6694 continue;
6695       if (Insert)
6696         Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
6697                                  I * ScalarTyNumElements, VecTy);
6698       if (Extract)
6699         Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
6700                                  I * ScalarTyNumElements, VecTy);
6701 }
6702 return Cost;
6703 }
6704 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6705 CostKind, ForPoisonSrc, VL);
6706}
6707
6708/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6709/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6711 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6712 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6713 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6714 if (Opcode == Instruction::ExtractElement) {
6715 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6716 assert(SLPReVec && "Only supported by REVEC.");
6717 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6718       return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
6719                               cast<VectorType>(Val), {}, CostKind,
6720 Index * VecTy->getNumElements(), VecTy);
6721 }
6722 }
6723 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6724 ScalarUserAndIdx);
6725}
6726
6727/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6728/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6729 static InstructionCost getExtractWithExtendCost(
6730     const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6731 VectorType *VecTy, unsigned Index,
6732     TTI::TargetCostKind CostKind) {
6733   if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6734 assert(SLPReVec && "Only supported by REVEC.");
6735 auto *SubTp =
6736 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6737     return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
6738                             Index * ScalarTy->getNumElements(), SubTp) +
6739 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6740 CostKind);
6741 }
6742 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6743}
6744
6745/// Creates subvector insert. Generates shuffle using \p Generator or
6746/// using default shuffle.
6747 static Value *createInsertVector(
6748     IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6749 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6750 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6751 return Vec;
6752 const unsigned SubVecVF = getNumElements(V->getType());
6753 // Create shuffle, insertvector requires that index is multiple of
6754 // the subvector length.
6755 const unsigned VecVF = getNumElements(Vec->getType());
6756   SmallVector<int> Mask(VecVF, PoisonMaskElem);
6757   if (isa<PoisonValue>(Vec)) {
6758 auto *Begin = std::next(Mask.begin(), Index);
6759 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6760 Vec = Builder.CreateShuffleVector(V, Mask);
6761 return Vec;
6762 }
6763 std::iota(Mask.begin(), Mask.end(), 0);
6764 std::iota(std::next(Mask.begin(), Index),
6765 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6766 if (Generator)
6767 return Generator(Vec, V, Mask);
6768 // 1. Resize V to the size of Vec.
6769 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6770 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6771 V = Builder.CreateShuffleVector(V, ResizeMask);
6772 // 2. Insert V into Vec.
6773 return Builder.CreateShuffleVector(Vec, V, Mask);
6774}
6775
6776/// Generates subvector extract using \p Generator or using default shuffle.
6777 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6778                                   unsigned SubVecVF, unsigned Index) {
6779 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6780 std::iota(Mask.begin(), Mask.end(), Index);
6781 return Builder.CreateShuffleVector(Vec, Mask);
6782}
6783
6784/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6785/// with \p Order.
6786/// \return true if the mask represents strided access, false - otherwise.
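/// For example, pointers at element offsets [0, 2, 4, 6] produce the mask
/// [0, 2, 4, 6] and are recognized as strided (stride 2), while offsets
/// [0, 1, 3, 5] produce [0, 1, 3, 5] and are not strided.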
6787 static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6788                               ArrayRef<unsigned> Order, Type *ScalarTy,
6789 const DataLayout &DL, ScalarEvolution &SE,
6790 SmallVectorImpl<int> &CompressMask) {
6791 const unsigned Sz = PointerOps.size();
6792 CompressMask.assign(Sz, PoisonMaskElem);
6793   // The first element is always set.
6794 CompressMask[0] = 0;
6795 // Check if the mask represents strided access.
6796 std::optional<unsigned> Stride = 0;
6797 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6798 for (unsigned I : seq<unsigned>(1, Sz)) {
6799 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6800 std::optional<int64_t> OptPos =
6801 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6802 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6803 return false;
6804 unsigned Pos = static_cast<unsigned>(*OptPos);
6805 CompressMask[I] = Pos;
6806 if (!Stride)
6807 continue;
6808 if (*Stride == 0) {
6809 *Stride = Pos;
6810 continue;
6811 }
6812 if (Pos != *Stride * I)
6813 Stride.reset();
6814 }
6815 return Stride.has_value();
6816}
6817
6818/// Checks if the \p VL can be transformed to a (masked)load + compress or
6819/// (masked) interleaved load.
6820 static bool isMaskedLoadCompress(
6821     ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6822     ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6823     const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6824     const DominatorTree &DT, const TargetLibraryInfo &TLI,
6825 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6826 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6827 VectorType *&LoadVecTy) {
6828 InterleaveFactor = 0;
6829 Type *ScalarTy = VL.front()->getType();
6830 const size_t Sz = VL.size();
6831 auto *VecTy = getWidenedType(ScalarTy, Sz);
6832   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6833   SmallVector<int> Mask;
6834 if (!Order.empty())
6835 inversePermutation(Order, Mask);
6836 // Check external uses.
6837 for (const auto [I, V] : enumerate(VL)) {
6838 if (AreAllUsersVectorized(V))
6839 continue;
6840 InstructionCost ExtractCost =
6841 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6842 Mask.empty() ? I : Mask[I]);
6843 InstructionCost ScalarCost =
6844 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6845 if (ExtractCost <= ScalarCost)
6846 return false;
6847 }
6848 Value *Ptr0;
6849 Value *PtrN;
6850 if (Order.empty()) {
6851 Ptr0 = PointerOps.front();
6852 PtrN = PointerOps.back();
6853 } else {
6854 Ptr0 = PointerOps[Order.front()];
6855 PtrN = PointerOps[Order.back()];
6856 }
6857 std::optional<int64_t> Diff =
6858 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6859 if (!Diff)
6860 return false;
6861 const size_t MaxRegSize =
6862       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6863           .getFixedValue();
6864 // Check for very large distances between elements.
6865 if (*Diff / Sz >= MaxRegSize / 8)
6866 return false;
6867 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6868 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6869 Align CommonAlignment = LI->getAlign();
6870 IsMasked = !isSafeToLoadUnconditionally(
6871 Ptr0, LoadVecTy, CommonAlignment, DL,
6872 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6873 &TLI);
6874 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6875 LI->getPointerAddressSpace()))
6876 return false;
6877 // TODO: perform the analysis of each scalar load for better
6878 // safe-load-unconditionally analysis.
6879 bool IsStrided =
6880 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6881 assert(CompressMask.size() >= 2 && "At least two elements are required");
6882 SmallVector<Value *> OrderedPointerOps(PointerOps);
6883 if (!Order.empty())
6884 reorderScalars(OrderedPointerOps, Mask);
6885 auto [ScalarGEPCost, VectorGEPCost] =
6886 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6887 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6888 // The cost of scalar loads.
6889 InstructionCost ScalarLoadsCost =
6890 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6891 [&](InstructionCost C, Value *V) {
6892 return C + TTI.getInstructionCost(cast<Instruction>(V),
6893 CostKind);
6894 }) +
6895 ScalarGEPCost;
6896 APInt DemandedElts = APInt::getAllOnes(Sz);
6897 InstructionCost GatherCost =
6898 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6899 /*Insert=*/true,
6900 /*Extract=*/false, CostKind) +
6901 ScalarLoadsCost;
6902 InstructionCost LoadCost = 0;
6903 if (IsMasked) {
6904 LoadCost = TTI.getMemIntrinsicInstrCost(
6905 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
6906 CommonAlignment,
6907 LI->getPointerAddressSpace()),
6908 CostKind);
6909 } else {
6910 LoadCost =
6911 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6912 LI->getPointerAddressSpace(), CostKind);
6913 }
6914 if (IsStrided && !IsMasked && Order.empty()) {
6915 // Check for potential segmented(interleaved) loads.
6916 VectorType *AlignedLoadVecTy = getWidenedType(
6917 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6918 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6919 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6920 &TLI))
6921 AlignedLoadVecTy = LoadVecTy;
6922 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6923 CommonAlignment,
6924 LI->getPointerAddressSpace())) {
6925 InstructionCost InterleavedCost =
6926 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6927 Instruction::Load, AlignedLoadVecTy,
6928 CompressMask[1], {}, CommonAlignment,
6929 LI->getPointerAddressSpace(), CostKind, IsMasked);
6930 if (InterleavedCost < GatherCost) {
6931 InterleaveFactor = CompressMask[1];
6932 LoadVecTy = AlignedLoadVecTy;
6933 return true;
6934 }
6935 }
6936 }
6937 InstructionCost CompressCost = ::getShuffleCost(
6938 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6939 if (!Order.empty()) {
6940 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6941 for (unsigned I : seq<unsigned>(Sz)) {
6942 NewMask[I] = CompressMask[Mask[I]];
6943 }
6944 CompressMask.swap(NewMask);
6945 }
6946 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6947 return TotalVecCost < GatherCost;
6948}
6949
6950/// Checks if the \p VL can be transformed to a (masked)load + compress or
6951/// (masked) interleaved load.
6952static bool
6953 isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6954                      ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6955                      const DataLayout &DL, ScalarEvolution &SE,
6956 AssumptionCache &AC, const DominatorTree &DT,
6957 const TargetLibraryInfo &TLI,
6958 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6959 bool IsMasked;
6960 unsigned InterleaveFactor;
6961 SmallVector<int> CompressMask;
6962 VectorType *LoadVecTy;
6963 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6964 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6965 CompressMask, LoadVecTy);
6966}
6967
6968/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6969/// PointerOps:
6970/// 1. Target with strided load support is detected.
6971/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6972/// potential stride <= MaxProfitableLoadStride and the potential stride is
6973/// power-of-2 (to avoid perf regressions for the very small number of loads)
6974/// and max distance > number of loads, or potential stride is -1.
6975/// 3. The loads are ordered, or number of unordered loads <=
6976/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6977/// to avoid extra costs for very expensive shuffles).
6978 /// 4. Any pointer operand is an instruction with users outside of the
6979/// current graph (for masked gathers extra extractelement instructions
6980/// might be required).
6981 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
6982                             Align Alignment, const int64_t Diff,
6983 const size_t Sz) const {
6984 if (Diff % (Sz - 1) != 0)
6985 return false;
6986
6987 // Try to generate strided load node.
6988 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6989 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6990 return !isVectorized(U) && !MustGather.contains(U);
6991 });
6992 });
6993
6994 const uint64_t AbsoluteDiff = std::abs(Diff);
6995 auto *VecTy = getWidenedType(ScalarTy, Sz);
6996 if (IsAnyPointerUsedOutGraph ||
6997 (AbsoluteDiff > Sz &&
6998        (Sz > MinProfitableStridedLoads ||
6999         (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7000 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7001 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7002 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7003 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7004 return false;
7005 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7006 return false;
7007 return true;
7008 }
7009 return false;
7010}
7011
7012 bool BoUpSLP::analyzeConstantStrideCandidate(
7013     const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7014 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7015 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7016 const size_t Sz = PointerOps.size();
7017 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7018 // Go through `PointerOps` in sorted order and record offsets from `Ptr0`.
7019 for (unsigned I : seq<unsigned>(Sz)) {
7020 Value *Ptr =
7021 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7022 SortedOffsetsFromBase[I] =
7023 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
7024 }
7025
7026 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7027 // ```
7028 // [
7029 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7030   //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7031 // ...
7032 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7033 // GroupSize - 1}), // last group
7034 // ]
7035 // ```
7036 // The distance between consecutive elements within each group should all be
7037 // the same `StrideWithinGroup`. The distance between the first elements of
7038 // consecutive groups should all be the same `StrideBetweenGroups`.
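  // For example, sorted offsets [0, 1, 16, 17, 32, 33] form three groups of
  // size 2 with StrideWithinGroup == 1 and StrideBetweenGroups == 16.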
7039
7040 int64_t StrideWithinGroup =
7041 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7042 // Determine size of the first group. Later we will check that all other
7043 // groups have the same size.
7044 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7045 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7046 StrideWithinGroup;
7047 };
7048 auto Indices = seq<unsigned>(1, Sz);
7049 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7050 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7051
7052 unsigned VecSz = Sz;
7053 Type *NewScalarTy = ScalarTy;
7054
7055 // Quick detour: at this point we can say what the type of strided load would
7056 // be if all the checks pass. Check if this type is legal for the target.
7057 bool NeedsWidening = Sz != GroupSize;
7058 if (NeedsWidening) {
7059 if (Sz % GroupSize != 0)
7060 return false;
7061
7062 if (StrideWithinGroup != 1)
7063 return false;
7064 VecSz = Sz / GroupSize;
7065 NewScalarTy = Type::getIntNTy(
7066 SE->getContext(),
7067 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7068 }
7069
7070 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7071 return false;
7072
7073 int64_t StrideIntVal = StrideWithinGroup;
7074 if (NeedsWidening) {
7075 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7076 // Check that the strides between groups are all the same.
7077 unsigned CurrentGroupStartIdx = GroupSize;
7078 int64_t StrideBetweenGroups =
7079 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7080 StrideIntVal = StrideBetweenGroups;
7081 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7082 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7083 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7084 StrideBetweenGroups)
7085 return false;
7086 }
7087
7088 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7089 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7090 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7091 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7092 return GroupEndIdx - StartIdx == GroupSize;
7093 };
7094 for (unsigned I = 0; I < Sz; I += GroupSize) {
7095 if (!CheckGroup(I))
7096 return false;
7097 }
7098 }
7099
7100 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7101 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
7102 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7103 return true;
7104}
7105
7106 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
7107                                        Type *ScalarTy, Align CommonAlignment,
7108 SmallVectorImpl<unsigned> &SortedIndices,
7109 StridedPtrInfo &SPtrInfo) const {
7110 const unsigned Sz = PointerOps.size();
7111 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
7112 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7113 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7114 return false;
7115 if (const SCEV *Stride =
7116 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
7117 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
7118 SPtrInfo.StrideSCEV = Stride;
7119 return true;
7120 }
7121 return false;
7122}
7123
7124 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
7125     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7126 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7127 unsigned *BestVF, bool TryRecursiveCheck) const {
7128 // Check that a vectorized load would load the same memory as a scalar
7129 // load. For example, we don't want to vectorize loads that are smaller
7130 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7131 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7132 // from such a struct, we read/write packed bits disagreeing with the
7133 // unvectorized version.
7134 if (BestVF)
7135 *BestVF = 0;
7136   if (areKnownNonVectorizableLoads(VL))
7137     return LoadsState::Gather;
7138 Type *ScalarTy = VL0->getType();
7139
7140 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7141 return LoadsState::Gather;
7142
7143 // Make sure all loads in the bundle are simple - we can't vectorize
7144 // atomic or volatile loads.
7145 PointerOps.clear();
7146 const size_t Sz = VL.size();
7147 PointerOps.resize(Sz);
7148 auto *POIter = PointerOps.begin();
7149 for (Value *V : VL) {
7150 auto *L = dyn_cast<LoadInst>(V);
7151 if (!L || !L->isSimple())
7152 return LoadsState::Gather;
7153 *POIter = L->getPointerOperand();
7154 ++POIter;
7155 }
7156
7157 Order.clear();
7158 // Check the order of pointer operands or that all pointers are the same.
7159 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7160
7161 auto *VecTy = getWidenedType(ScalarTy, Sz);
7162 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7163 if (!IsSorted) {
7164 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7165                                  SPtrInfo))
7166       return LoadsState::StridedVectorize;
7167
7168 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7169 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7170 return LoadsState::Gather;
7171
7172 if (!all_of(PointerOps, [&](Value *P) {
7173 return arePointersCompatible(P, PointerOps.front(), *TLI);
7174 }))
7175 return LoadsState::Gather;
7176
7177 } else {
7178 Value *Ptr0;
7179 Value *PtrN;
7180 if (Order.empty()) {
7181 Ptr0 = PointerOps.front();
7182 PtrN = PointerOps.back();
7183 } else {
7184 Ptr0 = PointerOps[Order.front()];
7185 PtrN = PointerOps[Order.back()];
7186 }
7187 std::optional<int64_t> Diff =
7188 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7189 // Check that the sorted loads are consecutive.
7190 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7191 return LoadsState::Vectorize;
7192 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7193 *TLI, [&](Value *V) {
7194 return areAllUsersVectorized(
7195 cast<Instruction>(V), UserIgnoreList);
7196                          }))
7197       return LoadsState::CompressVectorize;
7198 Align Alignment =
7199 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7200 ->getAlign();
7201 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7202                                        *Diff, Ptr0, PtrN, SPtrInfo))
7203       return LoadsState::StridedVectorize;
7204 }
7205 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7206 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7207 return LoadsState::Gather;
7208   // Compare the cost of loads + shuffles with the cost of strided/masked
7209   // gather loads. Returns true if the vectorized + shuffles representation
7210   // is better than just gather.
7211 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7212 unsigned *BestVF,
7213 bool ProfitableGatherPointers) {
7214 if (BestVF)
7215 *BestVF = 0;
7216 // Compare masked gather cost and loads + insert subvector costs.
7217     constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7218     auto [ScalarGEPCost, VectorGEPCost] =
7219 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7220 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7221 // Estimate the cost of masked gather GEP. If not a splat, roughly
7222 // estimate as a buildvector, otherwise estimate as splat.
7223 APInt DemandedElts = APInt::getAllOnes(Sz);
7224 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7225 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7226 if (static_cast<unsigned>(count_if(
7227 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7228 any_of(PointerOps, [&](Value *V) {
7229 return getUnderlyingObject(V) !=
7230 getUnderlyingObject(PointerOps.front());
7231 }))
7232 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7233 DemandedElts, /*Insert=*/true,
7234 /*Extract=*/false, CostKind);
7235 else
7236 VectorGEPCost +=
7238 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7239 /*Insert=*/true, /*Extract=*/false, CostKind) +
7240 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7241 // The cost of scalar loads.
7242 InstructionCost ScalarLoadsCost =
7243 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7244 [&](InstructionCost C, Value *V) {
7245 return C + TTI.getInstructionCost(
7246                                            cast<Instruction>(V), CostKind);
7247                           }) +
7248 ScalarGEPCost;
7249 // The cost of masked gather.
7250 InstructionCost MaskedGatherCost =
7251 TTI.getMemIntrinsicInstrCost(
7252 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7254 /*VariableMask=*/false, CommonAlignment),
7255 CostKind) +
7256 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7257 InstructionCost GatherCost =
7258 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7259 /*Insert=*/true,
7260 /*Extract=*/false, CostKind) +
7261 ScalarLoadsCost;
7262     // The list of loads is small, or the partial check was already performed -
7263     // directly compare the masked gather cost and the gather cost.
7264 constexpr unsigned ListLimit = 4;
7265 if (!TryRecursiveCheck || VL.size() < ListLimit)
7266 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7267
7268 // FIXME: The following code has not been updated for non-power-of-2
7269 // vectors (and not whole registers). The splitting logic here does not
7270 // cover the original vector if the vector factor is not a power of two.
7271 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7272 return false;
7273
7274 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7275 unsigned MinVF = getMinVF(2 * Sz);
7276 DemandedElts.clearAllBits();
7277 // Iterate through possible vectorization factors and check if vectorized +
7278 // shuffles is better than just gather.
7279 for (unsigned VF =
7280 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7281 VF >= MinVF;
7282 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7283       SmallVector<LoadsState> States;
7284       for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7285 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7286         SmallVector<unsigned> Order;
7287         SmallVector<Value *> PointerOps;
7288 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7289 PointerOps, SPtrInfo, BestVF,
7290 /*TryRecursiveCheck=*/false);
7291 // Check that the sorted loads are consecutive.
7292 if (LS == LoadsState::Gather) {
7293 if (BestVF) {
7294 DemandedElts.setAllBits();
7295 break;
7296 }
7297 DemandedElts.setBits(Cnt, Cnt + VF);
7298 continue;
7299 }
7300         // If reordering is needed - consider it a high-cost masked gather for now.
7301         if ((LS == LoadsState::Vectorize ||
7302              LS == LoadsState::StridedVectorize ||
7303              LS == LoadsState::CompressVectorize) &&
7304             !Order.empty() && !isReverseOrder(Order))
7305           LS = LoadsState::ScatterVectorize;
7306         States.push_back(LS);
7307 }
7308 if (DemandedElts.isAllOnes())
7309 // All loads gathered - try smaller VF.
7310 continue;
7311       // Can be vectorized later as a series of loads/insertelements.
7312 InstructionCost VecLdCost = 0;
7313 if (!DemandedElts.isZero()) {
7314 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7315 /*Insert=*/true,
7316 /*Extract=*/false, CostKind) +
7317 ScalarGEPCost;
7318 for (unsigned Idx : seq<unsigned>(VL.size()))
7319 if (DemandedElts[Idx])
7320 VecLdCost +=
7321 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7322 }
7323 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7324 for (auto [I, LS] : enumerate(States)) {
7325 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7326 InstructionCost VectorGEPCost =
7327 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7328 ? 0
7329 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7330 LI0->getPointerOperand(),
7331 Instruction::GetElementPtr, CostKind, ScalarTy,
7332 SubVecTy)
7333 .second;
7334 if (LS == LoadsState::ScatterVectorize) {
7335 if (static_cast<unsigned>(
7336 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7337 PointerOps.size() - 1 ||
7338 any_of(PointerOps, [&](Value *V) {
7339 return getUnderlyingObject(V) !=
7340 getUnderlyingObject(PointerOps.front());
7341 }))
7342 VectorGEPCost += getScalarizationOverhead(
7343 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7344 /*Insert=*/true, /*Extract=*/false, CostKind);
7345 else
7346 VectorGEPCost +=
7348 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7349 /*Insert=*/true, /*Extract=*/false, CostKind) +
7350 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7351 CostKind);
7352 }
7353 switch (LS) {
7354       case LoadsState::Vectorize:
7355         VecLdCost +=
7356 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7357 LI0->getPointerAddressSpace(), CostKind,
7358                                 TTI::OperandValueInfo()) +
7359             VectorGEPCost;
7360 break;
7361       case LoadsState::StridedVectorize:
7362         VecLdCost += TTI.getMemIntrinsicInstrCost(
7363                          MemIntrinsicCostAttributes(
7364                              Intrinsic::experimental_vp_strided_load,
7365 SubVecTy, LI0->getPointerOperand(),
7366 /*VariableMask=*/false, CommonAlignment),
7367 CostKind) +
7368 VectorGEPCost;
7369 break;
7370       case LoadsState::CompressVectorize:
7371         VecLdCost += TTI.getMemIntrinsicInstrCost(
7372                          MemIntrinsicCostAttributes(
7373                              Intrinsic::masked_load, SubVecTy,
7374 CommonAlignment, LI0->getPointerAddressSpace()),
7375 CostKind) +
7376                      ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7377                                       {}, CostKind);
7378 break;
7379       case LoadsState::ScatterVectorize:
7380         VecLdCost += TTI.getMemIntrinsicInstrCost(
7381                          MemIntrinsicCostAttributes(
7382                              Intrinsic::masked_gather, SubVecTy,
7383 LI0->getPointerOperand(),
7384 /*VariableMask=*/false, CommonAlignment),
7385 CostKind) +
7386 VectorGEPCost;
7387 break;
7388 case LoadsState::Gather:
7389 // Gathers are already calculated - ignore.
7390 continue;
7391 }
7392 SmallVector<int> ShuffleMask(VL.size());
7393 for (int Idx : seq<int>(0, VL.size()))
7394 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7395 if (I > 0)
7396 VecLdCost +=
7397 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7398 CostKind, I * VF, SubVecTy);
7399 }
7400 // If masked gather cost is higher - better to vectorize, so
7401 // consider it as a gather node. It will be better estimated
7402 // later.
7403 if (MaskedGatherCost >= VecLdCost &&
7404 VecLdCost - GatherCost < -SLPCostThreshold) {
7405 if (BestVF)
7406 *BestVF = VF;
7407 return true;
7408 }
7409 }
7410 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7411 };
7412 // TODO: need to improve analysis of the pointers, if not all of them are
7413 // GEPs or have > 2 operands, we end up with a gather node, which just
7414 // increases the cost.
7415 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7416 bool ProfitableGatherPointers =
7417 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7418 return L->isLoopInvariant(V);
7419 })) <= Sz / 2;
7420 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7421         auto *GEP = dyn_cast<GetElementPtrInst>(P);
7422         return (!GEP && doesNotNeedToBeScheduled(P)) ||
7423 (GEP && GEP->getNumOperands() == 2 &&
7424 isa<Constant, Instruction>(GEP->getOperand(1)));
7425 })) {
7426 // Check if potential masked gather can be represented as series
7427 // of loads + insertsubvectors.
7428 // If masked gather cost is higher - better to vectorize, so
7429 // consider it as a gather node. It will be better estimated
7430 // later.
7431 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7432 ProfitableGatherPointers))
7433       return LoadsState::ScatterVectorize;
7434   }
7435
7436 return LoadsState::Gather;
7437}
7438
7439 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7440                                    ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7441 const DataLayout &DL, ScalarEvolution &SE,
7442 SmallVectorImpl<unsigned> &SortedIndices) {
7443 assert(
7444 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7445 "Expected list of pointer operands.");
7446   // Map from bases to a vector of (Ptr, Offset, OrigIdx) tuples. We insert
7447   // each Ptr into the map, sort each vector by offset, and return the sorted
7448   // indices so that related values end up next to one another.
7450 std::pair<BasicBlock *, Value *>,
7452 Bases;
7453 Bases
7454 .try_emplace(std::make_pair(
7456 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7457
7458 SortedIndices.clear();
7459 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7460 auto Key = std::make_pair(BBs[Cnt + 1],
7462 bool Found = any_of(Bases.try_emplace(Key).first->second,
7463 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7464 std::optional<int64_t> Diff =
7465 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7466 ElemTy, Ptr, DL, SE,
7467 /*StrictCheck=*/true);
7468 if (!Diff)
7469 return false;
7470
7471 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7472 return true;
7473 });
7474
7475 if (!Found) {
7476 // If we haven't found enough to usefully cluster, return early.
7477 if (Bases.size() > VL.size() / 2 - 1)
7478 return false;
7479
7480 // Not found already - add a new Base
7481 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7482 }
7483 }
7484
7485 if (Bases.size() == VL.size())
7486 return false;
7487
7488 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7489 Bases.front().second.size() == VL.size()))
7490 return false;
7491
7492   // For each of the bases, sort the pointers by offset and check if any of
7493   // the bases become consecutively allocated.
7494 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7495 SmallPtrSet<Value *, 13> FirstPointers;
7496 SmallPtrSet<Value *, 13> SecondPointers;
7497 Value *P1 = Ptr1;
7498 Value *P2 = Ptr2;
7499 unsigned Depth = 0;
7500 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7501 if (P1 == P2 || Depth > RecursionMaxDepth)
7502 return false;
7503 FirstPointers.insert(P1);
7504 SecondPointers.insert(P2);
7505 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7506 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7507 ++Depth;
7508 }
7509 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7510 "Unable to find matching root.");
7511 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7512 };
7513 for (auto &Base : Bases) {
7514 for (auto &Vec : Base.second) {
7515 if (Vec.size() > 1) {
7517 int64_t InitialOffset = std::get<1>(Vec[0]);
7518 bool AnyConsecutive =
7519 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7520 return std::get<1>(P.value()) ==
7521 int64_t(P.index()) + InitialOffset;
7522 });
7523         // Fill the SortedIndices array only if it looks worthwhile to sort
7524         // the pointers.
7525 if (!AnyConsecutive)
7526 return false;
7527 }
7528 }
7529 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7530 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7531 });
7532 }
7533
7534 for (auto &T : Bases)
7535 for (const auto &Vec : T.second)
7536 for (const auto &P : Vec)
7537 SortedIndices.push_back(std::get<2>(P));
7538
7539 assert(SortedIndices.size() == VL.size() &&
7540 "Expected SortedIndices to be the size of VL");
7541 return true;
7542}
7543
7544std::optional<BoUpSLP::OrdersType>
7545BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7546 assert(TE.isGather() && "Expected gather node only.");
7547 Type *ScalarTy = TE.Scalars[0]->getType();
7548
7549   SmallVector<Value *> Ptrs;
7550   Ptrs.reserve(TE.Scalars.size());
7551   SmallVector<BasicBlock *> BBs;
7552   BBs.reserve(TE.Scalars.size());
7553 for (Value *V : TE.Scalars) {
7554 auto *L = dyn_cast<LoadInst>(V);
7555 if (!L || !L->isSimple())
7556 return std::nullopt;
7557 Ptrs.push_back(L->getPointerOperand());
7558 BBs.push_back(L->getParent());
7559 }
7560
7561 BoUpSLP::OrdersType Order;
7562 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7563 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7564 return std::move(Order);
7565 return std::nullopt;
7566}
7567
7568/// Check if two insertelement instructions are from the same buildvector.
7569 static bool areTwoInsertFromSameBuildVector(
7570     InsertElementInst *VU, InsertElementInst *V,
7571     function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7572 // Instructions must be from the same basic blocks.
7573 if (VU->getParent() != V->getParent())
7574 return false;
7575 // Checks if 2 insertelements are from the same buildvector.
7576 if (VU->getType() != V->getType())
7577 return false;
7578 // Multiple used inserts are separate nodes.
7579 if (!VU->hasOneUse() && !V->hasOneUse())
7580 return false;
7581 auto *IE1 = VU;
7582 auto *IE2 = V;
7583 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7584 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7585 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7586 return false;
7587 // Go through the vector operand of insertelement instructions trying to find
7588 // either VU as the original vector for IE2 or V as the original vector for
7589 // IE1.
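  // E.g. for the chain
  //   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
  //   %v1 = insertelement <4 x i32> %v0,    i32 %b, i32 1
  // the walk from %v1 reaches %v0, so the two inserts are recognized as parts
  // of the same buildvector (provided the insert indices are distinct and the
  // intermediate insert has a single use).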
7590 SmallBitVector ReusedIdx(
7591 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7592 bool IsReusedIdx = false;
7593 do {
7594 if (IE2 == VU && !IE1)
7595 return VU->hasOneUse();
7596 if (IE1 == V && !IE2)
7597 return V->hasOneUse();
7598 if (IE1 && IE1 != V) {
7599 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7600 IsReusedIdx |= ReusedIdx.test(Idx1);
7601 ReusedIdx.set(Idx1);
7602 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7603 IE1 = nullptr;
7604 else
7605 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7606 }
7607 if (IE2 && IE2 != VU) {
7608 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7609 IsReusedIdx |= ReusedIdx.test(Idx2);
7610 ReusedIdx.set(Idx2);
7611 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7612 IE2 = nullptr;
7613 else
7614 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7615 }
7616 } while (!IsReusedIdx && (IE1 || IE2));
7617 return false;
7618}
7619
7620/// Checks if the specified instruction \p I is an alternate operation for
7621/// the given \p MainOp and \p AltOp instructions.
7622static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7623 Instruction *AltOp,
7624 const TargetLibraryInfo &TLI);
7625
7626std::optional<BoUpSLP::OrdersType>
7627BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7628 bool IgnoreReorder) {
7629   // No need to reorder if we need to shuffle reuses; the node still needs to
7630   // be shuffled.
7631 if (!TE.ReuseShuffleIndices.empty()) {
7632 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7633 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7634 "Reshuffling scalars not yet supported for nodes with padding");
7635
7636 if (isSplat(TE.Scalars))
7637 return std::nullopt;
7638 // Check if reuse shuffle indices can be improved by reordering.
7639   // For this, check that the reuse mask is "clustered", i.e. each scalar
7640   // value is used once in each submask of size <number_of_scalars>.
7641 // Example: 4 scalar values.
7642 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7643 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7644 // element 3 is used twice in the second submask.
7645 unsigned Sz = TE.Scalars.size();
7646 if (TE.isGather()) {
7647 if (std::optional<OrdersType> CurrentOrder =
7648 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7649 SmallVector<int> Mask;
7650 fixupOrderingIndices(*CurrentOrder);
7651 inversePermutation(*CurrentOrder, Mask);
7652 ::addMask(Mask, TE.ReuseShuffleIndices);
7653 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7654 unsigned Sz = TE.Scalars.size();
7655 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7656 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7657 if (Idx != PoisonMaskElem)
7658 Res[Idx + K * Sz] = I + K * Sz;
7659 }
7660 return std::move(Res);
7661 }
7662 }
7663 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7664 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7665 2 * TE.getVectorFactor())) == 1)
7666 return std::nullopt;
7667 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7668 return std::nullopt;
7669 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7670 Sz)) {
7671 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7672 if (TE.ReorderIndices.empty())
7673 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7674 else
7675 inversePermutation(TE.ReorderIndices, ReorderMask);
7676 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7677 unsigned VF = ReorderMask.size();
7678 OrdersType ResOrder(VF, VF);
7679 unsigned NumParts = divideCeil(VF, Sz);
7680 SmallBitVector UsedVals(NumParts);
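      // Each Sz-wide chunk of the combined mask must effectively splat a
      // single value Val < NumParts (poison entries are tolerated up to
      // Sz / 2, and no two chunks may splat the same Val); ResOrder then
      // records, per chunk, which part it reads from.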
7681 for (unsigned I = 0; I < VF; I += Sz) {
7682 int Val = PoisonMaskElem;
7683 unsigned UndefCnt = 0;
7684 unsigned Limit = std::min(Sz, VF - I);
7685 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7686 [&](int Idx) {
7687 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7688 Val = Idx;
7689 if (Idx == PoisonMaskElem)
7690 ++UndefCnt;
7691 return Idx != PoisonMaskElem && Idx != Val;
7692 }) ||
7693 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7694 UndefCnt > Sz / 2)
7695 return std::nullopt;
7696 UsedVals.set(Val);
7697 for (unsigned K = 0; K < NumParts; ++K) {
7698 unsigned Idx = Val + Sz * K;
7699 if (Idx < VF && I + K < VF)
7700 ResOrder[Idx] = I + K;
7701 }
7702 }
7703 return std::move(ResOrder);
7704 }
7705 unsigned VF = TE.getVectorFactor();
7706   // Try to build the correct order for extractelement instructions.
7707 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7708 TE.ReuseShuffleIndices.end());
7709 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7710 all_of(TE.Scalars, [Sz](Value *V) {
7711 if (isa<PoisonValue>(V))
7712 return true;
7713 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7714 return Idx && *Idx < Sz;
7715 })) {
7716 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7717 "by BinaryOperator and CastInst.");
7718 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7719 if (TE.ReorderIndices.empty())
7720 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7721 else
7722 inversePermutation(TE.ReorderIndices, ReorderMask);
7723 for (unsigned I = 0; I < VF; ++I) {
7724 int &Idx = ReusedMask[I];
7725 if (Idx == PoisonMaskElem)
7726 continue;
7727 Value *V = TE.Scalars[ReorderMask[Idx]];
7728 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7729 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7730 }
7731 }
7732   // Build the order of VF size; the reuses shuffles need to be reordered, as
7733   // they are always of VF size.
7734 OrdersType ResOrder(VF);
7735 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7736 auto *It = ResOrder.begin();
7737 for (unsigned K = 0; K < VF; K += Sz) {
7738 OrdersType CurrentOrder(TE.ReorderIndices);
7739 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7740 if (SubMask.front() == PoisonMaskElem)
7741 std::iota(SubMask.begin(), SubMask.end(), 0);
7742 reorderOrder(CurrentOrder, SubMask);
7743 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7744 std::advance(It, Sz);
7745 }
7746 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7747 return Data.index() == Data.value();
7748 }))
7749 return std::nullopt; // No need to reorder.
7750 return std::move(ResOrder);
7751 }
7752 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7753 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7754 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7755 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7756 return std::nullopt;
7757 if (TE.State == TreeEntry::SplitVectorize ||
7758 ((TE.State == TreeEntry::Vectorize ||
7759 TE.State == TreeEntry::StridedVectorize ||
7760 TE.State == TreeEntry::CompressVectorize) &&
7761        (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7762         (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7763 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7764 "Alternate instructions are only supported by "
7765 "BinaryOperator and CastInst.");
7766 return TE.ReorderIndices;
7767 }
7768 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7769 TE.isAltShuffle()) {
7770 assert(TE.ReuseShuffleIndices.empty() &&
7771 "ReuseShuffleIndices should be "
7772 "empty for alternate instructions.");
7773 SmallVector<int> Mask;
7774 TE.buildAltOpShuffleMask(
7775 [&](Instruction *I) {
7776 assert(TE.getMatchingMainOpOrAltOp(I) &&
7777 "Unexpected main/alternate opcode");
7778 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7779 },
7780 Mask);
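    // In effect, Mask[I] selects the source lane (offset by VF for the
    // alternate opcode) that produces result lane I; taking it modulo VF and
    // inverting gives the reordering of the scalars.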
7781 const int VF = TE.getVectorFactor();
7782 OrdersType ResOrder(VF, VF);
7783 for (unsigned I : seq<unsigned>(VF)) {
7784 if (Mask[I] == PoisonMaskElem)
7785 continue;
7786 ResOrder[Mask[I] % VF] = I;
7787 }
7788 return std::move(ResOrder);
7789 }
7790 if (!TE.ReorderIndices.empty())
7791 return TE.ReorderIndices;
7792 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7793 if (!TE.ReorderIndices.empty())
7794 return TE.ReorderIndices;
7795
7796 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7797 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7798 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7799 continue;
7800 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7801 if (!II)
7802 continue;
7803 Instruction *BVHead = nullptr;
7804 BasicBlock *BB = II->getParent();
7805 while (II && II->hasOneUse() && II->getParent() == BB) {
7806 BVHead = II;
7807 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7808 }
7809 I = BVHead;
7810 }
7811
7812 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7813 assert(BB1 != BB2 && "Expected different basic blocks.");
7814 if (!DT->isReachableFromEntry(BB1))
7815 return false;
7816 if (!DT->isReachableFromEntry(BB2))
7817 return true;
7818 auto *NodeA = DT->getNode(BB1);
7819 auto *NodeB = DT->getNode(BB2);
7820 assert(NodeA && "Should only process reachable instructions");
7821 assert(NodeB && "Should only process reachable instructions");
7822 assert((NodeA == NodeB) ==
7823 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7824 "Different nodes should have different DFS numbers");
7825 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7826 };
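    // Roughly, PHICompare orders the phi lanes by their first user: lanes
    // feeding insertelement chains come first (grouped by buildvector head and
    // increasing insert index), then lanes feeding extractelements (grouped by
    // source vector and increasing extract index), so related lanes end up
    // adjacent.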
7827 auto PHICompare = [&](unsigned I1, unsigned I2) {
7828 Value *V1 = TE.Scalars[I1];
7829 Value *V2 = TE.Scalars[I2];
7830 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7831 return false;
7832 if (isa<PoisonValue>(V1))
7833 return true;
7834 if (isa<PoisonValue>(V2))
7835 return false;
7836 if (V1->getNumUses() < V2->getNumUses())
7837 return true;
7838 if (V1->getNumUses() > V2->getNumUses())
7839 return false;
7840 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7841 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7842 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7843 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7844 FirstUserOfPhi2->getParent());
7845 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7846 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7847 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7848 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7849 if (IE1 && !IE2)
7850 return true;
7851 if (!IE1 && IE2)
7852 return false;
7853 if (IE1 && IE2) {
7854 if (UserBVHead[I1] && !UserBVHead[I2])
7855 return true;
7856 if (!UserBVHead[I1])
7857 return false;
7858 if (UserBVHead[I1] == UserBVHead[I2])
7859 return getElementIndex(IE1) < getElementIndex(IE2);
7860 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7861 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7862 UserBVHead[I2]->getParent());
7863 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7864 }
7865 if (EE1 && !EE2)
7866 return true;
7867 if (!EE1 && EE2)
7868 return false;
7869 if (EE1 && EE2) {
7870 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7871 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7872 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7873 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7874 if (!Inst2 && !P2)
7875 return Inst1 || P1;
7876 if (EE1->getOperand(0) == EE2->getOperand(0))
7877 return getElementIndex(EE1) < getElementIndex(EE2);
7878 if (!Inst1 && Inst2)
7879 return false;
7880 if (Inst1 && Inst2) {
7881 if (Inst1->getParent() != Inst2->getParent())
7882 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7883 return Inst1->comesBefore(Inst2);
7884 }
7885 if (!P1 && P2)
7886 return false;
7887 assert(P1 && P2 &&
7888 "Expected either instructions or arguments vector operands.");
7889 return P1->getArgNo() < P2->getArgNo();
7890 }
7891 return false;
7892 };
7893 OrdersType Phis(TE.Scalars.size());
7894 std::iota(Phis.begin(), Phis.end(), 0);
7895 stable_sort(Phis, PHICompare);
7896 if (isIdentityOrder(Phis))
7897 return std::nullopt; // No need to reorder.
7898 return std::move(Phis);
7899 }
7900 if (TE.isGather() &&
7901 (!TE.hasState() || !TE.isAltShuffle() ||
7902 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7903 allSameType(TE.Scalars)) {
7904 // TODO: add analysis of other gather nodes with extractelement
7905 // instructions and other values/instructions, not only undefs.
7906 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7907         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7908          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7909 all_of(TE.Scalars, [](Value *V) {
7910 auto *EE = dyn_cast<ExtractElementInst>(V);
7911 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7912 })) {
7913 // Check that gather of extractelements can be represented as
7914 // just a shuffle of a single vector.
7915 OrdersType CurrentOrder;
7916 bool Reuse =
7917 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7918 if (Reuse || !CurrentOrder.empty())
7919 return std::move(CurrentOrder);
7920 }
7921 // If the gather node is <undef, v, .., poison> and
7922 // insertelement poison, v, 0 [+ permute]
7923 // is cheaper than
7924 // insertelement poison, v, n - try to reorder.
7925 // If rotating the whole graph, exclude the permute cost, the whole graph
7926 // might be transformed.
7927 int Sz = TE.Scalars.size();
7928 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7929 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7930 const auto *It = find_if_not(TE.Scalars, isConstant);
7931 if (It == TE.Scalars.begin())
7932 return OrdersType();
7933 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7934 if (It != TE.Scalars.end()) {
7935 OrdersType Order(Sz, Sz);
7936 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7937 Order[Idx] = 0;
7938 fixupOrderingIndices(Order);
7939 SmallVector<int> Mask;
7940 inversePermutation(Order, Mask);
7941 InstructionCost PermuteCost =
7942 TopToBottom
7943 ? 0
7944 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7945 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7946 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7947 PoisonValue::get(Ty), *It);
7948 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7949 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7950 PoisonValue::get(Ty), *It);
7951 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7952 OrdersType Order(Sz, Sz);
7953 Order[Idx] = 0;
7954 return std::move(Order);
7955 }
7956 }
7957 }
7958 if (isSplat(TE.Scalars))
7959 return std::nullopt;
7960 if (TE.Scalars.size() >= 3)
7961 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7962 return Order;
7963   // Check if we can include the order of vectorized loads. For masked gathers
7964   // do extra analysis later, so include such nodes into a special list.
7965 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7966 SmallVector<Value *> PointerOps;
7967 StridedPtrInfo SPtrInfo;
7968 OrdersType CurrentOrder;
7969 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7970 CurrentOrder, PointerOps, SPtrInfo);
7971     if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7972         Res == LoadsState::CompressVectorize)
7973       return std::move(CurrentOrder);
7974 }
7975   // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7976   // has been audited for correctness with non-power-of-two vectors.
7977 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7978 if (std::optional<OrdersType> CurrentOrder =
7979 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7980 return CurrentOrder;
7981 }
7982 return std::nullopt;
7983}
7984
7985/// Checks if the given mask is a "clustered" mask with the same clusters of
7986/// size \p Sz, which are not identity submasks.
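/// E.g. with \p Sz == 4 the mask <1,0,3,2, 1,0,3,2> is such a mask, while
/// <0,1,2,3, 0,1,2,3> is not (its clusters are identity submasks).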
7987 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7988                                                unsigned Sz) {
7989 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7990 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7991 return false;
7992 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7993 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7994 if (Cluster != FirstCluster)
7995 return false;
7996 }
7997 return true;
7998}
7999
8000void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8001 // Reorder reuses mask.
8002 reorderReuses(TE.ReuseShuffleIndices, Mask);
8003 const unsigned Sz = TE.Scalars.size();
8004   // For vectorized nodes and non-clustered reuses there is nothing else to do.
8005   if (!TE.isGather() ||
8006       !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8007                                                    Sz) ||
8008 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8009 return;
8010 SmallVector<int> NewMask;
8011 inversePermutation(TE.ReorderIndices, NewMask);
8012 addMask(NewMask, TE.ReuseShuffleIndices);
8013 // Clear reorder since it is going to be applied to the new mask.
8014 TE.ReorderIndices.clear();
8015 // Try to improve gathered nodes with clustered reuses, if possible.
8016 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8017 SmallVector<unsigned> NewOrder(Slice);
8018 inversePermutation(NewOrder, NewMask);
8019 reorderScalars(TE.Scalars, NewMask);
8020 // Fill the reuses mask with the identity submasks.
8021 for (auto *It = TE.ReuseShuffleIndices.begin(),
8022 *End = TE.ReuseShuffleIndices.end();
8023 It != End; std::advance(It, Sz))
8024 std::iota(It, std::next(It, Sz), 0);
8025}
8026
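// Fills the unset entries (value == size) of Order with still-unused indices,
// either taking them from SecondaryOrder or, if it is empty, defaulting to the
// identity where that index is still free. E.g. combining Order = {1, U, 0, U}
// (U = unset) with SecondaryOrder = {1, 3, 0, 2} yields {1, 3, 0, 2}.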
8027 static void combineOrders(MutableArrayRef<unsigned> Order,
8028                           ArrayRef<unsigned> SecondaryOrder) {
8029 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8030 "Expected same size of orders");
8031 size_t Sz = Order.size();
8032 SmallBitVector UsedIndices(Sz);
8033 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8034 if (Order[Idx] != Sz)
8035 UsedIndices.set(Order[Idx]);
8036 }
8037 if (SecondaryOrder.empty()) {
8038 for (unsigned Idx : seq<unsigned>(0, Sz))
8039 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8040 Order[Idx] = Idx;
8041 } else {
8042 for (unsigned Idx : seq<unsigned>(0, Sz))
8043 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8044 !UsedIndices.test(SecondaryOrder[Idx]))
8045 Order[Idx] = SecondaryOrder[Idx];
8046 }
8047}
8048
8051 return false;
8052
8053 constexpr unsigned TinyVF = 2;
8054 constexpr unsigned TinyTree = 10;
8055 constexpr unsigned PhiOpsLimit = 12;
8056 constexpr unsigned GatherLoadsLimit = 2;
8057 if (VectorizableTree.size() <= TinyTree)
8058 return true;
8059 if (VectorizableTree.front()->hasState() &&
8060 !VectorizableTree.front()->isGather() &&
8061 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8062 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8063 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8064 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8065 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8066 VectorizableTree.front()->ReorderIndices.empty()) {
8067     // Check if the tree has only a single store and a single (unordered) load
8068     // node, while the other nodes are phis or geps/binops combined with phis,
8069     // and/or a single gather load node.
8070 if (VectorizableTree.front()->hasState() &&
8071 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8072 VectorizableTree.front()->Scalars.size() == TinyVF &&
8073 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8074 return false;
8075     // A single node which requires reordering - skip.
8076 if (VectorizableTree.front()->hasState() &&
8077 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8078 VectorizableTree.front()->ReorderIndices.empty()) {
8079 const unsigned ReorderedSplitsCnt =
8080 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8081 return TE->State == TreeEntry::SplitVectorize &&
8082 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8083 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8084 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8085 });
8086 if (ReorderedSplitsCnt <= 1 &&
8087 static_cast<unsigned>(count_if(
8088 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8089 return ((!TE->isGather() &&
8090 (TE->ReorderIndices.empty() ||
8091 (TE->UserTreeIndex.UserTE &&
8092 TE->UserTreeIndex.UserTE->State ==
8093 TreeEntry::Vectorize &&
8094 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8095 .empty()))) ||
8096 (TE->isGather() && TE->ReorderIndices.empty() &&
8097 (!TE->hasState() || TE->isAltShuffle() ||
8098 TE->getOpcode() == Instruction::Load ||
8099 TE->getOpcode() == Instruction::ZExt ||
8100 TE->getOpcode() == Instruction::SExt))) &&
8101 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8102 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8103 return !isConstant(V) && isVectorized(V);
8104 }));
8105 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8106 return false;
8107 }
8108 bool HasPhis = false;
8109 bool HasLoad = true;
8110 unsigned GatherLoads = 0;
8111 for (const std::unique_ptr<TreeEntry> &TE :
8112 ArrayRef(VectorizableTree).drop_front()) {
8113 if (TE->State == TreeEntry::SplitVectorize)
8114 continue;
8115 if (!TE->hasState()) {
8116 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8118 continue;
8119 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8121 continue;
8122 return true;
8123 }
8124 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8125 if (!TE->isGather()) {
8126 HasLoad = false;
8127 continue;
8128 }
8129 if (HasLoad)
8130 return true;
8131 ++GatherLoads;
8132 if (GatherLoads >= GatherLoadsLimit)
8133 return true;
8134 }
8135 if (TE->getOpcode() == Instruction::GetElementPtr ||
8136 Instruction::isBinaryOp(TE->getOpcode()))
8137 continue;
8138 if (TE->getOpcode() != Instruction::PHI &&
8139 (!TE->hasCopyableElements() ||
8140 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8141 TE->Scalars.size() / 2))
8142 return true;
8143 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8144 TE->getNumOperands() > PhiOpsLimit)
8145 return false;
8146 HasPhis = true;
8147 }
8148 return !HasPhis;
8149 }
8150 return true;
8151}
8152
8153void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8154 ArrayRef<int> MaskOrder) {
8155 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8156 SmallVector<int> NewMask(getVectorFactor());
8157 SmallVector<int> NewMaskOrder(getVectorFactor());
8158 std::iota(NewMask.begin(), NewMask.end(), 0);
8159 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8160 if (Idx == 0) {
8161 copy(Mask, NewMask.begin());
8162 copy(MaskOrder, NewMaskOrder.begin());
8163 } else {
8164 assert(Idx == 1 && "Expected either 0 or 1 index.");
8165 unsigned Offset = CombinedEntriesWithIndices.back().second;
8166 for (unsigned I : seq<unsigned>(Mask.size())) {
8167 NewMask[I + Offset] = Mask[I] + Offset;
8168 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8169 }
8170 }
8171 reorderScalars(Scalars, NewMask);
8172 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8173 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8174 ReorderIndices.clear();
8175}
8176
8177 void BoUpSLP::reorderTopToBottom() {
8178   // Maps VF to the graph nodes.
8179   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8180   // ExtractElement gather nodes which can be vectorized and need to handle
8181   // their ordering.
8182   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8183 
8184   // Phi nodes can have preferred ordering based on their result users
8185   DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8186 
8187   // AltShuffles can also have a preferred ordering that leads to fewer
8188   // instructions, e.g., the addsub instruction in x86.
8189   DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8190 
8191   // Maps a TreeEntry to the reorder indices of external users.
8192   DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8193       ExternalUserReorderMap;
8194 // Find all reorderable nodes with the given VF.
8195   // Currently these are vectorized stores, loads, extracts + some gathering
8196   // of extracts.
8197 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8198 const std::unique_ptr<TreeEntry> &TE) {
8199 // Look for external users that will probably be vectorized.
8200 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8201 findExternalStoreUsersReorderIndices(TE.get());
8202 if (!ExternalUserReorderIndices.empty()) {
8203 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8204 ExternalUserReorderMap.try_emplace(TE.get(),
8205 std::move(ExternalUserReorderIndices));
8206 }
8207
8208 // Patterns like [fadd,fsub] can be combined into a single instruction in
8209 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8210 // to take into account their order when looking for the most used order.
8211 if (TE->hasState() && TE->isAltShuffle() &&
8212 TE->State != TreeEntry::SplitVectorize) {
8213 Type *ScalarTy = TE->Scalars[0]->getType();
8214 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8215 unsigned Opcode0 = TE->getOpcode();
8216 unsigned Opcode1 = TE->getAltOpcode();
8217 SmallBitVector OpcodeMask(
8218 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8219 // If this pattern is supported by the target then we consider the order.
8220 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8221 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8222 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8223 }
8224 // TODO: Check the reverse order too.
8225 }
8226
8227 bool IgnoreReorder =
8228 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8229 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8230 VectorizableTree.front()->getOpcode() == Instruction::Store);
8231 if (std::optional<OrdersType> CurrentOrder =
8232 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8233       // Do not include ordering for nodes used in the alt opcode vectorization;
8234       // it is better to reorder them during the bottom-to-top stage. Following
8235       // the order here causes reordering of the whole graph, though it is
8236       // actually profitable just to reorder the subgraph that starts from the
8237       // alternate opcode vectorization node. Such nodes already end up with a
8238       // shuffle instruction, and it is enough to change this shuffle rather
8239       // than rotate the scalars for the whole graph.
8240 unsigned Cnt = 0;
8241 const TreeEntry *UserTE = TE.get();
8242 while (UserTE && Cnt < RecursionMaxDepth) {
8243 if (!UserTE->UserTreeIndex)
8244 break;
8245 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8246 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8247 UserTE->UserTreeIndex.UserTE->Idx != 0)
8248 return;
8249 UserTE = UserTE->UserTreeIndex.UserTE;
8250 ++Cnt;
8251 }
8252 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8253 if (!(TE->State == TreeEntry::Vectorize ||
8254 TE->State == TreeEntry::StridedVectorize ||
8255 TE->State == TreeEntry::SplitVectorize ||
8256 TE->State == TreeEntry::CompressVectorize) ||
8257 !TE->ReuseShuffleIndices.empty())
8258 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8259 if (TE->State == TreeEntry::Vectorize &&
8260 TE->getOpcode() == Instruction::PHI)
8261 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8262 }
8263 });
8264
8265 // Reorder the graph nodes according to their vectorization factor.
8266 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8267 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8268 auto It = VFToOrderedEntries.find(VF);
8269 if (It == VFToOrderedEntries.end())
8270 continue;
8271     // Try to find the most profitable order. We are just looking for the most
8272     // used order and reorder scalar elements in the nodes according to this
8273     // most used order.
8274 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8275 // Delete VF entry upon exit.
8276 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8277
8278 // All operands are reordered and used only in this node - propagate the
8279 // most used order to the user node.
8280     MapVector<OrdersType, unsigned,
8281               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8282         OrdersUses;
8283 for (const TreeEntry *OpTE : OrderedEntries) {
8284       // No need to reorder these nodes; we still need to extend and use a
8285       // shuffle, just merging the reordering shuffle with the reuse shuffle.
8286 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8287 OpTE->State != TreeEntry::SplitVectorize)
8288 continue;
8289 // Count number of orders uses.
8290 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8291 &PhisToOrders]() -> const OrdersType & {
8292 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8293 auto It = GathersToOrders.find(OpTE);
8294 if (It != GathersToOrders.end())
8295 return It->second;
8296 }
8297 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8298 auto It = AltShufflesToOrders.find(OpTE);
8299 if (It != AltShufflesToOrders.end())
8300 return It->second;
8301 }
8302 if (OpTE->State == TreeEntry::Vectorize &&
8303 OpTE->getOpcode() == Instruction::PHI) {
8304 auto It = PhisToOrders.find(OpTE);
8305 if (It != PhisToOrders.end())
8306 return It->second;
8307 }
8308 return OpTE->ReorderIndices;
8309 }();
8310 // First consider the order of the external scalar users.
8311 auto It = ExternalUserReorderMap.find(OpTE);
8312 if (It != ExternalUserReorderMap.end()) {
8313 const auto &ExternalUserReorderIndices = It->second;
8314 // If the OpTE vector factor != number of scalars - use natural order,
8315 // it is an attempt to reorder node with reused scalars but with
8316 // external uses.
8317 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8318 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8319 ExternalUserReorderIndices.size();
8320 } else {
8321 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8322 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8323 }
8324 // No other useful reorder data in this entry.
8325 if (Order.empty())
8326 continue;
8327 }
8328 // Stores actually store the mask, not the order, need to invert.
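      // E.g. a store node whose ReorderIndices are {2, 0, 1} really encodes a
      // mask; inverting it gives the order {1, 2, 0}, which is what gets
      // counted below.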
8329 if (OpTE->State == TreeEntry::Vectorize &&
8330 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8331 assert(!OpTE->isAltShuffle() &&
8332 "Alternate instructions are only supported by BinaryOperator "
8333 "and CastInst.");
8334 SmallVector<int> Mask;
8335 inversePermutation(Order, Mask);
8336 unsigned E = Order.size();
8337 OrdersType CurrentOrder(E, E);
8338 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8339 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8340 });
8341 fixupOrderingIndices(CurrentOrder);
8342 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8343 } else {
8344 ++OrdersUses.try_emplace(Order, 0).first->second;
8345 }
8346 }
8347 if (OrdersUses.empty())
8348 continue;
8349 // Choose the most used order.
8350 unsigned IdentityCnt = 0;
8351 unsigned FilledIdentityCnt = 0;
8352 OrdersType IdentityOrder(VF, VF);
8353 for (auto &Pair : OrdersUses) {
8354 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8355 if (!Pair.first.empty())
8356 FilledIdentityCnt += Pair.second;
8357 IdentityCnt += Pair.second;
8358 combineOrders(IdentityOrder, Pair.first);
8359 }
8360 }
8361 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8362 unsigned Cnt = IdentityCnt;
8363 for (auto &Pair : OrdersUses) {
8364 // Prefer identity order. But, if filled identity found (non-empty order)
8365 // with same number of uses, as the new candidate order, we can choose
8366 // this candidate order.
8367 if (Cnt < Pair.second ||
8368 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8369 Cnt == Pair.second && !BestOrder.empty() &&
8370 isIdentityOrder(BestOrder))) {
8371 combineOrders(Pair.first, BestOrder);
8372 BestOrder = Pair.first;
8373 Cnt = Pair.second;
8374 } else {
8375 combineOrders(BestOrder, Pair.first);
8376 }
8377 }
8378 // Set order of the user node.
8379 if (isIdentityOrder(BestOrder))
8380 continue;
8381 fixupOrderingIndices(BestOrder);
8382 SmallVector<int> Mask;
8383 inversePermutation(BestOrder, Mask);
8384 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8385 unsigned E = BestOrder.size();
8386 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8387 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8388 });
8389 // Do an actual reordering, if profitable.
8390 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8391 // Just do the reordering for the nodes with the given VF.
8392 if (TE->Scalars.size() != VF) {
8393 if (TE->ReuseShuffleIndices.size() == VF) {
8394 assert(TE->State != TreeEntry::SplitVectorize &&
8395 "Split vectorized not expected.");
8396 // Need to reorder the reuses masks of the operands with smaller VF to
8397 // be able to find the match between the graph nodes and scalar
8398 // operands of the given node during vectorization/cost estimation.
8399 assert(
8400 (!TE->UserTreeIndex ||
8401 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8402 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8403 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8404 "All users must be of VF size.");
8405 if (SLPReVec) {
8406 assert(SLPReVec && "Only supported by REVEC.");
8407 // ShuffleVectorInst does not do reorderOperands (and it should not
8408 // because ShuffleVectorInst supports only a limited set of
8409 // patterns). Only do reorderNodeWithReuses if the user is not
8410 // ShuffleVectorInst.
8411 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8412 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8413 continue;
8414 }
8415 // Update ordering of the operands with the smaller VF than the given
8416 // one.
8417 reorderNodeWithReuses(*TE, Mask);
8418 // Update orders in user split vectorize nodes.
8419 if (TE->UserTreeIndex &&
8420 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8421 TE->UserTreeIndex.UserTE->reorderSplitNode(
8422 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8423 }
8424 continue;
8425 }
8426 if ((TE->State == TreeEntry::SplitVectorize &&
8427 TE->ReuseShuffleIndices.empty()) ||
8428 ((TE->State == TreeEntry::Vectorize ||
8429 TE->State == TreeEntry::StridedVectorize ||
8430 TE->State == TreeEntry::CompressVectorize) &&
8431            (isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
8432                 InsertElementInst>(TE->getMainOp()) ||
8433 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8434 assert(
8435 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8436 TE->ReuseShuffleIndices.empty())) &&
8437 "Alternate instructions are only supported by BinaryOperator "
8438 "and CastInst.");
8439 // Build correct orders for extract{element,value}, loads,
8440 // stores and alternate (split) nodes.
8441 reorderOrder(TE->ReorderIndices, Mask);
8442 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8443 TE->reorderOperands(Mask);
8444 } else {
8445 // Reorder the node and its operands.
8446 TE->reorderOperands(Mask);
8447 assert(TE->ReorderIndices.empty() &&
8448 "Expected empty reorder sequence.");
8449 reorderScalars(TE->Scalars, Mask);
8450 }
8451 if (!TE->ReuseShuffleIndices.empty()) {
8452 // Apply reversed order to keep the original ordering of the reused
8453 // elements to avoid extra reorder indices shuffling.
8454 OrdersType CurrentOrder;
8455 reorderOrder(CurrentOrder, MaskOrder);
8456 SmallVector<int> NewReuses;
8457 inversePermutation(CurrentOrder, NewReuses);
8458 addMask(NewReuses, TE->ReuseShuffleIndices);
8459 TE->ReuseShuffleIndices.swap(NewReuses);
8460 } else if (TE->UserTreeIndex &&
8461 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8462 // Update orders in user split vectorize nodes.
8463 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8464 Mask, MaskOrder);
8465 }
8466 }
8467}
8468
8469void BoUpSLP::buildReorderableOperands(
8470 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8471 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8472 SmallVectorImpl<TreeEntry *> &GatherOps) {
8473 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8474 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8475 return OpData.first == I &&
8476 (OpData.second->State == TreeEntry::Vectorize ||
8477 OpData.second->State == TreeEntry::StridedVectorize ||
8478 OpData.second->State == TreeEntry::CompressVectorize ||
8479 OpData.second->State == TreeEntry::SplitVectorize);
8480 }))
8481 continue;
8482 // Do not request operands, if they do not exist.
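    // (Extracts have no vectorizable operands; for inserts the destination
    // vector (operand 0), for vectorized stores the pointer (operand 1) and
    // for vectorized loads the pointer operand are never vectorized, so they
    // are not requested here.)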
8483 if (UserTE->hasState()) {
8484 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8485 UserTE->getOpcode() == Instruction::ExtractValue)
8486 continue;
8487 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8488 continue;
8489 if (UserTE->getOpcode() == Instruction::Store &&
8490 UserTE->State == TreeEntry::Vectorize && I == 1)
8491 continue;
8492 if (UserTE->getOpcode() == Instruction::Load &&
8493 (UserTE->State == TreeEntry::Vectorize ||
8494 UserTE->State == TreeEntry::StridedVectorize ||
8495 UserTE->State == TreeEntry::CompressVectorize))
8496 continue;
8497 }
8498 TreeEntry *TE = getOperandEntry(UserTE, I);
8499 assert(TE && "Expected operand entry.");
8500 if (!TE->isGather()) {
8501 // Add the node to the list of the ordered nodes with the identity
8502 // order.
8503 Edges.emplace_back(I, TE);
8504 // Add ScatterVectorize nodes to the list of operands, where just
8505 // reordering of the scalars is required. Similar to the gathers, so
8506 // simply add to the list of gathered ops.
8507 // If there are reused scalars, process this node as a regular vectorize
8508 // node, just reorder reuses mask.
8509 if (TE->State == TreeEntry::ScatterVectorize &&
8510 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8511 GatherOps.push_back(TE);
8512 continue;
8513 }
8514 if (ReorderableGathers.contains(TE))
8515 GatherOps.push_back(TE);
8516 }
8517}
8518
8519void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8520 struct TreeEntryCompare {
8521 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8522 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8523 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8524 return LHS->Idx < RHS->Idx;
8525 }
8526 };
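  // Roughly, the priority queue below yields entries whose user node has the
  // largest index first, i.e. the deepest parts of the graph are processed
  // before their users, which lets the reordering be propagated bottom-up.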
8527   PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8528   DenseSet<const TreeEntry *> GathersToOrders;
8529   // Find all reorderable leaf nodes with the given VF.
8530   // Currently these are vectorized loads, extracts without alternate operands
8531   // + some gathering of extracts.
8532   SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8533 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8534 if (TE->State != TreeEntry::Vectorize &&
8535 TE->State != TreeEntry::StridedVectorize &&
8536 TE->State != TreeEntry::CompressVectorize &&
8537 TE->State != TreeEntry::SplitVectorize)
8538 NonVectorized.insert(TE.get());
8539 if (std::optional<OrdersType> CurrentOrder =
8540 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8541 Queue.push(TE.get());
8542 if (!(TE->State == TreeEntry::Vectorize ||
8543 TE->State == TreeEntry::StridedVectorize ||
8544 TE->State == TreeEntry::CompressVectorize ||
8545 TE->State == TreeEntry::SplitVectorize) ||
8546 !TE->ReuseShuffleIndices.empty())
8547 GathersToOrders.insert(TE.get());
8548 }
8549 }
8550
8551 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8552 // I.e., if the node has operands, that are reordered, try to make at least
8553 // one operand order in the natural order and reorder others + reorder the
8554 // user node itself.
8555 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8556 while (!Queue.empty()) {
8557 // 1. Filter out only reordered nodes.
8558 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8559 TreeEntry *TE = Queue.top();
8560 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8561 Queue.pop();
8562 SmallVector<TreeEntry *> OrderedOps(1, TE);
8563 while (!Queue.empty()) {
8564 TE = Queue.top();
8565 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8566 break;
8567 Queue.pop();
8568 OrderedOps.push_back(TE);
8569 }
8570 for (TreeEntry *TE : OrderedOps) {
8571 if (!(TE->State == TreeEntry::Vectorize ||
8572 TE->State == TreeEntry::StridedVectorize ||
8573 TE->State == TreeEntry::CompressVectorize ||
8574 TE->State == TreeEntry::SplitVectorize ||
8575 (TE->isGather() && GathersToOrders.contains(TE))) ||
8576 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8577 !Visited.insert(TE).second)
8578 continue;
8579 // Build a map between user nodes and their operands order to speedup
8580 // search. The graph currently does not provide this dependency directly.
8581 Users.first = TE->UserTreeIndex.UserTE;
8582 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8583 }
8584 if (Users.first) {
8585 auto &Data = Users;
8586 if (Data.first->State == TreeEntry::SplitVectorize) {
8587 assert(
8588 Data.second.size() <= 2 &&
8589 "Expected not greater than 2 operands for split vectorize node.");
8590 if (any_of(Data.second,
8591 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8592 continue;
8593 // Update orders in user split vectorize nodes.
8594 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8595 "Expected exactly 2 entries.");
8596 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8597 TreeEntry &OpTE = *VectorizableTree[P.first];
8598 OrdersType Order = OpTE.ReorderIndices;
8599 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8600 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8601 continue;
8602 const auto BestOrder =
8603 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8604 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8605 continue;
8606 Order = *BestOrder;
8607 }
8608 fixupOrderingIndices(Order);
8609 SmallVector<int> Mask;
8610 inversePermutation(Order, Mask);
8611 const unsigned E = Order.size();
8612 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8613 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8614 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8615 });
8616 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8617 // Clear ordering of the operand.
8618 if (!OpTE.ReorderIndices.empty()) {
8619 OpTE.ReorderIndices.clear();
8620 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8621 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8622 } else {
8623 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8624 reorderScalars(OpTE.Scalars, Mask);
8625 }
8626 }
8627 if (Data.first->ReuseShuffleIndices.empty() &&
8628 !Data.first->ReorderIndices.empty()) {
8629 // Insert user node to the list to try to sink reordering deeper in
8630 // the graph.
8631 Queue.push(Data.first);
8632 }
8633 continue;
8634 }
8635 // Check that operands are used only in the User node.
8636 SmallVector<TreeEntry *> GatherOps;
8637 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8638 GatherOps);
8639 // All operands are reordered and used only in this node - propagate the
8640 // most used order to the user node.
8641     MapVector<OrdersType, unsigned,
8642               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8643         OrdersUses;
8644 // Do the analysis for each tree entry only once, otherwise the order of
8645     // the same node may be considered several times, though it might not be
8646     // profitable.
8647     SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8648     SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8649     for (const auto &Op : Data.second) {
8650 TreeEntry *OpTE = Op.second;
8651 if (!VisitedOps.insert(OpTE).second)
8652 continue;
8653 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8654 continue;
8655 const auto Order = [&]() -> const OrdersType {
8656 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8657 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8658 IgnoreReorder)
8659 .value_or(OrdersType(1));
8660 return OpTE->ReorderIndices;
8661 }();
8662 // The order is partially ordered, skip it in favor of fully non-ordered
8663 // orders.
8664 if (Order.size() == 1)
8665 continue;
8666
8667           // Check that the reordering does not increase the number of shuffles,
8668           // i.e. same-values nodes (or their parents) share the same parents.
8669 if (!Order.empty() && !isIdentityOrder(Order)) {
8670 Value *Root = OpTE->hasState()
8671 ? OpTE->getMainOp()
8672 : *find_if_not(OpTE->Scalars, isConstant);
8673 auto GetSameNodesUsers = [&](Value *Root) {
8674           SmallSetVector<TreeEntry *, 4> Res;
8675           for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8676 if (TE != OpTE && TE->UserTreeIndex &&
8677 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8678 TE->Scalars.size() == OpTE->Scalars.size() &&
8679 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8680 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8681 Res.insert(TE->UserTreeIndex.UserTE);
8682 }
8683 for (const TreeEntry *TE : getTreeEntries(Root)) {
8684 if (TE != OpTE && TE->UserTreeIndex &&
8685 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8686 TE->Scalars.size() == OpTE->Scalars.size() &&
8687 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8688 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8689 Res.insert(TE->UserTreeIndex.UserTE);
8690 }
8691 return Res.takeVector();
8692 };
8693 auto GetNumOperands = [](const TreeEntry *TE) {
8694 if (TE->State == TreeEntry::SplitVectorize)
8695 return TE->getNumOperands();
8696 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8697 return CI->arg_size();
8698 return TE->getNumOperands();
8699 };
8700 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8701 const TreeEntry *TE) {
8702             Intrinsic::ID ID = Intrinsic::not_intrinsic;
8703             if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8704               ID = getVectorIntrinsicIDForCall(CI, TLI);
8705             for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8706               if (ID != Intrinsic::not_intrinsic &&
8707                   isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8708                 continue;
8709 const TreeEntry *Op = getOperandEntry(TE, Idx);
8710 if (Op->isGather() && Op->hasState()) {
8711 const TreeEntry *VecOp =
8712 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8713 if (VecOp)
8714 Op = VecOp;
8715 }
8716 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8717 return false;
8718 }
8719 return true;
8720 };
8721 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8722 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8723 if (!RevisitedOps.insert(UTE).second)
8724 return false;
8725 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8726 !UTE->ReuseShuffleIndices.empty() ||
8727 (UTE->UserTreeIndex &&
8728 UTE->UserTreeIndex.UserTE == Data.first) ||
8729 (Data.first->UserTreeIndex &&
8730 Data.first->UserTreeIndex.UserTE == UTE) ||
8731 (IgnoreReorder && UTE->UserTreeIndex &&
8732 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8733 NodeShouldBeReorderedWithOperands(UTE);
8734 }))
8735 continue;
8736 for (TreeEntry *UTE : Users) {
8737           Intrinsic::ID ID = Intrinsic::not_intrinsic;
8738           if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8739             ID = getVectorIntrinsicIDForCall(CI, TLI);
8740           for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8741             if (ID != Intrinsic::not_intrinsic &&
8742                 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8743               continue;
8744 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8745 Visited.erase(Op);
8746 Queue.push(const_cast<TreeEntry *>(Op));
8747 }
8748 }
8749 }
8750 unsigned NumOps = count_if(
8751 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8752 return P.second == OpTE;
8753 });
8754 // Stores actually store the mask, not the order, need to invert.
8755 if (OpTE->State == TreeEntry::Vectorize &&
8756 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8757 assert(!OpTE->isAltShuffle() &&
8758 "Alternate instructions are only supported by BinaryOperator "
8759 "and CastInst.");
8760 SmallVector<int> Mask;
8761 inversePermutation(Order, Mask);
8762 unsigned E = Order.size();
8763 OrdersType CurrentOrder(E, E);
8764 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8765 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8766 });
8767 fixupOrderingIndices(CurrentOrder);
8768 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8769 } else {
8770 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8771 }
8772 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8773 const auto AllowsReordering = [&](const TreeEntry *TE) {
8774 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8775 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8776 (IgnoreReorder && TE->Idx == 0))
8777 return true;
8778 if (TE->isGather()) {
8779 if (GathersToOrders.contains(TE))
8780 return !getReorderingData(*TE, /*TopToBottom=*/false,
8781 IgnoreReorder)
8782 .value_or(OrdersType(1))
8783 .empty();
8784 return true;
8785 }
8786 return false;
8787 };
8788 if (OpTE->UserTreeIndex) {
8789 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8790 if (!VisitedUsers.insert(UserTE).second)
8791 continue;
8792 // May reorder user node if it requires reordering, has reused
8793 // scalars, is an alternate op vectorize node or its op nodes require
8794 // reordering.
8795 if (AllowsReordering(UserTE))
8796 continue;
8797 // Check if users allow reordering.
8798 // Currently look up just 1 level of operands to avoid increase of
8799 // the compile time.
8800 // Profitable to reorder if definitely more operands allow
8801 // reordering rather than those with natural order.
8802           ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
8803           if (static_cast<unsigned>(count_if(
8804 Ops, [UserTE, &AllowsReordering](
8805 const std::pair<unsigned, TreeEntry *> &Op) {
8806 return AllowsReordering(Op.second) &&
8807 Op.second->UserTreeIndex.UserTE == UserTE;
8808 })) <= Ops.size() / 2)
8809 ++Res.first->second;
8810 }
8811 }
8812 if (OrdersUses.empty()) {
8813 Visited.insert_range(llvm::make_second_range(Data.second));
8814 continue;
8815 }
8816 // Choose the most used order.
8817 unsigned IdentityCnt = 0;
8818 unsigned VF = Data.second.front().second->getVectorFactor();
8819 OrdersType IdentityOrder(VF, VF);
8820 for (auto &Pair : OrdersUses) {
8821 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8822 IdentityCnt += Pair.second;
8823 combineOrders(IdentityOrder, Pair.first);
8824 }
8825 }
8826 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8827 unsigned Cnt = IdentityCnt;
8828 for (auto &Pair : OrdersUses) {
8829 // Prefer identity order. But, if filled identity found (non-empty
8830 // order) with same number of uses, as the new candidate order, we can
8831 // choose this candidate order.
8832 if (Cnt < Pair.second) {
8833 combineOrders(Pair.first, BestOrder);
8834 BestOrder = Pair.first;
8835 Cnt = Pair.second;
8836 } else {
8837 combineOrders(BestOrder, Pair.first);
8838 }
8839 }
8840 // Set order of the user node.
8841 if (isIdentityOrder(BestOrder)) {
8842 Visited.insert_range(llvm::make_second_range(Data.second));
8843 continue;
8844 }
8845 fixupOrderingIndices(BestOrder);
8846 // Erase operands from OrderedEntries list and adjust their orders.
8847 VisitedOps.clear();
8848 SmallVector<int> Mask;
8849 inversePermutation(BestOrder, Mask);
8850 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8851 unsigned E = BestOrder.size();
8852 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8853 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8854 });
8855 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8856 TreeEntry *TE = Op.second;
8857 if (!VisitedOps.insert(TE).second)
8858 continue;
8859 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8860 reorderNodeWithReuses(*TE, Mask);
8861 continue;
8862 }
8863 // Gathers are processed separately.
8864 if (TE->State != TreeEntry::Vectorize &&
8865 TE->State != TreeEntry::StridedVectorize &&
8866 TE->State != TreeEntry::CompressVectorize &&
8867 TE->State != TreeEntry::SplitVectorize &&
8868 (TE->State != TreeEntry::ScatterVectorize ||
8869 TE->ReorderIndices.empty()))
8870 continue;
8871 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8872 TE->ReorderIndices.empty()) &&
8873 "Non-matching sizes of user/operand entries.");
8874 reorderOrder(TE->ReorderIndices, Mask);
8875 if (IgnoreReorder && TE == VectorizableTree.front().get())
8876 IgnoreReorder = false;
8877 }
8878 // For gathers just need to reorder its scalars.
8879 for (TreeEntry *Gather : GatherOps) {
8880 assert(Gather->ReorderIndices.empty() &&
8881 "Unexpected reordering of gathers.");
8882 if (!Gather->ReuseShuffleIndices.empty()) {
8883 // Just reorder reuses indices.
8884 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8885 continue;
8886 }
8887 reorderScalars(Gather->Scalars, Mask);
8888 Visited.insert(Gather);
8889 }
8890 // Reorder operands of the user node and set the ordering for the user
8891 // node itself.
8892 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8893 return TE.isAltShuffle() &&
8894 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8895 TE.ReorderIndices.empty());
8896 };
8897 if (Data.first->State != TreeEntry::Vectorize ||
8898         !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8899             Data.first->getMainOp()) ||
8900 IsNotProfitableAltCodeNode(*Data.first))
8901 Data.first->reorderOperands(Mask);
8902 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8903 IsNotProfitableAltCodeNode(*Data.first) ||
8904 Data.first->State == TreeEntry::StridedVectorize ||
8905 Data.first->State == TreeEntry::CompressVectorize) {
8906 reorderScalars(Data.first->Scalars, Mask);
8907 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8908 /*BottomOrder=*/true);
8909 if (Data.first->ReuseShuffleIndices.empty() &&
8910 !Data.first->ReorderIndices.empty() &&
8911 !IsNotProfitableAltCodeNode(*Data.first)) {
8912 // Insert user node to the list to try to sink reordering deeper in
8913 // the graph.
8914 Queue.push(Data.first);
8915 }
8916 } else {
8917 reorderOrder(Data.first->ReorderIndices, Mask);
8918 }
8919 }
8920 }
8921 // If the reordering is unnecessary, just remove the reorder.
8922 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8923 VectorizableTree.front()->ReuseShuffleIndices.empty())
8924 VectorizableTree.front()->ReorderIndices.clear();
8925}
8926
8927Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8928 if (Entry.hasState() &&
8929 (Entry.getOpcode() == Instruction::Store ||
8930 Entry.getOpcode() == Instruction::Load) &&
8931 Entry.State == TreeEntry::StridedVectorize &&
8932 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8933 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8934 return dyn_cast<Instruction>(Entry.Scalars.front());
8935}
8936
8937 void BoUpSLP::buildExternalUses(
8938     const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8939 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8940 DenseMap<Value *, unsigned> ScalarToExtUses;
8941 // Collect the values that we need to extract from the tree.
8942 for (auto &TEPtr : VectorizableTree) {
8943 TreeEntry *Entry = TEPtr.get();
8944
8945 // No need to handle users of gathered values.
8946 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8947 continue;
8948
8949 // For each lane:
8950 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8951 Value *Scalar = Entry->Scalars[Lane];
8952 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8953 continue;
8954
8955 // All uses must be replaced already? No need to do it again.
8956 auto It = ScalarToExtUses.find(Scalar);
8957 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8958 continue;
8959
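      // If the scalar has more uses than there are vectorized scalars in the
      // whole tree, conservatively record an external use (with an unknown
      // user) instead of walking the full use list.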
8960 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8961 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8962 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8963 << " from " << *Scalar << " for many users.\n");
8964 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8965 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8966 ExternalUsesWithNonUsers.insert(Scalar);
8967 continue;
8968 }
8969
8970 // Check if the scalar is externally used as an extra arg.
8971 const auto ExtI = ExternallyUsedValues.find(Scalar);
8972 if (ExtI != ExternallyUsedValues.end()) {
8973 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8974 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8975 << FoundLane << " from " << *Scalar << ".\n");
8976 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8977 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8978 continue;
8979 }
8980 for (User *U : Scalar->users()) {
8981 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8982
8983 Instruction *UserInst = dyn_cast<Instruction>(U);
8984 if (!UserInst || isDeleted(UserInst))
8985 continue;
8986
8987 // Ignore users in the user ignore list.
8988 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8989 continue;
8990
8991 // Skip in-tree scalars that become vectors
8992 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8993 !UseEntries.empty()) {
8994 // Some in-tree scalars will remain as scalar in vectorized
8995 // instructions. If that is the case, the one in FoundLane will
8996 // be used.
8997 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8998 isa<LoadInst, StoreInst>(UserInst)) ||
8999 isa<CallInst>(UserInst)) ||
9000 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9001 return UseEntry->State == TreeEntry::ScatterVectorize ||
9002 !doesInTreeUserNeedToExtract(
9003 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9004 TTI);
9005 })) {
9006 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9007 << ".\n");
9008 assert(none_of(UseEntries,
9009 [](TreeEntry *UseEntry) {
9010 return UseEntry->isGather();
9011 }) &&
9012 "Bad state");
9013 continue;
9014 }
9015 U = nullptr;
9016 if (It != ScalarToExtUses.end()) {
9017 ExternalUses[It->second].User = nullptr;
9018 break;
9019 }
9020 }
9021
9022 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9023 U = nullptr;
9024 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9025 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9026 << " from lane " << FoundLane << " from " << *Scalar
9027 << ".\n");
9028 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9029 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9030 ExternalUsesWithNonUsers.insert(Scalar);
9031 if (!U)
9032 break;
9033 }
9034 }
9035 }
9036}
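// Editorial note (hypothetical example, for exposition only): if the tree
// vectorizes
//   %a = load i32, ptr %p
// but %a also feeds a scalar user that stays outside the tree (say a return
// or a non-vectorized add), an ExternalUse record {Scalar=%a, User, Lane} is
// collected above so that codegen can later extract the lane from the vector
// to satisfy that scalar user.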
9037
9038SmallVector<SmallVector<StoreInst *>>
9039BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9040 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
9041 SmallVector<StoreInst *>, 8>
9042 PtrToStoresMap;
9043 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9044 Value *V = TE->Scalars[Lane];
9045 // Don't iterate over the users of constant data.
9046 if (!isa<Instruction>(V))
9047 continue;
9048 // To save compilation time we don't visit if we have too many users.
9049 if (V->hasNUsesOrMore(UsesLimit))
9050 break;
9051
9052 // Collect stores per pointer object.
9053 for (User *U : V->users()) {
9054 auto *SI = dyn_cast<StoreInst>(U);
9055 // Test whether we can handle the store. V might be a global, which could
9056 // be used in a different function.
9057 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9058 !isValidElementType(SI->getValueOperand()->getType()))
9059 continue;
9060 // Skip entry if it is already vectorized.
9061 if (isVectorized(U))
9062 continue;
9063
9064 Value *Ptr =
9065 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9066 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9067 SI->getValueOperand()->getType(), Ptr}];
9068 // For now just keep one store per pointer object per lane.
9069 // TODO: Extend this to support multiple stores per pointer per lane
9070 if (StoresVec.size() > Lane)
9071 continue;
9072 if (!StoresVec.empty()) {
9073 std::optional<int64_t> Diff = getPointersDiff(
9074 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9075 SI->getValueOperand()->getType(),
9076 StoresVec.front()->getPointerOperand(), *DL, *SE,
9077 /*StrictCheck=*/true);
9078 // We failed to compare the pointers so just abandon this store.
9079 if (!Diff)
9080 continue;
9081 }
9082 StoresVec.push_back(SI);
9083 }
9084 }
9085 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9086 unsigned I = 0;
9087 for (auto &P : PtrToStoresMap) {
9088 Res[I].swap(P.second);
9089 ++I;
9090 }
9091 return Res;
9092}
9093
9094bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9095 OrdersType &ReorderIndices) const {
9096 // We check whether the stores in StoreVec can form a vector by sorting them
9097 // and checking whether they are consecutive.
9098
9099 // To avoid calling getPointersDiff() while sorting, we create a vector of
9100 // {offset from the first store, index in StoresVec} pairs and sort those instead.
9101 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9102 StoreInst *S0 = StoresVec[0];
9103 StoreOffsetVec.emplace_back(0, 0);
9104 Type *S0Ty = S0->getValueOperand()->getType();
9105 Value *S0Ptr = S0->getPointerOperand();
9106 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9107 StoreInst *SI = StoresVec[Idx];
9108 std::optional<int64_t> Diff =
9109 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9110 SI->getPointerOperand(), *DL, *SE,
9111 /*StrictCheck=*/true);
9112 StoreOffsetVec.emplace_back(*Diff, Idx);
9113 }
9114
9115 // Check if the stores are consecutive by checking if their difference is 1.
9116 if (StoreOffsetVec.size() != StoresVec.size())
9117 return false;
9118 sort(StoreOffsetVec, llvm::less_first());
9119 unsigned Idx = 0;
9120 int64_t PrevDist = 0;
9121 for (const auto &P : StoreOffsetVec) {
9122 if (Idx > 0 && P.first != PrevDist + 1)
9123 return false;
9124 PrevDist = P.first;
9125 ++Idx;
9126 }
9127
9128 // Calculate the shuffle indices according to their offset against the sorted
9129 // StoreOffsetVec.
9130 ReorderIndices.assign(StoresVec.size(), 0);
9131 bool IsIdentity = true;
9132 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9133 ReorderIndices[P.second] = I;
9134 IsIdentity &= P.second == I;
9135 }
9136 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9137 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9138 // same convention here.
9139 if (IsIdentity)
9140 ReorderIndices.clear();
9141
9142 return true;
9143}
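// Editorial worked example (illustrative): for stores whose offsets from the
// first store are {0, 2, 1, 3}, StoreOffsetVec holds {0,0},{2,1},{1,2},{3,3};
// after sorting by offset the distances are consecutive, so the bundle is
// accepted and ReorderIndices becomes {0, 2, 1, 3} (the store at index 1 is
// placed at position 2 and vice versa). An already-sorted bundle yields the
// empty identity order, matching the convention noted above.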
9144
9145#ifndef NDEBUG
9146LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9147 for (unsigned Idx : Order)
9148 dbgs() << Idx << ", ";
9149 dbgs() << "\n";
9150}
9151#endif
9152
9153SmallVector<BoUpSLP::OrdersType, 1>
9154BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9155 unsigned NumLanes = TE->Scalars.size();
9156
9157 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9158
9159 // Holds the reorder indices for each candidate store vector that is a user of
9160 // the current TreeEntry.
9161 SmallVector<OrdersType, 1> ExternalReorderIndices;
9162
9163 // Now inspect the stores collected per pointer and look for vectorization
9164 // candidates. For each candidate calculate the reorder index vector and push
9165 // it into `ExternalReorderIndices`
9166 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9167 // If we have fewer than NumLanes stores, then we can't form a vector.
9168 if (StoresVec.size() != NumLanes)
9169 continue;
9170
9171 // If the stores are not consecutive then abandon this StoresVec.
9172 OrdersType ReorderIndices;
9173 if (!canFormVector(StoresVec, ReorderIndices))
9174 continue;
9175
9176 // We now know that the scalars in StoresVec can form a vector instruction,
9177 // so set the reorder indices.
9178 ExternalReorderIndices.push_back(ReorderIndices);
9179 }
9180 return ExternalReorderIndices;
9181}
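// Editorial note (illustrative): if every scalar of a 4-lane TreeEntry feeds
// exactly one store and those stores hit 4 consecutive addresses (possibly
// permuted), the computed OrdersType is returned here as an extra reordering
// candidate for the entry; groups with fewer than NumLanes stores or with
// non-consecutive addresses are simply skipped.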
9182
9183void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9184 const SmallDenseSet<Value *> &UserIgnoreLst) {
9185 deleteTree();
9186 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9187 "TreeEntryToStridedPtrInfoMap is not cleared");
9188 UserIgnoreList = &UserIgnoreLst;
9189 if (!allSameType(Roots))
9190 return;
9191 buildTreeRec(Roots, 0, EdgeInfo());
9192}
9193
9194void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9195 deleteTree();
9196 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9197 "TreeEntryToStridedPtrInfoMap is not cleared");
9198 if (!allSameType(Roots))
9199 return;
9200 buildTreeRec(Roots, 0, EdgeInfo());
9201}
9202
9203/// Tries to find a subvector of loads and builds a new vector of only loads if
9204/// it can be profitable.
9205static void gatherPossiblyVectorizableLoads(
9206 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9207 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9208 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9209 bool AddNew = true) {
9210 if (VL.empty())
9211 return;
9212 Type *ScalarTy = getValueType(VL.front());
9213 if (!isValidElementType(ScalarTy))
9214 return;
9215 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9216 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9217 for (Value *V : VL) {
9218 auto *LI = dyn_cast<LoadInst>(V);
9219 if (!LI)
9220 continue;
9221 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9222 continue;
9223 bool IsFound = false;
9224 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9225 assert(LI->getParent() == Data.front().first->getParent() &&
9226 LI->getType() == Data.front().first->getType() &&
9227 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9228 getUnderlyingObject(Data.front().first->getPointerOperand(),
9229 RecursionMaxDepth) &&
9230 "Expected loads with the same type, same parent and same "
9231 "underlying pointer.");
9232 std::optional<int64_t> Dist = getPointersDiff(
9233 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9234 Data.front().first->getPointerOperand(), DL, SE,
9235 /*StrictCheck=*/true);
9236 if (!Dist)
9237 continue;
9238 auto It = Map.find(*Dist);
9239 if (It != Map.end() && It->second != LI)
9240 continue;
9241 if (It == Map.end()) {
9242 Data.emplace_back(LI, *Dist);
9243 Map.try_emplace(*Dist, LI);
9244 }
9245 IsFound = true;
9246 break;
9247 }
9248 if (!IsFound) {
9249 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9250 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9251 }
9252 }
9253 auto FindMatchingLoads =
9254 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9255 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9256 &GatheredLoads,
9257 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9258 int64_t &Offset, unsigned &Start) {
9259 if (Loads.empty())
9260 return GatheredLoads.end();
9261 LoadInst *LI = Loads.front().first;
9262 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9263 if (Idx < Start)
9264 continue;
9265 ToAdd.clear();
9266 if (LI->getParent() != Data.front().first->getParent() ||
9267 LI->getType() != Data.front().first->getType())
9268 continue;
9269 std::optional<int64_t> Dist =
9270 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9271 Data.front().first->getType(),
9272 Data.front().first->getPointerOperand(), DL, SE,
9273 /*StrictCheck=*/true);
9274 if (!Dist)
9275 continue;
9276 SmallSet<int64_t, 4> DataDists;
9277 SmallPtrSet<LoadInst *, 4> DataLoads;
9278 for (std::pair<LoadInst *, int64_t> P : Data) {
9279 DataDists.insert(P.second);
9280 DataLoads.insert(P.first);
9281 }
9282 // Found matching gathered loads - check if all loads are unique or
9283 // can be effectively vectorized.
9284 unsigned NumUniques = 0;
9285 for (auto [Cnt, Pair] : enumerate(Loads)) {
9286 bool Used = DataLoads.contains(Pair.first);
9287 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9288 ++NumUniques;
9289 ToAdd.insert(Cnt);
9290 } else if (Used) {
9291 Repeated.insert(Cnt);
9292 }
9293 }
9294 if (NumUniques > 0 &&
9295 (Loads.size() == NumUniques ||
9296 (Loads.size() - NumUniques >= 2 &&
9297 Loads.size() - NumUniques >= Loads.size() / 2 &&
9298 (has_single_bit(Data.size() + NumUniques) ||
9299 bit_ceil(Data.size()) <
9300 bit_ceil(Data.size() + NumUniques))))) {
9301 Offset = *Dist;
9302 Start = Idx + 1;
9303 return std::next(GatheredLoads.begin(), Idx);
9304 }
9305 }
9306 ToAdd.clear();
9307 return GatheredLoads.end();
9308 };
9309 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9310 unsigned Start = 0;
9311 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9312 int64_t Offset = 0;
9313 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9314 Offset, Start);
9315 while (It != GatheredLoads.end()) {
9316 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9317 for (unsigned Idx : LocalToAdd)
9318 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9319 ToAdd.insert_range(LocalToAdd);
9320 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9321 Start);
9322 }
9323 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9324 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9325 })) {
9326 auto AddNewLoads =
9327 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9328 for (unsigned Idx : seq<unsigned>(Data.size())) {
9329 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9330 continue;
9331 Loads.push_back(Data[Idx]);
9332 }
9333 };
9334 if (!AddNew) {
9335 LoadInst *LI = Data.front().first;
9336 It = find_if(
9337 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9338 return PD.front().first->getParent() == LI->getParent() &&
9339 PD.front().first->getType() == LI->getType();
9340 });
9341 while (It != GatheredLoads.end()) {
9342 AddNewLoads(*It);
9343 It = std::find_if(
9344 std::next(It), GatheredLoads.end(),
9345 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9346 return PD.front().first->getParent() == LI->getParent() &&
9347 PD.front().first->getType() == LI->getType();
9348 });
9349 }
9350 }
9351 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9352 AddNewLoads(GatheredLoads.emplace_back());
9353 }
9354 }
9355}
9356
9357void BoUpSLP::tryToVectorizeGatheredLoads(
9358 const SmallMapVector<
9359 std::tuple<BasicBlock *, Value *, Type *>,
9360 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9361 &GatheredLoads) {
9362 GatheredLoadsEntriesFirst = VectorizableTree.size();
9363
9364 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9365 LoadEntriesToVectorize.size());
9366 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9367 Set.insert_range(VectorizableTree[Idx]->Scalars);
9368
9369 // Sort loads by distance.
9370 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9371 const std::pair<LoadInst *, int64_t> &L2) {
9372 return L1.second > L2.second;
9373 };
9374
9375 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9376 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9377 Loads.size());
9378 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9379 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9380 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9381 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9382 };
9383
9384 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9385 BoUpSLP::ValueSet &VectorizedLoads,
9386 SmallVectorImpl<LoadInst *> &NonVectorized,
9387 bool Final, unsigned MaxVF) {
9388 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9389 unsigned StartIdx = 0;
9390 SmallVector<int> CandidateVFs;
9391 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9392 CandidateVFs.push_back(MaxVF);
9393 for (int NumElts = getFloorFullVectorNumberOfElements(
9394 *TTI, Loads.front()->getType(), MaxVF);
9395 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9396 *TTI, Loads.front()->getType(), NumElts - 1)) {
9397 CandidateVFs.push_back(NumElts);
9398 if (VectorizeNonPowerOf2 && NumElts > 2)
9399 CandidateVFs.push_back(NumElts - 1);
9400 }
9401
9402 if (Final && CandidateVFs.empty())
9403 return Results;
9404
9405 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9406 for (unsigned NumElts : CandidateVFs) {
9407 if (Final && NumElts > BestVF)
9408 continue;
9409 SmallVector<unsigned> MaskedGatherVectorized;
9410 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9411 ++Cnt) {
9412 ArrayRef<LoadInst *> Slice =
9413 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9414 if (VectorizedLoads.count(Slice.front()) ||
9415 VectorizedLoads.count(Slice.back()) ||
9416 areKnownNonVectorizableLoads(Slice))
9417 continue;
9418 // Check if it is profitable to try vectorizing gathered loads. It is
9419 // profitable if we have more than 3 consecutive loads or if we have
9420 // fewer but all users are vectorized or deleted.
9421 bool AllowToVectorize = false;
9422 // Check if it is profitable to vectorize 2-elements loads.
9423 if (NumElts == 2) {
9424 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9425 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9426 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9427 for (LoadInst *LI : Slice) {
9428 // If single use/user - allow to vectorize.
9429 if (LI->hasOneUse())
9430 continue;
9431 // 1. Check if number of uses equals number of users.
9432 // 2. All users are deleted.
9433 // 3. The load broadcasts are not allowed or the load is not
9434 // broadcasted.
9435 if (static_cast<unsigned int>(std::distance(
9436 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9437 return false;
9438 if (!IsLegalBroadcastLoad)
9439 continue;
9440 if (LI->hasNUsesOrMore(UsesLimit))
9441 return false;
9442 for (User *U : LI->users()) {
9443 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9444 continue;
9445 for (const TreeEntry *UTE : getTreeEntries(U)) {
9446 for (int I : seq<int>(UTE->getNumOperands())) {
9447 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9448 return V == LI || isa<PoisonValue>(V);
9449 }))
9450 // Found legal broadcast - do not vectorize.
9451 return false;
9452 }
9453 }
9454 }
9455 }
9456 return true;
9457 };
9458 AllowToVectorize = CheckIfAllowed(Slice);
9459 } else {
9460 AllowToVectorize =
9461 (NumElts >= 3 ||
9462 any_of(ValueToGatherNodes.at(Slice.front()),
9463 [=](const TreeEntry *TE) {
9464 return TE->Scalars.size() == 2 &&
9465 ((TE->Scalars.front() == Slice.front() &&
9466 TE->Scalars.back() == Slice.back()) ||
9467 (TE->Scalars.front() == Slice.back() &&
9468 TE->Scalars.back() == Slice.front()));
9469 })) &&
9470 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9471 Slice.size());
9472 }
9473 if (AllowToVectorize) {
9474 SmallVector<Value *> PointerOps;
9475 OrdersType CurrentOrder;
9476 // Try to build vector load.
9477 ArrayRef<Value *> Values(
9478 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9479 StridedPtrInfo SPtrInfo;
9480 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9481 PointerOps, SPtrInfo, &BestVF);
9482 if (LS != LoadsState::Gather ||
9483 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9484 if (LS == LoadsState::ScatterVectorize) {
9485 if (MaskedGatherVectorized.empty() ||
9486 Cnt >= MaskedGatherVectorized.back() + NumElts)
9487 MaskedGatherVectorized.push_back(Cnt);
9488 continue;
9489 }
9490 if (LS != LoadsState::Gather) {
9491 Results.emplace_back(Values, LS);
9492 VectorizedLoads.insert_range(Slice);
9493 // If we vectorized initial block, no need to try to vectorize it
9494 // again.
9495 if (Cnt == StartIdx)
9496 StartIdx += NumElts;
9497 }
9498 // Check if the whole array was vectorized already - exit.
9499 if (StartIdx >= Loads.size())
9500 break;
9501 // Erase last masked gather candidate, if another candidate within
9502 // the range is found to be better.
9503 if (!MaskedGatherVectorized.empty() &&
9504 Cnt < MaskedGatherVectorized.back() + NumElts)
9505 MaskedGatherVectorized.pop_back();
9506 Cnt += NumElts - 1;
9507 continue;
9508 }
9509 }
9510 if (!AllowToVectorize || BestVF == 0)
9511 registerNonVectorizableLoads(Slice);
9512 }
9513 // Mark masked gather candidates as vectorized, if any.
9514 for (unsigned Cnt : MaskedGatherVectorized) {
9515 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9516 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9517 ArrayRef<Value *> Values(
9518 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9519 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9520 VectorizedLoads.insert_range(Slice);
9521 // If we vectorized initial block, no need to try to vectorize it again.
9522 if (Cnt == StartIdx)
9523 StartIdx += NumElts;
9524 }
9525 }
9526 for (LoadInst *LI : Loads) {
9527 if (!VectorizedLoads.contains(LI))
9528 NonVectorized.push_back(LI);
9529 }
9530 return Results;
9531 };
9532 auto ProcessGatheredLoads =
9533 [&, &TTI = *TTI](
9534 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9535 bool Final = false) {
9536 SmallVector<LoadInst *> NonVectorized;
9537 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9538 GatheredLoads) {
9539 if (LoadsDists.size() <= 1) {
9540 NonVectorized.push_back(LoadsDists.back().first);
9541 continue;
9542 }
9543 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9544 LoadsDists);
9545 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9546 stable_sort(LocalLoadsDists, LoadSorter);
9547 SmallVector<LoadInst *> Loads;
9548 unsigned MaxConsecutiveDistance = 0;
9549 unsigned CurrentConsecutiveDist = 1;
9550 int64_t LastDist = LocalLoadsDists.front().second;
9551 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9552 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9553 if (isVectorized(L.first))
9554 continue;
9555 assert(LastDist >= L.second &&
9556 "Expected first distance always not less than second");
9557 if (static_cast<uint64_t>(LastDist - L.second) ==
9558 CurrentConsecutiveDist) {
9559 ++CurrentConsecutiveDist;
9560 MaxConsecutiveDistance =
9561 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9562 Loads.push_back(L.first);
9563 continue;
9564 }
9565 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9566 !Loads.empty())
9567 Loads.pop_back();
9568 CurrentConsecutiveDist = 1;
9569 LastDist = L.second;
9570 Loads.push_back(L.first);
9571 }
9572 if (Loads.size() <= 1)
9573 continue;
9574 if (AllowMaskedGather)
9575 MaxConsecutiveDistance = Loads.size();
9576 else if (MaxConsecutiveDistance < 2)
9577 continue;
9578 BoUpSLP::ValueSet VectorizedLoads;
9579 SmallVector<LoadInst *> SortedNonVectorized;
9580 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9581 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9582 Final, MaxConsecutiveDistance);
9583 if (!Results.empty() && !SortedNonVectorized.empty() &&
9584 OriginalLoads.size() == Loads.size() &&
9585 MaxConsecutiveDistance == Loads.size() &&
9586 all_of(Results,
9587 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9588 return P.second == LoadsState::ScatterVectorize;
9589 })) {
9590 VectorizedLoads.clear();
9591 SmallVector<LoadInst *> UnsortedNonVectorized;
9592 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9593 UnsortedResults =
9594 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9595 UnsortedNonVectorized, Final,
9596 OriginalLoads.size());
9597 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9598 SortedNonVectorized.swap(UnsortedNonVectorized);
9599 Results.swap(UnsortedResults);
9600 }
9601 }
9602 for (auto [Slice, _] : Results) {
9603 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9604 << Slice.size() << ")\n");
9605 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9606 for (Value *L : Slice)
9607 if (!isVectorized(L))
9608 SortedNonVectorized.push_back(cast<LoadInst>(L));
9609 continue;
9610 }
9611
9612 // Select maximum VF as a maximum of user gathered nodes and
9613 // distance between scalar loads in these nodes.
9614 unsigned MaxVF = Slice.size();
9615 unsigned UserMaxVF = 0;
9616 unsigned InterleaveFactor = 0;
9617 if (MaxVF == 2) {
9618 UserMaxVF = MaxVF;
9619 } else {
9620 // Found distance between segments of the interleaved loads.
9621 std::optional<unsigned> InterleavedLoadsDistance = 0;
9622 unsigned Order = 0;
9623 std::optional<unsigned> CommonVF = 0;
9624 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9625 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9626 for (auto [Idx, V] : enumerate(Slice)) {
9627 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9628 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9629 unsigned Pos =
9630 EntryToPosition.try_emplace(E, Idx).first->second;
9631 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9632 if (CommonVF) {
9633 if (*CommonVF == 0) {
9634 CommonVF = E->Scalars.size();
9635 continue;
9636 }
9637 if (*CommonVF != E->Scalars.size())
9638 CommonVF.reset();
9639 }
9640 // Check if the load is the part of the interleaved load.
9641 if (Pos != Idx && InterleavedLoadsDistance) {
9642 if (!DeinterleavedNodes.contains(E) &&
9643 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9644 if (isa<Constant>(V))
9645 return false;
9646 if (isVectorized(V))
9647 return true;
9648 const auto &Nodes = ValueToGatherNodes.at(V);
9649 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9650 !is_contained(Slice, V);
9651 })) {
9652 InterleavedLoadsDistance.reset();
9653 continue;
9654 }
9655 DeinterleavedNodes.insert(E);
9656 if (*InterleavedLoadsDistance == 0) {
9657 InterleavedLoadsDistance = Idx - Pos;
9658 continue;
9659 }
9660 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9661 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9662 InterleavedLoadsDistance.reset();
9663 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9664 }
9665 }
9666 }
9667 DeinterleavedNodes.clear();
9668 // Check if the large load represents an interleaved load operation.
9669 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9670 CommonVF.value_or(0) != 0) {
9671 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9672 unsigned VF = *CommonVF;
9673 OrdersType Order;
9674 SmallVector<Value *> PointerOps;
9675 StridedPtrInfo SPtrInfo;
9676 // Segmented load detected - vectorize at maximum vector factor.
9677 if (InterleaveFactor <= Slice.size() &&
9678 TTI.isLegalInterleavedAccessType(
9679 getWidenedType(Slice.front()->getType(), VF),
9680 InterleaveFactor,
9681 cast<LoadInst>(Slice.front())->getAlign(),
9682 cast<LoadInst>(Slice.front())
9683 ->getPointerAddressSpace()) &&
9684 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9685 SPtrInfo) == LoadsState::Vectorize) {
9686 UserMaxVF = InterleaveFactor * VF;
9687 } else {
9688 InterleaveFactor = 0;
9689 }
9690 }
9691 // Cannot represent the loads as consecutive vectorizable nodes -
9692 // just exit.
9693 unsigned ConsecutiveNodesSize = 0;
9694 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9695 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9696 [&, Slice = Slice](const auto &P) {
9697 const auto *It = find_if(Slice, [&](Value *V) {
9698 return std::get<1>(P).contains(V);
9699 });
9700 if (It == Slice.end())
9701 return false;
9702 const TreeEntry &TE =
9703 *VectorizableTree[std::get<0>(P)];
9704 ArrayRef<Value *> VL = TE.Scalars;
9705 OrdersType Order;
9706 SmallVector<Value *> PointerOps;
9707 StridedPtrInfo SPtrInfo;
9708 LoadsState State = canVectorizeLoads(
9709 VL, VL.front(), Order, PointerOps, SPtrInfo);
9710 if (State == LoadsState::ScatterVectorize ||
9711 State == LoadsState::CompressVectorize)
9712 return false;
9713 ConsecutiveNodesSize += VL.size();
9714 size_t Start = std::distance(Slice.begin(), It);
9715 size_t Sz = Slice.size() - Start;
9716 return Sz < VL.size() ||
9717 Slice.slice(Start, VL.size()) != VL;
9718 }))
9719 continue;
9720 // Try to build long masked gather loads.
9721 UserMaxVF = bit_ceil(UserMaxVF);
9722 if (InterleaveFactor == 0 &&
9723 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9724 [&, Slice = Slice](unsigned Idx) {
9725 OrdersType Order;
9726 SmallVector<Value *> PointerOps;
9727 StridedPtrInfo SPtrInfo;
9728 return canVectorizeLoads(
9729 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9730 Slice[Idx * UserMaxVF], Order, PointerOps,
9731 SPtrInfo) == LoadsState::ScatterVectorize;
9732 }))
9733 UserMaxVF = MaxVF;
9734 if (Slice.size() != ConsecutiveNodesSize)
9735 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9736 }
9737 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9738 bool IsVectorized = true;
9739 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9740 ArrayRef<Value *> SubSlice =
9741 Slice.slice(I, std::min(VF, E - I));
9742 if (isVectorized(SubSlice.front()))
9743 continue;
9744 // Check if the subslice is a to-be-vectorized entry that is not
9745 // equal to the entry itself.
9746 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9747 [&](const auto &P) {
9748 return !SubSlice.equals(
9749 VectorizableTree[std::get<0>(P)]
9750 ->Scalars) &&
9751 set_is_subset(SubSlice, std::get<1>(P));
9752 }))
9753 continue;
9754 unsigned Sz = VectorizableTree.size();
9755 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9756 if (Sz == VectorizableTree.size()) {
9757 IsVectorized = false;
9758 // Try non-interleaved vectorization with smaller vector
9759 // factor.
9760 if (InterleaveFactor > 0) {
9761 VF = 2 * (MaxVF / InterleaveFactor);
9762 InterleaveFactor = 0;
9763 }
9764 continue;
9765 }
9766 }
9767 if (IsVectorized)
9768 break;
9769 }
9770 }
9771 NonVectorized.append(SortedNonVectorized);
9772 }
9773 return NonVectorized;
9774 };
9775 for (const auto &GLs : GatheredLoads) {
9776 const auto &Ref = GLs.second;
9777 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9778 if (!Ref.empty() && !NonVectorized.empty() &&
9779 std::accumulate(
9780 Ref.begin(), Ref.end(), 0u,
9781 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9782 -> unsigned { return S + LoadsDists.size(); }) !=
9783 NonVectorized.size() &&
9784 IsMaskedGatherSupported(NonVectorized)) {
9785 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9786 FinalGatheredLoads;
9787 for (LoadInst *LI : NonVectorized) {
9788 // Reinsert non-vectorized loads into another list of loads with the same
9789 // base pointers.
9790 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9791 FinalGatheredLoads,
9792 /*AddNew=*/false);
9793 }
9794 // Final attempt to vectorize non-vectorized loads.
9795 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9796 }
9797 }
9798 // Try to vectorize postponed load entries, previously marked as gathered.
9799 for (unsigned Idx : LoadEntriesToVectorize) {
9800 const TreeEntry &E = *VectorizableTree[Idx];
9801 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9802 // Avoid reordering, if possible.
9803 if (!E.ReorderIndices.empty()) {
9804 // Build a mask out of the reorder indices and reorder scalars per this
9805 // mask.
9806 SmallVector<int> ReorderMask;
9807 inversePermutation(E.ReorderIndices, ReorderMask);
9808 reorderScalars(GatheredScalars, ReorderMask);
9809 }
9810 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9811 }
9812 // If no new entries were created, consider it as if no gathered-load entries
9813 // need to be handled.
9814 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9815 VectorizableTree.size())
9816 GatheredLoadsEntriesFirst.reset();
9817}
9818
9819/// Generates a key/subkey pair for the given value to provide effective sorting
9820/// of the values and better detection of vectorizable value sequences. The keys
9821/// can be used for better sorting of the values themselves, and the subkeys for
9822/// sorting within the value subgroups.
9823static std::pair<size_t, size_t> generateKeySubkey(
9824 Value *V, const TargetLibraryInfo *TLI,
9825 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9826 bool AllowAlternate) {
9827 hash_code Key = hash_value(V->getValueID() + 2);
9828 hash_code SubKey = hash_value(0);
9829 // Sort the loads by the distance between the pointers.
9830 if (auto *LI = dyn_cast<LoadInst>(V)) {
9831 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9832 if (LI->isSimple())
9833 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9834 else
9835 Key = SubKey = hash_value(LI);
9836 } else if (isVectorLikeInstWithConstOps(V)) {
9837 // Sort extracts by the vector operands.
9838 if (isa<ExtractElementInst, UndefValue>(V))
9839 Key = hash_value(Value::UndefValueVal + 1);
9840 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9841 if (!isUndefVector(EI->getVectorOperand()).all() &&
9842 !isa<UndefValue>(EI->getIndexOperand()))
9843 SubKey = hash_value(EI->getVectorOperand());
9844 }
9845 } else if (auto *I = dyn_cast<Instruction>(V)) {
9846 // Sort other instructions just by the opcodes except for CMPInst.
9847 // For CMP also sort by the predicate kind.
9848 if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
9849 isValidForAlternation(I->getOpcode())) {
9850 if (AllowAlternate)
9851 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9852 else
9853 Key = hash_combine(hash_value(I->getOpcode()), Key);
9854 SubKey = hash_combine(
9855 hash_value(I->getOpcode()), hash_value(I->getType()),
9856 hash_value(isa<BinaryOperator>(I)
9857 ? I->getType()
9858 : cast<CastInst>(I)->getOperand(0)->getType()));
9859 // For casts, look through the only operand to improve compile time.
9860 if (isa<CastInst>(I)) {
9861 std::pair<size_t, size_t> OpVals =
9862 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9863 /*AllowAlternate=*/true);
9864 Key = hash_combine(OpVals.first, Key);
9865 SubKey = hash_combine(OpVals.first, SubKey);
9866 }
9867 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9868 CmpInst::Predicate Pred = CI->getPredicate();
9869 if (CI->isCommutative())
9870 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9871 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9872 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9873 hash_value(SwapPred),
9874 hash_value(CI->getOperand(0)->getType()));
9875 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9876 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9877 if (isTriviallyVectorizable(ID)) {
9878 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9879 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9880 SubKey = hash_combine(hash_value(I->getOpcode()),
9881 hash_value(Call->getCalledFunction()));
9882 } else {
9883 Key = hash_combine(hash_value(Call), Key);
9884 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9885 }
9886 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9887 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9888 hash_value(Op.Tag), SubKey);
9889 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9890 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9891 SubKey = hash_value(Gep->getPointerOperand());
9892 else
9893 SubKey = hash_value(Gep);
9894 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9895 !isa<ConstantInt>(I->getOperand(1))) {
9896 // Do not try to vectorize instructions with potentially high cost.
9897 SubKey = hash_value(I);
9898 } else {
9899 SubKey = hash_value(I->getOpcode());
9900 }
9901 Key = hash_combine(hash_value(I->getParent()), Key);
9902 }
9903 return std::make_pair(Key, SubKey);
9904}
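// Editorial note (hypothetical values): two simple loads of the same type
// share the same key (load opcode + type), while the LoadsSubkeyGenerator
// callback picks the subkey, e.g. from the pointer distance, so loads that
// could form one consecutive bundle land in the same subgroup. A udiv by a
// non-constant divisor instead gets hash_value(I) as its subkey, keeping such
// potentially expensive instructions in their own group.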
9905
9906/// Checks if the specified instruction \p I is a main operation for the given
9907/// \p MainOp and \p AltOp instructions.
9908static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9909 Instruction *AltOp, const TargetLibraryInfo &TLI);
9910
9911bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9912 ArrayRef<Value *> VL) const {
9913 Type *ScalarTy = S.getMainOp()->getType();
9914 unsigned Opcode0 = S.getOpcode();
9915 unsigned Opcode1 = S.getAltOpcode();
9916 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9917 // If this pattern is supported by the target then consider it profitable.
9918 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9919 Opcode1, OpcodeMask))
9920 return true;
9921 SmallVector<ValueList> Operands;
9922 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9923 Operands.emplace_back();
9924 // Prepare the operand vector.
9925 for (Value *V : VL) {
9926 if (isa<PoisonValue>(V)) {
9927 Operands.back().push_back(
9928 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9929 continue;
9930 }
9931 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9932 }
9933 }
9934 if (Operands.size() == 2) {
9935 // Try to find the best operand candidates.
9936 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9937 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9938 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9939 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9940 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9941 std::optional<int> Res = findBestRootPair(Candidates);
9942 switch (Res.value_or(0)) {
9943 case 0:
9944 break;
9945 case 1:
9946 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9947 break;
9948 case 2:
9949 std::swap(Operands[0][I], Operands[1][I]);
9950 break;
9951 default:
9952 llvm_unreachable("Unexpected index.");
9953 }
9954 }
9955 }
9956 DenseSet<unsigned> UniqueOpcodes;
9957 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9958 unsigned NonInstCnt = 0;
9959 // Estimate the number of instructions required for the vectorized node and
9960 // for the buildvector node.
9961 unsigned UndefCnt = 0;
9962 // Count the number of extra shuffles required for vector nodes.
9963 unsigned ExtraShuffleInsts = 0;
9964 // Check that operands do not contain same values and create either perfect
9965 // diamond match or shuffled match.
9966 if (Operands.size() == 2) {
9967 // Do not count same operands twice.
9968 if (Operands.front() == Operands.back()) {
9969 Operands.erase(Operands.begin());
9970 } else if (!allConstant(Operands.front()) &&
9971 all_of(Operands.front(), [&](Value *V) {
9972 return is_contained(Operands.back(), V);
9973 })) {
9974 Operands.erase(Operands.begin());
9975 ++ExtraShuffleInsts;
9976 }
9977 }
9978 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9979 // Vectorize the node if:
9980 // 1. At least a single operand is constant or splat.
9981 // 2. Operands have many loop invariants (the instructions are not loop
9982 // invariants).
9983 // 3. At least a single unique operand is supposed to be vectorized.
9984 return none_of(Operands,
9985 [&](ArrayRef<Value *> Op) {
9986 if (allConstant(Op) ||
9987 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9988 getSameOpcode(Op, *TLI)))
9989 return false;
9990 DenseMap<Value *, unsigned> Uniques;
9991 for (Value *V : Op) {
9992 if (isConstant(V) || isa<UndefValue>(V) ||
9993 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9994 if (isa<UndefValue>(V))
9995 ++UndefCnt;
9996 continue;
9997 }
9998 auto Res = Uniques.try_emplace(V, 0);
9999 // Found first duplicate - need to add shuffle.
10000 if (!Res.second && Res.first->second == 1)
10001 ++ExtraShuffleInsts;
10002 ++Res.first->getSecond();
10003 if (auto *I = dyn_cast<Instruction>(V))
10004 UniqueOpcodes.insert(I->getOpcode());
10005 else if (Res.second)
10006 ++NonInstCnt;
10007 }
10008 return none_of(Uniques, [&](const auto &P) {
10009 return P.first->hasNUsesOrMore(P.second + 1) &&
10010 none_of(P.first->users(), [&](User *U) {
10011 return isVectorized(U) || Uniques.contains(U);
10012 });
10013 });
10014 }) ||
10015 // Do not vectorize node, if estimated number of vector instructions is
10016 // more than estimated number of buildvector instructions. Number of
10017 // vector operands is number of vector instructions + number of vector
10018 // instructions for operands (buildvectors). Number of buildvector
10019 // instructions is just number_of_operands * number_of_scalars.
10020 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10021 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10022 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10023}
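// Editorial note (illustrative): for an alternating bundle such as
//   %x0 = fadd float %a0, %b0      %x1 = fsub float %a1, %b1
//   %x2 = fadd float %a2, %b2      %x3 = fsub float %a3, %b3
// Opcode0/Opcode1 are FAdd/FSub and OpcodeMask marks the FSub lanes; if TTI
// reports this alternating pattern as legal (e.g. it maps to an addsub-like
// instruction), the node is accepted immediately, otherwise the operand-based
// heuristics above decide whether it is profitable.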
10024
10025/// Builds the argument types vector for the given call instruction with the
10026/// given \p ID for the specified vector factor.
10027static SmallVector<Type *>
10028buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10029 const unsigned VF, unsigned MinBW,
10030 const TargetTransformInfo *TTI) {
10031 SmallVector<Type *> ArgTys;
10032 for (auto [Idx, Arg] : enumerate(CI->args())) {
10033 if (ID != Intrinsic::not_intrinsic) {
10034 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
10035 ArgTys.push_back(Arg->getType());
10036 continue;
10037 }
10038 if (MinBW > 0) {
10039 ArgTys.push_back(
10040 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10041 continue;
10042 }
10043 }
10044 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10045 }
10046 return ArgTys;
10047}
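// Editorial note (hypothetical call): for @llvm.powi.f32.i32(float %x, i32 %n)
// widened to VF = 4, the float argument becomes <4 x float> while the i32
// exponent stays scalar because it is reported as a scalar operand of the
// intrinsic; with MinBW = 16 a demoted integer argument would be widened to
// <4 x i16> instead of its original type.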
10048
10049/// Calculates the costs of the vectorized intrinsic (if possible) and the
10050/// vectorized function (if possible) calls. Returns an invalid cost for the
10051/// corresponding calls if they cannot be vectorized or will be scalarized.
10052static std::pair<InstructionCost, InstructionCost>
10053getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10054 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10055 ArrayRef<Type *> ArgTys) {
10056 auto Shape = VFShape::get(CI->getFunctionType(),
10057 ElementCount::getFixed(VecTy->getNumElements()),
10058 false /*HasGlobalPred*/);
10059 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10060 auto LibCost = InstructionCost::getInvalid();
10061 if (!CI->isNoBuiltin() && VecFunc) {
10062 // Calculate the cost of the vector library call.
10063 // If the corresponding vector call is cheaper, return its cost.
10064 LibCost =
10065 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10066 }
10067 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10068
10069 // Calculate the cost of the vector intrinsic call.
10070 FastMathFlags FMF;
10071 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10072 FMF = FPCI->getFastMathFlags();
10073 const InstructionCost ScalarLimit = 10000;
10074 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10075 LibCost.isValid() ? LibCost : ScalarLimit);
10076 auto IntrinsicCost =
10077 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10078 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10079 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10080 IntrinsicCost = InstructionCost::getInvalid();
10081
10082 return {IntrinsicCost, LibCost};
10083}
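// Editorial note (illustrative): if a call has both a vector-library mapping
// (via VFDatabase) and a vectorizable intrinsic form, both costs are returned
// and the caller can pick the cheaper one; if the intrinsic cost exceeds the
// library cost (or the scalarization limit when no library call exists), the
// intrinsic cost is reported as invalid and the bundle may be gathered instead.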
10084
10085BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10086 const InstructionsState &S, ArrayRef<Value *> VL,
10087 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10088 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10089 assert(S.getMainOp() &&
10090 "Expected instructions with same/alternate opcodes only.");
10091
10092 unsigned ShuffleOrOp =
10093 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10094 Instruction *VL0 = S.getMainOp();
10095 switch (ShuffleOrOp) {
10096 case Instruction::PHI: {
10097 // Too many operands - gather, most probably won't be vectorized.
10098 if (VL0->getNumOperands() > MaxPHINumOperands)
10099 return TreeEntry::NeedToGather;
10100 // Check for terminator values (e.g. invoke).
10101 for (Value *V : VL) {
10102 auto *PHI = dyn_cast<PHINode>(V);
10103 if (!PHI)
10104 continue;
10105 for (Value *Incoming : PHI->incoming_values()) {
10106 Instruction *Term = dyn_cast<Instruction>(Incoming);
10107 if (Term && Term->isTerminator()) {
10108 LLVM_DEBUG(dbgs()
10109 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10110 return TreeEntry::NeedToGather;
10111 }
10112 }
10113 }
10114
10115 return TreeEntry::Vectorize;
10116 }
10117 case Instruction::ExtractElement:
10118 if (any_of(VL, [&](Value *V) {
10119 auto *EI = dyn_cast<ExtractElementInst>(V);
10120 if (!EI)
10121 return true;
10122 return isVectorized(EI->getOperand(0));
10123 }))
10124 return TreeEntry::NeedToGather;
10125 [[fallthrough]];
10126 case Instruction::ExtractValue: {
10127 bool Reuse = canReuseExtract(VL, CurrentOrder);
10128 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10129 // non-full registers).
10130 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10131 return TreeEntry::NeedToGather;
10132 if (Reuse || !CurrentOrder.empty())
10133 return TreeEntry::Vectorize;
10134 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10135 return TreeEntry::NeedToGather;
10136 }
10137 case Instruction::InsertElement: {
10138 // Check that we have a buildvector and not a shuffle of 2 or more
10139 // different vectors.
10140 ValueSet SourceVectors;
10141 for (Value *V : VL) {
10142 if (isa<PoisonValue>(V)) {
10143 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10144 return TreeEntry::NeedToGather;
10145 }
10146 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10147 assert(getElementIndex(V) != std::nullopt &&
10148 "Non-constant or undef index?");
10149 }
10150
10151 if (count_if(VL, [&SourceVectors](Value *V) {
10152 return !SourceVectors.contains(V);
10153 }) >= 2) {
10154 // Found 2nd source vector - cancel.
10155 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10156 "different source vectors.\n");
10157 return TreeEntry::NeedToGather;
10158 }
10159
10160 if (any_of(VL, [&SourceVectors](Value *V) {
10161 // The last InsertElement can have multiple uses.
10162 return SourceVectors.contains(V) && !V->hasOneUse();
10163 })) {
10164 assert(SLPReVec && "Only supported by REVEC.");
10165 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10166 "multiple uses.\n");
10167 return TreeEntry::NeedToGather;
10168 }
10169
10170 return TreeEntry::Vectorize;
10171 }
10172 case Instruction::Load: {
10173 // Check that a vectorized load would load the same memory as a scalar
10174 // load. For example, we don't want to vectorize loads that are smaller
10175 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10176 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10177 // from such a struct, we read/write packed bits disagreeing with the
10178 // unvectorized version.
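// Editorial example (illustrative): an i2 field has a type size of 2 bits but
// an alloc size of 8 bits, so a scalar store also rewrites the padding bits of
// its byte; a hypothetical <4 x i2> access would touch only 8 bits for all
// four lanes, which is not equivalent to the four scalar accesses.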
10179 auto IsGatheredNode = [&]() {
10180 if (!GatheredLoadsEntriesFirst)
10181 return false;
10182 return all_of(VL, [&](Value *V) {
10183 if (isa<PoisonValue>(V))
10184 return true;
10185 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10186 return TE->Idx >= *GatheredLoadsEntriesFirst;
10187 });
10188 });
10189 };
10190 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10191 case LoadsState::Vectorize:
10192 return TreeEntry::Vectorize;
10193 case LoadsState::CompressVectorize:
10194 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10195 // Delay slow vectorized nodes for better vectorization attempts.
10196 LoadEntriesToVectorize.insert(VectorizableTree.size());
10197 return TreeEntry::NeedToGather;
10198 }
10199 return IsGatheredNode() ? TreeEntry::NeedToGather
10200 : TreeEntry::CompressVectorize;
10201 case LoadsState::ScatterVectorize:
10202 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10203 // Delay slow vectorized nodes for better vectorization attempts.
10204 LoadEntriesToVectorize.insert(VectorizableTree.size());
10205 return TreeEntry::NeedToGather;
10206 }
10207 return IsGatheredNode() ? TreeEntry::NeedToGather
10208 : TreeEntry::ScatterVectorize;
10209 case LoadsState::StridedVectorize:
10210 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10211 // Delay slow vectorized nodes for better vectorization attempts.
10212 LoadEntriesToVectorize.insert(VectorizableTree.size());
10213 return TreeEntry::NeedToGather;
10214 }
10215 return IsGatheredNode() ? TreeEntry::NeedToGather
10216 : TreeEntry::StridedVectorize;
10217 case LoadsState::Gather:
10218#ifndef NDEBUG
10219 Type *ScalarTy = VL0->getType();
10220 if (DL->getTypeSizeInBits(ScalarTy) !=
10221 DL->getTypeAllocSizeInBits(ScalarTy))
10222 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10223 else if (any_of(VL, [](Value *V) {
10224 auto *LI = dyn_cast<LoadInst>(V);
10225 return !LI || !LI->isSimple();
10226 }))
10227 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10228 else
10229 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10230#endif // NDEBUG
10231 registerNonVectorizableLoads(VL);
10232 return TreeEntry::NeedToGather;
10233 }
10234 llvm_unreachable("Unexpected state of loads");
10235 }
10236 case Instruction::ZExt:
10237 case Instruction::SExt:
10238 case Instruction::FPToUI:
10239 case Instruction::FPToSI:
10240 case Instruction::FPExt:
10241 case Instruction::PtrToInt:
10242 case Instruction::IntToPtr:
10243 case Instruction::SIToFP:
10244 case Instruction::UIToFP:
10245 case Instruction::Trunc:
10246 case Instruction::FPTrunc:
10247 case Instruction::BitCast: {
10248 Type *SrcTy = VL0->getOperand(0)->getType();
10249 for (Value *V : VL) {
10250 if (isa<PoisonValue>(V))
10251 continue;
10252 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10253 if (Ty != SrcTy || !isValidElementType(Ty)) {
10254 LLVM_DEBUG(
10255 dbgs() << "SLP: Gathering casts with different src types.\n");
10256 return TreeEntry::NeedToGather;
10257 }
10258 }
10259 return TreeEntry::Vectorize;
10260 }
10261 case Instruction::ICmp:
10262 case Instruction::FCmp: {
10263 // Check that all of the compares have the same predicate.
10264 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10265 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10266 Type *ComparedTy = VL0->getOperand(0)->getType();
10267 for (Value *V : VL) {
10268 if (isa<PoisonValue>(V))
10269 continue;
10270 auto *Cmp = cast<CmpInst>(V);
10271 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10272 Cmp->getOperand(0)->getType() != ComparedTy) {
10273 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10274 return TreeEntry::NeedToGather;
10275 }
10276 }
10277 return TreeEntry::Vectorize;
10278 }
10279 case Instruction::Select:
10280 case Instruction::FNeg:
10281 case Instruction::Add:
10282 case Instruction::FAdd:
10283 case Instruction::Sub:
10284 case Instruction::FSub:
10285 case Instruction::Mul:
10286 case Instruction::FMul:
10287 case Instruction::UDiv:
10288 case Instruction::SDiv:
10289 case Instruction::FDiv:
10290 case Instruction::URem:
10291 case Instruction::SRem:
10292 case Instruction::FRem:
10293 case Instruction::Shl:
10294 case Instruction::LShr:
10295 case Instruction::AShr:
10296 case Instruction::And:
10297 case Instruction::Or:
10298 case Instruction::Xor:
10299 case Instruction::Freeze:
10300 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10301 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10302 auto *I = dyn_cast<Instruction>(V);
10303 return I && I->isBinaryOp() && !I->isFast();
10304 }))
10305 return TreeEntry::NeedToGather;
10306 return TreeEntry::Vectorize;
10307 case Instruction::GetElementPtr: {
10308 // We don't combine GEPs with complicated (nested) indexing.
10309 for (Value *V : VL) {
10310 auto *I = dyn_cast<GetElementPtrInst>(V);
10311 if (!I)
10312 continue;
10313 if (I->getNumOperands() != 2) {
10314 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10315 return TreeEntry::NeedToGather;
10316 }
10317 }
10318
10319 // We can't combine several GEPs into one vector if they operate on
10320 // different types.
10321 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10322 for (Value *V : VL) {
10323 auto *GEP = dyn_cast<GEPOperator>(V);
10324 if (!GEP)
10325 continue;
10326 Type *CurTy = GEP->getSourceElementType();
10327 if (Ty0 != CurTy) {
10328 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10329 return TreeEntry::NeedToGather;
10330 }
10331 }
10332
10333 // We don't combine GEPs with non-constant indexes.
10334 Type *Ty1 = VL0->getOperand(1)->getType();
10335 for (Value *V : VL) {
10336 auto *I = dyn_cast<GetElementPtrInst>(V);
10337 if (!I)
10338 continue;
10339 auto *Op = I->getOperand(1);
10340 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10341 (Op->getType() != Ty1 &&
10342 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10343 Op->getType()->getScalarSizeInBits() >
10344 DL->getIndexSizeInBits(
10345 V->getType()->getPointerAddressSpace())))) {
10346 LLVM_DEBUG(
10347 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10348 return TreeEntry::NeedToGather;
10349 }
10350 }
10351
10352 return TreeEntry::Vectorize;
10353 }
10354 case Instruction::Store: {
10355 // Check if the stores are consecutive or if we need to swizzle them.
10356 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10357 // Avoid types that are padded when being allocated as scalars, while
10358 // being packed together in a vector (such as i1).
10359 if (DL->getTypeSizeInBits(ScalarTy) !=
10360 DL->getTypeAllocSizeInBits(ScalarTy)) {
10361 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10362 return TreeEntry::NeedToGather;
10363 }
10364 // Make sure all stores in the bundle are simple - we can't vectorize
10365 // atomic or volatile stores.
10366 for (Value *V : VL) {
10367 auto *SI = cast<StoreInst>(V);
10368 if (!SI->isSimple()) {
10369 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10370 return TreeEntry::NeedToGather;
10371 }
10372 PointerOps.push_back(SI->getPointerOperand());
10373 }
10374
10375 // Check the order of pointer operands.
10376 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10377 Value *Ptr0;
10378 Value *PtrN;
10379 if (CurrentOrder.empty()) {
10380 Ptr0 = PointerOps.front();
10381 PtrN = PointerOps.back();
10382 } else {
10383 Ptr0 = PointerOps[CurrentOrder.front()];
10384 PtrN = PointerOps[CurrentOrder.back()];
10385 }
10386 std::optional<int64_t> Dist =
10387 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10388 // Check that the sorted pointer operands are consecutive.
10389 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10390 return TreeEntry::Vectorize;
10391 }
10392
10393 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10394 return TreeEntry::NeedToGather;
10395 }
10396 case Instruction::Call: {
10397 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10398 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10399 auto *I = dyn_cast<Instruction>(V);
10400 return I && !I->isFast();
10401 }))
10402 return TreeEntry::NeedToGather;
10403 // Check if the calls are all to the same vectorizable intrinsic or
10404 // library function.
10405 CallInst *CI = cast<CallInst>(VL0);
10406 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10407
10408 VFShape Shape = VFShape::get(
10409 CI->getFunctionType(),
10410 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10411 false /*HasGlobalPred*/);
10412 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10413
10414 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10415 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10416 return TreeEntry::NeedToGather;
10417 }
10418 Function *F = CI->getCalledFunction();
10419 unsigned NumArgs = CI->arg_size();
10420 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10421 for (unsigned J = 0; J != NumArgs; ++J)
10422 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10423 ScalarArgs[J] = CI->getArgOperand(J);
10424 for (Value *V : VL) {
10425 CallInst *CI2 = dyn_cast<CallInst>(V);
10426 if (!CI2 || CI2->getCalledFunction() != F ||
10427 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10428 (VecFunc &&
10429 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10430 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10431 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10432 << "\n");
10433 return TreeEntry::NeedToGather;
10434 }
10435 // Some intrinsics have scalar arguments and should be same in order for
10436 // them to be vectorized.
10437 for (unsigned J = 0; J != NumArgs; ++J) {
10438 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10439 Value *A1J = CI2->getArgOperand(J);
10440 if (ScalarArgs[J] != A1J) {
10441 LLVM_DEBUG(dbgs()
10442 << "SLP: mismatched arguments in call:" << *CI
10443 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10444 return TreeEntry::NeedToGather;
10445 }
10446 }
10447 }
10448 // Verify that the bundle operands are identical between the two calls.
10449 if (CI->hasOperandBundles() &&
10450 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10451 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10452 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10453 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10454 << "!=" << *V << '\n');
10455 return TreeEntry::NeedToGather;
10456 }
10457 }
10458 SmallVector<Type *> ArgTys =
10459 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10460 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10461 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10462 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10463 return TreeEntry::NeedToGather;
10464
10465 return TreeEntry::Vectorize;
10466 }
10467 case Instruction::ShuffleVector: {
10468 if (!S.isAltShuffle()) {
10469 // REVEC can support non alternate shuffle.
10470 if (SLPReVec && getShufflevectorNumGroups(VL))
10471 return TreeEntry::Vectorize;
10472 // If this is not an alternate sequence of opcode like add-sub
10473 // then do not vectorize this instruction.
10474 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10475 return TreeEntry::NeedToGather;
10476 }
10477 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10478 LLVM_DEBUG(
10479 dbgs()
10480 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10481 "the whole alt sequence is not profitable.\n");
10482 return TreeEntry::NeedToGather;
10483 }
10484
10485 return TreeEntry::Vectorize;
10486 }
10487 default:
10488 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10489 return TreeEntry::NeedToGather;
10490 }
10491}
10492
10493namespace {
10494/// Helps to correctly handle the operands of phi nodes, based on the \p Main
10495/// PHINode order of incoming basic blocks/values.
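/// Editorial example (illustrative): for two phis in one block,
///   %p0 = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
///   %p1 = phi i32 [ %c, %bb2 ], [ %d, %bb1 ]
/// operands are gathered per incoming block of the main phi, so operand 0
/// (%bb1) becomes {%a, %d} and operand 1 (%bb2) becomes {%b, %c}, independent
/// of the textual order of %p1's incoming edges.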
10496class PHIHandler {
10497 DominatorTree &DT;
10498 PHINode *Main = nullptr;
10499 SmallVector<Value *> Phis;
10500 SmallVector<SmallVector<Value *>> Operands;
10501
10502public:
10503 PHIHandler() = delete;
10504 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10505 : DT(DT), Main(Main), Phis(Phis),
10506 Operands(Main->getNumIncomingValues(),
10507 SmallVector<Value *>(Phis.size(), nullptr)) {}
10508 void buildOperands() {
10509 constexpr unsigned FastLimit = 4;
10510 if (Main->getNumIncomingValues() <= FastLimit) {
10511 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10512 BasicBlock *InBB = Main->getIncomingBlock(I);
10513 if (!DT.isReachableFromEntry(InBB)) {
10514 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10515 continue;
10516 }
10517 // Prepare the operand vector.
10518 for (auto [Idx, V] : enumerate(Phis)) {
10519 auto *P = dyn_cast<PHINode>(V);
10520 if (!P) {
10521 assert(isa<PoisonValue>(V) &&
10522 "Expected isa instruction or poison value.");
10523 Operands[I][Idx] = V;
10524 continue;
10525 }
10526 if (P->getIncomingBlock(I) == InBB)
10527 Operands[I][Idx] = P->getIncomingValue(I);
10528 else
10529 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10530 }
10531 }
10532 return;
10533 }
10534 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10535 Blocks;
10536 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10537 BasicBlock *InBB = Main->getIncomingBlock(I);
10538 if (!DT.isReachableFromEntry(InBB)) {
10539 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10540 continue;
10541 }
10542 Blocks.try_emplace(InBB).first->second.push_back(I);
10543 }
10544 for (auto [Idx, V] : enumerate(Phis)) {
10545 if (isa<PoisonValue>(V)) {
10546 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10547 Operands[I][Idx] = V;
10548 continue;
10549 }
10550 auto *P = cast<PHINode>(V);
10551 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10552 BasicBlock *InBB = P->getIncomingBlock(I);
10553 if (InBB == Main->getIncomingBlock(I)) {
10554 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10555 continue;
10556 Operands[I][Idx] = P->getIncomingValue(I);
10557 continue;
10558 }
10559 auto *It = Blocks.find(InBB);
10560 if (It == Blocks.end())
10561 continue;
10562 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10563 }
10564 }
10565 for (const auto &P : Blocks) {
10566 ArrayRef<unsigned> IncomingValues = P.second;
10567 if (IncomingValues.size() <= 1)
10568 continue;
10569 unsigned BasicI = IncomingValues.consume_front();
10570 for (unsigned I : IncomingValues) {
10571 assert(all_of(enumerate(Operands[I]),
10572 [&](const auto &Data) {
10573 return !Data.value() ||
10574 Data.value() == Operands[BasicI][Data.index()];
10575 }) &&
10576 "Expected empty operands list.");
10577 Operands[I] = Operands[BasicI];
10578 }
10579 }
10580 }
10581 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10582};
10583} // namespace
10584
10585 /// Returns main/alternate instructions for the given \p VL. Unlike
10586 /// getSameOpcode, it supports non-compatible instructions for better
10587 /// SplitVectorize node support.
10588 /// \returns the first main/alt instructions if the list contains only poisons
10589 /// and instructions with exactly 2 opcodes; returns a pair of nullptrs otherwise.
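/// For example (illustrative), VL = {add, add, sub, poison, add} yields the
/// pair (first add, first sub); a third distinct opcode, or two instructions
/// with the same opcode from different blocks, yields {nullptr, nullptr}.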
10590static std::pair<Instruction *, Instruction *>
10591 getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10592 Instruction *MainOp = nullptr;
10593 Instruction *AltOp = nullptr;
10594 for (Value *V : VL) {
10595 if (isa<PoisonValue>(V))
10596 continue;
10597 auto *I = dyn_cast<Instruction>(V);
10598 if (!I)
10599 return {};
10600 if (!MainOp) {
10601 MainOp = I;
10602 continue;
10603 }
10604 if (MainOp->getOpcode() == I->getOpcode()) {
10605 if (I->getParent() != MainOp->getParent())
10606 return {};
10607 continue;
10608 }
10609 if (!AltOp) {
10610 AltOp = I;
10611 continue;
10612 }
10613 if (AltOp->getOpcode() == I->getOpcode()) {
10614 if (I->getParent() != AltOp->getParent())
10615 return {};
10616 continue;
10617 }
10618 return {};
10619 }
10620 if (!AltOp)
10621 return {};
10622 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10623 "Expected different main and alt instructions.");
10624 return std::make_pair(MainOp, AltOp);
10625}
10626
10627/// Checks that every instruction appears once in the list and if not, packs
10628/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10629/// unique scalars is extended by poison values to the whole register size.
10630///
10631/// \returns false if \p VL could not be uniquified, in which case \p VL is
10632/// unchanged and \p ReuseShuffleIndices is empty.
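/// For example (illustrative), VL = {%a, %b, %a, %b} produces the unique list
/// {%a, %b} and ReuseShuffleIndices = {0, 1, 0, 1}; with \p TryPad the unique
/// scalars may additionally be padded with poison up to a full register size.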
10633 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10634 SmallVectorImpl<int> &ReuseShuffleIndices,
10635 const TargetTransformInfo &TTI,
10636 const TargetLibraryInfo &TLI,
10637 const InstructionsState &S,
10638 const BoUpSLP::EdgeInfo &UserTreeIdx,
10639 bool TryPad = false) {
10640 // Check that every instruction appears once in this bundle.
10641 SmallVector<Value *> UniqueValues;
10642 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10643 for (Value *V : VL) {
10644 if (isConstant(V)) {
10645 // Constants are always considered distinct, even if the same constant
10646 // appears multiple times in VL.
10647 ReuseShuffleIndices.emplace_back(
10648 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10649 UniqueValues.emplace_back(V);
10650 continue;
10651 }
10652 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10653 ReuseShuffleIndices.emplace_back(Res.first->second);
10654 if (Res.second)
10655 UniqueValues.emplace_back(V);
10656 }
10657
10658 // Easy case: VL has unique values and a "natural" size
10659 size_t NumUniqueScalarValues = UniqueValues.size();
10660 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10661 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10662 if (NumUniqueScalarValues == VL.size() &&
10663 (VectorizeNonPowerOf2 || IsFullVectors)) {
10664 ReuseShuffleIndices.clear();
10665 return true;
10666 }
10667
10668 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10669 if ((UserTreeIdx.UserTE &&
10670 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10671 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10672 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10673 "for nodes with padding.\n");
10674 ReuseShuffleIndices.clear();
10675 return false;
10676 }
10677
10678 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10679 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10680 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10681 return isa<UndefValue>(V) || !isConstant(V);
10682 }))) {
10683 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10684 S.getMainOp()->isSafeToRemove() &&
10685 (S.areInstructionsWithCopyableElements() ||
10686 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10687 // Find the number of elements, which forms full vectors.
10688 unsigned PWSz = getFullVectorNumberOfElements(
10689 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10690 PWSz = std::min<unsigned>(PWSz, VL.size());
10691 if (PWSz == VL.size()) {
10692 // We ended up with the same size after removing duplicates and
10693 // upgrading the resulting vector size to a "nice size". Just keep
10694 // the initial VL then.
10695 ReuseShuffleIndices.clear();
10696 } else {
10697 // Pad unique values with poison to grow the vector to a "nice" size
10698 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10699 UniqueValues.end());
10700 PaddedUniqueValues.append(
10701 PWSz - UniqueValues.size(),
10702 PoisonValue::get(UniqueValues.front()->getType()));
10703 // Check that the operations extended with poisons/copyable elements are
10704 // still valid for vectorization (div/rem are not allowed).
10705 if ((!S.areInstructionsWithCopyableElements() &&
10706 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10707 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10708 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10709 isa<CallInst>(S.getMainOp())))) {
10710 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10711 ReuseShuffleIndices.clear();
10712 return false;
10713 }
10714 VL = std::move(PaddedUniqueValues);
10715 }
10716 return true;
10717 }
10718 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10719 ReuseShuffleIndices.clear();
10720 return false;
10721 }
10722 VL = std::move(UniqueValues);
10723 return true;
10724}
10725
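// Illustrative sketch (hypothetical bundle, not from the source): for an
// 8-wide bundle that interleaves fadd and fsub scalars, e.g.
//   VL = {fadd0, fsub1, fadd2, fsub3, fadd4, fsub5, fadd6, fsub7},
// and a target with no legal alternate (addsub-like) instruction, the scalars
// are split into Op1 = {all fadds} and Op2 = {all fsubs}, and ReorderIndices =
// {0, 2, 4, 6, 1, 3, 5, 7} maps each element of the concatenated <Op1, Op2>
// vector back to its lane in the original bundle.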
10726bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10727 const InstructionsState &LocalState,
10728 SmallVectorImpl<Value *> &Op1,
10729 SmallVectorImpl<Value *> &Op2,
10730 OrdersType &ReorderIndices) const {
10731 constexpr unsigned SmallNodeSize = 4;
10732 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10734 return false;
10735
10736 // Check if this is a duplicate of another split entry.
10737 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10738 << ".\n");
10739 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10740 if (E->isSame(VL)) {
10741 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10742 << *LocalState.getMainOp() << ".\n");
10743 return false;
10744 }
10745 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10746 if (all_of(VL, [&](Value *V) {
10747 return isa<PoisonValue>(V) || Values.contains(V);
10748 })) {
10749 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10750 return false;
10751 }
10752 }
10753
10754 ReorderIndices.assign(VL.size(), VL.size());
10755 SmallBitVector Op1Indices(VL.size());
10756 for (auto [Idx, V] : enumerate(VL)) {
10757 auto *I = dyn_cast<Instruction>(V);
10758 if (!I) {
10759 Op1.push_back(V);
10760 Op1Indices.set(Idx);
10761 continue;
10762 }
10763 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10764 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10765 *TLI)) ||
10766 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10767 !isAlternateInstruction(I, LocalState.getMainOp(),
10768 LocalState.getAltOp(), *TLI))) {
10769 Op1.push_back(V);
10770 Op1Indices.set(Idx);
10771 continue;
10772 }
10773 Op2.push_back(V);
10774 }
10775 Type *ScalarTy = getValueType(VL.front());
10776 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10777 unsigned Opcode0 = LocalState.getOpcode();
10778 unsigned Opcode1 = LocalState.getAltOpcode();
10779 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10780 // Enable the split node only if the scalars do not form a legal alternate
10781 // instruction (like X86 addsub).
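// E.g. on X86 an alternating fadd/fsub bundle can map to a single ADDSUB
// instruction, in which case splitting it into two nodes would only lose that.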
10782 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10783 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10784 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10785 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10786 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10787 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10788 return false;
10789 // Enable split node, only if all nodes are power-of-2/full registers.
10790 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10791 for (unsigned Idx : seq<unsigned>(VL.size())) {
10792 if (Op1Indices.test(Idx)) {
10793 ReorderIndices[Op1Cnt] = Idx;
10794 ++Op1Cnt;
10795 } else {
10796 ReorderIndices[Op2Cnt] = Idx;
10797 ++Op2Cnt;
10798 }
10799 }
10800 if (isIdentityOrder(ReorderIndices))
10801 ReorderIndices.clear();
10802 SmallVector<int> Mask;
10803 if (!ReorderIndices.empty())
10804 inversePermutation(ReorderIndices, Mask);
10805 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10806 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10807 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10808 // Check for non-profitable single-register ops, which are better
10809 // represented as alternate ops.
10810 if (NumParts >= VL.size())
10811 return false;
10812 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10813 InstructionCost InsertCost = ::getShuffleCost(
10814 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10815 FixedVectorType *SubVecTy =
10816 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10817 InstructionCost NewShuffleCost =
10818 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10819 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10820 (Mask.empty() || InsertCost >= NewShuffleCost))
10821 return false;
10822 if ((LocalState.getMainOp()->isBinaryOp() &&
10823 LocalState.getAltOp()->isBinaryOp() &&
10824 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10825 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10826 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10827 (LocalState.getMainOp()->isUnaryOp() &&
10828 LocalState.getAltOp()->isUnaryOp())) {
10829 InstructionCost OriginalVecOpsCost =
10830 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10831 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10832 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10833 for (unsigned Idx : seq<unsigned>(VL.size())) {
10834 if (isa<PoisonValue>(VL[Idx]))
10835 continue;
10836 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10837 }
10838 InstructionCost OriginalCost =
10839 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10840 VecTy, OriginalMask, Kind);
10841 InstructionCost NewVecOpsCost =
10842 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10843 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10844 InstructionCost NewCost =
10845 NewVecOpsCost + InsertCost +
10846 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10847 VectorizableTree.front()->getOpcode() == Instruction::Store
10848 ? NewShuffleCost
10849 : 0);
10850 // If not profitable to split - exit.
10851 if (NewCost >= OriginalCost)
10852 return false;
10853 }
10854 return true;
10855}
10856
10857namespace {
10858 /// Accepts an incoming list of values, checks whether it can model
10859 /// "copyable" values as compatible operations, and generates the list of
10860 /// values for scheduling and the list of operands for the new nodes.
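/// For example (illustrative), for VL = {add %x, %y; %z} the plain value %z can
/// be treated as a "copyable" element and modeled as the idempotent operation
/// add %z, 0, so the whole bundle shares the Add main opcode.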
10861class InstructionsCompatibilityAnalysis {
10862 DominatorTree &DT;
10863 const DataLayout &DL;
10864 const TargetTransformInfo &TTI;
10865 const TargetLibraryInfo &TLI;
10866 unsigned MainOpcode = 0;
10867 Instruction *MainOp = nullptr;
10868
10869 /// Checks if the opcode is supported as the main opcode for copyable
10870 /// elements.
10871 static bool isSupportedOpcode(const unsigned Opcode) {
10872 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
10873 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
10874 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
10875 Opcode == Instruction::And || Opcode == Instruction::Or ||
10876 Opcode == Instruction::Xor;
10877 }
10878
10879 /// Identifies the best candidate value, which represents the main opcode
10880 /// operation.
10881 /// Currently the best candidate is the Add instruction whose parent block
10882 /// has the highest DFS incoming number (the block that dominates the others).
10883 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10884 BasicBlock *Parent = nullptr;
10885 // Checks if the instruction has supported opcode.
10886 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10887 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10888 return false;
10889 return I && isSupportedOpcode(I->getOpcode()) &&
10890 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10891 };
10892 // Exclude operand instructions immediately to improve compile time; they
10893 // cannot be scheduled anyway.
10894 SmallDenseSet<Value *, 8> Operands;
10895 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10896 bool AnyUndef = false;
10897 for (Value *V : VL) {
10898 auto *I = dyn_cast<Instruction>(V);
10899 if (!I) {
10900 AnyUndef |= isa<UndefValue>(V);
10901 continue;
10902 }
10903 if (!DT.isReachableFromEntry(I->getParent()))
10904 continue;
10905 if (Candidates.empty()) {
10906 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10907 Parent = I->getParent();
10908 Operands.insert(I->op_begin(), I->op_end());
10909 continue;
10910 }
10911 if (Parent == I->getParent()) {
10912 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10913 Operands.insert(I->op_begin(), I->op_end());
10914 continue;
10915 }
10916 auto *NodeA = DT.getNode(Parent);
10917 auto *NodeB = DT.getNode(I->getParent());
10918 assert(NodeA && "Should only process reachable instructions");
10919 assert(NodeB && "Should only process reachable instructions");
10920 assert((NodeA == NodeB) ==
10921 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10922 "Different nodes should have different DFS numbers");
10923 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10924 Candidates.clear();
10925 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10926 Parent = I->getParent();
10927 Operands.clear();
10928 Operands.insert(I->op_begin(), I->op_end());
10929 }
10930 }
10931 unsigned BestOpcodeNum = 0;
10932 MainOp = nullptr;
10933 bool UsedOutside = false;
10934 for (const auto &P : Candidates) {
10935 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
10936 if (UsedOutside && !PUsedOutside)
10937 continue;
10938 if (!UsedOutside && PUsedOutside)
10939 BestOpcodeNum = 0;
10940 if (P.second.size() < BestOpcodeNum)
10941 continue;
10942 // If there are inner dependencies - skip.
10943 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
10944 return Operands.contains(I);
10945 }))
10946 continue;
10947 UsedOutside = PUsedOutside;
10948 for (Instruction *I : P.second) {
10949 if (IsSupportedInstruction(I, AnyUndef)) {
10950 MainOp = I;
10951 BestOpcodeNum = P.second.size();
10952 break;
10953 }
10954 }
10955 }
10956 if (MainOp) {
10957 // Do not match, if any copyable is a terminator from the same block as
10958 // the main operation.
10959 if (any_of(VL, [&](Value *V) {
10960 auto *I = dyn_cast<Instruction>(V);
10961 return I && I->getParent() == MainOp->getParent() &&
10962 I->isTerminator();
10963 })) {
10964 MainOp = nullptr;
10965 return;
10966 }
10967 MainOpcode = MainOp->getOpcode();
10968 }
10969 }
10970
10971 /// Returns the idempotent value for the \p MainOp with the detected \p
10972 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10973 /// the operand itself, since V or V == V.
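/// E.g. Add/Sub/Or/Xor/Shl/LShr use 0, And uses -1 (all ones), and SDiv/UDiv
/// use 1 as the right-hand operand (per ConstantExpr::getBinOpIdentity).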
10974 Value *selectBestIdempotentValue() const {
10975 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10976 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10977 !MainOp->isCommutative());
10978 }
10979
10980 /// Returns the value and operands for \p V, considering whether it is an
10981 /// original instruction whose actual operands should be returned, or a
10982 /// copyable element that should be represented as an idempotent instruction.
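/// E.g. (illustrative) for a copyable scalar %v under an Add main opcode this
/// returns {%v, 0}, i.e. the operands of the modeled "add %v, 0".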
10983 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10984 if (isa<PoisonValue>(V))
10985 return {V, V};
10986 if (!S.isCopyableElement(V))
10987 return convertTo(cast<Instruction>(V), S).second;
10988 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10989 return {V, selectBestIdempotentValue()};
10990 }
10991
10992 /// Builds operands for the original instructions.
10993 void
10994 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10995 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10996
10997 unsigned ShuffleOrOp =
10998 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10999 Instruction *VL0 = S.getMainOp();
11000
11001 switch (ShuffleOrOp) {
11002 case Instruction::PHI: {
11003 auto *PH = cast<PHINode>(VL0);
11004
11005 // Keeps the reordered operands to avoid code duplication.
11006 PHIHandler Handler(DT, PH, VL);
11007 Handler.buildOperands();
11008 Operands.assign(PH->getNumOperands(), {});
11009 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11010 Operands[I].assign(Handler.getOperands(I).begin(),
11011 Handler.getOperands(I).end());
11012 return;
11013 }
11014 case Instruction::ExtractValue:
11015 case Instruction::ExtractElement:
11016 // This is a special case, as it does not gather, but at the same time
11017 // we are not extending buildTreeRec() towards the operands.
11018 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11019 return;
11020 case Instruction::InsertElement:
11021 Operands.assign(2, {VL.size(), nullptr});
11022 for (auto [Idx, V] : enumerate(VL)) {
11023 auto *IE = cast<InsertElementInst>(V);
11024 for (auto [OpIdx, Ops] : enumerate(Operands))
11025 Ops[Idx] = IE->getOperand(OpIdx);
11026 }
11027 return;
11028 case Instruction::Load:
11029 Operands.assign(
11030 1, {VL.size(),
11031 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11032 for (auto [V, Op] : zip(VL, Operands.back())) {
11033 auto *LI = dyn_cast<LoadInst>(V);
11034 if (!LI)
11035 continue;
11036 Op = LI->getPointerOperand();
11037 }
11038 return;
11039 case Instruction::ZExt:
11040 case Instruction::SExt:
11041 case Instruction::FPToUI:
11042 case Instruction::FPToSI:
11043 case Instruction::FPExt:
11044 case Instruction::PtrToInt:
11045 case Instruction::IntToPtr:
11046 case Instruction::SIToFP:
11047 case Instruction::UIToFP:
11048 case Instruction::Trunc:
11049 case Instruction::FPTrunc:
11050 case Instruction::BitCast:
11051 case Instruction::ICmp:
11052 case Instruction::FCmp:
11053 case Instruction::Select:
11054 case Instruction::FNeg:
11055 case Instruction::Add:
11056 case Instruction::FAdd:
11057 case Instruction::Sub:
11058 case Instruction::FSub:
11059 case Instruction::Mul:
11060 case Instruction::FMul:
11061 case Instruction::UDiv:
11062 case Instruction::SDiv:
11063 case Instruction::FDiv:
11064 case Instruction::URem:
11065 case Instruction::SRem:
11066 case Instruction::FRem:
11067 case Instruction::Shl:
11068 case Instruction::LShr:
11069 case Instruction::AShr:
11070 case Instruction::And:
11071 case Instruction::Or:
11072 case Instruction::Xor:
11073 case Instruction::Freeze:
11074 case Instruction::Store:
11075 case Instruction::ShuffleVector:
11076 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11077 for (auto [Idx, V] : enumerate(VL)) {
11078 auto *I = dyn_cast<Instruction>(V);
11079 if (!I) {
11080 for (auto [OpIdx, Ops] : enumerate(Operands))
11081 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11082 continue;
11083 }
11084 auto [Op, ConvertedOps] = convertTo(I, S);
11085 for (auto [OpIdx, Ops] : enumerate(Operands))
11086 Ops[Idx] = ConvertedOps[OpIdx];
11087 }
11088 return;
11089 case Instruction::GetElementPtr: {
11090 Operands.assign(2, {VL.size(), nullptr});
11091 // Need to cast all indices to the same type before vectorization to
11092 // avoid crash.
11093 // Required to be able to find correct matches between different gather
11094 // nodes and reuse the vectorized values rather than trying to gather them
11095 // again.
11096 const unsigned IndexIdx = 1;
11097 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11098 Type *Ty =
11099 all_of(VL,
11100 [&](Value *V) {
11101 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11102 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11103 })
11104 ? VL0Ty
11105 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11106 ->getPointerOperandType()
11107 ->getScalarType());
11108 for (auto [Idx, V] : enumerate(VL)) {
11109 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11110 if (!GEP) {
11111 Operands[0][Idx] = V;
11112 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11113 continue;
11114 }
11115 Operands[0][Idx] = GEP->getPointerOperand();
11116 auto *Op = GEP->getOperand(IndexIdx);
11117 auto *CI = dyn_cast<ConstantInt>(Op);
11118 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11119 CI, Ty, CI->getValue().isSignBitSet(), DL)
11120 : Op;
11121 }
11122 return;
11123 }
11124 case Instruction::Call: {
11125 auto *CI = cast<CallInst>(VL0);
11126 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
11127 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11128 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
11129 continue;
11130 auto &Ops = Operands.emplace_back();
11131 for (Value *V : VL) {
11132 auto *I = dyn_cast<Instruction>(V);
11133 Ops.push_back(I ? I->getOperand(Idx)
11134 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11135 }
11136 }
11137 return;
11138 }
11139 default:
11140 break;
11141 }
11142 llvm_unreachable("Unexpected vectorization of the instructions.");
11143 }
11144
11145public:
11146 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11147 const TargetTransformInfo &TTI,
11148 const TargetLibraryInfo &TLI)
11149 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11150
11151 InstructionsState
11152 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11153 bool TryCopyableElementsVectorization,
11154 bool WithProfitabilityCheck = false,
11155 bool SkipSameCodeCheck = false) {
11156 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11157 ? InstructionsState::invalid()
11158 : getSameOpcode(VL, TLI);
11159 if (S)
11160 return S;
11161 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11162 return S;
11163 findAndSetMainInstruction(VL, R);
11164 if (!MainOp)
11165 return InstructionsState::invalid();
11166 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11167 if (!WithProfitabilityCheck)
11168 return S;
11169 // Check if it is profitable to vectorize the instruction.
11170 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11171 auto BuildCandidates =
11172 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11173 Value *V2) {
11174 if (V1 != V2 && isa<PHINode>(V1))
11175 return;
11176 auto *I1 = dyn_cast<Instruction>(V1);
11177 auto *I2 = dyn_cast<Instruction>(V2);
11178 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11179 I1->getParent() != I2->getParent())
11180 return;
11181 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11182 };
11183 if (VL.size() == 2) {
11184 // Check if the operands allow better vectorization.
11185 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11186 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11187 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11188 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11189 R.findBestRootPair(Candidates1) &&
11190 R.findBestRootPair(Candidates2);
11191 if (!Res && isCommutative(MainOp)) {
11192 Candidates1.clear();
11193 Candidates2.clear();
11194 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11195 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11196 Res = !Candidates1.empty() && !Candidates2.empty() &&
11197 R.findBestRootPair(Candidates1) &&
11198 R.findBestRootPair(Candidates2);
11199 }
11200 if (!Res)
11201 return InstructionsState::invalid();
11202 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11203 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11204 InstructionCost VectorCost;
11205 FixedVectorType *VecTy =
11206 getWidenedType(S.getMainOp()->getType(), VL.size());
11207 switch (MainOpcode) {
11208 case Instruction::Add:
11209 case Instruction::Sub:
11210 case Instruction::LShr:
11211 case Instruction::Shl:
11212 case Instruction::SDiv:
11213 case Instruction::UDiv:
11214 case Instruction::And:
11215 case Instruction::Or:
11216 case Instruction::Xor:
11217 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11218 break;
11219 default:
11220 llvm_unreachable("Unexpected instruction.");
11221 }
11222 if (VectorCost > ScalarCost)
11223 return InstructionsState::invalid();
11224 return S;
11225 }
11226 assert(Operands.size() == 2 && "Unexpected number of operands!");
11227 unsigned CopyableNum =
11228 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11229 if (CopyableNum < VL.size() / 2)
11230 return S;
11231 // Too many phi copyables - exit.
11232 const unsigned Limit = VL.size() / 24;
11233 if ((CopyableNum >= VL.size() - Limit ||
11234 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11235 CopyableNum >= MaxPHINumOperands) &&
11236 all_of(VL, [&](Value *V) {
11237 return isa<PHINode>(V) || !S.isCopyableElement(V);
11238 }))
11239 return InstructionsState::invalid();
11240 // Check profitability if number of copyables > VL.size() / 2.
11241 // 1. Reorder operands for better matching.
11242 if (isCommutative(MainOp)) {
11243 for (auto &Ops : Operands) {
11244 // Make instructions the first operands.
11245 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11246 std::swap(Ops.front(), Ops.back());
11247 continue;
11248 }
11249 // Make constants the second operands.
11250 if (isa<Constant>(Ops.front())) {
11251 std::swap(Ops.front(), Ops.back());
11252 continue;
11253 }
11254 }
11255 }
11256 // 2. Check, if operands can be vectorized.
11257 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11258 return InstructionsState::invalid();
11259 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11260 if (allConstant(Ops) || isSplat(Ops))
11261 return true;
11262 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
11263 // one is different.
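// E.g. {%x, %x, %x, %y} qualifies as almost-splat, {%x, %x, %y, %y} does not.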
11264 constexpr unsigned Limit = 4;
11265 if (Operands.front().size() >= Limit) {
11266 SmallDenseMap<const Value *, unsigned> Counters;
11267 for (Value *V : Ops) {
11268 if (isa<UndefValue>(V))
11269 continue;
11270 ++Counters[V];
11271 }
11272 if (Counters.size() == 2 &&
11273 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11274 return C.second == 1;
11275 }))
11276 return true;
11277 }
11278 // First operand not a constant or splat? Last attempt - check for
11279 // potential vectorization.
11280 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11281 InstructionsState OpS = Analysis.buildInstructionsState(
11282 Ops, R, /*TryCopyableElementsVectorization=*/true);
11283 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11284 return false;
11285 unsigned CopyableNum =
11286 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11287 return CopyableNum <= VL.size() / 2;
11288 };
11289 if (!CheckOperand(Operands.front()))
11290 return InstructionsState::invalid();
11291
11292 return S;
11293 }
11294
11295 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11296 ArrayRef<Value *> VL) {
11297 assert(S && "Invalid state!");
11299 if (S.areInstructionsWithCopyableElements()) {
11300 MainOp = S.getMainOp();
11301 MainOpcode = S.getOpcode();
11302 Operands.assign(MainOp->getNumOperands(),
11303 BoUpSLP::ValueList(VL.size(), nullptr));
11304 for (auto [Idx, V] : enumerate(VL)) {
11305 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11306 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11307 Operands[OperandIdx][Idx] = Operand;
11308 }
11309 } else {
11310 buildOriginalOperands(S, VL, Operands);
11311 }
11312 return Operands;
11313 }
11314};
11315} // namespace
11316
11317BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11318 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11319 bool TryCopyableElementsVectorization) const {
11320 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11321
11322 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11323 InstructionsState S = Analysis.buildInstructionsState(
11324 VL, *this, TryCopyableElementsVectorization,
11325 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11326
11327 bool AreScatterAllGEPSameBlock = false;
11328 if (!S) {
11329 SmallVector<unsigned> SortedIndices;
11330 BasicBlock *BB = nullptr;
11331 bool IsScatterVectorizeUserTE =
11332 UserTreeIdx.UserTE &&
11333 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11334 AreScatterAllGEPSameBlock =
11335 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11336 VL.size() > 2 &&
11337 all_of(VL,
11338 [&BB](Value *V) {
11339 auto *I = dyn_cast<GetElementPtrInst>(V);
11340 if (!I)
11341 return doesNotNeedToBeScheduled(V);
11342 if (!BB)
11343 BB = I->getParent();
11344 return BB == I->getParent() && I->getNumOperands() == 2;
11345 }) &&
11346 BB &&
11347 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
11348 *SE, SortedIndices));
11349 if (!AreScatterAllGEPSameBlock) {
11350 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11351 "C,S,B,O, small shuffle. \n";
11352 dbgs() << "[";
11353 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11354 dbgs() << "]\n");
11355 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11356 /*TryToFindDuplicates=*/true,
11357 /*TrySplitVectorize=*/true);
11358 }
11359 // Reset S to make it GetElementPtr kind of node.
11360 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11361 assert(It != VL.end() && "Expected at least one GEP.");
11362 S = getSameOpcode(*It, *TLI);
11363 }
11364 assert(S && "Must be valid.");
11365
11366 // Don't handle vectors.
11367 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11368 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11369 // Do not try to pack to avoid extra instructions here.
11370 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11371 /*TryToFindDuplicates=*/false);
11372 }
11373
11374 // Check that all of the users of the scalars that we want to vectorize are
11375 // schedulable.
11376 BasicBlock *BB = S.getMainOp()->getParent();
11377
11378 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11379 !DT->isReachableFromEntry(BB)) {
11380 // Don't go into unreachable blocks. They may contain instructions with
11381 // dependency cycles which confuse the final scheduling.
11382 // Do not vectorize EH and non-returning blocks, not profitable in most
11383 // cases.
11384 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11385 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11386 }
11387
11388 // Don't go into catchswitch blocks, which can happen with PHIs.
11389 // Such blocks can only have PHIs and the catchswitch. There is no
11390 // place to insert a shuffle if we need to, so just avoid that issue.
11392 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11393 // Do not try to pack to avoid extra instructions here.
11394 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11395 /*TryToFindDuplicates=*/false);
11396 }
11397
11398 // Don't handle scalable vectors
11399 if (S.getOpcode() == Instruction::ExtractElement &&
11400 isa<ScalableVectorType>(
11401 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11402 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11403 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11404 }
11405
11406 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11407 // a load), in which case peek through to include it in the tree, without
11408 // ballooning over-budget.
11409 if (Depth >= RecursionMaxDepth &&
11410 (S.isAltShuffle() || VL.size() < 4 ||
11411 !(match(S.getMainOp(), m_Load(m_Value())) ||
11412 all_of(VL, [&S](const Value *I) {
11413 return match(I,
11414 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11415 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11416 })))) {
11417 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11418 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11419 }
11420
11421 // Check if this is a duplicate of another entry.
11422 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11423 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11424 if (E->isSame(VL)) {
11425 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11426 << ".\n");
11427 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11428 }
11429 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11430 if (all_of(VL, [&](Value *V) {
11431 return isa<PoisonValue>(V) || Values.contains(V) ||
11432 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11433 LI->getLoopFor(S.getMainOp()->getParent()) &&
11434 isVectorized(V));
11435 })) {
11436 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11437 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11438 }
11439 }
11440
11441 // If all of the operands are identical or constant we have a simple solution.
11442 // If we deal with insert/extract instructions, they all must have constant
11443 // indices, otherwise we should gather them, not try to vectorize.
11444 // If alternate op node with 2 elements with gathered operands - do not
11445 // vectorize.
11446 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11447 if (!S || !S.isAltShuffle() || VL.size() > 2)
11448 return false;
11449 if (VectorizableTree.size() < MinTreeSize)
11450 return false;
11451 if (Depth >= RecursionMaxDepth - 1)
11452 return true;
11453 // Check if all operands are extracts, part of vector node or can build a
11454 // regular vectorize node.
11455 SmallVector<unsigned, 8> InstsCount;
11456 for (Value *V : VL) {
11457 auto *I = cast<Instruction>(V);
11458 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11459 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11460 }));
11461 }
11462 bool IsCommutative =
11463 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11464 if ((IsCommutative &&
11465 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11466 (!IsCommutative &&
11467 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11468 return true;
11469 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11471 auto *I1 = cast<Instruction>(VL.front());
11472 auto *I2 = cast<Instruction>(VL.back());
11473 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11474 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11475 I2->getOperand(Op));
11476 if (static_cast<unsigned>(count_if(
11477 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11478 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
11479 })) >= S.getMainOp()->getNumOperands() / 2)
11480 return false;
11481 if (S.getMainOp()->getNumOperands() > 2)
11482 return true;
11483 if (IsCommutative) {
11484 // Check permuted operands.
11485 Candidates.clear();
11486 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11487 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11488 I2->getOperand((Op + 1) % E));
11489 if (any_of(
11490 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11491 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
11492 }))
11493 return false;
11494 }
11495 return true;
11496 };
11497 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11498 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11499 if (!AreAllSameInsts || isSplat(VL) ||
11500 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11501 S.getMainOp()) &&
11502 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11503 NotProfitableForVectorization(VL)) {
11504 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11505 dbgs() << "[";
11506 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11507 dbgs() << "]\n");
11508 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11509 }
11510
11511 // Don't vectorize ephemeral values.
11512 if (!EphValues.empty()) {
11513 for (Value *V : VL) {
11514 if (EphValues.count(V)) {
11515 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11516 << ") is ephemeral.\n");
11517 // Do not try to pack to avoid extra instructions here.
11518 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11519 /*TryToFindDuplicates=*/false);
11520 }
11521 }
11522 }
11523
11524 // We now know that this is a vector of instructions of the same type from
11525 // the same block.
11526
11527 // Check whether any of the instructions in the bundle are already in the
11528 // tree and whether the node would not be profitable to vectorize as a small
11529 // alternate node.
11530 if (S.isAltShuffle()) {
11531 auto GetNumVectorizedExtracted = [&]() {
11532 APInt Extracted = APInt::getZero(VL.size());
11533 APInt Vectorized = APInt::getAllOnes(VL.size());
11534 for (auto [Idx, V] : enumerate(VL)) {
11535 auto *I = dyn_cast<Instruction>(V);
11536 if (!I || doesNotNeedToBeScheduled(I) ||
11537 all_of(I->operands(), [&](const Use &U) {
11538 return isa<ExtractElementInst>(U.get());
11539 }))
11540 continue;
11541 if (isVectorized(I))
11542 Vectorized.clearBit(Idx);
11543 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11544 Extracted.setBit(Idx);
11545 }
11546 return std::make_pair(Vectorized, Extracted);
11547 };
11548 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11549 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11550 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11551 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11552 // Rough cost estimation, if the vector code (+ potential extracts) is
11553 // more profitable than the scalar + buildvector.
11554 Type *ScalarTy = VL.front()->getType();
11555 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11556 InstructionCost VectorizeCostEstimate =
11557 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11558 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11559 /*Insert=*/false, /*Extract=*/true, Kind);
11560 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11561 *TTI, ScalarTy, VecTy, Vectorized,
11562 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11563 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11564 }
11565 if (PreferScalarize) {
11566 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11567 "node is not profitable.\n");
11568 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11569 }
11570 }
11571
11572 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11573 if (UserIgnoreList && !UserIgnoreList->empty()) {
11574 for (Value *V : VL) {
11575 if (UserIgnoreList->contains(V)) {
11576 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11577 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11578 }
11579 }
11580 }
11581
11582 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11583}
11584
11585void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11586 const EdgeInfo &UserTreeIdx,
11587 unsigned InterleaveFactor) {
11588 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11589
11590 SmallVector<int> ReuseShuffleIndices;
11591 SmallVector<Value *> VL(VLRef);
11592
11593 // Tries to build split node.
11594 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11595 SmallVector<Value *> Op1, Op2;
11596 OrdersType ReorderIndices;
11597 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11598 return false;
11599
11600 auto Invalid = ScheduleBundle::invalid();
11601 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11602 UserTreeIdx, {}, ReorderIndices);
11603 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11604 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11605 InstructionsState S = getSameOpcode(Op, *TLI);
11606 if (S && (isa<LoadInst>(S.getMainOp()) ||
11607 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11608 // Build gather node for loads, they will be gathered later.
11609 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11610 Idx == 0 ? 0 : Op1.size());
11611 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11612 } else {
11613 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11614 Idx == 0 ? 0 : Op1.size());
11615 buildTreeRec(Op, Depth, {TE, Idx});
11616 }
11617 };
11618 AddNode(Op1, 0);
11619 AddNode(Op2, 1);
11620 return true;
11621 };
11622
11623 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11624 bool AreConsts = false;
11625 for (Value *V : VL) {
11626 if (isa<PoisonValue>(V))
11627 continue;
11628 if (isa<Constant>(V)) {
11629 AreConsts = true;
11630 continue;
11631 }
11632 if (!isa<PHINode>(V))
11633 return false;
11634 }
11635 return AreConsts;
11636 };
11637 if (AreOnlyConstsWithPHIs(VL)) {
11638 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11639 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11640 return;
11641 }
11642
11643 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11644 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11645 InstructionsState S = Legality.getInstructionsState();
11646 if (!Legality.isLegal()) {
11647 if (Legality.trySplitVectorize()) {
11648 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11649 // Last chance to try to vectorize alternate node.
11650 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11651 return;
11652 }
11653 if (!S)
11654 Legality = getScalarsVectorizationLegality(
11655 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11656 if (!Legality.isLegal()) {
11657 if (Legality.tryToFindDuplicates())
11658 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11659 UserTreeIdx);
11660
11661 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11662 return;
11663 }
11664 S = Legality.getInstructionsState();
11665 }
11666
11667 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11668 if (S.isAltShuffle() && TrySplitNode(S))
11669 return;
11670
11671 // Check that every instruction appears once in this bundle.
11672 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11673 /*TryPad=*/true)) {
11674 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11675 return;
11676 }
11677
11678 // Perform specific checks for each particular instruction kind.
11679 bool IsScatterVectorizeUserTE =
11680 UserTreeIdx.UserTE &&
11681 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11682 OrdersType CurrentOrder;
11683 SmallVector<Value *> PointerOps;
11684 StridedPtrInfo SPtrInfo;
11685 TreeEntry::EntryState State = getScalarsVectorizationState(
11686 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11687 if (State == TreeEntry::NeedToGather) {
11688 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11689 return;
11690 }
11691
11692 Instruction *VL0 = S.getMainOp();
11693 BasicBlock *BB = VL0->getParent();
11694 auto &BSRef = BlocksSchedules[BB];
11695 if (!BSRef)
11696 BSRef = std::make_unique<BlockScheduling>(BB);
11697
11698 BlockScheduling &BS = *BSRef;
11699
11700 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11701 std::optional<ScheduleBundle *> BundlePtr =
11702 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11703#ifdef EXPENSIVE_CHECKS
11704 // Make sure we didn't break any internal invariants
11705 BS.verify();
11706#endif
11707 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11708 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11709 // Last chance to try to vectorize alternate node.
11710 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11711 return;
11712 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11713 NonScheduledFirst.insert(VL.front());
11714 if (S.getOpcode() == Instruction::Load &&
11715 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11716 registerNonVectorizableLoads(VL);
11717 return;
11718 }
11718 }
11719 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11720 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11721 ScheduleBundle Empty;
11722 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11723 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11724
11725 unsigned ShuffleOrOp =
11726 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11727 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11728 // Postpone PHI nodes creation
11729 SmallVector<unsigned> PHIOps;
11730 for (unsigned I : seq<unsigned>(Operands.size())) {
11731 ArrayRef<Value *> Op = Operands[I];
11732 if (Op.empty())
11733 continue;
11734 InstructionsState S = getSameOpcode(Op, *TLI);
11735 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11736 buildTreeRec(Op, Depth + 1, {TE, I});
11737 else
11738 PHIOps.push_back(I);
11739 }
11740 for (unsigned I : PHIOps)
11741 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11742 };
11743 switch (ShuffleOrOp) {
11744 case Instruction::PHI: {
11745 TreeEntry *TE =
11746 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11747 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11748 TE->dump());
11749
11750 TE->setOperands(Operands);
11751 CreateOperandNodes(TE, Operands);
11752 return;
11753 }
11754 case Instruction::ExtractValue:
11755 case Instruction::ExtractElement: {
11756 if (CurrentOrder.empty()) {
11757 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11758 } else {
11759 LLVM_DEBUG({
11760 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11761 "with order";
11762 for (unsigned Idx : CurrentOrder)
11763 dbgs() << " " << Idx;
11764 dbgs() << "\n";
11765 });
11766 fixupOrderingIndices(CurrentOrder);
11767 }
11768 // Insert new order with initial value 0, if it does not exist,
11769 // otherwise return the iterator to the existing one.
11770 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11771 ReuseShuffleIndices, CurrentOrder);
11772 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11773 "(ExtractValueInst/ExtractElementInst).\n";
11774 TE->dump());
11775 // This is a special case, as it does not gather, but at the same time
11776 // we are not extending buildTreeRec() towards the operands.
11777 TE->setOperands(Operands);
11778 return;
11779 }
11780 case Instruction::InsertElement: {
11781 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11782
11783 auto OrdCompare = [](const std::pair<int, int> &P1,
11784 const std::pair<int, int> &P2) {
11785 return P1.first > P2.first;
11786 };
11787 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11788 decltype(OrdCompare)>
11789 Indices(OrdCompare);
11790 for (int I = 0, E = VL.size(); I < E; ++I) {
11791 unsigned Idx = *getElementIndex(VL[I]);
11792 Indices.emplace(Idx, I);
11793 }
11794 OrdersType CurrentOrder(VL.size(), VL.size());
11795 bool IsIdentity = true;
11796 for (int I = 0, E = VL.size(); I < E; ++I) {
11797 CurrentOrder[Indices.top().second] = I;
11798 IsIdentity &= Indices.top().second == I;
11799 Indices.pop();
11800 }
11801 if (IsIdentity)
11802 CurrentOrder.clear();
11803 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11804 {}, CurrentOrder);
11805 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11806 TE->dump());
11807
11808 TE->setOperands(Operands);
11809 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11810 return;
11811 }
11812 case Instruction::Load: {
11813 // Check that a vectorized load would load the same memory as a scalar
11814 // load. For example, we don't want to vectorize loads that are smaller
11815 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11816 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11817 // from such a struct, we read/write packed bits disagreeing with the
11818 // unvectorized version.
11819 TreeEntry *TE = nullptr;
11820 fixupOrderingIndices(CurrentOrder);
11821 switch (State) {
11822 case TreeEntry::Vectorize:
11823 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11824 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11825 if (CurrentOrder.empty())
11826 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11827 TE->dump());
11828 else
11830 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11831 TE->dump());
11832 break;
11833 case TreeEntry::CompressVectorize:
11834 // Vectorizing non-consecutive loads with (masked)load + compress.
11835 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11836 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11837 LLVM_DEBUG(
11838 dbgs()
11839 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11840 TE->dump());
11841 break;
11842 case TreeEntry::StridedVectorize:
11843 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11844 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11845 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11846 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11847 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11848 TE->dump());
11849 break;
11850 case TreeEntry::ScatterVectorize:
11851 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11852 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11853 UserTreeIdx, ReuseShuffleIndices);
11854 LLVM_DEBUG(
11855 dbgs()
11856 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11857 TE->dump());
11858 break;
11859 case TreeEntry::CombinedVectorize:
11860 case TreeEntry::SplitVectorize:
11861 case TreeEntry::NeedToGather:
11862 llvm_unreachable("Unexpected loads state.");
11863 }
11864 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11865 assert(Operands.size() == 1 && "Expected a single operand only");
11866 SmallVector<int> Mask;
11867 inversePermutation(CurrentOrder, Mask);
11868 reorderScalars(Operands.front(), Mask);
11869 }
11870 TE->setOperands(Operands);
11871 if (State == TreeEntry::ScatterVectorize)
11872 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11873 return;
11874 }
11875 case Instruction::ZExt:
11876 case Instruction::SExt:
11877 case Instruction::FPToUI:
11878 case Instruction::FPToSI:
11879 case Instruction::FPExt:
11880 case Instruction::PtrToInt:
11881 case Instruction::IntToPtr:
11882 case Instruction::SIToFP:
11883 case Instruction::UIToFP:
11884 case Instruction::Trunc:
11885 case Instruction::FPTrunc:
11886 case Instruction::BitCast: {
11887 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11888 std::make_pair(std::numeric_limits<unsigned>::min(),
11889 std::numeric_limits<unsigned>::max()));
11890 if (ShuffleOrOp == Instruction::ZExt ||
11891 ShuffleOrOp == Instruction::SExt) {
11892 CastMaxMinBWSizes = std::make_pair(
11893 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11894 PrevMaxBW),
11895 std::min<unsigned>(
11896 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11897 PrevMinBW));
11898 } else if (ShuffleOrOp == Instruction::Trunc) {
11899 CastMaxMinBWSizes = std::make_pair(
11900 std::max<unsigned>(
11901 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11902 PrevMaxBW),
11903 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11904 PrevMinBW));
11905 }
11906 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11907 ReuseShuffleIndices);
11908 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11909 TE->dump());
11910
11911 TE->setOperands(Operands);
11912 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11913 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11914 if (ShuffleOrOp == Instruction::Trunc) {
11915 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11916 } else if (ShuffleOrOp == Instruction::SIToFP ||
11917 ShuffleOrOp == Instruction::UIToFP) {
11918 unsigned NumSignBits =
11919 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11920 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11921 APInt Mask = DB->getDemandedBits(OpI);
11922 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11923 }
11924 if (NumSignBits * 2 >=
11925 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11926 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11927 }
11928 return;
11929 }
11930 case Instruction::ICmp:
11931 case Instruction::FCmp: {
11932 // Check that all of the compares have the same predicate.
11933 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11934 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11935 ReuseShuffleIndices);
11936 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11937 TE->dump());
11938
11939 VLOperands Ops(VL, Operands, S, *this);
11940 if (cast<CmpInst>(VL0)->isCommutative()) {
11941 // Commutative predicate - collect + sort operands of the instructions
11942 // so that each side is more likely to have the same opcode.
11944 "Commutative Predicate mismatch");
11945 Ops.reorder();
11946 Operands.front() = Ops.getVL(0);
11947 Operands.back() = Ops.getVL(1);
11948 } else {
11949 // Collect operands - commute if it uses the swapped predicate.
11950 for (auto [Idx, V] : enumerate(VL)) {
11951 if (isa<PoisonValue>(V))
11952 continue;
11953 auto *Cmp = cast<CmpInst>(V);
11954 if (Cmp->getPredicate() != P0)
11955 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11956 }
11957 }
11958 TE->setOperands(Operands);
11959 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11960 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11961 if (ShuffleOrOp == Instruction::ICmp) {
11962 unsigned NumSignBits0 =
11963 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11964 if (NumSignBits0 * 2 >=
11965 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11966 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11967 unsigned NumSignBits1 =
11968 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11969 if (NumSignBits1 * 2 >=
11970 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11971 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11972 }
11973 return;
11974 }
11975 case Instruction::Select:
11976 case Instruction::FNeg:
11977 case Instruction::Add:
11978 case Instruction::FAdd:
11979 case Instruction::Sub:
11980 case Instruction::FSub:
11981 case Instruction::Mul:
11982 case Instruction::FMul:
11983 case Instruction::UDiv:
11984 case Instruction::SDiv:
11985 case Instruction::FDiv:
11986 case Instruction::URem:
11987 case Instruction::SRem:
11988 case Instruction::FRem:
11989 case Instruction::Shl:
11990 case Instruction::LShr:
11991 case Instruction::AShr:
11992 case Instruction::And:
11993 case Instruction::Or:
11994 case Instruction::Xor:
11995 case Instruction::Freeze: {
11996 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11997 ReuseShuffleIndices);
11998 LLVM_DEBUG(
11999 dbgs() << "SLP: added a new TreeEntry "
12000 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12001 TE->dump());
12002
12003 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
12004 VLOperands Ops(VL, Operands, S, *this);
12005 Ops.reorder();
12006 Operands[0] = Ops.getVL(0);
12007 Operands[1] = Ops.getVL(1);
12008 }
12009 TE->setOperands(Operands);
12010 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12011 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12012 return;
12013 }
12014 case Instruction::GetElementPtr: {
12015 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12016 ReuseShuffleIndices);
12017 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12018 TE->dump());
12019 TE->setOperands(Operands);
12020
12021 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12022 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12023 return;
12024 }
12025 case Instruction::Store: {
12026 bool Consecutive = CurrentOrder.empty();
12027 if (!Consecutive)
12028 fixupOrderingIndices(CurrentOrder);
12029 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12030 ReuseShuffleIndices, CurrentOrder);
12031 if (Consecutive)
12032 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12033 TE->dump());
12034 else
12035 LLVM_DEBUG(
12036 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12037 TE->dump());
12038 TE->setOperands(Operands);
12039 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
12040 return;
12041 }
12042 case Instruction::Call: {
12043 // Check if the calls are all to the same vectorizable intrinsic or
12044 // library function.
12045 CallInst *CI = cast<CallInst>(VL0);
12046 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12047
12048 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12049 ReuseShuffleIndices);
12050 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12051 TE->dump());
12052 if (isCommutative(VL0)) {
12053 VLOperands Ops(VL, Operands, S, *this);
12054 Ops.reorder();
12055 Operands[0] = Ops.getVL(0);
12056 Operands[1] = Ops.getVL(1);
12057 }
12058 TE->setOperands(Operands);
12059 for (unsigned I : seq<unsigned>(CI->arg_size())) {
12060 // For scalar operands there is no need to create an entry since there is
12061 // no need to vectorize them.
12062 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
12063 continue;
12064 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12065 }
12066 return;
12067 }
12068 case Instruction::ShuffleVector: {
12069 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12070 ReuseShuffleIndices);
12071 if (S.isAltShuffle()) {
12072 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12073 TE->dump());
12074 } else {
12075 assert(SLPReVec && "Only supported by REVEC.");
12076 LLVM_DEBUG(
12077 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12078 TE->dump());
12079 }
12080
12081 // Reorder operands if reordering would enable vectorization.
12082 auto *CI = dyn_cast<CmpInst>(VL0);
12083 if (CI && any_of(VL, [](Value *V) {
12084 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
12085 })) {
12086 auto *MainCI = cast<CmpInst>(S.getMainOp());
12087 auto *AltCI = cast<CmpInst>(S.getAltOp());
12088 CmpInst::Predicate MainP = MainCI->getPredicate();
12089 CmpInst::Predicate AltP = AltCI->getPredicate();
12090 assert(MainP != AltP &&
12091 "Expected different main/alternate predicates.");
12092 // Collect operands - commute if it uses the swapped predicate or
12093 // alternate operation.
12094 for (auto [Idx, V] : enumerate(VL)) {
12095 if (isa<PoisonValue>(V))
12096 continue;
12097 auto *Cmp = cast<CmpInst>(V);
12098
12099 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
12100 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12101 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12102 } else {
12103 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12104 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12105 }
12106 }
12107 TE->setOperands(Operands);
12108 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12109 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12110 return;
12111 }
12112
12113 if (isa<BinaryOperator>(VL0) || CI) {
12114 VLOperands Ops(VL, Operands, S, *this);
12115 Ops.reorder();
12116 Operands[0] = Ops.getVL(0);
12117 Operands[1] = Ops.getVL(1);
12118 }
12119 TE->setOperands(Operands);
12120 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12121 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12122 return;
12123 }
12124 default:
12125 break;
12126 }
12127 llvm_unreachable("Unexpected vectorization of the instructions.");
12128}
12129
12130unsigned BoUpSLP::canMapToVector(Type *T) const {
12131 unsigned N = 1;
12132 Type *EltTy = T;
12133
12134 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12135 if (EltTy->isEmptyTy())
12136 return 0;
12137 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12138 // Check that struct is homogeneous.
12139 for (const auto *Ty : ST->elements())
12140 if (Ty != *ST->element_begin())
12141 return 0;
12142 N *= ST->getNumElements();
12143 EltTy = *ST->element_begin();
12144 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12145 N *= AT->getNumElements();
12146 EltTy = AT->getElementType();
12147 } else {
12148 auto *VT = cast<FixedVectorType>(EltTy);
12149 N *= VT->getNumElements();
12150 EltTy = VT->getElementType();
12151 }
12152 }
12153
12154 if (!isValidElementType(EltTy))
12155 return 0;
12156 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12157 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12158 VTSize != DL->getTypeStoreSizeInBits(T))
12159 return 0;
12160 return N;
12161}
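// Illustrative example (not part of the original source): for a homogeneous
// struct type { float, float, float, float }, the loop above yields N == 4
// and EltTy == float, so the function returns 4 provided <4 x float> (128
// bits) lies within [MinVecRegSize, MaxVecRegSize] and matches the store
// size of the original aggregate.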
12162
12163bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12164 SmallVectorImpl<unsigned> &CurrentOrder,
12165 bool ResizeAllowed) const {
12166 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12167 assert(It != VL.end() && "Expected at least one extract instruction.");
12168 auto *E0 = cast<Instruction>(*It);
12169 assert(
12171 "Invalid opcode");
12172 // Check if all of the extracts come from the same vector and from the
12173 // correct offset.
12174 Value *Vec = E0->getOperand(0);
12175
12176 CurrentOrder.clear();
12177
12178 // We have to extract from a vector/aggregate with the same number of elements.
12179 unsigned NElts;
12180 if (E0->getOpcode() == Instruction::ExtractValue) {
12181 NElts = canMapToVector(Vec->getType());
12182 if (!NElts)
12183 return false;
12184 // Check if load can be rewritten as load of vector.
12185 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12186 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12187 return false;
12188 } else {
12189 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12190 }
12191
12192 unsigned E = VL.size();
12193 if (!ResizeAllowed && NElts != E)
12194 return false;
12195 SmallVector<int> Indices(E, PoisonMaskElem);
12196 unsigned MinIdx = NElts, MaxIdx = 0;
12197 for (auto [I, V] : enumerate(VL)) {
12198 auto *Inst = dyn_cast<Instruction>(V);
12199 if (!Inst)
12200 continue;
12201 if (Inst->getOperand(0) != Vec)
12202 return false;
12203 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12204 if (isa<UndefValue>(EE->getIndexOperand()))
12205 continue;
12206 std::optional<unsigned> Idx = getExtractIndex(Inst);
12207 if (!Idx)
12208 return false;
12209 const unsigned ExtIdx = *Idx;
12210 if (ExtIdx >= NElts)
12211 continue;
12212 Indices[I] = ExtIdx;
12213 if (MinIdx > ExtIdx)
12214 MinIdx = ExtIdx;
12215 if (MaxIdx < ExtIdx)
12216 MaxIdx = ExtIdx;
12217 }
12218 if (MaxIdx - MinIdx + 1 > E)
12219 return false;
12220 if (MaxIdx + 1 <= E)
12221 MinIdx = 0;
12222
12223 // Check that all of the indices extract from the correct offset.
12224 bool ShouldKeepOrder = true;
12225 // Assign to all items the initial value E so we can check if the extract
12226 // instruction index was used already.
12227 // Also, later we can check that all the indices are used and we have a
12228 // consecutive access in the extract instructions, by checking that no
12229 // element of CurrentOrder still has value E.
12230 CurrentOrder.assign(E, E);
12231 for (unsigned I = 0; I < E; ++I) {
12232 if (Indices[I] == PoisonMaskElem)
12233 continue;
12234 const unsigned ExtIdx = Indices[I] - MinIdx;
12235 if (CurrentOrder[ExtIdx] != E) {
12236 CurrentOrder.clear();
12237 return false;
12238 }
12239 ShouldKeepOrder &= ExtIdx == I;
12240 CurrentOrder[ExtIdx] = I;
12241 }
12242 if (ShouldKeepOrder)
12243 CurrentOrder.clear();
12244
12245 return ShouldKeepOrder;
12246}
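// Illustrative example (not part of the original source): for VL built from
// extractelement <4 x i32> %v at indices 1, 0, 3, 2 the function returns
// false and leaves CurrentOrder == {1, 0, 3, 2}, i.e. the extracts are
// reusable but need reordering; for indices 0, 1, 2, 3 it returns true and
// clears CurrentOrder because the original order can be kept.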
12247
12248bool BoUpSLP::areAllUsersVectorized(
12249 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12250 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12251 all_of(I->users(), [this](User *U) {
12252 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12253 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12254 });
12255}
12256
12257void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12258 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12259 SmallVectorImpl<Value *> *OpScalars,
12260 SmallVectorImpl<Value *> *AltScalars) const {
12261 unsigned Sz = Scalars.size();
12262 Mask.assign(Sz, PoisonMaskElem);
12263 SmallVector<int> OrderMask;
12264 if (!ReorderIndices.empty())
12265 inversePermutation(ReorderIndices, OrderMask);
12266 for (unsigned I = 0; I < Sz; ++I) {
12267 unsigned Idx = I;
12268 if (!ReorderIndices.empty())
12269 Idx = OrderMask[I];
12270 if (isa<PoisonValue>(Scalars[Idx]))
12271 continue;
12272 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12273 if (IsAltOp(OpInst)) {
12274 Mask[I] = Sz + Idx;
12275 if (AltScalars)
12276 AltScalars->push_back(OpInst);
12277 } else {
12278 Mask[I] = Idx;
12279 if (OpScalars)
12280 OpScalars->push_back(OpInst);
12281 }
12282 }
12283 if (!ReuseShuffleIndices.empty()) {
12284 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12285 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12286 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12287 });
12288 Mask.swap(NewMask);
12289 }
12290}
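// Illustrative example (not part of the original source): for an alternate
// add/sub node with 4 scalars {add, sub, add, sub} and no reordering or
// reuse, IsAltOp selects the subs, so the resulting mask is <0, 5, 2, 7>:
// lanes taken from the "main" vector keep their index, lanes taken from the
// alternate vector are offset by Sz (here 4).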
12291
12292 static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12293 Instruction *AltOp,
12294 const TargetLibraryInfo &TLI) {
12295 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12296}
12297
12298 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12299 Instruction *AltOp,
12300 const TargetLibraryInfo &TLI) {
12301 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12302 auto *AltCI = cast<CmpInst>(AltOp);
12303 CmpInst::Predicate MainP = MainCI->getPredicate();
12304 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12305 assert(MainP != AltP && "Expected different main/alternate predicates.");
12306 auto *CI = cast<CmpInst>(I);
12307 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12308 return false;
12309 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12310 return true;
12311 CmpInst::Predicate P = CI->getPredicate();
12312 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12313
12314 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12315 "CmpInst expected to match either main or alternate predicate or "
12316 "their swap.");
12317 return MainP != P && MainP != SwappedP;
12318 }
12319 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12320}
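// Illustrative note (not part of the original source): for compares the
// classification is predicate-based. With MainOp == icmp slt and AltOp ==
// icmp ugt, an icmp sgt instruction is never classified as the alternate
// operation: sgt is the swapped form of the main predicate slt, so even when
// the operand-aware checks above do not match, the final check
// `MainP != P && MainP != SwappedP` evaluates to false.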
12321
12322TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12323 assert(!Ops.empty());
12324 const auto *Op0 = Ops.front();
12325
12326 const bool IsConstant = all_of(Ops, [](Value *V) {
12327 // TODO: We should allow undef elements here
12328 return isConstant(V) && !isa<UndefValue>(V);
12329 });
12330 const bool IsUniform = all_of(Ops, [=](Value *V) {
12331 // TODO: We should allow undef elements here
12332 return V == Op0;
12333 });
12334 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12335 // TODO: We should allow undef elements here
12336 if (auto *CI = dyn_cast<ConstantInt>(V))
12337 return CI->getValue().isPowerOf2();
12338 return false;
12339 });
12340 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12341 // TODO: We should allow undef elements here
12342 if (auto *CI = dyn_cast<ConstantInt>(V))
12343 return CI->getValue().isNegatedPowerOf2();
12344 return false;
12345 });
12346
12347 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12348 if (IsConstant && IsUniform)
12349 VK = TTI::OK_UniformConstantValue;
12350 else if (IsConstant)
12351 VK = TTI::OK_NonUniformConstantValue;
12352 else if (IsUniform)
12353 VK = TTI::OK_UniformValue;
12354
12355 TTI::OperandValueProperties VP = TTI::OP_None;
12356 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12357 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12358
12359 return {VK, VP};
12360}
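// Illustrative example (not part of the original source): if every operand
// in Ops is the constant 8, the operands are constant, uniform and a power
// of two, so the returned info describes a uniform-constant, power-of-two
// operand; mixed constants would instead be reported as non-uniform
// constants.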
12361
12362namespace {
12363/// The base class for shuffle instruction emission and shuffle cost estimation.
12364class BaseShuffleAnalysis {
12365protected:
12366 Type *ScalarTy = nullptr;
12367
12368 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12369
12370 /// V is expected to be a vectorized value.
12371 /// When REVEC is disabled, there is no difference between VF and
12372 /// VNumElements.
12373 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12374 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12375 /// of 8.
12376 unsigned getVF(Value *V) const {
12377 assert(V && "V cannot be nullptr");
12378 assert(isa<FixedVectorType>(V->getType()) &&
12379 "V does not have FixedVectorType");
12380 assert(ScalarTy && "ScalarTy cannot be nullptr");
12381 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12382 unsigned VNumElements =
12383 cast<FixedVectorType>(V->getType())->getNumElements();
12384 assert(VNumElements > ScalarTyNumElements &&
12385 "the number of elements of V is not large enough");
12386 assert(VNumElements % ScalarTyNumElements == 0 &&
12387 "the number of elements of V is not a vectorized value");
12388 return VNumElements / ScalarTyNumElements;
12389 }
12390
12391 /// Checks if the mask is an identity mask.
12392 /// \param IsStrict if true, the function returns false if the mask size does
12393 /// not match the vector size.
12394 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12395 bool IsStrict) {
12396 int Limit = Mask.size();
12397 int VF = VecTy->getNumElements();
12398 int Index = -1;
12399 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12400 return true;
12401 if (!IsStrict) {
12402 // Consider extract subvector starting from index 0.
12403 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12404 Index == 0)
12405 return true;
12406 // All VF-size submasks are identity (e.g.
12407 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12408 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12409 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12410 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12411 ShuffleVectorInst::isIdentityMask(Slice, VF);
12412 }))
12413 return true;
12414 }
12415 return false;
12416 }
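// Illustrative example (not part of the original source): with VF == 4, the
// mask <poison, poison, poison, poison, 0, 1, 2, 3> is accepted in
// non-strict mode (every 4-wide submask is either all-poison or identity)
// but rejected in strict mode because the mask size (8) differs from VF.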
12417
12418 /// Tries to combine 2 different masks into single one.
12419 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12420 /// change the size of the vector, \p LocalVF is the original size of the
12421 /// shuffled vector.
12422 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12423 ArrayRef<int> ExtMask) {
12424 unsigned VF = Mask.size();
12425 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12426 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12427 if (ExtMask[I] == PoisonMaskElem)
12428 continue;
12429 int MaskedIdx = Mask[ExtMask[I] % VF];
12430 NewMask[I] =
12431 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12432 }
12433 Mask.swap(NewMask);
12434 }
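// Illustrative example (not part of the original source): composing an inner
// reverse shuffle Mask == <1, 0> (VF == 2, LocalVF == 2) with the outer
// ExtMask == <1, 0, 3, 2> produces <0, 1, 0, 1>, i.e. the pair of
// permutations collapses into a single mask over the original operand.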
12435
12436 /// Looks through shuffles trying to reduce final number of shuffles in the
12437 /// code. The function looks through the previously emitted shuffle
12438 /// instructions and properly marks indices in the mask as undef.
12439 /// For example, given the code
12440 /// \code
12441 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12442 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12443 /// \endcode
12444 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12445 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12446 /// <0, 1, 2, 3> for the shuffle.
12447 /// If 2 operands are of different size, the smallest one will be resized and
12448 /// the mask recalculated properly.
12449 /// For example, given the code
12450 /// \code
12451 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12452 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12453 /// \endcode
12454 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12455 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12456 /// <0, 1, 2, 3> for the shuffle.
12457 /// So, it tries to transform permutations to simple vector merge, if
12458 /// possible.
12459 /// \param V The input vector which must be shuffled using the given \p Mask.
12460 /// If the better candidate is found, \p V is set to this best candidate
12461 /// vector.
12462 /// \param Mask The input mask for the shuffle. If the best candidate is found
12463 /// during looking-through-shuffles attempt, it is updated accordingly.
12464 /// \param SinglePermute true if the shuffle operation is originally a
12465 /// single-value-permutation. In this case the look-through-shuffles procedure
12466 /// may look for resizing shuffles as the best candidates.
12467 /// \return true if the shuffle results in the non-resizing identity shuffle
12468 /// (and thus can be ignored), false - otherwise.
12469 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12470 bool SinglePermute) {
12471 Value *Op = V;
12472 ShuffleVectorInst *IdentityOp = nullptr;
12473 SmallVector<int> IdentityMask;
12474 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12475 // Exit if not a fixed vector type or changing size shuffle.
12476 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12477 if (!SVTy)
12478 break;
12479 // Remember the identity or broadcast mask, if it is not a resizing
12480 // shuffle. If no better candidates are found, this Op and Mask will be
12481 // used in the final shuffle.
12482 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12483 if (!IdentityOp || !SinglePermute ||
12484 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12486 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask, IdentityMask.size()))) {
12487 IdentityOp = SV;
12488 // Store the current mask in IdentityMask so that we do not lose
12489 // this info if IdentityOp is selected as the best candidate for the
12490 // permutation.
12491 IdentityMask.assign(Mask);
12492 }
12493 }
12494 // Remember the broadcast mask. If no better candidates are found, this Op
12495 // and Mask will be used in the final shuffle.
12496 // Zero splat can be used as identity too, since it might be used with
12497 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12498 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12499 // expensive, and the analysis finds out that the source vector is just a
12500 // broadcast, this original mask can be transformed to the identity mask <0,
12501 // 1, 2, 3>.
12502 // \code
12503 // %0 = shuffle %v, poison, zeroinitalizer
12504 // %res = shuffle %0, poison, <3, 1, 2, 0>
12505 // \endcode
12506 // may be transformed to
12507 // \code
12508 // %0 = shuffle %v, poison, zeroinitalizer
12509 // %res = shuffle %0, poison, <0, 1, 2, 3>
12510 // \endcode
12511 if (SV->isZeroEltSplat()) {
12512 IdentityOp = SV;
12513 IdentityMask.assign(Mask);
12514 }
12515 int LocalVF = Mask.size();
12516 if (auto *SVOpTy =
12517 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12518 LocalVF = SVOpTy->getNumElements();
12519 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12520 for (auto [Idx, I] : enumerate(Mask)) {
12521 if (I == PoisonMaskElem ||
12522 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12523 continue;
12524 ExtMask[Idx] = SV->getMaskValue(I);
12525 }
12526 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12527 SV->getOperand(0),
12528 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12529 .all();
12530 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12531 SV->getOperand(1),
12532 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12533 .all();
12534 if (!IsOp1Undef && !IsOp2Undef) {
12535 // Update mask and mark undef elems.
12536 for (int &I : Mask) {
12537 if (I == PoisonMaskElem)
12538 continue;
12539 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12540 PoisonMaskElem)
12541 I = PoisonMaskElem;
12542 }
12543 break;
12544 }
12545 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12546 combineMasks(LocalVF, ShuffleMask, Mask);
12547 Mask.swap(ShuffleMask);
12548 if (IsOp2Undef)
12549 Op = SV->getOperand(0);
12550 else
12551 Op = SV->getOperand(1);
12552 }
12553 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12554 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12555 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12556 if (IdentityOp) {
12557 V = IdentityOp;
12558 assert(Mask.size() == IdentityMask.size() &&
12559 "Expected masks of same sizes.");
12560 // Clear known poison elements.
12561 for (auto [I, Idx] : enumerate(Mask))
12562 if (Idx == PoisonMaskElem)
12563 IdentityMask[I] = PoisonMaskElem;
12564 Mask.swap(IdentityMask);
12565 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12566 return SinglePermute &&
12567 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12568 /*IsStrict=*/true) ||
12569 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12570 Shuffle->isZeroEltSplat() &&
12572 all_of(enumerate(Mask), [&](const auto &P) {
12573 return P.value() == PoisonMaskElem ||
12574 Shuffle->getShuffleMask()[P.index()] == 0;
12575 })));
12576 }
12577 V = Op;
12578 return false;
12579 }
12580 V = Op;
12581 return true;
12582 }
12583
12584 /// Smart shuffle instruction emission, walks through shuffle trees and
12585 /// tries to find the best matching vector for the actual shuffle
12586 /// instruction.
12587 template <typename T, typename ShuffleBuilderTy>
12588 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12589 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12590 assert(V1 && "Expected at least one vector value.");
12591 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12592 SmallVector<int> NewMask(Mask);
12593 if (ScalarTyNumElements != 1) {
12594 assert(SLPReVec && "FixedVectorType is not expected.");
12595 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12596 Mask = NewMask;
12597 }
12598 if (V2)
12599 Builder.resizeToMatch(V1, V2);
12600 int VF = Mask.size();
12601 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12602 VF = FTy->getNumElements();
12603 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12604 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12605 .all()) {
12606 // Peek through shuffles.
12607 Value *Op1 = V1;
12608 Value *Op2 = V2;
12609 int VF =
12610 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12611 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12612 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12613 for (int I = 0, E = Mask.size(); I < E; ++I) {
12614 if (Mask[I] < VF)
12615 CombinedMask1[I] = Mask[I];
12616 else
12617 CombinedMask2[I] = Mask[I] - VF;
12618 }
12619 Value *PrevOp1;
12620 Value *PrevOp2;
12621 do {
12622 PrevOp1 = Op1;
12623 PrevOp2 = Op2;
12624 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12625 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12626 // Check if we have 2 resizing shuffles - need to peek through operands
12627 // again.
12628 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12629 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12630 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12631 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12632 if (I == PoisonMaskElem)
12633 continue;
12634 ExtMask1[Idx] = SV1->getMaskValue(I);
12635 }
12636 SmallBitVector UseMask1 = buildUseMask(
12637 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12638 ->getNumElements(),
12639 ExtMask1, UseMask::SecondArg);
12640 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12641 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12642 if (I == PoisonMaskElem)
12643 continue;
12644 ExtMask2[Idx] = SV2->getMaskValue(I);
12645 }
12646 SmallBitVector UseMask2 = buildUseMask(
12647 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12648 ->getNumElements(),
12649 ExtMask2, UseMask::SecondArg);
12650 if (SV1->getOperand(0)->getType() ==
12651 SV2->getOperand(0)->getType() &&
12652 SV1->getOperand(0)->getType() != SV1->getType() &&
12653 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12654 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12655 Op1 = SV1->getOperand(0);
12656 Op2 = SV2->getOperand(0);
12657 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12658 int LocalVF = ShuffleMask1.size();
12659 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12660 LocalVF = FTy->getNumElements();
12661 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12662 CombinedMask1.swap(ShuffleMask1);
12663 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12664 LocalVF = ShuffleMask2.size();
12665 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12666 LocalVF = FTy->getNumElements();
12667 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12668 CombinedMask2.swap(ShuffleMask2);
12669 }
12670 }
12671 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12672 Builder.resizeToMatch(Op1, Op2);
12673 VF = std::max(cast<VectorType>(Op1->getType())
12674 ->getElementCount()
12675 .getKnownMinValue(),
12676 cast<VectorType>(Op2->getType())
12677 ->getElementCount()
12678 .getKnownMinValue());
12679 for (int I = 0, E = Mask.size(); I < E; ++I) {
12680 if (CombinedMask2[I] != PoisonMaskElem) {
12681 assert(CombinedMask1[I] == PoisonMaskElem &&
12682 "Expected undefined mask element");
12683 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12684 }
12685 }
12686 if (Op1 == Op2 &&
12687 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12688 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12689 isa<ShuffleVectorInst>(Op1) &&
12690 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12691 ArrayRef(CombinedMask1))))
12692 return Builder.createIdentity(Op1);
12693 return Builder.createShuffleVector(
12694 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12695 CombinedMask1);
12696 }
12697 if (isa<PoisonValue>(V1))
12698 return Builder.createPoison(
12699 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12700 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12701 assert(V1 && "Expected non-null value after looking through shuffles.");
12702
12703 if (!IsIdentity)
12704 return Builder.createShuffleVector(V1, NewMask);
12705 return Builder.createIdentity(V1);
12706 }
12707
12708 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12709 /// shuffle emission.
12710 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12711 ArrayRef<int> Mask) {
12712 for (unsigned I : seq<unsigned>(CommonMask.size()))
12713 if (Mask[I] != PoisonMaskElem)
12714 CommonMask[I] = I;
12715 }
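// Illustrative example (not part of the original source): if a shuffle was
// just emitted with Mask == <1, poison, 0, poison>, lanes 0 and 2 of
// CommonMask are rewritten to 0 and 2 respectively, so that subsequent
// shuffles index into the freshly created vector, while the poison lanes
// are left untouched.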
12716};
12717} // namespace
12718
12719 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
12720static std::pair<InstructionCost, InstructionCost>
12721 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12722 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12723 Type *ScalarTy, VectorType *VecTy) {
12724 InstructionCost ScalarCost = 0;
12725 InstructionCost VecCost = 0;
12726 // Here we differentiate two cases: (1) when Ptrs represent a regular
12727 // vectorization tree node (as they are pointer arguments of scattered
12728 // loads) or (2) when Ptrs are the arguments of loads or stores being
12729 // vectorized as a plain wide unit-stride load/store since all the
12730 // loads/stores are known to be from/to adjacent locations.
12731 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12732 // Case 2: estimate costs for pointer related costs when vectorizing to
12733 // a wide load/store.
12734 // Scalar cost is estimated as a set of pointers with known relationship
12735 // between them.
12736 // For vector code we will use BasePtr as argument for the wide load/store
12737 // but we also need to account all the instructions which are going to
12738 // stay in vectorized code due to uses outside of these scalar
12739 // loads/stores.
12740 ScalarCost = TTI.getPointersChainCost(
12741 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12742 CostKind);
12743
12744 SmallVector<const Value *> PtrsRetainedInVecCode;
12745 for (Value *V : Ptrs) {
12746 if (V == BasePtr) {
12747 PtrsRetainedInVecCode.push_back(V);
12748 continue;
12749 }
12750 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12751 // For simplicity assume Ptr to stay in vectorized code if it's not a
12752 // GEP instruction. We don't care since its cost is considered free.
12753 // TODO: We should check for any uses outside of vectorizable tree
12754 // rather than just single use.
12755 if (!Ptr || !Ptr->hasOneUse())
12756 PtrsRetainedInVecCode.push_back(V);
12757 }
12758
12759 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12760 // If all pointers stay in vectorized code then we don't have
12761 // any savings on that.
12762 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12763 }
12764 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12765 TTI::PointersChainInfo::getKnownStride(),
12766 VecTy, CostKind);
12767 } else {
12768 // Case 1: Ptrs are the arguments of loads that we are going to transform
12769 // into masked gather load intrinsic.
12770 // All the scalar GEPs will be removed as a result of vectorization.
12771 // For any external uses of some lanes extract element instructions will
12772 // be generated (which cost is estimated separately).
12773 TTI::PointersChainInfo PtrsInfo =
12774 all_of(Ptrs,
12775 [](const Value *V) {
12776 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12777 return Ptr && !Ptr->hasAllConstantIndices();
12778 })
12779 ? TTI::PointersChainInfo::getUnknownStride()
12780 : TTI::PointersChainInfo::getKnownStride();
12781
12782 ScalarCost =
12783 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12784 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12785 if (!BaseGEP) {
12786 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12787 if (It != Ptrs.end())
12788 BaseGEP = cast<GEPOperator>(*It);
12789 }
12790 if (BaseGEP) {
12791 SmallVector<const Value *> Indices(BaseGEP->indices());
12792 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12793 BaseGEP->getPointerOperand(), Indices, VecTy,
12794 CostKind);
12795 }
12796 }
12797
12798 return std::make_pair(ScalarCost, VecCost);
12799}
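// Illustrative note (not part of the original source): for four consecutive
// loads a[0]..a[3] folded into one wide load, Ptrs holds the four address
// GEPs and BasePtr the address of a[0]; the scalar side is costed as a
// unit-stride pointer chain, while on the vector side only pointers that
// must stay scalar (e.g. GEPs with extra users) contribute, so the returned
// (scalar, vector) pair can show a saving for the vector code.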
12800
12801void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12802 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12803 "Expected gather node without reordering.");
12804 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12805 SmallSet<size_t, 2> LoadKeyUsed;
12806
12807 // Do not reorder nodes if the node is small (just 2 elements), all-constant,
12808 // or all instructions already have the same opcode.
12809 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12810 all_of(TE.Scalars, isConstant))
12811 return;
12812
12813 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12814 return VectorizableTree[Idx]->isSame(TE.Scalars);
12815 }))
12816 return;
12817
12818 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12819 Key = hash_combine(hash_value(LI->getParent()), Key);
12820 Value *Ptr =
12821 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12822 if (LoadKeyUsed.contains(Key)) {
12823 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12824 if (LIt != LoadsMap.end()) {
12825 for (LoadInst *RLI : LIt->second) {
12826 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12827 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12828 /*StrictCheck=*/true))
12829 return hash_value(RLI->getPointerOperand());
12830 }
12831 for (LoadInst *RLI : LIt->second) {
12832 if (arePointersCompatible(RLI->getPointerOperand(),
12833 LI->getPointerOperand(), *TLI)) {
12834 hash_code SubKey = hash_value(RLI->getPointerOperand());
12835 return SubKey;
12836 }
12837 }
12838 if (LIt->second.size() > 2) {
12839 hash_code SubKey =
12840 hash_value(LIt->second.back()->getPointerOperand());
12841 return SubKey;
12842 }
12843 }
12844 }
12845 LoadKeyUsed.insert(Key);
12846 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12847 return hash_value(LI->getPointerOperand());
12848 };
12849 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12850 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12851 bool IsOrdered = true;
12852 unsigned NumInstructions = 0;
12853 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12854 // nodes.
12855 for (auto [I, V] : enumerate(TE.Scalars)) {
12856 size_t Key = 1, Idx = 1;
12857 if (auto *Inst = dyn_cast<Instruction>(V);
12858 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
12859 !isDeleted(Inst) && !isVectorized(V)) {
12860 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12861 /*AllowAlternate=*/false);
12862 ++NumInstructions;
12863 }
12864 auto &Container = SortedValues[Key];
12865 if (IsOrdered && !KeyToIndex.contains(V) &&
12866 !(isa<Constant, ExtractElementInst>(V) ||
12867 isVectorLikeInstWithConstOps(V)) &&
12868 ((Container.contains(Idx) &&
12869 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12870 (!Container.empty() && !Container.contains(Idx) &&
12871 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12872 IsOrdered = false;
12873 auto &KTI = KeyToIndex[V];
12874 if (KTI.empty())
12875 Container[Idx].push_back(V);
12876 KTI.push_back(I);
12877 }
12878 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12879 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12880 if (!IsOrdered && NumInstructions > 1) {
12881 unsigned Cnt = 0;
12882 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12883 for (const auto &D : SortedValues) {
12884 for (const auto &P : D.second) {
12885 unsigned Sz = 0;
12886 for (Value *V : P.second) {
12887 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12888 for (auto [K, Idx] : enumerate(Indices)) {
12889 TE.ReorderIndices[Cnt + K] = Idx;
12890 TE.Scalars[Cnt + K] = V;
12891 }
12892 Sz += Indices.size();
12893 Cnt += Indices.size();
12894 }
12895 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12896 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12897 *TTI, TE.Scalars.front()->getType(), Sz);
12898 SubVectors.emplace_back(Cnt - Sz, SubVF);
12899 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12900 DemandedElts.clearBit(I);
12901 } else if (!P.second.empty() && isConstant(P.second.front())) {
12902 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12903 DemandedElts.clearBit(I);
12904 }
12905 }
12906 }
12907 }
12908 // Reuses always require shuffles, so consider it as profitable.
12909 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12910 return;
12911 // Do simple cost estimation.
12912 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12913 InstructionCost Cost = 0;
12914 auto *ScalarTy = TE.Scalars.front()->getType();
12915 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12916 for (auto [Idx, Sz] : SubVectors) {
12917 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12918 Idx, getWidenedType(ScalarTy, Sz));
12919 }
12920 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12921 /*Insert=*/true,
12922 /*Extract=*/false, CostKind);
12923 int Sz = TE.Scalars.size();
12924 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12925 TE.ReorderIndices.end());
12926 for (unsigned I : seq<unsigned>(Sz)) {
12927 Value *V = TE.getOrdered(I);
12928 if (isa<PoisonValue>(V)) {
12929 ReorderMask[I] = PoisonMaskElem;
12930 } else if (isConstant(V) || DemandedElts[I]) {
12931 ReorderMask[I] = I + TE.ReorderIndices.size();
12932 }
12933 }
12934 Cost += ::getShuffleCost(*TTI,
12935 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12936 ? TTI::SK_PermuteTwoSrc
12937 : TTI::SK_PermuteSingleSrc,
12938 VecTy, ReorderMask);
12939 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12940 ReorderMask.assign(Sz, PoisonMaskElem);
12941 for (unsigned I : seq<unsigned>(Sz)) {
12942 Value *V = TE.getOrdered(I);
12943 if (isConstant(V)) {
12944 DemandedElts.clearBit(I);
12945 if (!isa<PoisonValue>(V))
12946 ReorderMask[I] = I;
12947 } else {
12948 ReorderMask[I] = I + Sz;
12949 }
12950 }
12951 InstructionCost BVCost =
12952 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12953 /*Insert=*/true, /*Extract=*/false, CostKind);
12954 if (!DemandedElts.isAllOnes())
12955 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12956 if (Cost >= BVCost) {
12957 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12958 reorderScalars(TE.Scalars, Mask);
12959 TE.ReorderIndices.clear();
12960 }
12961}
12962
12963/// Check if we can convert fadd/fsub sequence to FMAD.
12964/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
12965 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12966 const InstructionsState &S,
12967 DominatorTree &DT, const DataLayout &DL,
12968 const TargetTransformInfo &TTI,
12969 const TargetLibraryInfo &TLI) {
12970 assert(all_of(VL,
12971 [](Value *V) {
12972 return V->getType()->getScalarType()->isFloatingPointTy();
12973 }) &&
12974 "Can only convert to FMA for floating point types");
12975 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12976
12977 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12978 FastMathFlags FMF;
12979 FMF.set();
12980 for (Value *V : VL) {
12981 auto *I = dyn_cast<Instruction>(V);
12982 if (!I)
12983 continue;
12984 if (S.isCopyableElement(I))
12985 continue;
12986 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12987 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12988 continue;
12989 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12990 FMF &= FPCI->getFastMathFlags();
12991 }
12992 return FMF.allowContract();
12993 };
12994 if (!CheckForContractable(VL))
12995 return InstructionCost::getInvalid();
12996 // fmul also should be contractable
12997 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12998 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12999
13000 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
13001 if (!OpS.valid())
13002 return InstructionCost::getInvalid();
13003
13004 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13005 return InstructionCost::getInvalid();
13006 if (!CheckForContractable(Operands.front()))
13007 return InstructionCost::getInvalid();
13008 // Compare the costs.
13009 InstructionCost FMulPlusFAddCost = 0;
13010 InstructionCost FMACost = 0;
13011 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13012 FastMathFlags FMF;
13013 FMF.set();
13014 for (Value *V : VL) {
13015 auto *I = dyn_cast<Instruction>(V);
13016 if (!I)
13017 continue;
13018 if (!S.isCopyableElement(I))
13019 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13020 FMF &= FPCI->getFastMathFlags();
13021 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13022 }
13023 unsigned NumOps = 0;
13024 for (auto [V, Op] : zip(VL, Operands.front())) {
13025 if (S.isCopyableElement(V))
13026 continue;
13027 auto *I = dyn_cast<Instruction>(Op);
13028 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13029 if (auto *OpI = dyn_cast<Instruction>(V))
13030 FMACost += TTI.getInstructionCost(OpI, CostKind);
13031 if (I)
13032 FMACost += TTI.getInstructionCost(I, CostKind);
13033 continue;
13034 }
13035 ++NumOps;
13036 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13037 FMF &= FPCI->getFastMathFlags();
13038 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13039 }
13040 Type *Ty = VL.front()->getType();
13041 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13042 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13043 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13044}
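// Illustrative example (not part of the original source): for a bundle of
// scalars of the form fadd contract (fmul contract a, b), c where each fmul
// has a single use and both opcodes allow contraction, the helper compares
// the summed fmul + fadd costs against the cost of an llvm.fmuladd call per
// lane and returns the fmuladd cost only when it is the cheaper option.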
13045
13046 void BoUpSLP::transformNodes() {
13047 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13048 BaseGraphSize = VectorizableTree.size();
13049 // Turn graph transforming mode on and off, when done.
13050 class GraphTransformModeRAAI {
13051 bool &SavedIsGraphTransformMode;
13052
13053 public:
13054 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13055 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13056 IsGraphTransformMode = true;
13057 }
13058 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13059 } TransformContext(IsGraphTransformMode);
13060 // Operands are profitable if they are:
13061 // 1. At least one constant
13062 // or
13063 // 2. Splats
13064 // or
13065 // 3. Results in good vectorization opportunity, i.e. may generate vector
13066 // nodes and reduce cost of the graph.
13067 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13068 const InstructionsState &S) {
13069 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13070 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13071 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13072 I2->getOperand(Op));
13073 return all_of(
13074 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13075 return all_of(Cand,
13076 [](const std::pair<Value *, Value *> &P) {
13077 return isa<Constant>(P.first) ||
13078 isa<Constant>(P.second) || P.first == P.second;
13079 }) ||
13080 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
13081 });
13082 };
13083
13084 // Try to reorder gather nodes for better vectorization opportunities.
13085 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13086 TreeEntry &E = *VectorizableTree[Idx];
13087 if (E.isGather())
13088 reorderGatherNode(E);
13089 }
13090
13091 // Better to use the full gathered-loads analysis, if there are only 2
13092 // gathered load nodes, each having fewer than 16 elements.
13093 constexpr unsigned VFLimit = 16;
13094 bool ForceLoadGather =
13095 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13096 return TE->isGather() && TE->hasState() &&
13097 TE->getOpcode() == Instruction::Load &&
13098 TE->getVectorFactor() < VFLimit;
13099 }) == 2;
13100
13101 // Checks if the scalars are used in another node.
13102 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13103 function_ref<bool(Value *)> CheckContainer) {
13104 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13105 if (isa<PoisonValue>(V))
13106 return true;
13107 auto *I = dyn_cast<Instruction>(V);
13108 if (!I)
13109 return false;
13110 return is_contained(TE->Scalars, I) || CheckContainer(I);
13111 });
13112 };
13113 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13114 if (E.hasState()) {
13115 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13116 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13117 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13118 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13119 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13120 return is_contained(TEs, TE);
13121 });
13122 });
13123 }))
13124 return true;
13125 ;
13126 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13127 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13128 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13129 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13130 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13131 return is_contained(TEs, TE);
13132 });
13133 });
13134 }))
13135 return true;
13136 } else {
13137 // Check if the gather node is a full copy of a split node.
13138 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13139 if (It != E.Scalars.end()) {
13140 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13141 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13142 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13143 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13144 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13145 return is_contained(TEs, TE);
13146 });
13147 });
13148 }))
13149 return true;
13150 }
13151 }
13152 return false;
13153 };
13154 // The tree may grow here, so iterate over the nodes built before.
13155 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13156 TreeEntry &E = *VectorizableTree[Idx];
13157 if (E.isGather()) {
13158 ArrayRef<Value *> VL = E.Scalars;
13159 const unsigned Sz = getVectorElementSize(VL.front());
13160 unsigned MinVF = getMinVF(2 * Sz);
13161 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13162 // same opcode and same parent block or all constants.
13163 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13164 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13165 // We use allSameOpcode instead of isAltShuffle because we don't
13166 // want to use interchangeable instructions here.
13167 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13168 allConstant(VL) || isSplat(VL))
13169 continue;
13170 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13171 continue;
13172 // Check if the node is a copy of other vector nodes.
13173 if (CheckForSameVectorNodes(E))
13174 continue;
13175 // Try to find vectorizable sequences and transform them into a series of
13176 // insertvector instructions.
13177 unsigned StartIdx = 0;
13178 unsigned End = VL.size();
13179 for (unsigned VF = getFloorFullVectorNumberOfElements(
13180 *TTI, VL.front()->getType(), VL.size() - 1);
13181 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13182 *TTI, VL.front()->getType(), VF - 1)) {
13183 if (StartIdx + VF > End)
13184 continue;
13186 bool AllStrided = true;
13187 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13188 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13189 // If any instruction is vectorized already - do not try again.
13190 // Reuse the existing node, if it fully matches the slice.
13191 if (isVectorized(Slice.front()) &&
13192 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13193 continue;
13194 // Constant already handled effectively - skip.
13195 if (allConstant(Slice))
13196 continue;
13197 // Do not try to vectorize small splats (less than vector register and
13198 // only with the single non-undef element).
13199 bool IsSplat = isSplat(Slice);
13200 bool IsTwoRegisterSplat = true;
13201 if (IsSplat && VF == 2) {
13202 unsigned NumRegs2VF = ::getNumberOfParts(
13203 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13204 IsTwoRegisterSplat = NumRegs2VF == 2;
13205 }
13206 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13207 count(Slice, Slice.front()) ==
13208 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13209 : 1)) {
13210 if (IsSplat)
13211 continue;
13212 InstructionsState S = getSameOpcode(Slice, *TLI);
13213 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13214 (S.getOpcode() == Instruction::Load &&
13215 areKnownNonVectorizableLoads(Slice)) ||
13216 (S.getOpcode() != Instruction::Load &&
13217 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13218 continue;
13219 if (VF == 2) {
13220 // Try to vectorize reduced values or if all users are vectorized.
13221 // For expensive instructions extra extracts might be profitable.
13222 if ((!UserIgnoreList || E.Idx != 0) &&
13223 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13224 TTI::TCC_Expensive &&
13225 !all_of(Slice, [&](Value *V) {
13226 if (isa<PoisonValue>(V))
13227 return true;
13228 return areAllUsersVectorized(cast<Instruction>(V),
13229 UserIgnoreList);
13230 }))
13231 continue;
13232 if (S.getOpcode() == Instruction::Load) {
13233 OrdersType Order;
13234 SmallVector<Value *> PointerOps;
13235 StridedPtrInfo SPtrInfo;
13236 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13237 PointerOps, SPtrInfo);
13238 AllStrided &= Res == LoadsState::StridedVectorize ||
13239 Res == LoadsState::ScatterVectorize ||
13240 Res == LoadsState::Gather;
13241 // Do not vectorize gathers.
13242 if (Res == LoadsState::ScatterVectorize ||
13243 Res == LoadsState::Gather) {
13244 if (Res == LoadsState::Gather) {
13245 registerNonVectorizableLoads(Slice);
13246 // If reductions and the scalars from the root node are
13247 // analyzed - mark as non-vectorizable reduction.
13248 if (UserIgnoreList && E.Idx == 0)
13249 analyzedReductionVals(Slice);
13250 }
13251 continue;
13252 }
13253 } else if (S.getOpcode() == Instruction::ExtractElement ||
13254 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13255 TTI::TCC_Expensive &&
13256 !CheckOperandsProfitability(
13257 S.getMainOp(),
13258 cast<Instruction>(*find_if(reverse(Slice),
13259 IsaPred<Instruction>)),
13260 S))) {
13261 // Do not vectorize extractelements (handled effectively
13262 // already). Do not vectorize non-profitable instructions (with
13263 // low cost and non-vectorizable operands).
13264 continue;
13265 }
13266 }
13267 }
13268 Slices.emplace_back(Cnt, Slice.size());
13269 }
13270 // Do not try to vectorize if all slices are strided or gathered with
13271 // vector factor 2 and there are more than 2 slices. Better to handle
13272 // them in gathered loads analysis, may result in better vectorization.
13273 if (VF == 2 && AllStrided && Slices.size() > 2)
13274 continue;
13275 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13276 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13277 if (StartIdx == Cnt)
13278 StartIdx = Cnt + Sz;
13279 if (End == Cnt + Sz)
13280 End = Cnt;
13281 };
13282 for (auto [Cnt, Sz] : Slices) {
13283 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13284 const TreeEntry *SameTE = nullptr;
13285 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13286 It != Slice.end()) {
13287 // If any instruction is vectorized already - do not try again.
13288 SameTE = getSameValuesTreeEntry(*It, Slice);
13289 }
13290 unsigned PrevSize = VectorizableTree.size();
13291 [[maybe_unused]] unsigned PrevEntriesSize =
13292 LoadEntriesToVectorize.size();
13293 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13294 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13295 VectorizableTree[PrevSize]->isGather() &&
13296 VectorizableTree[PrevSize]->hasState() &&
13297 VectorizableTree[PrevSize]->getOpcode() !=
13298 Instruction::ExtractElement &&
13299 !isSplat(Slice)) {
13300 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13301 analyzedReductionVals(Slice);
13302 VectorizableTree.pop_back();
13303 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13304 "LoadEntriesToVectorize expected to remain the same");
13305 continue;
13306 }
13307 AddCombinedNode(PrevSize, Cnt, Sz);
13308 }
13309 }
13310 // Restore ordering, if no extra vectorization happened.
13311 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13312 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13313 reorderScalars(E.Scalars, Mask);
13314 E.ReorderIndices.clear();
13315 }
13316 }
13317 if (!E.hasState())
13318 continue;
13319 switch (E.getOpcode()) {
13320 case Instruction::Load: {
13321 // No need to reorder masked gather loads, just reorder the scalar
13322 // operands.
13323 if (E.State != TreeEntry::Vectorize)
13324 break;
13325 Type *ScalarTy = E.getMainOp()->getType();
13326 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13327 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13328 // Check if profitable to represent consecutive load + reverse as strided
13329 // load with stride -1.
13330 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13331 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13332 SmallVector<int> Mask;
13333 inversePermutation(E.ReorderIndices, Mask);
13334 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13335 InstructionCost OriginalVecCost =
13336 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13337 BaseLI->getPointerAddressSpace(), CostKind,
13338 TTI::OperandValueInfo()) +
13339 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13340 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13341 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13342 VecTy, BaseLI->getPointerOperand(),
13343 /*VariableMask=*/false, CommonAlignment,
13344 BaseLI),
13345 CostKind);
13346 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13347 // Strided load is more profitable than consecutive load + reverse -
13348 // transform the node to strided load.
13349 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13350 ->getPointerOperand()
13351 ->getType());
13352 StridedPtrInfo SPtrInfo;
13353 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13354 SPtrInfo.Ty = VecTy;
13355 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13356 E.State = TreeEntry::StridedVectorize;
13357 }
13358 }
13359 break;
13360 }
13361 case Instruction::Store: {
13362 Type *ScalarTy =
13363 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13364 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13365 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13366 // Check if it is profitable to represent a reverse + consecutive store as a
13367 // strided store with stride -1.
13368 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13369 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13370 SmallVector<int> Mask;
13371 inversePermutation(E.ReorderIndices, Mask);
13372 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13373 InstructionCost OriginalVecCost =
13374 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13375 BaseSI->getPointerAddressSpace(), CostKind,
13376 TTI::OperandValueInfo()) +
13377 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13378 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13379 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13380 VecTy, BaseSI->getPointerOperand(),
13381 /*VariableMask=*/false, CommonAlignment,
13382 BaseSI),
13383 CostKind);
13384 if (StridedCost < OriginalVecCost)
13385 // Strided store is more profitable than reverse + consecutive store -
13386 // transform the node to strided store.
13387 E.State = TreeEntry::StridedVectorize;
13388 } else if (!E.ReorderIndices.empty()) {
13389 // Check for interleaved stores.
13390 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13391 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13392 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13393 if (Mask.size() < 4)
13394 return 0u;
13395 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13396 if (ShuffleVectorInst::isInterleaveMask(
13397 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13398 TTI.isLegalInterleavedAccessType(
13399 VecTy, Factor, BaseSI->getAlign(),
13400 BaseSI->getPointerAddressSpace()))
13401 return Factor;
13402 }
13403
13404 return 0u;
13405 };
13406 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13407 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13408 if (InterleaveFactor != 0)
13409 E.setInterleave(InterleaveFactor);
13410 }
13411 break;
13412 }
13413 case Instruction::Select: {
13414 if (E.State != TreeEntry::Vectorize)
13415 break;
13416 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13417 if (MinMaxID == Intrinsic::not_intrinsic)
13418 break;
13419 // This node is a minmax node.
13420 E.CombinedOp = TreeEntry::MinMax;
13421 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13422 if (SelectOnly && CondEntry->UserTreeIndex &&
13423 CondEntry->State == TreeEntry::Vectorize) {
13424 // The condition node is part of the combined minmax node.
13425 CondEntry->State = TreeEntry::CombinedVectorize;
13426 }
13427 break;
13428 }
13429 case Instruction::FSub:
13430 case Instruction::FAdd: {
13431 // Check if possible to convert (a*b)+c to fma.
13432 if (E.State != TreeEntry::Vectorize ||
13433 !E.getOperations().isAddSubLikeOp())
13434 break;
13435 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13436 .isValid())
13437 break;
13438 // This node is a fmuladd node.
13439 E.CombinedOp = TreeEntry::FMulAdd;
13440 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13441 if (FMulEntry->UserTreeIndex &&
13442 FMulEntry->State == TreeEntry::Vectorize) {
13443 // The FMul node is part of the combined fmuladd node.
13444 FMulEntry->State = TreeEntry::CombinedVectorize;
13445 }
13446 break;
13447 }
13448 default:
13449 break;
13450 }
13451 }
13452
13453 if (LoadEntriesToVectorize.empty()) {
13454 // Single load node - exit.
13455 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13456 VectorizableTree.front()->getOpcode() == Instruction::Load)
13457 return;
13458 // Small graph with small VF - exit.
13459 constexpr unsigned SmallTree = 3;
13460 constexpr unsigned SmallVF = 2;
13461 if ((VectorizableTree.size() <= SmallTree &&
13462 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13463 (VectorizableTree.size() <= 2 && UserIgnoreList))
13464 return;
13465
13466 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13467 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13468 getCanonicalGraphSize() <= SmallTree &&
13469 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13470 [](const std::unique_ptr<TreeEntry> &TE) {
13471 return TE->isGather() && TE->hasState() &&
13472 TE->getOpcode() == Instruction::Load &&
13473 !allSameBlock(TE->Scalars);
13474 }) == 1)
13475 return;
13476 }
13477
13478 // A list of loads to be gathered during the vectorization process. We can
13479 // try to vectorize them at the end, if profitable.
13480 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13481 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13482 GatheredLoads;
13483
13484 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13485 TreeEntry &E = *TE;
13486 if (E.isGather() &&
13487 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13488 (!E.hasState() && any_of(E.Scalars,
13489 [&](Value *V) {
13490 return isa<LoadInst>(V) &&
13491 !isVectorized(V) &&
13492 !isDeleted(cast<Instruction>(V));
13493 }))) &&
13494 !isSplat(E.Scalars)) {
13495 for (Value *V : E.Scalars) {
13496 auto *LI = dyn_cast<LoadInst>(V);
13497 if (!LI)
13498 continue;
13499 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13500 continue;
13501 gatherPossiblyVectorizableLoads(
13502 *this, V, *DL, *SE, *TTI,
13503 GatheredLoads[std::make_tuple(
13504 LI->getParent(),
13505 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13506 LI->getType())]);
13507 }
13508 }
13509 }
13510 // Try to vectorize gathered loads if this is not just a gather of loads.
13511 if (!GatheredLoads.empty())
13512 tryToVectorizeGatheredLoads(GatheredLoads);
13513}
13514
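// The cost estimator below roughly mirrors the shuffle builder used during
// code generation: it "emits" virtual shuffles and accumulates their TTI cost
// instead of creating real IR instructions.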
13515/// Merges shuffle masks and emits the final shuffle instruction, if required.
13516/// It supports shuffling of 2 input vectors. It implements lazy shuffle
13517/// emission: the actual shuffle instruction is generated only if it is really
13518/// required. Otherwise, shuffle instruction emission is delayed until the end
13519/// of the process, to reduce the number of emitted instructions and to enable
13520/// further analysis/transformations.
13521class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13522 bool IsFinalized = false;
13523 SmallVector<int> CommonMask;
13524 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13525 const TargetTransformInfo &TTI;
13526 InstructionCost Cost = 0;
13527 SmallDenseSet<Value *> VectorizedVals;
13528 BoUpSLP &R;
13529 SmallPtrSetImpl<Value *> &CheckedExtracts;
13530 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13531 /// While set, we are still trying to estimate the cost for the same nodes and
13532 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13533 /// This may help estimate the cost more precisely if the same nodes must be
13534 /// permuted, and allows moving most of the long shuffle cost estimation to TTI.
13535 bool SameNodesEstimated = true;
13536
13537 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13538 if (Ty->getScalarType()->isPointerTy()) {
13539 Constant *Res = ConstantExpr::getIntToPtr(
13540 ConstantInt::getAllOnesValue(
13541 IntegerType::get(Ty->getContext(),
13542 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13543 Ty->getScalarType());
13544 if (auto *VTy = dyn_cast<VectorType>(Ty))
13545 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13546 return Res;
13547 }
13548 return Constant::getAllOnesValue(Ty);
13549 }
13550
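// getBuildVectorCost: estimates the cost of materializing the scalars in VL
// as a vector, with special handling for all-constant/undef inputs and for
// splats (a single insertelement plus a broadcast shuffle, if needed).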
13551 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13552 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13553 return TTI::TCC_Free;
13554 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13555 InstructionCost GatherCost = 0;
13556 SmallVector<Value *> Gathers(VL);
13557 if (!Root && isSplat(VL)) {
13558 // Found the broadcasting of the single scalar, calculate the cost as
13559 // the broadcast.
13560 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13561 assert(It != VL.end() && "Expected at least one non-undef value.");
13562 // Add broadcast for non-identity shuffle only.
13563 bool NeedShuffle =
13564 count(VL, *It) > 1 &&
13565 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13566 if (!NeedShuffle) {
13567 if (isa<FixedVectorType>(ScalarTy)) {
13568 assert(SLPReVec && "FixedVectorType is not expected.");
13569 return TTI.getShuffleCost(
13570 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13571 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13572 cast<FixedVectorType>(ScalarTy));
13573 }
13574 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13575 CostKind, std::distance(VL.begin(), It),
13576 PoisonValue::get(VecTy), *It);
13577 }
13578
13579 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13580 transform(VL, ShuffleMask.begin(), [](Value *V) {
13581 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13582 });
13583 InstructionCost InsertCost =
13584 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13585 PoisonValue::get(VecTy), *It);
13586 return InsertCost + ::getShuffleCost(TTI,
13587 TTI::SK_Broadcast,
13588 VecTy, ShuffleMask, CostKind,
13589 /*Index=*/0, /*SubTp=*/nullptr,
13590 /*Args=*/*It);
13591 }
13592 return GatherCost +
13593 (all_of(Gathers, IsaPred<UndefValue>)
13594 ? TTI::TCC_Free
13595 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13596 ScalarTy));
13597 };
13598
13599 /// Compute the cost of creating a vector containing the extracted values from
13600 /// \p VL.
13601 InstructionCost
13602 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13603 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13604 unsigned NumParts) {
13605 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13606 unsigned NumElts =
13607 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13608 auto *EE = dyn_cast<ExtractElementInst>(V);
13609 if (!EE)
13610 return Sz;
13611 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13612 if (!VecTy)
13613 return Sz;
13614 return std::max(Sz, VecTy->getNumElements());
13615 });
13616 // FIXME: this must be moved to TTI for better estimation.
13617 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13618 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13619 SmallVectorImpl<unsigned> &Indices,
13620 SmallVectorImpl<unsigned> &SubVecSizes)
13621 -> std::optional<TTI::ShuffleKind> {
13622 if (NumElts <= EltsPerVector)
13623 return std::nullopt;
13624 int OffsetReg0 =
13625 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13626 [](int S, int I) {
13627 if (I == PoisonMaskElem)
13628 return S;
13629 return std::min(S, I);
13630 }),
13631 EltsPerVector);
13632 int OffsetReg1 = OffsetReg0;
13633 DenseSet<int> RegIndices;
13634 // Check if we are trying to permute the same single or two input vectors.
13635 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13636 int FirstRegId = -1;
13637 Indices.assign(1, OffsetReg0);
13638 for (auto [Pos, I] : enumerate(Mask)) {
13639 if (I == PoisonMaskElem)
13640 continue;
13641 int Idx = I - OffsetReg0;
13642 int RegId =
13643 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13644 if (FirstRegId < 0)
13645 FirstRegId = RegId;
13646 RegIndices.insert(RegId);
13647 if (RegIndices.size() > 2)
13648 return std::nullopt;
13649 if (RegIndices.size() == 2) {
13650 ShuffleKind = TTI::SK_PermuteTwoSrc;
13651 if (Indices.size() == 1) {
13652 OffsetReg1 = alignDown(
13653 std::accumulate(
13654 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13655 [&](int S, int I) {
13656 if (I == PoisonMaskElem)
13657 return S;
13658 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13659 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13660 if (RegId == FirstRegId)
13661 return S;
13662 return std::min(S, I);
13663 }),
13664 EltsPerVector);
13665 unsigned Index = OffsetReg1 % NumElts;
13666 Indices.push_back(Index);
13667 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13668 }
13669 Idx = I - OffsetReg1;
13670 }
13671 I = (Idx % NumElts) % EltsPerVector +
13672 (RegId == FirstRegId ? 0 : EltsPerVector);
13673 }
13674 return ShuffleKind;
13675 };
13676 InstructionCost Cost = 0;
13677
13678 // Process extracts in blocks of EltsPerVector to check if the source vector
13679 // operand can be re-used directly. If not, add the cost of creating a
13680 // shuffle to extract the values into a vector register.
13681 for (unsigned Part : seq<unsigned>(NumParts)) {
13682 if (!ShuffleKinds[Part])
13683 continue;
13684 ArrayRef<int> MaskSlice = Mask.slice(
13685 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13686 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13687 copy(MaskSlice, SubMask.begin());
13688 SmallVector<unsigned, 2> Indices;
13689 SmallVector<unsigned, 2> SubVecSizes;
13690 std::optional<TTI::ShuffleKind> RegShuffleKind =
13691 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13692 if (!RegShuffleKind) {
13693 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13694 !ShuffleVectorInst::isIdentityMask(
13695 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13696 Cost +=
13697 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13698 getWidenedType(ScalarTy, NumElts), MaskSlice);
13699 continue;
13700 }
13701 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13702 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13703 Cost +=
13704 ::getShuffleCost(TTI, *RegShuffleKind,
13705 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13706 }
13707 const unsigned BaseVF = getFullVectorNumberOfElements(
13708 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13709 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13710 assert((Idx + SubVecSize) <= BaseVF &&
13711 "SK_ExtractSubvector index out of range");
13712 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13713 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13714 Idx, getWidenedType(ScalarTy, SubVecSize));
13715 }
13716 // Second attempt to check, if just a permute is better estimated than
13717 // subvector extract.
13718 SubMask.assign(NumElts, PoisonMaskElem);
13719 copy(MaskSlice, SubMask.begin());
13720 InstructionCost OriginalCost = ::getShuffleCost(
13721 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13722 if (OriginalCost < Cost)
13723 Cost = OriginalCost;
13724 }
13725 return Cost;
13726 }
13727 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13728 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13729 /// elements.
13730 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13731 ArrayRef<int> Mask, unsigned Part,
13732 unsigned SliceSize) {
13733 if (SameNodesEstimated) {
13734 // Delay the cost estimation if the same nodes are being reshuffled.
13735 // If we have already requested the cost of reshuffling E1 and E2 before,
13736 // there is no need to estimate another cost with the sub-mask; instead,
13737 // include this sub-mask into CommonMask so it is estimated later, avoiding
13738 // double cost estimation.
13739 if ((InVectors.size() == 2 &&
13740 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13741 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13742 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13743 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13744 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13745 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13746 "Expected all poisoned elements.");
13747 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13748 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13749 return;
13750 }
13751 // Found non-matching nodes - need to estimate the cost for the matched
13752 // and transform mask.
13753 Cost += createShuffle(InVectors.front(),
13754 InVectors.size() == 1 ? nullptr : InVectors.back(),
13755 CommonMask);
13756 transformMaskAfterShuffle(CommonMask, CommonMask);
13757 } else if (InVectors.size() == 2) {
13758 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13759 transformMaskAfterShuffle(CommonMask, CommonMask);
13760 }
13761 SameNodesEstimated = false;
13762 if (!E2 && InVectors.size() == 1) {
13763 unsigned VF = E1.getVectorFactor();
13764 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13765 VF = std::max(VF, getVF(V1));
13766 } else {
13767 const auto *E = cast<const TreeEntry *>(InVectors.front());
13768 VF = std::max(VF, E->getVectorFactor());
13769 }
13770 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13771 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13772 CommonMask[Idx] = Mask[Idx] + VF;
13773 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13774 transformMaskAfterShuffle(CommonMask, CommonMask);
13775 } else {
13776 auto P = InVectors.front();
13777 Cost += createShuffle(&E1, E2, Mask);
13778 unsigned VF = Mask.size();
13779 if (Value *V1 = dyn_cast<Value *>(P)) {
13780 VF = std::max(VF,
13781 getNumElements(V1->getType()));
13782 } else {
13783 const auto *E = cast<const TreeEntry *>(P);
13784 VF = std::max(VF, E->getVectorFactor());
13785 }
13786 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13787 if (Mask[Idx] != PoisonMaskElem)
13788 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13789 Cost += createShuffle(P, InVectors.front(), CommonMask);
13790 transformMaskAfterShuffle(CommonMask, CommonMask);
13791 }
13792 }
13793
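// Small helper that maps the abstract shuffle operations produced by
// BaseShuffleAnalysis::createShuffle onto TTI cost queries; empty and
// identity masks are treated as free.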
13794 class ShuffleCostBuilder {
13795 const TargetTransformInfo &TTI;
13796
13797 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13798 int Index = -1;
13799 return Mask.empty() ||
13800 (VF == Mask.size() &&
13801 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13802 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13803 Index == 0);
13804 }
13805
13806 public:
13807 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13808 ~ShuffleCostBuilder() = default;
13809 InstructionCost createShuffleVector(Value *V1, Value *,
13810 ArrayRef<int> Mask) const {
13811 // Empty mask or identity mask are free.
13812 unsigned VF =
13813 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13814 if (isEmptyOrIdentity(Mask, VF))
13815 return TTI::TCC_Free;
13816 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13817 cast<VectorType>(V1->getType()), Mask);
13818 }
13819 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13820 // Empty mask or identity mask are free.
13821 unsigned VF =
13822 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13823 if (isEmptyOrIdentity(Mask, VF))
13824 return TTI::TCC_Free;
13825 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13826 cast<VectorType>(V1->getType()), Mask);
13827 }
13828 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13829 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13830 return TTI::TCC_Free;
13831 }
13832 void resizeToMatch(Value *&, Value *&) const {}
13833 };
13834
13835 /// Smart shuffle instruction emission, walks through shuffles trees and
13836 /// tries to find the best matching vector for the actual shuffle
13837 /// instruction.
13838 InstructionCost
13839 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13840 const PointerUnion<Value *, const TreeEntry *> &P2,
13841 ArrayRef<int> Mask) {
13842 ShuffleCostBuilder Builder(TTI);
13843 SmallVector<int> CommonMask(Mask);
13844 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13845 unsigned CommonVF = Mask.size();
13846 InstructionCost ExtraCost = 0;
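// The two lambdas below add the cost of extra casts that become necessary
// when a node or value was demoted to a narrower integer type by the MinBWs
// analysis, so its element type no longer matches ScalarTy.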
13847 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13848 unsigned VF) -> InstructionCost {
13849 if (E.isGather() && allConstant(E.Scalars))
13850 return TTI::TCC_Free;
13851 Type *EScalarTy = E.Scalars.front()->getType();
13852 bool IsSigned = true;
13853 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13854 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13855 IsSigned = It->second.second;
13856 }
13857 if (EScalarTy != ScalarTy) {
13858 unsigned CastOpcode = Instruction::Trunc;
13859 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13860 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13861 if (DstSz > SrcSz)
13862 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13863 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13864 getWidenedType(EScalarTy, VF),
13865 TTI::CastContextHint::None, CostKind);
13866 }
13867 return TTI::TCC_Free;
13868 };
13869 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13870 if (isa<Constant>(V))
13871 return TTI::TCC_Free;
13872 auto *VecTy = cast<VectorType>(V->getType());
13873 Type *EScalarTy = VecTy->getElementType();
13874 if (EScalarTy != ScalarTy) {
13875 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13876 unsigned CastOpcode = Instruction::Trunc;
13877 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13878 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13879 if (DstSz > SrcSz)
13880 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13881 return TTI.getCastInstrCost(
13882 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13883 VecTy, TTI::CastContextHint::None, CostKind);
13884 }
13885 return TTI::TCC_Free;
13886 };
13887 if (!V1 && !V2 && !P2.isNull()) {
13888 // Shuffle 2 entry nodes.
13889 const TreeEntry *E = cast<const TreeEntry *>(P1);
13890 unsigned VF = E->getVectorFactor();
13891 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13892 CommonVF = std::max(VF, E2->getVectorFactor());
13893 assert(all_of(Mask,
13894 [=](int Idx) {
13895 return Idx < 2 * static_cast<int>(CommonVF);
13896 }) &&
13897 "All elements in mask must be less than 2 * CommonVF.");
13898 if (E->Scalars.size() == E2->Scalars.size()) {
13899 SmallVector<int> EMask = E->getCommonMask();
13900 SmallVector<int> E2Mask = E2->getCommonMask();
13901 if (!EMask.empty() || !E2Mask.empty()) {
13902 for (int &Idx : CommonMask) {
13903 if (Idx == PoisonMaskElem)
13904 continue;
13905 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13906 Idx = EMask[Idx];
13907 else if (Idx >= static_cast<int>(CommonVF))
13908 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13909 E->Scalars.size();
13910 }
13911 }
13912 CommonVF = E->Scalars.size();
13913 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13914 GetNodeMinBWAffectedCost(*E2, CommonVF);
13915 } else {
13916 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13917 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13918 }
13919 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13920 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13921 } else if (!V1 && P2.isNull()) {
13922 // Shuffle single entry node.
13923 const TreeEntry *E = cast<const TreeEntry *>(P1);
13924 unsigned VF = E->getVectorFactor();
13925 CommonVF = VF;
13926 assert(
13927 all_of(Mask,
13928 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13929 "All elements in mask must be less than CommonVF.");
13930 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13931 SmallVector<int> EMask = E->getCommonMask();
13932 assert(!EMask.empty() && "Expected non-empty common mask.");
13933 for (int &Idx : CommonMask) {
13934 if (Idx != PoisonMaskElem)
13935 Idx = EMask[Idx];
13936 }
13937 CommonVF = E->Scalars.size();
13938 } else if (unsigned Factor = E->getInterleaveFactor();
13939 Factor > 0 && E->Scalars.size() != Mask.size() &&
13940 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13941 Factor)) {
13942 // Deinterleaved nodes are free.
13943 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13944 }
13945 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13946 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13947 // Not identity/broadcast? Try to see if the original vector is better.
13948 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13949 CommonVF == CommonMask.size() &&
13950 any_of(enumerate(CommonMask),
13951 [](const auto &&P) {
13952 return P.value() != PoisonMaskElem &&
13953 static_cast<unsigned>(P.value()) != P.index();
13954 }) &&
13955 any_of(CommonMask,
13956 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13957 SmallVector<int> ReorderMask;
13958 inversePermutation(E->ReorderIndices, ReorderMask);
13959 ::addMask(CommonMask, ReorderMask);
13960 }
13961 } else if (V1 && P2.isNull()) {
13962 // Shuffle single vector.
13963 ExtraCost += GetValueMinBWAffectedCost(V1);
13964 CommonVF = getVF(V1);
13965 assert(
13966 all_of(Mask,
13967 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13968 "All elements in mask must be less than CommonVF.");
13969 } else if (V1 && !V2) {
13970 // Shuffle vector and tree node.
13971 unsigned VF = getVF(V1);
13972 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13973 CommonVF = std::max(VF, E2->getVectorFactor());
13974 assert(all_of(Mask,
13975 [=](int Idx) {
13976 return Idx < 2 * static_cast<int>(CommonVF);
13977 }) &&
13978 "All elements in mask must be less than 2 * CommonVF.");
13979 if (E2->Scalars.size() == VF && VF != CommonVF) {
13980 SmallVector<int> E2Mask = E2->getCommonMask();
13981 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13982 for (int &Idx : CommonMask) {
13983 if (Idx == PoisonMaskElem)
13984 continue;
13985 if (Idx >= static_cast<int>(CommonVF))
13986 Idx = E2Mask[Idx - CommonVF] + VF;
13987 }
13988 CommonVF = VF;
13989 }
13990 ExtraCost += GetValueMinBWAffectedCost(V1);
13991 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13992 ExtraCost += GetNodeMinBWAffectedCost(
13993 *E2, std::min(CommonVF, E2->getVectorFactor()));
13994 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13995 } else if (!V1 && V2) {
13996 // Shuffle vector and tree node.
13997 unsigned VF = getVF(V2);
13998 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13999 CommonVF = std::max(VF, E1->getVectorFactor());
14000 assert(all_of(Mask,
14001 [=](int Idx) {
14002 return Idx < 2 * static_cast<int>(CommonVF);
14003 }) &&
14004 "All elements in mask must be less than 2 * CommonVF.");
14005 if (E1->Scalars.size() == VF && VF != CommonVF) {
14006 SmallVector<int> E1Mask = E1->getCommonMask();
14007 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14008 for (int &Idx : CommonMask) {
14009 if (Idx == PoisonMaskElem)
14010 continue;
14011 if (Idx >= static_cast<int>(CommonVF))
14012 Idx = E1Mask[Idx - CommonVF] + VF;
14013 else
14014 Idx = E1Mask[Idx];
14015 }
14016 CommonVF = VF;
14017 }
14018 ExtraCost += GetNodeMinBWAffectedCost(
14019 *E1, std::min(CommonVF, E1->getVectorFactor()));
14020 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14021 ExtraCost += GetValueMinBWAffectedCost(V2);
14022 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14023 } else {
14024 assert(V1 && V2 && "Expected both vectors.");
14025 unsigned VF = getVF(V1);
14026 CommonVF = std::max(VF, getVF(V2));
14027 assert(all_of(Mask,
14028 [=](int Idx) {
14029 return Idx < 2 * static_cast<int>(CommonVF);
14030 }) &&
14031 "All elements in mask must be less than 2 * CommonVF.");
14032 ExtraCost +=
14033 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14034 if (V1->getType() != V2->getType()) {
14035 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14036 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14037 } else {
14038 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14039 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14040 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14041 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14042 }
14043 }
14044 InVectors.front() =
14045 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14046 if (InVectors.size() == 2)
14047 InVectors.pop_back();
14048 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14049 V1, V2, CommonMask, Builder, ScalarTy);
14050 }
14051
14052public:
14053 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
14054 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14055 SmallPtrSetImpl<Value *> &CheckedExtracts)
14056 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14057 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14058 CheckedExtracts(CheckedExtracts) {}
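// adjustExtracts: takes cost credit for extractelement instructions that are
// expected to become dead after vectorization and decides whether their
// source vector(s) can be reused directly as shuffle inputs.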
14059 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14060 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14061 unsigned NumParts, bool &UseVecBaseAsInput) {
14062 UseVecBaseAsInput = false;
14063 if (Mask.empty())
14064 return nullptr;
14065 Value *VecBase = nullptr;
14066 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14067 if (!E->ReorderIndices.empty()) {
14068 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14069 E->ReorderIndices.end());
14070 reorderScalars(VL, ReorderMask);
14071 }
14072 // Check if the extracts can be considered reused, i.e. if the same
14073 // extractelements were already vectorized in an earlier node.
14074 bool PrevNodeFound = any_of(
14075 ArrayRef(R.VectorizableTree).take_front(E->Idx),
14076 [&](const std::unique_ptr<TreeEntry> &TE) {
14077 return ((TE->hasState() && !TE->isAltShuffle() &&
14078 TE->getOpcode() == Instruction::ExtractElement) ||
14079 TE->isGather()) &&
14080 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
14081 return VL.size() > Data.index() &&
14082 (Mask[Data.index()] == PoisonMaskElem ||
14083 isa<UndefValue>(VL[Data.index()]) ||
14084 Data.value() == VL[Data.index()]);
14085 });
14086 });
14087 SmallPtrSet<Value *, 4> UniqueBases;
14088 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14089 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14090 for (unsigned Part : seq<unsigned>(NumParts)) {
14091 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14092 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14093 for (auto [I, V] :
14094 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
14095 // Ignore non-extractelement scalars.
14096 if (isa<UndefValue>(V) ||
14097 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14098 continue;
14099 // If all users of instruction are going to be vectorized and this
14100 // instruction itself is not going to be vectorized, consider this
14101 // instruction as dead and remove its cost from the final cost of the
14102 // vectorized tree.
14103 // Also, avoid adjusting the cost for extractelements with multiple uses
14104 // in different graph entries.
14105 auto *EE = cast<ExtractElementInst>(V);
14106 VecBase = EE->getVectorOperand();
14107 UniqueBases.insert(VecBase);
14108 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14109 if (!CheckedExtracts.insert(V).second ||
14110 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
14111 any_of(EE->users(),
14112 [&](User *U) {
14113 return isa<GetElementPtrInst>(U) &&
14114 !R.areAllUsersVectorized(cast<Instruction>(U),
14115 &VectorizedVals);
14116 }) ||
14117 (!VEs.empty() && !is_contained(VEs, E)))
14118 continue;
14119 std::optional<unsigned> EEIdx = getExtractIndex(EE);
14120 if (!EEIdx)
14121 continue;
14122 unsigned Idx = *EEIdx;
14123 // Take credit for instruction that will become dead.
14124 if (EE->hasOneUse() || !PrevNodeFound) {
14125 Instruction *Ext = EE->user_back();
14126 if (isa<SExtInst, ZExtInst>(Ext) &&
14127 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14128 // Use getExtractWithExtendCost() to calculate the cost of
14129 // extractelement/ext pair.
14130 Cost -= TTI.getExtractWithExtendCost(
14131 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
14132 Idx, CostKind);
14133 // Add back the cost of s|zext which is subtracted separately.
14134 Cost += TTI.getCastInstrCost(
14135 Ext->getOpcode(), Ext->getType(), EE->getType(),
14136 TTI::CastContextHint::None, CostKind);
14137 continue;
14138 }
14139 }
14140 APInt &DemandedElts =
14141 VectorOpsToExtracts
14142 .try_emplace(VecBase,
14143 APInt::getZero(getNumElements(VecBase->getType())))
14144 .first->getSecond();
14145 DemandedElts.setBit(Idx);
14146 }
14147 }
14148 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14149 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
14150 DemandedElts, /*Insert=*/false,
14151 /*Extract=*/true, CostKind);
14152 // Check that the gather of extractelements can be represented as just a
14153 // shuffle of one or two vectors from which the scalars are extracted.
14154 // We found a bunch of extractelement instructions that must be gathered
14155 // into a vector and can be represented as a permutation of elements from
14156 // a single input vector or from 2 input vectors.
14157 // Skipped if the same extractelements were already vectorized (reused).
14158 if (!PrevNodeFound)
14159 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14160 InVectors.assign(1, E);
14161 CommonMask.assign(Mask.begin(), Mask.end());
14162 transformMaskAfterShuffle(CommonMask, CommonMask);
14163 SameNodesEstimated = false;
14164 if (NumParts != 1 && UniqueBases.size() != 1) {
14165 UseVecBaseAsInput = true;
14166 VecBase =
14167 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14168 }
14169 return VecBase;
14170 }
14171 /// Checks if the specified entry \p E needs to be delayed because of its
14172 /// dependency nodes.
14173 std::optional<InstructionCost>
14174 needToDelay(const TreeEntry *,
14175 ArrayRef<SmallVector<const TreeEntry *>>) const {
14176 // No need to delay the cost estimation during analysis.
14177 return std::nullopt;
14178 }
14179 /// Reset the builder to handle perfect diamond match.
14180 void resetForSameNode() {
14181 IsFinalized = false;
14182 CommonMask.clear();
14183 InVectors.clear();
14184 Cost = 0;
14185 VectorizedVals.clear();
14186 SameNodesEstimated = true;
14187 }
14188 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14189 if (&E1 == &E2) {
14190 assert(all_of(Mask,
14191 [&](int Idx) {
14192 return Idx < static_cast<int>(E1.getVectorFactor());
14193 }) &&
14194 "Expected single vector shuffle mask.");
14195 add(E1, Mask);
14196 return;
14197 }
14198 if (InVectors.empty()) {
14199 CommonMask.assign(Mask.begin(), Mask.end());
14200 InVectors.assign({&E1, &E2});
14201 return;
14202 }
14203 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14204 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14205 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14206 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14207 const auto *It =
14208 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14209 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14210 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14211 }
14212 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14213 if (InVectors.empty()) {
14214 CommonMask.assign(Mask.begin(), Mask.end());
14215 InVectors.assign(1, &E1);
14216 return;
14217 }
14218 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14219 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14220 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14221 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14222 const auto *It =
14223 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14224 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14225 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14226 if (!SameNodesEstimated && InVectors.size() == 1)
14227 InVectors.emplace_back(&E1);
14228 }
14229 /// Adds 2 input vectors and the mask for their shuffling.
14230 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14231 // May come only for shuffling of 2 vectors with extractelements, already
14232 // handled in adjustExtracts.
14233 assert(InVectors.size() == 1 &&
14234 all_of(enumerate(CommonMask),
14235 [&](auto P) {
14236 if (P.value() == PoisonMaskElem)
14237 return Mask[P.index()] == PoisonMaskElem;
14238 auto *EI = cast<ExtractElementInst>(
14239 cast<const TreeEntry *>(InVectors.front())
14240 ->getOrdered(P.index()));
14241 return EI->getVectorOperand() == V1 ||
14242 EI->getVectorOperand() == V2;
14243 }) &&
14244 "Expected extractelement vectors.");
14245 }
14246 /// Adds another one input vector and the mask for the shuffling.
14247 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14248 if (InVectors.empty()) {
14249 assert(CommonMask.empty() && !ForExtracts &&
14250 "Expected empty input mask/vectors.");
14251 CommonMask.assign(Mask.begin(), Mask.end());
14252 InVectors.assign(1, V1);
14253 return;
14254 }
14255 if (ForExtracts) {
14256 // No need to add vectors here, already handled them in adjustExtracts.
14257 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14258 !CommonMask.empty() &&
14259 all_of(enumerate(CommonMask),
14260 [&](auto P) {
14261 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14262 ->getOrdered(P.index());
14263 if (P.value() == PoisonMaskElem)
14264 return P.value() == Mask[P.index()] ||
14265 isa<UndefValue>(Scalar);
14266 if (isa<Constant>(V1))
14267 return true;
14268 auto *EI = cast<ExtractElementInst>(Scalar);
14269 return EI->getVectorOperand() == V1;
14270 }) &&
14271 "Expected only tree entry for extractelement vectors.");
14272 return;
14273 }
14274 assert(!InVectors.empty() && !CommonMask.empty() &&
14275 "Expected only tree entries from extracts/reused buildvectors.");
14276 unsigned VF = getVF(V1);
14277 if (InVectors.size() == 2) {
14278 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14279 transformMaskAfterShuffle(CommonMask, CommonMask);
14280 VF = std::max<unsigned>(VF, CommonMask.size());
14281 } else if (const auto *InTE =
14282 InVectors.front().dyn_cast<const TreeEntry *>()) {
14283 VF = std::max(VF, InTE->getVectorFactor());
14284 } else {
14285 VF = std::max(
14286 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14287 ->getNumElements());
14288 }
14289 InVectors.push_back(V1);
14290 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14291 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14292 CommonMask[Idx] = Mask[Idx] + VF;
14293 }
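// gather() only accumulates the build-vector cost; the constant vector it
// returns is a placeholder used for further cost analysis, not real IR.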
14294 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14295 Value *Root = nullptr) {
14296 Cost += getBuildVectorCost(VL, Root);
14297 if (!Root) {
14298 // FIXME: Need to find a way to avoid use of getNullValue here.
14299 SmallVector<Constant *> Vals;
14300 unsigned VF = VL.size();
14301 if (MaskVF != 0)
14302 VF = std::min(VF, MaskVF);
14303 Type *VLScalarTy = VL.front()->getType();
14304 for (Value *V : VL.take_front(VF)) {
14305 Type *ScalarTy = VLScalarTy->getScalarType();
14306 if (isa<PoisonValue>(V)) {
14307 Vals.push_back(PoisonValue::get(ScalarTy));
14308 continue;
14309 }
14310 if (isa<UndefValue>(V)) {
14311 Vals.push_back(UndefValue::get(ScalarTy));
14312 continue;
14313 }
14314 Vals.push_back(Constant::getNullValue(ScalarTy));
14315 }
14316 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14317 assert(SLPReVec && "FixedVectorType is not expected.");
14318 // When REVEC is enabled, we need to expand vector types into scalar
14319 // types.
14320 Vals = replicateMask(Vals, VecTy->getNumElements());
14321 }
14322 return ConstantVector::get(Vals);
14323 }
14324 return ConstantVector::getSplat(
14325 ElementCount::getFixed(
14326 cast<FixedVectorType>(Root->getType())->getNumElements()),
14327 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14328 }
14329 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
14330 /// Finalize emission of the shuffles.
14331 InstructionCost
14332 finalize(ArrayRef<int> ExtMask,
14333 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14334 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14335 function_ref<void(Value *&, SmallVectorImpl<int> &,
14336 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14337 Action = {}) {
14338 IsFinalized = true;
14339 if (Action) {
14340 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14341 if (InVectors.size() == 2)
14342 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14343 else
14344 Cost += createShuffle(Vec, nullptr, CommonMask);
14345 transformMaskAfterShuffle(CommonMask, CommonMask);
14346 assert(VF > 0 &&
14347 "Expected vector length for the final value before action.");
14348 Value *V = cast<Value *>(Vec);
14349 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14350 Cost += createShuffle(V1, V2, Mask);
14351 return V1;
14352 });
14353 InVectors.front() = V;
14354 }
14355 if (!SubVectors.empty()) {
14356 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14357 if (InVectors.size() == 2)
14358 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14359 else
14360 Cost += createShuffle(Vec, nullptr, CommonMask);
14361 transformMaskAfterShuffle(CommonMask, CommonMask);
14362 // Add subvectors permutation cost.
14363 if (!SubVectorsMask.empty()) {
14364 assert(SubVectorsMask.size() <= CommonMask.size() &&
14365 "Expected same size of masks for subvectors and common mask.");
14366 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14367 copy(SubVectorsMask, SVMask.begin());
14368 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14369 if (I2 != PoisonMaskElem) {
14370 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14371 I1 = I2 + CommonMask.size();
14372 }
14373 }
14374 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14375 getWidenedType(ScalarTy, CommonMask.size()),
14376 SVMask, CostKind);
14377 }
14378 for (auto [E, Idx] : SubVectors) {
14379 Type *EScalarTy = E->Scalars.front()->getType();
14380 bool IsSigned = true;
14381 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14382 EScalarTy =
14383 IntegerType::get(EScalarTy->getContext(), It->second.first);
14384 IsSigned = It->second.second;
14385 }
14386 if (ScalarTy != EScalarTy) {
14387 unsigned CastOpcode = Instruction::Trunc;
14388 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14389 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14390 if (DstSz > SrcSz)
14391 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14392 Cost += TTI.getCastInstrCost(
14393 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14394 getWidenedType(EScalarTy, E->getVectorFactor()),
14395 TTI::CastContextHint::None, CostKind);
14396 }
14397 Cost += ::getShuffleCost(
14398 TTI, TTI::SK_InsertSubvector,
14399 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14400 getWidenedType(ScalarTy, E->getVectorFactor()));
14401 if (!CommonMask.empty()) {
14402 std::iota(std::next(CommonMask.begin(), Idx),
14403 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14404 Idx);
14405 }
14406 }
14407 }
14408
14409 if (!ExtMask.empty()) {
14410 if (CommonMask.empty()) {
14411 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14412 } else {
14413 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14414 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14415 if (ExtMask[I] == PoisonMaskElem)
14416 continue;
14417 NewMask[I] = CommonMask[ExtMask[I]];
14418 }
14419 CommonMask.swap(NewMask);
14420 }
14421 }
14422 if (CommonMask.empty()) {
14423 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14424 return Cost;
14425 }
14426 return Cost +
14427 createShuffle(InVectors.front(),
14428 InVectors.size() == 2 ? InVectors.back() : nullptr,
14429 CommonMask);
14430 }
14431
14432 ~ShuffleCostEstimator() {
14433 assert((IsFinalized || CommonMask.empty()) &&
14434 "Shuffle construction must be finalized.");
14435 }
14436};
14437
14438const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14439 unsigned Idx) const {
14440 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14441 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14442 return Op;
14443}
14444
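// Maps the vectorization strategy of a tree entry to the cast context hint
// used when costing casts of its (load) values: gather/scatter, masked,
// reversed or normal contiguous accesses.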
14445TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14446 if (TE.State == TreeEntry::ScatterVectorize ||
14447 TE.State == TreeEntry::StridedVectorize)
14448 return TTI::CastContextHint::GatherScatter;
14449 if (TE.State == TreeEntry::CompressVectorize)
14450 return TTI::CastContextHint::Masked;
14451 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14452 !TE.isAltShuffle()) {
14453 if (TE.ReorderIndices.empty())
14454 return TTI::CastContextHint::Normal;
14455 SmallVector<int> Mask;
14456 inversePermutation(TE.ReorderIndices, Mask);
14457 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14458 return TTI::CastContextHint::Reversed;
14459 }
14460 return TTI::CastContextHint::None;
14461}
14462
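// getEntryCost estimates the cost of vectorizing tree entry E relative to its
// scalar form: for vectorizable nodes it is roughly (vector cost - scalar
// cost), so a negative value indicates a profitable node; gather nodes simply
// return the cost of building the vector.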
14463InstructionCost
14464BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14465 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14466 ArrayRef<Value *> VL = E->Scalars;
14467
14468 Type *ScalarTy = getValueType(VL[0]);
14469 if (!isValidElementType(ScalarTy))
14470 return InstructionCost::getInvalid();
14471 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14472
14473 // If we have computed a smaller type for the expression, update VecTy so
14474 // that the costs will be accurate.
14475 auto It = MinBWs.find(E);
14476 Type *OrigScalarTy = ScalarTy;
14477 if (It != MinBWs.end()) {
14478 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14479 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14480 if (VecTy)
14481 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14482 }
14483 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14484 unsigned EntryVF = E->getVectorFactor();
14485 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14486
14487 if (E->isGather()) {
14488 if (allConstant(VL))
14489 return 0;
14490 if (isa<InsertElementInst>(VL[0]))
14491 return InstructionCost::getInvalid();
14492 if (isa<CmpInst>(VL.front()))
14493 ScalarTy = VL.front()->getType();
14494 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14495 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14496 }
14497 if (E->State == TreeEntry::SplitVectorize) {
14498 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14499 "Expected exactly 2 combined entries.");
14500 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14501 InstructionCost VectorCost = 0;
14502 if (E->ReorderIndices.empty()) {
14503 VectorCost = ::getShuffleCost(
14504 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14505 E->CombinedEntriesWithIndices.back().second,
14506 getWidenedType(
14507 ScalarTy,
14508 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14509 ->getVectorFactor()));
14510 } else {
14511 unsigned CommonVF =
14512 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14513 ->getVectorFactor(),
14514 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14515 ->getVectorFactor());
14516 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14517 getWidenedType(ScalarTy, CommonVF),
14518 E->getSplitMask(), CostKind);
14519 }
14520 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14521 return VectorCost;
14522 }
14523 InstructionCost CommonCost = 0;
14524 SmallVector<int> Mask;
14525 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14526 (E->State != TreeEntry::StridedVectorize ||
14527 !isReverseOrder(E->ReorderIndices))) {
14528 SmallVector<int> NewMask;
14529 if (E->getOpcode() == Instruction::Store) {
14530 // For stores the order is actually a mask.
14531 NewMask.resize(E->ReorderIndices.size());
14532 copy(E->ReorderIndices, NewMask.begin());
14533 } else {
14534 inversePermutation(E->ReorderIndices, NewMask);
14535 }
14536 ::addMask(Mask, NewMask);
14537 }
14538 if (!E->ReuseShuffleIndices.empty())
14539 ::addMask(Mask, E->ReuseShuffleIndices);
14540 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14541 CommonCost =
14542 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14543 assert((E->State == TreeEntry::Vectorize ||
14544 E->State == TreeEntry::ScatterVectorize ||
14545 E->State == TreeEntry::StridedVectorize ||
14546 E->State == TreeEntry::CompressVectorize) &&
14547 "Unhandled state");
14548 assert(E->getOpcode() &&
14549 ((allSameType(VL) && allSameBlock(VL)) ||
14550 (E->getOpcode() == Instruction::GetElementPtr &&
14551 E->getMainOp()->getType()->isPointerTy()) ||
14552 E->hasCopyableElements()) &&
14553 "Invalid VL");
14554 Instruction *VL0 = E->getMainOp();
14555 unsigned ShuffleOrOp =
14556 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14557 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14558 ShuffleOrOp = E->CombinedOp;
14559 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14560 const unsigned Sz = UniqueValues.size();
14561 SmallBitVector UsedScalars(Sz, false);
14562 for (unsigned I = 0; I < Sz; ++I) {
14563 if (isa<Instruction>(UniqueValues[I]) &&
14564 !E->isCopyableElement(UniqueValues[I]) &&
14565 getTreeEntries(UniqueValues[I]).front() == E)
14566 continue;
14567 UsedScalars.set(I);
14568 }
14569 auto GetCastContextHint = [&](Value *V) {
14570 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14571 return getCastContextHint(*OpTEs.front());
14572 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14573 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14574 !SrcState.isAltShuffle())
14575 return TTI::CastContextHint::GatherScatter;
14576 return TTI::CastContextHint::None;
14577 };
14578 auto GetCostDiff =
14579 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14580 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14581 // Calculate the cost of this instruction.
14582 InstructionCost ScalarCost = 0;
14583 if (isa<CastInst, CallInst>(VL0)) {
14584 // For some instructions there is no need to calculate the cost for each
14585 // particular instance; we can use the cost of a single instruction
14586 // multiplied by the total number of scalar instructions.
14587 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14588 } else {
14589 for (unsigned I = 0; I < Sz; ++I) {
14590 if (UsedScalars.test(I))
14591 continue;
14592 ScalarCost += ScalarEltCost(I);
14593 }
14594 }
14595
14596 InstructionCost VecCost = VectorCost(CommonCost);
14597 // Check if the current node must be resized, if the parent node is not
14598 // resized.
14599 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14600 E->Idx != 0 &&
14601 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14602 const EdgeInfo &EI = E->UserTreeIndex;
14603 if (!EI.UserTE->hasState() ||
14604 EI.UserTE->getOpcode() != Instruction::Select ||
14605 EI.EdgeIdx != 0) {
14606 auto UserBWIt = MinBWs.find(EI.UserTE);
14607 Type *UserScalarTy =
14608 (EI.UserTE->isGather() ||
14609 EI.UserTE->State == TreeEntry::SplitVectorize)
14610 ? EI.UserTE->Scalars.front()->getType()
14611 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14612 if (UserBWIt != MinBWs.end())
14613 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14614 UserBWIt->second.first);
14615 if (ScalarTy != UserScalarTy) {
14616 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14617 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14618 unsigned VecOpcode;
14619 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14620 if (BWSz > SrcBWSz)
14621 VecOpcode = Instruction::Trunc;
14622 else
14623 VecOpcode =
14624 It->second.second ? Instruction::SExt : Instruction::ZExt;
14625 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14626 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14627 CostKind);
14628 }
14629 }
14630 }
14631 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14632 ScalarCost, "Calculated costs for Tree"));
14633 return VecCost - ScalarCost;
14634 };
14635 // Calculate cost difference from vectorizing set of GEPs.
14636 // Negative value means vectorizing is profitable.
14637 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14638 assert((E->State == TreeEntry::Vectorize ||
14639 E->State == TreeEntry::StridedVectorize ||
14640 E->State == TreeEntry::CompressVectorize) &&
14641 "Entry state expected to be Vectorize, StridedVectorize or "
14642 "MaskedLoadCompressVectorize here.");
14643 InstructionCost ScalarCost = 0;
14644 InstructionCost VecCost = 0;
14645 std::tie(ScalarCost, VecCost) = getGEPCosts(
14646 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14647 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14648 "Calculated GEPs cost for Tree"));
14649
14650 return VecCost - ScalarCost;
14651 };
14652
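// GetMinMaxCost: cost of the min/max intrinsic a cmp+select pair can be
// folded into; if the compare is only used by the select, its cost is
// subtracted because it would become dead.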
14653 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14654 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14655 if (MinMaxID == Intrinsic::not_intrinsic)
14656 return InstructionCost::getInvalid();
14657 Type *CanonicalType = Ty;
14658 if (CanonicalType->isPtrOrPtrVectorTy())
14659 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14660 CanonicalType->getContext(),
14661 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14662
14663 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14664 {CanonicalType, CanonicalType});
14665 InstructionCost IntrinsicCost =
14666 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14667 // If the selects are the only uses of the compares, they will be
14668 // dead and we can adjust the cost by removing their cost.
14669 if (VI && SelectOnly) {
14670 assert((!Ty->isVectorTy() || SLPReVec) &&
14671 "Expected only for scalar type.");
14672 auto *CI = cast<CmpInst>(VI->getOperand(0));
14673 IntrinsicCost -= TTI->getCmpSelInstrCost(
14674 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14675 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14676 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14677 }
14678 return IntrinsicCost;
14679 };
14680 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14681 Instruction *VI) {
14682 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14683 return Cost;
14684 };
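// Per-opcode cost estimation. Most cases below build a GetScalarCost /
// GetVectorCost pair and let GetCostDiff combine them with the common
// reshuffle cost computed above.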
14685 switch (ShuffleOrOp) {
14686 case Instruction::PHI: {
14687 // Count reused scalars.
14688 InstructionCost ScalarCost = 0;
14689 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14690 for (Value *V : UniqueValues) {
14691 auto *PHI = dyn_cast<PHINode>(V);
14692 if (!PHI)
14693 continue;
14694
14695 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14696 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14697 Value *Op = PHI->getIncomingValue(I);
14698 Operands[I] = Op;
14699 }
14700 if (const TreeEntry *OpTE =
14701 getSameValuesTreeEntry(Operands.front(), Operands))
14702 if (CountedOps.insert(OpTE).second &&
14703 !OpTE->ReuseShuffleIndices.empty())
14704 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14705 OpTE->Scalars.size());
14706 }
14707
14708 return CommonCost - ScalarCost;
14709 }
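// Extracts: the scalar extractelement/extractvalue instructions are expected
// to become dead after vectorization, so the entry is, in effect, credited
// with the scalarization overhead of the demanded lanes.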
14710 case Instruction::ExtractValue:
14711 case Instruction::ExtractElement: {
14712 APInt DemandedElts;
14713 VectorType *SrcVecTy = nullptr;
14714 auto GetScalarCost = [&](unsigned Idx) {
14715 if (isa<PoisonValue>(UniqueValues[Idx]))
14716 return InstructionCost(TTI::TCC_Free);
14717
14718 auto *I = cast<Instruction>(UniqueValues[Idx]);
14719 if (!SrcVecTy) {
14720 if (ShuffleOrOp == Instruction::ExtractElement) {
14721 auto *EE = cast<ExtractElementInst>(I);
14722 SrcVecTy = EE->getVectorOperandType();
14723 } else {
14724 auto *EV = cast<ExtractValueInst>(I);
14725 Type *AggregateTy = EV->getAggregateOperand()->getType();
14726 unsigned NumElts;
14727 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14728 NumElts = ATy->getNumElements();
14729 else
14730 NumElts = AggregateTy->getStructNumElements();
14731 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14732 }
14733 }
14734 if (I->hasOneUse()) {
14735 Instruction *Ext = I->user_back();
14736 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14737 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14738 // Use getExtractWithExtendCost() to calculate the cost of
14739 // extractelement/ext pair.
14740 InstructionCost Cost = TTI->getExtractWithExtendCost(
14741 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14742 CostKind);
14743 // Subtract the cost of s|zext which is subtracted separately.
14744 Cost -= TTI->getCastInstrCost(
14745 Ext->getOpcode(), Ext->getType(), I->getType(),
14746 TTI::CastContextHint::None, CostKind);
14747 return Cost;
14748 }
14749 }
14750 if (DemandedElts.isZero())
14751 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14752 DemandedElts.setBit(*getExtractIndex(I));
14753 return InstructionCost(TTI::TCC_Free);
14754 };
14755 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14756 return CommonCost - (DemandedElts.isZero()
14757 ? TTI::TCC_Free
14758 : TTI.getScalarizationOverhead(
14759 SrcVecTy, DemandedElts, /*Insert=*/false,
14760 /*Extract=*/true, CostKind));
14761 };
14762 return GetCostDiff(GetScalarCost, GetVectorCost);
14763 }
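// InsertElement: models building a (sub)vector that is inserted into an
// existing vector, either as a subvector insert or as a two-source permute
// when the inserted elements do not cover a whole aligned subvector.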
14764 case Instruction::InsertElement: {
14765 assert(E->ReuseShuffleIndices.empty() &&
14766 "Unique insertelements only are expected.");
14767 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14768 unsigned const NumElts = SrcVecTy->getNumElements();
14769 unsigned const NumScalars = VL.size();
14770
14771 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14772
14773 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14774 unsigned OffsetBeg = *getElementIndex(VL.front());
14775 unsigned OffsetEnd = OffsetBeg;
14776 InsertMask[OffsetBeg] = 0;
14777 for (auto [I, V] : enumerate(VL.drop_front())) {
14778 unsigned Idx = *getElementIndex(V);
14779 if (OffsetBeg > Idx)
14780 OffsetBeg = Idx;
14781 else if (OffsetEnd < Idx)
14782 OffsetEnd = Idx;
14783 InsertMask[Idx] = I + 1;
14784 }
14785 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14786 if (NumOfParts > 0 && NumOfParts < NumElts)
14787 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14788 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14789 VecScalarsSz;
14790 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14791 unsigned InsertVecSz = std::min<unsigned>(
14792 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14793 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14794 bool IsWholeSubvector =
14795 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14796 // Check if we can safely insert a subvector. If it is not possible, just
14797 // generate a whole-sized vector and shuffle the source vector and the new
14798 // subvector.
14799 if (OffsetBeg + InsertVecSz > VecSz) {
14800 // Align OffsetBeg to generate correct mask.
14801 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14802 InsertVecSz = VecSz;
14803 }
14804
14805 APInt DemandedElts = APInt::getZero(NumElts);
14806 // TODO: Add support for Instruction::InsertValue.
14807 SmallVector<int> Mask;
14808 if (!E->ReorderIndices.empty()) {
14809 inversePermutation(E->ReorderIndices, Mask);
14810 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14811 } else {
14812 Mask.assign(VecSz, PoisonMaskElem);
14813 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14814 }
14815 bool IsIdentity = true;
14816 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14817 Mask.swap(PrevMask);
14818 for (unsigned I = 0; I < NumScalars; ++I) {
14819 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14820 DemandedElts.setBit(InsertIdx);
14821 IsIdentity &= InsertIdx - OffsetBeg == I;
14822 Mask[InsertIdx - OffsetBeg] = I;
14823 }
14824 assert(Offset < NumElts && "Failed to find vector index offset");
14825
14826 InstructionCost Cost = 0;
14827 Cost -=
14828 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14829 /*Insert*/ true, /*Extract*/ false, CostKind);
14830
14831 // First cost - resize to actual vector size if not identity shuffle or
14832 // need to shift the vector.
14833 // Do not calculate the cost if the actual size is the register size and
14834 // we can merge this shuffle with the following SK_Select.
14835 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14836 if (!IsIdentity)
14837 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14838 InsertVecTy, Mask);
14839 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14840 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14841 }));
14842 // Second cost - permutation with subvector, if some elements are from the
14843 // initial vector or inserting a subvector.
14844 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14845 // subvector of ActualVecTy.
14846 SmallBitVector InMask =
14847 isUndefVector(FirstInsert->getOperand(0),
14848 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14849 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14850 if (InsertVecSz != VecSz) {
14851 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14852 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14853 CostKind, OffsetBeg - Offset, InsertVecTy);
14854 } else {
14855 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14856 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14857 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14858 I <= End; ++I)
14859 if (Mask[I] != PoisonMaskElem)
14860 Mask[I] = I + VecSz;
14861 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14862 Mask[I] =
14863 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14864 Cost +=
14865 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14866 }
14867 }
14868 return Cost;
14869 }
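// Casts: when minimum-bit-width analysis demoted the source and/or
// destination type, the effective vector opcode may change (e.g. an ext may
// become a trunc or a no-op bitcast), so it is recomputed before costing.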
14870 case Instruction::ZExt:
14871 case Instruction::SExt:
14872 case Instruction::FPToUI:
14873 case Instruction::FPToSI:
14874 case Instruction::FPExt:
14875 case Instruction::PtrToInt:
14876 case Instruction::IntToPtr:
14877 case Instruction::SIToFP:
14878 case Instruction::UIToFP:
14879 case Instruction::Trunc:
14880 case Instruction::FPTrunc:
14881 case Instruction::BitCast: {
14882 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14883 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14884 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14885 unsigned Opcode = ShuffleOrOp;
14886 unsigned VecOpcode = Opcode;
14887 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14888 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14889 // Check if the values are candidates to demote.
14890 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14891 if (SrcIt != MinBWs.end()) {
14892 SrcBWSz = SrcIt->second.first;
14893 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14894 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14895 SrcVecTy =
14896 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14897 }
14898 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14899 if (BWSz == SrcBWSz) {
14900 VecOpcode = Instruction::BitCast;
14901 } else if (BWSz < SrcBWSz) {
14902 VecOpcode = Instruction::Trunc;
14903 } else if (It != MinBWs.end()) {
14904 assert(BWSz > SrcBWSz && "Invalid cast!");
14905 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14906 } else if (SrcIt != MinBWs.end()) {
14907 assert(BWSz > SrcBWSz && "Invalid cast!");
14908 VecOpcode =
14909 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14910 }
14911 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14912 !SrcIt->second.second) {
14913 VecOpcode = Instruction::UIToFP;
14914 }
14915 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14916 assert(Idx == 0 && "Expected 0 index only");
14917 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14918 VL0->getOperand(0)->getType(),
14919 TTI::getCastContextHint(VL0), CostKind);
14920 };
14921 auto GetVectorCost = [=](InstructionCost CommonCost) {
14922 // Do not count cost here if minimum bitwidth is in effect and it is just
14923 // a bitcast (here it is just a noop).
14924 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14925 return CommonCost;
14926 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14927 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14928
14929 bool IsArithmeticExtendedReduction =
14930 E->Idx == 0 && UserIgnoreList &&
14931 all_of(*UserIgnoreList, [](Value *V) {
14932 auto *I = cast<Instruction>(V);
14933 return is_contained({Instruction::Add, Instruction::FAdd,
14934 Instruction::Mul, Instruction::FMul,
14935 Instruction::And, Instruction::Or,
14936 Instruction::Xor},
14937 I->getOpcode());
14938 });
14939 if (IsArithmeticExtendedReduction &&
14940 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14941 return CommonCost;
14942 return CommonCost +
14943 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14944 VecOpcode == Opcode ? VI : nullptr);
14945 };
14946 return GetCostDiff(GetScalarCost, GetVectorCost);
14947 }
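// Compares and selects. The lanes may use different (possibly swapped)
// predicates, so the vector cost is queried with a single common predicate
// and falls back to BAD_ICMP_PREDICATE/BAD_FCMP_PREDICATE when the lanes do
// not agree. Illustrative example: a bundle of {icmp slt %a, %b; icmp sgt
// %b, %a} can still be costed as one vector icmp with a single predicate.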
14948 case Instruction::FCmp:
14949 case Instruction::ICmp:
14950 case Instruction::Select: {
14951 CmpPredicate VecPred, SwappedVecPred;
14952 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14953 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14954 match(VL0, MatchCmp))
14955 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14956 else
14957 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14958 ? CmpInst::BAD_FCMP_PREDICATE
14959 : CmpInst::BAD_ICMP_PREDICATE;
14960 auto GetScalarCost = [&](unsigned Idx) {
14961 if (isa<PoisonValue>(UniqueValues[Idx]))
14962 return InstructionCost(TTI::TCC_Free);
14963
14964 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14965 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14966 ? CmpInst::BAD_FCMP_PREDICATE
14967 : CmpInst::BAD_ICMP_PREDICATE;
14968 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14969 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14970 !match(VI, MatchCmp)) ||
14971 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14972 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14973 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14974 ? CmpInst::BAD_FCMP_PREDICATE
14975 : CmpInst::BAD_ICMP_PREDICATE;
14976
14977 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14978 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14979 CostKind, getOperandInfo(VI->getOperand(0)),
14980 getOperandInfo(VI->getOperand(1)), VI);
14981 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14982 if (IntrinsicCost.isValid())
14983 ScalarCost = IntrinsicCost;
14984
14985 return ScalarCost;
14986 };
14987 auto GetVectorCost = [&](InstructionCost CommonCost) {
14988 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14989
14990 InstructionCost VecCost =
14991 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14992 CostKind, getOperandInfo(E->getOperand(0)),
14993 getOperandInfo(E->getOperand(1)), VL0);
14994 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14995 auto *CondType =
14996 getWidenedType(SI->getCondition()->getType(), VL.size());
14997 unsigned CondNumElements = CondType->getNumElements();
14998 unsigned VecTyNumElements = getNumElements(VecTy);
14999 assert(VecTyNumElements >= CondNumElements &&
15000 VecTyNumElements % CondNumElements == 0 &&
15001 "Cannot vectorize Instruction::Select");
15002 if (CondNumElements != VecTyNumElements) {
15003 // When the return type is i1 but the source is fixed vector type, we
15004 // need to duplicate the condition value.
15005 VecCost += ::getShuffleCost(
15006 *TTI, TTI::SK_PermuteSingleSrc, CondType,
15007 createReplicatedMask(VecTyNumElements / CondNumElements,
15008 CondNumElements));
15009 }
15010 }
15011 return VecCost + CommonCost;
15012 };
15013 return GetCostDiff(GetScalarCost, GetVectorCost);
15014 }
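// Combined min/max node: a matched cmp+select pair is costed as a single
// min/max operation. Illustrative example: "icmp slt %a, %b; select %c, %a,
// %b" is priced as an smin rather than as a separate compare and select.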
15015 case TreeEntry::MinMax: {
15016 auto GetScalarCost = [&](unsigned Idx) {
15017 return GetMinMaxCost(OrigScalarTy);
15018 };
15019 auto GetVectorCost = [&](InstructionCost CommonCost) {
15020 InstructionCost VecCost = GetMinMaxCost(VecTy);
15021 return VecCost + CommonCost;
15022 };
15023 return GetCostDiff(GetScalarCost, GetVectorCost);
15024 }
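// Combined fmul+fadd node costed as a single llvm.fmuladd intrinsic. The
// fast-math flags used for the vector call are the intersection of the flags
// on the add/sub and on the feeding fmul, so the fused form is never more
// permissive than either original instruction.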
15025 case TreeEntry::FMulAdd: {
15026 auto GetScalarCost = [&](unsigned Idx) {
15027 if (isa<PoisonValue>(UniqueValues[Idx]))
15028 return InstructionCost(TTI::TCC_Free);
15029 return GetFMulAddCost(E->getOperations(),
15030 cast<Instruction>(UniqueValues[Idx]));
15031 };
15032 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15033 FastMathFlags FMF;
15034 FMF.set();
15035 for (Value *V : E->Scalars) {
15036 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
15037 FMF &= FPCI->getFastMathFlags();
15038 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
15039 FMF &= FPCIOp->getFastMathFlags();
15040 }
15041 }
15042 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15043 {VecTy, VecTy, VecTy}, FMF);
15044 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15045 return VecCost + CommonCost;
15046 };
15047 return GetCostDiff(GetScalarCost, GetVectorCost);
15048 }
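// Unary/binary arithmetic and logic operations. Note the 'and' special case
// below: when the node is demoted to a narrower bit width (MinBWs) and one
// operand is a constant whose low bits are all ones at that width, the 'and'
// is a no-op after demotion. Illustrative example: "and i32 %x, 255" demoted
// to i8 adds nothing beyond the common shuffle cost.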
15049 case Instruction::FNeg:
15050 case Instruction::Add:
15051 case Instruction::FAdd:
15052 case Instruction::Sub:
15053 case Instruction::FSub:
15054 case Instruction::Mul:
15055 case Instruction::FMul:
15056 case Instruction::UDiv:
15057 case Instruction::SDiv:
15058 case Instruction::FDiv:
15059 case Instruction::URem:
15060 case Instruction::SRem:
15061 case Instruction::FRem:
15062 case Instruction::Shl:
15063 case Instruction::LShr:
15064 case Instruction::AShr:
15065 case Instruction::And:
15066 case Instruction::Or:
15067 case Instruction::Xor: {
15068 auto GetScalarCost = [&](unsigned Idx) {
15069 if (isa<PoisonValue>(UniqueValues[Idx]))
15070 return InstructionCost(TTI::TCC_Free);
15071
15072 // We cannot retrieve the operand from UniqueValues[Idx] because an
15073 // interchangeable instruction may be used. The order and the actual
15074 // operand might differ from what is retrieved from UniqueValues[Idx].
15075 Value *Op1 = E->getOperand(0)[Idx];
15076 Value *Op2;
15077 SmallVector<const Value *, 2> Operands(1, Op1);
15078 if (isa<UnaryOperator>(UniqueValues[Idx])) {
15079 Op2 = Op1;
15080 } else {
15081 Op2 = E->getOperand(1)[Idx];
15082 Operands.push_back(Op2);
15083 }
15084 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
15085 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
15086 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15087 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
15088 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
15089 I && (ShuffleOrOp == Instruction::FAdd ||
15090 ShuffleOrOp == Instruction::FSub)) {
15091 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15092 if (IntrinsicCost.isValid())
15093 ScalarCost = IntrinsicCost;
15094 }
15095 return ScalarCost;
15096 };
15097 auto GetVectorCost = [=](InstructionCost CommonCost) {
15098 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15099 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15100 ArrayRef<Value *> Ops = E->getOperand(I);
15101 if (all_of(Ops, [&](Value *Op) {
15102 auto *CI = dyn_cast<ConstantInt>(Op);
15103 return CI && CI->getValue().countr_one() >= It->second.first;
15104 }))
15105 return CommonCost;
15106 }
15107 }
15108 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
15109 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
15110 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
15111 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
15112 Op2Info, {}, nullptr, TLI) +
15113 CommonCost;
15114 };
15115 return GetCostDiff(GetScalarCost, GetVectorCost);
15116 }
15117 case Instruction::GetElementPtr: {
15118 return CommonCost + GetGEPCostDiff(VL, VL0);
15119 }
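// Loads. Depending on the entry state, the vector side is modelled as a
// plain or interleaved vector load (Vectorize), a strided VP load
// (StridedVectorize), a wider (possibly masked) load plus a compressing
// shuffle (CompressVectorize), or a masked gather (ScatterVectorize).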
15120 case Instruction::Load: {
15121 auto GetScalarCost = [&](unsigned Idx) {
15122 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
15123 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15124 VI->getAlign(), VI->getPointerAddressSpace(),
15126 };
15127 auto *LI0 = cast<LoadInst>(VL0);
15128 auto GetVectorCost = [&](InstructionCost CommonCost) {
15129 InstructionCost VecLdCost;
15130 switch (E->State) {
15131 case TreeEntry::Vectorize:
15132 if (unsigned Factor = E->getInterleaveFactor()) {
15133 VecLdCost = TTI->getInterleavedMemoryOpCost(
15134 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15135 LI0->getPointerAddressSpace(), CostKind);
15136
15137 } else {
15138 VecLdCost = TTI->getMemoryOpCost(
15139 Instruction::Load, VecTy, LI0->getAlign(),
15140 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15141 }
15142 break;
15143 case TreeEntry::StridedVectorize: {
15144 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
15145 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
14946 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15147 Align CommonAlignment =
15148 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15149 VecLdCost = TTI->getMemIntrinsicInstrCost(
15150 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15151 StridedLoadTy, LI0->getPointerOperand(),
15152 /*VariableMask=*/false, CommonAlignment),
15153 CostKind);
15154 if (StridedLoadTy != VecTy)
15155 VecLdCost +=
15156 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15157 getCastContextHint(*E), CostKind);
15158
15159 break;
15160 }
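// CompressVectorize: the scalars are loaded from a wider contiguous region
// and then compressed into the requested order. Illustrative example:
// loading elements {0, 2, 3, 7} of an 8-element region can be modelled as a
// single (possibly masked) 8-wide load followed by a shuffle with
// CompressMask = {0, 2, 3, 7}.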
15161 case TreeEntry::CompressVectorize: {
15162 bool IsMasked;
15163 unsigned InterleaveFactor;
15164 SmallVector<int> CompressMask;
15165 VectorType *LoadVecTy;
15166 SmallVector<Value *> Scalars(VL);
15167 if (!E->ReorderIndices.empty()) {
15168 SmallVector<int> Mask(E->ReorderIndices.begin(),
15169 E->ReorderIndices.end());
15170 reorderScalars(Scalars, Mask);
15171 }
15172 SmallVector<Value *> PointerOps(Scalars.size());
15173 for (auto [I, V] : enumerate(Scalars))
15174 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15175 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15176 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15177 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15178 CompressMask, LoadVecTy);
15179 assert(IsVectorized && "Failed to vectorize load");
15180 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15181 InterleaveFactor, IsMasked);
15182 Align CommonAlignment = LI0->getAlign();
15183 if (InterleaveFactor) {
15184 VecLdCost = TTI->getInterleavedMemoryOpCost(
15185 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15186 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15187 } else if (IsMasked) {
15188 VecLdCost = TTI->getMemIntrinsicInstrCost(
15189 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15190 CommonAlignment,
15191 LI0->getPointerAddressSpace()),
15192 CostKind);
15193 // TODO: include this cost into CommonCost.
15194 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15195 LoadVecTy, CompressMask, CostKind);
15196 } else {
15197 VecLdCost = TTI->getMemoryOpCost(
15198 Instruction::Load, LoadVecTy, CommonAlignment,
15199 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15200 // TODO: include this cost into CommonCost.
15201 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15202 LoadVecTy, CompressMask, CostKind);
15203 }
15204 break;
15205 }
15206 case TreeEntry::ScatterVectorize: {
15207 Align CommonAlignment =
15208 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15209 VecLdCost = TTI->getMemIntrinsicInstrCost(
15210 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15211 LI0->getPointerOperand(),
15212 /*VariableMask=*/false, CommonAlignment),
15213 CostKind);
15214 break;
15215 }
15216 case TreeEntry::CombinedVectorize:
15217 case TreeEntry::SplitVectorize:
15218 case TreeEntry::NeedToGather:
15219 llvm_unreachable("Unexpected vectorization state.");
15220 }
15221 return VecLdCost + CommonCost;
15222 };
15223
15224 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15225 // If this node generates a masked gather load then it is not a terminal
15226 // node. Hence the address operand cost is estimated separately.
15227 if (E->State == TreeEntry::ScatterVectorize)
15228 return Cost;
15229
15230 // Estimate cost of GEPs since this tree node is a terminator.
15231 SmallVector<Value *> PointerOps(VL.size());
15232 for (auto [I, V] : enumerate(VL))
15233 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15234 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15235 }
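// Stores. Only consecutive (possibly interleaved) and strided stores are
// expected here; the cost of the pointer-operand GEPs is added separately via
// GetGEPCostDiff at the end of this case.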
15236 case Instruction::Store: {
15237 bool IsReorder = !E->ReorderIndices.empty();
15238 auto GetScalarCost = [=](unsigned Idx) {
15239 auto *VI = cast<StoreInst>(VL[Idx]);
15240 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15241 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15242 VI->getAlign(), VI->getPointerAddressSpace(),
15243 CostKind, OpInfo, VI);
15244 };
15245 auto *BaseSI =
15246 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15247 auto GetVectorCost = [=](InstructionCost CommonCost) {
15248 // We know that we can merge the stores. Calculate the cost.
15249 InstructionCost VecStCost;
15250 if (E->State == TreeEntry::StridedVectorize) {
15251 Align CommonAlignment =
15252 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15253 VecStCost = TTI->getMemIntrinsicInstrCost(
15254 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15255 VecTy, BaseSI->getPointerOperand(),
15256 /*VariableMask=*/false, CommonAlignment),
15257 CostKind);
15258 } else {
15259 assert(E->State == TreeEntry::Vectorize &&
15260 "Expected either strided or consecutive stores.");
15261 if (unsigned Factor = E->getInterleaveFactor()) {
15262 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15263 "No reused shuffles expected");
15264 CommonCost = 0;
15265 VecStCost = TTI->getInterleavedMemoryOpCost(
15266 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15267 BaseSI->getPointerAddressSpace(), CostKind);
15268 } else {
15269 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15270 VecStCost = TTI->getMemoryOpCost(
15271 Instruction::Store, VecTy, BaseSI->getAlign(),
15272 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15273 }
15274 }
15275 return VecStCost + CommonCost;
15276 };
15277 SmallVector<Value *> PointerOps(VL.size());
15278 for (auto [I, V] : enumerate(VL)) {
15279 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15280 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15281 }
15282
15283 return GetCostDiff(GetScalarCost, GetVectorCost) +
15284 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15285 }
15286 case Instruction::Call: {
15287 auto GetScalarCost = [&](unsigned Idx) {
15288 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15289 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15290 if (ID != Intrinsic::not_intrinsic) {
15291 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15292 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15293 }
15294 return TTI->getCallInstrCost(CI->getCalledFunction(),
15295 CI->getFunctionType()->getReturnType(),
15296 CI->getFunctionType()->params(), CostKind);
15297 };
15298 auto GetVectorCost = [=](InstructionCost CommonCost) {
15299 auto *CI = cast<CallInst>(VL0);
15300 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15301 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15302 CI, ID, VecTy->getNumElements(),
15303 It != MinBWs.end() ? It->second.first : 0, TTI);
15304 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15305 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15306 };
15307 return GetCostDiff(GetScalarCost, GetVectorCost);
15308 }
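// Alternate-opcode nodes (and, when SLPReVec is enabled, bundles of
// shufflevectors). Illustrative example: a bundle {fadd, fsub, fadd, fsub}
// is costed as one vector fadd plus one vector fsub plus the blending
// shuffle, unless the target reports a cheaper native alternate instruction
// via isLegalAltInstr/getAltInstrCost.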
15309 case Instruction::ShuffleVector: {
15310 if (!SLPReVec || E->isAltShuffle())
15311 assert(E->isAltShuffle() &&
15312 ((Instruction::isBinaryOp(E->getOpcode()) &&
15313 Instruction::isBinaryOp(E->getAltOpcode())) ||
15314 (Instruction::isCast(E->getOpcode()) &&
15315 Instruction::isCast(E->getAltOpcode())) ||
15316 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15317 "Invalid Shuffle Vector Operand");
15318 // Try to find the previous shuffle node with the same operands and same
15319 // main/alternate ops.
15320 auto TryFindNodeWithEqualOperands = [=]() {
15321 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15322 if (TE.get() == E)
15323 break;
15324 if (TE->hasState() && TE->isAltShuffle() &&
15325 ((TE->getOpcode() == E->getOpcode() &&
15326 TE->getAltOpcode() == E->getAltOpcode()) ||
15327 (TE->getOpcode() == E->getAltOpcode() &&
15328 TE->getAltOpcode() == E->getOpcode())) &&
15329 TE->hasEqualOperands(*E))
15330 return true;
15331 }
15332 return false;
15333 };
15334 auto GetScalarCost = [&](unsigned Idx) {
15335 if (isa<PoisonValue>(UniqueValues[Idx]))
15336 return InstructionCost(TTI::TCC_Free);
15337
15338 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15339 assert(E->getMatchingMainOpOrAltOp(VI) &&
15340 "Unexpected main/alternate opcode");
15341 (void)E;
15342 return TTI->getInstructionCost(VI, CostKind);
15343 };
15344 // Need to clear CommonCost since the final shuffle cost is included in the
15345 // vector cost.
15346 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15347 // VecCost is equal to sum of the cost of creating 2 vectors
15348 // and the cost of creating shuffle.
15349 InstructionCost VecCost = 0;
15350 if (TryFindNodeWithEqualOperands()) {
15351 LLVM_DEBUG({
15352 dbgs() << "SLP: diamond match for alternate node found.\n";
15353 E->dump();
15354 });
15355 // No need to add new vector costs here since we're going to reuse
15356 // same main/alternate vector ops, just do different shuffling.
15357 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15358 VecCost =
15359 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15360 VecCost +=
15361 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15362 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15363 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15364 VecCost = TTIRef.getCmpSelInstrCost(
15365 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15366 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15367 VL0);
15368 VecCost += TTIRef.getCmpSelInstrCost(
15369 E->getOpcode(), VecTy, MaskTy,
15370 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15371 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15372 E->getAltOp());
15373 } else {
15374 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15375 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15376 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15377 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15378 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15379 unsigned SrcBWSz =
15380 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15381 if (SrcIt != MinBWs.end()) {
15382 SrcBWSz = SrcIt->second.first;
15383 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15384 SrcTy = getWidenedType(SrcSclTy, VL.size());
15385 }
15386 if (BWSz <= SrcBWSz) {
15387 if (BWSz < SrcBWSz)
15388 VecCost =
15389 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15390 TTI::CastContextHint::None, CostKind);
15391 LLVM_DEBUG({
15392 dbgs()
15393 << "SLP: alternate extension, which should be truncated.\n";
15394 E->dump();
15395 });
15396 return VecCost;
15397 }
15398 }
15399 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15400 TTI::CastContextHint::None, CostKind);
15401 VecCost +=
15402 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15403 TTI::CastContextHint::None, CostKind);
15404 }
15405 SmallVector<int> Mask;
15406 E->buildAltOpShuffleMask(
15407 [&](Instruction *I) {
15408 assert(E->getMatchingMainOpOrAltOp(I) &&
15409 "Unexpected main/alternate opcode");
15410 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15411 *TLI);
15412 },
15413 Mask);
15414 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15415 FinalVecTy, Mask, CostKind);
15416 // Patterns like [fadd,fsub] can be combined into a single instruction
15417 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15418 // need to take into account their order when looking for the most used
15419 // order.
15420 unsigned Opcode0 = E->getOpcode();
15421 unsigned Opcode1 = E->getAltOpcode();
15422 SmallBitVector OpcodeMask(
15423 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15424 // If this pattern is supported by the target then we consider the
15425 // order.
15426 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15427 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15428 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15429 return AltVecCost < VecCost ? AltVecCost : VecCost;
15430 }
15431 // TODO: Check the reverse order too.
15432 return VecCost;
15433 };
15434 if (SLPReVec && !E->isAltShuffle())
15435 return GetCostDiff(
15436 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15437 // If a group uses mask in order, the shufflevector can be
15438 // eliminated by instcombine. Then the cost is 0.
15440 "Not supported shufflevector usage.");
15441 auto *SV = cast<ShuffleVectorInst>(VL.front());
15442 unsigned SVNumElements =
15443 cast<FixedVectorType>(SV->getOperand(0)->getType())
15444 ->getNumElements();
15445 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15446 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15447 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15448 int NextIndex = 0;
15449 if (!all_of(Group, [&](Value *V) {
15451 "Not supported shufflevector usage.");
15452 auto *SV = cast<ShuffleVectorInst>(V);
15453 int Index;
15454 [[maybe_unused]] bool IsExtractSubvectorMask =
15455 SV->isExtractSubvectorMask(Index);
15456 assert(IsExtractSubvectorMask &&
15457 "Not supported shufflevector usage.");
15458 if (NextIndex != Index)
15459 return false;
15460 NextIndex += SV->getShuffleMask().size();
15461 return true;
15462 }))
15463 return ::getShuffleCost(
15464 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15465 calculateShufflevectorMask(E->Scalars));
15466 }
15467 return TTI::TCC_Free;
15468 });
15469 return GetCostDiff(GetScalarCost, GetVectorCost);
15470 }
15471 case Instruction::Freeze:
15472 return CommonCost;
15473 default:
15474 llvm_unreachable("Unknown instruction");
15475 }
15476}
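// Each handler above returns a cost *difference*: the estimated vector cost
// minus the sum of the scalar costs (adjusted by the common shuffle/extract
// cost). Illustrative arithmetic: four scalar adds at cost 1 each replaced
// by one vector add at cost 1 yields 1 - 4 = -3, i.e. a saving of 3 for the
// entry.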
15477
15478bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15479 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15480 << VectorizableTree.size() << " is fully vectorizable.\n");
15481
15482 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15483 SmallVector<int> Mask;
15484 return TE->isGather() &&
15485 !any_of(TE->Scalars,
15486 [this](Value *V) { return EphValues.contains(V); }) &&
15487 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15488 TE->Scalars.size() < Limit ||
15489 (((TE->hasState() &&
15490 TE->getOpcode() == Instruction::ExtractElement) ||
15491 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15492 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15493 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15494 !TE->isAltShuffle()) ||
15495 any_of(TE->Scalars, IsaPred<LoadInst>));
15496 };
15497
15498 // We only handle trees of heights 1 and 2.
15499 if (VectorizableTree.size() == 1 &&
15500 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15501 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15502 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15503 (ForReduction &&
15504 AreVectorizableGathers(VectorizableTree[0].get(),
15505 VectorizableTree[0]->Scalars.size()) &&
15506 VectorizableTree[0]->getVectorFactor() > 2)))
15507 return true;
15508
15509 if (VectorizableTree.size() != 2)
15510 return false;
15511
15512 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15513 // with the second gather nodes if they have fewer scalar operands than the
15514 // initial tree element (it may be profitable to shuffle the second gather)
15515 // or they are extractelements, which form a shuffle.
15516 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15517 AreVectorizableGathers(VectorizableTree[1].get(),
15518 VectorizableTree[0]->Scalars.size()))
15519 return true;
15520
15521 // Gathering cost would be too much for tiny trees.
15522 if (VectorizableTree[0]->isGather() ||
15523 (VectorizableTree[1]->isGather() &&
15524 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15525 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15526 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15527 return false;
15528
15529 return true;
15530}
15531
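// Load combining: backends can usually merge a chain of byte loads that are
// zero-extended, shifted by multiples of 8 and or'ed together into one wide
// load. Illustrative IR for the pattern matched below:
//   %z0 = zext i8 %b0 to i32
//   %z1 = zext i8 %b1 to i32
//   %s1 = shl i32 %z1, 8
//   %or = or i32 %z0, %s1
// If the whole tree reduces to such a pattern, SLP vectorization would only
// block that transform, so such trees are rejected.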
15532static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15533 TargetTransformInfo *TTI,
15534 bool MustMatchOrInst) {
15535 // Look past the root to find a source value. Arbitrarily follow the
15536 // path through operand 0 of any 'or'. Also, peek through optional
15537 // shift-left-by-multiple-of-8-bits.
15538 Value *ZextLoad = Root;
15539 const APInt *ShAmtC;
15540 bool FoundOr = false;
15541 while (!isa<ConstantExpr>(ZextLoad) &&
15542 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15543 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15544 ShAmtC->urem(8) == 0))) {
15545 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15546 ZextLoad = BinOp->getOperand(0);
15547 if (BinOp->getOpcode() == Instruction::Or)
15548 FoundOr = true;
15549 }
15550 // Check if the input is an extended load of the required or/shift expression.
15551 Value *Load;
15552 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15553 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15554 return false;
15555
15556 // Require that the total load bit width is a legal integer type.
15557 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15558 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15559 Type *SrcTy = Load->getType();
15560 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15561 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15562 return false;
15563
15564 // Everything matched - assume that we can fold the whole sequence using
15565 // load combining.
15566 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15567 << *(cast<Instruction>(Root)) << "\n");
15568
15569 return true;
15570}
15571
15572 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15573 if (RdxKind != RecurKind::Or)
15574 return false;
15575
15576 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15577 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15578 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15579 /* MatchOr */ false);
15580}
15581
15582 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15583 // Peek through a final sequence of stores and check if all operations are
15584 // likely to be load-combined.
15585 unsigned NumElts = Stores.size();
15586 for (Value *Scalar : Stores) {
15587 Value *X;
15588 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15589 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15590 return false;
15591 }
15592 return true;
15593}
15594
15595bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15596 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15597 return true;
15598
15599 // Graph is empty - do nothing.
15600 if (VectorizableTree.empty()) {
15601 assert(ExternalUses.empty() && "We shouldn't have any external users");
15602
15603 return true;
15604 }
15605
15606 // No need to vectorize inserts of gathered values.
15607 if (VectorizableTree.size() == 2 &&
15608 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15609 VectorizableTree[1]->isGather() &&
15610 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15611 !(isSplat(VectorizableTree[1]->Scalars) ||
15612 allConstant(VectorizableTree[1]->Scalars))))
15613 return true;
15614
15615 // If the graph includes only PHI nodes and gathers, it is definitely not
15616 // profitable for the vectorization, we can skip it, if the cost threshold is
15617 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15618 // gathers/buildvectors.
15619 constexpr int Limit = 4;
15620 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15621 !VectorizableTree.empty() &&
15622 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15623 return (TE->isGather() &&
15624 (!TE->hasState() ||
15625 TE->getOpcode() != Instruction::ExtractElement) &&
15626 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15627 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15628 }))
15629 return true;
15630
15631 // Do not vectorize small tree of phis only, if all vector phis are also
15632 // gathered.
15633 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15634 VectorizableTree.size() <= Limit &&
15635 all_of(VectorizableTree,
15636 [&](const std::unique_ptr<TreeEntry> &TE) {
15637 return (TE->isGather() &&
15638 (!TE->hasState() ||
15639 TE->getOpcode() != Instruction::ExtractElement) &&
15640 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15641 Limit) ||
15642 (TE->hasState() &&
15643 (TE->getOpcode() == Instruction::InsertElement ||
15644 (TE->getOpcode() == Instruction::PHI &&
15645 all_of(TE->Scalars, [&](Value *V) {
15646 return isa<PoisonValue>(V) || MustGather.contains(V);
15647 }))));
15648 }) &&
15649 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15650 return TE->State == TreeEntry::Vectorize &&
15651 TE->getOpcode() == Instruction::PHI;
15652 }))
15653 return true;
15654
15655 // If the tree contains only phis, buildvectors, split nodes and
15656 // small nodes with reuses, we can skip it.
15657 SmallVector<const TreeEntry *> StoreLoadNodes;
15658 unsigned NumGathers = 0;
15659 constexpr int LimitTreeSize = 36;
15660 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15661 all_of(VectorizableTree,
15662 [&](const std::unique_ptr<TreeEntry> &TE) {
15663 if (!TE->isGather() && TE->hasState() &&
15664 (TE->getOpcode() == Instruction::Load ||
15665 TE->getOpcode() == Instruction::Store)) {
15666 StoreLoadNodes.push_back(TE.get());
15667 return true;
15668 }
15669 if (TE->isGather())
15670 ++NumGathers;
15671 return TE->State == TreeEntry::SplitVectorize ||
15672 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15673 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15674 VectorizableTree.size() > LimitTreeSize) ||
15675 (TE->isGather() &&
15676 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15677 (TE->hasState() &&
15678 (TE->getOpcode() == Instruction::PHI ||
15679 (TE->hasCopyableElements() &&
15680 static_cast<unsigned>(count_if(
15681 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15682 TE->Scalars.size() / 2) ||
15683 ((!TE->ReuseShuffleIndices.empty() ||
15684 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15685 TE->Scalars.size() == 2)));
15686 }) &&
15687 (StoreLoadNodes.empty() ||
15688 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15689 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15690 return TE->getOpcode() == Instruction::Store ||
15691 all_of(TE->Scalars, [&](Value *V) {
15692 return !isa<LoadInst>(V) ||
15693 areAllUsersVectorized(cast<Instruction>(V));
15694 });
15695 })))))
15696 return true;
15697
15698 // If the tree contains only a buildvector root, 2 non-buildvector nodes
15699 // (whose user is the root node) and other buildvectors, we can skip it.
15700 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15701 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15702 VectorizableTree.size() >= Limit &&
15703 count_if(ArrayRef(VectorizableTree).drop_front(),
15704 [&](const std::unique_ptr<TreeEntry> &TE) {
15705 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15706 TE->UserTreeIndex.UserTE->Idx == 0;
15707 }) == 2)
15708 return true;
15709
15710 // If the tree contains only vectorization of the phi node from the
15711 // buildvector - skip it.
15712 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15713 VectorizableTree.size() > 2 &&
15714 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15715 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15716 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15717 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15718 all_of(
15719 ArrayRef(VectorizableTree).drop_front(2),
15720 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15721 return true;
15722
15723 // We can vectorize the tree if its size is greater than or equal to the
15724 // minimum size specified by the MinTreeSize command line option.
15725 if (VectorizableTree.size() >= MinTreeSize)
15726 return false;
15727
15728 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15729 // can vectorize it if we can prove it fully vectorizable.
15730 if (isFullyVectorizableTinyTree(ForReduction))
15731 return false;
15732
15733 // Check if any of the gather nodes forms an insertelement buildvector
15734 // somewhere.
15735 bool IsAllowedSingleBVNode =
15736 VectorizableTree.size() > 1 ||
15737 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15738 !VectorizableTree.front()->isAltShuffle() &&
15739 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15740 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15741 allSameBlock(VectorizableTree.front()->Scalars));
15742 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15743 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15744 return isa<ExtractElementInst, Constant>(V) ||
15745 (IsAllowedSingleBVNode &&
15746 !V->hasNUsesOrMore(UsesLimit) &&
15747 any_of(V->users(), IsaPred<InsertElementInst>));
15748 });
15749 }))
15750 return false;
15751
15752 if (VectorizableTree.back()->isGather() &&
15753 VectorizableTree.back()->hasState() &&
15754 VectorizableTree.back()->isAltShuffle() &&
15755 VectorizableTree.back()->getVectorFactor() > 2 &&
15756 allSameBlock(VectorizableTree.back()->Scalars) &&
15757 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15758 TTI->getScalarizationOverhead(
15759 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15760 VectorizableTree.back()->getVectorFactor()),
15761 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15762 /*Insert=*/true, /*Extract=*/false,
15763 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
15764 return false;
15765
15766 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15767 // vectorizable.
15768 return true;
15769}
15770
15771 bool BoUpSLP::isTreeNotExtendable() const {
15772 if (getCanonicalGraphSize() != getTreeSize()) {
15773 constexpr unsigned SmallTree = 3;
15774 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15775 getCanonicalGraphSize() <= SmallTree &&
15776 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15777 [](const std::unique_ptr<TreeEntry> &TE) {
15778 return TE->isGather() && TE->hasState() &&
15779 TE->getOpcode() == Instruction::Load &&
15780 !allSameBlock(TE->Scalars);
15781 }) == 1)
15782 return true;
15783 return false;
15784 }
15785 bool Res = false;
15786 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15787 TreeEntry &E = *VectorizableTree[Idx];
15788 if (E.State == TreeEntry::SplitVectorize)
15789 return false;
15790 if (!E.isGather())
15791 continue;
15792 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15793 (!E.hasState() &&
15794 all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
15795 (isa<ExtractElementInst>(E.Scalars.front()) &&
15796 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15797 return false;
15798 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15799 continue;
15800 Res = true;
15801 }
15802 return Res;
15803}
15804
15806 // Walk from the bottom of the tree to the top, tracking which values are
15807 // live. When we see a call instruction that is not part of our tree,
15808 // query TTI to see if there is a cost to keeping values live over it
15809 // (for example, if spills and fills are required).
15810
15811 const TreeEntry *Root = VectorizableTree.front().get();
15812 if (Root->isGather())
15813 return 0;
15814
15815 InstructionCost Cost = 0;
15816 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15817 EntriesToOperands;
15818 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15819 SmallPtrSet<const Instruction *, 8> LastInstructions;
15820 for (const auto &TEPtr : VectorizableTree) {
15821 if (!TEPtr->isGather()) {
15822 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15823 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15824 LastInstructions.insert(LastInst);
15825 }
15826 if (TEPtr->UserTreeIndex)
15827 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15828 }
15829
15830 auto NoCallIntrinsic = [this](const Instruction *I) {
15831 const auto *II = dyn_cast<IntrinsicInst>(I);
15832 if (!II)
15833 return false;
15834 if (II->isAssumeLikeIntrinsic())
15835 return true;
15836 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15837 InstructionCost IntrCost =
15838 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15839 InstructionCost CallCost = TTI->getCallInstrCost(
15840 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15841 return IntrCost < CallCost;
15842 };
15843
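// The walk below scans backwards over at most BudgetLimit instructions per
// query. Any non-vectorized call that is not a cheap intrinsic forces the
// live vector operands to be spilled around it, which AddCosts charges via
// getCostOfKeepingLiveOverCall. Illustrative example: a <4 x float> operand
// kept live across a call to an external function adds the target's estimate
// for spilling and reloading that vector type.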
15844 // Maps the last instruction of an entry to the last instruction of one of
15845 // its operand entries, plus a flag. If the flag is set, there are no calls
15846 // in between these instructions.
15847 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15848 CheckedInstructions;
15849 unsigned Budget = 0;
15850 const unsigned BudgetLimit =
15851 ScheduleRegionSizeBudget / VectorizableTree.size();
15852 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15853 const Instruction *Last) {
15854 assert(First->getParent() == Last->getParent() &&
15855 "Expected instructions in same block.");
15856 if (auto It = CheckedInstructions.find(Last);
15857 It != CheckedInstructions.end()) {
15858 const Instruction *Checked = It->second.getPointer();
15859 if (Checked == First || Checked->comesBefore(First))
15860 return It->second.getInt() != 0;
15861 Last = Checked;
15862 } else if (Last == First || Last->comesBefore(First)) {
15863 return true;
15864 }
15865 BasicBlock::reverse_iterator InstIt =
15866 ++First->getIterator().getReverse(),
15867 PrevInstIt =
15868 Last->getIterator().getReverse();
15869 SmallVector<const Instruction *> LastInstsInRange;
15870 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15871 // Debug information does not impact spill cost.
15872 // Vectorized calls, represented as vector intrinsics, do not impact spill
15873 // cost.
15874 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15875 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15876 for (const Instruction *LastInst : LastInstsInRange)
15877 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15878 return false;
15879 }
15880 if (LastInstructions.contains(&*PrevInstIt))
15881 LastInstsInRange.push_back(&*PrevInstIt);
15882
15883 ++PrevInstIt;
15884 ++Budget;
15885 }
15886 for (const Instruction *LastInst : LastInstsInRange)
15887 CheckedInstructions.try_emplace(
15888 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15889 Budget <= BudgetLimit ? 1 : 0);
15890 return Budget <= BudgetLimit;
15891 };
15892 auto AddCosts = [&](const TreeEntry *Op) {
15893 Type *ScalarTy = Op->Scalars.front()->getType();
15894 auto It = MinBWs.find(Op);
15895 if (It != MinBWs.end())
15896 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15897 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15898 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15899 if (ScalarTy->isVectorTy()) {
15900 // Handle revec dead vector instructions.
15901 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15902 }
15903 };
15904 // Memoize the relationship between blocks, i.e. whether there is (at least
15905 // one) non-vectorized call between them. This allows us to skip analyzing
15906 // the same block paths multiple times.
15907 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15908 ParentOpParentToPreds;
15909 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15910 BasicBlock *OpParent) {
15911 auto Key = std::make_pair(Root, OpParent);
15912 if (auto It = ParentOpParentToPreds.find(Key);
15913 It != ParentOpParentToPreds.end())
15914 return It->second;
15915 SmallVector<BasicBlock *> Worklist;
15916 if (Pred)
15917 Worklist.push_back(Pred);
15918 else
15919 Worklist.append(pred_begin(Root), pred_end(Root));
15920 SmallPtrSet<const BasicBlock *, 16> Visited;
15921 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 8>
15922 ParentsPairsToAdd;
15923 bool Res = false;
15924 auto Cleanup = make_scope_exit([&]() {
15925 for (const auto &KeyPair : ParentsPairsToAdd) {
15926 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15927 "Should not have been added before.");
15928 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15929 }
15930 });
15931 while (!Worklist.empty()) {
15932 BasicBlock *BB = Worklist.pop_back_val();
15933 if (BB == OpParent || !Visited.insert(BB).second)
15934 continue;
15935 auto Pair = std::make_pair(BB, OpParent);
15936 if (auto It = ParentOpParentToPreds.find(Pair);
15937 It != ParentOpParentToPreds.end()) {
15938 Res = It->second;
15939 return Res;
15940 }
15941 ParentsPairsToAdd.insert(Pair);
15942 unsigned BlockSize = BB->size();
15943 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15944 return Res;
15945 Budget += BlockSize;
15946 if (Budget > BudgetLimit)
15947 return Res;
15948 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15949 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15950 BB->getTerminator()))
15951 return Res;
15952 Worklist.append(pred_begin(BB), pred_end(BB));
15953 }
15954 Res = true;
15955 return Res;
15956 };
15957 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15958 while (!LiveEntries.empty()) {
15959 const TreeEntry *Entry = LiveEntries.pop_back_val();
15960 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15961 if (Operands.empty())
15962 continue;
15963 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15964 BasicBlock *Parent = LastInst->getParent();
15965 for (const TreeEntry *Op : Operands) {
15966 if (!Op->isGather())
15967 LiveEntries.push_back(Op);
15968 if (Entry->State == TreeEntry::SplitVectorize ||
15969 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15970 (Op->isGather() && allConstant(Op->Scalars)))
15971 continue;
15972 Budget = 0;
15973 BasicBlock *Pred = nullptr;
15974 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15975 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15976 BasicBlock *OpParent;
15977 Instruction *OpLastInst;
15978 if (Op->isGather()) {
15979 assert(Entry->getOpcode() == Instruction::PHI &&
15980 "Expected phi node only.");
15981 OpParent = cast<PHINode>(Entry->getMainOp())
15982 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15983 OpLastInst = OpParent->getTerminator();
15984 for (Value *V : Op->Scalars) {
15985 auto *Inst = dyn_cast<Instruction>(V);
15986 if (!Inst)
15987 continue;
15988 if (isVectorized(V)) {
15989 OpParent = Inst->getParent();
15990 OpLastInst = Inst;
15991 break;
15992 }
15993 }
15994 } else {
15995 OpLastInst = EntriesToLastInstruction.at(Op);
15996 OpParent = OpLastInst->getParent();
15997 }
15998 // Check the call instructions within the same basic blocks.
15999 if (OpParent == Parent) {
16000 if (Entry->getOpcode() == Instruction::PHI) {
16001 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16002 AddCosts(Op);
16003 continue;
16004 }
16005 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16006 AddCosts(Op);
16007 continue;
16008 }
16009 // Check for call instruction in between blocks.
16010 // 1. Check entry's block to the head.
16011 if (Entry->getOpcode() != Instruction::PHI &&
16012 !CheckForNonVecCallsInSameBlock(
16013 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
16014 LastInst)) {
16015 AddCosts(Op);
16016 continue;
16017 }
16018 // 2. Check op's block from the end.
16019 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16020 OpParent->getTerminator())) {
16021 AddCosts(Op);
16022 continue;
16023 }
16024 // 3. Check the predecessors of entry's block till op's block.
16025 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16026 AddCosts(Op);
16027 continue;
16028 }
16029 }
16030 }
16031
16032 return Cost;
16033}
16034
16035 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
16036 /// the buildvector sequence.
16037 static bool isFirstInsertElement(const InsertElementInst *IE1,
16038 const InsertElementInst *IE2) {
16039 if (IE1 == IE2)
16040 return false;
16041 const auto *I1 = IE1;
16042 const auto *I2 = IE2;
16043 const InsertElementInst *PrevI1;
16044 const InsertElementInst *PrevI2;
16045 unsigned Idx1 = *getElementIndex(IE1);
16046 unsigned Idx2 = *getElementIndex(IE2);
16047 do {
16048 if (I2 == IE1)
16049 return true;
16050 if (I1 == IE2)
16051 return false;
16052 PrevI1 = I1;
16053 PrevI2 = I2;
16054 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16055 getElementIndex(I1).value_or(Idx2) != Idx2)
16056 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
16057 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16058 getElementIndex(I2).value_or(Idx1) != Idx1)
16059 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
16060 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16061 llvm_unreachable("Two different buildvectors not expected.");
16062}
16063
16064namespace {
16065/// Returns incoming Value *, if the requested type is Value * too, or a default
16066/// value, otherwise.
16067struct ValueSelect {
16068 template <typename U>
16069 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16070 return V;
16071 }
16072 template <typename U>
16073 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16074 return U();
16075 }
16076};
16077} // namespace
16078
16079/// Does the analysis of the provided shuffle masks and performs the requested
16080/// actions on the vectors with the given shuffle masks. It tries to do it in
16081/// several steps.
16082 /// 1. If the Base vector is not an undef vector, resize the very first mask to
16083 /// a common VF and perform the action for 2 input vectors (including the
16084 /// non-undef Base). Other shuffle masks are combined with the result of the
16085 /// first stage and processed as a shuffle of 2 elements.
16086 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
16087 /// the action only for 1 vector with the given mask, if it is not the identity
16088 /// mask.
16089 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
16090 /// vectors, combining the masks properly between the steps.
16091template <typename T>
16092 static T *performExtractsShuffleAction(
16093 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16094 function_ref<unsigned(T *)> GetVF,
16095 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16096 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16097 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16098 SmallVector<int> Mask(ShuffleMask.begin()->second);
16099 auto VMIt = std::next(ShuffleMask.begin());
16100 T *Prev = nullptr;
16101 SmallBitVector UseMask =
16102 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16103 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
16104 if (!IsBaseUndef.all()) {
16105 // Base is not undef, need to combine it with the next subvectors.
16106 std::pair<T *, bool> Res =
16107 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16108 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
16109 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16110 if (Mask[Idx] == PoisonMaskElem)
16111 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16112 else
16113 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16114 }
16115 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16116 assert((!V || GetVF(V) == Mask.size()) &&
16117 "Expected base vector of VF number of elements.");
16118 Prev = Action(Mask, {nullptr, Res.first});
16119 } else if (ShuffleMask.size() == 1) {
16120 // Base is undef and only 1 vector is shuffled - perform the action only for
16121 // single vector, if the mask is not the identity mask.
16122 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16123 /*ForSingleMask=*/true);
16124 if (Res.second)
16125 // Identity mask is found.
16126 Prev = Res.first;
16127 else
16128 Prev = Action(Mask, {ShuffleMask.begin()->first});
16129 } else {
16130 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
16131 // shuffles step by step, combining shuffle between the steps.
16132 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16133 unsigned Vec2VF = GetVF(VMIt->first);
16134 if (Vec1VF == Vec2VF) {
16135 // No need to resize the input vectors since they are of the same size, we
16136 // can shuffle them directly.
16137 ArrayRef<int> SecMask = VMIt->second;
16138 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16139 if (SecMask[I] != PoisonMaskElem) {
16140 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16141 Mask[I] = SecMask[I] + Vec1VF;
16142 }
16143 }
16144 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16145 } else {
16146 // Vectors of different sizes - resize and reshuffle.
16147 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16148 /*ForSingleMask=*/false);
16149 std::pair<T *, bool> Res2 =
16150 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16151 ArrayRef<int> SecMask = VMIt->second;
16152 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16153 if (Mask[I] != PoisonMaskElem) {
16154 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16155 if (Res1.second)
16156 Mask[I] = I;
16157 } else if (SecMask[I] != PoisonMaskElem) {
16158 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16159 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16160 }
16161 }
16162 Prev = Action(Mask, {Res1.first, Res2.first});
16163 }
16164 VMIt = std::next(VMIt);
16165 }
16166 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16167 // Perform requested actions for the remaining masks/vectors.
16168 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16169 // Shuffle other input vectors, if any.
16170 std::pair<T *, bool> Res =
16171 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16172 ArrayRef<int> SecMask = VMIt->second;
16173 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16174 if (SecMask[I] != PoisonMaskElem) {
16175 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16176 "Multiple uses of scalars.");
16177 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16178 } else if (Mask[I] != PoisonMaskElem) {
16179 Mask[I] = I;
16180 }
16181 }
16182 Prev = Action(Mask, {Prev, Res.first});
16183 }
16184 return Prev;
16185}
16186
16187namespace {
16188/// Data type for handling buildvector sequences with the reused scalars from
16189/// other tree entries.
16190template <typename T> struct ShuffledInsertData {
16191 /// List of insertelements to be replaced by shuffles.
16192 SmallVector<InsertElementInst *> InsertElements;
16193 /// The parent vectors and shuffle mask for the given list of inserts.
16194 MapVector<T, SmallVector<int>> ValueMasks;
16195};
16196} // namespace
16197
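// The final tree cost combines the per-entry costs from getEntryCost with the
// cost of extracting externally used scalars, and credits insertelement
// sequences that can be rewritten as shuffles of the produced vectors.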
16198 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
16199 InstructionCost ReductionCost) {
16200 InstructionCost Cost = ReductionCost;
16201 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16202 << VectorizableTree.size() << ".\n");
16203
16204 SmallPtrSet<Value *, 4> CheckedExtracts;
16205 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
16206 TreeEntry &TE = *VectorizableTree[I];
16207 // No need to count the cost for combined entries, they are combined and
16208 // just skip their cost.
16209 if (TE.State == TreeEntry::CombinedVectorize) {
16210 LLVM_DEBUG(
16211 dbgs() << "SLP: Skipping cost for combined node that starts with "
16212 << *TE.Scalars[0] << ".\n";
16213 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16214 continue;
16215 }
16216 if (TE.hasState() &&
16217 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16218 if (const TreeEntry *E =
16219 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16220 E && E->getVectorFactor() == TE.getVectorFactor()) {
16221 // Some gather nodes might be absolutely the same as some vectorizable
16222 // nodes after reordering, need to handle it.
16223 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16224 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16225 << "SLP: Current total cost = " << Cost << "\n");
16226 continue;
16227 }
16228 }
16229
16230 // Exclude the cost of gathered-load nodes which are not used. These nodes
16231 // were built as part of the final attempt to vectorize gathered loads.
16232 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16233 "Expected gather nodes with users only.");
16234
16235 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16236 Cost += C;
16237 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16238 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16239 << "SLP: Current total cost = " << Cost << "\n");
16240 }
16241
16242 if (Cost >= -SLPCostThreshold &&
16243 none_of(ExternalUses, [](const ExternalUser &EU) {
16244 return isa_and_nonnull<InsertElementInst>(EU.User);
16245 }))
16246 return Cost;
16247
16248 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16249 InstructionCost ExtractCost = 0;
16250 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16251 SmallVector<APInt> DemandedElts;
16252 SmallDenseSet<Value *, 4> UsedInserts;
16253 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
16254 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16256 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16257 // Keep track {Scalar, Index, User} tuple.
16258 // On AArch64, this helps in fusing a mov instruction, associated with
16259 // extractelement, with fmul in the backend so that extractelement is free.
16261 for (ExternalUser &EU : ExternalUses) {
16262 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16263 }
16264 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16265 for (ExternalUser &EU : ExternalUses) {
16266 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16267 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16268 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16269 else dbgs() << " User: nullptr\n");
16270 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16271
16272 // Uses by ephemeral values are free (because the ephemeral value will be
16273 // removed prior to code generation, and so the extraction will be
16274 // removed as well).
16275 if (EphValues.count(EU.User))
16276 continue;
16277
16278 // Check if the scalar for the given user or all users is accounted already.
16279 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16280 (EU.User &&
16281 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16282 continue;
16283
16284 // Used in unreachable blocks or in EH pads (rarely executed) or is
16285 // terminated with unreachable instruction.
16286 if (BasicBlock *UserParent =
16287 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16288 UserParent &&
16289 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16290 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16291 continue;
16292
16293 // We only add extract cost once for the same scalar.
16294 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16295 !ExtractCostCalculated.insert(EU.Scalar).second)
16296 continue;
16297
16298 // No extract cost for vector "scalar" if REVEC is disabled
16299 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16300 continue;
16301
16302 // If found user is an insertelement, do not calculate extract cost but try
16303 // to detect it as a final shuffled/identity match.
16304 // TODO: what if a user is insertvalue when REVEC is enabled?
16305 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16306 VU && VU->getOperand(1) == EU.Scalar) {
16307 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16308 if (!UsedInserts.insert(VU).second)
16309 continue;
16310 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16311 if (InsertIdx) {
16312 const TreeEntry *ScalarTE = &EU.E;
16313 auto *It = find_if(
16314 ShuffledInserts,
16315 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16316 // Checks if 2 insertelements are from the same buildvector.
16317 InsertElementInst *VecInsert = Data.InsertElements.front();
16318 return areTwoInsertFromSameBuildVector(
16319 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16320 Value *Op0 = II->getOperand(0);
16321 if (isVectorized(II) && !isVectorized(Op0))
16322 return nullptr;
16323 return Op0;
16324 });
16325 });
16326 int VecId = -1;
16327 if (It == ShuffledInserts.end()) {
16328 auto &Data = ShuffledInserts.emplace_back();
16329 Data.InsertElements.emplace_back(VU);
16330 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16331 VecId = ShuffledInserts.size() - 1;
16332 auto It = MinBWs.find(ScalarTE);
16333 if (It != MinBWs.end() &&
16334 VectorCasts
16335 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16336 .second) {
16337 unsigned BWSz = It->second.first;
16338 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16339 unsigned VecOpcode;
16340 if (DstBWSz < BWSz)
16341 VecOpcode = Instruction::Trunc;
16342 else
16343 VecOpcode =
16344 It->second.second ? Instruction::SExt : Instruction::ZExt;
16345 TTI::CastContextHint CCH = TTI::CastContextHint::None;
16346 InstructionCost C = TTI->getCastInstrCost(
16347 VecOpcode, FTy,
16348 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16349 FTy->getNumElements()),
16351 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16352 << " for extending externally used vector with "
16353 "non-equal minimum bitwidth.\n");
16354 Cost += C;
16355 }
16356 } else {
16357 if (isFirstInsertElement(VU, It->InsertElements.front()))
16358 It->InsertElements.front() = VU;
16359 VecId = std::distance(ShuffledInserts.begin(), It);
16360 }
16361 int InIdx = *InsertIdx;
16362 SmallVectorImpl<int> &Mask =
16363 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16364 if (Mask.empty())
16365 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16366 Mask[InIdx] = EU.Lane;
16367 DemandedElts[VecId].setBit(InIdx);
16368 continue;
16369 }
16370 }
16371 }
16372
16374 // If we plan to rewrite the tree in a smaller type, we will need to sign
16375 // extend the extracted value back to the original type. Here, we account
16376 // for the extract and the added cost of the sign extend if needed.
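// For example (illustrative): if the entry was narrowed to i16 lanes but the
// external user expects i32, the cost is an extract from the <VF x i16>
// vector plus the zext/sext back to i32 (getExtractWithExtendCost below);
// otherwise it is a plain extractelement cost.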
16377 InstructionCost ExtraCost = TTI::TCC_Free;
16378 auto *ScalarTy = EU.Scalar->getType();
16379 const unsigned BundleWidth = EU.E.getVectorFactor();
16380 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16381 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16382 const TreeEntry *Entry = &EU.E;
16383 auto It = MinBWs.find(Entry);
16384 if (It != MinBWs.end()) {
16385 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16386 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16387 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16388 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16389 ? Instruction::ZExt
16390 : Instruction::SExt;
16391 VecTy = getWidenedType(MinTy, BundleWidth);
16392 ExtraCost =
16393 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16394 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16395 << ExtraCost << "\n");
16396 } else {
16397 ExtraCost =
16398 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16399 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16400 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16401 << *VecTy << ": " << ExtraCost << "\n");
16402 }
16403 // Leave the scalar instructions as is if they are cheaper than extracts.
16404 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16405 Entry->getOpcode() == Instruction::Load) {
16406 // Checks if the user of the external scalar is a phi in the loop body.
16407 auto IsPhiInLoop = [&](const ExternalUser &U) {
16408 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16409 auto *I = cast<Instruction>(U.Scalar);
16410 const Loop *L = LI->getLoopFor(Phi->getParent());
16411 return L && (Phi->getParent() == I->getParent() ||
16412 L == LI->getLoopFor(I->getParent()));
16413 }
16414 return false;
16415 };
16416 if (!ValueToExtUses) {
16417 ValueToExtUses.emplace();
16418 for (const auto &P : enumerate(ExternalUses)) {
16419 // Ignore phis in loops.
16420 if (IsPhiInLoop(P.value()))
16421 continue;
16422
16423 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16424 }
16425 }
16426 // Can use original instruction, if no operands vectorized or they are
16427 // marked as externally used already.
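// For example (illustrative): for an externally used `%a = add i32 %x, %y`
// whose operands either stay scalar or are extracted anyway, the cost of
// keeping %a as a scalar add (ScalarCost) is compared against the cost of
// extracting %a's lane from the vector (ExtraCost), and the cheaper option
// wins below.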
16428 auto *Inst = cast<Instruction>(EU.Scalar);
16429 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16430 auto OperandIsScalar = [&](Value *V) {
16431 if (!isVectorized(V)) {
16432 // Some extractelements might be not vectorized, but
16433 // transformed into shuffle and removed from the function,
16434 // consider it here.
16435 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16436 return !EE->hasOneUse() || !MustGather.contains(EE);
16437 return true;
16438 }
16439 return ValueToExtUses->contains(V);
16440 };
16441 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16442 bool CanBeUsedAsScalarCast = false;
16443 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16444 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16445 Op && all_of(Op->operands(), OperandIsScalar)) {
16446 InstructionCost OpCost =
16447 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16448 ? TTI->getInstructionCost(Op, CostKind)
16449 : 0;
16450 if (ScalarCost + OpCost <= ExtraCost) {
16451 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16452 ScalarCost += OpCost;
16453 }
16454 }
16455 }
16456 if (CanBeUsedAsScalar) {
16457 bool KeepScalar = ScalarCost <= ExtraCost;
16458 // Try to keep the original scalar if the user is a phi node from the same
16459 // block as the root phis currently being vectorized. This preserves better
16460 // ordering info for the PHIs being vectorized.
16461 bool IsProfitablePHIUser =
16462 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16463 VectorizableTree.front()->Scalars.size() > 2)) &&
16464 VectorizableTree.front()->hasState() &&
16465 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16466 !Inst->hasNUsesOrMore(UsesLimit) &&
16467 none_of(Inst->users(),
16468 [&](User *U) {
16469 auto *PHIUser = dyn_cast<PHINode>(U);
16470 return (!PHIUser ||
16471 PHIUser->getParent() !=
16472 cast<Instruction>(
16473 VectorizableTree.front()->getMainOp())
16474 ->getParent()) &&
16475 !isVectorized(U);
16476 }) &&
16477 count_if(Entry->Scalars, [&](Value *V) {
16478 return ValueToExtUses->contains(V);
16479 }) <= 2;
16480 if (IsProfitablePHIUser) {
16481 KeepScalar = true;
16482 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16483 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16484 (!GatheredLoadsEntriesFirst.has_value() ||
16485 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16486 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16487 return ValueToExtUses->contains(V);
16488 });
16489 auto It = ExtractsCount.find(Entry);
16490 if (It != ExtractsCount.end()) {
16491 assert(ScalarUsesCount >= It->getSecond().size() &&
16492 "Expected total number of external uses not less than "
16493 "number of scalar uses.");
16494 ScalarUsesCount -= It->getSecond().size();
16495 }
16496 // Keep the original scalar if the number of externally used instructions
16497 // in the same entry is not a power of 2. This may enable some extra
16498 // vectorization for now.
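// For example (illustrative): if, after subtracting already-kept scalars, 3
// external uses remain in this entry, 3 is not a power of 2 and the scalar
// is kept; with exactly 2 or 4 remaining uses the extracts are preferred.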
16499 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16500 }
16501 if (KeepScalar) {
16502 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16503 for (Value *V : Inst->operands()) {
16504 auto It = ValueToExtUses->find(V);
16505 if (It != ValueToExtUses->end()) {
16506 // Replace all uses to avoid compiler crash.
16507 ExternalUses[It->second].User = nullptr;
16508 }
16509 }
16510 ExtraCost = ScalarCost;
16511 if (!IsPhiInLoop(EU))
16512 ExtractsCount[Entry].insert(Inst);
16513 if (CanBeUsedAsScalarCast) {
16514 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16515 // Update the users of the operands of the cast operand to avoid
16516 // compiler crash.
16517 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16518 for (Value *V : IOp->operands()) {
16519 auto It = ValueToExtUses->find(V);
16520 if (It != ValueToExtUses->end()) {
16521 // Replace all uses to avoid compiler crash.
16522 ExternalUses[It->second].User = nullptr;
16523 }
16524 }
16525 }
16526 }
16527 }
16528 }
16529 }
16530
16531 ExtractCost += ExtraCost;
16532 }
16533 // Register external uses for the operands of casts that will be emitted as
16534 // scalars instead of extractelements.
16535 for (Value *V : ScalarOpsFromCasts) {
16536 ExternalUsesAsOriginalScalar.insert(V);
16537 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16538 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16539 TEs.front()->findLaneForValue(V));
16540 }
16541 }
16542 // Add reduced value cost, if resized.
16543 if (!VectorizedVals.empty()) {
16544 const TreeEntry &Root = *VectorizableTree.front();
16545 auto BWIt = MinBWs.find(&Root);
16546 if (BWIt != MinBWs.end()) {
16547 Type *DstTy = Root.Scalars.front()->getType();
16548 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16549 unsigned SrcSz =
16550 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16551 if (OriginalSz != SrcSz) {
16552 unsigned Opcode = Instruction::Trunc;
16553 if (OriginalSz > SrcSz)
16554 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16555 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16556 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16557 assert(SLPReVec && "Only supported by REVEC.");
16558 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16559 }
16560 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16561 TTI::CastContextHint::None,
16562 TTI::TCK_RecipThroughput);
16563 }
16564 }
16565 }
16566
16567 Cost += ExtractCost;
16568 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16569 bool ForSingleMask) {
16570 InstructionCost C = 0;
16571 unsigned VF = Mask.size();
16572 unsigned VecVF = TE->getVectorFactor();
16573 bool HasLargeIndex =
16574 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16575 if ((VF != VecVF && HasLargeIndex) ||
16576 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16577
16578 if (HasLargeIndex) {
16579 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16580 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16581 OrigMask.begin());
16582 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16583 getWidenedType(TE->getMainOp()->getType(), VecVF),
16584 OrigMask);
16585 LLVM_DEBUG(
16586 dbgs() << "SLP: Adding cost " << C
16587 << " for final shuffle of insertelement external users.\n";
16588 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16589 Cost += C;
16590 return std::make_pair(TE, true);
16591 }
16592
16593 if (!ForSingleMask) {
16594 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16595 for (unsigned I = 0; I < VF; ++I) {
16596 if (Mask[I] != PoisonMaskElem)
16597 ResizeMask[Mask[I]] = Mask[I];
16598 }
16599 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16600 C = ::getShuffleCost(
16601 *TTI, TTI::SK_PermuteSingleSrc,
16602 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16603 LLVM_DEBUG(
16604 dbgs() << "SLP: Adding cost " << C
16605 << " for final shuffle of insertelement external users.\n";
16606 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16607
16608 Cost += C;
16609 }
16610 }
16611 return std::make_pair(TE, false);
16612 };
16613 // Calculate the cost of the reshuffled vectors, if any.
16614 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16615 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16616 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16617 unsigned VF = 0;
16618 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16619 ArrayRef<const TreeEntry *> TEs) {
16620 assert((TEs.size() == 1 || TEs.size() == 2) &&
16621 "Expected exactly 1 or 2 tree entries.");
16622 if (TEs.size() == 1) {
16623 if (VF == 0)
16624 VF = TEs.front()->getVectorFactor();
16625 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16626 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16627 !all_of(enumerate(Mask), [=](const auto &Data) {
16628 return Data.value() == PoisonMaskElem ||
16629 (Data.index() < VF &&
16630 static_cast<int>(Data.index()) == Data.value());
16631 })) {
16632 InstructionCost C =
16633 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16634 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16635 << " for final shuffle of insertelement "
16636 "external users.\n";
16637 TEs.front()->dump();
16638 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16639 Cost += C;
16640 }
16641 } else {
16642 if (VF == 0) {
16643 if (TEs.front() &&
16644 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16645 VF = TEs.front()->getVectorFactor();
16646 else
16647 VF = Mask.size();
16648 }
16649 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16650 InstructionCost C =
16651 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16652 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16653 << " for final shuffle of vector node and external "
16654 "insertelement users.\n";
16655 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16656 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16657 Cost += C;
16658 }
16659 VF = Mask.size();
16660 return TEs.back();
16661 };
16662 performExtractsShuffleAction<const TreeEntry>(
16663 MutableArrayRef(Vector.data(), Vector.size()), Base,
16664 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16665 EstimateShufflesCost);
16666 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16667 cast<FixedVectorType>(
16668 ShuffledInserts[I].InsertElements.front()->getType()),
16669 DemandedElts[I],
16670 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16671 Cost -= InsertCost;
16672 }
16673
16674 // Add the cost for reduced value resize (if required).
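// For example (illustrative): if the reduction lanes were narrowed to i8
// (MinBWs) while the reduced result is produced in i32 (ReductionBitWidth),
// a <VF x i8> -> <VF x i32> extension is costed here; for plain arithmetic
// reductions the extension is expected to be folded into an extended
// reduction, which is modeled by picking the BitCast opcode below.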
16675 if (ReductionBitWidth != 0) {
16676 assert(UserIgnoreList && "Expected reduction tree.");
16677 const TreeEntry &E = *VectorizableTree.front();
16678 auto It = MinBWs.find(&E);
16679 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16680 unsigned SrcSize = It->second.first;
16681 unsigned DstSize = ReductionBitWidth;
16682 unsigned Opcode = Instruction::Trunc;
16683 if (SrcSize < DstSize) {
16684 bool IsArithmeticExtendedReduction =
16685 all_of(*UserIgnoreList, [](Value *V) {
16686 auto *I = cast<Instruction>(V);
16687 return is_contained({Instruction::Add, Instruction::FAdd,
16688 Instruction::Mul, Instruction::FMul,
16689 Instruction::And, Instruction::Or,
16690 Instruction::Xor},
16691 I->getOpcode());
16692 });
16693 if (IsArithmeticExtendedReduction)
16694 Opcode =
16695 Instruction::BitCast; // Handle it by getExtendedReductionCost
16696 else
16697 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16698 }
16699 if (Opcode != Instruction::BitCast) {
16700 auto *SrcVecTy =
16701 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16702 auto *DstVecTy =
16703 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16704 TTI::CastContextHint CCH = getCastContextHint(E);
16705 InstructionCost CastCost;
16706 switch (E.getOpcode()) {
16707 case Instruction::SExt:
16708 case Instruction::ZExt:
16709 case Instruction::Trunc: {
16710 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16711 CCH = getCastContextHint(*OpTE);
16712 break;
16713 }
16714 default:
16715 break;
16716 }
16717 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16718 TTI::TCK_RecipThroughput);
16719 Cost += CastCost;
16720 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16721 << " for final resize for reduction from " << SrcVecTy
16722 << " to " << DstVecTy << "\n";
16723 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16724 }
16725 }
16726 }
16727
16728 std::optional<InstructionCost> SpillCost;
16729 if (Cost < -SLPCostThreshold) {
16730 SpillCost = getSpillCost();
16731 Cost += *SpillCost;
16732 }
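// Note: the spill cost is only computed once the tree already looks
// profitable (Cost < -SLPCostThreshold), which avoids the extra spill-cost
// walk for trees that would be rejected anyway.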
16733#ifndef NDEBUG
16734 SmallString<256> Str;
16735 {
16736 raw_svector_ostream OS(Str);
16737 OS << "SLP: Spill Cost = ";
16738 if (SpillCost)
16739 OS << *SpillCost;
16740 else
16741 OS << "<skipped>";
16742 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16743 << "SLP: Total Cost = " << Cost << ".\n";
16744 }
16745 LLVM_DEBUG(dbgs() << Str);
16746 if (ViewSLPTree)
16747 ViewGraph(this, "SLP" + F->getName(), false, Str);
16748#endif
16749
16750 return Cost;
16751}
16752
16753/// Tries to find extractelement instructions with constant indices from fixed
16754/// vector type and gather such instructions into a bunch, which highly likely
16755/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16756/// successful, the matched scalars are replaced by poison values in \p VL for
16757/// future analysis.
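/// For example (illustrative): a gather of {a[0], poison, a[2], b[1]} built
/// from extractelements of two <4 x i32> vectors %a and %b may be matched as
/// a two-source shuffle with mask {0, poison, 2, 5}.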
16758std::optional<TTI::ShuffleKind>
16759 BoUpSLP::tryToGatherSingleRegisterExtractElements(
16760 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16761 // Scan list of gathered scalars for extractelements that can be represented
16762 // as shuffles.
16763 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16764 SmallVector<int> UndefVectorExtracts;
16765 for (int I = 0, E = VL.size(); I < E; ++I) {
16766 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16767 if (!EI) {
16768 if (isa<UndefValue>(VL[I]))
16769 UndefVectorExtracts.push_back(I);
16770 continue;
16771 }
16772 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16773 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16774 continue;
16775 std::optional<unsigned> Idx = getExtractIndex(EI);
16776 // Undefined index.
16777 if (!Idx) {
16778 UndefVectorExtracts.push_back(I);
16779 continue;
16780 }
16781 if (Idx >= VecTy->getNumElements()) {
16782 UndefVectorExtracts.push_back(I);
16783 continue;
16784 }
16785 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16786 ExtractMask.reset(*Idx);
16787 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16788 UndefVectorExtracts.push_back(I);
16789 continue;
16790 }
16791 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16792 }
16793 // Sort the vector operands by the maximum number of uses in extractelements.
16794 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16795 VectorOpToIdx.takeVector();
16796 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16797 return P1.second.size() > P2.second.size();
16798 });
16799 // Find the best pair of the vectors or a single vector.
16800 const int UndefSz = UndefVectorExtracts.size();
16801 unsigned SingleMax = 0;
16802 unsigned PairMax = 0;
16803 if (!Vectors.empty()) {
16804 SingleMax = Vectors.front().second.size() + UndefSz;
16805 if (Vectors.size() > 1) {
16806 auto *ItNext = std::next(Vectors.begin());
16807 PairMax = SingleMax + ItNext->second.size();
16808 }
16809 }
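// For example (illustrative): with 8 extracts from %a, 3 extracts from %b and
// 2 undef elements, SingleMax is 10 and PairMax is 13, so the pair {%a, %b}
// is chosen for the two-source shuffle below.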
16810 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16811 return std::nullopt;
16812 // Check if better to perform a shuffle of 2 vectors or just of a single
16813 // vector.
16814 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16815 SmallVector<Value *> GatheredExtracts(
16816 VL.size(), PoisonValue::get(VL.front()->getType()));
16817 if (SingleMax >= PairMax && SingleMax) {
16818 for (int Idx : Vectors.front().second)
16819 std::swap(GatheredExtracts[Idx], VL[Idx]);
16820 } else if (!Vectors.empty()) {
16821 for (unsigned Idx : {0, 1})
16822 for (int Idx : Vectors[Idx].second)
16823 std::swap(GatheredExtracts[Idx], VL[Idx]);
16824 }
16825 // Add extracts from undefs too.
16826 for (int Idx : UndefVectorExtracts)
16827 std::swap(GatheredExtracts[Idx], VL[Idx]);
16828 // Check that the gather of extractelements can be represented as just a
16829 // shuffle of one or two vectors from which the scalars are extracted.
16830 std::optional<TTI::ShuffleKind> Res =
16831 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16832 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16833 // TODO: try to check other subsets if possible.
16834 // Restore the original VL if attempt was not successful.
16835 copy(SavedVL, VL.begin());
16836 return std::nullopt;
16837 }
16838 // Restore unused scalars from mask, if some of the extractelements were not
16839 // selected for shuffle.
16840 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16841 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16842 isa<UndefValue>(GatheredExtracts[I])) {
16843 std::swap(VL[I], GatheredExtracts[I]);
16844 continue;
16845 }
16846 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16847 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16848 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16849 is_contained(UndefVectorExtracts, I))
16850 continue;
16851 }
16852 return Res;
16853}
16854
16855/// Tries to find extractelement instructions with constant indices from fixed
16856/// vector type and gather such instructions into a bunch, which highly likely
16857/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16858/// successful, the matched scalars are replaced by poison values in \p VL for
16859/// future analysis.
16860 SmallVector<std::optional<TTI::ShuffleKind>>
16861 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16862 SmallVectorImpl<int> &Mask,
16863 unsigned NumParts) const {
16864 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16865 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16866 Mask.assign(VL.size(), PoisonMaskElem);
16867 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16868 for (unsigned Part : seq<unsigned>(NumParts)) {
16869 // Scan list of gathered scalars for extractelements that can be represented
16870 // as shuffles.
16871 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16872 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16873 SmallVector<int> SubMask;
16874 std::optional<TTI::ShuffleKind> Res =
16875 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16876 ShufflesRes[Part] = Res;
16877 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16878 }
16879 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16880 return Res.has_value();
16881 }))
16882 ShufflesRes.clear();
16883 return ShufflesRes;
16884}
16885
16886std::optional<TargetTransformInfo::ShuffleKind>
16887BoUpSLP::isGatherShuffledSingleRegisterEntry(
16888 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16889 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16890 Entries.clear();
16891 // TODO: currently checking only for Scalars in the tree entry, need to count
16892 // reused elements too for better cost estimation.
16893 auto GetUserEntry = [&](const TreeEntry *TE) {
16894 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16895 TE = TE->UserTreeIndex.UserTE;
16896 if (TE == VectorizableTree.front().get())
16897 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16898 return TE->UserTreeIndex;
16899 };
16900 auto HasGatherUser = [&](const TreeEntry *TE) {
16901 while (TE->Idx != 0 && TE->UserTreeIndex) {
16902 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16903 return true;
16904 TE = TE->UserTreeIndex.UserTE;
16905 }
16906 return false;
16907 };
16908 const EdgeInfo TEUseEI = GetUserEntry(TE);
16909 if (!TEUseEI)
16910 return std::nullopt;
16911 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16912 const BasicBlock *TEInsertBlock = nullptr;
16913 // Main node of PHI entries keeps the correct order of operands/incoming
16914 // blocks.
16915 if (auto *PHI = dyn_cast_or_null<PHINode>(
16916 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16917 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16918 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16919 TEInsertPt = TEInsertBlock->getTerminator();
16920 } else {
16921 TEInsertBlock = TEInsertPt->getParent();
16922 }
16923 if (!DT->isReachableFromEntry(TEInsertBlock))
16924 return std::nullopt;
16925 auto *NodeUI = DT->getNode(TEInsertBlock);
16926 assert(NodeUI && "Should only process reachable instructions");
16927 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16928 auto CheckOrdering = [&](const Instruction *InsertPt) {
16929 // Argument InsertPt is an instruction where vector code for some other
16930 // tree entry (one that shares one or more scalars with TE) is going to be
16931 // generated. This lambda returns true if insertion point of vector code
16932 // for the TE dominates that point (otherwise dependency is the other way
16933 // around). The other node is not limited to be of a gather kind. Gather
16934 // nodes are not scheduled and their vector code is inserted before their
16935 // first user. If user is PHI, that is supposed to be at the end of a
16936 // predecessor block. Otherwise it is the last instruction among scalars of
16937 // the user node. So, instead of checking dependency between instructions
16938 // themselves, we check dependency between their insertion points for vector
16939 // code (since each scalar instruction ends up as a lane of a vector
16940 // instruction).
16941 const BasicBlock *InsertBlock = InsertPt->getParent();
16942 auto *NodeEUI = DT->getNode(InsertBlock);
16943 if (!NodeEUI)
16944 return false;
16945 assert((NodeUI == NodeEUI) ==
16946 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16947 "Different nodes should have different DFS numbers");
16948 // Check the order of the gather nodes users.
16949 if (TEInsertPt->getParent() != InsertBlock &&
16950 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16951 return false;
16952 if (TEInsertPt->getParent() == InsertBlock &&
16953 TEInsertPt->comesBefore(InsertPt))
16954 return false;
16955 return true;
16956 };
16957 // Find all tree entries used by the gathered values. If no common entries
16958 // found - not a shuffle.
16959 // Here we build a set of tree nodes for each gathered value and try to
16960 // find the intersection between these sets. If we have at least one common
16961 // tree node for each gathered value - we have just a permutation of the
16962 // single vector. If we have 2 different sets, we're in situation where we
16963 // have a permutation of 2 input vectors.
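// For example (illustrative): for VL = {a, b, c, d}, if a and b appear only
// in tree entry E1 while c and d appear only in E2, UsedTEs becomes
// {{E1}, {E2}} and the gather is treated as a permutation of two inputs;
// values that match neither set are simply left to be gathered.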
16964 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16965 SmallDenseMap<Value *, int> UsedValuesEntry;
16966 SmallPtrSet<const Value *, 16> VisitedValue;
16967 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16968 // The node is reused - exit.
16969 if ((TEPtr->getVectorFactor() != VL.size() &&
16970 TEPtr->Scalars.size() != VL.size()) ||
16971 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16972 return false;
16973 UsedTEs.clear();
16974 UsedTEs.emplace_back().insert(TEPtr);
16975 for (Value *V : VL) {
16976 if (isConstant(V))
16977 continue;
16978 UsedValuesEntry.try_emplace(V, 0);
16979 }
16980 return true;
16981 };
16982 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16983 unsigned EdgeIdx) {
16984 const TreeEntry *Ptr1 = User1;
16985 const TreeEntry *Ptr2 = User2;
16986 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16987 while (Ptr2) {
16988 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16989 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16990 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16991 }
16992 while (Ptr1) {
16993 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16994 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16995 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16996 return Idx < It->second;
16997 }
16998 return false;
16999 };
17000 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
17001 Instruction *InsertPt) {
17002 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
17003 !TEUseEI.UserTE->isCopyableElement(
17004 const_cast<Instruction *>(TEInsertPt)) &&
17005 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17006 InsertPt->getNextNode() == TEInsertPt &&
17007 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
17008 !isUsedOutsideBlock(InsertPt));
17009 };
17010 for (Value *V : VL) {
17011 if (isConstant(V) || !VisitedValue.insert(V).second)
17012 continue;
17013 // Build a list of tree entries where V is used.
17014 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17015 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
17016 if (TEPtr == TE || TEPtr->Idx == 0)
17017 continue;
17018 assert(any_of(TEPtr->Scalars,
17019 [&](Value *V) { return GatheredScalars.contains(V); }) &&
17020 "Must contain at least single gathered value.");
17021 assert(TEPtr->UserTreeIndex &&
17022 "Expected only single user of a gather node.");
17023 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17024
17025 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17026 UseEI.UserTE->hasState())
17027 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
17028 : nullptr;
17029 Instruction *InsertPt =
17030 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
17031 : &getLastInstructionInBundle(UseEI.UserTE);
17032 if (TEInsertPt == InsertPt) {
17033 // Check nodes, which might be emitted first.
17034 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17035 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17036 TEUseEI.UserTE->isAltShuffle()) &&
17037 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
17038 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17039 (UseEI.UserTE->hasState() &&
17040 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17041 !UseEI.UserTE->isAltShuffle()) ||
17042 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
17043 continue;
17044 }
17045
17046 // If the schedulable insertion point is used in multiple entries - just
17047 // exit, no known ordering at this point, available only after real
17048 // scheduling.
17049 if (!doesNotNeedToBeScheduled(InsertPt) &&
17050 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17051 continue;
17052 // If the users are the PHI nodes with the same incoming blocks - skip.
17053 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17054 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17055 UseEI.UserTE->State == TreeEntry::Vectorize &&
17056 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17057 TEUseEI.UserTE != UseEI.UserTE)
17058 continue;
17059 // If 2 gathers are operands of the same entry (regardless of whether
17060 // user is PHI or else), compare operands indices, use the earlier one
17061 // as the base.
17062 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17063 continue;
17064 // If the user instruction is used for some reason in different
17065 // vectorized nodes - make it depend on index.
17066 if (TEUseEI.UserTE != UseEI.UserTE &&
17067 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17068 HasGatherUser(TEUseEI.UserTE)))
17069 continue;
17070 // If the user node is the operand of the other user node - skip.
17071 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17072 continue;
17073 }
17074
17075 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17076 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17077 UseEI.UserTE->doesNotNeedToSchedule() &&
17078 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
17079 continue;
17080 // Check if the user node of the TE comes after user node of TEPtr,
17081 // otherwise TEPtr depends on TE.
17082 if ((TEInsertBlock != InsertPt->getParent() ||
17083 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17084 (!CheckOrdering(InsertPt) ||
17085 (UseEI.UserTE->hasCopyableElements() &&
17086 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17087 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
17088 continue;
17089 // The node is reused - exit.
17090 if (CheckAndUseSameNode(TEPtr))
17091 break;
17092 // Is the parent node copyable, with its last instruction used outside the
17093 // block, and is that instruction the one right after TEPtr's last
17094 // instruction? If so, skip this entry to preserve the def-use chain.
17095 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17096 continue;
17097 VToTEs.insert(TEPtr);
17098 }
17099 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17100 const auto *It = find_if(
17101 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
17102 if (It != VTEs.end()) {
17103 const TreeEntry *VTE = *It;
17104 if (none_of(TE->CombinedEntriesWithIndices,
17105 [&](const auto &P) { return P.first == VTE->Idx; })) {
17106 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17107 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17108 continue;
17109 }
17110 // The node is reused - exit.
17111 if (CheckAndUseSameNode(VTE))
17112 break;
17113 VToTEs.insert(VTE);
17114 }
17115 }
17116 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17117 const TreeEntry *VTE = VTEs.front();
17118 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17119 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17120 VTEs = VTEs.drop_front();
17121 // Iterate through all vectorized nodes.
17122 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
17123 return MTE->State == TreeEntry::Vectorize;
17124 });
17125 if (MIt == VTEs.end())
17126 continue;
17127 VTE = *MIt;
17128 }
17129 if (none_of(TE->CombinedEntriesWithIndices,
17130 [&](const auto &P) { return P.first == VTE->Idx; })) {
17131 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17132 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
17133 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17134 continue;
17135 }
17136 // The node is reused - exit.
17137 if (CheckAndUseSameNode(VTE))
17138 break;
17139 VToTEs.insert(VTE);
17140 }
17141 if (VToTEs.empty())
17142 continue;
17143 if (UsedTEs.empty()) {
17144 // The first iteration, just insert the list of nodes to vector.
17145 UsedTEs.push_back(VToTEs);
17146 UsedValuesEntry.try_emplace(V, 0);
17147 } else {
17148 // Need to check if there are any previously used tree nodes which use V.
17149 // If there are no such nodes, consider that we have one more input
17150 // vector.
17151 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17152 unsigned Idx = 0;
17153 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17154 // Do we have a non-empty intersection of previously listed tree entries
17155 // and tree entries using current V?
17156 set_intersect(VToTEs, Set);
17157 if (!VToTEs.empty()) {
17158 // Yes, write the new subset and continue analysis for the next
17159 // scalar.
17160 Set.swap(VToTEs);
17161 break;
17162 }
17163 VToTEs = SavedVToTEs;
17164 ++Idx;
17165 }
17166 // No non-empty intersection found - need to add a second set of possible
17167 // source vectors.
17168 if (Idx == UsedTEs.size()) {
17169 // If the number of input vectors is greater than 2 - not a permutation,
17170 // fallback to the regular gather.
17171 // TODO: support multiple reshuffled nodes.
17172 if (UsedTEs.size() == 2)
17173 continue;
17174 UsedTEs.push_back(SavedVToTEs);
17175 Idx = UsedTEs.size() - 1;
17176 }
17177 UsedValuesEntry.try_emplace(V, Idx);
17178 }
17179 }
17180
17181 if (UsedTEs.empty()) {
17182 Entries.clear();
17183 return std::nullopt;
17184 }
17185
17186 unsigned VF = 0;
17187 if (UsedTEs.size() == 1) {
17188 // Keep the order to avoid non-determinism.
17189 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17190 UsedTEs.front().end());
17191 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17192 return TE1->Idx < TE2->Idx;
17193 });
17194 // Try to find the perfect match in another gather node at first.
17195 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17196 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17197 });
17198 if (It != FirstEntries.end() &&
17199 ((*It)->getVectorFactor() == VL.size() ||
17200 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17201 TE->ReuseShuffleIndices.size() == VL.size() &&
17202 (*It)->isSame(TE->Scalars)))) {
17203 Entries.push_back(*It);
17204 if ((*It)->getVectorFactor() == VL.size()) {
17205 std::iota(std::next(Mask.begin(), Part * VL.size()),
17206 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17207 } else {
17208 SmallVector<int> CommonMask = TE->getCommonMask();
17209 copy(CommonMask, Mask.begin());
17210 }
17211 // Clear undef scalars.
17212 for (unsigned I : seq<unsigned>(VL.size()))
17213 if (isa<PoisonValue>(VL[I]))
17214 Mask[Part * VL.size() + I] = PoisonMaskElem;
17215 return TargetTransformInfo::SK_PermuteSingleSrc;
17216 }
17217 // No perfect match, just shuffle, so choose the first tree node from the
17218 // tree.
17219 Entries.push_back(FirstEntries.front());
17220 // Update mapping between values and corresponding tree entries.
17221 for (auto &P : UsedValuesEntry)
17222 P.second = 0;
17223 VF = FirstEntries.front()->getVectorFactor();
17224 } else {
17225 // Try to find nodes with the same vector factor.
17226 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17227 // Keep the order of tree nodes to avoid non-determinism.
17228 DenseMap<int, const TreeEntry *> VFToTE;
17229 for (const TreeEntry *TE : UsedTEs.front()) {
17230 unsigned VF = TE->getVectorFactor();
17231 auto It = VFToTE.find(VF);
17232 if (It != VFToTE.end()) {
17233 if (It->second->Idx > TE->Idx)
17234 It->getSecond() = TE;
17235 continue;
17236 }
17237 VFToTE.try_emplace(VF, TE);
17238 }
17239 // Same, keep the order to avoid non-determinism.
17240 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17241 UsedTEs.back().end());
17242 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17243 return TE1->Idx < TE2->Idx;
17244 });
17245 for (const TreeEntry *TE : SecondEntries) {
17246 auto It = VFToTE.find(TE->getVectorFactor());
17247 if (It != VFToTE.end()) {
17248 VF = It->first;
17249 Entries.push_back(It->second);
17250 Entries.push_back(TE);
17251 break;
17252 }
17253 }
17254 // No 2 source vectors with the same vector factor - just choose 2 with max
17255 // index.
17256 if (Entries.empty()) {
17257 Entries.push_back(*llvm::max_element(
17258 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17259 return TE1->Idx < TE2->Idx;
17260 }));
17261 Entries.push_back(SecondEntries.front());
17262 VF = std::max(Entries.front()->getVectorFactor(),
17263 Entries.back()->getVectorFactor());
17264 } else {
17265 VF = Entries.front()->getVectorFactor();
17266 }
17267 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17268 for (const TreeEntry *E : Entries)
17269 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17270 E->Scalars.end());
17271 // Update mapping between values and corresponding tree entries.
17272 for (auto &P : UsedValuesEntry) {
17273 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17274 if (ValuesToEntries[Idx].contains(P.first)) {
17275 P.second = Idx;
17276 break;
17277 }
17278 }
17279 }
17280
17281 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17282 // Checks if the 2 PHIs are compatible in terms of high possibility to be
17283 // vectorized.
17284 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17285 auto *PHI = cast<PHINode>(V);
17286 auto *PHI1 = cast<PHINode>(V1);
17287 // Check that all incoming values are compatible/from same parent (if they
17288 // are instructions).
17289 // The incoming values are compatible if they all are constants, or
17290 // instruction with the same/alternate opcodes from the same basic block.
17291 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17292 Value *In = PHI->getIncomingValue(I);
17293 Value *In1 = PHI1->getIncomingValue(I);
17294 if (isConstant(In) && isConstant(In1))
17295 continue;
17296 if (!getSameOpcode({In, In1}, *TLI))
17297 return false;
17298 if (cast<Instruction>(In)->getParent() !=
17299 cast<Instruction>(In1)->getParent())
17300 return false;
17301 }
17302 return true;
17303 };
17304 // Check if the value can be ignored during analysis for shuffled gathers.
17305 // We suppose it is better to ignore instructions that do not form splats,
17306 // are not vectorized and are not extractelements (those are handled by the
17307 // extractelement processing), or that may form a vector node in the future.
17308 auto MightBeIgnored = [=](Value *V) {
17309 auto *I = dyn_cast<Instruction>(V);
17310 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17311 !isVectorLikeInstWithConstOps(I) &&
17312 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17313 };
17314 // Check that the neighbor instruction may form a full vector node with the
17315 // current instruction V. It is possible, if they have same/alternate opcode
17316 // and same parent basic block.
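// For example (illustrative): two adjacent, not-yet-vectorized adds from the
// same basic block that do not belong to the same source entry are both
// treated as ignorable below; they get no lane in the shuffle mask and stay
// available to form their own vector node in a later attempt.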
17317 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17318 Value *V1 = VL[Idx];
17319 bool UsedInSameVTE = false;
17320 auto It = UsedValuesEntry.find(V1);
17321 if (It != UsedValuesEntry.end())
17322 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17323 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17324 getSameOpcode({V, V1}, *TLI) &&
17325 cast<Instruction>(V)->getParent() ==
17326 cast<Instruction>(V1)->getParent() &&
17327 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17328 };
17329 // Build a shuffle mask for better cost estimation and vector emission.
17330 SmallBitVector UsedIdxs(Entries.size());
17331 SmallVector<std::pair<unsigned, int>> EntryLanes;
17332 for (int I = 0, E = VL.size(); I < E; ++I) {
17333 Value *V = VL[I];
17334 auto It = UsedValuesEntry.find(V);
17335 if (It == UsedValuesEntry.end())
17336 continue;
17337 // Do not try to shuffle scalars, if they are constants, or instructions
17338 // that can be vectorized as a result of the following vector build
17339 // vectorization.
17340 if (isConstant(V) || (MightBeIgnored(V) &&
17341 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17342 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17343 continue;
17344 unsigned Idx = It->second;
17345 EntryLanes.emplace_back(Idx, I);
17346 UsedIdxs.set(Idx);
17347 }
17348 // Iterate through all shuffled scalars and select entries, which can be used
17349 // for final shuffle.
17350 SmallVector<const TreeEntry *> TempEntries;
17351 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17352 if (!UsedIdxs.test(I))
17353 continue;
17354 // Fix the entry number for the given scalar. If it is the first entry, set
17355 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17356 // These indices are used when calculating final shuffle mask as the vector
17357 // offset.
17358 for (std::pair<unsigned, int> &Pair : EntryLanes)
17359 if (Pair.first == I)
17360 Pair.first = TempEntries.size();
17361 TempEntries.push_back(Entries[I]);
17362 }
17363 Entries.swap(TempEntries);
17364 if (EntryLanes.size() == Entries.size() &&
17365 !VL.equals(ArrayRef(TE->Scalars)
17366 .slice(Part * VL.size(),
17367 std::min<int>(VL.size(), TE->Scalars.size())))) {
17368 // We may have here 1 or 2 entries only. If the number of scalars is equal
17369 // to the number of entries, no need to do the analysis, it is not very
17370 // profitable. Since VL is not the same as TE->Scalars, it means we already
17371 // have some shuffles before. Cut off the non-profitable case.
17372 Entries.clear();
17373 return std::nullopt;
17374 }
17375 // Build the final mask, check for the identity shuffle, if possible.
17376 bool IsIdentity = Entries.size() == 1;
17377 // Pair.first is the offset to the vector, while Pair.second is the index of
17378 // scalar in the list.
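// For example (illustrative): with two source entries of vector factor 4, a
// scalar found at lane 1 of the second entry gets the mask value
// 1 * 4 + 1 = 5.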
17379 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17380 unsigned Idx = Part * VL.size() + Pair.second;
17381 Mask[Idx] =
17382 Pair.first * VF +
17383 (ForOrder ? std::distance(
17384 Entries[Pair.first]->Scalars.begin(),
17385 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17386 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17387 IsIdentity &= Mask[Idx] == Pair.second;
17388 }
17389 if (ForOrder || IsIdentity || Entries.empty()) {
17390 switch (Entries.size()) {
17391 case 1:
17392 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17393 return TargetTransformInfo::SK_PermuteSingleSrc;
17394 break;
17395 case 2:
17396 if (EntryLanes.size() > 2 || VL.size() <= 2)
17397 return TargetTransformInfo::SK_PermuteTwoSrc;
17398 break;
17399 default:
17400 break;
17401 }
17402 } else if (!isa<VectorType>(VL.front()->getType()) &&
17403 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17404 // Do the cost estimation to check whether the shuffle is more beneficial than a buildvector.
17405 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17406 std::next(Mask.begin(), (Part + 1) * VL.size()));
17407 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17408 for (int Idx : SubMask) {
17409 if (Idx == PoisonMaskElem)
17410 continue;
17411 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17412 MinElement = Idx;
17413 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17414 MaxElement = Idx;
17415 }
17416 assert(MaxElement >= 0 && MinElement >= 0 &&
17417 MaxElement % VF >= MinElement % VF &&
17418 "Expected at least single element.");
17419 unsigned NewVF = std::max<unsigned>(
17420 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17421 (MaxElement % VF) -
17422 (MinElement % VF) + 1));
17423 if (NewVF < VF) {
17424 for (int &Idx : SubMask) {
17425 if (Idx == PoisonMaskElem)
17426 continue;
17427 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17428 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17429 }
17430 } else {
17431 NewVF = VF;
17432 }
17433
17435 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17436 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17437 auto GetShuffleCost = [&,
17438 &TTI = *TTI](ArrayRef<int> Mask,
17439 ArrayRef<const TreeEntry *> Entries,
17440 VectorType *VecTy) -> InstructionCost {
17441 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17442 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17443 Mask, Entries.front()->getInterleaveFactor()))
17444 return TTI::TCC_Free;
17445 return ::getShuffleCost(TTI,
17446 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17447 : TTI::SK_PermuteSingleSrc,
17448 VecTy, Mask, CostKind);
17449 };
17450 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17451 InstructionCost FirstShuffleCost = 0;
17452 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17453 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17454 FirstShuffleCost = ShuffleCost;
17455 } else {
17456 // Transform the mask to include only the first entry.
17457 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17458 bool IsIdentity = true;
17459 for (auto [I, Idx] : enumerate(FirstMask)) {
17460 if (Idx >= static_cast<int>(NewVF)) {
17461 Idx = PoisonMaskElem;
17462 } else {
17463 DemandedElts.clearBit(I);
17464 if (Idx != PoisonMaskElem)
17465 IsIdentity &= static_cast<int>(I) == Idx;
17466 }
17467 }
17468 if (!IsIdentity)
17469 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17470 FirstShuffleCost += getScalarizationOverhead(
17471 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17472 /*Extract=*/false, CostKind);
17473 }
17474 InstructionCost SecondShuffleCost = 0;
17475 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17476 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17477 SecondShuffleCost = ShuffleCost;
17478 } else {
17479 // Transform the mask to include only the second entry.
17480 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17481 bool IsIdentity = true;
17482 for (auto [I, Idx] : enumerate(SecondMask)) {
17483 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17484 Idx = PoisonMaskElem;
17485 } else {
17486 DemandedElts.clearBit(I);
17487 if (Idx != PoisonMaskElem) {
17488 Idx -= NewVF;
17489 IsIdentity &= static_cast<int>(I) == Idx;
17490 }
17491 }
17492 }
17493 if (!IsIdentity)
17494 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17495 SecondShuffleCost += getScalarizationOverhead(
17496 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17497 /*Extract=*/false, CostKind);
17498 }
17499 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17500 for (auto [I, Idx] : enumerate(SubMask))
17501 if (Idx == PoisonMaskElem)
17502 DemandedElts.clearBit(I);
17503 InstructionCost BuildVectorCost = getScalarizationOverhead(
17504 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17505 /*Extract=*/false, CostKind);
17506 const TreeEntry *BestEntry = nullptr;
17507 if (FirstShuffleCost < ShuffleCost) {
17508 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17509 std::next(Mask.begin(), (Part + 1) * VL.size()),
17510 [&](int &Idx) {
17511 if (Idx >= static_cast<int>(VF))
17512 Idx = PoisonMaskElem;
17513 });
17514 BestEntry = Entries.front();
17515 ShuffleCost = FirstShuffleCost;
17516 }
17517 if (SecondShuffleCost < ShuffleCost) {
17518 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17519 std::next(Mask.begin(), (Part + 1) * VL.size()),
17520 [&](int &Idx) {
17521 if (Idx < static_cast<int>(VF))
17522 Idx = PoisonMaskElem;
17523 else
17524 Idx -= VF;
17525 });
17526 BestEntry = Entries[1];
17527 ShuffleCost = SecondShuffleCost;
17528 }
17529 if (BuildVectorCost >= ShuffleCost) {
17530 if (BestEntry) {
17531 Entries.clear();
17532 Entries.push_back(BestEntry);
17533 }
17534 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17535 : TargetTransformInfo::SK_PermuteSingleSrc;
17536 }
17537 }
17538 Entries.clear();
17539 // Clear the corresponding mask elements.
17540 std::fill(std::next(Mask.begin(), Part * VL.size()),
17541 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17542 return std::nullopt;
17543}
17544
17545 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
17546 BoUpSLP::isGatherShuffledEntry(
17547 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17548 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17549 bool ForOrder) {
17550 assert(NumParts > 0 && NumParts < VL.size() &&
17551 "Expected positive number of registers.");
17552 Entries.clear();
17553 // No need to check for the topmost gather node.
17554 if (TE == VectorizableTree.front().get() &&
17555 (!GatheredLoadsEntriesFirst.has_value() ||
17556 none_of(ArrayRef(VectorizableTree).drop_front(),
17557 [](const std::unique_ptr<TreeEntry> &TE) {
17558 return !TE->isGather();
17559 })))
17560 return {};
17561 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17562 // implemented yet.
17563 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17564 return {};
17565 Mask.assign(VL.size(), PoisonMaskElem);
17566 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17567 "Expected only single user of the gather node.");
17568 assert(VL.size() % NumParts == 0 &&
17569 "Number of scalars must be divisible by NumParts.");
17570 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17571 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17572 (TE->Idx == 0 ||
17573 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17574 isSplat(TE->Scalars) ||
17575 (TE->hasState() &&
17576 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17577 return {};
17578 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17579 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17580 for (unsigned Part : seq<unsigned>(NumParts)) {
17581 ArrayRef<Value *> SubVL =
17582 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17583 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17584 std::optional<TTI::ShuffleKind> SubRes =
17585 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17586 ForOrder);
17587 if (!SubRes)
17588 SubEntries.clear();
17589 Res.push_back(SubRes);
17590 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17591 SubEntries.front()->getVectorFactor() == VL.size() &&
17592 (SubEntries.front()->isSame(TE->Scalars) ||
17593 SubEntries.front()->isSame(VL))) {
17594 SmallVector<const TreeEntry *> LocalSubEntries;
17595 LocalSubEntries.swap(SubEntries);
17596 Entries.clear();
17597 Res.clear();
17598 std::iota(Mask.begin(), Mask.end(), 0);
17599 // Clear undef scalars.
17600 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17601 if (isa<PoisonValue>(VL[I]))
17602 Mask[I] = PoisonMaskElem;
17603 Entries.emplace_back(1, LocalSubEntries.front());
17604 Res.push_back(TTI::SK_PermuteSingleSrc);
17605 return Res;
17606 }
17607 }
17608 if (all_of(Res,
17609 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17610 Entries.clear();
17611 return {};
17612 }
17613 return Res;
17614}
17615
17616InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17617 Type *ScalarTy) const {
17618 const unsigned VF = VL.size();
17619 auto *VecTy = getWidenedType(ScalarTy, VF);
17620 // Find the cost of inserting/extracting values from the vector.
17621 // Check if the same elements are inserted several times and count them as
17622 // shuffle candidates.
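// For example (illustrative): for VL = {%x, 3, %y, 5} with VF = 4 and a
// non-poison source, the non-constant lanes 0 and 2 are marked as demanded
// (insert cost), while the constants turn ConstantShuffleMask into
// {0, 5, 2, 7}, i.e. a blend of the inserted values with a materialized
// constant vector.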
17623 APInt DemandedElements = APInt::getZero(VF);
17624 InstructionCost Cost;
17625 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17626 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17627 DemandedElements.setBit(I);
17628 if (V->getType() != ScalarTy)
17629 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17630 TTI::CastContextHint::None, CostKind);
17631 };
17632 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17633 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17634 for (auto [I, V] : enumerate(VL)) {
17635 // No need to shuffle duplicates for constants.
17636 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17637 continue;
17638
17639 if (isConstant(V)) {
17640 ConstantShuffleMask[I] = I + VF;
17641 continue;
17642 }
17643 EstimateInsertCost(I, V);
17644 }
17645 // FIXME: add a cost for constant vector materialization.
17646 bool IsAnyNonUndefConst =
17647 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17648 // 1. Shuffle input source vector and constant vector.
17649 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17650 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
17651 ConstantShuffleMask);
17652 }
17653
17654 // 2. Insert unique non-constants.
17655 if (!DemandedElements.isZero())
17656 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17657 /*Insert=*/true,
17658 /*Extract=*/false, CostKind,
17659 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17660 return Cost;
17661}
17662
17663Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17664 auto It = EntryToLastInstruction.find(E);
17665 if (It != EntryToLastInstruction.end())
17666 return *cast<Instruction>(It->second);
17667 Instruction *Res = nullptr;
17668 // Get the basic block this bundle is in. All instructions in the bundle
17669 // should be in this block (except for extractelement-like instructions with
17670 // constant indices or gathered loads or copyables).
17671 Instruction *Front;
17672 unsigned Opcode;
17673 if (E->hasState()) {
17674 Front = E->getMainOp();
17675 Opcode = E->getOpcode();
17676 } else {
17677 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17678 Opcode = Front->getOpcode();
17679 }
17680 auto *BB = Front->getParent();
17681 assert(
17682 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17683 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17684 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17685 all_of(E->Scalars,
17686 [=](Value *V) -> bool {
17687 if (Opcode == Instruction::GetElementPtr &&
17688 !isa<GetElementPtrInst>(V))
17689 return true;
17690 auto *I = dyn_cast<Instruction>(V);
17691 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17692 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17693 })) &&
17694 "Expected gathered loads or GEPs or instructions from same basic "
17695 "block.");
17696
17697 auto FindLastInst = [&]() {
17698 Instruction *LastInst = Front;
17699 for (Value *V : E->Scalars) {
17700 auto *I = dyn_cast<Instruction>(V);
17701 if (!I)
17702 continue;
17703 if (E->isCopyableElement(I))
17704 continue;
17705 if (LastInst->getParent() == I->getParent()) {
17706 if (LastInst->comesBefore(I))
17707 LastInst = I;
17708 continue;
17709 }
17710 assert(((Opcode == Instruction::GetElementPtr &&
17711 !isa<GetElementPtrInst>(I)) ||
17712 E->State == TreeEntry::SplitVectorize ||
17713 (isVectorLikeInstWithConstOps(LastInst) &&
17714 isVectorLikeInstWithConstOps(I)) ||
17715 (GatheredLoadsEntriesFirst.has_value() &&
17716 Opcode == Instruction::Load && E->isGather() &&
17717 E->Idx < *GatheredLoadsEntriesFirst)) &&
17718 "Expected vector-like or non-GEP in GEP node insts only.");
17719 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17720 LastInst = I;
17721 continue;
17722 }
17723 if (!DT->isReachableFromEntry(I->getParent()))
17724 continue;
17725 auto *NodeA = DT->getNode(LastInst->getParent());
17726 auto *NodeB = DT->getNode(I->getParent());
17727 assert(NodeA && "Should only process reachable instructions");
17728 assert(NodeB && "Should only process reachable instructions");
17729 assert((NodeA == NodeB) ==
17730 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17731 "Different nodes should have different DFS numbers");
17732 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17733 LastInst = I;
17734 }
17735 BB = LastInst->getParent();
17736 return LastInst;
17737 };
17738
17739 auto FindFirstInst = [&]() {
17740 Instruction *FirstInst = Front;
17741 for (Value *V : E->Scalars) {
17742 auto *I = dyn_cast<Instruction>(V);
17743 if (!I)
17744 continue;
17745 if (E->isCopyableElement(I))
17746 continue;
17747 if (FirstInst->getParent() == I->getParent()) {
17748 if (I->comesBefore(FirstInst))
17749 FirstInst = I;
17750 continue;
17751 }
17752 assert(((Opcode == Instruction::GetElementPtr &&
17753 !isa<GetElementPtrInst>(I)) ||
17754 (isVectorLikeInstWithConstOps(FirstInst) &&
17756 "Expected vector-like or non-GEP in GEP node insts only.");
17757 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17758 FirstInst = I;
17759 continue;
17760 }
17761 if (!DT->isReachableFromEntry(I->getParent()))
17762 continue;
17763 auto *NodeA = DT->getNode(FirstInst->getParent());
17764 auto *NodeB = DT->getNode(I->getParent());
17765 assert(NodeA && "Should only process reachable instructions");
17766 assert(NodeB && "Should only process reachable instructions");
17767 assert((NodeA == NodeB) ==
17768 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17769 "Different nodes should have different DFS numbers");
17770 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17771 FirstInst = I;
17772 }
17773 return FirstInst;
17774 };
17775
17776 if (E->State == TreeEntry::SplitVectorize) {
17777 Res = FindLastInst();
17778 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17779 for (auto *E : Entries) {
17780 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17781 if (!I)
17782 I = &getLastInstructionInBundle(E);
17783 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17784 Res = I;
17785 }
17786 }
17787 EntryToLastInstruction.try_emplace(E, Res);
17788 return *Res;
17789 }
17790
17791 // Set insertpoint for gathered loads to the very first load.
17792 if (GatheredLoadsEntriesFirst.has_value() &&
17793 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17794 Opcode == Instruction::Load) {
17795 Res = FindFirstInst();
17796 EntryToLastInstruction.try_emplace(E, Res);
17797 return *Res;
17798 }
17799
17800 // Set the insert point to the beginning of the basic block if the entry
17801 // should not be scheduled.
17802 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17803 if (E->isGather())
17804 return nullptr;
17805 // Found previously that the instructions do not need to be scheduled.
17806 const auto *It = BlocksSchedules.find(BB);
17807 if (It == BlocksSchedules.end())
17808 return nullptr;
17809 for (Value *V : E->Scalars) {
17810 auto *I = dyn_cast<Instruction>(V);
17811 if (!I || isa<PHINode>(I) ||
17812 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17813 continue;
17814 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17815 if (Bundles.empty())
17816 continue;
17817 const auto *It = find_if(
17818 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17819 if (It != Bundles.end())
17820 return *It;
17821 }
17822 return nullptr;
17823 };
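// If the node is vectorized but has no schedule bundle, the insertion point
// below is chosen heuristically: roughly, nodes whose scalars are poison,
// copyable or only used outside their block (and GEP nodes mixing GEPs with
// other instructions) are anchored at the last instruction found, all other
// unscheduled nodes at the first one.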
17824 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17825 if (!E->isGather() && !Bundle) {
17826 if ((Opcode == Instruction::GetElementPtr &&
17827 any_of(E->Scalars,
17828 [](Value *V) {
17829 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17830 })) ||
17831 (all_of(E->Scalars,
17832 [&](Value *V) {
17833 return isa<PoisonValue>(V) ||
17834 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17835 E->isCopyableElement(V) ||
17836 (!isVectorLikeInstWithConstOps(V) &&
17837 isUsedOutsideBlock(V));
17838 }) &&
17839 (!E->doesNotNeedToSchedule() ||
17840 any_of(E->Scalars,
17841 [&](Value *V) {
17842 if (!isa<Instruction>(V) ||
17843 (E->hasCopyableElements() && E->isCopyableElement(V)))
17844 return false;
17845 return !areAllOperandsNonInsts(V);
17846 }) ||
17847 none_of(E->Scalars, [&](Value *V) {
17848 if (!isa<Instruction>(V) ||
17849 (E->hasCopyableElements() && E->isCopyableElement(V)))
17850 return false;
17851 return MustGather.contains(V);
17852 }))))
17853 Res = FindLastInst();
17854 else
17855 Res = FindFirstInst();
17856 EntryToLastInstruction.try_emplace(E, Res);
17857 return *Res;
17858 }
17859
17860 // Find the last instruction. The common case should be that BB has been
17861 // scheduled, and the last instruction is VL.back(). So we start with
17862 // VL.back() and iterate over schedule data until we reach the end of the
17863 // bundle. The end of the bundle is marked by null ScheduleData.
17864 if (Bundle) {
17865 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17866 Res = Bundle->getBundle().back()->getInst();
17867 EntryToLastInstruction.try_emplace(E, Res);
17868 return *Res;
17869 }
17870
17871 // Res can still be null at this point if there's either not an entry
17872 // for BB in BlocksSchedules or there's no ScheduleData available for
17873 // VL.back(). This can be the case if buildTreeRec aborts for various
17874 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17875 // size is reached, etc.). ScheduleData is initialized in the scheduling
17876 // "dry-run".
17877 //
17878 // If this happens, we can still find the last instruction by brute force. We
17879 // iterate forwards from Front (inclusive) until we either see all
17880 // instructions in the bundle or reach the end of the block. If Front is the
17881 // last instruction in program order, LastInst will be set to Front, and we
17882 // will visit all the remaining instructions in the block.
17883 //
17884 // One of the reasons we exit early from buildTreeRec is to place an upper
17885 // bound on compile-time. Thus, taking an additional compile-time hit here is
17886 // not ideal. However, this should be exceedingly rare since it requires that
17887 // we both exit early from buildTreeRec and that the bundle be out-of-order
17888 // (causing us to iterate all the way to the end of the block).
17889 if (!Res)
17890 Res = FindLastInst();
17891 assert(Res && "Failed to find last instruction in bundle");
17892 EntryToLastInstruction.try_emplace(E, Res);
17893 return *Res;
17894}
17895
17896void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17897 auto *Front = E->getMainOp();
17898 Instruction *LastInst = &getLastInstructionInBundle(E);
17899 assert(LastInst && "Failed to find last instruction in bundle");
17900 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17901 // If the instruction is PHI, set the insert point after all the PHIs.
17902 bool IsPHI = isa<PHINode>(LastInst);
17903 if (IsPHI) {
17904 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17905 if (LastInstIt != LastInst->getParent()->end() &&
17906 LastInstIt->getParent()->isLandingPad())
17907 LastInstIt = std::next(LastInstIt);
17908 }
17909 if (IsPHI ||
17910 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17911 (E->doesNotNeedToSchedule() ||
17912 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
17913 isUsedOutsideBlock(LastInst)))) ||
17914 (GatheredLoadsEntriesFirst.has_value() &&
17915 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17916 E->getOpcode() == Instruction::Load)) {
17917 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17918 } else {
17919 // Set the insertion point after the last instruction in the bundle. Set the
17920 // debug location to Front.
17921 Builder.SetInsertPoint(
17922 LastInst->getParent(),
17923 LastInst->getNextNode()->getIterator());
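// The insertion position right after LastInst is cached in
// LastInstructionToPos: the first time, a throw-away poison load is created
// as a position marker, the builder is pointed at it and the load is
// immediately scheduled for deletion, so later bundles anchored at the same
// LastInst land at exactly the same spot.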
17924 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
17925 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
17926 } else {
17927 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
17928 PoisonValue::get(Builder.getPtrTy()),
17929 MaybeAlign());
17930 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
17931 eraseInstruction(Res);
17932 LastInstructionToPos.try_emplace(LastInst, Res);
17933 }
17934 }
17935 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17936}
17937
17938Value *BoUpSLP::gather(
17939 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17940 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17941 // List of instructions/lanes from current block and/or the blocks which are
17942 // part of the current loop. These instructions will be inserted at the end to
17943 // make it possible to optimize loops and hoist invariant instructions out of
17944 // the loop body with better chances for success.
17945 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17946 SmallSet<int, 4> PostponedIndices;
17947 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17948 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17949 SmallPtrSet<BasicBlock *, 4> Visited;
17950 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17951 InsertBB = InsertBB->getSinglePredecessor();
17952 return InsertBB && InsertBB == InstBB;
17953 };
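// An element is postponed when it is already vectorized, when its block
// reaches the insertion block through a chain of single predecessors, or when
// it lives in the loop containing the insertion point (with a loop-invariant
// root), so loop-dependent inserts are emitted last.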
17954 for (int I = 0, E = VL.size(); I < E; ++I) {
17955 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17956 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17957 isVectorized(Inst) ||
17958 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17959 PostponedIndices.insert(I).second)
17960 PostponedInsts.emplace_back(Inst, I);
17961 }
17962
17963 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17964 Type *Ty) {
17965 Value *Scalar = V;
17966 if (Scalar->getType() != Ty) {
17967 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17968 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17969 Value *V = Scalar;
17970 if (auto *CI = dyn_cast<CastInst>(Scalar);
17971 isa_and_present<SExtInst, ZExtInst>(CI)) {
17972 Value *Op = CI->getOperand(0);
17973 if (auto *IOp = dyn_cast<Instruction>(Op);
17974 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17975 V = Op;
17976 }
17977 Scalar = Builder.CreateIntCast(
17978 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17979 }
17980
17981 Instruction *InsElt;
17982 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17983 assert(SLPReVec && "FixedVectorType is not expected.");
17984 Vec =
17985 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17986 auto *II = dyn_cast<Instruction>(Vec);
17987 if (!II)
17988 return Vec;
17989 InsElt = II;
17990 } else {
17991 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17992 InsElt = dyn_cast<InsertElementInst>(Vec);
17993 if (!InsElt)
17994 return Vec;
17995 }
17996 GatherShuffleExtractSeq.insert(InsElt);
17997 CSEBlocks.insert(InsElt->getParent());
17998 // Add to our 'need-to-extract' list.
17999 if (isa<Instruction>(V)) {
18000 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
18001 // Find which lane we need to extract.
18002 User *UserOp = nullptr;
18003 if (Scalar != V) {
18004 if (auto *SI = dyn_cast<Instruction>(Scalar))
18005 UserOp = SI;
18006 } else {
18007 if (V->getType()->isVectorTy()) {
18008 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
18009 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18010 // Find shufflevector, caused by resize.
18011 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18012 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
18013 if (SV->getOperand(0) == V)
18014 return SV;
18015 if (SV->getOperand(1) == V)
18016 return SV;
18017 }
18018 return nullptr;
18019 };
18020 InsElt = nullptr;
18021 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18022 InsElt = User;
18023 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18024 InsElt = User;
18025 assert(InsElt &&
18026 "Failed to find shufflevector, caused by resize.");
18027 }
18028 }
18029 UserOp = InsElt;
18030 }
18031 if (UserOp) {
18032 unsigned FoundLane = Entries.front()->findLaneForValue(V);
18033 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
18034 }
18035 }
18036 }
18037 return Vec;
18038 };
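// The gather itself is built in three waves below: constants are inserted
// first (and merged with the optional Root through a single shuffle), then
// the remaining non-constant scalars, and finally the postponed
// loop/vectorized values collected above. E.g. for VL = {1.0, %a, 2.0, %b}
// (with %a, %b not postponed) the two constants are inserted before the two
// instruction operands.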
18039 auto *VecTy = getWidenedType(ScalarTy, VL.size());
18040 Value *Vec = PoisonValue::get(VecTy);
18041 SmallVector<int> NonConsts;
18042 SmallVector<int> Mask(VL.size());
18043 std::iota(Mask.begin(), Mask.end(), 0);
18044 Value *OriginalRoot = Root;
18045 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
18046 SV && isa<PoisonValue>(SV->getOperand(1)) &&
18047 SV->getOperand(0)->getType() == VecTy) {
18048 Root = SV->getOperand(0);
18049 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18050 }
18051 // Insert constant values first.
18052 for (int I = 0, E = VL.size(); I < E; ++I) {
18053 if (PostponedIndices.contains(I))
18054 continue;
18055 if (!isConstant(VL[I])) {
18056 NonConsts.push_back(I);
18057 continue;
18058 }
18059 if (isa<PoisonValue>(VL[I]))
18060 continue;
18061 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18062 Mask[I] = I + E;
18063 }
18064 if (Root) {
18065 if (isa<PoisonValue>(Vec)) {
18066 Vec = OriginalRoot;
18067 } else {
18068 Vec = CreateShuffle(Root, Vec, Mask);
18069 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
18070 OI && OI->use_empty() &&
18071 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18072 return TE->VectorizedValue == OI;
18073 }))
18074 eraseInstruction(OI);
18075 }
18076 }
18077 // Insert non-constant values.
18078 for (int I : NonConsts)
18079 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18080 // Append instructions, which are/may be part of the loop, at the end to make
18081 // it possible to hoist non-loop-based instructions.
18082 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18083 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18084
18085 return Vec;
18086}
18087
18088/// Merges shuffle masks and emits final shuffle instruction, if required. It
18089/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
18090/// when the actual shuffle instruction is generated only if this is actually
18091/// required. Otherwise, the shuffle instruction emission is delayed till the
18092/// end of the process, to reduce the number of emitted instructions and further
18093/// analysis/transformations.
18094/// The class also will look through the previously emitted shuffle instructions
18095/// and properly mark indices in mask as undef.
18096/// For example, given the code
18097/// \code
18098/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
18099/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
18100/// \endcode
18101 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
18102/// look through %s1 and %s2 and emit
18103/// \code
18104/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18105/// \endcode
18106/// instead.
18107/// If 2 operands are of different size, the smallest one will be resized and
18108/// the mask recalculated properly.
18109/// For example, given the code
18110/// \code
18111/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
18112/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
18113/// \endcode
18114 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
18115/// look through %s1 and %s2 and emit
18116/// \code
18117/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18118/// \endcode
18119/// instead.
18120class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
18121 bool IsFinalized = false;
18122 /// Combined mask for all applied operands and masks. It is built during
18123 /// analysis and actual emission of shuffle vector instructions.
18124 SmallVector<int> CommonMask;
18125 /// List of operands for the shuffle vector instruction. It holds at most 2
18126 /// operands; if a 3rd one is going to be added, the first 2 are combined into
18127 /// a shuffle with the \p CommonMask mask, the first operand is set to the
18128 /// resulting shuffle and the second operand is set to the newly added
18129 /// operand. The \p CommonMask is transformed in the proper way after that.
18130 SmallVector<Value *, 2> InVectors;
18131 IRBuilderBase &Builder;
18132 BoUpSLP &R;
18133
18134 class ShuffleIRBuilder {
18135 IRBuilderBase &Builder;
18136 /// Holds all of the instructions that we gathered.
18137 SetVector<Instruction *> &GatherShuffleExtractSeq;
18138 /// A list of blocks that we are going to CSE.
18139 DenseSet<BasicBlock *> &CSEBlocks;
18140 /// Data layout.
18141 const DataLayout &DL;
18142
18143 public:
18144 ShuffleIRBuilder(IRBuilderBase &Builder,
18145 SetVector<Instruction *> &GatherShuffleExtractSeq,
18146 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
18147 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18148 CSEBlocks(CSEBlocks), DL(DL) {}
18149 ~ShuffleIRBuilder() = default;
18150 /// Creates shufflevector for the 2 operands with the given mask.
18151 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
18152 if (V1->getType() != V2->getType()) {
18153 assert(V2->getType()->isIntOrIntVectorTy() &&
18154 V1->getType()->isIntOrIntVectorTy() &&
18155 "Expected integer vector types only.");
18156 if (V1->getType() != V2->getType()) {
18157 if (cast<VectorType>(V2->getType())
18158 ->getElementType()
18159 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
18160 ->getElementType()
18161 ->getIntegerBitWidth())
18162 V2 = Builder.CreateIntCast(
18163 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
18164 else
18165 V1 = Builder.CreateIntCast(
18166 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
18167 }
18168 }
18169 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18170 if (auto *I = dyn_cast<Instruction>(Vec)) {
18171 GatherShuffleExtractSeq.insert(I);
18172 CSEBlocks.insert(I->getParent());
18173 }
18174 return Vec;
18175 }
18176 /// Creates permutation of the single vector operand with the given mask, if
18177 /// it is not an identity mask.
18178 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18179 if (Mask.empty())
18180 return V1;
18181 unsigned VF = Mask.size();
18182 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18183 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18184 return V1;
18185 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18186 if (auto *I = dyn_cast<Instruction>(Vec)) {
18187 GatherShuffleExtractSeq.insert(I);
18188 CSEBlocks.insert(I->getParent());
18189 }
18190 return Vec;
18191 }
18192 Value *createIdentity(Value *V) { return V; }
18193 Value *createPoison(Type *Ty, unsigned VF) {
18194 return PoisonValue::get(getWidenedType(Ty, VF));
18195 }
18196 /// Resizes the 2 input vectors to match their sizes, if they are not equal
18197 /// yet. The smallest vector is resized to the size of the larger vector.
18198 void resizeToMatch(Value *&V1, Value *&V2) {
18199 if (V1->getType() == V2->getType())
18200 return;
18201 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18202 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18203 int VF = std::max(V1VF, V2VF);
18204 int MinVF = std::min(V1VF, V2VF);
18205 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18206 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18207 0);
18208 Value *&Op = MinVF == V1VF ? V1 : V2;
18209 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18210 if (auto *I = dyn_cast<Instruction>(Op)) {
18211 GatherShuffleExtractSeq.insert(I);
18212 CSEBlocks.insert(I->getParent());
18213 }
18214 if (MinVF == V1VF)
18215 V1 = Op;
18216 else
18217 V2 = Op;
18218 }
18219 };
18220
18221 /// Smart shuffle instruction emission, walks through shuffles trees and
18222 /// tries to find the best matching vector for the actual shuffle
18223 /// instruction.
18224 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18225 assert(V1 && "Expected at least one vector value.");
18226 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18227 R.CSEBlocks, *R.DL);
18228 return BaseShuffleAnalysis::createShuffle<Value *>(
18229 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18230 }
18231
18232 /// Cast value \p V to the vector type with the same number of elements, but
18233 /// the base type \p ScalarTy.
18234 Value *castToScalarTyElem(Value *V,
18235 std::optional<bool> IsSigned = std::nullopt) {
18236 auto *VecTy = cast<VectorType>(V->getType());
18237 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18238 if (VecTy->getElementType() == ScalarTy->getScalarType())
18239 return V;
18240 return Builder.CreateIntCast(
18241 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18242 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18243 }
18244
18245 Value *getVectorizedValue(const TreeEntry &E) {
18246 Value *Vec = E.VectorizedValue;
18247 if (!Vec->getType()->isIntOrIntVectorTy())
18248 return Vec;
18249 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18250 return !isa<PoisonValue>(V) &&
18251 !isKnownNonNegative(
18252 V, SimplifyQuery(*R.DL));
18253 }));
18254 }
18255
18256public:
18257 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18258 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18259
18260 /// Adjusts extractelements after reusing them.
18261 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18262 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18263 unsigned NumParts, bool &UseVecBaseAsInput) {
18264 UseVecBaseAsInput = false;
18265 SmallPtrSet<Value *, 4> UniqueBases;
18266 Value *VecBase = nullptr;
18267 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18268 if (!E->ReorderIndices.empty()) {
18269 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18270 E->ReorderIndices.end());
18271 reorderScalars(VL, ReorderMask);
18272 }
18273 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18274 int Idx = Mask[I];
18275 if (Idx == PoisonMaskElem)
18276 continue;
18277 auto *EI = cast<ExtractElementInst>(VL[I]);
18278 VecBase = EI->getVectorOperand();
18279 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18280 VecBase = TEs.front()->VectorizedValue;
18281 assert(VecBase && "Expected vectorized value.");
18282 UniqueBases.insert(VecBase);
18283 // If the only one use is vectorized - can delete the extractelement
18284 // itself.
18285 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18286 (NumParts != 1 && count(VL, EI) > 1) ||
18287 any_of(EI->users(), [&](User *U) {
18288 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18289 return UTEs.empty() || UTEs.size() > 1 ||
18290 (isa<GetElementPtrInst>(U) &&
18291 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18292 (!UTEs.empty() &&
18293 count_if(R.VectorizableTree,
18294 [&](const std::unique_ptr<TreeEntry> &TE) {
18295 return TE->UserTreeIndex.UserTE ==
18296 UTEs.front() &&
18297 is_contained(VL, EI);
18298 }) != 1);
18299 }))
18300 continue;
18301 R.eraseInstruction(EI);
18302 }
18303 if (NumParts == 1 || UniqueBases.size() == 1) {
18304 assert(VecBase && "Expected vectorized value.");
18305 return castToScalarTyElem(VecBase);
18306 }
18307 UseVecBaseAsInput = true;
18308 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18309 for (auto [I, Idx] : enumerate(Mask))
18310 if (Idx != PoisonMaskElem)
18311 Idx = I;
18312 };
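// Each register-wide part below is shuffled from at most two extract bases
// and folded into the accumulated Vec; after every fold the already-placed
// lanes of the mask are rewritten to an identity mapping so the next part
// only adds its own lanes.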
18313 // Perform multi-register vector shuffle, joining them into a single virtual
18314 // long vector.
18315 // Need to shuffle each part independently and then insert all these parts
18316 // into a long virtual vector register, forming the original vector.
18317 Value *Vec = nullptr;
18318 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18319 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18320 for (unsigned Part : seq<unsigned>(NumParts)) {
18321 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18322 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18323 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18324 constexpr int MaxBases = 2;
18325 SmallVector<Value *, MaxBases> Bases(MaxBases);
18326 auto VLMask = zip(SubVL, SubMask);
18327 const unsigned VF = std::accumulate(
18328 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18329 if (std::get<1>(D) == PoisonMaskElem)
18330 return S;
18331 Value *VecOp =
18332 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18333 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18334 !TEs.empty())
18335 VecOp = TEs.front()->VectorizedValue;
18336 assert(VecOp && "Expected vectorized value.");
18337 const unsigned Size =
18338 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18339 return std::max(S, Size);
18340 });
18341 for (const auto [V, I] : VLMask) {
18342 if (I == PoisonMaskElem)
18343 continue;
18344 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18345 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18346 VecOp = TEs.front()->VectorizedValue;
18347 assert(VecOp && "Expected vectorized value.");
18348 VecOp = castToScalarTyElem(VecOp);
18349 Bases[I / VF] = VecOp;
18350 }
18351 if (!Bases.front())
18352 continue;
18353 Value *SubVec;
18354 if (Bases.back()) {
18355 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18356 TransformToIdentity(SubMask);
18357 } else {
18358 SubVec = Bases.front();
18359 }
18360 if (!Vec) {
18361 Vec = SubVec;
18362 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18363 [&](unsigned P) {
18364 ArrayRef<int> SubMask =
18365 Mask.slice(P * SliceSize,
18366 getNumElems(Mask.size(),
18367 SliceSize, P));
18368 return all_of(SubMask, [](int Idx) {
18369 return Idx == PoisonMaskElem;
18370 });
18371 })) &&
18372 "Expected first part or all previous parts masked.");
18373 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18374 } else {
18375 unsigned NewVF =
18376 cast<FixedVectorType>(Vec->getType())->getNumElements();
18377 if (Vec->getType() != SubVec->getType()) {
18378 unsigned SubVecVF =
18379 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18380 NewVF = std::max(NewVF, SubVecVF);
18381 }
18382 // Adjust SubMask.
18383 for (int &Idx : SubMask)
18384 if (Idx != PoisonMaskElem)
18385 Idx += NewVF;
18386 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18387 Vec = createShuffle(Vec, SubVec, VecMask);
18388 TransformToIdentity(VecMask);
18389 }
18390 }
18391 copy(VecMask, Mask.begin());
18392 return Vec;
18393 }
18394 /// Checks if the specified entry \p E needs to be delayed because of its
18395 /// dependency nodes.
18396 std::optional<Value *>
18397 needToDelay(const TreeEntry *E,
18398 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18399 // No need to delay emission if all deps are ready.
18400 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18401 return all_of(
18402 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18403 }))
18404 return std::nullopt;
18405 // Postpone gather emission, will be emitted after the end of the
18406 // process to keep correct order.
18407 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18408 return Builder.CreateAlignedLoad(
18409 ResVecTy,
18410 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18411 MaybeAlign());
18412 }
18413 /// Reset the builder to handle perfect diamond match.
18414 void resetForSameNode() {
18415 IsFinalized = false;
18416 CommonMask.clear();
18417 InVectors.clear();
18418 }
18419 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18420 /// shuffling.
18421 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18422 Value *V1 = getVectorizedValue(E1);
18423 Value *V2 = getVectorizedValue(E2);
18424 add(V1, V2, Mask);
18425 }
18426 /// Adds single input vector (in form of tree entry) and the mask for its
18427 /// shuffling.
18428 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18429 Value *V1 = getVectorizedValue(E1);
18430 add(V1, Mask);
18431 }
18432 /// Adds 2 input vectors and the mask for their shuffling.
18433 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18434 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18437 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18438 V1 = castToScalarTyElem(V1);
18439 V2 = castToScalarTyElem(V2);
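// The first pair of operands is recorded as-is; for later pairs the already
// accumulated vectors are folded into one value through CommonMask, the new
// pair is pre-shuffled into a single vector, and its lanes are referenced in
// CommonMask with an offset of the current vector factor (e.g. with VF == 4
// the new lanes show up as indices 4..7).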
18440 if (InVectors.empty()) {
18441 InVectors.push_back(V1);
18442 InVectors.push_back(V2);
18443 CommonMask.assign(Mask.begin(), Mask.end());
18444 return;
18445 }
18446 Value *Vec = InVectors.front();
18447 if (InVectors.size() == 2) {
18448 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18449 transformMaskAfterShuffle(CommonMask, CommonMask);
18450 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18451 Mask.size()) {
18452 Vec = createShuffle(Vec, nullptr, CommonMask);
18453 transformMaskAfterShuffle(CommonMask, CommonMask);
18454 }
18455 V1 = createShuffle(V1, V2, Mask);
18456 unsigned VF = std::max(getVF(V1), getVF(Vec));
18457 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18458 if (Mask[Idx] != PoisonMaskElem)
18459 CommonMask[Idx] = Idx + VF;
18460 InVectors.front() = Vec;
18461 if (InVectors.size() == 2)
18462 InVectors.back() = V1;
18463 else
18464 InVectors.push_back(V1);
18465 }
18466 /// Adds another one input vector and the mask for the shuffling.
18467 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18469 "castToScalarTyElem expects V1 to be FixedVectorType");
18470 V1 = castToScalarTyElem(V1);
18471 if (InVectors.empty()) {
18472 InVectors.push_back(V1);
18473 CommonMask.assign(Mask.begin(), Mask.end());
18474 return;
18475 }
18476 const auto *It = find(InVectors, V1);
18477 if (It == InVectors.end()) {
18478 if (InVectors.size() == 2 ||
18479 InVectors.front()->getType() != V1->getType()) {
18480 Value *V = InVectors.front();
18481 if (InVectors.size() == 2) {
18482 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18483 transformMaskAfterShuffle(CommonMask, CommonMask);
18484 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18485 CommonMask.size()) {
18486 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18487 transformMaskAfterShuffle(CommonMask, CommonMask);
18488 }
18489 unsigned VF = std::max(CommonMask.size(), Mask.size());
18490 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18491 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18492 CommonMask[Idx] = V->getType() != V1->getType()
18493 ? Idx + VF
18494 : Mask[Idx] + getVF(V1);
18495 if (V->getType() != V1->getType())
18496 V1 = createShuffle(V1, nullptr, Mask);
18497 InVectors.front() = V;
18498 if (InVectors.size() == 2)
18499 InVectors.back() = V1;
18500 else
18501 InVectors.push_back(V1);
18502 return;
18503 }
18504 // Check if the second vector is really required: add it only if it supplies
18505 // elements that are not already covered by the first one.
18506 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18507 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18508 InVectors.push_back(V1);
18509 break;
18510 }
18511 }
18512 unsigned VF = 0;
18513 for (Value *V : InVectors)
18514 VF = std::max(VF, getVF(V));
18515 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18516 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18517 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18518 }
18519 /// Adds another one input vector and the mask for the shuffling.
18520 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18521 SmallVector<int> NewMask;
18522 inversePermutation(Order, NewMask);
18523 add(V1, NewMask);
18524 }
18525 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18526 Value *Root = nullptr) {
18527 return R.gather(VL, Root, ScalarTy,
18528 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18529 return createShuffle(V1, V2, Mask);
18530 });
18531 }
18532 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18533 /// Finalize emission of the shuffles.
18534 /// \param Action the action (if any) to be performed before final applying of
18535 /// the \p ExtMask mask.
18536 Value *finalize(
18537 ArrayRef<int> ExtMask,
18538 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18539 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18540 function_ref<void(Value *&, SmallVectorImpl<int> &,
18541 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18542 Action = {}) {
18543 IsFinalized = true;
18544 if (Action) {
18545 Value *Vec = InVectors.front();
18546 if (InVectors.size() == 2) {
18547 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18548 InVectors.pop_back();
18549 } else {
18550 Vec = createShuffle(Vec, nullptr, CommonMask);
18551 }
18552 transformMaskAfterShuffle(CommonMask, CommonMask);
18553 assert(VF > 0 &&
18554 "Expected vector length for the final value before action.");
18555 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18556 if (VecVF < VF) {
18557 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18558 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18559 Vec = createShuffle(Vec, nullptr, ResizeMask);
18560 }
18561 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18562 return createShuffle(V1, V2, Mask);
18563 });
18564 InVectors.front() = Vec;
18565 }
18566 if (!SubVectors.empty()) {
18567 Value *Vec = InVectors.front();
18568 if (InVectors.size() == 2) {
18569 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18570 InVectors.pop_back();
18571 } else {
18572 Vec = createShuffle(Vec, nullptr, CommonMask);
18573 }
18574 transformMaskAfterShuffle(CommonMask, CommonMask);
18575 auto CreateSubVectors = [&](Value *Vec,
18576 SmallVectorImpl<int> &CommonMask) {
18577 for (auto [E, Idx] : SubVectors) {
18578 Value *V = getVectorizedValue(*E);
18579 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18580 // Use the scalar version of ScalarTy to correctly handle shuffles for
18581 // revectorization. The revectorization mode operates on vectors, but here
18582 // we need to operate on the scalars, because the masks were already
18583 // transformed for the vector elements and we don't need to do this
18584 // transformation again.
18585 Type *OrigScalarTy = ScalarTy;
18586 ScalarTy = ScalarTy->getScalarType();
18587 Vec = createInsertVector(
18588 Builder, Vec, V, InsertionIndex,
18589 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18590 _3));
18591 ScalarTy = OrigScalarTy;
18592 if (!CommonMask.empty()) {
18593 std::iota(std::next(CommonMask.begin(), Idx),
18594 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18595 Idx);
18596 }
18597 }
18598 return Vec;
18599 };
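// If a separate SubVectorsMask is given, the subvectors are first built on
// top of a poison vector and then blended with the accumulated value via one
// extra shuffle; otherwise they are inserted straight into Vec and CommonMask
// is updated in place.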
18600 if (SubVectorsMask.empty()) {
18601 Vec = CreateSubVectors(Vec, CommonMask);
18602 } else {
18603 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18604 copy(SubVectorsMask, SVMask.begin());
18605 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18606 if (I2 != PoisonMaskElem) {
18607 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18608 I1 = I2 + CommonMask.size();
18609 }
18610 }
18611 Value *InsertVec =
18612 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18613 Vec = createShuffle(InsertVec, Vec, SVMask);
18614 transformMaskAfterShuffle(CommonMask, SVMask);
18615 }
18616 InVectors.front() = Vec;
18617 }
18618
18619 if (!ExtMask.empty()) {
18620 if (CommonMask.empty()) {
18621 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18622 } else {
18623 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18624 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18625 if (ExtMask[I] == PoisonMaskElem)
18626 continue;
18627 NewMask[I] = CommonMask[ExtMask[I]];
18628 }
18629 CommonMask.swap(NewMask);
18630 }
18631 }
18632 if (CommonMask.empty()) {
18633 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18634 return InVectors.front();
18635 }
18636 if (InVectors.size() == 2)
18637 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18638 return createShuffle(InVectors.front(), nullptr, CommonMask);
18639 }
18640
18641 ~ShuffleInstructionBuilder() {
18642 assert((IsFinalized || CommonMask.empty()) &&
18643 "Shuffle construction must be finalized.");
18644 }
18645};
18646
18647Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18648 return vectorizeTree(getOperandEntry(E, NodeIdx));
18649}
18650
18651template <typename BVTy, typename ResTy, typename... Args>
18652ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18653 Args &...Params) {
18654 assert(E->isGather() && "Expected gather node.");
18655 unsigned VF = E->getVectorFactor();
18656
18657 bool NeedFreeze = false;
18658 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18659 // Clear values, to be replaced by insertvector instructions.
18660 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18661 for_each(MutableArrayRef(GatheredScalars)
18662 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18663 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18664 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18665 E->CombinedEntriesWithIndices.size());
18666 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18667 [&](const auto &P) {
18668 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18669 });
18670 // Build a mask out of the reorder indices and reorder scalars per this
18671 // mask.
18672 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18673 E->ReorderIndices.end());
18674 if (!ReorderMask.empty())
18675 reorderScalars(GatheredScalars, ReorderMask);
18676 SmallVector<int> SubVectorsMask;
18677 inversePermutation(E->ReorderIndices, SubVectorsMask);
18678 // Transform non-clustered elements in the mask to poison (-1).
18679 // "Clustered" operations will be reordered using this mask later.
18680 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18681 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18682 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18683 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18684 } else {
18685 SubVectorsMask.clear();
18686 }
18687 SmallVector<Value *> StoredGS(GatheredScalars);
18688 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18689 unsigned I, unsigned SliceSize,
18690 bool IsNotPoisonous) {
18691 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18692 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18693 }))
18694 return false;
18695 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18696 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18697 if (UserTE->getNumOperands() != 2)
18698 return false;
18699 if (!IsNotPoisonous) {
18700 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18701 [=](const std::unique_ptr<TreeEntry> &TE) {
18702 return TE->UserTreeIndex.UserTE == UserTE &&
18703 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18704 });
18705 if (It == VectorizableTree.end())
18706 return false;
18707 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18708 if (!(*It)->ReorderIndices.empty()) {
18709 inversePermutation((*It)->ReorderIndices, ReorderMask);
18710 reorderScalars(GS, ReorderMask);
18711 }
18712 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18713 Value *V0 = std::get<0>(P);
18714 Value *V1 = std::get<1>(P);
18715 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18716 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18717 is_contained(E->Scalars, V1));
18718 }))
18719 return false;
18720 }
18721 int Idx;
18722 if ((Mask.size() < InputVF &&
18723 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18724 Idx == 0) ||
18725 (Mask.size() == InputVF &&
18726 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18727 std::iota(
18728 std::next(Mask.begin(), I * SliceSize),
18729 std::next(Mask.begin(),
18730 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18731 0);
18732 } else {
18733 unsigned IVal =
18734 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18735 std::fill(
18736 std::next(Mask.begin(), I * SliceSize),
18737 std::next(Mask.begin(),
18738 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18739 IVal);
18740 }
18741 return true;
18742 };
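// Roughly, FindReusedSplat only fires for splat-like gathers that contain
// non-poison undefs and whose user node has a suitable second operand; on
// success the relevant slice of the mask is rewritten to an identity prefix
// or a single repeated lane, so the existing vector is reused instead of
// emitting a new broadcast.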
18743 BVTy ShuffleBuilder(ScalarTy, Params...);
18744 ResTy Res = ResTy();
18745 SmallVector<int> Mask;
18746 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18747 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18748 Value *ExtractVecBase = nullptr;
18749 bool UseVecBaseAsInput = false;
18750 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18751 SmallVector<SmallVector<const TreeEntry *>> Entries;
18752 Type *OrigScalarTy = GatheredScalars.front()->getType();
18753 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18754 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18755 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18756 // Check for gathered extracts.
18757 bool Resized = false;
18758 ExtractShuffles =
18759 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18760 if (!ExtractShuffles.empty()) {
18761 SmallVector<const TreeEntry *> ExtractEntries;
18762 for (auto [Idx, I] : enumerate(ExtractMask)) {
18763 if (I == PoisonMaskElem)
18764 continue;
18765 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18766 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18767 !TEs.empty())
18768 ExtractEntries.append(TEs.begin(), TEs.end());
18769 }
18770 if (std::optional<ResTy> Delayed =
18771 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18772 // Delay emission of gathers which are not ready yet.
18773 PostponedGathers.insert(E);
18774 // Postpone gather emission, will be emitted after the end of the
18775 // process to keep correct order.
18776 return *Delayed;
18777 }
18778 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18779 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18780 ExtractVecBase = VecBase;
18781 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18782 if (VF == VecBaseTy->getNumElements() &&
18783 GatheredScalars.size() != VF) {
18784 Resized = true;
18785 GatheredScalars.append(VF - GatheredScalars.size(),
18786 PoisonValue::get(OrigScalarTy));
18787 NumParts =
18788 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18789 }
18790 }
18791 }
18792 // Gather extracts after we check for full matched gathers only.
18793 if (!ExtractShuffles.empty() || !E->hasState() ||
18794 E->getOpcode() != Instruction::Load ||
18795 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18796 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18797 any_of(E->Scalars,
18798 [this](Value *V) {
18799 return isa<LoadInst>(V) && isVectorized(V);
18800 })) ||
18801 (E->hasState() && E->isAltShuffle()) ||
18802 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18803 isSplat(E->Scalars) ||
18804 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18805 GatherShuffles =
18806 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18807 }
18808 if (!GatherShuffles.empty()) {
18809 if (std::optional<ResTy> Delayed =
18810 ShuffleBuilder.needToDelay(E, Entries)) {
18811 // Delay emission of gathers which are not ready yet.
18812 PostponedGathers.insert(E);
18813 // Postpone gather emission, will be emitted after the end of the
18814 // process to keep correct order.
18815 return *Delayed;
18816 }
18817 if (GatherShuffles.size() == 1 &&
18818 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18819 Entries.front().front()->isSame(E->Scalars)) {
18820 // Perfect match in the graph, will reuse the previously vectorized
18821 // node. Cost is 0.
18822 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18823 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18824 // Restore the mask for previous partially matched values.
18825 Mask.resize(E->Scalars.size());
18826 const TreeEntry *FrontTE = Entries.front().front();
18827 if (FrontTE->ReorderIndices.empty() &&
18828 ((FrontTE->ReuseShuffleIndices.empty() &&
18829 E->Scalars.size() == FrontTE->Scalars.size()) ||
18830 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18831 std::iota(Mask.begin(), Mask.end(), 0);
18832 } else {
18833 for (auto [I, V] : enumerate(E->Scalars)) {
18834 if (isa<PoisonValue>(V)) {
18835 Mask[I] = PoisonMaskElem;
18836 continue;
18837 }
18838 Mask[I] = FrontTE->findLaneForValue(V);
18839 }
18840 }
18841 // Reset the builder(s) to correctly handle perfect diamond matched
18842 // nodes.
18843 ShuffleBuilder.resetForSameNode();
18844 ShuffleBuilder.add(*FrontTE, Mask);
18845 // Full matched entry found, no need to insert subvectors.
18846 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18847 return Res;
18848 }
18849 if (!Resized) {
18850 if (GatheredScalars.size() != VF &&
18851 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18852 return any_of(TEs, [&](const TreeEntry *TE) {
18853 return TE->getVectorFactor() == VF;
18854 });
18855 }))
18856 GatheredScalars.append(VF - GatheredScalars.size(),
18857 PoisonValue::get(OrigScalarTy));
18858 }
18859 // Remove shuffled elements from list of gathers.
18860 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18861 if (Mask[I] != PoisonMaskElem)
18862 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18863 }
18864 }
18865 }
18866 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18867 SmallVectorImpl<int> &ReuseMask,
18868 bool IsRootPoison) {
18869 // For splats we can emit broadcasts instead of gathers, so try to find
18870 // such sequences.
18871 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18872 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18873 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18874 SmallVector<int> UndefPos;
18875 DenseMap<Value *, unsigned> UniquePositions;
18876 // Gather unique non-const values and all constant values.
18877 // For repeated values, just shuffle them.
18878 int NumNonConsts = 0;
18879 int SinglePos = 0;
18880 for (auto [I, V] : enumerate(Scalars)) {
18881 if (isa<UndefValue>(V)) {
18882 if (!isa<PoisonValue>(V)) {
18883 ReuseMask[I] = I;
18884 UndefPos.push_back(I);
18885 }
18886 continue;
18887 }
18888 if (isConstant(V)) {
18889 ReuseMask[I] = I;
18890 continue;
18891 }
18892 ++NumNonConsts;
18893 SinglePos = I;
18894 Value *OrigV = V;
18895 Scalars[I] = PoisonValue::get(OrigScalarTy);
18896 if (IsSplat) {
18897 Scalars.front() = OrigV;
18898 ReuseMask[I] = 0;
18899 } else {
18900 const auto Res = UniquePositions.try_emplace(OrigV, I);
18901 Scalars[Res.first->second] = OrigV;
18902 ReuseMask[I] = Res.first->second;
18903 }
18904 }
18905 if (NumNonConsts == 1) {
18906 // Restore single insert element.
18907 if (IsSplat) {
18908 ReuseMask.assign(VF, PoisonMaskElem);
18909 std::swap(Scalars.front(), Scalars[SinglePos]);
18910 if (!UndefPos.empty() && UndefPos.front() == 0)
18911 Scalars.front() = UndefValue::get(OrigScalarTy);
18912 }
18913 ReuseMask[SinglePos] = SinglePos;
18914 } else if (!UndefPos.empty() && IsSplat) {
18915 // For undef values, try to replace them with the simple broadcast.
18916 // We can do it if the broadcasted value is guaranteed to be
18917 // non-poisonous, or by freezing the incoming scalar value first.
18918 auto *It = find_if(Scalars, [this, E](Value *V) {
18919 return !isa<UndefValue>(V) &&
18920 (isGuaranteedNotToBePoison(V, AC) ||
18921 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18922 // Check if the value already used in the same operation in
18923 // one of the nodes already.
18924 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18925 is_contained(E->UserTreeIndex.UserTE->Scalars,
18926 U.getUser());
18927 })));
18928 });
18929 if (It != Scalars.end()) {
18930 // Replace undefs by the non-poisoned scalars and emit broadcast.
18931 int Pos = std::distance(Scalars.begin(), It);
18932 for (int I : UndefPos) {
18933 // Set the undef position to the non-poisoned scalar.
18934 ReuseMask[I] = Pos;
18935 // Replace the undef by the poison, in the mask it is replaced by
18936 // non-poisoned scalar already.
18937 if (I != Pos)
18938 Scalars[I] = PoisonValue::get(OrigScalarTy);
18939 }
18940 } else {
18941 // Replace undefs by the poisons, emit broadcast and then emit
18942 // freeze.
18943 for (int I : UndefPos) {
18944 ReuseMask[I] = PoisonMaskElem;
18945 if (isa<UndefValue>(Scalars[I]))
18946 Scalars[I] = PoisonValue::get(OrigScalarTy);
18947 }
18948 NeedFreeze = true;
18949 }
18950 }
18951 };
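// TryPackScalars above compacts the remaining scalars into a build vector
// plus a reuse mask (broadcasting from lane 0 for splats and optionally
// requesting a freeze for possibly-poisonous values). The code below then
// combines up to three sources through ShuffleBuilder: the extractelement
// bases, previously vectorized tree entries from GatherShuffles, and the
// constant/non-constant build vectors.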
18952 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18953 bool IsNonPoisoned = true;
18954 bool IsUsedInExpr = true;
18955 Value *Vec1 = nullptr;
18956 if (!ExtractShuffles.empty()) {
18957 // Gather of extractelements can be represented as just a shuffle of
18958 // a single/two vectors the scalars are extracted from.
18959 // Find input vectors.
18960 Value *Vec2 = nullptr;
18961 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18962 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18963 ExtractMask[I] = PoisonMaskElem;
18964 }
18965 if (UseVecBaseAsInput) {
18966 Vec1 = ExtractVecBase;
18967 } else {
18968 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18969 if (ExtractMask[I] == PoisonMaskElem)
18970 continue;
18971 if (isa<UndefValue>(StoredGS[I]))
18972 continue;
18973 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18974 Value *VecOp = EI->getVectorOperand();
18975 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18976 !TEs.empty() && TEs.front()->VectorizedValue)
18977 VecOp = TEs.front()->VectorizedValue;
18978 if (!Vec1) {
18979 Vec1 = VecOp;
18980 } else if (Vec1 != VecOp) {
18981 assert((!Vec2 || Vec2 == VecOp) &&
18982 "Expected only 1 or 2 vectors shuffle.");
18983 Vec2 = VecOp;
18984 }
18985 }
18986 }
18987 if (Vec2) {
18988 IsUsedInExpr = false;
18989 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18990 isGuaranteedNotToBePoison(Vec2, AC);
18991 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18992 } else if (Vec1) {
18993 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18994 IsUsedInExpr &= FindReusedSplat(
18995 ExtractMask,
18996 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18997 ExtractMask.size(), IsNotPoisonedVec);
18998 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18999 IsNonPoisoned &= IsNotPoisonedVec;
19000 } else {
19001 IsUsedInExpr = false;
19002 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
19003 /*ForExtracts=*/true);
19004 }
19005 }
19006 if (!GatherShuffles.empty()) {
19007 unsigned SliceSize =
19008 getPartNumElems(E->Scalars.size(),
19009 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
19010 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19011 for (const auto [I, TEs] : enumerate(Entries)) {
19012 if (TEs.empty()) {
19013 assert(!GatherShuffles[I] &&
19014 "No shuffles with empty entries list expected.");
19015 continue;
19016 }
19017 assert((TEs.size() == 1 || TEs.size() == 2) &&
19018 "Expected shuffle of 1 or 2 entries.");
19019 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
19020 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
19021 VecMask.assign(VecMask.size(), PoisonMaskElem);
19022 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19023 if (TEs.size() == 1) {
19024 bool IsNotPoisonedVec =
19025 TEs.front()->VectorizedValue
19026 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
19027 : true;
19028 IsUsedInExpr &=
19029 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19030 SliceSize, IsNotPoisonedVec);
19031 ShuffleBuilder.add(*TEs.front(), VecMask);
19032 IsNonPoisoned &= IsNotPoisonedVec;
19033 } else {
19034 IsUsedInExpr = false;
19035 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19036 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19037 IsNonPoisoned &=
19038 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
19039 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
19040 }
19041 }
19042 }
19043 // Try to figure out best way to combine values: build a shuffle and insert
19044 // elements or just build several shuffles.
19045 // Insert non-constant scalars.
19046 SmallVector<Value *> NonConstants(GatheredScalars);
19047 int EMSz = ExtractMask.size();
19048 int MSz = Mask.size();
19049 // Try to build a constant vector and shuffle with it only if currently we
19050 // have a single permutation and more than 1 scalar constant.
19051 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19052 bool IsIdentityShuffle =
19053 ((UseVecBaseAsInput ||
19054 all_of(ExtractShuffles,
19055 [](const std::optional<TTI::ShuffleKind> &SK) {
19056 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19057 TTI::SK_PermuteSingleSrc;
19058 })) &&
19059 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19060 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
19061 (!GatherShuffles.empty() &&
19062 all_of(GatherShuffles,
19063 [](const std::optional<TTI::ShuffleKind> &SK) {
19064 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19065 TTI::SK_PermuteSingleSrc;
19066 }) &&
19067 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19068 ShuffleVectorInst::isIdentityMask(Mask, MSz));
19069 bool EnoughConstsForShuffle =
19070 IsSingleShuffle &&
19071 (none_of(GatheredScalars,
19072 [](Value *V) {
19073 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19074 }) ||
19075 any_of(GatheredScalars,
19076 [](Value *V) {
19077 return isa<Constant>(V) && !isa<UndefValue>(V);
19078 })) &&
19079 (!IsIdentityShuffle ||
19080 (GatheredScalars.size() == 2 &&
19081 any_of(GatheredScalars,
19082 [](Value *V) { return !isa<UndefValue>(V); })) ||
19083 count_if(GatheredScalars, [](Value *V) {
19084 return isa<Constant>(V) && !isa<PoisonValue>(V);
19085 }) > 1);
19086 // NonConstants array contains just non-constant values, GatheredScalars
19087 // contains only constants to build the final vector and then shuffle.
19088 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19089 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19090 NonConstants[I] = PoisonValue::get(OrigScalarTy);
19091 else
19092 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19093 }
19094 // Generate constants for final shuffle and build a mask for them.
19095 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
19096 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19097 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19098 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
19099 ShuffleBuilder.add(BV, BVMask);
19100 }
19101 if (all_of(NonConstants, [=](Value *V) {
19102 return isa<PoisonValue>(V) ||
19103 (IsSingleShuffle && ((IsIdentityShuffle &&
19104 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
19105 }))
19106 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19107 SubVectorsMask);
19108 else
19109 Res = ShuffleBuilder.finalize(
19110 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
19111 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
19112 bool IsSplat = isSplat(NonConstants);
19113 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19114 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
19115 auto CheckIfSplatIsProfitable = [&]() {
19116 // Estimate the cost of splatting + shuffle and compare with
19117 // insert + shuffle.
19118 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19119 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19120 if (isa<ExtractElementInst>(V) || isVectorized(V))
19121 return false;
19122 InstructionCost SplatCost = TTI->getVectorInstrCost(
19123 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
19124 PoisonValue::get(VecTy), V);
19125 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19126 for (auto [Idx, I] : enumerate(BVMask))
19127 if (I != PoisonMaskElem)
19128 NewMask[Idx] = Mask.size();
19129 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19130 NewMask, CostKind);
19131 InstructionCost BVCost = TTI->getVectorInstrCost(
19132 Instruction::InsertElement, VecTy, CostKind,
19133 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
19134 Vec, V);
19135 // Shuffle required?
19136 if (count(BVMask, PoisonMaskElem) <
19137 static_cast<int>(BVMask.size() - 1)) {
19138 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19139 for (auto [Idx, I] : enumerate(BVMask))
19140 if (I != PoisonMaskElem)
19141 NewMask[Idx] = I;
19142 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19143 VecTy, NewMask, CostKind);
19144 }
19145 return SplatCost <= BVCost;
19146 };
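// Based on the estimated cost, the non-constant scalars are either inserted
// directly into the partially built vector (gather with Vec as the root) or
// materialized as a splat of the single repeated value and blended into Vec
// with a two-source shuffle.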
19147 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19148 for (auto [Idx, I] : enumerate(BVMask))
19149 if (I != PoisonMaskElem)
19150 Mask[Idx] = I;
19151 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19152 } else {
19153 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19154 SmallVector<Value *> Values(NonConstants.size(),
19155 PoisonValue::get(ScalarTy));
19156 Values[0] = V;
19157 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
19158 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
19159 transform(BVMask, SplatMask.begin(), [](int I) {
19160 return I == PoisonMaskElem ? PoisonMaskElem : 0;
19161 });
19162 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
19163 BV = CreateShuffle(BV, nullptr, SplatMask);
19164 for (auto [Idx, I] : enumerate(BVMask))
19165 if (I != PoisonMaskElem)
19166 Mask[Idx] = BVMask.size() + Idx;
19167 Vec = CreateShuffle(Vec, BV, Mask);
19168 for (auto [Idx, I] : enumerate(Mask))
19169 if (I != PoisonMaskElem)
19170 Mask[Idx] = Idx;
19171 }
19172 });
19173 } else if (!allConstant(GatheredScalars)) {
19174 // Gather unique scalars and all constants.
19175 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
19176 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
19177 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19178 ShuffleBuilder.add(BV, ReuseMask);
19179 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19180 SubVectorsMask);
19181 } else {
19182 // Gather all constants.
19183 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19184 for (auto [I, V] : enumerate(GatheredScalars)) {
19185 if (!isa<PoisonValue>(V))
19186 Mask[I] = I;
19187 }
19188 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19189 ShuffleBuilder.add(BV, Mask);
19190 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19191 SubVectorsMask);
19192 }
19193
19194 if (NeedFreeze)
19195 Res = ShuffleBuilder.createFreeze(Res);
19196 return Res;
19197}
19198
19199Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19200 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19201 (void)vectorizeTree(VectorizableTree[EIdx].get());
19202 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19203 Builder, *this);
19204}
19205
19206/// \returns \p I after propagating metadata from \p VL only for instructions in
19207/// \p VL.
19208 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19209 SmallVector<Value *> Insts;
19210 for (Value *V : VL)
19211 if (isa<Instruction>(V))
19212 Insts.push_back(V);
19213 return llvm::propagateMetadata(Inst, Insts);
19214}
19215
19217 if (DebugLoc DL = PN.getDebugLoc())
19218 return DL;
19219 return DebugLoc::getUnknown();
19220}
19221
19222Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19223 IRBuilderBase::InsertPointGuard Guard(Builder);
19224
19225 Value *V = E->Scalars.front();
19226 Type *ScalarTy = V->getType();
19227 if (!isa<CmpInst>(V))
19228 ScalarTy = getValueType(V);
19229 auto It = MinBWs.find(E);
19230 if (It != MinBWs.end()) {
19231 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19232 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19233 if (VecTy)
19234 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19235 }
19236 if (E->VectorizedValue)
19237 return E->VectorizedValue;
19238 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19239 if (E->isGather()) {
19240 // Set insert point for non-reduction initial nodes.
19241 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19242 setInsertPointAfterBundle(E);
19243 Value *Vec = createBuildVector(E, ScalarTy);
19244 E->VectorizedValue = Vec;
19245 return Vec;
19246 }
19247 if (E->State == TreeEntry::SplitVectorize) {
19248 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19249 "Expected exactly 2 combined entries.");
19250 setInsertPointAfterBundle(E);
19251 TreeEntry &OpTE1 =
19252 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19253 assert(OpTE1.isSame(
19254 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19255 "Expected same first part of scalars.");
19256 Value *Op1 = vectorizeTree(&OpTE1);
19257 TreeEntry &OpTE2 =
19258 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19259 assert(
19260 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19261 "Expected same second part of scalars.");
19262 Value *Op2 = vectorizeTree(&OpTE2);
19263 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19264 bool IsSigned = false;
19265 auto It = MinBWs.find(OpE);
19266 if (It != MinBWs.end())
19267 IsSigned = It->second.second;
19268 else
19269 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19270 if (isa<PoisonValue>(V))
19271 return false;
19272 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19273 });
19274 return IsSigned;
19275 };
19276 if (cast<VectorType>(Op1->getType())->getElementType() !=
19277 ScalarTy->getScalarType()) {
19278 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19279 Op1 = Builder.CreateIntCast(
19280 Op1,
19281 getWidenedType(
19282 ScalarTy,
19283 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19284 GetOperandSignedness(&OpTE1));
19285 }
19286 if (cast<VectorType>(Op2->getType())->getElementType() !=
19287 ScalarTy->getScalarType()) {
19288 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19289 Op2 = Builder.CreateIntCast(
19290 Op2,
19291 getWidenedType(
19292 ScalarTy,
19293 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19294 GetOperandSignedness(&OpTE2));
19295 }
19296 if (E->ReorderIndices.empty()) {
19297 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19298 std::iota(
19299 Mask.begin(),
19300 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19301 0);
19302 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19303 if (ScalarTyNumElements != 1) {
19304 assert(SLPReVec && "Only supported by REVEC.");
19305 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19306 }
19307 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19308 Vec = createInsertVector(Builder, Vec, Op2,
19309 E->CombinedEntriesWithIndices.back().second *
19310 ScalarTyNumElements);
19311 E->VectorizedValue = Vec;
19312 return Vec;
19313 }
19314 unsigned CommonVF =
19315 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19316 if (getNumElements(Op1->getType()) != CommonVF) {
19317 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19318 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19319 0);
19320 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19321 }
19322 if (getNumElements(Op2->getType()) != CommonVF) {
19323 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19324 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19325 0);
19326 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19327 }
19328 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19329 E->VectorizedValue = Vec;
19330 return Vec;
19331 }
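  // Illustrative example of the split path: for two halves of vector factor 4
  // each and no reordering, Op1 is placed with an identity-prefix mask and Op2
  // is inserted (createInsertVector) at the element offset recorded in
  // CombinedEntriesWithIndices; with reordering, both operands are widened to
  // the common VF and blended using E->getSplitMask().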
19332
19333 bool IsReverseOrder =
19334 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19335 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19336 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19337 if (E->getOpcode() == Instruction::Store &&
19338 E->State == TreeEntry::Vectorize) {
19339 ArrayRef<int> Mask =
19340 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19341 E->ReorderIndices.size());
19342 ShuffleBuilder.add(V, Mask);
19343 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19344 E->State == TreeEntry::CompressVectorize) {
19345 ShuffleBuilder.addOrdered(V, {});
19346 } else {
19347 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19348 }
19349 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19350 E->CombinedEntriesWithIndices.size());
19351 transform(
19352 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19353 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19354 });
19355 assert(
19356 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19357 "Expected either combined subnodes or reordering");
19358 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19359 };
19360
19361 assert(!E->isGather() && "Unhandled state");
19362 unsigned ShuffleOrOp =
19363 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19364 Instruction *VL0 = E->getMainOp();
19365 auto GetOperandSignedness = [&](unsigned Idx) {
19366 const TreeEntry *OpE = getOperandEntry(E, Idx);
19367 bool IsSigned = false;
19368 auto It = MinBWs.find(OpE);
19369 if (It != MinBWs.end())
19370 IsSigned = It->second.second;
19371 else
19372 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19373 if (isa<PoisonValue>(V))
19374 return false;
19375 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19376 });
19377 return IsSigned;
19378 };
19379 switch (ShuffleOrOp) {
19380 case Instruction::PHI: {
19381 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19382 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19383 "PHI reordering is free.");
19384 auto *PH = cast<PHINode>(VL0);
19385 Builder.SetInsertPoint(PH->getParent(),
19386 PH->getParent()->getFirstNonPHIIt());
19387 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19388 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19389 Value *V = NewPhi;
19390
19391 // Adjust insertion point once all PHI's have been generated.
19392 Builder.SetInsertPoint(PH->getParent(),
19393 PH->getParent()->getFirstInsertionPt());
19394 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19395
19396 V = FinalShuffle(V, E);
19397
19398 E->VectorizedValue = V;
19399 // If phi node is fully emitted - exit.
19400 if (NewPhi->getNumIncomingValues() != 0)
19401 return NewPhi;
19402
19403 // PHINodes may have multiple entries from the same block. We want to
19404 // visit every block once.
19405 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19406
19407 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19408 BasicBlock *IBB = PH->getIncomingBlock(I);
19409
19410 // Stop emission if all incoming values are generated.
19411 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19412 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19413 return NewPhi;
19414 }
19415
19416 if (!VisitedBBs.insert(IBB).second) {
19417 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19418 NewPhi->addIncoming(VecOp, IBB);
19419 TreeEntry *OpTE = getOperandEntry(E, I);
19420 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19421 OpTE->VectorizedValue = VecOp;
19422 continue;
19423 }
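      // Illustrative example: a switch with two case labels that branch to the
      // same successor gives the scalar PHI two incoming entries for that
      // block; the vector PHI reuses the incoming vector already created for
      // that block instead of vectorizing the operand again.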
19424
19425 Builder.SetInsertPoint(IBB->getTerminator());
19426 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19427 Value *Vec = vectorizeOperand(E, I);
19428 if (VecTy != Vec->getType()) {
19429 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19430 MinBWs.contains(getOperandEntry(E, I))) &&
19431 "Expected item in MinBWs.");
19432 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19433 }
19434 NewPhi->addIncoming(Vec, IBB);
19435 }
19436
19437 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19438 "Invalid number of incoming values");
19439 assert(E->VectorizedValue && "Expected vectorized value.");
19440 return E->VectorizedValue;
19441 }
19442
19443 case Instruction::ExtractElement: {
19444 Value *V = E->getSingleOperand(0);
19445 setInsertPointAfterBundle(E);
19446 V = FinalShuffle(V, E);
19447 E->VectorizedValue = V;
19448 return V;
19449 }
19450 case Instruction::ExtractValue: {
19451 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19452 Builder.SetInsertPoint(LI);
19453 Value *Ptr = LI->getPointerOperand();
19454 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19455 Value *NewV = ::propagateMetadata(V, E->Scalars);
19456 NewV = FinalShuffle(NewV, E);
19457 E->VectorizedValue = NewV;
19458 return NewV;
19459 }
19460 case Instruction::InsertElement: {
19461 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19462 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19463 OpE && !OpE->isGather() && OpE->hasState() &&
19464 !OpE->hasCopyableElements())
19465 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19466 else
19467 setInsertPointAfterBundle(E);
19468 Value *V = vectorizeOperand(E, 1);
19469 ArrayRef<Value *> Op = E->getOperand(1);
19470 Type *ScalarTy = Op.front()->getType();
19471 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19472 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19473 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19474 assert(Res.first > 0 && "Expected item in MinBWs.");
19475 V = Builder.CreateIntCast(
19476 V,
19477 getWidenedType(
19478 ScalarTy,
19479 cast<FixedVectorType>(V->getType())->getNumElements()),
19480 Res.second);
19481 }
19482
19483 // Create InsertVector shuffle if necessary
19484 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19485 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19486 }));
19487 const unsigned NumElts =
19488 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19489 const unsigned NumScalars = E->Scalars.size();
19490
19491 unsigned Offset = *getElementIndex(VL0);
19492 assert(Offset < NumElts && "Failed to find vector index offset");
19493
19494 // Create shuffle to resize vector
19495 SmallVector<int> Mask;
19496 if (!E->ReorderIndices.empty()) {
19497 inversePermutation(E->ReorderIndices, Mask);
19498 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19499 } else {
19500 Mask.assign(NumElts, PoisonMaskElem);
19501 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19502 }
19503 // Create InsertVector shuffle if necessary
19504 bool IsIdentity = true;
19505 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19506 Mask.swap(PrevMask);
19507 for (unsigned I = 0; I < NumScalars; ++I) {
19508 Value *Scalar = E->Scalars[PrevMask[I]];
19509 unsigned InsertIdx = *getElementIndex(Scalar);
19510 IsIdentity &= InsertIdx - Offset == I;
19511 Mask[InsertIdx - Offset] = I;
19512 }
19513 if (!IsIdentity || NumElts != NumScalars) {
19514 Value *V2 = nullptr;
19515 bool IsVNonPoisonous =
19516 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
19517 SmallVector<int> InsertMask(Mask);
19518 if (NumElts != NumScalars && Offset == 0) {
19519 // Follow all insert element instructions from the current buildvector
19520 // sequence.
19521 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19522 do {
19523 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19524 if (!InsertIdx)
19525 break;
19526 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19527 InsertMask[*InsertIdx] = *InsertIdx;
19528 if (!Ins->hasOneUse())
19529 break;
19530 Ins = dyn_cast_or_null<InsertElementInst>(
19531 Ins->getUniqueUndroppableUser());
19532 } while (Ins);
19533 SmallBitVector UseMask =
19534 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19535 SmallBitVector IsFirstPoison =
19536 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19537 SmallBitVector IsFirstUndef =
19538 isUndefVector(FirstInsert->getOperand(0), UseMask);
19539 if (!IsFirstPoison.all()) {
19540 unsigned Idx = 0;
19541 for (unsigned I = 0; I < NumElts; I++) {
19542 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19543 IsFirstUndef.test(I)) {
19544 if (IsVNonPoisonous) {
19545 InsertMask[I] = I < NumScalars ? I : 0;
19546 continue;
19547 }
19548 if (!V2)
19549 V2 = UndefValue::get(V->getType());
19550 if (Idx >= NumScalars)
19551 Idx = NumScalars - 1;
19552 InsertMask[I] = NumScalars + Idx;
19553 ++Idx;
19554 } else if (InsertMask[I] != PoisonMaskElem &&
19555 Mask[I] == PoisonMaskElem) {
19556 InsertMask[I] = PoisonMaskElem;
19557 }
19558 }
19559 } else {
19560 InsertMask = Mask;
19561 }
19562 }
19563 if (!V2)
19564 V2 = PoisonValue::get(V->getType());
19565 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19566 if (auto *I = dyn_cast<Instruction>(V)) {
19567 GatherShuffleExtractSeq.insert(I);
19568 CSEBlocks.insert(I->getParent());
19569 }
19570 }
19571
19572 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19573 for (unsigned I = 0; I < NumElts; I++) {
19574 if (Mask[I] != PoisonMaskElem)
19575 InsertMask[Offset + I] = I;
19576 }
19577 SmallBitVector UseMask =
19578 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19579 SmallBitVector IsFirstUndef =
19580 isUndefVector(FirstInsert->getOperand(0), UseMask);
19581 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19582 NumElts != NumScalars) {
19583 if (IsFirstUndef.all()) {
19584 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19585 SmallBitVector IsFirstPoison =
19586 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19587 if (!IsFirstPoison.all()) {
19588 for (unsigned I = 0; I < NumElts; I++) {
19589 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19590 InsertMask[I] = I + NumElts;
19591 }
19592 }
19593 V = Builder.CreateShuffleVector(
19594 V,
19595 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19596 : FirstInsert->getOperand(0),
19597 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19598 if (auto *I = dyn_cast<Instruction>(V)) {
19599 GatherShuffleExtractSeq.insert(I);
19600 CSEBlocks.insert(I->getParent());
19601 }
19602 }
19603 } else {
19604 SmallBitVector IsFirstPoison =
19605 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19606 for (unsigned I = 0; I < NumElts; I++) {
19607 if (InsertMask[I] == PoisonMaskElem)
19608 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19609 else
19610 InsertMask[I] += NumElts;
19611 }
19612 V = Builder.CreateShuffleVector(
19613 FirstInsert->getOperand(0), V, InsertMask,
19614 cast<Instruction>(E->Scalars.back())->getName());
19615 if (auto *I = dyn_cast<Instruction>(V)) {
19616 GatherShuffleExtractSeq.insert(I);
19617 CSEBlocks.insert(I->getParent());
19618 }
19619 }
19620 }
19621
19622 ++NumVectorInstructions;
19623 E->VectorizedValue = V;
19624 return V;
19625 }
19626 case Instruction::ZExt:
19627 case Instruction::SExt:
19628 case Instruction::FPToUI:
19629 case Instruction::FPToSI:
19630 case Instruction::FPExt:
19631 case Instruction::PtrToInt:
19632 case Instruction::IntToPtr:
19633 case Instruction::SIToFP:
19634 case Instruction::UIToFP:
19635 case Instruction::Trunc:
19636 case Instruction::FPTrunc:
19637 case Instruction::BitCast: {
19638 setInsertPointAfterBundle(E);
19639
19640 Value *InVec = vectorizeOperand(E, 0);
19641
19642 auto *CI = cast<CastInst>(VL0);
19643 Instruction::CastOps VecOpcode = CI->getOpcode();
19644 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19645 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19646 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19647 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19648 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19649 // Check if the values are candidates to demote.
19650 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19651 if (SrcIt != MinBWs.end())
19652 SrcBWSz = SrcIt->second.first;
19653 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19654 if (BWSz == SrcBWSz) {
19655 VecOpcode = Instruction::BitCast;
19656 } else if (BWSz < SrcBWSz) {
19657 VecOpcode = Instruction::Trunc;
19658 } else if (It != MinBWs.end()) {
19659 assert(BWSz > SrcBWSz && "Invalid cast!");
19660 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19661 } else if (SrcIt != MinBWs.end()) {
19662 assert(BWSz > SrcBWSz && "Invalid cast!");
19663 VecOpcode =
19664 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19665 }
19666 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19667 !SrcIt->second.second) {
19668 VecOpcode = Instruction::UIToFP;
19669 }
19670 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19671 ? InVec
19672 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19673 V = FinalShuffle(V, E);
19674
19675 E->VectorizedValue = V;
19676 ++NumVectorInstructions;
19677 return V;
19678 }
19679 case Instruction::FCmp:
19680 case Instruction::ICmp: {
19681 setInsertPointAfterBundle(E);
19682
19683 Value *L = vectorizeOperand(E, 0);
19684 Value *R = vectorizeOperand(E, 1);
19685 if (L->getType() != R->getType()) {
19686 assert((getOperandEntry(E, 0)->isGather() ||
19687 getOperandEntry(E, 1)->isGather() ||
19688 MinBWs.contains(getOperandEntry(E, 0)) ||
19689 MinBWs.contains(getOperandEntry(E, 1))) &&
19690 "Expected item in MinBWs.");
19691 if (cast<VectorType>(L->getType())
19692 ->getElementType()
19693 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19694 ->getElementType()
19695 ->getIntegerBitWidth()) {
19696 Type *CastTy = R->getType();
19697 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19698 } else {
19699 Type *CastTy = L->getType();
19700 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19701 }
19702 }
19703
19704 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19705 Value *V = Builder.CreateCmp(P0, L, R);
19706 propagateIRFlags(V, E->Scalars, VL0);
19707 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19708 ICmp->setSameSign(/*B=*/false);
19709 // Do not cast for cmps.
19710 VecTy = cast<FixedVectorType>(V->getType());
19711 V = FinalShuffle(V, E);
19712
19713 E->VectorizedValue = V;
19714 ++NumVectorInstructions;
19715 return V;
19716 }
19717 case Instruction::Select: {
19718 setInsertPointAfterBundle(E);
19719
19720 Value *Cond = vectorizeOperand(E, 0);
19721 Value *True = vectorizeOperand(E, 1);
19722 Value *False = vectorizeOperand(E, 2);
19723 if (True->getType() != VecTy || False->getType() != VecTy) {
19724 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19725 getOperandEntry(E, 2)->isGather() ||
19726 MinBWs.contains(getOperandEntry(E, 1)) ||
19727 MinBWs.contains(getOperandEntry(E, 2))) &&
19728 "Expected item in MinBWs.");
19729 if (True->getType() != VecTy)
19730 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19731 if (False->getType() != VecTy)
19732 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19733 }
19734
19735 unsigned CondNumElements = getNumElements(Cond->getType());
19736 unsigned TrueNumElements = getNumElements(True->getType());
19737 assert(TrueNumElements >= CondNumElements &&
19738 TrueNumElements % CondNumElements == 0 &&
19739 "Cannot vectorize Instruction::Select");
19740 assert(TrueNumElements == getNumElements(False->getType()) &&
19741 "Cannot vectorize Instruction::Select");
19742 if (CondNumElements != TrueNumElements) {
19743 // When the return type is i1 but the source is fixed vector type, we
19744 // need to duplicate the condition value.
19745 Cond = Builder.CreateShuffleVector(
19746 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19747 CondNumElements));
19748 }
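    // Illustrative example (REVEC): selecting between <4 x i32> values with a
    // <2 x i1> condition uses createReplicatedMask(2, 2) == {0, 0, 1, 1}, so
    // each scalar condition bit drives one whole sub-vector.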
19749 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19750 "Cannot vectorize Instruction::Select");
19751 Value *V =
19752 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
19753 V = FinalShuffle(V, E);
19754
19755 E->VectorizedValue = V;
19756 ++NumVectorInstructions;
19757 return V;
19758 }
19759 case Instruction::FNeg: {
19760 setInsertPointAfterBundle(E);
19761
19762 Value *Op = vectorizeOperand(E, 0);
19763
19764 Value *V = Builder.CreateUnOp(
19765 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19766 propagateIRFlags(V, E->Scalars, VL0);
19767 if (auto *I = dyn_cast<Instruction>(V))
19768 V = ::propagateMetadata(I, E->Scalars);
19769
19770 V = FinalShuffle(V, E);
19771
19772 E->VectorizedValue = V;
19773 ++NumVectorInstructions;
19774
19775 return V;
19776 }
19777 case Instruction::Freeze: {
19778 setInsertPointAfterBundle(E);
19779
19780 Value *Op = vectorizeOperand(E, 0);
19781
19782 if (Op->getType() != VecTy) {
19783 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19784 MinBWs.contains(getOperandEntry(E, 0))) &&
19785 "Expected item in MinBWs.");
19786 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19787 }
19788 Value *V = Builder.CreateFreeze(Op);
19789 V = FinalShuffle(V, E);
19790
19791 E->VectorizedValue = V;
19792 ++NumVectorInstructions;
19793
19794 return V;
19795 }
19796 case Instruction::Add:
19797 case Instruction::FAdd:
19798 case Instruction::Sub:
19799 case Instruction::FSub:
19800 case Instruction::Mul:
19801 case Instruction::FMul:
19802 case Instruction::UDiv:
19803 case Instruction::SDiv:
19804 case Instruction::FDiv:
19805 case Instruction::URem:
19806 case Instruction::SRem:
19807 case Instruction::FRem:
19808 case Instruction::Shl:
19809 case Instruction::LShr:
19810 case Instruction::AShr:
19811 case Instruction::And:
19812 case Instruction::Or:
19813 case Instruction::Xor: {
19814 setInsertPointAfterBundle(E);
19815
19816 Value *LHS = vectorizeOperand(E, 0);
19817 Value *RHS = vectorizeOperand(E, 1);
19818 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19819 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19820 ArrayRef<Value *> Ops = E->getOperand(I);
19821 if (all_of(Ops, [&](Value *Op) {
19822 auto *CI = dyn_cast<ConstantInt>(Op);
19823 return CI && CI->getValue().countr_one() >= It->second.first;
19824 })) {
19825 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19826 E->VectorizedValue = V;
19827 ++NumVectorInstructions;
19828 return V;
19829 }
19830 }
19831 }
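    // Illustrative example: if this node was minimized to i8 and one operand
    // is the constant 0xFF (at least 8 trailing ones), the 'and' leaves the
    // low 8 bits unchanged, so the other, already vectorized operand is
    // returned directly after the final shuffle.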
19832 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19833 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19834 getOperandEntry(E, 1)->isGather() ||
19835 MinBWs.contains(getOperandEntry(E, 0)) ||
19836 MinBWs.contains(getOperandEntry(E, 1))) &&
19837 "Expected item in MinBWs.");
19838 if (LHS->getType() != VecTy)
19839 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19840 if (RHS->getType() != VecTy)
19841 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19842 }
19843
19844 Value *V = Builder.CreateBinOp(
19845 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19846 RHS);
19847 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19848 if (auto *I = dyn_cast<Instruction>(V)) {
19849 V = ::propagateMetadata(I, E->Scalars);
19850 // Drop nuw flags for abs(sub(commutative), true).
19851 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19852 any_of(E->Scalars, [E](Value *V) {
19853 return isa<PoisonValue>(V) ||
19854 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
19855 isCommutative(cast<Instruction>(V));
19856 }))
19857 I->setHasNoUnsignedWrap(/*b=*/false);
19858 }
19859
19860 V = FinalShuffle(V, E);
19861
19862 E->VectorizedValue = V;
19863 ++NumVectorInstructions;
19864
19865 return V;
19866 }
19867 case Instruction::Load: {
19868 // Loads are inserted at the head of the tree because we don't want to
19869 // sink them all the way down past store instructions.
19870 setInsertPointAfterBundle(E);
19871
19872 LoadInst *LI = cast<LoadInst>(VL0);
19873 Instruction *NewLI;
19874 FixedVectorType *StridedLoadTy = nullptr;
19875 Value *PO = LI->getPointerOperand();
19876 if (E->State == TreeEntry::Vectorize) {
19877 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19878 } else if (E->State == TreeEntry::CompressVectorize) {
19879 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19880 CompressEntryToData.at(E);
19881 Align CommonAlignment = LI->getAlign();
19882 if (IsMasked) {
19883 unsigned VF = getNumElements(LoadVecTy);
19884 SmallVector<Constant *> MaskValues(
19885 VF / getNumElements(LI->getType()),
19886 ConstantInt::getFalse(VecTy->getContext()));
19887 for (int I : CompressMask)
19888 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19889 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19890 assert(SLPReVec && "Only supported by REVEC.");
19891 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19892 }
19893 Constant *MaskValue = ConstantVector::get(MaskValues);
19894 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19895 MaskValue);
19896 } else {
19897 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19898 }
19899 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19900 // TODO: include this cost into CommonCost.
19901 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19902 assert(SLPReVec && "FixedVectorType is not expected.");
19903 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19904 CompressMask);
19905 }
19906 NewLI =
19907 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19908 } else if (E->State == TreeEntry::StridedVectorize) {
19909 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19910 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19911 PO = IsReverseOrder ? PtrN : Ptr0;
19912 Type *StrideTy = DL->getIndexType(PO->getType());
19913 Value *StrideVal;
19914 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19915 StridedLoadTy = SPtrInfo.Ty;
19916 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19917 unsigned StridedLoadEC =
19918 StridedLoadTy->getElementCount().getKnownMinValue();
19919
19920 Value *Stride = SPtrInfo.StrideVal;
19921 if (!Stride) {
19922 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19923 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19924 SCEVExpander Expander(*SE, "strided-load-vec");
19925 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19926 &*Builder.GetInsertPoint());
19927 }
19928 Value *NewStride =
19929 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19930 StrideVal = Builder.CreateMul(
19931 NewStride, ConstantInt::get(
19932 StrideTy, (IsReverseOrder ? -1 : 1) *
19933 static_cast<int>(
19934 DL->getTypeAllocSize(ScalarTy))));
19935 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19936 auto *Inst = Builder.CreateIntrinsic(
19937 Intrinsic::experimental_vp_strided_load,
19938 {StridedLoadTy, PO->getType(), StrideTy},
19939 {PO, StrideVal,
19940 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19941 Builder.getInt32(StridedLoadEC)});
19942 Inst->addParamAttr(
19943 /*ArgNo=*/0,
19944 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19945 NewLI = Inst;
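      // Illustrative result (assuming a reversed <4 x i32> load of 4-byte
      // elements starting at the last pointer %pN):
      //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
      //            ptr align 4 %pN, i64 -4, <4 x i1> splat (i1 true), i32 4)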
19946 } else {
19947 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19948 Value *VecPtr = vectorizeOperand(E, 0);
19949 if (isa<FixedVectorType>(ScalarTy)) {
19950 assert(SLPReVec && "FixedVectorType is not expected.");
19951 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19952 // to expand VecPtr if ScalarTy is a vector type.
19953 unsigned ScalarTyNumElements =
19954 cast<FixedVectorType>(ScalarTy)->getNumElements();
19955 unsigned VecTyNumElements =
19956 cast<FixedVectorType>(VecTy)->getNumElements();
19957 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19958 "Cannot expand getelementptr.");
19959 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19960 SmallVector<Constant *> Indices(VecTyNumElements);
19961 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19962 return Builder.getInt64(I % ScalarTyNumElements);
19963 });
19964 VecPtr = Builder.CreateGEP(
19965 VecTy->getElementType(),
19966 Builder.CreateShuffleVector(
19967 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19968 ConstantVector::get(Indices));
19969 }
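      // Illustrative example (REVEC): with ScalarTy = <2 x i32> and a
      // <4 x i32> result, the two gathered pointers {p0, p1} are replicated to
      // {p0, p0, p1, p1} and offset by indices {0, 1, 0, 1}, producing one
      // lane pointer per element for the masked gather.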
19970 // Use the minimum alignment of the gathered loads.
19971 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19972 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19973 }
19974 Value *V = E->State == TreeEntry::CompressVectorize
19975 ? NewLI
19976 : ::propagateMetadata(NewLI, E->Scalars);
19977
19978 if (StridedLoadTy != VecTy)
19979 V = Builder.CreateBitOrPointerCast(V, VecTy);
19980 V = FinalShuffle(V, E);
19981 E->VectorizedValue = V;
19982 ++NumVectorInstructions;
19983 return V;
19984 }
19985 case Instruction::Store: {
19986 auto *SI = cast<StoreInst>(VL0);
19987
19988 setInsertPointAfterBundle(E);
19989
19990 Value *VecValue = vectorizeOperand(E, 0);
19991 if (VecValue->getType() != VecTy)
19992 VecValue =
19993 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19994 VecValue = FinalShuffle(VecValue, E);
19995
19996 Value *Ptr = SI->getPointerOperand();
19997 Instruction *ST;
19998 if (E->State == TreeEntry::Vectorize) {
19999 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
20000 } else {
20001 assert(E->State == TreeEntry::StridedVectorize &&
20002 "Expected either strided or consecutive stores.");
20003 if (!E->ReorderIndices.empty()) {
20004 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
20005 Ptr = SI->getPointerOperand();
20006 }
20007 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
20008 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
20009 auto *Inst = Builder.CreateIntrinsic(
20010 Intrinsic::experimental_vp_strided_store,
20011 {VecTy, Ptr->getType(), StrideTy},
20012 {VecValue, Ptr,
20013 ConstantInt::get(
20014 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20015 Builder.getAllOnesMask(VecTy->getElementCount()),
20016 Builder.getInt32(E->Scalars.size())});
20017 Inst->addParamAttr(
20018 /*ArgNo=*/1,
20019 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20020 ST = Inst;
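      // Illustrative result (assuming four i32 values stored in reversed
      // order):
      //   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
      //       <4 x i32> %vec, ptr align 4 %p, i64 -4,
      //       <4 x i1> splat (i1 true), i32 4)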
20021 }
20022
20023 Value *V = ::propagateMetadata(ST, E->Scalars);
20024
20025 E->VectorizedValue = V;
20026 ++NumVectorInstructions;
20027 return V;
20028 }
20029 case Instruction::GetElementPtr: {
20030 auto *GEP0 = cast<GetElementPtrInst>(VL0);
20031 setInsertPointAfterBundle(E);
20032
20033 Value *Op0 = vectorizeOperand(E, 0);
20034
20035 SmallVector<Value *> OpVecs;
20036 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20037 Value *OpVec = vectorizeOperand(E, J);
20038 OpVecs.push_back(OpVec);
20039 }
20040
20041 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20042 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
20043 SmallVector<Value *> GEPs;
20044 for (Value *V : E->Scalars) {
20045 if (isa<GetElementPtrInst>(V))
20046 GEPs.push_back(V);
20047 }
20048 V = ::propagateMetadata(I, GEPs);
20049 }
20050
20051 V = FinalShuffle(V, E);
20052
20053 E->VectorizedValue = V;
20054 ++NumVectorInstructions;
20055
20056 return V;
20057 }
20058 case Instruction::Call: {
20059 CallInst *CI = cast<CallInst>(VL0);
20060 setInsertPointAfterBundle(E);
20061
20062 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20063
20064 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20065 CI, ID, VecTy->getNumElements(),
20066 It != MinBWs.end() ? It->second.first : 0, TTI);
20067 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
20068 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20069 VecCallCosts.first <= VecCallCosts.second;
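    // Illustrative example: a call to sinf recognized as llvm.sin is emitted
    // as the intrinsic (e.g. llvm.sin.v4f32) when its cost is no higher than a
    // vector library call; otherwise VFDatabase is queried below for a
    // vectorized function provided by a vector math library.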
20070
20071 Value *ScalarArg = nullptr;
20072 SmallVector<Value *> OpVecs;
20073 SmallVector<Type *, 2> TysForDecl;
20074 // Add return type if intrinsic is overloaded on it.
20075 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
20076 TysForDecl.push_back(VecTy);
20077 auto *CEI = cast<CallInst>(VL0);
20078 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
20079 // Some intrinsics have scalar arguments. This argument should not be
20080 // vectorized.
20081 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
20082 ScalarArg = CEI->getArgOperand(I);
20083 // If the bitwidth of the abs intrinsic was reduced, its second argument
20084 // must be set to false (do not return poison if the value is signed min).
20085 if (ID == Intrinsic::abs && It != MinBWs.end() &&
20086 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
20087 ScalarArg = Builder.getFalse();
20088 OpVecs.push_back(ScalarArg);
20089 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20090 TysForDecl.push_back(ScalarArg->getType());
20091 continue;
20092 }
20093
20094 Value *OpVec = vectorizeOperand(E, I);
20095 ScalarArg = CEI->getArgOperand(I);
20096 if (cast<VectorType>(OpVec->getType())->getElementType() !=
20097 ScalarArg->getType()->getScalarType() &&
20098 It == MinBWs.end()) {
20099 auto *CastTy =
20100 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
20101 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
20102 } else if (It != MinBWs.end()) {
20103 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
20104 }
20105 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
20106 OpVecs.push_back(OpVec);
20107 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20108 TysForDecl.push_back(OpVec->getType());
20109 }
20110
20111 Function *CF;
20112 if (!UseIntrinsic) {
20113 VFShape Shape =
20114 VFShape::get(CI->getFunctionType(),
20115 ElementCount::getFixed(VecTy->getNumElements()),
20116 false /*HasGlobalPred*/);
20117 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
20118 } else {
20119 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
20120 }
20121
20122 SmallVector<OperandBundleDef, 1> OpBundles;
20123 CI->getOperandBundlesAsDefs(OpBundles);
20124 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
20125
20126 propagateIRFlags(V, E->Scalars, VL0);
20127 V = FinalShuffle(V, E);
20128
20129 E->VectorizedValue = V;
20130 ++NumVectorInstructions;
20131 return V;
20132 }
20133 case Instruction::ShuffleVector: {
20134 Value *V;
20135 if (SLPReVec && !E->isAltShuffle()) {
20136 setInsertPointAfterBundle(E);
20137 Value *Src = vectorizeOperand(E, 0);
20138 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
20139 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
20140 SmallVector<int> NewMask(ThisMask.size());
20141 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
20142 return SVSrc->getShuffleMask()[Mask];
20143 });
20144 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
20145 SVSrc->getOperand(1), NewMask);
20146 } else {
20147 V = Builder.CreateShuffleVector(Src, ThisMask);
20148 }
20149 propagateIRFlags(V, E->Scalars, VL0);
20150 if (auto *I = dyn_cast<Instruction>(V))
20151 V = ::propagateMetadata(I, E->Scalars);
20152 V = FinalShuffle(V, E);
20153 } else {
20154 assert(E->isAltShuffle() &&
20155 ((Instruction::isBinaryOp(E->getOpcode()) &&
20156 Instruction::isBinaryOp(E->getAltOpcode())) ||
20157 (Instruction::isCast(E->getOpcode()) &&
20158 Instruction::isCast(E->getAltOpcode())) ||
20159 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
20160 "Invalid Shuffle Vector Operand");
20161
20162 Value *LHS = nullptr, *RHS = nullptr;
20163 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
20164 setInsertPointAfterBundle(E);
20165 LHS = vectorizeOperand(E, 0);
20166 RHS = vectorizeOperand(E, 1);
20167 } else {
20168 setInsertPointAfterBundle(E);
20169 LHS = vectorizeOperand(E, 0);
20170 }
20171 if (LHS && RHS &&
20172 ((Instruction::isBinaryOp(E->getOpcode()) &&
20173 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
20174 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
20175 assert((It != MinBWs.end() ||
20176 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
20177 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
20178 MinBWs.contains(getOperandEntry(E, 0)) ||
20179 MinBWs.contains(getOperandEntry(E, 1))) &&
20180 "Expected item in MinBWs.");
20181 Type *CastTy = VecTy;
20182 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20183 if (cast<VectorType>(LHS->getType())
20184 ->getElementType()
20185 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20186 ->getElementType()
20187 ->getIntegerBitWidth())
20188 CastTy = RHS->getType();
20189 else
20190 CastTy = LHS->getType();
20191 }
20192 if (LHS->getType() != CastTy)
20193 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20194 if (RHS->getType() != CastTy)
20195 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20196 }
20197
20198 Value *V0, *V1;
20199 if (Instruction::isBinaryOp(E->getOpcode())) {
20200 V0 = Builder.CreateBinOp(
20201 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20202 V1 = Builder.CreateBinOp(
20203 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20204 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20205 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20206 auto *AltCI = cast<CmpInst>(E->getAltOp());
20207 CmpInst::Predicate AltPred = AltCI->getPredicate();
20208 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20209 } else {
20210 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20211 unsigned SrcBWSz = DL->getTypeSizeInBits(
20212 cast<VectorType>(LHS->getType())->getElementType());
20213 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20214 if (BWSz <= SrcBWSz) {
20215 if (BWSz < SrcBWSz)
20216 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20217 assert(LHS->getType() == VecTy &&
20218 "Expected same type as operand.");
20219 if (auto *I = dyn_cast<Instruction>(LHS))
20220 LHS = ::propagateMetadata(I, E->Scalars);
20221 LHS = FinalShuffle(LHS, E);
20222 E->VectorizedValue = LHS;
20223 ++NumVectorInstructions;
20224 return LHS;
20225 }
20226 }
20227 V0 = Builder.CreateCast(
20228 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20229 V1 = Builder.CreateCast(
20230 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20231 }
20232 // Add V0 and V1 to later analysis to try to find and remove matching
20233 // instruction, if any.
20234 for (Value *V : {V0, V1}) {
20235 if (auto *I = dyn_cast<Instruction>(V)) {
20236 GatherShuffleExtractSeq.insert(I);
20237 CSEBlocks.insert(I->getParent());
20238 }
20239 }
20240
20241 // Create shuffle to take alternate operations from the vector.
20242 // Also, gather up main and alt scalar ops to propagate IR flags to
20243 // each vector operation.
20244 ValueList OpScalars, AltScalars;
20245 SmallVector<int> Mask;
20246 E->buildAltOpShuffleMask(
20247 [E, this](Instruction *I) {
20248 assert(E->getMatchingMainOpOrAltOp(I) &&
20249 "Unexpected main/alternate opcode");
20250 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20251 *TLI);
20252 },
20253 Mask, &OpScalars, &AltScalars);
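      // Illustrative example: for scalars {a+b, c-d, e+f, g-h} with main
      // opcode Add and alternate opcode Sub, V0 is the vector add, V1 the
      // vector sub, and the blend mask is {0, 5, 2, 7} (a lane taken from V1
      // is encoded as VF + lane).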
20254
20255 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20256 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20257 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20258 // Drop nuw flags for abs(sub(commutative), true).
20259 if (auto *I = dyn_cast<Instruction>(Vec);
20260 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20261 any_of(E->Scalars, [E](Value *V) {
20262 if (isa<PoisonValue>(V))
20263 return false;
20264 if (E->hasCopyableElements() && E->isCopyableElement(V))
20265 return false;
20266 auto *IV = cast<Instruction>(V);
20267 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20268 }))
20269 I->setHasNoUnsignedWrap(/*b=*/false);
20270 };
20271 DropNuwFlag(V0, E->getOpcode());
20272 DropNuwFlag(V1, E->getAltOpcode());
20273
20274 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20275 assert(SLPReVec && "FixedVectorType is not expected.");
20276 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20277 }
20278 V = Builder.CreateShuffleVector(V0, V1, Mask);
20279 if (auto *I = dyn_cast<Instruction>(V)) {
20280 V = ::propagateMetadata(I, E->Scalars);
20281 GatherShuffleExtractSeq.insert(I);
20282 CSEBlocks.insert(I->getParent());
20283 }
20284 }
20285
20286 E->VectorizedValue = V;
20287 ++NumVectorInstructions;
20288
20289 return V;
20290 }
20291 default:
20292 llvm_unreachable("unknown inst");
20293 }
20294 return nullptr;
20295}
20296
20297Value *BoUpSLP::vectorizeTree() {
20298 ExtraValueToDebugLocsMap ExternallyUsedValues;
20299 return vectorizeTree(ExternallyUsedValues);
20300}
20301
20302Value *BoUpSLP::vectorizeTree(
20303 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20304 Instruction *ReductionRoot,
20305 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20306 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
20307 // need to rebuild it.
20308 EntryToLastInstruction.clear();
20309 // All blocks must be scheduled before any instructions are inserted.
20310 for (auto &BSIter : BlocksSchedules)
20311 scheduleBlock(*this, BSIter.second.get());
20312 // Cache last instructions for the nodes to avoid side effects, which may
20313 // appear during vectorization, like extra uses, etc.
20314 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20315 if (TE->isGather())
20316 continue;
20317 (void)getLastInstructionInBundle(TE.get());
20318 }
20319
20320 if (ReductionRoot)
20321 Builder.SetInsertPoint(ReductionRoot->getParent(),
20322 ReductionRoot->getIterator());
20323 else
20324 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20325
20326 // Vectorize gather operands of the nodes with the external uses only.
20327 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20328 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20329 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20330 TE->UserTreeIndex.UserTE->hasState() &&
20331 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20332 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20333 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20334 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20335 all_of(TE->UserTreeIndex.UserTE->Scalars,
20336 [](Value *V) { return isUsedOutsideBlock(V); })) {
20337 Instruction &LastInst =
20338 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20339 GatherEntries.emplace_back(TE.get(), &LastInst);
20340 }
20341 }
20342 for (auto &Entry : GatherEntries) {
20343 IRBuilderBase::InsertPointGuard Guard(Builder);
20344 Builder.SetInsertPoint(Entry.second);
20345 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20346 (void)vectorizeTree(Entry.first);
20347 }
20348 // Emit gathered loads first to emit better code for the users of those
20349 // gathered loads.
20350 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20351 if (GatheredLoadsEntriesFirst.has_value() &&
20352 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20353 (!TE->isGather() || TE->UserTreeIndex)) {
20354 assert((TE->UserTreeIndex ||
20355 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20356 "Expected gathered load node.");
20357 (void)vectorizeTree(TE.get());
20358 }
20359 }
20360 (void)vectorizeTree(VectorizableTree[0].get());
20361 // Run through the list of postponed gathers and emit them, replacing the temp
20362 // emitted allocas with actual vector instructions.
20363 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20364 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20365 for (const TreeEntry *E : PostponedNodes) {
20366 auto *TE = const_cast<TreeEntry *>(E);
20367 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20368 TE->VectorizedValue = nullptr;
20369 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20370 // If the user is a PHI node, its vector code has to be inserted right
20371 // before the block terminator. Since the node was delayed, there were some
20372 // unresolved dependencies at the moment the stub instruction was emitted.
20373 // If any of these dependencies turns out to be an operand of another PHI
20374 // coming from this same block, the position of the stub instruction becomes
20375 // invalid, because the source vector that is supposed to feed this gather
20376 // node was inserted at the end of the block [after the stub instruction].
20377 // So we need to adjust the insertion point again, to the end of the block.
20378 if (isa<PHINode>(UserI) ||
20379 (TE->UserTreeIndex.UserTE->hasState() &&
20380 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20381 // Insert before all users.
20382 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20383 for (User *U : PrevVec->users()) {
20384 if (U == UserI)
20385 continue;
20386 auto *UI = dyn_cast<Instruction>(U);
20387 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20388 continue;
20389 if (UI->comesBefore(InsertPt))
20390 InsertPt = UI;
20391 }
20392 Builder.SetInsertPoint(InsertPt);
20393 } else {
20394 Builder.SetInsertPoint(PrevVec);
20395 }
20396 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20397 Value *Vec = vectorizeTree(TE);
20398 if (auto *VecI = dyn_cast<Instruction>(Vec);
20399 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20400 Builder.GetInsertPoint()->comesBefore(VecI))
20401 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20402 Builder.GetInsertPoint());
20403 if (Vec->getType() != PrevVec->getType()) {
20404 assert(Vec->getType()->isIntOrIntVectorTy() &&
20405 PrevVec->getType()->isIntOrIntVectorTy() &&
20406 "Expected integer vector types only.");
20407 std::optional<bool> IsSigned;
20408 for (Value *V : TE->Scalars) {
20409 if (isVectorized(V)) {
20410 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20411 auto It = MinBWs.find(MNTE);
20412 if (It != MinBWs.end()) {
20413 IsSigned = IsSigned.value_or(false) || It->second.second;
20414 if (*IsSigned)
20415 break;
20416 }
20417 }
20418 if (IsSigned.value_or(false))
20419 break;
20420 // Scan through gather nodes.
20421 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20422 auto It = MinBWs.find(BVE);
20423 if (It != MinBWs.end()) {
20424 IsSigned = IsSigned.value_or(false) || It->second.second;
20425 if (*IsSigned)
20426 break;
20427 }
20428 }
20429 if (IsSigned.value_or(false))
20430 break;
20431 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20432 IsSigned =
20433 IsSigned.value_or(false) ||
20434 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20435 continue;
20436 }
20437 if (IsSigned.value_or(false))
20438 break;
20439 }
20440 }
20441 if (IsSigned.value_or(false)) {
20442 // Final attempt - check user node.
20443 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20444 if (It != MinBWs.end())
20445 IsSigned = It->second.second;
20446 }
20447 assert(IsSigned &&
20448 "Expected user node or perfect diamond match in MinBWs.");
20449 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20450 }
20451 PrevVec->replaceAllUsesWith(Vec);
20452 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20453 // Replace the stub vector node, if it was already used for one of the
20454 // buildvector nodes.
20455 auto It = PostponedValues.find(PrevVec);
20456 if (It != PostponedValues.end()) {
20457 for (TreeEntry *VTE : It->getSecond())
20458 VTE->VectorizedValue = Vec;
20459 }
20460 eraseInstruction(PrevVec);
20461 }
20462
20463 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20464 << " values .\n");
20465
20466 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20467 // Maps vector instruction to original insertelement instruction
20468 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20469 // Maps extract Scalar to the corresponding extractelement instruction in the
20470 // basic block. Only one extractelement per block should be emitted.
20471 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20472 ScalarToEEs;
20473 SmallDenseSet<Value *, 4> UsedInserts;
20474 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20475 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20476 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20477 // Extract all of the elements with the external uses.
20478 for (const auto &ExternalUse : ExternalUses) {
20479 Value *Scalar = ExternalUse.Scalar;
20480 llvm::User *User = ExternalUse.User;
20481
20482 // Skip users that we already RAUW. This happens when one instruction
20483 // has multiple uses of the same value.
20484 if (User && !is_contained(Scalar->users(), User))
20485 continue;
20486 const TreeEntry *E = &ExternalUse.E;
20487 assert(E && "Invalid scalar");
20488 assert(!E->isGather() && "Extracting from a gather list");
20489 // Non-instruction pointers are not deleted, just skip them.
20490 if (E->getOpcode() == Instruction::GetElementPtr &&
20491 !isa<GetElementPtrInst>(Scalar))
20492 continue;
20493
20494 Value *Vec = E->VectorizedValue;
20495 assert(Vec && "Can't find vectorizable value");
20496
20497 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20498 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20499 if (Scalar->getType() != Vec->getType()) {
20500 Value *Ex = nullptr;
20501 Value *ExV = nullptr;
20502 auto *Inst = dyn_cast<Instruction>(Scalar);
20503 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20504 auto It = ScalarToEEs.find(Scalar);
20505 if (It != ScalarToEEs.end()) {
20506 // No need to emit many extracts, just move the only one in the
20507 // current block.
20508 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20509 : Builder.GetInsertBlock());
20510 if (EEIt != It->second.end()) {
20511 Value *PrevV = EEIt->second.first;
20512 if (auto *I = dyn_cast<Instruction>(PrevV);
20513 I && !ReplaceInst &&
20514 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20515 Builder.GetInsertPoint()->comesBefore(I)) {
20516 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20517 Builder.GetInsertPoint());
20518 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20519 CI->moveAfter(I);
20520 }
20521 Ex = PrevV;
20522 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20523 }
20524 }
20525 if (!Ex) {
20526 // "Reuse" the existing extract to improve final codegen.
20527 if (ReplaceInst) {
20528 // Leave the instruction as is if it is the cheaper extract and
20529 // all of its operands are scalar.
20530 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20531 IgnoredExtracts.insert(EE);
20532 Ex = EE;
20533 } else {
20534 auto *CloneInst = Inst->clone();
20535 CloneInst->insertBefore(Inst->getIterator());
20536 if (Inst->hasName())
20537 CloneInst->takeName(Inst);
20538 Ex = CloneInst;
20539 }
20540 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20541 ES && isa<Instruction>(Vec)) {
20542 Value *V = ES->getVectorOperand();
20543 auto *IVec = cast<Instruction>(Vec);
20544 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20545 V = ETEs.front()->VectorizedValue;
20546 if (auto *IV = dyn_cast<Instruction>(V);
20547 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20548 IV->comesBefore(IVec))
20549 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20550 else
20551 Ex = Builder.CreateExtractElement(Vec, Lane);
20552 } else if (auto *VecTy =
20553 dyn_cast<FixedVectorType>(Scalar->getType())) {
20554 assert(SLPReVec && "FixedVectorType is not expected.");
20555 unsigned VecTyNumElements = VecTy->getNumElements();
20556 // When REVEC is enabled, we need to extract a vector.
20557 // Note: The element size of Scalar may be different from the
20558 // element size of Vec.
20559 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20560 ExternalUse.Lane * VecTyNumElements);
20561 } else {
20562 Ex = Builder.CreateExtractElement(Vec, Lane);
20563 }
20564 // If necessary, sign-extend or zero-extend ScalarRoot
20565 // to the larger type.
20566 ExV = Ex;
20567 if (Scalar->getType() != Ex->getType())
20568 ExV = Builder.CreateIntCast(
20569 Ex, Scalar->getType(),
20570 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20571 auto *I = dyn_cast<Instruction>(Ex);
20572 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20573 : &F->getEntryBlock(),
20574 std::make_pair(Ex, ExV));
20575 }
20576 // The then branch of the previous if may produce constants, since
20577 // operand 0 might be a constant.
20578 if (auto *ExI = dyn_cast<Instruction>(Ex);
20579 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20580 GatherShuffleExtractSeq.insert(ExI);
20581 CSEBlocks.insert(ExI->getParent());
20582 }
20583 return ExV;
20584 }
20585 assert(isa<FixedVectorType>(Scalar->getType()) &&
20586 isa<InsertElementInst>(Scalar) &&
20587 "In-tree scalar of vector type is not insertelement?");
20588 auto *IE = cast<InsertElementInst>(Scalar);
20589 VectorToInsertElement.try_emplace(Vec, IE);
20590 return Vec;
20591 };
20592 // If User == nullptr, the Scalar remains as scalar in vectorized
20593 // instructions or is used as extra arg. Generate ExtractElement instruction
20594 // and update the record for this scalar in ExternallyUsedValues.
20595 if (!User) {
20596 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20597 continue;
20598 assert(
20599 (ExternallyUsedValues.count(Scalar) ||
20600 ExternalUsesWithNonUsers.count(Scalar) ||
20601 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20602 any_of(
20603 Scalar->users(),
20604 [&, TTI = TTI](llvm::User *U) {
20605 if (ExternalUsesAsOriginalScalar.contains(U))
20606 return true;
20607 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20608 return !UseEntries.empty() &&
20609 (E->State == TreeEntry::Vectorize ||
20610 E->State == TreeEntry::StridedVectorize ||
20611 E->State == TreeEntry::CompressVectorize) &&
20612 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20613 return (UseEntry->State == TreeEntry::Vectorize ||
20614 UseEntry->State ==
20615 TreeEntry::StridedVectorize ||
20616 UseEntry->State ==
20617 TreeEntry::CompressVectorize) &&
20618 doesInTreeUserNeedToExtract(
20619 Scalar, getRootEntryInstruction(*UseEntry),
20620 TLI, TTI);
20621 });
20622 })) &&
20623 "Scalar with nullptr User must be registered in "
20624 "ExternallyUsedValues map or remain as scalar in vectorized "
20625 "instructions");
20626 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20627 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20628 if (PHI->getParent()->isLandingPad())
20629 Builder.SetInsertPoint(
20630 PHI->getParent(),
20631 std::next(
20632 PHI->getParent()->getLandingPadInst()->getIterator()));
20633 else
20634 Builder.SetInsertPoint(PHI->getParent(),
20635 PHI->getParent()->getFirstNonPHIIt());
20636 } else {
20637 Builder.SetInsertPoint(VecI->getParent(),
20638 std::next(VecI->getIterator()));
20639 }
20640 } else {
20641 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20642 }
20643 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20644 // Required to update internally referenced instructions.
20645 if (Scalar != NewInst) {
20646 assert((!isa<ExtractElementInst>(Scalar) ||
20647 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20648 "Extractelements should not be replaced.");
20649 Scalar->replaceAllUsesWith(NewInst);
20650 }
20651 continue;
20652 }
20653
20654 if (auto *VU = dyn_cast<InsertElementInst>(User);
20655 VU && VU->getOperand(1) == Scalar) {
20656 // Skip if the scalar is another vector op or Vec is not an instruction.
20657 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20658 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20659 if (!UsedInserts.insert(VU).second)
20660 continue;
20661 // Need to use original vector, if the root is truncated.
20662 auto BWIt = MinBWs.find(E);
20663 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20664 auto *ScalarTy = FTy->getElementType();
20665 auto Key = std::make_pair(Vec, ScalarTy);
20666 auto VecIt = VectorCasts.find(Key);
20667 if (VecIt == VectorCasts.end()) {
20668 IRBuilderBase::InsertPointGuard Guard(Builder);
20669 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20670 if (IVec->getParent()->isLandingPad())
20671 Builder.SetInsertPoint(IVec->getParent(),
20672 std::next(IVec->getParent()
20673 ->getLandingPadInst()
20674 ->getIterator()));
20675 else
20676 Builder.SetInsertPoint(
20677 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20678 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20679 Builder.SetInsertPoint(IVec->getNextNode());
20680 }
20681 Vec = Builder.CreateIntCast(
20682 Vec,
20683 getWidenedType(
20684 ScalarTy,
20685 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20686 BWIt->second.second);
20687 VectorCasts.try_emplace(Key, Vec);
20688 } else {
20689 Vec = VecIt->second;
20690 }
20691 }
20692
20693 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20694 if (InsertIdx) {
20695 auto *It = find_if(
20696 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20697 // Checks if 2 insertelements are from the same buildvector.
20698 InsertElementInst *VecInsert = Data.InsertElements.front();
20699 return areTwoInsertFromSameBuildVector(
20700 VU, VecInsert,
20701 [](InsertElementInst *II) { return II->getOperand(0); });
20702 });
20703 unsigned Idx = *InsertIdx;
20704 if (It == ShuffledInserts.end()) {
20705 (void)ShuffledInserts.emplace_back();
20706 It = std::next(ShuffledInserts.begin(),
20707 ShuffledInserts.size() - 1);
20708 }
20709 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20710 if (Mask.empty())
20711 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20712 Mask[Idx] = ExternalUse.Lane;
20713 It->InsertElements.push_back(cast<InsertElementInst>(User));
20714 continue;
20715 }
20716 }
20717 }
20718 }
20719
20720 // Generate extracts for out-of-tree users.
20721 // Find the insertion point for the extractelement lane.
20722 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20723 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20724 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20725 if (PH->getIncomingValue(I) == Scalar) {
20726 Instruction *IncomingTerminator =
20727 PH->getIncomingBlock(I)->getTerminator();
20728 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20729 Builder.SetInsertPoint(VecI->getParent(),
20730 std::next(VecI->getIterator()));
20731 } else {
20732 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20733 }
20734 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20735 PH->setOperand(I, NewInst);
20736 }
20737 }
20738 } else {
20739 Builder.SetInsertPoint(cast<Instruction>(User));
20740 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20741 User->replaceUsesOfWith(Scalar, NewInst);
20742 }
20743 } else {
20744 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20745 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20746 User->replaceUsesOfWith(Scalar, NewInst);
20747 }
20748
20749 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20750 }
20751
20752 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20753 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20754 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20755 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20756 for (int I = 0, E = Mask.size(); I < E; ++I) {
20757 if (Mask[I] < VF)
20758 CombinedMask1[I] = Mask[I];
20759 else
20760 CombinedMask2[I] = Mask[I] - VF;
20761 }
20762 ShuffleInstructionBuilder ShuffleBuilder(
20763 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20764 ShuffleBuilder.add(V1, CombinedMask1);
20765 if (V2)
20766 ShuffleBuilder.add(V2, CombinedMask2);
20767 return ShuffleBuilder.finalize({}, {}, {});
20768 };
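// For example, with VF(V1) == 4 and Mask == <0, 5, 2, 7>, CreateShuffle above
// builds CombinedMask1 == <0, poison, 2, poison> (lanes read from V1) and
// CombinedMask2 == <poison, 1, poison, 3> (lanes read from V2, rebased by -VF),
// and ShuffleInstructionBuilder combines both into the final shuffle.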
20769
20770 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20771 bool ForSingleMask) {
20772 unsigned VF = Mask.size();
20773 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20774 if (VF != VecVF) {
20775 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20776 Vec = CreateShuffle(Vec, nullptr, Mask);
20777 return std::make_pair(Vec, true);
20778 }
20779 if (!ForSingleMask) {
20780 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20781 for (unsigned I = 0; I < VF; ++I) {
20782 if (Mask[I] != PoisonMaskElem)
20783 ResizeMask[Mask[I]] = Mask[I];
20784 }
20785 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20786 }
20787 }
20788
20789 return std::make_pair(Vec, false);
20790 };
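// ResizeToVF adapts Vec when its vector factor differs from the mask width: if
// the mask already refers to lanes at or beyond the mask width, Vec is
// reshuffled with that mask directly and the returned flag is true; otherwise,
// unless a single mask was requested, Vec is resized with an identity-style
// mask built from the defined lanes and the flag is false.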
20791 // Perform shuffling of the vectorized tree entries for better handling of
20792 // external extracts.
20793 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20794 // Find the first and the last instruction in the list of insertelements.
20795 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20796 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20797 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20798 Builder.SetInsertPoint(LastInsert);
20799 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20800 Value *NewInst = performExtractsShuffleAction<Value>(
20801 MutableArrayRef(Vector.data(), Vector.size()),
20802 FirstInsert->getOperand(0),
20803 [](Value *Vec) {
20804 return cast<VectorType>(Vec->getType())
20805 ->getElementCount()
20806 .getKnownMinValue();
20807 },
20808 ResizeToVF,
20809 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20810 ArrayRef<Value *> Vals) {
20811 assert((Vals.size() == 1 || Vals.size() == 2) &&
20812 "Expected exactly 1 or 2 input values.");
20813 if (Vals.size() == 1) {
20814 // Do not create shuffle if the mask is a simple identity
20815 // non-resizing mask.
20816 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20817 ->getNumElements() ||
20818 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20819 return CreateShuffle(Vals.front(), nullptr, Mask);
20820 return Vals.front();
20821 }
20822 return CreateShuffle(Vals.front() ? Vals.front()
20823 : FirstInsert->getOperand(0),
20824 Vals.back(), Mask);
20825 });
20826 auto It = ShuffledInserts[I].InsertElements.rbegin();
20827 // Rebuild buildvector chain.
20828 InsertElementInst *II = nullptr;
20829 if (It != ShuffledInserts[I].InsertElements.rend())
20830 II = *It;
20831 SmallVector<Instruction *> Inserts;
20832 while (It != ShuffledInserts[I].InsertElements.rend()) {
20833 assert(II && "Must be an insertelement instruction.");
20834 if (*It == II)
20835 ++It;
20836 else
20837 Inserts.push_back(cast<Instruction>(II));
20838 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20839 }
20840 for (Instruction *II : reverse(Inserts)) {
20841 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20842 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20843 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20844 II->moveAfter(NewI);
20845 NewInst = II;
20846 }
20847 LastInsert->replaceAllUsesWith(NewInst);
20848 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20849 IE->replaceUsesOfWith(IE->getOperand(0),
20850 PoisonValue::get(IE->getOperand(0)->getType()));
20851 IE->replaceUsesOfWith(IE->getOperand(1),
20852 PoisonValue::get(IE->getOperand(1)->getType()));
20853 eraseInstruction(IE);
20854 }
20855 CSEBlocks.insert(LastInsert->getParent());
20856 }
20857
20858 SmallVector<Instruction *> RemovedInsts;
20859 // For each vectorized value:
20860 for (auto &TEPtr : VectorizableTree) {
20861 TreeEntry *Entry = TEPtr.get();
20862
20863 // No need to handle users of gathered values.
20864 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20865 continue;
20866
20867 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20868
20869 // For each lane:
20870 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20871 Value *Scalar = Entry->Scalars[Lane];
20872
20873 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20874 !isa<GetElementPtrInst>(Scalar))
20875 continue;
20876 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20877 EE && IgnoredExtracts.contains(EE))
20878 continue;
20879 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20880 continue;
20881#ifndef NDEBUG
20882 Type *Ty = Scalar->getType();
20883 if (!Ty->isVoidTy()) {
20884 for (User *U : Scalar->users()) {
20885 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20886
20887 // It is legal to delete users in the ignorelist.
20888 assert((isVectorized(U) ||
20889 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20892 "Deleting out-of-tree value");
20893 }
20894 }
20895#endif
20896 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20897 auto *I = cast<Instruction>(Scalar);
20898 RemovedInsts.push_back(I);
20899 }
20900 }
20901
20902 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20903 // new vector instruction.
20904 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20905 V->mergeDIAssignID(RemovedInsts);
20906
20907 // Clear up reduction references, if any.
20908 if (UserIgnoreList) {
20909 for (Instruction *I : RemovedInsts) {
20910 const TreeEntry *IE = getTreeEntries(I).front();
20911 if (IE->Idx != 0 &&
20912 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20913 (ValueToGatherNodes.lookup(I).contains(
20914 VectorizableTree.front().get()) ||
20915 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20916 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20917 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20918 IE->UserTreeIndex &&
20919 is_contained(VectorizableTree.front()->Scalars, I)) &&
20920 !(GatheredLoadsEntriesFirst.has_value() &&
20921 IE->Idx >= *GatheredLoadsEntriesFirst &&
20922 VectorizableTree.front()->isGather() &&
20923 is_contained(VectorizableTree.front()->Scalars, I)) &&
20924 !(!VectorizableTree.front()->isGather() &&
20925 VectorizableTree.front()->isCopyableElement(I)))
20926 continue;
20927 SmallVector<SelectInst *> LogicalOpSelects;
20928 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20929 // Do not replace the condition of a logical op in the form select <cond>.
20930 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20931 (match(U.getUser(), m_LogicalAnd()) ||
20932 match(U.getUser(), m_LogicalOr())) &&
20933 U.getOperandNo() == 0;
20934 if (IsPoisoningLogicalOp) {
20935 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20936 return false;
20937 }
20938 return UserIgnoreList->contains(U.getUser());
20939 });
20940 // Replace conditions of the poisoning logical ops with the non-poison
20941 // constant value.
20942 for (SelectInst *SI : LogicalOpSelects)
20943 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20944 }
20945 }
20946 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20947 // cache correctness.
20948 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
20949 // - instructions are not deleted until later.
20950 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20951
20952 Builder.ClearInsertionPoint();
20953 InstrElementSize.clear();
20954
20955 const TreeEntry &RootTE = *VectorizableTree.front();
20956 Value *Vec = RootTE.VectorizedValue;
20957 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20958 It != MinBWs.end() &&
20959 ReductionBitWidth != It->second.first) {
20960 IRBuilder<>::InsertPointGuard Guard(Builder);
20961 Builder.SetInsertPoint(ReductionRoot->getParent(),
20962 ReductionRoot->getIterator());
20963 Vec = Builder.CreateIntCast(
20964 Vec,
20965 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20966 cast<VectorType>(Vec->getType())->getElementCount()),
20967 It->second.second);
20968 }
20969 return Vec;
20970}
20971
20972void BoUpSLP::optimizeGatherSequence() {
20973 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20974 << " gather sequences instructions.\n");
20975 // LICM InsertElementInst sequences.
20976 for (Instruction *I : GatherShuffleExtractSeq) {
20977 if (isDeleted(I))
20978 continue;
20979
20980 // Check if this block is inside a loop.
20981 Loop *L = LI->getLoopFor(I->getParent());
20982 if (!L)
20983 continue;
20984
20985 // Check if it has a preheader.
20986 BasicBlock *PreHeader = L->getLoopPreheader();
20987 if (!PreHeader)
20988 continue;
20989
20990 // If the vector or the element that we insert into it are
20991 // instructions that are defined in this basic block then we can't
20992 // hoist this instruction.
20993 if (any_of(I->operands(), [L](Value *V) {
20994 auto *OpI = dyn_cast<Instruction>(V);
20995 return OpI && L->contains(OpI);
20996 }))
20997 continue;
20998
20999 // We can hoist this instruction. Move it to the pre-header.
21000 I->moveBefore(PreHeader->getTerminator()->getIterator());
21001 CSEBlocks.insert(PreHeader);
21002 }
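// E.g., an insertelement/shuffle sequence that gathers loop-invariant scalars
// inside a loop body is moved to the preheader here; the preheader is also
// added to CSEBlocks, so the CSE scan below can merge it with equivalent
// sequences.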
21003
21004 // Make a list of all reachable blocks in our CSE queue.
21005 SmallVector<const DomTreeNode *, 8> CSEWorkList;
21006 CSEWorkList.reserve(CSEBlocks.size());
21007 for (BasicBlock *BB : CSEBlocks)
21008 if (DomTreeNode *N = DT->getNode(BB)) {
21009 assert(DT->isReachableFromEntry(N));
21010 CSEWorkList.push_back(N);
21011 }
21012
21013 // Sort blocks by domination. This ensures we visit a block after all blocks
21014 // dominating it are visited.
21015 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
21016 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21017 "Different nodes should have different DFS numbers");
21018 return A->getDFSNumIn() < B->getDFSNumIn();
21019 });
21020
21021 // Less defined shuffles can be replaced by the more defined copies.
21022 // Between two shuffles one is less defined if it has the same vector operands
21023 // and its mask indices are the same as in the first one or undefs. E.g.
21024 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
21025 // poison, <0, 0, 0, 0>.
21026 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
21027 Instruction *I2,
21028 SmallVectorImpl<int> &NewMask) {
21029 if (I1->getType() != I2->getType())
21030 return false;
21031 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
21032 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
21033 if (!SI1 || !SI2)
21034 return I1->isIdenticalTo(I2);
21035 if (SI1->isIdenticalTo(SI2))
21036 return true;
21037 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
21038 if (SI1->getOperand(I) != SI2->getOperand(I))
21039 return false;
21040 // Check if the second instruction is more defined than the first one.
21041 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
21042 ArrayRef<int> SM1 = SI1->getShuffleMask();
21043 // Count trailing undefs in the mask to check the final number of used
21044 // registers.
21045 unsigned LastUndefsCnt = 0;
21046 for (int I = 0, E = NewMask.size(); I < E; ++I) {
21047 if (SM1[I] == PoisonMaskElem)
21048 ++LastUndefsCnt;
21049 else
21050 LastUndefsCnt = 0;
21051 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
21052 NewMask[I] != SM1[I])
21053 return false;
21054 if (NewMask[I] == PoisonMaskElem)
21055 NewMask[I] = SM1[I];
21056 }
21057 // Check if the last undefs actually change the final number of used vector
21058 // registers.
21059 return SM1.size() - LastUndefsCnt > 1 &&
21060 ::getNumberOfParts(*TTI, SI1->getType()) ==
21061 ::getNumberOfParts(
21062 *TTI, getWidenedType(SI1->getType()->getElementType(),
21063 SM1.size() - LastUndefsCnt));
21064 };
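// On success NewMask holds the union of the two masks (poison lanes of the
// second mask filled from the first), which the caller installs on the
// surviving shuffle; the trailing-poison check above ensures that dropping
// those lanes would not have reduced the number of vector registers used.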
21065 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
21066 // instructions. TODO: We can further optimize this scan if we split the
21067 // instructions into different buckets based on the insert lane.
21068 SmallVector<Instruction *> Visited;
21069 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21070 assert(*I &&
21071 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
21072 "Worklist not sorted properly!");
21073 BasicBlock *BB = (*I)->getBlock();
21074 // For all instructions in blocks containing gather sequences:
21075 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
21076 if (isDeleted(&In))
21077 continue;
21078 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
21079 !GatherShuffleExtractSeq.contains(&In))
21080 continue;
21081
21082 // Check if we can replace this instruction with any of the
21083 // visited instructions.
21084 bool Replaced = false;
21085 for (Instruction *&V : Visited) {
21086 SmallVector<int> NewMask;
21087 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
21088 DT->dominates(V->getParent(), In.getParent())) {
21089 In.replaceAllUsesWith(V);
21090 eraseInstruction(&In);
21091 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
21092 if (!NewMask.empty())
21093 SI->setShuffleMask(NewMask);
21094 Replaced = true;
21095 break;
21096 }
21097 if (isa<ShuffleVectorInst>(&In) &&
21098 GatherShuffleExtractSeq.contains(V) &&
21099 IsIdenticalOrLessDefined(V, &In, NewMask) &&
21100 DT->dominates(In.getParent(), V->getParent())) {
21101 In.moveAfter(V);
21102 V->replaceAllUsesWith(&In);
21103 eraseInstruction(V);
21104 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
21105 if (!NewMask.empty())
21106 SI->setShuffleMask(NewMask);
21107 V = &In;
21108 Replaced = true;
21109 break;
21110 }
21111 }
21112 if (!Replaced) {
21113 assert(!is_contained(Visited, &In));
21114 Visited.push_back(&In);
21115 }
21116 }
21117 }
21118 CSEBlocks.clear();
21119 GatherShuffleExtractSeq.clear();
21120}
21121
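// Builds a ScheduleBundle for VL: copyable elements get a ScheduleCopyableData
// model, all other members reuse their per-instruction ScheduleData, and each
// non-copyable member is also registered in ScheduledBundles.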
21122BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
21123 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
21124 auto &BundlePtr =
21125 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
21126 for (Value *V : VL) {
21127 if (S.isNonSchedulable(V))
21128 continue;
21129 auto *I = cast<Instruction>(V);
21130 if (S.isCopyableElement(V)) {
21131 // Add a copyable element model.
21132 ScheduleCopyableData &SD =
21133 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
21134 // Group the instructions to a bundle.
21135 BundlePtr->add(&SD);
21136 continue;
21137 }
21138 ScheduleData *BundleMember = getScheduleData(V);
21139 assert(BundleMember && "no ScheduleData for bundle member "
21140 "(maybe not in same basic block)");
21141 // Group the instructions to a bundle.
21142 BundlePtr->add(BundleMember);
21143 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
21144 BundlePtr.get());
21145 }
21146 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
21147 return *BundlePtr;
21148}
21149
21150// Groups the instructions into a bundle (which is then a single scheduling entity)
21151// and schedules instructions until the bundle gets ready.
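// Returns nullptr when no scheduling is required (e.g. PHIs or vector-like
// instructions with constant operands, or when all members turn out to be
// non-schedulable), std::nullopt when bundling fails and scheduling is
// cancelled, and the built ScheduleBundle otherwise.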
21152std::optional<BoUpSLP::ScheduleBundle *>
21153BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
21154 const InstructionsState &S,
21155 const EdgeInfo &EI) {
21156 // No need to schedule PHIs, insertelement, extractelement and extractvalue
21157 // instructions.
21158 if (isa<PHINode>(S.getMainOp()) ||
21159 isVectorLikeInstWithConstOps(S.getMainOp()))
21160 return nullptr;
21161 // If the parent node is non-schedulable, the current node is copyable, and
21162 // any of the parent's instructions are used outside several basic blocks or in
21163 // a bin-op node, cancel scheduling - it may cause wrong def-use deps in the
21164 // analysis, leading to a crash.
21165 // Non-scheduled nodes may not have a related ScheduleData model, which may
21166 // lead to a skipped dep analysis.
21167 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21168 EI.UserTE->doesNotNeedToSchedule() &&
21169 EI.UserTE->getOpcode() != Instruction::PHI &&
21170 any_of(EI.UserTE->Scalars, [](Value *V) {
21171 auto *I = dyn_cast<Instruction>(V);
21172 if (!I || I->hasOneUser())
21173 return false;
21174 for (User *U : I->users()) {
21175 auto *UI = cast<Instruction>(U);
21176 if (isa<BinaryOperator>(UI))
21177 return true;
21178 }
21179 return false;
21180 }))
21181 return std::nullopt;
21182 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21183 EI.UserTE->hasCopyableElements() &&
21184 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21185 all_of(VL, [&](Value *V) {
21186 if (S.isCopyableElement(V))
21187 return true;
21188 return isUsedOutsideBlock(V);
21189 }))
21190 return std::nullopt;
21191 // If any instruction is used only outside the block and its operand is placed
21192 // immediately before it, do not schedule - it may create a wrong def-use chain.
21193 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
21194 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
21195 return false;
21196 if (isUsedOutsideBlock(V)) {
21197 for (Value *Op : cast<Instruction>(V)->operands()) {
21198 auto *I = dyn_cast<Instruction>(Op);
21199 if (!I)
21200 continue;
21201 return SLP->isVectorized(I) && I->getNextNode() == V;
21202 }
21203 }
21204 return false;
21205 }))
21206 return std::nullopt;
21207 if (S.areInstructionsWithCopyableElements() && EI) {
21208 bool IsNonSchedulableWithParentPhiNode =
21209 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
21210 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
21211 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21212 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21213 if (IsNonSchedulableWithParentPhiNode) {
21214 SmallSet<std::pair<Value *, Value *>, 4> Values;
21215 for (const auto [Idx, V] :
21216 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
21217 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
21218 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
21219 auto *I = dyn_cast<Instruction>(Op);
21220 if (!I || !isCommutative(I))
21221 continue;
21222 if (!Values.insert(std::make_pair(V, Op)).second)
21223 return std::nullopt;
21224 }
21225 }
21226 }
21227 bool HasCopyables = S.areInstructionsWithCopyableElements();
21228 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21229 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21230 // If all operands were replaced by copyables, the operands of this node
21231 // might not be, so we need to recalculate dependencies for the schedule data
21232 // that was replaced by copyable schedule data.
21233 SmallVector<ScheduleData *> ControlDependentMembers;
21234 for (Value *V : VL) {
21235 auto *I = dyn_cast<Instruction>(V);
21236 if (!I || (HasCopyables && S.isCopyableElement(V)))
21237 continue;
21238 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21239 for (const Use &U : I->operands()) {
21240 unsigned &NumOps =
21241 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21242 .first->getSecond();
21243 ++NumOps;
21244 if (auto *Op = dyn_cast<Instruction>(U.get());
21245 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21246 if (ScheduleData *OpSD = getScheduleData(Op);
21247 OpSD && OpSD->hasValidDependencies()) {
21248 OpSD->clearDirectDependencies();
21249 if (RegionHasStackSave ||
21251 ControlDependentMembers.push_back(OpSD);
21252 }
21253 }
21254 }
21255 }
21256 if (!ControlDependentMembers.empty()) {
21257 ScheduleBundle Invalid = ScheduleBundle::invalid();
21258 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
21259 ControlDependentMembers);
21260 }
21261 return nullptr;
21262 }
21263
21264 // Initialize the instruction bundle.
21265 Instruction *OldScheduleEnd = ScheduleEnd;
21266 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21267
21268 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21269 // Clear deps or recalculate the region, if the memory instruction is a
21270 // copyable. It may have memory deps, which must be recalculated.
21271 SmallVector<ScheduleData *> ControlDependentMembers;
21272 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21273 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21274 for (ScheduleEntity *SE : Bundle.getBundle()) {
21275 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21276 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21277 BundleMember && BundleMember->hasValidDependencies()) {
21278 BundleMember->clearDirectDependencies();
21279 if (RegionHasStackSave ||
21281 BundleMember->getInst()))
21282 ControlDependentMembers.push_back(BundleMember);
21283 }
21284 continue;
21285 }
21286 auto *SD = cast<ScheduleData>(SE);
21287 if (SD->hasValidDependencies() &&
21288 (!S.areInstructionsWithCopyableElements() ||
21289 !S.isCopyableElement(SD->getInst())) &&
21290 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21291 EI.UserTE->hasState() &&
21292 (!EI.UserTE->hasCopyableElements() ||
21293 !EI.UserTE->isCopyableElement(SD->getInst())))
21294 SD->clearDirectDependencies();
21295 for (const Use &U : SD->getInst()->operands()) {
21296 unsigned &NumOps =
21297 UserOpToNumOps
21298 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21299 .first->getSecond();
21300 ++NumOps;
21301 if (auto *Op = dyn_cast<Instruction>(U.get());
21302 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21303 *SLP, NumOps)) {
21304 if (ScheduleData *OpSD = getScheduleData(Op);
21305 OpSD && OpSD->hasValidDependencies()) {
21306 OpSD->clearDirectDependencies();
21307 if (RegionHasStackSave ||
21309 ControlDependentMembers.push_back(OpSD);
21310 }
21311 }
21312 }
21313 }
21314 };
21315 // The scheduling region got new instructions at the lower end (or it is a
21316 // new region for the first bundle). This makes it necessary to
21317 // recalculate all dependencies.
21318 // It is seldom that this needs to be done a second time after adding the
21319 // initial bundle to the region.
21320 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21321 for_each(ScheduleDataMap, [&](auto &P) {
21322 if (BB != P.first->getParent())
21323 return;
21324 ScheduleData *SD = P.second;
21325 if (isInSchedulingRegion(*SD))
21326 SD->clearDependencies();
21327 });
21328 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21329 for_each(P.second, [&](ScheduleCopyableData *SD) {
21330 if (isInSchedulingRegion(*SD))
21331 SD->clearDependencies();
21332 });
21333 });
21334 ReSchedule = true;
21335 }
21336 // Check if the bundle data already has deps for copyable elements. In this
21337 // case we need to reset the deps and recalculate them.
21338 if (Bundle && !Bundle.getBundle().empty()) {
21339 if (S.areInstructionsWithCopyableElements() ||
21340 !ScheduleCopyableDataMap.empty())
21341 CheckIfNeedToClearDeps(Bundle);
21342 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21343 << BB->getName() << "\n");
21344 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21345 ControlDependentMembers);
21346 } else if (!ControlDependentMembers.empty()) {
21347 ScheduleBundle Invalid = ScheduleBundle::invalid();
21348 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21349 ControlDependentMembers);
21350 }
21351
21352 if (ReSchedule) {
21353 resetSchedule();
21354 initialFillReadyList(ReadyInsts);
21355 }
21356
21357 // Now try to schedule the new bundle or (if no bundle) just calculate
21358 // dependencies. As soon as the bundle is "ready" it means that there are no
21359 // cyclic dependencies and we can schedule it. Note that it's important that we
21360 // don't "schedule" the bundle yet.
21361 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21362 !ReadyInsts.empty()) {
21363 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21364 assert(Picked->isReady() && "must be ready to schedule");
21365 schedule(*SLP, S, EI, Picked, ReadyInsts);
21366 if (Picked == &Bundle)
21367 break;
21368 }
21369 };
21370
21371 // Make sure that the scheduling region contains all
21372 // instructions of the bundle.
21373 for (Value *V : VL) {
21374 if (S.isNonSchedulable(V))
21375 continue;
21376 if (!extendSchedulingRegion(V, S)) {
21377 // The scheduling region got new instructions at the lower end (or it is a
21378 // new region for the first bundle), which makes it necessary to
21379 // recalculate all dependencies.
21380 // Otherwise the compiler may crash trying to incorrectly calculate
21381 // dependencies and emit instructions in the wrong order at the actual
21382 // scheduling.
21383 ScheduleBundle Invalid = ScheduleBundle::invalid();
21384 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21385 return std::nullopt;
21386 }
21387 }
21388
21389 bool ReSchedule = false;
21390 for (Value *V : VL) {
21391 if (S.isNonSchedulable(V))
21392 continue;
21393 SmallVector<ScheduleCopyableData *> CopyableData =
21394 getScheduleCopyableData(cast<Instruction>(V));
21395 if (!CopyableData.empty()) {
21396 for (ScheduleCopyableData *SD : CopyableData)
21397 ReadyInsts.remove(SD);
21398 }
21399 ScheduleData *BundleMember = getScheduleData(V);
21400 assert((BundleMember || S.isCopyableElement(V)) &&
21401 "no ScheduleData for bundle member (maybe not in same basic block)");
21402 if (!BundleMember)
21403 continue;
21404
21405 // Make sure we don't leave the pieces of the bundle in the ready list when
21406 // the whole bundle might not be ready.
21407 ReadyInsts.remove(BundleMember);
21408 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21409 !Bundles.empty()) {
21410 for (ScheduleBundle *B : Bundles)
21411 ReadyInsts.remove(B);
21412 }
21413
21414 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21415 continue;
21416 // A bundle member was scheduled as a single instruction before and now
21417 // needs to be scheduled as part of the bundle. We just get rid of the
21418 // existing schedule.
21419 // A bundle member may also have had its deps calculated before it became a
21420 // copyable element - we need to reschedule.
21421 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21422 << " was already scheduled\n");
21423 ReSchedule = true;
21424 }
21425
21426 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21427 TryScheduleBundleImpl(ReSchedule, Bundle);
21428 if (!Bundle.isReady()) {
21429 for (ScheduleEntity *BD : Bundle.getBundle()) {
21430 // Copyable data scheduling is just removed.
21431 if (isa<ScheduleCopyableData>(BD))
21432 continue;
21433 if (BD->isReady()) {
21434 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21435 if (Bundles.empty()) {
21436 ReadyInsts.insert(BD);
21437 continue;
21438 }
21439 for (ScheduleBundle *B : Bundles)
21440 if (B->isReady())
21441 ReadyInsts.insert(B);
21442 }
21443 }
21444 ScheduledBundlesList.pop_back();
21445 SmallVector<ScheduleData *> ControlDependentMembers;
21446 for (Value *V : VL) {
21447 if (S.isNonSchedulable(V))
21448 continue;
21449 auto *I = cast<Instruction>(V);
21450 if (S.isCopyableElement(I)) {
21451 // Remove the copyable data from the scheduling region and restore
21452 // previous mappings.
21453 auto KV = std::make_pair(EI, I);
21454 assert(ScheduleCopyableDataMap.contains(KV) &&
21455 "no ScheduleCopyableData for copyable element");
21456 ScheduleCopyableData *SD =
21457 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21458 ScheduleCopyableDataMapByUsers[I].remove(SD);
21459 if (EI.UserTE) {
21460 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21461 const auto *It = find(Op, I);
21462 assert(It != Op.end() && "Lane not set");
21463 SmallPtrSet<Instruction *, 4> Visited;
21464 do {
21465 int Lane = std::distance(Op.begin(), It);
21466 assert(Lane >= 0 && "Lane not set");
21467 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21468 !EI.UserTE->ReorderIndices.empty())
21469 Lane = EI.UserTE->ReorderIndices[Lane];
21470 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21471 "Couldn't find extract lane");
21472 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21473 if (!Visited.insert(In).second) {
21474 It = find(make_range(std::next(It), Op.end()), I);
21475 break;
21476 }
21477 ScheduleCopyableDataMapByInstUser
21478 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21479 .pop_back();
21480 It = find(make_range(std::next(It), Op.end()), I);
21481 } while (It != Op.end());
21482 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21483 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21484 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21485 }
21486 if (ScheduleCopyableDataMapByUsers[I].empty())
21487 ScheduleCopyableDataMapByUsers.erase(I);
21488 ScheduleCopyableDataMap.erase(KV);
21489 // Need to recalculate dependencies for the actual schedule data.
21490 if (ScheduleData *OpSD = getScheduleData(I);
21491 OpSD && OpSD->hasValidDependencies()) {
21492 OpSD->clearDirectDependencies();
21493 if (RegionHasStackSave ||
21495 ControlDependentMembers.push_back(OpSD);
21496 }
21497 continue;
21498 }
21499 ScheduledBundles.find(I)->getSecond().pop_back();
21500 }
21501 if (!ControlDependentMembers.empty()) {
21502 ScheduleBundle Invalid = ScheduleBundle::invalid();
21503 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21504 ControlDependentMembers);
21505 }
21506 return std::nullopt;
21507 }
21508 return &Bundle;
21509}
21510
21511BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21512 // Allocate a new ScheduleData for the instruction.
21513 if (ChunkPos >= ChunkSize) {
21514 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21515 ChunkPos = 0;
21516 }
21517 return &(ScheduleDataChunks.back()[ChunkPos++]);
21518}
21519
21520bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21521 Value *V, const InstructionsState &S) {
21522 Instruction *I = dyn_cast<Instruction>(V);
21523 assert(I && "bundle member must be an instruction");
21524 if (getScheduleData(I))
21525 return true;
21526 if (!ScheduleStart) {
21527 // It's the first instruction in the new region.
21528 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21529 ScheduleStart = I;
21530 ScheduleEnd = I->getNextNode();
21531 assert(ScheduleEnd && "tried to vectorize a terminator?");
21532 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21533 return true;
21534 }
21535 // Search up and down at the same time, because we don't know if the new
21536 // instruction is above or below the existing scheduling region.
21537 // Ignore debug info (and other "AssumeLike" intrinsics) so they're not counted
21538 // against the budget. Otherwise debug info could affect codegen.
21539 BasicBlock::reverse_iterator UpIter =
21540 ++ScheduleStart->getIterator().getReverse();
21541 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21542 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21543 BasicBlock::iterator LowerEnd = BB->end();
21544 auto IsAssumeLikeIntr = [](const Instruction &I) {
21545 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21546 return II->isAssumeLikeIntrinsic();
21547 return false;
21548 };
21549 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21550 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21551 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21552 &*DownIter != I) {
21553 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21554 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21555 return false;
21556 }
21557
21558 ++UpIter;
21559 ++DownIter;
21560
21561 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21562 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21563 }
21564 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21565 assert(I->getParent() == ScheduleStart->getParent() &&
21566 "Instruction is in wrong basic block.");
21567 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21568 ScheduleStart = I;
21569 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21570 << "\n");
21571 return true;
21572 }
21573 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21574 "Expected to reach top of the basic block or instruction down the "
21575 "lower end.");
21576 assert(I->getParent() == ScheduleEnd->getParent() &&
21577 "Instruction is in wrong basic block.");
21578 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21579 nullptr);
21580 ScheduleEnd = I->getNextNode();
21581 assert(ScheduleEnd && "tried to vectorize a terminator?");
21582 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21583 return true;
21584}
21585
21586void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21587 Instruction *ToI,
21588 ScheduleData *PrevLoadStore,
21589 ScheduleData *NextLoadStore) {
21590 ScheduleData *CurrentLoadStore = PrevLoadStore;
21591 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21592 // No need to allocate data for non-schedulable instructions.
21593 if (isa<PHINode>(I))
21594 continue;
21595 ScheduleData *SD = ScheduleDataMap.lookup(I);
21596 if (!SD) {
21597 SD = allocateScheduleDataChunks();
21598 ScheduleDataMap[I] = SD;
21599 }
21600 assert(!isInSchedulingRegion(*SD) &&
21601 "new ScheduleData already in scheduling region");
21602 SD->init(SchedulingRegionID, I);
21603
21604 auto CanIgnoreLoad = [](const Instruction *I) {
21605 const auto *LI = dyn_cast<LoadInst>(I);
21606 // If there is a simple load marked as invariant, we can ignore it.
21607 // But, in the (unlikely) case of non-simple invariant load,
21608 // we should not ignore it.
21609 return LI && LI->isSimple() &&
21610 LI->getMetadata(LLVMContext::MD_invariant_load);
21611 };
21612
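// Chain memory-accessing instructions (except simple invariant loads and a
// few side-effect-free intrinsics) through setNextLoadStore; this list is
// what calculateDependencies later walks to add memory dependencies.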
21613 if (I->mayReadOrWriteMemory() &&
21614 // Simple InvariantLoad does not depend on other memory accesses.
21615 !CanIgnoreLoad(I) &&
21616 (!isa<IntrinsicInst>(I) ||
21617 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21618 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21619 Intrinsic::pseudoprobe))) {
21620 // Update the linked list of memory accessing instructions.
21621 if (CurrentLoadStore) {
21622 CurrentLoadStore->setNextLoadStore(SD);
21623 } else {
21624 FirstLoadStoreInRegion = SD;
21625 }
21626 CurrentLoadStore = SD;
21627 }
21628
21629 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21630 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21631 RegionHasStackSave = true;
21632 }
21633 if (NextLoadStore) {
21634 if (CurrentLoadStore)
21635 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21636 } else {
21637 LastLoadStoreInRegion = CurrentLoadStore;
21638 }
21639}
21640
21641void BoUpSLP::BlockScheduling::calculateDependencies(
21642 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21643 ArrayRef<ScheduleData *> ControlDeps) {
21644 SmallVector<ScheduleEntity *> WorkList;
21645 auto ProcessNode = [&](ScheduleEntity *SE) {
21646 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21647 if (CD->hasValidDependencies())
21648 return;
21649 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21650 CD->initDependencies();
21651 CD->resetUnscheduledDeps();
21652 const EdgeInfo &EI = CD->getEdgeInfo();
21653 if (EI.UserTE) {
21654 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21655 const auto *It = find(Op, CD->getInst());
21656 assert(It != Op.end() && "Lane not set");
21657 SmallPtrSet<Instruction *, 4> Visited;
21658 do {
21659 int Lane = std::distance(Op.begin(), It);
21660 assert(Lane >= 0 && "Lane not set");
21661 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21662 !EI.UserTE->ReorderIndices.empty())
21663 Lane = EI.UserTE->ReorderIndices[Lane];
21664 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21665 "Couldn't find extract lane");
21666 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21667 if (EI.UserTE->isCopyableElement(In)) {
21668 // We may not have related copyable scheduling data if the
21669 // instruction is non-schedulable.
21670 if (ScheduleCopyableData *UseSD =
21671 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21672 CD->incDependencies();
21673 if (!UseSD->isScheduled())
21674 CD->incrementUnscheduledDeps(1);
21675 if (!UseSD->hasValidDependencies() ||
21676 (InsertInReadyList && UseSD->isReady()))
21677 WorkList.push_back(UseSD);
21678 }
21679 } else if (Visited.insert(In).second) {
21680 if (ScheduleData *UseSD = getScheduleData(In)) {
21681 CD->incDependencies();
21682 if (!UseSD->isScheduled())
21683 CD->incrementUnscheduledDeps(1);
21684 if (!UseSD->hasValidDependencies() ||
21685 (InsertInReadyList && UseSD->isReady()))
21686 WorkList.push_back(UseSD);
21687 }
21688 }
21689 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21690 } while (It != Op.end());
21691 if (CD->isReady() && CD->getDependencies() == 0 &&
21692 (EI.UserTE->hasState() &&
21693 (EI.UserTE->getMainOp()->getParent() !=
21694 CD->getInst()->getParent() ||
21695 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21696 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21697 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21698 auto *IU = dyn_cast<Instruction>(U);
21699 if (!IU)
21700 return true;
21701 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21702 })))))) {
21703 // If there are no uses in the block, mark the node as having a pseudo-use,
21704 // which cannot be scheduled.
21705 // This prevents incorrect def-use tracking between an external user and the
21706 // actual instruction.
21707 CD->incDependencies();
21708 CD->incrementUnscheduledDeps(1);
21709 }
21710 }
21711 return;
21712 }
21713 auto *BundleMember = cast<ScheduleData>(SE);
21714 if (BundleMember->hasValidDependencies())
21715 return;
21716 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21717 BundleMember->initDependencies();
21718 BundleMember->resetUnscheduledDeps();
21719 // Handle def-use chain dependencies.
21720 SmallDenseMap<Value *, unsigned> UserToNumOps;
21721 for (User *U : BundleMember->getInst()->users()) {
21722 if (isa<PHINode>(U))
21723 continue;
21724 if (ScheduleData *UseSD = getScheduleData(U)) {
21725 // The operand is a copyable element - skip.
21726 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21727 ++NumOps;
21728 if (areAllOperandsReplacedByCopyableData(
21729 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21730 continue;
21731 BundleMember->incDependencies();
21732 if (!UseSD->isScheduled())
21733 BundleMember->incrementUnscheduledDeps(1);
21734 if (!UseSD->hasValidDependencies() ||
21735 (InsertInReadyList && UseSD->isReady()))
21736 WorkList.push_back(UseSD);
21737 }
21738 }
21739 for (ScheduleCopyableData *UseSD :
21740 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21741 BundleMember->incDependencies();
21742 if (!UseSD->isScheduled())
21743 BundleMember->incrementUnscheduledDeps(1);
21744 if (!UseSD->hasValidDependencies() ||
21745 (InsertInReadyList && UseSD->isReady()))
21746 WorkList.push_back(UseSD);
21747 }
21748
21749 SmallPtrSet<const Instruction *, 4> Visited;
21750 auto MakeControlDependent = [&](Instruction *I) {
21751 // Do not mark control dependent twice.
21752 if (!Visited.insert(I).second)
21753 return;
21754 auto *DepDest = getScheduleData(I);
21755 assert(DepDest && "must be in schedule window");
21756 DepDest->addControlDependency(BundleMember);
21757 BundleMember->incDependencies();
21758 if (!DepDest->isScheduled())
21759 BundleMember->incrementUnscheduledDeps(1);
21760 if (!DepDest->hasValidDependencies() ||
21761 (InsertInReadyList && DepDest->isReady()))
21762 WorkList.push_back(DepDest);
21763 };
21764
21765 // Any instruction which isn't safe to speculate at the beginning of the
21766 // block is control dependent on any early exit or non-willreturn call
21767 // which precedes it.
21768 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21769 for (Instruction *I = BundleMember->getInst()->getNextNode();
21770 I != ScheduleEnd; I = I->getNextNode()) {
21771 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21772 continue;
21773
21774 // Add the dependency
21775 MakeControlDependent(I);
21776
21777 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21778 // Everything past here must be control dependent on I.
21779 break;
21780 }
21781 }
21782
21783 if (RegionHasStackSave) {
21784 // If we have an inalloca alloca instruction, it needs to be scheduled
21785 // after any preceding stacksave. We also need to prevent any alloca
21786 // from reordering above a preceding stackrestore.
21787 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21788 match(BundleMember->getInst(),
21789 m_Intrinsic<Intrinsic::stackrestore>())) {
21790 for (Instruction *I = BundleMember->getInst()->getNextNode();
21791 I != ScheduleEnd; I = I->getNextNode()) {
21792 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21793 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21794 // Any allocas past here must be control dependent on I, and I
21795 // must be memory dependent on BundleMember->Inst.
21796 break;
21797
21798 if (!isa<AllocaInst>(I))
21799 continue;
21800
21801 // Add the dependency
21802 MakeControlDependent(I);
21803 }
21804 }
21805
21806 // In addition to the cases handled just above, we need to prevent
21807 // allocas and loads/stores from moving below a stacksave or a
21808 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21809 // thought to be conservative. Moving loads/stores below a stackrestore
21810 // can lead to incorrect code.
21811 if (isa<AllocaInst>(BundleMember->getInst()) ||
21812 BundleMember->getInst()->mayReadOrWriteMemory()) {
21813 for (Instruction *I = BundleMember->getInst()->getNextNode();
21814 I != ScheduleEnd; I = I->getNextNode()) {
21815 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21816 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21817 continue;
21818
21819 // Add the dependency
21820 MakeControlDependent(I);
21821 break;
21822 }
21823 }
21824 }
21825
21826 // Handle the memory dependencies (if any).
21827 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21828 if (!NextLoadStore)
21829 return;
21830 Instruction *SrcInst = BundleMember->getInst();
21831 assert(SrcInst->mayReadOrWriteMemory() &&
21832 "NextLoadStore list for non memory effecting bundle?");
21833 MemoryLocation SrcLoc = getLocation(SrcInst);
21834 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21835 unsigned NumAliased = 0;
21836 unsigned DistToSrc = 1;
21837 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21838
21839 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21840 DepDest = DepDest->getNextLoadStore()) {
21841 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21842
21843 // We have two limits to reduce the complexity:
21844 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21845 // SLP->isAliased (which is the expensive part in this loop).
21846 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21847 // the whole loop (even if the loop is fast, it's quadratic).
21848 // It's important for the loop break condition (see below) to
21849 // check this limit even between two read-only instructions.
21850 if (DistToSrc >= MaxMemDepDistance ||
21851 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21852 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21853 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21854
21855 // We increment the counter only if the locations are aliased
21856 // (instead of counting all alias checks). This gives a better
21857 // balance between reduced runtime and accurate dependencies.
21858 NumAliased++;
21859
21860 DepDest->addMemoryDependency(BundleMember);
21861 BundleMember->incDependencies();
21862 if (!DepDest->isScheduled())
21863 BundleMember->incrementUnscheduledDeps(1);
21864 if (!DepDest->hasValidDependencies() ||
21865 (InsertInReadyList && DepDest->isReady()))
21866 WorkList.push_back(DepDest);
21867 }
21868
21869 // Example, explaining the loop break condition: Let's assume our
21870 // starting instruction is i0 and MaxMemDepDistance = 3.
21871 //
21872 // +--------v--v--v
21873 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21874 // +--------^--^--^
21875 //
21876 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21877 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21878 // Previously we already added dependencies from i3 to i6,i7,i8
21879 // (because of MaxMemDepDistance). As we added a dependency from
21880 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21881 // and we can abort this loop at i6.
21882 if (DistToSrc >= 2 * MaxMemDepDistance)
21883 break;
21884 DistToSrc++;
21885 }
21886 };
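// Seed the worklist with the new bundle (plus any explicitly provided
// control-dependent members) and iteratively process nodes; ProcessNode above
// records three kinds of edges: def-use (users and copyable users), control
// (non-speculatable instructions, stacksave/stackrestore and allocas), and
// memory (via the load/store chain, bounded by AliasedCheckLimit and
// MaxMemDepDistance).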
21887
21888 assert((Bundle || !ControlDeps.empty()) &&
21889 "expected at least one instruction to schedule");
21890 if (Bundle)
21891 WorkList.push_back(Bundle.getBundle().front());
21892 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21893 SmallPtrSet<ScheduleBundle *, 16> Visited;
21894 while (!WorkList.empty()) {
21895 ScheduleEntity *SD = WorkList.pop_back_val();
21896 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21897 ArrayRef<ScheduleBundle *> Bundles;
21898 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21899 CopyableBundle.push_back(&CD->getBundle());
21900 Bundles = CopyableBundle;
21901 } else {
21902 Bundles = getScheduleBundles(SD->getInst());
21903 }
21904 if (Bundles.empty()) {
21905 if (!SD->hasValidDependencies())
21906 ProcessNode(SD);
21907 if (InsertInReadyList && SD->isReady()) {
21908 ReadyInsts.insert(SD);
21909 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21910 }
21911 continue;
21912 }
21913 for (ScheduleBundle *Bundle : Bundles) {
21914 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21915 continue;
21916 assert(isInSchedulingRegion(*Bundle) &&
21917 "ScheduleData not in scheduling region");
21918 for_each(Bundle->getBundle(), ProcessNode);
21919 }
21920 if (InsertInReadyList && SD->isReady()) {
21921 for (ScheduleBundle *Bundle : Bundles) {
21922 assert(isInSchedulingRegion(*Bundle) &&
21923 "ScheduleData not in scheduling region");
21924 if (!Bundle->isReady())
21925 continue;
21926 ReadyInsts.insert(Bundle);
21927 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21928 << "\n");
21929 }
21930 }
21931 }
21932}
21933
21934void BoUpSLP::BlockScheduling::resetSchedule() {
21935 assert(ScheduleStart &&
21936 "tried to reset schedule on block which has not been scheduled");
21937 for_each(ScheduleDataMap, [&](auto &P) {
21938 if (BB != P.first->getParent())
21939 return;
21940 ScheduleData *SD = P.second;
21941 if (isInSchedulingRegion(*SD)) {
21942 SD->setScheduled(/*Scheduled=*/false);
21943 SD->resetUnscheduledDeps();
21944 }
21945 });
21946 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21947 for_each(P.second, [&](ScheduleCopyableData *SD) {
21948 if (isInSchedulingRegion(*SD)) {
21949 SD->setScheduled(/*Scheduled=*/false);
21950 SD->resetUnscheduledDeps();
21951 }
21952 });
21953 });
21954 for_each(ScheduledBundles, [&](auto &P) {
21955 for_each(P.second, [&](ScheduleBundle *Bundle) {
21956 if (isInSchedulingRegion(*Bundle))
21957 Bundle->setScheduled(/*Scheduled=*/false);
21958 });
21959 });
21960 // Reset schedule data for copyable elements.
21961 for (auto &P : ScheduleCopyableDataMap) {
21962 if (isInSchedulingRegion(*P.second)) {
21963 P.second->setScheduled(/*Scheduled=*/false);
21964 P.second->resetUnscheduledDeps();
21965 }
21966 }
21967 ReadyInsts.clear();
21968}
21969
21970void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21971 if (!BS->ScheduleStart)
21972 return;
21973
21974 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21975
21976 // A key point - if we got here, pre-scheduling was able to find a valid
21977 // scheduling of the sub-graph of the scheduling window which consists
21978 // of all vector bundles and their transitive users. As such, we do not
21979 // need to reschedule anything *outside of* that subgraph.
21980
21981 BS->resetSchedule();
21982
21983 // For the real scheduling we use a more sophisticated ready-list: it is
21984 // sorted by the original instruction location. This lets the final schedule
21985 // be as close as possible to the original instruction order.
21986 // WARNING: If changing this order causes a correctness issue, that means
21987 // there is some missing dependence edge in the schedule data graph.
21988 struct ScheduleDataCompare {
21989 bool operator()(const ScheduleEntity *SD1,
21990 const ScheduleEntity *SD2) const {
21991 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21992 }
21993 };
21994 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21995
21996 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21997 // and fill the ready-list with initial instructions.
21998 int Idx = 0;
21999 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22000 I = I->getNextNode()) {
22001 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22002 if (!Bundles.empty()) {
22003 for (ScheduleBundle *Bundle : Bundles) {
22004 Bundle->setSchedulingPriority(Idx++);
22005 if (!Bundle->hasValidDependencies())
22006 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
22007 }
22008 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
22009 for (ScheduleCopyableData *SD : reverse(SDs)) {
22010 ScheduleBundle &Bundle = SD->getBundle();
22011 Bundle.setSchedulingPriority(Idx++);
22012 if (!Bundle.hasValidDependencies())
22013 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22014 }
22015 continue;
22016 }
22017 SmallVector<ScheduleCopyableData *> CopyableData =
22018 BS->getScheduleCopyableDataUsers(I);
22019 if (ScheduleData *SD = BS->getScheduleData(I)) {
22020 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
22021 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
22022 SDTEs.front()->doesNotNeedToSchedule() ||
22024 "scheduler and vectorizer bundle mismatch");
22025 SD->setSchedulingPriority(Idx++);
22026 if (!SD->hasValidDependencies() &&
22027 (!CopyableData.empty() ||
22028 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
22029 assert(TE->isGather() && "expected gather node");
22030 return TE->hasState() && TE->hasCopyableElements() &&
22031 TE->isCopyableElement(I);
22032 }))) {
22033 // Need to calculate deps for these nodes to correctly handle copyable
22034 // dependencies, even if they were cancelled.
22035 // If a copyables bundle was cancelled, the deps were cleared and need to
22036 // be recalculated.
22037 ScheduleBundle Bundle;
22038 Bundle.add(SD);
22039 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22040 }
22041 }
22042 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
22043 ScheduleBundle &Bundle = SD->getBundle();
22044 Bundle.setSchedulingPriority(Idx++);
22045 if (!Bundle.hasValidDependencies())
22046 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22047 }
22048 }
22049 BS->initialFillReadyList(ReadyInsts);
22050
22051 Instruction *LastScheduledInst = BS->ScheduleEnd;
22052
22053 // Do the "real" scheduling.
22054 SmallPtrSet<Instruction *, 16> Scheduled;
22055 while (!ReadyInsts.empty()) {
22056 auto *Picked = *ReadyInsts.begin();
22057 ReadyInsts.erase(ReadyInsts.begin());
22058
22059 // Move the scheduled instruction(s) to their dedicated places, if not
22060 // there yet.
22061 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
22062 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
22063 Instruction *PickedInst = BundleMember->getInst();
22064 // If a copyable must be scheduled as part of something else, skip it.
22065 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
22066 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
22067 (!IsCopyable && !Scheduled.insert(PickedInst).second))
22068 continue;
22069 if (PickedInst->getNextNode() != LastScheduledInst)
22070 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22071 LastScheduledInst = PickedInst;
22072 }
22073 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
22074 LastScheduledInst);
22075 } else {
22076 auto *SD = cast<ScheduleData>(Picked);
22077 Instruction *PickedInst = SD->getInst();
22078 if (PickedInst->getNextNode() != LastScheduledInst)
22079 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22080 LastScheduledInst = PickedInst;
22081 }
22082 auto Invalid = InstructionsState::invalid();
22083 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
22084 }
22085
22086 // Check that we didn't break any of our invariants.
22087#ifdef EXPENSIVE_CHECKS
22088 BS->verify();
22089#endif
22090
22091#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
22092 // Check that all schedulable entities got scheduled
22093 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22094 I = I->getNextNode()) {
22095 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22096 assert(all_of(Bundles,
22097 [](const ScheduleBundle *Bundle) {
22098 return Bundle->isScheduled();
22099 }) &&
22100 "must be scheduled at this point");
22101 }
22102#endif
22103
22104 // Avoid duplicate scheduling of the block.
22105 BS->ScheduleStart = nullptr;
22106}
22107
22108unsigned BoUpSLP::getVectorElementSize(Value *V) {
22109 // If V is a store, just return the width of the stored value (or value
22110 // truncated just before storing) without traversing the expression tree.
22111 // This is the common case.
22112 if (auto *Store = dyn_cast<StoreInst>(V))
22113 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
22114
22115 if (auto *IEI = dyn_cast<InsertElementInst>(V))
22116 return getVectorElementSize(IEI->getOperand(1));
22117
22118 auto E = InstrElementSize.find(V);
22119 if (E != InstrElementSize.end())
22120 return E->second;
22121
22122 // If V is not a store, we can traverse the expression tree to find loads
22123 // that feed it. The type of the loaded value may indicate a more suitable
22124 // width than V's type. We want to base the vector element size on the width
22125 // of memory operations where possible.
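// For instance, for an i32 add fed (through casts) only by i16 loads, the
// walk below settles on 16 rather than 32; if no load is found, the width of
// V itself (or, for an i1 root, of the first non-boolean value seen) is used.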
22126 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
22127 SmallPtrSet<Instruction *, 16> Visited;
22128 if (auto *I = dyn_cast<Instruction>(V)) {
22129 Worklist.emplace_back(I, I->getParent(), 0);
22130 Visited.insert(I);
22131 }
22132
22133 // Traverse the expression tree in bottom-up order looking for loads. If we
22134 // encounter an instruction we don't yet handle, we give up.
22135 auto Width = 0u;
22136 Value *FirstNonBool = nullptr;
22137 while (!Worklist.empty()) {
22138 auto [I, Parent, Level] = Worklist.pop_back_val();
22139
22140 // We should only be looking at scalar instructions here. If the current
22141 // instruction has a vector type, skip.
22142 auto *Ty = I->getType();
22143 if (isa<VectorType>(Ty))
22144 continue;
22145 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
22146 FirstNonBool = I;
22147 if (Level > RecursionMaxDepth)
22148 continue;
22149
22150 // If the current instruction is a load, update MaxWidth to reflect the
22151 // width of the loaded value.
22152 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
22153 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
22154
22155 // Otherwise, we need to visit the operands of the instruction. We only
22156 // handle the interesting cases from buildTree here. If an operand is an
22157 // instruction we haven't yet visited and from the same basic block as the
22158 // user or the use is a PHI node, we add it to the worklist.
22159 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
22160 BinaryOperator, UnaryOperator>(I)) {
22161 for (Use &U : I->operands()) {
22162 if (auto *J = dyn_cast<Instruction>(U.get()))
22163 if (Visited.insert(J).second &&
22164 (isa<PHINode>(I) || J->getParent() == Parent)) {
22165 Worklist.emplace_back(J, J->getParent(), Level + 1);
22166 continue;
22167 }
22168 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
22169 FirstNonBool = U.get();
22170 }
22171 } else {
22172 break;
22173 }
22174 }
22175
22176 // If we didn't encounter a memory access in the expression tree, or if we
22177 // gave up for some reason, just return the width of V. Otherwise, return the
22178 // maximum width we found.
22179 if (!Width) {
22180 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
22181 V = FirstNonBool;
22182 Width = DL->getTypeSizeInBits(V->getType());
22183 }
22184
22185 for (Instruction *I : Visited)
22186 InstrElementSize[I] = Width;
22187
22188 return Width;
22189}
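A quick illustration of how the element size computed above is consumed later (hypothetical helper name; the register and element widths are example values): the per-element bit width bounds how many lanes fit in a vector register, mirroring the MaxVecRegSize / EltSize calculation in vectorizeStores below.

// Editorial sketch; not part of SLPVectorizer.cpp.
#include <bit>

static unsigned maxLanesForRegister(unsigned MaxVecRegSizeBits,
                                    unsigned EltSizeBits) {
  // 128-bit register, 32-bit elements -> at most 4 lanes, rounded down to a
  // power of two.
  return std::bit_floor(MaxVecRegSizeBits / EltSizeBits);
}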
22190
22191bool BoUpSLP::collectValuesToDemote(
22192 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
22193 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
22194 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
22195 bool &IsProfitableToDemote, bool IsTruncRoot) const {
22196 // We can always demote constants.
22197 if (all_of(E.Scalars, IsaPred<Constant>))
22198 return true;
22199
22200 unsigned OrigBitWidth =
22201 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
22202 if (OrigBitWidth == BitWidth) {
22203 MaxDepthLevel = 1;
22204 return true;
22205 }
22206
22207 // Check if the node was analyzed already and must keep its original bitwidth.
22208 if (NodesToKeepBWs.contains(E.Idx))
22209 return false;
22210
22211 // If the value is not a vectorized instruction in the expression and not used
22212 // by the insertelement instruction and not used in multiple vector nodes, it
22213 // cannot be demoted.
22214 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
22215 if (isa<PoisonValue>(R))
22216 return false;
22217 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22218 });
22219 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
22220 if (isa<PoisonValue>(V))
22221 return true;
22222 if (getTreeEntries(V).size() > 1)
22223 return false;
22224 // For the last shuffle of sext/zext with many uses, we need to check the
22225 // extra bit for unsigned values; otherwise we may end up with incorrect
22226 // casting for the reused scalars.
22227 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
22228 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22229 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22230 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22231 return true;
22232 }
22233 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22234 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22235 if (IsSignedNode)
22236 ++BitWidth1;
22237 if (auto *I = dyn_cast<Instruction>(V)) {
22238 APInt Mask = DB->getDemandedBits(I);
22239 unsigned BitWidth2 =
22240 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22241 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22242 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22243 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22244 break;
22245 BitWidth2 *= 2;
22246 }
22247 BitWidth1 = std::min(BitWidth1, BitWidth2);
22248 }
22249 BitWidth = std::max(BitWidth, BitWidth1);
22250 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22251 };
22252 auto FinalAnalysis = [&, TTI = TTI]() {
22253 if (!IsProfitableToDemote)
22254 return false;
22255 bool Res = all_of(
22256 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22257 // Demote gathers.
22258 if (Res && E.isGather()) {
22259 if (E.hasState()) {
22260 if (const TreeEntry *SameTE =
22261 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22262 SameTE)
22263 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22264 ToDemote, Visited, NodesToKeepBWs,
22265 MaxDepthLevel, IsProfitableToDemote,
22266 IsTruncRoot)) {
22267 ToDemote.push_back(E.Idx);
22268 return true;
22269 }
22270 }
22271 // Check possible extractelement instructions bases and final vector
22272 // length.
22273 SmallPtrSet<Value *, 4> UniqueBases;
22274 for (Value *V : E.Scalars) {
22275 auto *EE = dyn_cast<ExtractElementInst>(V);
22276 if (!EE)
22277 continue;
22278 UniqueBases.insert(EE->getVectorOperand());
22279 }
22280 const unsigned VF = E.Scalars.size();
22281 Type *OrigScalarTy = E.Scalars.front()->getType();
22282 if (UniqueBases.size() <= 2 ||
22283 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22284 ::getNumberOfParts(
22285 *TTI,
22286 getWidenedType(
22287 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22288 VF))) {
22289 ToDemote.push_back(E.Idx);
22290 return true;
22291 }
22292 }
22293 return Res;
22294 };
22295 if (E.isGather() || !Visited.insert(&E).second ||
22296 any_of(E.Scalars, [&](Value *V) {
22297 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22298 return isa<InsertElementInst>(U) && !isVectorized(U);
22299 });
22300 }))
22301 return FinalAnalysis();
22302
22303 if (any_of(E.Scalars, [&](Value *V) {
22304 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22305 return isVectorized(U) ||
22306 (E.Idx == 0 && UserIgnoreList &&
22307 UserIgnoreList->contains(U)) ||
22308 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22309 !U->getType()->isScalableTy() &&
22310 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22311 }) && !IsPotentiallyTruncated(V, BitWidth);
22312 }))
22313 return false;
22314
22315 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22316 bool &NeedToExit) {
22317 NeedToExit = false;
22318 unsigned InitLevel = MaxDepthLevel;
22319 for (const TreeEntry *Op : Operands) {
22320 unsigned Level = InitLevel;
22321 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22322 ToDemote, Visited, NodesToKeepBWs, Level,
22323 IsProfitableToDemote, IsTruncRoot)) {
22324 if (!IsProfitableToDemote)
22325 return false;
22326 NeedToExit = true;
22327 if (!FinalAnalysis())
22328 return false;
22329 continue;
22330 }
22331 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22332 }
22333 return true;
22334 };
22335 auto AttemptCheckBitwidth =
22336 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22337 // Try all bitwidth < OrigBitWidth.
22338 NeedToExit = false;
22339 unsigned BestFailBitwidth = 0;
22340 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22341 if (Checker(BitWidth, OrigBitWidth))
22342 return true;
22343 if (BestFailBitwidth == 0 && FinalAnalysis())
22344 BestFailBitwidth = BitWidth;
22345 }
22346 if (BitWidth >= OrigBitWidth) {
22347 if (BestFailBitwidth == 0) {
22348 BitWidth = OrigBitWidth;
22349 return false;
22350 }
22351 MaxDepthLevel = 1;
22352 BitWidth = BestFailBitwidth;
22353 NeedToExit = true;
22354 return true;
22355 }
22356 return false;
22357 };
22358 auto TryProcessInstruction =
22359 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22360 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22361 if (Operands.empty()) {
22362 if (!IsTruncRoot)
22363 MaxDepthLevel = 1;
22364 for (Value *V : E.Scalars)
22365 (void)IsPotentiallyTruncated(V, BitWidth);
22366 } else {
22367 // Several vectorized uses? Check if we can truncate it, otherwise -
22368 // exit.
22369 if (any_of(E.Scalars, [&](Value *V) {
22370 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22371 }))
22372 return false;
22373 bool NeedToExit = false;
22374 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22375 return false;
22376 if (NeedToExit)
22377 return true;
22378 if (!ProcessOperands(Operands, NeedToExit))
22379 return false;
22380 if (NeedToExit)
22381 return true;
22382 }
22383
22384 ++MaxDepthLevel;
22385 // Record the entry that we can demote.
22386 ToDemote.push_back(E.Idx);
22387 return IsProfitableToDemote;
22388 };
22389
22390 if (E.State == TreeEntry::SplitVectorize)
22391 return TryProcessInstruction(
22392 BitWidth,
22393 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22394 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22395
22396 if (E.isAltShuffle()) {
22397 // Combining these opcodes may lead to incorrect analysis, skip for now.
22398 auto IsDangerousOpcode = [](unsigned Opcode) {
22399 switch (Opcode) {
22400 case Instruction::Shl:
22401 case Instruction::AShr:
22402 case Instruction::LShr:
22403 case Instruction::UDiv:
22404 case Instruction::SDiv:
22405 case Instruction::URem:
22406 case Instruction::SRem:
22407 return true;
22408 default:
22409 break;
22410 }
22411 return false;
22412 };
22413 if (IsDangerousOpcode(E.getAltOpcode()))
22414 return FinalAnalysis();
22415 }
22416
22417 switch (E.getOpcode()) {
22418
22419 // We can always demote truncations and extensions. Since truncations can
22420 // seed additional demotion, we save the truncated value.
22421 case Instruction::Trunc:
22422 if (IsProfitableToDemoteRoot)
22423 IsProfitableToDemote = true;
22424 return TryProcessInstruction(BitWidth);
22425 case Instruction::ZExt:
22426 case Instruction::SExt:
22427 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22428 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22429 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22430 return false;
22431 IsProfitableToDemote = true;
22432 return TryProcessInstruction(BitWidth);
22433
22434 // We can demote certain binary operations if we can demote both of their
22435 // operands.
22436 case Instruction::Add:
22437 case Instruction::Sub:
22438 case Instruction::Mul:
22439 case Instruction::And:
22440 case Instruction::Or:
22441 case Instruction::Xor: {
22442 return TryProcessInstruction(
22443 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22444 }
22445 case Instruction::Freeze:
22446 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22447 case Instruction::Shl: {
22448 // If we are truncating the result of this SHL, and if it's a shift of an
22449 // in-range amount, we can always perform a SHL in a smaller type.
22450 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22451 return all_of(E.Scalars, [&](Value *V) {
22452 if (isa<PoisonValue>(V))
22453 return true;
22454 if (E.isCopyableElement(V))
22455 return true;
22456 auto *I = cast<Instruction>(V);
22457 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22458 return AmtKnownBits.getMaxValue().ult(BitWidth);
22459 });
22460 };
22461 return TryProcessInstruction(
22462 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22463 }
22464 case Instruction::LShr: {
22465 // If this is a truncate of a logical shr, we can truncate it to a smaller
22466 // lshr iff we know that the bits we would otherwise be shifting in are
22467 // already zeros.
22468 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22469 return all_of(E.Scalars, [&](Value *V) {
22470 if (isa<PoisonValue>(V))
22471 return true;
22472 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22473 if (E.isCopyableElement(V))
22474 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22475 auto *I = cast<Instruction>(V);
22476 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22477 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22478 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22479 SimplifyQuery(*DL));
22480 });
22481 };
22482 return TryProcessInstruction(
22483 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22484 LShrChecker);
22485 }
22486 case Instruction::AShr: {
22487 // If this is a truncate of an arithmetic shr, we can truncate it to a
22488 // smaller ashr iff we know that all the bits from the sign bit of the
22489 // original type and the sign bit of the truncate type are similar.
22490 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22491 return all_of(E.Scalars, [&](Value *V) {
22492 if (isa<PoisonValue>(V))
22493 return true;
22494 auto *I = cast<Instruction>(V);
22495 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22496 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22497 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22498 ShiftedBits <
22499 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22500 });
22501 };
22502 return TryProcessInstruction(
22503 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22504 AShrChecker);
22505 }
22506 case Instruction::UDiv:
22507 case Instruction::URem: {
22508 // UDiv and URem can be truncated if all the truncated bits are zero.
22509 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22510 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22511 return all_of(E.Scalars, [&](Value *V) {
22512 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22513 if (E.hasCopyableElements() && E.isCopyableElement(V))
22514 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22515 auto *I = cast<Instruction>(V);
22516 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22517 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22518 });
22519 };
22520 return TryProcessInstruction(
22521 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22522 }
22523
22524 // We can demote selects if we can demote their true and false values.
22525 case Instruction::Select: {
22526 return TryProcessInstruction(
22527 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22528 }
22529
22530 // We can demote phis if we can demote all their incoming operands.
22531 case Instruction::PHI: {
22532 const unsigned NumOps = E.getNumOperands();
22533 SmallVector<const TreeEntry *> Ops(NumOps);
22534 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22535 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22536
22537 return TryProcessInstruction(BitWidth, Ops);
22538 }
22539
22540 case Instruction::Call: {
22541 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22542 if (!IC)
22543 break;
22544 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22545 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22546 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22547 break;
22548 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22549 function_ref<bool(unsigned, unsigned)> CallChecker;
22550 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22551 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22552 return all_of(E.Scalars, [&](Value *V) {
22553 auto *I = cast<Instruction>(V);
22554 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22555 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22556 return MaskedValueIsZero(I->getOperand(0), Mask,
22557 SimplifyQuery(*DL)) &&
22558 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22559 }
22560 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22561 "Expected min/max intrinsics only.");
22562 unsigned SignBits = OrigBitWidth - BitWidth;
22563 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22564 unsigned Op0SignBits =
22565 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22566 unsigned Op1SignBits =
22567 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22568 return SignBits <= Op0SignBits &&
22569 ((SignBits != Op0SignBits &&
22570 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22571 MaskedValueIsZero(I->getOperand(0), Mask,
22572 SimplifyQuery(*DL))) &&
22573 SignBits <= Op1SignBits &&
22574 ((SignBits != Op1SignBits &&
22575 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22576 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22577 });
22578 };
22579 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22580 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22581 return all_of(E.Scalars, [&](Value *V) {
22582 auto *I = cast<Instruction>(V);
22583 unsigned SignBits = OrigBitWidth - BitWidth;
22584 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22585 unsigned Op0SignBits =
22586 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22587 return SignBits <= Op0SignBits &&
22588 ((SignBits != Op0SignBits &&
22589 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22590 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22591 });
22592 };
22593 if (ID != Intrinsic::abs) {
22594 Operands.push_back(getOperandEntry(&E, 1));
22595 CallChecker = CompChecker;
22596 } else {
22597 CallChecker = AbsChecker;
22598 }
22599 InstructionCost BestCost =
22600 std::numeric_limits<InstructionCost::CostType>::max();
22601 unsigned BestBitWidth = BitWidth;
22602 unsigned VF = E.Scalars.size();
22603 // Choose the best bitwidth based on cost estimations.
22604 auto Checker = [&](unsigned BitWidth, unsigned) {
22605 unsigned MinBW = PowerOf2Ceil(BitWidth);
22606 SmallVector<Type *> ArgTys =
22607 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22608 auto VecCallCosts = getVectorCallCosts(
22609 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22610 TTI, TLI, ArgTys);
22611 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22612 if (Cost < BestCost) {
22613 BestCost = Cost;
22614 BestBitWidth = BitWidth;
22615 }
22616 return false;
22617 };
22618 [[maybe_unused]] bool NeedToExit;
22619 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22620 BitWidth = BestBitWidth;
22621 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22622 }
22623
22624 // Otherwise, conservatively give up.
22625 default:
22626 break;
22627 }
22628 MaxDepthLevel = 1;
22629 return FinalAnalysis();
22630}
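The per-value bound used by IsPotentiallyTruncated above combines a sign-bit estimate with a demanded-bits estimate; the following is a minimal standalone sketch of that arithmetic (the function name, parameters, and example values are illustrative, not part of the pass).

// Editorial sketch of the BitWidth1/BitWidth2 combination used above.
#include <algorithm>

static unsigned candidateBitWidth(unsigned OrigBitWidth, unsigned NumSignBits,
                                  bool MayBeNegative,
                                  unsigned DemandedLeadingZeros) {
  // Bits that actually carry information, plus one extra bit to preserve the
  // sign when the value may be negative.
  unsigned FromSignBits = OrigBitWidth - NumSignBits + (MayBeNegative ? 1 : 0);
  // Bits that any user actually demands; leading zero bits of the demanded
  // mask can be dropped.
  unsigned FromDemandedBits = std::max(1u, OrigBitWidth - DemandedLeadingZeros);
  return std::min(FromSignBits, FromDemandedBits);
}

// Example: a 32-bit value with 25 known sign bits that may be negative, whose
// users demand only the low 8 bits: candidateBitWidth(32, 25, true, 24) == 8,
// so demoting it to an 8-bit type loses no information.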
22631
22632static RecurKind getRdxKind(Value *V);
22633
22634void BoUpSLP::computeMinimumValueSizes() {
22635 // We only attempt to truncate integer expressions.
22636 bool IsStoreOrInsertElt =
22637 VectorizableTree.front()->hasState() &&
22638 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22639 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22640 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22641 ExtraBitWidthNodes.size() <= 1 &&
22642 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22643 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22644 return;
22645
22646 unsigned NodeIdx = 0;
22647 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22648 NodeIdx = 1;
22649
22650 // Ensure the roots of the vectorizable tree don't form a cycle.
22651 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22652 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22653 "Unexpected tree is graph.");
22654
22655 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
22656 // resize to the final type.
22657 bool IsTruncRoot = false;
22658 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22659 SmallVector<unsigned> RootDemotes;
22660 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22661 if (NodeIdx != 0 &&
22662 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22663 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22664 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22665 IsTruncRoot = true;
22666 RootDemotes.push_back(NodeIdx);
22667 IsProfitableToDemoteRoot = true;
22668 ++NodeIdx;
22669 }
22670
22671 // The reduction has already been analyzed and found not profitable - exit.
22672 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22673 return;
22674
22675 SmallVector<unsigned> ToDemote;
22676 auto ComputeMaxBitWidth =
22677 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22678 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22679 ToDemote.clear();
22680 // Check if the root is trunc and the next node is gather/buildvector, then
22681 // keep trunc in scalars, which is free in most cases.
22682 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22683 !NodesToKeepBWs.contains(E.Idx) &&
22684 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22685 all_of(E.Scalars, [&](Value *V) {
22686 return V->hasOneUse() || isa<Constant>(V) ||
22687 (!V->hasNUsesOrMore(UsesLimit) &&
22688 none_of(V->users(), [&](User *U) {
22689 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22690 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22691 if (TEs.empty() || is_contained(TEs, UserTE))
22692 return false;
22693 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22694 SelectInst>(U) ||
22695 isa<SIToFPInst, UIToFPInst>(U) ||
22696 (UserTE->hasState() &&
22697 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22698 SelectInst>(UserTE->getMainOp()) ||
22699 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22700 return true;
22701 unsigned UserTESz = DL->getTypeSizeInBits(
22702 UserTE->Scalars.front()->getType());
22703 if (all_of(TEs, [&](const TreeEntry *TE) {
22704 auto It = MinBWs.find(TE);
22705 return It != MinBWs.end() &&
22706 It->second.first > UserTESz;
22707 }))
22708 return true;
22709 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22710 }));
22711 })) {
22712 ToDemote.push_back(E.Idx);
22713 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22714 auto It = MinBWs.find(UserTE);
22715 if (It != MinBWs.end())
22716 return It->second.first;
22717 unsigned MaxBitWidth =
22718 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22719 MaxBitWidth = bit_ceil(MaxBitWidth);
22720 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22721 MaxBitWidth = 8;
22722 return MaxBitWidth;
22723 }
22724
22725 if (!E.hasState())
22726 return 0u;
22727
22728 unsigned VF = E.getVectorFactor();
22729 Type *ScalarTy = E.Scalars.front()->getType();
22730 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22731 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22732 if (!TreeRootIT)
22733 return 0u;
22734
22735 if (any_of(E.Scalars,
22736 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22737 return 0u;
22738
22739 unsigned NumParts = ::getNumberOfParts(
22740 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22741
22742 // The maximum bit width required to represent all the values that can be
22743 // demoted without loss of precision. It would be safe to truncate the roots
22744 // of the expression to this width.
22745 unsigned MaxBitWidth = 1u;
22746
22747 // True if the roots can be zero-extended back to their original type,
22748 // rather than sign-extended. We know that if the leading bits are not
22749 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22750 // True.
22751 // Determine if the sign bit of all the roots is known to be zero. If not,
22752 // IsKnownPositive is set to False.
22753 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22754 if (isa<PoisonValue>(R))
22755 return true;
22756 KnownBits Known = computeKnownBits(R, *DL);
22757 return Known.isNonNegative();
22758 });
22759
22760 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22761 E.UserTreeIndex.UserTE->hasState() &&
22762 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22763 MaxBitWidth =
22764 std::min(DL->getTypeSizeInBits(
22765 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22766 DL->getTypeSizeInBits(ScalarTy));
22767
22768 // We first check if all the bits of the roots are demanded. If they're not,
22769 // we can truncate the roots to this narrower type.
22770 for (Value *Root : E.Scalars) {
22771 if (isa<PoisonValue>(Root))
22772 continue;
22773 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22774 TypeSize NumTypeBits =
22775 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22776 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22777 // If we can't prove that the sign bit is zero, we must add one to the
22778 // maximum bit width to account for the unknown sign bit. This preserves
22779 // the existing sign bit so we can safely sign-extend the root back to the
22780 // original type. Otherwise, if we know the sign bit is zero, we will
22781 // zero-extend the root instead.
22782 //
22783 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22784 // one to the maximum bit width will yield a larger-than-necessary
22785 // type. In general, we need to add an extra bit only if we can't
22786 // prove that the upper bit of the original type is equal to the
22787 // upper bit of the proposed smaller type. If these two bits are
22788 // the same (either zero or one) we know that sign-extending from
22789 // the smaller type will result in the same value. Here, since we
22790 // can't yet prove this, we are just making the proposed smaller
22791 // type larger to ensure correctness.
22792 if (!IsKnownPositive)
22793 ++BitWidth1;
22794
22795 auto *I = dyn_cast<Instruction>(Root);
22796 if (!I) {
22797 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22798 continue;
22799 }
22800 APInt Mask = DB->getDemandedBits(I);
22801 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22802 MaxBitWidth =
22803 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22804 }
22805
22806 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22807 MaxBitWidth = 8;
22808
22809 // If the original type is large but the reduced type does not improve
22810 // register usage - ignore it.
22811 if (NumParts > 1 &&
22812 NumParts ==
22813 ::getNumberOfParts(
22814 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22815 bit_ceil(MaxBitWidth)),
22816 VF)))
22817 return 0u;
22818
22819 unsigned Opcode = E.getOpcode();
22820 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22821 Opcode == Instruction::SExt ||
22822 Opcode == Instruction::ZExt || NumParts > 1;
22823 // Conservatively determine if we can actually truncate the roots of the
22824 // expression. Collect the values that can be demoted in ToDemote and
22825 // additional roots that require investigating in Roots.
22826 DenseSet<const TreeEntry *> Visited;
22827 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22828 bool NeedToDemote = IsProfitableToDemote;
22829
22830 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22831 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22832 NeedToDemote, IsTruncRoot) ||
22833 (MaxDepthLevel <= Limit &&
22834 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22835 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22836 DL->getTypeSizeInBits(TreeRootIT) /
22837 DL->getTypeSizeInBits(
22838 E.getMainOp()->getOperand(0)->getType()) >
22839 2)))))
22840 return 0u;
22841 // Round MaxBitWidth up to the next power-of-two.
22842 MaxBitWidth = bit_ceil(MaxBitWidth);
22843
22844 return MaxBitWidth;
22845 };
22846
22847 // If we can truncate the root, we must collect additional values that might
22848 // be demoted as a result. That is, those seeded by truncations we will
22849 // modify.
22850 // Add reduction ops sizes, if any.
22851 if (UserIgnoreList &&
22852 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22853 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22854 // x i1> to iN)).
22855 if (all_of(*UserIgnoreList,
22856 [](Value *V) {
22857 return isa<PoisonValue>(V) ||
22858 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22859 }) &&
22860 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22861 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22862 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22863 Builder.getInt1Ty()) {
22864 ReductionBitWidth = 1;
22865 } else {
22866 for (Value *V : *UserIgnoreList) {
22867 if (isa<PoisonValue>(V))
22868 continue;
22869 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22870 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22871 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22872 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22873 ++BitWidth1;
22874 unsigned BitWidth2 = BitWidth1;
22876 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22877 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22878 }
22879 ReductionBitWidth =
22880 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22881 }
22882 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22883 ReductionBitWidth = 8;
22884
22885 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22886 }
22887 }
22888 bool IsTopRoot = NodeIdx == 0;
22889 while (NodeIdx < VectorizableTree.size() &&
22890 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22891 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22892 RootDemotes.push_back(NodeIdx);
22893 ++NodeIdx;
22894 IsTruncRoot = true;
22895 }
22896 bool IsSignedCmp = false;
22897 if (UserIgnoreList &&
22898 all_of(*UserIgnoreList,
22900 m_SMax(m_Value(), m_Value())))))
22901 IsSignedCmp = true;
22902 while (NodeIdx < VectorizableTree.size()) {
22903 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22904 unsigned Limit = 2;
22905 if (IsTopRoot &&
22906 ReductionBitWidth ==
22907 DL->getTypeSizeInBits(
22908 VectorizableTree.front()->Scalars.front()->getType()))
22909 Limit = 3;
22910 unsigned MaxBitWidth = ComputeMaxBitWidth(
22911 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22912 IsTruncRoot, IsSignedCmp);
22913 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22914 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22915 ReductionBitWidth = bit_ceil(MaxBitWidth);
22916 else if (MaxBitWidth == 0)
22917 ReductionBitWidth = 0;
22918 }
22919
22920 for (unsigned Idx : RootDemotes) {
22921 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22922 uint32_t OrigBitWidth =
22923 DL->getTypeSizeInBits(V->getType()->getScalarType());
22924 if (OrigBitWidth > MaxBitWidth) {
22925 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22926 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22927 }
22928 return false;
22929 }))
22930 ToDemote.push_back(Idx);
22931 }
22932 RootDemotes.clear();
22933 IsTopRoot = false;
22934 IsProfitableToDemoteRoot = true;
22935
22936 if (ExtraBitWidthNodes.empty()) {
22937 NodeIdx = VectorizableTree.size();
22938 } else {
22939 unsigned NewIdx = 0;
22940 do {
22941 NewIdx = *ExtraBitWidthNodes.begin();
22942 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22943 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22944 NodeIdx = NewIdx;
22945 IsTruncRoot =
22946 NodeIdx < VectorizableTree.size() &&
22947 VectorizableTree[NodeIdx]->UserTreeIndex &&
22948 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22949 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22950 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22951 Instruction::Trunc &&
22952 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22953 IsSignedCmp =
22954 NodeIdx < VectorizableTree.size() &&
22955 VectorizableTree[NodeIdx]->UserTreeIndex &&
22956 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22957 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22958 Instruction::ICmp &&
22959 any_of(
22960 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22961 [&](Value *V) {
22962 auto *IC = dyn_cast<ICmpInst>(V);
22963 return IC && (IC->isSigned() ||
22964 !isKnownNonNegative(IC->getOperand(0),
22965 SimplifyQuery(*DL)) ||
22966 !isKnownNonNegative(IC->getOperand(1),
22967 SimplifyQuery(*DL)));
22968 });
22969 }
22970
22971 // If the maximum bit width we compute is less than the width of the roots'
22972 // type, we can proceed with the narrowing. Otherwise, do nothing.
22973 if (MaxBitWidth == 0 ||
22974 MaxBitWidth >=
22975 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22976 ->getBitWidth()) {
22977 if (UserIgnoreList)
22978 AnalyzedMinBWVals.insert_range(TreeRoot);
22979 NodesToKeepBWs.insert_range(ToDemote);
22980 continue;
22981 }
22982
22983 // Finally, map the values we can demote to the maximum bit width we
22984 // computed.
22985 for (unsigned Idx : ToDemote) {
22986 TreeEntry *TE = VectorizableTree[Idx].get();
22987 if (MinBWs.contains(TE))
22988 continue;
22989 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22990 if (isa<PoisonValue>(R))
22991 return false;
22992 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22993 });
22994 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22995 }
22996 }
22997}
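A minimal sketch of the final rounding applied before a width is recorded in MinBWs or ReductionBitWidth above (the helper name is illustrative): widths between 2 and 7 bits are bumped to a byte, and the result is rounded up to a power of two.

// Editorial sketch; mirrors the clamping and bit_ceil rounding used above.
#include <bit>

static unsigned roundMinBitWidth(unsigned MaxBitWidth) {
  if (MaxBitWidth > 1 && MaxBitWidth < 8)
    MaxBitWidth = 8;                 // avoid sub-byte integer types
  return std::bit_ceil(MaxBitWidth); // e.g. 5 -> 8, 12 -> 16, 1 -> 1
}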
22998
22999PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
23000 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
23001 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
23002 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
23003 auto *AA = &AM.getResult<AAManager>(F);
23004 auto *LI = &AM.getResult<LoopAnalysis>(F);
23005 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
23006 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
23007 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
23008 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
23009
23010 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
23011 if (!Changed)
23012 return PreservedAnalyses::all();
23013
23014 PreservedAnalyses PA;
23015 PA.preserveSet<CFGAnalyses>();
23016 return PA;
23017}
23018
23019bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
23020 TargetTransformInfo *TTI_,
23021 TargetLibraryInfo *TLI_, AAResults *AA_,
23022 LoopInfo *LI_, DominatorTree *DT_,
23023 AssumptionCache *AC_, DemandedBits *DB_,
23024 OptimizationRemarkEmitter *ORE_) {
23025 if (!RunSLPVectorization)
23026 return false;
23027 SE = SE_;
23028 TTI = TTI_;
23029 TLI = TLI_;
23030 AA = AA_;
23031 LI = LI_;
23032 DT = DT_;
23033 AC = AC_;
23034 DB = DB_;
23035 DL = &F.getDataLayout();
23036
23037 Stores.clear();
23038 GEPs.clear();
23039 bool Changed = false;
23040
23041 // If the target claims to have no vector registers don't attempt
23042 // vectorization.
23043 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
23044 LLVM_DEBUG(
23045 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
23046 return false;
23047 }
23048
23049 // Don't vectorize when the attribute NoImplicitFloat is used.
23050 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
23051 return false;
23052
23053 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
23054
23055 // Use the bottom up slp vectorizer to construct chains that start with
23056 // store instructions.
23057 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
23058
23059 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
23060 // delete instructions.
23061
23062 // Update DFS numbers now so that we can use them for ordering.
23063 DT->updateDFSNumbers();
23064
23065 // Scan the blocks in the function in post order.
23066 for (auto *BB : post_order(&F.getEntryBlock())) {
23067 if (!DT->isReachableFromEntry(BB))
23068 continue;
23069
23070 // Start new block - clear the list of reduction roots.
23071 R.clearReductionData();
23072 collectSeedInstructions(BB);
23073
23074 // Vectorize trees that end at stores.
23075 if (!Stores.empty()) {
23076 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
23077 << " underlying objects.\n");
23078 Changed |= vectorizeStoreChains(R);
23079 }
23080
23081 // Vectorize trees that end at reductions.
23082 Changed |= vectorizeChainsInBlock(BB, R);
23083
23084 // Vectorize the index computations of getelementptr instructions. This
23085 // is primarily intended to catch gather-like idioms ending at
23086 // non-consecutive loads.
23087 if (!GEPs.empty()) {
23088 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
23089 << " underlying objects.\n");
23090 Changed |= vectorizeGEPIndices(BB, R);
23091 }
23092 }
23093
23094 if (Changed) {
23095 R.optimizeGatherSequence();
23096 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
23097 }
23098 return Changed;
23099}
23100
23101std::optional<bool>
23102SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
23103 unsigned Idx, unsigned MinVF,
23104 unsigned &Size) {
23105 Size = 0;
23106 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
23107 << "\n");
23108 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23109 unsigned VF = Chain.size();
23110
23111 if (!has_single_bit(Sz) ||
23112 !hasFullVectorsOrPowerOf2(
23113 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
23114 VF) ||
23115 VF < 2 || VF < MinVF) {
23116 // Check if vectorizing with a non-power-of-2 VF should be considered. At
23117 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
23118 // all vector lanes are used.
23119 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
23120 return false;
23121 }
23122
23123 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
23124 << "\n");
23125
23126 SetVector<Value *> ValOps;
23127 for (Value *V : Chain)
23128 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
23129 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
23130 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
23131 InstructionsState S = Analysis.buildInstructionsState(
23132 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
23133 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
23134 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
23135 bool IsAllowedSize =
23136 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
23137 ValOps.size()) ||
23138 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
23139 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23140 (!S.getMainOp()->isSafeToRemove() ||
23141 any_of(ValOps.getArrayRef(),
23142 [&](Value *V) {
23143 return !isa<ExtractElementInst>(V) &&
23144 (V->getNumUses() > Chain.size() ||
23145 any_of(V->users(), [&](User *U) {
23146 return !Stores.contains(U);
23147 }));
23148 }))) ||
23149 (ValOps.size() > Chain.size() / 2 && !S)) {
23150 Size = (!IsAllowedSize && S) ? 1 : 2;
23151 return false;
23152 }
23153 }
23154 if (R.isLoadCombineCandidate(Chain))
23155 return true;
23156 R.buildTree(Chain);
23157 // Check if the tree is tiny and the store itself or its stored value is not vectorized.
23158 if (R.isTreeTinyAndNotFullyVectorizable()) {
23159 if (R.isGathered(Chain.front()) ||
23160 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
23161 return std::nullopt;
23162 Size = R.getCanonicalGraphSize();
23163 return false;
23164 }
23165 if (R.isProfitableToReorder()) {
23166 R.reorderTopToBottom();
23167 R.reorderBottomToTop();
23168 }
23169 R.transformNodes();
23170 R.buildExternalUses();
23171
23172 R.computeMinimumValueSizes();
23173
23174 Size = R.getCanonicalGraphSize();
23175 if (S && S.getOpcode() == Instruction::Load)
23176 Size = 2; // cut off masked gather small trees
23177 InstructionCost Cost = R.getTreeCost();
23178
23179 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
23180 if (Cost < -SLPCostThreshold) {
23181 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
23182
23183 using namespace ore;
23184
23185 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
23186 cast<StoreInst>(Chain[0]))
23187 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
23188 << " and with tree size "
23189 << NV("TreeSize", R.getTreeSize()));
23190
23191 R.vectorizeTree();
23192 return true;
23193 }
23194
23195 return false;
23196}
23197
23198/// Checks if the quadratic mean deviation is less than about 10% of the mean size.
23199static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
23200 bool First) {
23201 unsigned Num = 0;
23202 uint64_t Sum = std::accumulate(
23203 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23204 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23205 unsigned Size = First ? Val.first : Val.second;
23206 if (Size == 1)
23207 return V;
23208 ++Num;
23209 return V + Size;
23210 });
23211 if (Num == 0)
23212 return true;
23213 uint64_t Mean = Sum / Num;
23214 if (Mean == 0)
23215 return true;
23216 uint64_t Dev = std::accumulate(
23217 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23218 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23219 unsigned P = First ? Val.first : Val.second;
23220 if (P == 1)
23221 return V;
23222 return V + (P - Mean) * (P - Mean);
23223 }) /
23224 Num;
23225 return Dev * 96 / (Mean * Mean) == 0;
23226}
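The same spread test written out on plain integers as a self-contained sketch (the helper name and sample values are illustrative): a set of tree sizes passes only when the variance is tiny relative to the squared mean, i.e. the candidate slices built trees of roughly equal size; entries equal to 1 are ignored.

// Editorial sketch; uses the same 96x threshold as checkTreeSizes above.
#include <cstdint>
#include <vector>

static bool nearlyUniform(const std::vector<uint64_t> &Sizes) {
  uint64_t Sum = 0, Num = 0;
  for (uint64_t S : Sizes)
    if (S != 1) {
      Sum += S;
      ++Num;
    }
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (uint64_t S : Sizes)
    if (S != 1) {
      uint64_t D = S > Mean ? S - Mean : Mean - S;
      Dev += D * D;
    }
  Dev /= Num;
  return Dev * 96 / (Mean * Mean) == 0;
}

// nearlyUniform({10, 10, 11, 1}) is true; nearlyUniform({10, 30}) is false.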
23227
23228namespace {
23229
23230/// A group of stores that we'll try to bundle together using vector ops.
23231/// They are ordered using the signed distance of their address operand to the
23232/// address of this group's BaseInstr.
23233class RelatedStoreInsts {
23234public:
23235 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23236 : AllStores(AllStores) {
23237 reset(BaseInstrIdx);
23238 }
23239
23240 void reset(unsigned NewBaseInstr) {
23241 assert(NewBaseInstr < AllStores.size() &&
23242 "Instruction index out of bounds");
23243 BaseInstrIdx = NewBaseInstr;
23244 Instrs.clear();
23245 insertOrLookup(NewBaseInstr, 0);
23246 }
23247
23248 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23249 /// \p PtrDist.
23250 /// Does nothing if there is already a store with that \p PtrDist.
23251 /// \returns The previously associated Instruction index, or std::nullopt
23252 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23253 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23254 return Inserted ? std::nullopt : std::make_optional(It->second);
23255 }
23256
23257 using DistToInstMap = std::map<int64_t, unsigned>;
23258 const DistToInstMap &getStores() const { return Instrs; }
23259
23260 /// If \p SI is related to this group of stores, return the distance of its
23261 /// pointer operand to the one the group's BaseInstr.
23262 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23263 ScalarEvolution &SE) const {
23264 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23265 return getPointersDiff(
23266 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23267 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23268 /*StrictCheck=*/true);
23269 }
23270
23271 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23272 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23273 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23274 int64_t DistFromCurBase) {
23275 DistToInstMap PrevSet = std::move(Instrs);
23276 reset(NewBaseInstIdx);
23277
23278 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23279 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23280 // reference.
23281 for (auto [Dist, InstIdx] : PrevSet) {
23282 if (InstIdx >= MinSafeIdx)
23283 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23284 }
23285 }
23286
23287 /// Remove all stores that have been vectorized from this group.
23288 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23289 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23290 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23291 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23292 });
23293
23294 // Get a forward iterator pointing after the last vectorized store and erase
23295 // all stores before it so we don't try to vectorize them again.
23296 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23297 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23298 }
23299
23300private:
23301 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23302 unsigned BaseInstrIdx;
23303
23304 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23305 DistToInstMap Instrs;
23306
23307 /// Reference to all the stores in the BB being analyzed.
23308 ArrayRef<StoreInst *> AllStores;
23309};
23310
23311} // end anonymous namespace
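The duplicate-distance detection that drives the grouping below reduces to std::map::emplace, which refuses to overwrite an existing key and reports the previous entry instead; a minimal standalone sketch (the free-function name is illustrative).

// Editorial sketch of RelatedStoreInsts::insertOrLookup on a bare std::map.
#include <cstdint>
#include <map>
#include <optional>

static std::optional<unsigned>
insertOrLookupSketch(std::map<int64_t, unsigned> &Instrs, unsigned InstrIdx,
                     int64_t PtrDist) {
  auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
  return Inserted ? std::nullopt : std::make_optional(It->second);
}

// A second store at an already-seen distance returns the index of the first
// one, which is what triggers TryToVectorize on the collected sequence below.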
23312
23313bool SLPVectorizerPass::vectorizeStores(
23314 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23315 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23316 &Visited) {
23317 // We may run into multiple chains that merge into a single chain. We mark the
23318 // stores that we vectorized so that we don't visit the same store twice.
23319 BoUpSLP::ValueSet VectorizedStores;
23320 bool Changed = false;
23321
23322 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23323 int64_t PrevDist = -1;
23324 BoUpSLP::ValueList Operands;
23325 // Collect the chain into a list.
23326 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23327 auto &[Dist, InstIdx] = Data;
23328 if (Operands.empty() || Dist - PrevDist == 1) {
23329 Operands.push_back(Stores[InstIdx]);
23330 PrevDist = Dist;
23331 if (Idx != StoreSeq.size() - 1)
23332 continue;
23333 }
23334 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
23335 Operands.clear();
23336 Operands.push_back(Stores[InstIdx]);
23337 PrevDist = Dist;
23338 });
23339
23340 if (Operands.size() <= 1 ||
23341 !Visited
23342 .insert({Operands.front(),
23343 cast<StoreInst>(Operands.front())->getValueOperand(),
23344 Operands.back(),
23345 cast<StoreInst>(Operands.back())->getValueOperand(),
23346 Operands.size()})
23347 .second)
23348 continue;
23349
23350 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23351 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23352 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23353
23354 unsigned MaxVF =
23355 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23356 auto *Store = cast<StoreInst>(Operands[0]);
23357 Type *StoreTy = Store->getValueOperand()->getType();
23358 Type *ValueTy = StoreTy;
23359 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23360 ValueTy = Trunc->getSrcTy();
23361 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23362 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23363 // we need to use the element type of StoreTy and ValueTy to retrieve the
23364 // VF and then transform it back.
23365 // Remember: VF is defined as the number we want to vectorize, not the
23366 // number of elements in the final vector.
23367 Type *StoreScalarTy = StoreTy->getScalarType();
23368 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23369 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23370 ValueTy->getScalarType()));
23371 MinVF /= getNumElements(StoreTy);
23372 MinVF = std::max<unsigned>(2, MinVF);
23373
23374 if (MaxVF < MinVF) {
23375 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23376 << ") < "
23377 << "MinVF (" << MinVF << ")\n");
23378 continue;
23379 }
23380
23381 unsigned NonPowerOf2VF = 0;
23382 if (VectorizeNonPowerOf2) {
23383 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23384 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23385 // lanes are used.
23386 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23387 if (has_single_bit(CandVF + 1)) {
23388 NonPowerOf2VF = CandVF;
23389 assert(NonPowerOf2VF != MaxVF &&
23390 "Non-power-of-2 VF should not be equal to MaxVF");
23391 }
23392 }
23393
23394 // MaxRegVF represents the number of instructions (scalar, or vector in
23395 // case of revec) that can be vectorized to naturally fit in a vector
23396 // register.
23397 unsigned MaxRegVF = MaxVF;
23398
23399 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23400 if (MaxVF < MinVF) {
23401 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23402 << ") < "
23403 << "MinVF (" << MinVF << ")\n");
23404 continue;
23405 }
23406
23407 SmallVector<unsigned> CandidateVFs;
23408 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23409 VF = divideCeil(VF, 2))
23410 CandidateVFs.push_back(VF);
23411
23412 unsigned End = Operands.size();
23413 unsigned Repeat = 0;
23414 constexpr unsigned MaxAttempts = 4;
23415 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23416 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23417 P.first = P.second = 1;
23418 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23419 auto IsNotVectorized = [](bool First,
23420 const std::pair<unsigned, unsigned> &P) {
23421 return First ? P.first > 0 : P.second > 0;
23422 };
23423 auto IsVectorized = [](bool First,
23424 const std::pair<unsigned, unsigned> &P) {
23425 return First ? P.first == 0 : P.second == 0;
23426 };
23427 auto VFIsProfitable = [](bool First, unsigned Size,
23428 const std::pair<unsigned, unsigned> &P) {
23429 return First ? Size >= P.first : Size >= P.second;
23430 };
23431 auto FirstSizeSame = [](unsigned Size,
23432 const std::pair<unsigned, unsigned> &P) {
23433 return Size == P.first;
23434 };
23435 while (true) {
23436 ++Repeat;
23437 bool RepeatChanged = false;
23438 bool AnyProfitableGraph = false;
23439 for (unsigned VF : CandidateVFs) {
23440 AnyProfitableGraph = false;
23441 unsigned FirstUnvecStore =
23442 std::distance(RangeSizes.begin(),
23443 find_if(RangeSizes, std::bind(IsNotVectorized,
23444 VF >= MaxRegVF, _1)));
23445
23446 // Form slices of size VF starting from FirstUnvecStore and try to
23447 // vectorize them.
23448 while (FirstUnvecStore < End) {
23449 unsigned FirstVecStore = std::distance(
23450 RangeSizes.begin(),
23451 find_if(RangeSizes.drop_front(FirstUnvecStore),
23452 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23453 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23454 for (unsigned SliceStartIdx = FirstUnvecStore;
23455 SliceStartIdx + VF <= MaxSliceEnd;) {
23456 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23457 VF >= MaxRegVF)) {
23458 ++SliceStartIdx;
23459 continue;
23460 }
23461 ArrayRef<Value *> Slice =
23462 ArrayRef(Operands).slice(SliceStartIdx, VF);
23463 assert(all_of(Slice,
23464 [&](Value *V) {
23465 return cast<StoreInst>(V)
23466 ->getValueOperand()
23467 ->getType() ==
23468 cast<StoreInst>(Slice.front())
23469 ->getValueOperand()
23470 ->getType();
23471 }) &&
23472 "Expected all operands of same type.");
23473 if (!NonSchedulable.empty()) {
23474 auto [NonSchedSizeMax, NonSchedSizeMin] =
23475 NonSchedulable.lookup(Slice.front());
23476 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23477 // VF is too ambitious. Try to vectorize another slice before
23478 // trying a smaller VF.
23479 SliceStartIdx += NonSchedSizeMax;
23480 continue;
23481 }
23482 }
23483 unsigned TreeSize;
23484 std::optional<bool> Res =
23485 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23486 if (!Res) {
23487 // Update the range of non schedulable VFs for slices starting
23488 // at SliceStartIdx.
23489 NonSchedulable
23490 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23491 .first->getSecond()
23492 .second = VF;
23493 } else if (*Res) {
23494 // Mark the vectorized stores so that we don't vectorize them
23495 // again.
23496 VectorizedStores.insert_range(Slice);
23497 // Record that this attempt succeeded and changed the function.
23499 AnyProfitableGraph = RepeatChanged = Changed = true;
23500 // If we vectorized initial block, no need to try to vectorize
23501 // it again.
23502 for (std::pair<unsigned, unsigned> &P :
23503 RangeSizes.slice(SliceStartIdx, VF))
23504 P.first = P.second = 0;
23505 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23506 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23507 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23508 P.first = P.second = 0;
23509 FirstUnvecStore = SliceStartIdx + VF;
23510 }
23511 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23512 for (std::pair<unsigned, unsigned> &P :
23513 RangeSizes.slice(SliceStartIdx + VF,
23514 MaxSliceEnd - (SliceStartIdx + VF)))
23515 P.first = P.second = 0;
23516 if (MaxSliceEnd == End)
23517 End = SliceStartIdx;
23518 MaxSliceEnd = SliceStartIdx;
23519 }
23520 SliceStartIdx += VF;
23521 continue;
23522 }
23523 if (VF > 2 && Res &&
23524 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23525 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23526 _1))) {
23527 SliceStartIdx += VF;
23528 continue;
23529 }
23530 // For very big VFs, check that we are not just rebuilding the same
23531 // trees with a larger number of elements.
23532 if (VF > MaxRegVF && TreeSize > 1 &&
23533 all_of(RangeSizes.slice(SliceStartIdx, VF),
23534 std::bind(FirstSizeSame, TreeSize, _1))) {
23535 SliceStartIdx += VF;
23536 while (SliceStartIdx != MaxSliceEnd &&
23537 RangeSizes[SliceStartIdx].first == TreeSize)
23538 ++SliceStartIdx;
23539 continue;
23540 }
23541 if (TreeSize > 1) {
23542 for (std::pair<unsigned, unsigned> &P :
23543 RangeSizes.slice(SliceStartIdx, VF)) {
23544 if (VF >= MaxRegVF)
23545 P.second = std::max(P.second, TreeSize);
23546 else
23547 P.first = std::max(P.first, TreeSize);
23548 }
23549 }
23550 ++SliceStartIdx;
23551 AnyProfitableGraph = true;
23552 }
23553 if (FirstUnvecStore >= End)
23554 break;
23555 if (MaxSliceEnd - FirstUnvecStore < VF &&
23556 MaxSliceEnd - FirstUnvecStore >= MinVF)
23557 AnyProfitableGraph = true;
23558 FirstUnvecStore = std::distance(
23559 RangeSizes.begin(),
23560 find_if(RangeSizes.drop_front(MaxSliceEnd),
23561 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23562 }
23563 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23564 break;
23565 }
23566 // All values vectorized - exit.
23567 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23568 return P.first == 0 && P.second == 0;
23569 }))
23570 break;
23571 // Check if we have tried all attempts, or if the last attempts are not needed at all.
23572 if (Repeat >= MaxAttempts ||
23573 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23574 break;
23575 constexpr unsigned StoresLimit = 64;
23576 const unsigned MaxTotalNum = std::min<unsigned>(
23577 Operands.size(),
23578 static_cast<unsigned>(
23579 End -
23580 std::distance(
23581 RangeSizes.begin(),
23582 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23583 1));
23584 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23585 unsigned Limit =
23586 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23587 CandidateVFs.clear();
23588 if (bit_floor(Limit) == VF)
23589 CandidateVFs.push_back(Limit);
23590 if (VF > MaxTotalNum || VF >= StoresLimit)
23591 break;
23592 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23593 if (P.first != 0)
23594 P.first = std::max(P.second, P.first);
23595 }
23596 // Last attempt to vectorize max number of elements, if all previous
23597 // attempts were unsuccessful because of the cost issues.
23598 CandidateVFs.push_back(VF);
23599 }
23600 }
23601 };
23602
23603 /// Groups of stores to vectorize
23604 SmallVector<RelatedStoreInsts> SortedStores;
23605
23606 // Inserts the specified store SI with the given index Idx into the set of
23607 // stores. If a store with the same distance is already present, stop the
23608 // insertion and try to vectorize the stores collected so far. Any stores from
23609 // this sequence that were not vectorized are retried later together with the
23610 // new store, but only the stores that come before the previous store with the
23611 // same distance are retried.
23612 // Example:
23613 // 1. store x, %p
23614 // 2. store y, %p+1
23615 // 3. store z, %p+2
23616 // 4. store a, %p
23617 // 5. store b, %p+3
23618 // - Scan this from the last to first store. The very first bunch of stores is
23619 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23620 // vector).
23621 // - The next store in the list - #1 - has the same distance from store #5 as
23622 // the store #4.
23623 // - Try to vectorize sequence of stores 4,2,3,5.
23624 // - If all these stores are vectorized - just drop them.
23625 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23626 // - Start new stores sequence.
23627 // The new bunch of stores is {1, {1, 0}}.
23628 // - Add the stores from previous sequence, that were not vectorized.
23629 // Here we consider the stores in reverse order, rather than the order in which
23630 // they appear in the IR (Stores are already reversed, see vectorizeStoreChains()).
23631 // Store #3 can be added -> comes after store #4 with the same distance as
23632 // store #1.
23633 // Store #5 cannot be added - comes before store #4.
23634 // This logic improves compile time: we assume that the stores coming after a
23635 // previous store with the same distance most likely have memory dependencies,
23636 // so there is no need to waste compile time trying to vectorize them.
23637 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23638 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23639 std::optional<int64_t> PtrDist;
23640 auto *RelatedStores = find_if(
23641 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23642 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23643 return PtrDist.has_value();
23644 });
23645
23646 // We did not find a comparable store, start a new group.
23647 if (RelatedStores == SortedStores.end()) {
23648 SortedStores.emplace_back(Idx, Stores);
23649 return;
23650 }
23651
23652 // If there is already a store in the group with the same PtrDiff, try to
23653 // vectorize the existing instructions before adding the current store.
23654 // Otherwise, insert this store and keep collecting.
23655 if (std::optional<unsigned> PrevInst =
23656 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23657 TryToVectorize(RelatedStores->getStores());
23658 RelatedStores->clearVectorizedStores(VectorizedStores);
23659 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23660 /*NewBaseInstIdx=*/Idx,
23661 /*DistFromCurBase=*/*PtrDist);
23662 }
23663 };
23664 Type *PrevValTy = nullptr;
23665 for (auto [I, SI] : enumerate(Stores)) {
23666 if (R.isDeleted(SI))
23667 continue;
23668 if (!PrevValTy)
23669 PrevValTy = SI->getValueOperand()->getType();
23670 // Check that we do not try to vectorize stores of different types.
23671 if (PrevValTy != SI->getValueOperand()->getType()) {
23672 for (RelatedStoreInsts &StoreSeq : SortedStores)
23673 TryToVectorize(StoreSeq.getStores());
23674 SortedStores.clear();
23675 PrevValTy = SI->getValueOperand()->getType();
23676 }
23677 FillStoresSet(I, SI);
23678 }
23679
23680 // Final vectorization attempt.
23681 for (RelatedStoreInsts &StoreSeq : SortedStores)
23682 TryToVectorize(StoreSeq.getStores());
23683
23684 return Changed;
23685}
23686
23687void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23688 // Initialize the collections. We will make a single pass over the block.
23689 Stores.clear();
23690 GEPs.clear();
23691
23692 // Visit the store and getelementptr instructions in BB and organize them in
23693 // Stores and GEPs according to the underlying objects of their pointer
23694 // operands.
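// For illustration (a minimal sketch, not from an actual test case): given
//   %p1 = getelementptr inbounds i32, ptr %base, i64 1
//   store i32 %a, ptr %base
//   store i32 %b, ptr %p1
// both stores have %base as their underlying object, so they land in the same
// Stores bucket and form a candidate chain for vectorizeStoreChains().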
23695 for (Instruction &I : *BB) {
23696 // Ignore store instructions that are volatile or have a pointer operand
23697 // that doesn't point to a scalar type.
23698 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23699 if (!SI->isSimple())
23700 continue;
23701 if (!isValidElementType(SI->getValueOperand()->getType()))
23702 continue;
23703 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23704 }
23705
23706 // Ignore getelementptr instructions that have more than one index, a
23707 // constant index, or a pointer operand that doesn't point to a scalar
23708 // type.
23709 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23710 if (GEP->getNumIndices() != 1)
23711 continue;
23712 Value *Idx = GEP->idx_begin()->get();
23713 if (isa<Constant>(Idx))
23714 continue;
23715 if (!isValidElementType(Idx->getType()))
23716 continue;
23717 if (GEP->getType()->isVectorTy())
23718 continue;
23719 GEPs[GEP->getPointerOperand()].push_back(GEP);
23720 }
23721 }
23722}
23723
23724bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23725 bool MaxVFOnly) {
23726 if (VL.size() < 2)
23727 return false;
23728
23729 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23730 << VL.size() << ".\n");
23731
23732 // Check that all of the parts are instructions of the same type,
23733 // we permit an alternate opcode via InstructionsState.
23734 InstructionsState S = getSameOpcode(VL, *TLI);
23735 if (!S)
23736 return false;
23737
23738 Instruction *I0 = S.getMainOp();
23739 // Make sure invalid types (including vector type) are rejected before
23740 // determining vectorization factor for scalar instructions.
23741 for (Value *V : VL) {
23742 Type *Ty = V->getType();
23743 if (!isValidElementType(Ty)) {
23744 // NOTE: the following will give the user an internal LLVM type name, which
23745 // may not be useful.
23746 R.getORE()->emit([&]() {
23747 std::string TypeStr;
23748 llvm::raw_string_ostream OS(TypeStr);
23749 Ty->print(OS);
23750 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23751 << "Cannot SLP vectorize list: type "
23752 << TypeStr + " is unsupported by vectorizer";
23753 });
23754 return false;
23755 }
23756 }
23757
23758 Type *ScalarTy = getValueType(VL[0]);
23759 unsigned Sz = R.getVectorElementSize(I0);
23760 unsigned MinVF = R.getMinVF(Sz);
23761 unsigned MaxVF = std::max<unsigned>(
23762 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23763 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23764 if (MaxVF < 2) {
23765 R.getORE()->emit([&]() {
23766 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23767 << "Cannot SLP vectorize list: vectorization factor "
23768 << "less than 2 is not supported";
23769 });
23770 return false;
23771 }
23772
23773 bool Changed = false;
23774 bool CandidateFound = false;
23775 InstructionCost MinCost = SLPCostThreshold.getValue();
23776
23777 unsigned NextInst = 0, MaxInst = VL.size();
23778 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23779 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23780 // No actual vectorization should happen if the number of parts is the same
23781 // as the provided vectorization factor (i.e. the scalar type is used for the
23782 // vector code during codegen).
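// For illustration (hypothetical target, assumed numbers): if ScalarTy is
// double and the target has no vector FP registers, <4 x double> is legalized
// into 4 parts, so getNumberOfParts(VecTy) == VF and trying VF=4 would only
// reproduce scalar code; such factors are skipped here.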
23783 auto *VecTy = getWidenedType(ScalarTy, VF);
23784 if (TTI->getNumberOfParts(VecTy) == VF)
23785 continue;
23786 for (unsigned I = NextInst; I < MaxInst; ++I) {
23787 unsigned ActualVF = std::min(MaxInst - I, VF);
23788
23789 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23790 continue;
23791
23792 if (MaxVFOnly && ActualVF < MaxVF)
23793 break;
23794 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23795 break;
23796
23797 SmallVector<Value *> Ops(ActualVF, nullptr);
23798 unsigned Idx = 0;
23799 for (Value *V : VL.drop_front(I)) {
23800 // Check that a previous iteration of this loop did not delete the
23801 // Value.
23802 if (auto *Inst = dyn_cast<Instruction>(V);
23803 !Inst || !R.isDeleted(Inst)) {
23804 Ops[Idx] = V;
23805 ++Idx;
23806 if (Idx == ActualVF)
23807 break;
23808 }
23809 }
23810 // Not enough vectorizable instructions - exit.
23811 if (Idx != ActualVF)
23812 break;
23813
23814 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23815 << "\n");
23816
23817 R.buildTree(Ops);
23818 if (R.isTreeTinyAndNotFullyVectorizable())
23819 continue;
23820 if (R.isProfitableToReorder()) {
23821 R.reorderTopToBottom();
23822 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23823 }
23824 R.transformNodes();
23825 R.buildExternalUses();
23826
23827 R.computeMinimumValueSizes();
23828 InstructionCost Cost = R.getTreeCost();
23829 CandidateFound = true;
23830 MinCost = std::min(MinCost, Cost);
23831
23832 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23833 << " for VF=" << ActualVF << "\n");
23834 if (Cost < -SLPCostThreshold) {
23835 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23836 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23837 cast<Instruction>(Ops[0]))
23838 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23839 << " and with tree size "
23840 << ore::NV("TreeSize", R.getTreeSize()));
23841
23842 R.vectorizeTree();
23843 // Move to the next bundle.
23844 I += VF - 1;
23845 NextInst = I + 1;
23846 Changed = true;
23847 }
23848 }
23849 }
23850
23851 if (!Changed && CandidateFound) {
23852 R.getORE()->emit([&]() {
23853 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23854 << "List vectorization was possible but not beneficial with cost "
23855 << ore::NV("Cost", MinCost) << " >= "
23856 << ore::NV("Threshold", -SLPCostThreshold);
23857 });
23858 } else if (!Changed) {
23859 R.getORE()->emit([&]() {
23860 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23861 << "Cannot SLP vectorize list: vectorization was impossible"
23862 << " with available vectorization factors";
23863 });
23864 }
23865 return Changed;
23866}
23867
23868namespace {
23869
23870/// Model horizontal reductions.
23871///
23872/// A horizontal reduction is a tree of reduction instructions that has values
23873/// that can be put into a vector as its leaves. For example:
23874///
23875/// mul mul mul mul
23876/// \ / \ /
23877/// + +
23878/// \ /
23879/// +
23880/// This tree has "mul" as its leaf values and "+" as its reduction
23881/// instructions. A reduction can feed into a store or a binary operation
23882/// feeding a phi.
23883/// ...
23884/// \ /
23885/// +
23886/// |
23887/// phi +=
23888///
23889/// Or:
23890/// ...
23891/// \ /
23892/// +
23893/// |
23894/// *p =
23895///
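/// A minimal IR sketch of such a reduction (illustrative, not from an actual
/// test case):
///   %m0 = mul i32 %a, %b
///   %m1 = mul i32 %c, %d
///   %r = add i32 %m0, %m1
///   store i32 %r, ptr %p
/// Here the "mul" results are the leaves and the "add" feeding the store is
/// the reduction operation.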
23896class HorizontalReduction {
23897 using ReductionOpsType = SmallVector<Value *, 16>;
23898 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23899 ReductionOpsListType ReductionOps;
23900 /// List of possibly reduced values.
23901 SmallVector<SmallVector<Value *>> ReducedVals;
23902 /// Maps reduced value to the corresponding reduction operation.
23903 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23904 WeakTrackingVH ReductionRoot;
23905 /// The type of reduction operation.
23906 RecurKind RdxKind;
23907 /// Checks if the optimization of original scalar identity operations on
23908 /// matched horizontal reductions is enabled and allowed.
23909 bool IsSupportedHorRdxIdentityOp = false;
23910 /// The minimum number of reduced values.
23911 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23912 /// Contains vector values for reduction including their scale factor and
23913 /// signedness.
23914 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23915
23916 static bool isCmpSelMinMax(Instruction *I) {
23917 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23918 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
23919 }
23920
23921 // And/or are potentially poison-safe logical patterns like:
23922 // select x, y, false
23923 // select x, true, y
23924 static bool isBoolLogicOp(Instruction *I) {
23925 return isa<SelectInst>(I) &&
23926 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23927 }
23928
23929 /// Checks if instruction is associative and can be vectorized.
23930 static bool isVectorizable(RecurKind Kind, Instruction *I,
23931 bool TwoElementReduction = false) {
23932 if (Kind == RecurKind::None)
23933 return false;
23934
23935 // Integer ops that map to select instructions or intrinsics are fine.
23936 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23937 isBoolLogicOp(I))
23938 return true;
23939
23940 // No need to check for associativity, if 2 reduced values.
23941 if (TwoElementReduction)
23942 return true;
23943
23944 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23945 // FP min/max are associative except for NaN and -0.0. We do not
23946 // have to rule out -0.0 here because the intrinsic semantics do not
23947 // specify a fixed result for it.
23948 return I->getFastMathFlags().noNaNs();
23949 }
23950
23951 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23952 return true;
23953
23954 return I->isAssociative();
23955 }
23956
23957 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23958 // Poison-safe 'or' takes the form: select X, true, Y
23959 // To make that work with the normal operand processing, we skip the
23960 // true value operand.
23961 // TODO: Change the code and data structures to handle this without a hack.
23962 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23963 return I->getOperand(2);
23964 return I->getOperand(Index);
23965 }
23966
23967 /// Creates reduction operation with the current opcode.
23968 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23969 Value *RHS, const Twine &Name, bool UseSelect) {
23970 Type *OpTy = LHS->getType();
23971 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23972 switch (Kind) {
23973 case RecurKind::Or: {
23974 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23975 return Builder.CreateSelectWithUnknownProfile(
23976 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23977 RHS, DEBUG_TYPE, Name);
23978 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23979 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23980 Name);
23981 }
23982 case RecurKind::And: {
23983 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23984 return Builder.CreateSelectWithUnknownProfile(
23985 LHS, RHS,
23986 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
23987 DEBUG_TYPE, Name);
23988 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23989 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23990 Name);
23991 }
23992 case RecurKind::Add:
23993 case RecurKind::Mul:
23994 case RecurKind::Xor:
23995 case RecurKind::FAdd:
23996 case RecurKind::FMul: {
23997 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23998 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23999 Name);
24000 }
24001 case RecurKind::SMax:
24002 case RecurKind::SMin:
24003 case RecurKind::UMax:
24004 case RecurKind::UMin:
24005 if (UseSelect) {
24006 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
24007 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
24008 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
24009 Name);
24010 }
24011 [[fallthrough]];
24012 case RecurKind::FMax:
24013 case RecurKind::FMin:
24014 case RecurKind::FMaximum:
24015 case RecurKind::FMinimum:
24016 case RecurKind::FMaximumNum:
24017 case RecurKind::FMinimumNum: {
24018 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
24019 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
24020 }
24021 default:
24022 llvm_unreachable("Unknown reduction operation.");
24023 }
24024 }
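// For illustration (sketch of the select-based form produced above): a
// poison-safe i1 'or' reduction step is emitted as
//   %op.rdx = select i1 %lhs, i1 true, i1 %rhs
// rather than
//   %op.rdx = or i1 %lhs, %rhs
// so poison in %rhs does not propagate when %lhs is already true.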
24025
24026 /// Creates reduction operation with the current opcode with the IR flags
24027 /// from \p ReductionOps, dropping nuw/nsw flags.
24028 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
24029 Value *RHS, const Twine &Name,
24030 const ReductionOpsListType &ReductionOps) {
24031 bool UseSelect = ReductionOps.size() == 2 ||
24032 // Logical or/and.
24033 (ReductionOps.size() == 1 &&
24034 any_of(ReductionOps.front(), IsaPred<SelectInst>));
24035 assert((!UseSelect || ReductionOps.size() != 2 ||
24036 isa<SelectInst>(ReductionOps[1][0])) &&
24037 "Expected cmp + select pairs for reduction");
24038 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
24039 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
24040 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
24041 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
24042 /*IncludeWrapFlags=*/false);
24043 propagateIRFlags(Op, ReductionOps[1], nullptr,
24044 /*IncludeWrapFlags=*/false);
24045 return Op;
24046 }
24047 }
24048 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
24049 return Op;
24050 }
24051
24052public:
24053 static RecurKind getRdxKind(Value *V) {
24054 auto *I = dyn_cast<Instruction>(V);
24055 if (!I)
24056 return RecurKind::None;
24057 if (match(I, m_Add(m_Value(), m_Value())))
24058 return RecurKind::Add;
24059 if (match(I, m_Mul(m_Value(), m_Value())))
24060 return RecurKind::Mul;
24061 if (match(I, m_And(m_Value(), m_Value())) ||
24062 match(I, m_LogicalAnd(m_Value(), m_Value())))
24063 return RecurKind::And;
24064 if (match(I, m_Or(m_Value(), m_Value())) ||
24065 match(I, m_LogicalOr(m_Value(), m_Value())))
24066 return RecurKind::Or;
24067 if (match(I, m_Xor(m_Value(), m_Value())))
24068 return RecurKind::Xor;
24069 if (match(I, m_FAdd(m_Value(), m_Value())))
24070 return RecurKind::FAdd;
24071 if (match(I, m_FMul(m_Value(), m_Value())))
24072 return RecurKind::FMul;
24073
24074 if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
24075 return RecurKind::FMax;
24076 if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
24077 return RecurKind::FMin;
24078
24079 if (match(I, m_FMaximum(m_Value(), m_Value())))
24080 return RecurKind::FMaximum;
24081 if (match(I, m_FMinimum(m_Value(), m_Value())))
24082 return RecurKind::FMinimum;
24083 // This matches either cmp+select or intrinsics. SLP is expected to handle
24084 // either form.
24085 // TODO: If we are canonicalizing to intrinsics, we can remove several
24086 // special-case paths that deal with selects.
24087 if (match(I, m_SMax(m_Value(), m_Value())))
24088 return RecurKind::SMax;
24089 if (match(I, m_SMin(m_Value(), m_Value())))
24090 return RecurKind::SMin;
24091 if (match(I, m_UMax(m_Value(), m_Value())))
24092 return RecurKind::UMax;
24093 if (match(I, m_UMin(m_Value(), m_Value())))
24094 return RecurKind::UMin;
24095
24096 if (auto *Select = dyn_cast<SelectInst>(I)) {
24097 // Try harder: look for min/max pattern based on instructions producing
24098 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
24099 // During the intermediate stages of SLP, it's very common to have
24100 // pattern like this (since optimizeGatherSequence is run only once
24101 // at the end):
24102 // %1 = extractelement <2 x i32> %a, i32 0
24103 // %2 = extractelement <2 x i32> %a, i32 1
24104 // %cond = icmp sgt i32 %1, %2
24105 // %3 = extractelement <2 x i32> %a, i32 0
24106 // %4 = extractelement <2 x i32> %a, i32 1
24107 // %select = select i1 %cond, i32 %3, i32 %4
24108 CmpPredicate Pred;
24109 Instruction *L1;
24110 Instruction *L2;
24111
24112 Value *LHS = Select->getTrueValue();
24113 Value *RHS = Select->getFalseValue();
24114 Value *Cond = Select->getCondition();
24115
24116 // TODO: Support inverse predicates.
24117 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
24118 if (!isa<ExtractElementInst>(RHS) ||
24119 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24120 return RecurKind::None;
24121 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
24122 if (!isa<ExtractElementInst>(LHS) ||
24123 !L1->isIdenticalTo(cast<Instruction>(LHS)))
24124 return RecurKind::None;
24125 } else {
24126 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
24127 return RecurKind::None;
24128 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
24129 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
24130 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24131 return RecurKind::None;
24132 }
24133
24134 switch (Pred) {
24135 default:
24136 return RecurKind::None;
24137 case CmpInst::ICMP_SGT:
24138 case CmpInst::ICMP_SGE:
24139 return RecurKind::SMax;
24140 case CmpInst::ICMP_SLT:
24141 case CmpInst::ICMP_SLE:
24142 return RecurKind::SMin;
24143 case CmpInst::ICMP_UGT:
24144 case CmpInst::ICMP_UGE:
24145 return RecurKind::UMax;
24146 case CmpInst::ICMP_ULT:
24147 case CmpInst::ICMP_ULE:
24148 return RecurKind::UMin;
24149 }
24150 }
24151 return RecurKind::None;
24152 }
24153
24154 /// Get the index of the first operand.
24155 static unsigned getFirstOperandIndex(Instruction *I) {
24156 return isCmpSelMinMax(I) ? 1 : 0;
24157 }
24158
24159private:
24160 /// Total number of operands in the reduction operation.
24161 static unsigned getNumberOfOperands(Instruction *I) {
24162 return isCmpSelMinMax(I) ? 3 : 2;
24163 }
24164
24165 /// Checks if the instruction is in basic block \p BB.
24166 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
24167 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24168 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24169 auto *Sel = cast<SelectInst>(I);
24170 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
24171 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24172 }
24173 return I->getParent() == BB;
24174 }
24175
24176 /// Expected number of uses for reduction operations/reduced values.
24177 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24178 if (IsCmpSelMinMax) {
24179 // SelectInst must be used twice while the condition op must have single
24180 // use only.
24181 if (auto *Sel = dyn_cast<SelectInst>(I))
24182 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24183 return I->hasNUses(2);
24184 }
24185
24186 // Arithmetic reduction operation must be used once only.
24187 return I->hasOneUse();
24188 }
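// For illustration (a minimal sketch, not from an actual test case): in a
// cmp+select min/max chain
//   %c1 = icmp sgt i32 %a, %b
//   %m1 = select i1 %c1, i32 %a, i32 %b
//   %c2 = icmp sgt i32 %m1, %x
//   %m2 = select i1 %c2, i32 %m1, i32 %x
// the inner select %m1 is used by both %c2 and %m2 (two uses) while the
// compare %c1 feeds only %m1, which is exactly what the checks above expect.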
24189
24190 /// Initializes the list of reduction operations.
24191 void initReductionOps(Instruction *I) {
24192 if (isCmpSelMinMax(I))
24193 ReductionOps.assign(2, ReductionOpsType());
24194 else
24195 ReductionOps.assign(1, ReductionOpsType());
24196 }
24197
24198 /// Add all reduction operations for the reduction instruction \p I.
24199 void addReductionOps(Instruction *I) {
24200 if (isCmpSelMinMax(I)) {
24201 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
24202 ReductionOps[1].emplace_back(I);
24203 } else {
24204 ReductionOps[0].emplace_back(I);
24205 }
24206 }
24207
24208 static bool isGoodForReduction(ArrayRef<Value *> Data) {
24209 int Sz = Data.size();
24210 auto *I = dyn_cast<Instruction>(Data.front());
24211 return Sz > 1 || isConstant(Data.front()) ||
24212 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
24213 }
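// For illustration: any group with two or more values is considered good; a
// lone load is not, so at its use site in matchAssociativeReduction it is
// merged into the previous group only when it shares an underlying object
// with that group's leading load, and otherwise starts a group of its own.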
24214
24215public:
24216 HorizontalReduction() = default;
24217 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
24218 : ReductionRoot(I), ReductionLimit(2) {
24219 RdxKind = HorizontalReduction::getRdxKind(I);
24220 ReductionOps.emplace_back().push_back(I);
24221 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
24222 for (Value *V : Ops)
24223 ReducedValsToOps[V].push_back(I);
24224 }
24225
24226 bool matchReductionForOperands() const {
24227 // Analyze "regular" integer/FP types for reductions - no target-specific
24228 // types or pointers.
24229 assert(ReductionRoot && "Reduction root is not set!");
24230 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
24231 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24232 return Ops.size() == 2;
24233 })))
24234 return false;
24235
24236 return true;
24237 }
24238
24239 /// Try to find a reduction tree.
24240 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24241 ScalarEvolution &SE, const DataLayout &DL,
24242 const TargetLibraryInfo &TLI) {
24243 RdxKind = HorizontalReduction::getRdxKind(Root);
24244 if (!isVectorizable(RdxKind, Root))
24245 return false;
24246
24247 // Analyze "regular" integer/FP types for reductions - no target-specific
24248 // types or pointers.
24249 Type *Ty = Root->getType();
24250 if (!isValidElementType(Ty) || Ty->isPointerTy())
24251 return false;
24252
24253 // Though the ultimate reduction may have multiple uses, its condition must
24254 // have only a single use.
24255 if (auto *Sel = dyn_cast<SelectInst>(Root))
24256 if (!Sel->getCondition()->hasOneUse())
24257 return false;
24258
24259 ReductionRoot = Root;
24260
24261 // Iterate through all the operands of the possible reduction tree and
24262 // gather all the reduced values, sorting them by their value id.
24263 BasicBlock *BB = Root->getParent();
24264 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24265 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24266 1, std::make_pair(Root, 0));
24267 // Checks if the operands of the \p TreeN instruction are also reduction
24268 // operations or should be treated as reduced values or an extra argument,
24269 // which is not part of the reduction.
24270 auto CheckOperands = [&](Instruction *TreeN,
24271 SmallVectorImpl<Value *> &PossibleReducedVals,
24272 SmallVectorImpl<Instruction *> &ReductionOps,
24273 unsigned Level) {
24274 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24275 getNumberOfOperands(TreeN)))) {
24276 Value *EdgeVal = getRdxOperand(TreeN, I);
24277 ReducedValsToOps[EdgeVal].push_back(TreeN);
24278 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24279 // If the edge is not an instruction, or it differs from the main reduction
24280 // opcode or has too many uses, treat it as a possible reduced value.
24281 // Also, do not try to reduce constant values if the operation is not
24282 // foldable.
24283 if (!EdgeInst || Level > RecursionMaxDepth ||
24284 getRdxKind(EdgeInst) != RdxKind ||
24285 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24286 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24287 !isVectorizable(RdxKind, EdgeInst) ||
24288 (R.isAnalyzedReductionRoot(EdgeInst) &&
24289 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24290 PossibleReducedVals.push_back(EdgeVal);
24291 continue;
24292 }
24293 ReductionOps.push_back(EdgeInst);
24294 }
24295 };
24296 // Try to regroup the reduced values so that it becomes more profitable to
24297 // reduce them. Values are grouped by their value ids, instructions by their
24298 // opcode and/or alternate opcode, with extra analysis for loads (grouping
24299 // them by the distance between pointers) and for compare instructions
24300 // (grouping them by the predicate).
24301 SmallMapVector<
24302 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24303 8>
24304 PossibleReducedVals;
24305 initReductionOps(Root);
24306 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24307 SmallSet<size_t, 2> LoadKeyUsed;
24308
24309 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24310 Key = hash_combine(hash_value(LI->getParent()), Key);
24311 Value *Ptr =
24312 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
24313 if (!LoadKeyUsed.insert(Key).second) {
24314 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24315 if (LIt != LoadsMap.end()) {
24316 for (LoadInst *RLI : LIt->second) {
24317 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24318 LI->getType(), LI->getPointerOperand(), DL, SE,
24319 /*StrictCheck=*/true))
24320 return hash_value(RLI->getPointerOperand());
24321 }
24322 for (LoadInst *RLI : LIt->second) {
24323 if (arePointersCompatible(RLI->getPointerOperand(),
24324 LI->getPointerOperand(), TLI)) {
24325 hash_code SubKey = hash_value(RLI->getPointerOperand());
24326 return SubKey;
24327 }
24328 }
24329 if (LIt->second.size() > 2) {
24330 hash_code SubKey =
24331 hash_value(LIt->second.back()->getPointerOperand());
24332 return SubKey;
24333 }
24334 }
24335 }
24336 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24337 .first->second.push_back(LI);
24338 return hash_value(LI->getPointerOperand());
24339 };
24340
24341 while (!Worklist.empty()) {
24342 auto [TreeN, Level] = Worklist.pop_back_val();
24343 SmallVector<Value *> PossibleRedVals;
24344 SmallVector<Instruction *> PossibleReductionOps;
24345 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24346 addReductionOps(TreeN);
24347 // Add reduction values. The values are sorted for better vectorization
24348 // results.
24349 for (Value *V : PossibleRedVals) {
24350 size_t Key, Idx;
24351 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24352 /*AllowAlternate=*/false);
24353 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24354 }
24355 for (Instruction *I : reverse(PossibleReductionOps))
24356 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24357 }
24358 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24359 // Sort values by the total number of value kinds so that the reduction
24360 // starts from the longest possible sequences of reduced values.
24361 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24362 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24363 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24364 for (auto &Slice : PossibleRedVals) {
24365 PossibleRedValsVect.emplace_back();
24366 auto RedValsVect = Slice.second.takeVector();
24367 stable_sort(RedValsVect, llvm::less_second());
24368 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24369 PossibleRedValsVect.back().append(Data.second, Data.first);
24370 }
24371 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24372 return P1.size() > P2.size();
24373 });
24374 bool First = true;
24375 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24376 if (First) {
24377 First = false;
24378 ReducedVals.emplace_back();
24379 } else if (!isGoodForReduction(Data)) {
24380 auto *LI = dyn_cast<LoadInst>(Data.front());
24381 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24382 if (!LI || !LastLI ||
24383 getUnderlyingObject(LI->getPointerOperand()) !=
24384 getUnderlyingObject(LastLI->getPointerOperand()))
24385 ReducedVals.emplace_back();
24386 }
24387 ReducedVals.back().append(Data.rbegin(), Data.rend());
24388 }
24389 }
24390 // Sort the reduced values by number of same/alternate opcode and/or pointer
24391 // operand.
24392 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24393 return P1.size() > P2.size();
24394 });
24395 return true;
24396 }
24397
24398 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24399 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24400 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24401 DominatorTree &DT) {
24402 constexpr unsigned RegMaxNumber = 4;
24403 constexpr unsigned RedValsMaxNumber = 128;
24404 // If there are a sufficient number of reduction values, reduce
24405 // to a nearby power-of-2. We can safely generate oversized
24406 // vectors and rely on the backend to split them to legal sizes.
24407 if (unsigned NumReducedVals = std::accumulate(
24408 ReducedVals.begin(), ReducedVals.end(), 0,
24409 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24410 if (!isGoodForReduction(Vals))
24411 return Num;
24412 return Num + Vals.size();
24413 });
24414 NumReducedVals < ReductionLimit &&
24415 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24416 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24417 })) {
24418 for (ReductionOpsType &RdxOps : ReductionOps)
24419 for (Value *RdxOp : RdxOps)
24420 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24421 return nullptr;
24422 }
24423
24424 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24425 TargetFolder(DL));
24426 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24427
24428 // Track the reduced values in case if they are replaced by extractelement
24429 // because of the vectorization.
24430 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24431 ReducedVals.front().size());
24432
24433 // The compare instruction of a min/max is the insertion point for new
24434 // instructions and may be replaced with a new compare instruction.
24435 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24436 assert(isa<SelectInst>(RdxRootInst) &&
24437 "Expected min/max reduction to have select root instruction");
24438 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24439 assert(isa<Instruction>(ScalarCond) &&
24440 "Expected min/max reduction to have compare condition");
24441 return cast<Instruction>(ScalarCond);
24442 };
24443
24444 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24445 return isBoolLogicOp(cast<Instruction>(V));
24446 });
24447 // Return new VectorizedTree, based on previous value.
24448 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24449 if (VectorizedTree) {
24450 // Update the final value in the reduction.
24451 Builder.SetCurrentDebugLocation(
24452 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24453 if (AnyBoolLogicOp) {
24454 auto It = ReducedValsToOps.find(VectorizedTree);
24455 auto It1 = ReducedValsToOps.find(Res);
24456 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24457 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24458 (It != ReducedValsToOps.end() &&
24459 any_of(It->getSecond(), [&](Instruction *I) {
24460 return isBoolLogicOp(I) &&
24461 getRdxOperand(I, 0) == VectorizedTree;
24462 }))) {
24463 ;
24464 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24465 (It1 != ReducedValsToOps.end() &&
24466 any_of(It1->getSecond(), [&](Instruction *I) {
24467 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24468 }))) {
24469 std::swap(VectorizedTree, Res);
24470 } else {
24471 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24472 }
24473 }
24474
24475 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24476 ReductionOps);
24477 }
24478 // Initialize the final value in the reduction.
24479 return Res;
24480 };
24481 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24482 ReductionOps.front().size());
24483 for (ReductionOpsType &RdxOps : ReductionOps)
24484 for (Value *RdxOp : RdxOps) {
24485 if (!RdxOp)
24486 continue;
24487 IgnoreList.insert(RdxOp);
24488 }
24489 // Intersect the fast-math-flags from all reduction operations.
24490 FastMathFlags RdxFMF;
24491 RdxFMF.set();
24492 for (Value *U : IgnoreList)
24493 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24494 RdxFMF &= FPMO->getFastMathFlags();
24495 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24496
24497 // Need to track reduced vals, they may be changed during vectorization of
24498 // subvectors.
24499 for (ArrayRef<Value *> Candidates : ReducedVals)
24500 for (Value *V : Candidates)
24501 TrackedVals.try_emplace(V, V);
24502
24503 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24504 Value *V) -> unsigned & {
24505 auto *It = MV.find(V);
24506 assert(It != MV.end() && "Unable to find given key.");
24507 return It->second;
24508 };
24509
24510 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24511 // List of the values that were reduced in other trees as part of gather
24512 // nodes and thus requiring extract if fully vectorized in other trees.
24513 SmallPtrSet<Value *, 4> RequiredExtract;
24514 WeakTrackingVH VectorizedTree = nullptr;
24515 bool CheckForReusedReductionOps = false;
24516 // Try to vectorize elements based on their type.
24517 SmallVector<InstructionsState> States;
24518 for (ArrayRef<Value *> RV : ReducedVals)
24519 States.push_back(getSameOpcode(RV, TLI));
24520 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24521 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24522 InstructionsState S = States[I];
24523 SmallVector<Value *> Candidates;
24524 Candidates.reserve(2 * OrigReducedVals.size());
24525 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24526 for (Value *ReducedVal : OrigReducedVals) {
24527 Value *RdxVal = TrackedVals.at(ReducedVal);
24528 // Check if the reduction value was not overridden by an extractelement
24529 // instruction because of the vectorization, and exclude it if it is not
24530 // compatible with the other values.
24531 // Also check if the instruction was folded to a constant/other value.
24532 auto *Inst = dyn_cast<Instruction>(RdxVal);
24533 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24534 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24535 (S && !Inst))
24536 continue;
24537 Candidates.push_back(RdxVal);
24538 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24539 }
24540 bool ShuffledExtracts = false;
24541 // Try to handle shuffled extractelements.
24542 if (S && S.getOpcode() == Instruction::ExtractElement &&
24543 !S.isAltShuffle() && I + 1 < E) {
24544 SmallVector<Value *> CommonCandidates(Candidates);
24545 for (Value *RV : ReducedVals[I + 1]) {
24546 Value *RdxVal = TrackedVals.at(RV);
24547 // Check if the reduction value was not overridden by the
24548 // extractelement instruction because of the vectorization, and
24549 // exclude it if it is not compatible with the other values.
24550 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24551 if (!Inst)
24552 continue;
24553 CommonCandidates.push_back(RdxVal);
24554 TrackedToOrig.try_emplace(RdxVal, RV);
24555 }
24556 SmallVector<int> Mask;
24557 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24558 ++I;
24559 Candidates.swap(CommonCandidates);
24560 ShuffledExtracts = true;
24561 }
24562 }
24563
24564 // Emit code for constant values.
24565 if (Candidates.size() > 1 && allConstant(Candidates)) {
24566 Value *Res = Candidates.front();
24567 Value *OrigV = TrackedToOrig.at(Candidates.front());
24568 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24569 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24570 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24571 Value *OrigV = TrackedToOrig.at(VC);
24572 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24573 if (auto *ResI = dyn_cast<Instruction>(Res))
24574 V.analyzedReductionRoot(ResI);
24575 }
24576 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24577 continue;
24578 }
24579
24580 unsigned NumReducedVals = Candidates.size();
24581 if (NumReducedVals < ReductionLimit &&
24582 (NumReducedVals < 2 || !isSplat(Candidates)))
24583 continue;
24584
24585 // Check if we support repeated scalar values processing (optimization of
24586 // original scalar identity operations on matched horizontal reductions).
24587 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24588 RdxKind != RecurKind::FMul &&
24589 RdxKind != RecurKind::FMulAdd;
24590 // Gather same values.
24591 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24592 if (IsSupportedHorRdxIdentityOp)
24593 for (Value *V : Candidates) {
24594 Value *OrigV = TrackedToOrig.at(V);
24595 ++SameValuesCounter.try_emplace(OrigV).first->second;
24596 }
24597 // Used to check if the reduced values are used the same number of times. In
24598 // that case the compiler may produce better code. E.g. if the reduced values are
24599 // aabbccdd (8 x values), then the first node of the tree will have a node
24600 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24601 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24602 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
24603 // x abcd) * 2.
24604 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24605 // this analysis, other operations may require an extra estimation of
24606 // the profitability.
24607 bool SameScaleFactor = false;
24608 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24609 SameValuesCounter.size() != Candidates.size();
24610 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24611 if (OptReusedScalars) {
24612 SameScaleFactor =
24613 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24614 RdxKind == RecurKind::Xor) &&
24615 all_of(drop_begin(SameValuesCounter),
24616 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24617 return P.second == SameValuesCounter.front().second;
24618 });
24619 Candidates.resize(SameValuesCounter.size());
24620 transform(SameValuesCounter, Candidates.begin(),
24621 [&](const auto &P) { return TrackedVals.at(P.first); });
24622 NumReducedVals = Candidates.size();
24623 // Have a reduction of the same element.
24624 if (NumReducedVals == 1) {
24625 Value *OrigV = TrackedToOrig.at(Candidates.front());
24626 unsigned Cnt = At(SameValuesCounter, OrigV);
24627 Value *RedVal =
24628 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24629 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24630 VectorizedVals.try_emplace(OrigV, Cnt);
24631 ExternallyUsedValues.insert(OrigV);
24632 continue;
24633 }
24634 }
24635
24636 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24637 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24638 const unsigned MaxElts = std::clamp<unsigned>(
24639 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24640 RegMaxNumber * RedValsMaxNumber);
24641
24642 unsigned ReduxWidth = NumReducedVals;
24643 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24644 unsigned NumParts, NumRegs;
24645 Type *ScalarTy = Candidates.front()->getType();
24646 ReduxWidth =
24647 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24648 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24649 NumParts = ::getNumberOfParts(TTI, Tp);
24650 NumRegs =
24651 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24652 while (NumParts > NumRegs) {
24653 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24654 ReduxWidth = bit_floor(ReduxWidth - 1);
24655 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24656 NumParts = ::getNumberOfParts(TTI, Tp);
24657 NumRegs =
24658 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24659 }
24660 if (NumParts > NumRegs / 2)
24661 ReduxWidth = bit_floor(ReduxWidth);
24662 return ReduxWidth;
24663 };
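// For illustration (assumed, hypothetical numbers): starting from
// ReduxWidth = 24 i32 elements, if <24 x i32> needs more parts than the
// target has vector registers, the width is lowered to bit_floor(24 - 1) = 16
// and re-checked until the widened type fits the register budget.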
24664 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24665 ReduxWidth = GetVectorFactor(ReduxWidth);
24666 ReduxWidth = std::min(ReduxWidth, MaxElts);
24667
24668 unsigned Start = 0;
24669 unsigned Pos = Start;
24670 // Restarts vectorization attempt with lower vector factor.
24671 unsigned PrevReduxWidth = ReduxWidth;
24672 bool CheckForReusedReductionOpsLocal = false;
24673 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24674 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24675 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24676 // Check if any of the reduction ops are gathered. If so, it is worth
24677 // trying again with a smaller number of reduction ops.
24678 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24679 }
24680 ++Pos;
24681 if (Pos < NumReducedVals - ReduxWidth + 1)
24682 return IsAnyRedOpGathered;
24683 Pos = Start;
24684 --ReduxWidth;
24685 if (ReduxWidth > 1)
24686 ReduxWidth = GetVectorFactor(ReduxWidth);
24687 return IsAnyRedOpGathered;
24688 };
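// For illustration: AdjustReducedVals slides the window of ReduxWidth
// candidates one position to the right; once every start position has been
// tried (e.g. offsets 0..4 for 8 candidates at width 4), it resets to the
// start and retries with a smaller vector factor.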
24689 bool AnyVectorized = false;
24690 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24691 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24692 ReduxWidth >= ReductionLimit) {
24693 // Dependency in tree of the reduction ops - drop this attempt, try
24694 // later.
24695 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24696 Start == 0) {
24697 CheckForReusedReductionOps = true;
24698 break;
24699 }
24700 PrevReduxWidth = ReduxWidth;
24701 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24702 // Been analyzed already - skip.
24703 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24704 (!has_single_bit(ReduxWidth) &&
24705 (IgnoredCandidates.contains(
24706 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24707 IgnoredCandidates.contains(
24708 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24709 bit_floor(ReduxWidth))))) ||
24710 V.areAnalyzedReductionVals(VL)) {
24711 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24712 continue;
24713 }
24714 // Early exit if any of the reduction values were deleted during
24715 // previous vectorization attempts.
24716 if (any_of(VL, [&V](Value *RedVal) {
24717 auto *RedValI = dyn_cast<Instruction>(RedVal);
24718 return RedValI && V.isDeleted(RedValI);
24719 }))
24720 break;
24721 V.buildTree(VL, IgnoreList);
24722 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24723 if (!AdjustReducedVals())
24724 V.analyzedReductionVals(VL);
24725 continue;
24726 }
24727 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24728 if (!AdjustReducedVals())
24729 V.analyzedReductionVals(VL);
24730 continue;
24731 }
24732 V.reorderTopToBottom();
24733 // No need to reorder the root node at all for reassociative reduction.
24734 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24735 VL.front()->getType()->isIntOrIntVectorTy() ||
24736 ReductionLimit > 2);
24737 // Keep extracted other reduction values, if they are used in the
24738 // vectorization trees.
24739 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24740 ExternallyUsedValues);
24741 // The reduction root is used as the insertion point for new
24742 // instructions, so set it as externally used to prevent it from being
24743 // deleted.
24744 LocalExternallyUsedValues.insert(ReductionRoot);
24745 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24746 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24747 continue;
24748 for (Value *V : ReducedVals[Cnt])
24749 if (isa<Instruction>(V))
24750 LocalExternallyUsedValues.insert(TrackedVals[V]);
24751 }
24752 if (!IsSupportedHorRdxIdentityOp) {
24753 // Number of uses of the candidates in the vector of values.
24754 assert(SameValuesCounter.empty() &&
24755 "Reused values counter map is not empty");
24756 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24757 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24758 continue;
24759 Value *V = Candidates[Cnt];
24760 Value *OrigV = TrackedToOrig.at(V);
24761 ++SameValuesCounter.try_emplace(OrigV).first->second;
24762 }
24763 }
24764 V.transformNodes();
24765 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24766 // Gather externally used values.
24767 SmallPtrSet<Value *, 4> Visited;
24768 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24769 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24770 continue;
24771 Value *RdxVal = Candidates[Cnt];
24772 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24773 RdxVal = It->second;
24774 if (!Visited.insert(RdxVal).second)
24775 continue;
24776 // Check if the scalar was vectorized as part of the vectorization
24777 // tree but not the top node.
24778 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24779 LocalExternallyUsedValues.insert(RdxVal);
24780 continue;
24781 }
24782 Value *OrigV = TrackedToOrig.at(RdxVal);
24783 unsigned NumOps =
24784 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24785 if (NumOps != ReducedValsToOps.at(OrigV).size())
24786 LocalExternallyUsedValues.insert(RdxVal);
24787 }
24788 // Do not need the list of reused scalars in regular mode anymore.
24789 if (!IsSupportedHorRdxIdentityOp)
24790 SameValuesCounter.clear();
24791 for (Value *RdxVal : VL)
24792 if (RequiredExtract.contains(RdxVal))
24793 LocalExternallyUsedValues.insert(RdxVal);
24794 V.buildExternalUses(LocalExternallyUsedValues);
24795
24796 V.computeMinimumValueSizes();
24797
24798 // Estimate cost.
24799 InstructionCost ReductionCost =
24800 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24801 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24802 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24803 << " for reduction\n");
24804 if (!Cost.isValid())
24805 break;
24806 if (Cost >= -SLPCostThreshold) {
24807 V.getORE()->emit([&]() {
24808 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24809 ReducedValsToOps.at(VL[0]).front())
24810 << "Vectorizing horizontal reduction is possible "
24811 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24812 << " and threshold "
24813 << ore::NV("Threshold", -SLPCostThreshold);
24814 });
24815 if (!AdjustReducedVals()) {
24816 V.analyzedReductionVals(VL);
24817 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24818 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24819 // Add subvectors of VL to the list of the analyzed values.
24820 for (unsigned VF = getFloorFullVectorNumberOfElements(
24821 *TTI, VL.front()->getType(), ReduxWidth - 1);
24822 VF >= ReductionLimit;
24823 VF = getFloorFullVectorNumberOfElements(
24824 *TTI, VL.front()->getType(), VF - 1)) {
24825 if (has_single_bit(VF) &&
24826 V.getCanonicalGraphSize() != V.getTreeSize())
24827 continue;
24828 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24829 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24830 }
24831 }
24832 }
24833 continue;
24834 }
24835
24836 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24837 << Cost << ". (HorRdx)\n");
24838 V.getORE()->emit([&]() {
24839 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24840 ReducedValsToOps.at(VL[0]).front())
24841 << "Vectorized horizontal reduction with cost "
24842 << ore::NV("Cost", Cost) << " and with tree size "
24843 << ore::NV("TreeSize", V.getTreeSize());
24844 });
24845
24846 Builder.setFastMathFlags(RdxFMF);
24847
24848 // Emit a reduction. If the root is a select (min/max idiom), the insert
24849 // point is the compare condition of that select.
24850 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24851 Instruction *InsertPt = RdxRootInst;
24852 if (IsCmpSelMinMax)
24853 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24854
24855 // Vectorize a tree.
24856 Value *VectorizedRoot = V.vectorizeTree(
24857 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24858 // Update TrackedToOrig mapping, since the tracked values might be
24859 // updated.
24860 for (Value *RdxVal : Candidates) {
24861 Value *OrigVal = TrackedToOrig.at(RdxVal);
24862 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24863 if (TransformedRdxVal != RdxVal)
24864 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24865 }
24866
24867 Builder.SetInsertPoint(InsertPt);
24868
24869 // To prevent poison from leaking across what used to be sequential,
24870 // safe, scalar boolean logic operations, the reduction operand must be
24871 // frozen.
24872 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24873 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24874
24875 // Emit code to correctly handle reused reduced values, if required.
24876 if (OptReusedScalars && !SameScaleFactor) {
24877 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24878 SameValuesCounter, TrackedToOrig);
24879 }
24880
24881 Type *ScalarTy = VL.front()->getType();
24882 Type *VecTy = VectorizedRoot->getType();
24883 Type *RedScalarTy = VecTy->getScalarType();
24884 VectorValuesAndScales.emplace_back(
24885 VectorizedRoot,
24886 OptReusedScalars && SameScaleFactor
24887 ? SameValuesCounter.front().second
24888 : 1,
24889 RedScalarTy != ScalarTy->getScalarType()
24890 ? V.isSignedMinBitwidthRootNode()
24891 : true);
24892
24893 // Count vectorized reduced values to exclude them from final reduction.
24894 for (Value *RdxVal : VL) {
24895 Value *OrigV = TrackedToOrig.at(RdxVal);
24896 if (IsSupportedHorRdxIdentityOp) {
24897 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24898 continue;
24899 }
24900 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24901 if (!V.isVectorized(RdxVal))
24902 RequiredExtract.insert(RdxVal);
24903 }
24904 Pos += ReduxWidth;
24905 Start = Pos;
24906 ReduxWidth = NumReducedVals - Pos;
24907 if (ReduxWidth > 1)
24908 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24909 AnyVectorized = true;
24910 }
24911 if (OptReusedScalars && !AnyVectorized) {
24912 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24913 Value *RdxVal = TrackedVals.at(P.first);
24914 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24915 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24916 VectorizedVals.try_emplace(P.first, P.second);
24917 }
24918 continue;
24919 }
24920 }
24921 if (!VectorValuesAndScales.empty())
24922 VectorizedTree = GetNewVectorizedTree(
24923 VectorizedTree,
24924 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24925
24926 if (!VectorizedTree) {
24927 if (!CheckForReusedReductionOps) {
24928 for (ReductionOpsType &RdxOps : ReductionOps)
24929 for (Value *RdxOp : RdxOps)
24930 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24931 }
24932 return nullptr;
24933 }
24934
24935 // Reorder operands of bool logical op in the natural order to avoid
24936 // possible problem with poison propagation. If not possible to reorder
24937 // (both operands are originally RHS), emit an extra freeze instruction
24938 // for the LHS operand.
24939 // I.e., if we have original code like this:
24940 // RedOp1 = select i1 ?, i1 LHS, i1 false
24941 // RedOp2 = select i1 RHS, i1 ?, i1 false
24942
24943 // Then, we swap LHS/RHS to create a new op that matches the poison
24944 // semantics of the original code.
24945
24946 // If we have original code like this and both values could be poison:
24947 // RedOp1 = select i1 ?, i1 LHS, i1 false
24948 // RedOp2 = select i1 ?, i1 RHS, i1 false
24949
24950 // Then, we must freeze LHS in the new op.
24951 auto FixBoolLogicalOps =
24952 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24953 Instruction *RedOp2, bool InitStep) {
24954 if (!AnyBoolLogicOp)
24955 return;
24956 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24957 getRdxOperand(RedOp1, 0) == LHS ||
24958 isGuaranteedNotToBePoison(LHS, AC)))
24959 return;
24960 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24961 getRdxOperand(RedOp2, 0) == RHS ||
24962 isGuaranteedNotToBePoison(RHS, AC))) {
24963 std::swap(LHS, RHS);
24964 return;
24965 }
24966 if (LHS != VectorizedTree)
24967 LHS = Builder.CreateFreeze(LHS);
24968 };
24969 // Finish the reduction.
24970 // Need to add the extra arguments and the possible reduction values that were
24971 // not vectorized, and try to avoid dependencies between the scalar remainders after reductions.
24972 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24973 bool InitStep) {
24974 unsigned Sz = InstVals.size();
24975 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24976 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24977 Instruction *RedOp = InstVals[I + 1].first;
24978 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24979 Value *RdxVal1 = InstVals[I].second;
24980 Value *StableRdxVal1 = RdxVal1;
24981 auto It1 = TrackedVals.find(RdxVal1);
24982 if (It1 != TrackedVals.end())
24983 StableRdxVal1 = It1->second;
24984 Value *RdxVal2 = InstVals[I + 1].second;
24985 Value *StableRdxVal2 = RdxVal2;
24986 auto It2 = TrackedVals.find(RdxVal2);
24987 if (It2 != TrackedVals.end())
24988 StableRdxVal2 = It2->second;
24989 // To prevent poison from leaking across what used to be sequential,
24990 // safe, scalar boolean logic operations, the reduction operand must be
24991 // frozen.
24992 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24993 RedOp, InitStep);
24994 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24995 StableRdxVal2, "op.rdx", ReductionOps);
24996 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24997 }
24998 if (Sz % 2 == 1)
24999 ExtraReds[Sz / 2] = InstVals.back();
25000 return ExtraReds;
25001 };
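// For illustration: FinalGen reduces the remaining scalar values pairwise,
// so leftovers r1..r5 become (r1 op r2), (r3 op r4), r5 after one round; the
// loop below repeats such rounds until a single value remains.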
25002 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
25003 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
25004 VectorizedTree);
25005 SmallPtrSet<Value *, 8> Visited;
25006 for (ArrayRef<Value *> Candidates : ReducedVals) {
25007 for (Value *RdxVal : Candidates) {
25008 if (!Visited.insert(RdxVal).second)
25009 continue;
25010 unsigned NumOps = VectorizedVals.lookup(RdxVal);
25011 for (Instruction *RedOp :
25012 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
25013 ExtraReductions.emplace_back(RedOp, RdxVal);
25014 }
25015 }
25016 // Iterate through all not-vectorized reduction values/extra arguments.
25017 bool InitStep = true;
25018 while (ExtraReductions.size() > 1) {
25019 SmallVector<std::pair<Instruction *, Value *>> NewReds =
25020 FinalGen(ExtraReductions, InitStep);
25021 ExtraReductions.swap(NewReds);
25022 InitStep = false;
25023 }
25024 VectorizedTree = ExtraReductions.front().second;
25025
25026 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25027
25028 // The original scalar reduction is expected to have no remaining
25029 // uses outside the reduction tree itself. Assert that we got this
25030 // correct, replace internal uses with poison, and mark for eventual
25031 // deletion.
25032#ifndef NDEBUG
25033 SmallPtrSet<Value *, 4> IgnoreSet;
25034 for (ArrayRef<Value *> RdxOps : ReductionOps)
25035 IgnoreSet.insert_range(RdxOps);
25036#endif
25037 for (ArrayRef<Value *> RdxOps : ReductionOps) {
25038 for (Value *Ignore : RdxOps) {
25039 if (!Ignore)
25040 continue;
25041#ifndef NDEBUG
25042 for (auto *U : Ignore->users()) {
25043 assert(IgnoreSet.count(U) &&
25044 "All users must be in the reduction ops list.");
25045 }
25046#endif
25047 if (!Ignore->use_empty()) {
25048 Value *P = PoisonValue::get(Ignore->getType());
25049 Ignore->replaceAllUsesWith(P);
25050 }
25051 }
25052 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25053 }
25054 return VectorizedTree;
25055 }
25056
25057private:
25058 /// Creates the reduction from the given \p Vec vector value with the given
25059 /// scale \p Scale and signedness \p IsSigned.
25060 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25061 Value *Vec, unsigned Scale, bool IsSigned,
25062 Type *DestTy) {
25063 Value *Rdx;
25064 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
25065 unsigned DestTyNumElements = getNumElements(VecTy);
25066 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
25067 Rdx = PoisonValue::get(
25068 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
25069 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
25070 // Do reduction for each lane.
25071 // e.g., do reduce add for
25072 // VL[0] = <4 x Ty> <a, b, c, d>
25073 // VL[1] = <4 x Ty> <e, f, g, h>
25074 // Lane[0] = <2 x Ty> <a, e>
25075 // Lane[1] = <2 x Ty> <b, f>
25076 // Lane[2] = <2 x Ty> <c, g>
25077 // Lane[3] = <2 x Ty> <d, h>
25078 // result[0] = reduce add Lane[0]
25079 // result[1] = reduce add Lane[1]
25080 // result[2] = reduce add Lane[2]
25081 // result[3] = reduce add Lane[3]
25082 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
25083 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
25084 Rdx = Builder.CreateInsertElement(
25085 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
25086 }
25087 } else {
25088 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
25089 }
25090 if (Rdx->getType() != DestTy)
25091 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
25092 // Improved analysis for add/fadd/xor reductions with same scale
25093 // factor for all operands of reductions. We can emit scalar ops for
25094 // them instead.
25095 if (Scale > 1)
25096 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25097 return Rdx;
25098 }
25099
25100 /// Calculate the cost of a reduction.
25101 InstructionCost getReductionCost(TargetTransformInfo *TTI,
25102 ArrayRef<Value *> ReducedVals,
25103 bool IsCmpSelMinMax, FastMathFlags FMF,
25104 const BoUpSLP &R, DominatorTree &DT,
25105 const DataLayout &DL,
25106 const TargetLibraryInfo &TLI) {
25107 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25108 Type *ScalarTy = ReducedVals.front()->getType();
25109 unsigned ReduxWidth = ReducedVals.size();
25110 FixedVectorType *VectorTy = R.getReductionType();
25111 InstructionCost VectorCost = 0, ScalarCost;
25112 // If all of the reduced values are constant, the vector cost is 0, since
25113 // the reduction value can be calculated at compile time.
25114 bool AllConsts = allConstant(ReducedVals);
25115 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
25116 InstructionCost Cost;
25117 // Scalar cost is repeated for N-1 elements.
25118 int Cnt = ReducedVals.size();
25119 for (Value *RdxVal : ReducedVals) {
25120 if (Cnt == 1)
25121 break;
25122 --Cnt;
25123 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
25124 Cost += GenCostFn();
25125 continue;
25126 }
25127 InstructionCost ScalarCost = 0;
25128 for (User *U : RdxVal->users()) {
25129 auto *RdxOp = cast<Instruction>(U);
25130 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25131 if (RdxKind == RecurKind::FAdd) {
25132 InstructionCost FMACost = canConvertToFMA(
25133 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
25134 if (FMACost.isValid()) {
25135 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
25136 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
25137 // Also, exclude scalar fmul cost.
25138 InstructionCost FMulCost =
25139 TTI->getInstructionCost(I, CostKind);
25140 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
25141 FMACost -= FMulCost;
25142 }
25143 ScalarCost += FMACost;
25144 continue;
25145 }
25146 }
25147 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
25148 continue;
25149 }
25150 ScalarCost = InstructionCost::getInvalid();
25151 break;
25152 }
25153 if (ScalarCost.isValid())
25154 Cost += ScalarCost;
25155 else
25156 Cost += GenCostFn();
25157 }
25158 return Cost;
25159 };
25160 // Require reduction cost if:
25161 // 1. This type is not a full register type and there are no other vectors
25162 // with the same type in the storage (first vector with small type).
25163 // 2. The storage does not have any vector with full vector use (first
25164 // vector with full register use).
25165 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
25166 switch (RdxKind) {
25167 case RecurKind::Add:
25168 case RecurKind::Mul:
25169 case RecurKind::Or:
25170 case RecurKind::And:
25171 case RecurKind::Xor:
25172 case RecurKind::FAdd:
25173 case RecurKind::FMul: {
25174 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
25175 if (!AllConsts) {
25176 if (DoesRequireReductionOp) {
25177 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
25178 assert(SLPReVec && "FixedVectorType is not expected.");
25179 unsigned ScalarTyNumElements = VecTy->getNumElements();
25180 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
25181 VectorCost += TTI->getShuffleCost(
25182 TTI::SK_PermuteSingleSrc,
25183 FixedVectorType::get(VecTy->getScalarType(),
25184 ReducedVals.size()),
25185 VectorTy,
25186 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
25187 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
25188 FMF, CostKind);
25189 }
25190 VectorCost += TTI->getScalarizationOverhead(
25191 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
25192 /*Extract*/ false, TTI::TCK_RecipThroughput);
25193 } else {
25194 Type *RedTy = VectorTy->getElementType();
25195 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25196 std::make_pair(RedTy, true));
25197 if (RType == RedTy) {
25198 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
25199 FMF, CostKind);
25200 } else {
25201 VectorCost = TTI->getExtendedReductionCost(
25202 RdxOpcode, !IsSigned, RedTy,
25203 getWidenedType(RType, ReduxWidth), FMF, CostKind);
25204 }
25205 }
25206 } else {
25207 Type *RedTy = VectorTy->getElementType();
25208 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25209 std::make_pair(RedTy, true));
25210 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25211 InstructionCost FMACost = InstructionCost::getInvalid();
25212 if (RdxKind == RecurKind::FAdd) {
25213 // Check if the reduction operands can be converted to FMA.
25214 SmallVector<Value *> Ops;
25215 FastMathFlags FMF;
25216 FMF.set();
25217 for (Value *RdxVal : ReducedVals) {
25218 if (!RdxVal->hasOneUse()) {
25219 Ops.clear();
25220 break;
25221 }
25222 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
25223 FMF &= FPCI->getFastMathFlags();
25224 Ops.push_back(RdxVal->user_back());
25225 }
25226 if (!Ops.empty()) {
25227 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
25228 *TTI, TLI);
25229 if (FMACost.isValid()) {
25230 // Calculate actual FMAD cost.
25231 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25232 {RVecTy, RVecTy, RVecTy}, FMF);
25233 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
25234
25235 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
25236 // Also, exclude vector fmul cost.
25237 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
25238 Instruction::FMul, RVecTy, CostKind);
25239 LLVM_DEBUG(dbgs()
25240 << "Minus vector FMul cost: " << FMulCost << "\n");
25241 FMACost -= FMulCost;
25242 }
25243 }
25244 }
25245 if (FMACost.isValid())
25246 VectorCost += FMACost;
25247 else
25248 VectorCost +=
25249 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
25250 if (RType != RedTy) {
25251 unsigned Opcode = Instruction::Trunc;
25252 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25253 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25254 VectorCost += TTI->getCastInstrCost(
25255 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25256 }
25257 }
25258 }
25259 ScalarCost = EvaluateScalarCost([&]() {
25260 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
25261 });
25262 break;
25263 }
25264 case RecurKind::FMax:
25265 case RecurKind::FMin:
25266 case RecurKind::FMaximum:
25267 case RecurKind::FMinimum:
25268 case RecurKind::SMax:
25269 case RecurKind::SMin:
25270 case RecurKind::UMax:
25271 case RecurKind::UMin: {
25272 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
25273 if (!AllConsts) {
25274 if (DoesRequireReductionOp) {
25275 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
25276 } else {
25277 // Check if the previous reduction already exists and account for it as a
25278 // series of operations + a single reduction.
25279 Type *RedTy = VectorTy->getElementType();
25280 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25281 std::make_pair(RedTy, true));
25282 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25283 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25284 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
25285 if (RType != RedTy) {
25286 unsigned Opcode = Instruction::Trunc;
25287 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25288 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25289 VectorCost += TTI->getCastInstrCost(
25290 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25291 }
25292 }
25293 }
25294 ScalarCost = EvaluateScalarCost([&]() {
25295 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25296 return TTI->getIntrinsicInstrCost(ICA, CostKind);
25297 });
25298 break;
25299 }
25300 default:
25301 llvm_unreachable("Expected arithmetic or min/max reduction operation");
25302 }
25303
25304 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25305 << " for reduction of " << shortBundleName(ReducedVals)
25306 << " (It is a splitting reduction)\n");
25307 return VectorCost - ScalarCost;
25308 }
25309
25310 /// Splits the values, stored in VectorValuesAndScales, into registers/free
25311 /// sub-registers, combines them with the given reduction operation as a
25312 /// vector operation and then performs a single (small enough) reduction.
25313 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25314 Type *DestTy) {
25315 Value *ReducedSubTree = nullptr;
25316 // Creates reduction and combines with the previous reduction.
25317 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25318 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25319 if (ReducedSubTree)
25320 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25321 "op.rdx", ReductionOps);
25322 else
25323 ReducedSubTree = Rdx;
25324 };
25325 if (VectorValuesAndScales.size() == 1) {
25326 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25327 CreateSingleOp(Vec, Scale, IsSigned);
25328 return ReducedSubTree;
25329 }
25330 // Scales Vec using the given Cnt scale factor and then combines the result
25331 // with the previously accumulated vector value (VecRes).
25332 Value *VecRes = nullptr;
25333 bool VecResSignedness = false;
25334 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25335 Type *ScalarTy = Vec->getType()->getScalarType();
25336 // Scale Vec using given Cnt scale factor.
25337 if (Cnt > 1) {
25338 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25339 switch (RdxKind) {
25340 case RecurKind::Add: {
25341 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25342 unsigned VF = getNumElements(Vec->getType());
25343 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << " of " << Vec
25344 << ". (HorRdx)\n");
25345 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25346 for (unsigned I : seq<unsigned>(Cnt))
25347 std::iota(std::next(Mask.begin(), VF * I),
25348 std::next(Mask.begin(), VF * (I + 1)), 0);
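// Replicating the <VF x i1> vector Cnt times makes the later ctpop-based
// reduction count each set bit Cnt times.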
25349 ++NumVectorInstructions;
25350 Vec = Builder.CreateShuffleVector(Vec, Mask);
25351 break;
25352 }
25353 // res = mul vv, n
25354 if (ScalarTy != DestTy->getScalarType())
25355 Vec = Builder.CreateIntCast(
25356 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25357 IsSigned);
25358 Value *Scale = ConstantVector::getSplat(
25359 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
25360 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of " << Vec
25361 << ". (HorRdx)\n");
25362 ++NumVectorInstructions;
25363 Vec = Builder.CreateMul(Vec, Scale);
25364 break;
25365 }
25366 case RecurKind::Xor: {
25367 // res = n % 2 ? vv : 0
25369 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25370 if (Cnt % 2 == 0)
25371 Vec = Constant::getNullValue(Vec->getType());
25372 break;
25373 }
25374 case RecurKind::FAdd: {
25375 // res = fmul v, n
25376 Value *Scale =
25377 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
25378 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of " << Vec
25379 << ". (HorRdx)\n");
25380 ++NumVectorInstructions;
25381 Vec = Builder.CreateFMul(Vec, Scale);
25382 break;
25383 }
25384 case RecurKind::And:
25385 case RecurKind::Or:
25386 case RecurKind::SMax:
25387 case RecurKind::SMin:
25388 case RecurKind::UMax:
25389 case RecurKind::UMin:
25390 case RecurKind::FMax:
25391 case RecurKind::FMin:
25392 case RecurKind::FMaximum:
25393 case RecurKind::FMinimum:
25394 // res = vv
25395 break;
25396 case RecurKind::Sub:
25397 case RecurKind::AddChainWithSubs:
25398 case RecurKind::Mul:
25399 case RecurKind::FMul:
25400 case RecurKind::FMulAdd:
25401 case RecurKind::AnyOf:
25402 case RecurKind::FindFirstIVSMin:
25403 case RecurKind::FindFirstIVUMin:
25404 case RecurKind::FindLastIVSMax:
25405 case RecurKind::FindLastIVUMax:
25406 case RecurKind::FMaxNum:
25407 case RecurKind::FMinNum:
25408 case RecurKind::FMaximumNum:
25409 case RecurKind::FMinimumNum:
25410 case RecurKind::None:
25411 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25412 }
25413 }
25414 // Combine Vec with the previous VecOp.
25415 if (!VecRes) {
25416 VecRes = Vec;
25417 VecResSignedness = IsSigned;
25418 } else {
25419 ++NumVectorInstructions;
25420 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25421 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
25422 // Handle ctpop.
25423 unsigned VecResVF = getNumElements(VecRes->getType());
25424 unsigned VecVF = getNumElements(Vec->getType());
25425 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
25426 std::iota(Mask.begin(), Mask.end(), 0);
25427 // Ensure that VecRes is always larger than Vec
25428 if (VecResVF < VecVF) {
25429 std::swap(VecRes, Vec);
25430 std::swap(VecResVF, VecVF);
25431 }
25432 if (VecResVF != VecVF) {
25433 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
25434 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25435 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
25436 }
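// Concatenate the two i1 vectors; the final reduction then counts the bits
// of both with a single ctpop.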
25437 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
25438 return;
25439 }
25440 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
25441 VecRes = Builder.CreateIntCast(
25442 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
25443 VecResSignedness);
25444 if (ScalarTy != DestTy->getScalarType())
25445 Vec = Builder.CreateIntCast(
25446 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25447 IsSigned);
25448 unsigned VecResVF = getNumElements(VecRes->getType());
25449 unsigned VecVF = getNumElements(Vec->getType());
25450 // Ensure that VecRes is always larger than Vec
25451 if (VecResVF < VecVF) {
25452 std::swap(VecRes, Vec);
25453 std::swap(VecResVF, VecVF);
25454 }
25455 // extract + op + insert
25456 Value *Op = VecRes;
25457 if (VecResVF != VecVF)
25458 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25459 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25460 if (VecResVF != VecVF)
25461 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25462 VecRes = Op;
25463 }
25464 };
25465 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25466 CreateVecOp(Vec, Scale, IsSigned);
25467 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25468
25469 return ReducedSubTree;
25470 }
25471
25472 /// Emit a horizontal reduction of the vectorized value.
25473 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25474 const TargetTransformInfo *TTI, Type *DestTy) {
25475 assert(VectorizedValue && "Need to have a vectorized tree node");
25476 assert(RdxKind != RecurKind::FMulAdd &&
25477 "A call to the llvm.fmuladd intrinsic is not handled yet");
25478
25479 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25480 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25481 RdxKind == RecurKind::Add &&
25482 DestTy->getScalarType() != FTy->getScalarType()) {
25483 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25484 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
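// e.g., an <8 x i1> mask becomes ctpop(bitcast <8 x i1> to i8); the caller
// (createSingleOp) then casts the popcount result to DestTy.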
25485 Value *V = Builder.CreateBitCast(
25486 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25487 ++NumVectorInstructions;
25488 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25489 }
25490 ++NumVectorInstructions;
25491 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25492 }
25493
25494 /// Emits optimized code for unique scalar value reused \p Cnt times.
25495 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25496 unsigned Cnt) {
25497 assert(IsSupportedHorRdxIdentityOp &&
25498 "The optimization of matched scalar identity horizontal reductions "
25499 "must be supported.");
25500 if (Cnt == 1)
25501 return VectorizedValue;
25502 switch (RdxKind) {
25503 case RecurKind::Add: {
25504 // res = mul vv, n
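// e.g., a scalar that occurs Cnt = 4 times in the reduction contributes
// v * 4, folding all of its repeats into a single multiply.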
25505 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25506 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
25507 << VectorizedValue << ". (HorRdx)\n");
25508 return Builder.CreateMul(VectorizedValue, Scale);
25509 }
25510 case RecurKind::Xor: {
25511 // res = n % 2 ? vv : 0
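// x ^ x == 0, so an even number of repeats cancels out and only an odd
// count leaves a single copy of the value.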
25512 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
25513 << ". (HorRdx)\n");
25514 if (Cnt % 2 == 0)
25515 return Constant::getNullValue(VectorizedValue->getType());
25516 return VectorizedValue;
25517 }
25518 case RecurKind::FAdd: {
25519 // res = fmul v, n
25520 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25521 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
25522 << VectorizedValue << ". (HorRdx)\n");
25523 return Builder.CreateFMul(VectorizedValue, Scale);
25524 }
25525 case RecurKind::And:
25526 case RecurKind::Or:
25527 case RecurKind::SMax:
25528 case RecurKind::SMin:
25529 case RecurKind::UMax:
25530 case RecurKind::UMin:
25531 case RecurKind::FMax:
25532 case RecurKind::FMin:
25533 case RecurKind::FMaximum:
25534 case RecurKind::FMinimum:
25535 // res = vv
25536 return VectorizedValue;
25537 case RecurKind::Sub:
25538 case RecurKind::AddChainWithSubs:
25539 case RecurKind::Mul:
25540 case RecurKind::FMul:
25541 case RecurKind::FMulAdd:
25542 case RecurKind::AnyOf:
25543 case RecurKind::FindFirstIVSMin:
25544 case RecurKind::FindFirstIVUMin:
25545 case RecurKind::FindLastIVSMax:
25546 case RecurKind::FindLastIVUMax:
25547 case RecurKind::FMaxNum:
25548 case RecurKind::FMinNum:
25549 case RecurKind::FMaximumNum:
25550 case RecurKind::FMinimumNum:
25551 case RecurKind::None:
25552 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25553 }
25554 return nullptr;
25555 }
25556
25557 /// Emits actual operation for the scalar identity values, found during
25558 /// horizontal reduction analysis.
25559 Value *
25560 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25561 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25562 const DenseMap<Value *, Value *> &TrackedToOrig) {
25563 assert(IsSupportedHorRdxIdentityOp &&
25564 "The optimization of matched scalar identity horizontal reductions "
25565 "must be supported.");
25566 ArrayRef<Value *> VL = R.getRootNodeScalars();
25567 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25568 if (VTy->getElementType() != VL.front()->getType()) {
25569 VectorizedValue = Builder.CreateIntCast(
25570 VectorizedValue,
25571 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25572 R.isSignedMinBitwidthRootNode());
25573 }
25574 switch (RdxKind) {
25575 case RecurKind::Add: {
25576 // root = mul prev_root, <1, 1, n, 1>
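// e.g., if the scalar in lane 2 was matched n times, scaling that lane by n
// folds all of its repeats into this single vector multiply.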
25577 SmallVector<Constant *> Vals;
25578 for (Value *V : VL) {
25579 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25580 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25581 }
25582 auto *Scale = ConstantVector::get(Vals);
25583 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
25584 << VectorizedValue << ". (HorRdx)\n");
25585 return Builder.CreateMul(VectorizedValue, Scale);
25586 }
25587 case RecurKind::And:
25588 case RecurKind::Or:
25589 // No need for multiple or/and(s).
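// and/or are idempotent (x & x == x, x | x == x), so repeated scalars need
// no extra operations.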
25590 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25591 << ". (HorRdx)\n");
25592 return VectorizedValue;
25593 case RecurKind::SMax:
25594 case RecurKind::SMin:
25595 case RecurKind::UMax:
25596 case RecurKind::UMin:
25597 case RecurKind::FMax:
25598 case RecurKind::FMin:
25599 case RecurKind::FMaximum:
25600 case RecurKind::FMinimum:
25601 // No need for multiple min/max(s) of the same value.
25602 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25603 << ". (HorRdx)\n");
25604 return VectorizedValue;
25605 case RecurKind::Xor: {
25606 // Replace values with even number of repeats with 0, since
25607 // x xor x = 0.
25608 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25609 // 7>, if the 4th and 6th elements have an even number of repeats.
25610 SmallVector<int> Mask(
25611 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25612 PoisonMaskElem);
25613 std::iota(Mask.begin(), Mask.end(), 0);
25614 bool NeedShuffle = false;
25615 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25616 Value *V = VL[I];
25617 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25618 if (Cnt % 2 == 0) {
25619 Mask[I] = VF;
25620 NeedShuffle = true;
25621 }
25622 }
25623 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25624 : Mask) dbgs()
25625 << I << " ";
25626 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25627 if (NeedShuffle)
25628 VectorizedValue = Builder.CreateShuffleVector(
25629 VectorizedValue,
25630 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25631 return VectorizedValue;
25632 }
25633 case RecurKind::FAdd: {
25634 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25635 SmallVector<Constant *> Vals;
25636 for (Value *V : VL) {
25637 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25638 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25639 }
25640 auto *Scale = ConstantVector::get(Vals);
25641 return Builder.CreateFMul(VectorizedValue, Scale);
25642 }
25643 case RecurKind::Sub:
25644 case RecurKind::AddChainWithSubs:
25645 case RecurKind::Mul:
25646 case RecurKind::FMul:
25647 case RecurKind::FMulAdd:
25648 case RecurKind::AnyOf:
25649 case RecurKind::FindFirstIVSMin:
25650 case RecurKind::FindFirstIVUMin:
25651 case RecurKind::FindLastIVSMax:
25652 case RecurKind::FindLastIVUMax:
25653 case RecurKind::FMaxNum:
25654 case RecurKind::FMinNum:
25655 case RecurKind::FMaximumNum:
25656 case RecurKind::FMinimumNum:
25657 case RecurKind::None:
25658 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25659 }
25660 return nullptr;
25661 }
25662};
25663} // end anonymous namespace
25664
25665/// Gets recurrence kind from the specified value.
25666 static RecurKind getRdxKind(Value *V) {
25667 return HorizontalReduction::getRdxKind(V);
25668}
25669static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25670 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25671 return cast<FixedVectorType>(IE->getType())->getNumElements();
25672
25673 unsigned AggregateSize = 1;
25674 auto *IV = cast<InsertValueInst>(InsertInst);
25675 Type *CurrentType = IV->getType();
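// e.g., for [2 x {float, float}] the loop below computes 2 (array) * 2
// (struct) = 4.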
25676 do {
25677 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25678 for (auto *Elt : ST->elements())
25679 if (Elt != ST->getElementType(0)) // check homogeneity
25680 return std::nullopt;
25681 AggregateSize *= ST->getNumElements();
25682 CurrentType = ST->getElementType(0);
25683 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25684 AggregateSize *= AT->getNumElements();
25685 CurrentType = AT->getElementType();
25686 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25687 AggregateSize *= VT->getNumElements();
25688 return AggregateSize;
25689 } else if (CurrentType->isSingleValueType()) {
25690 return AggregateSize;
25691 } else {
25692 return std::nullopt;
25693 }
25694 } while (true);
25695}
25696
25697static void findBuildAggregateRec(Instruction *LastInsertInst,
25698 TargetTransformInfo *TTI,
25699 SmallVectorImpl<Value *> &BuildVectorOpds,
25700 SmallVectorImpl<Value *> &InsertElts,
25701 unsigned OperandOffset, const BoUpSLP &R) {
25702 do {
25703 Value *InsertedOperand = LastInsertInst->getOperand(1);
25704 std::optional<unsigned> OperandIndex =
25705 getElementIndex(LastInsertInst, OperandOffset);
25706 if (!OperandIndex || R.isDeleted(LastInsertInst))
25707 return;
25708 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25709 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25710 BuildVectorOpds, InsertElts, *OperandIndex, R);
25711
25712 } else {
25713 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25714 InsertElts[*OperandIndex] = LastInsertInst;
25715 }
25716 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25717 } while (LastInsertInst != nullptr &&
25718 isa<InsertElementInst, InsertValueInst>(LastInsertInst) &&
25719 LastInsertInst->hasOneUse());
25720}
25721
25722/// Recognize construction of vectors like
25723/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25724/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25725/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25726/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25727/// starting from the last insertelement or insertvalue instruction.
25728///
25729/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25730/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25731/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25732///
25733/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25734///
25735/// \return true if it matches.
25736static bool findBuildAggregate(Instruction *LastInsertInst,
25737 TargetTransformInfo *TTI,
25738 SmallVectorImpl<Value *> &BuildVectorOpds,
25739 SmallVectorImpl<Value *> &InsertElts,
25740 const BoUpSLP &R) {
25741
25742 assert((isa<InsertElementInst>(LastInsertInst) ||
25743 isa<InsertValueInst>(LastInsertInst)) &&
25744 "Expected insertelement or insertvalue instruction!");
25745
25746 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25747 "Expected empty result vectors!");
25748
25749 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25750 if (!AggregateSize)
25751 return false;
25752 BuildVectorOpds.resize(*AggregateSize);
25753 InsertElts.resize(*AggregateSize);
25754
25755 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25756 llvm::erase(BuildVectorOpds, nullptr);
25757 llvm::erase(InsertElts, nullptr);
25758 if (BuildVectorOpds.size() >= 2)
25759 return true;
25760
25761 return false;
25762}
25763
25764/// Try and get a reduction instruction from a phi node.
25765///
25766/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25767/// if they come from either \p ParentBB or a containing loop latch.
25768///
25769/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25770/// if not possible.
25771 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25772 BasicBlock *ParentBB, LoopInfo *LI) {
25773 // There are situations where the reduction value is not dominated by the
25774 // reduction phi. Vectorizing such cases has been reported to cause
25775 // miscompiles. See PR25787.
25776 auto DominatedReduxValue = [&](Value *R) {
25777 return isa<Instruction>(R) &&
25778 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25779 };
25780
25781 Instruction *Rdx = nullptr;
25782
25783 // Return the incoming value if it comes from the same BB as the phi node.
25784 if (P->getIncomingBlock(0) == ParentBB) {
25785 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25786 } else if (P->getIncomingBlock(1) == ParentBB) {
25787 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25788 }
25789
25790 if (Rdx && DominatedReduxValue(Rdx))
25791 return Rdx;
25792
25793 // Otherwise, check whether we have a loop latch to look at.
25794 Loop *BBL = LI->getLoopFor(ParentBB);
25795 if (!BBL)
25796 return nullptr;
25797 BasicBlock *BBLatch = BBL->getLoopLatch();
25798 if (!BBLatch)
25799 return nullptr;
25800
25801 // There is a loop latch, return the incoming value if it comes from
25802 // that. This reduction pattern occasionally turns up.
25803 if (P->getIncomingBlock(0) == BBLatch) {
25804 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25805 } else if (P->getIncomingBlock(1) == BBLatch) {
25806 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25807 }
25808
25809 if (Rdx && DominatedReduxValue(Rdx))
25810 return Rdx;
25811
25812 return nullptr;
25813}
25814
25815static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25816 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25817 return true;
25818 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25819 return true;
25820 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25821 return true;
25822 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25823 return true;
25824 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25825 return true;
25826 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
25827 return true;
25828 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
25829 return true;
25830 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
25831 return true;
25832 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
25833 return true;
25834 return false;
25835}
25836
25837/// We could have an initial reduction that is not an add.
25838/// r *= v1 + v2 + v3 + v4
25839/// In such a case start looking for a tree rooted in the first '+'.
25840 /// \returns the new root if found, which may be nullptr if not an instruction.
25841 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25842 Instruction *Root) {
25843 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25844 isa<IntrinsicInst>(Root)) &&
25845 "Expected binop, select, or intrinsic for reduction matching");
25846 Value *LHS =
25847 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25848 Value *RHS =
25849 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25850 if (LHS == Phi)
25851 return dyn_cast<Instruction>(RHS);
25852 if (RHS == Phi)
25853 return dyn_cast<Instruction>(LHS);
25854 return nullptr;
25855}
25856
25857 /// \returns the first operand of \p I that does not match \p Phi. If the
25858 /// operand is not an instruction, it returns nullptr.
25859 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25860 Value *Op0 = nullptr;
25861 Value *Op1 = nullptr;
25862 if (!matchRdxBop(I, Op0, Op1))
25863 return nullptr;
25864 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25865}
25866
25867 /// \returns true if \p I is a candidate instruction for reduction vectorization.
25868 static bool isReductionCandidate(Instruction *I) {
25869 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25870 Value *B0 = nullptr, *B1 = nullptr;
25871 bool IsBinop = matchRdxBop(I, B0, B1);
25872 return IsBinop || IsSelect;
25873}
25874
25875bool SLPVectorizerPass::vectorizeHorReduction(
25876 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25877 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25878 if (!ShouldVectorizeHor)
25879 return false;
25880 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25881
25882 if (Root->getParent() != BB || isa<PHINode>(Root))
25883 return false;
25884
25885 // If we can find a secondary reduction root, use that instead.
25886 auto SelectRoot = [&]() {
25887 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25888 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25889 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25890 return NewRoot;
25891 return Root;
25892 };
25893
25894 // Start analysis starting from Root instruction. If horizontal reduction is
25895 // found, try to vectorize it. If it is not a horizontal reduction or
25896 // vectorization is not possible or not effective, and currently analyzed
25897 // instruction is a binary operation, try to vectorize the operands, using
25898 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25899 // the same procedure considering each operand as a possible root of the
25900 // horizontal reduction.
25901 // Interrupt the process if the Root instruction itself was vectorized or all
25902 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
25903 // If a horizontal reduction was not matched or vectorized, we collect
25904 // instructions for possible later attempts for vectorization.
25905 std::queue<std::pair<Instruction *, unsigned>> Stack;
25906 Stack.emplace(SelectRoot(), 0);
25907 SmallPtrSet<Value *, 8> VisitedInstrs;
25908 bool Res = false;
25909 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25910 if (R.isAnalyzedReductionRoot(Inst))
25911 return nullptr;
25912 if (!isReductionCandidate(Inst))
25913 return nullptr;
25914 HorizontalReduction HorRdx;
25915 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25916 return nullptr;
25917 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25918 };
25919 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25920 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25921 FutureSeed = getNonPhiOperand(Root, P);
25922 if (!FutureSeed)
25923 return false;
25924 }
25925 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25926 // analysis is done separately.
25927 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25928 PostponedInsts.push_back(FutureSeed);
25929 return true;
25930 };
25931
25932 while (!Stack.empty()) {
25933 Instruction *Inst;
25934 unsigned Level;
25935 std::tie(Inst, Level) = Stack.front();
25936 Stack.pop();
25937 // Do not try to analyze instruction that has already been vectorized.
25938 // This may happen when we vectorize instruction operands on a previous
25939 // iteration while stack was populated before that happened.
25940 if (R.isDeleted(Inst))
25941 continue;
25942 if (Value *VectorizedV = TryToReduce(Inst)) {
25943 Res = true;
25944 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25945 // Try to find another reduction.
25946 Stack.emplace(I, Level);
25947 continue;
25948 }
25949 if (R.isDeleted(Inst))
25950 continue;
25951 } else {
25952 // We could not vectorize `Inst` so try to use it as a future seed.
25953 if (!TryAppendToPostponedInsts(Inst)) {
25954 assert(Stack.empty() && "Expected empty stack");
25955 break;
25956 }
25957 }
25958
25959 // Try to vectorize operands.
25960 // Continue analysis for the instruction from the same basic block only to
25961 // save compile time.
25962 if (++Level < RecursionMaxDepth)
25963 for (auto *Op : Inst->operand_values())
25964 if (VisitedInstrs.insert(Op).second)
25965 if (auto *I = dyn_cast<Instruction>(Op))
25966 // Do not try to vectorize CmpInst operands, this is done
25967 // separately.
25968 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
25969 !R.isDeleted(I) && I->getParent() == BB)
25970 Stack.emplace(I, Level);
25971 }
25972 return Res;
25973}
25974
25975bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25976 if (!I)
25977 return false;
25978
25979 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25980 return false;
25981 // Skip potential FMA candidates.
25982 if ((I->getOpcode() == Instruction::FAdd ||
25983 I->getOpcode() == Instruction::FSub) &&
25984 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25985 .isValid())
25986 return false;
25987
25988 Value *P = I->getParent();
25989
25990 // Vectorize in current basic block only.
25991 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25992 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25993 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25994 R.isDeleted(Op0) || R.isDeleted(Op1))
25995 return false;
25996
25997 // First collect all possible candidates
25998 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25999 Candidates.emplace_back(Op0, Op1);
26000
26001 auto *A = dyn_cast<BinaryOperator>(Op0);
26002 auto *B = dyn_cast<BinaryOperator>(Op1);
26003 // Try to skip B.
26004 if (A && B && B->hasOneUse()) {
26005 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
26006 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
26007 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
26008 Candidates.emplace_back(A, B0);
26009 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
26010 Candidates.emplace_back(A, B1);
26011 }
26012 // Try to skip A.
26013 if (B && A && A->hasOneUse()) {
26014 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
26015 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
26016 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
26017 Candidates.emplace_back(A0, B);
26018 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
26019 Candidates.emplace_back(A1, B);
26020 }
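// e.g., for I = (A0 op A1) op (B0 op B1), the seed pairs tried below are
// (Op0, Op1), (Op0, B0), (Op0, B1), (A0, Op1) and (A1, Op1), subject to the
// one-use and same-block checks above; findBestRootPair then picks the most
// promising one.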
26021
26022 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
26023 ArrayRef<Value *> Ops) {
26024 if (!isReductionCandidate(Inst))
26025 return false;
26026 Type *Ty = Inst->getType();
26027 if (!isValidElementType(Ty) || Ty->isPointerTy())
26028 return false;
26029 HorizontalReduction HorRdx(Inst, Ops);
26030 if (!HorRdx.matchReductionForOperands())
26031 return false;
26032 // Check the cost of operations.
26033 VectorType *VecTy = getWidenedType(Ty, Ops.size());
26034 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
26035 InstructionCost ScalarCost =
26036 TTI.getScalarizationOverhead(
26037 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
26038 /*Extract=*/true, CostKind) +
26039 TTI.getInstructionCost(Inst, CostKind);
26040 InstructionCost RedCost;
26041 switch (::getRdxKind(Inst)) {
26042 case RecurKind::Add:
26043 case RecurKind::Mul:
26044 case RecurKind::Or:
26045 case RecurKind::And:
26046 case RecurKind::Xor:
26047 case RecurKind::FAdd:
26048 case RecurKind::FMul: {
26049 FastMathFlags FMF;
26050 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
26051 FMF = FPCI->getFastMathFlags();
26052 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
26053 CostKind);
26054 break;
26055 }
26056 default:
26057 return false;
26058 }
26059 if (RedCost >= ScalarCost)
26060 return false;
26061
26062 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
26063 };
26064 if (Candidates.size() == 1)
26065 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
26066
26067 // We have multiple options. Try to pick the single best.
26068 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
26069 if (!BestCandidate)
26070 return false;
26071 return (*BestCandidate == 0 &&
26072 TryToReduce(I, {Candidates[*BestCandidate].first,
26073 Candidates[*BestCandidate].second})) ||
26074 tryToVectorizeList({Candidates[*BestCandidate].first,
26075 Candidates[*BestCandidate].second},
26076 R);
26077}
26078
26079bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
26080 BasicBlock *BB, BoUpSLP &R) {
26081 SmallVector<WeakTrackingVH> PostponedInsts;
26082 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
26083 Res |= tryToVectorize(PostponedInsts, R);
26084 return Res;
26085}
26086
26087bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
26088 BoUpSLP &R) {
26089 bool Res = false;
26090 for (Value *V : Insts)
26091 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
26092 Res |= tryToVectorize(Inst, R);
26093 return Res;
26094}
26095
26096bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
26097 BasicBlock *BB, BoUpSLP &R,
26098 bool MaxVFOnly) {
26099 if (!R.canMapToVector(IVI->getType()))
26100 return false;
26101
26102 SmallVector<Value *, 16> BuildVectorOpds;
26103 SmallVector<Value *, 16> BuildVectorInsts;
26104 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
26105 return false;
26106
26107 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
26108 R.getORE()->emit([&]() {
26109 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
26110 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
26111 "trying reduction first.";
26112 });
26113 return false;
26114 }
26115 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
26116 // Aggregate value is unlikely to be processed in vector register.
26117 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
26118}
26119
26120bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
26121 BasicBlock *BB, BoUpSLP &R,
26122 bool MaxVFOnly) {
26123 SmallVector<Value *, 16> BuildVectorInsts;
26124 SmallVector<Value *, 16> BuildVectorOpds;
26125 SmallVector<int> Mask;
26126 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
26127 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
26128 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
26129 return false;
26130
26131 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
26132 R.getORE()->emit([&]() {
26133 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
26134 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
26135 "trying reduction first.";
26136 });
26137 return false;
26138 }
26139 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
26140 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
26141}
26142
26143template <typename T>
26144 static bool tryToVectorizeSequence(
26145 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
26146 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
26147 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
26148 bool MaxVFOnly, BoUpSLP &R) {
26149 bool Changed = false;
26150 // Sort by type, parent, operands.
26151 stable_sort(Incoming, Comparator);
26152
26153 // Try to vectorize elements based on their type.
26154 SmallVector<T *> Candidates;
26155 SmallVector<T *> VL;
26156 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
26157 VL.clear()) {
26158 // Look for the next elements with the same type, parent and operand
26159 // kinds.
26160 auto *I = dyn_cast<Instruction>(*IncIt);
26161 if (!I || R.isDeleted(I)) {
26162 ++IncIt;
26163 continue;
26164 }
26165 auto *SameTypeIt = IncIt;
26166 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
26167 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26168 AreCompatible(VL, *SameTypeIt))) {
26169 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26170 ++SameTypeIt;
26171 if (I && !R.isDeleted(I))
26172 VL.push_back(cast<T>(I));
26173 }
26174
26175 // Try to vectorize them.
26176 unsigned NumElts = VL.size();
26177 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
26178 << NumElts << ")\n");
26179 // The vectorization is a 3-step attempt:
26180 // 1. Try to vectorize instructions with the same/alternate opcodes with the
26181 // size of the maximal register at first.
26182 // 2. Try to vectorize the remaining instructions with the same type, if
26183 // possible. This may give better results than vectorizing only instructions
26184 // with the same/alternate opcodes.
26185 // 3. Final attempt to try to vectorize all instructions with the
26186 // same/alternate ops only, this may result in some extra final
26187 // vectorization.
26188 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
26189 // Success; start over because instructions might have been changed.
26190 Changed = true;
26191 VL.swap(Candidates);
26192 Candidates.clear();
26193 for (T *V : VL) {
26194 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26195 Candidates.push_back(V);
26196 }
26197 } else {
26198 /// \Returns the minimum number of elements that we will attempt to
26199 /// vectorize.
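/// e.g., with a 128-bit maximal vector register and 32-bit elements this is
/// max(2, 128 / 32) = 4.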
26200 auto GetMinNumElements = [&R](Value *V) {
26201 unsigned EltSize = R.getVectorElementSize(V);
26202 return std::max(2U, R.getMaxVecRegSize() / EltSize);
26203 };
26204 if (NumElts < GetMinNumElements(*IncIt) &&
26205 (Candidates.empty() ||
26206 Candidates.front()->getType() == (*IncIt)->getType())) {
26207 for (T *V : VL) {
26208 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
26209 Candidates.push_back(V);
26210 }
26211 }
26212 }
26213 // Final attempt to vectorize instructions with the same types.
26214 if (Candidates.size() > 1 &&
26215 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
26216 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
26217 // Success; start over because instructions might have been changed.
26218 Changed = true;
26219 } else if (MaxVFOnly) {
26220 // Try to vectorize using small vectors.
26221 SmallVector<T *> VL;
26222 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
26223 VL.clear()) {
26224 auto *I = dyn_cast<Instruction>(*It);
26225 if (!I || R.isDeleted(I)) {
26226 ++It;
26227 continue;
26228 }
26229 auto *SameTypeIt = It;
26230 while (SameTypeIt != End &&
26231 (!isa<Instruction>(*SameTypeIt) ||
26232 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26233 AreCompatible(*SameTypeIt, *It))) {
26234 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26235 ++SameTypeIt;
26236 if (I && !R.isDeleted(I))
26237 VL.push_back(cast<T>(I));
26238 }
26239 unsigned NumElts = VL.size();
26240 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26241 /*MaxVFOnly=*/false))
26242 Changed = true;
26243 It = SameTypeIt;
26244 }
26245 }
26246 Candidates.clear();
26247 }
26248
26249 // Start over at the next instruction of a different type (or the end).
26250 IncIt = SameTypeIt;
26251 }
26252 return Changed;
26253}
26254
26255/// Compare two cmp instructions. If IsCompatibility is true, function returns
26256 /// true if 2 cmps have same/swapped predicates and compatible corresponding
26257/// operands. If IsCompatibility is false, function implements strict weak
26258/// ordering relation between two cmp instructions, returning true if the first
26259/// instruction is "less" than the second, i.e. its predicate is less than the
26260/// predicate of the second or the operands IDs are less than the operands IDs
26261/// of the second cmp instruction.
26262template <bool IsCompatibility>
26263static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
26264 const DominatorTree &DT) {
26265 assert(isValidElementType(V->getType()) &&
26266 isValidElementType(V2->getType()) &&
26267 "Expected valid element types only.");
26268 if (V == V2)
26269 return IsCompatibility;
26270 auto *CI1 = cast<CmpInst>(V);
26271 auto *CI2 = cast<CmpInst>(V2);
26272 if (CI1->getOperand(0)->getType()->getTypeID() <
26273 CI2->getOperand(0)->getType()->getTypeID())
26274 return !IsCompatibility;
26275 if (CI1->getOperand(0)->getType()->getTypeID() >
26276 CI2->getOperand(0)->getType()->getTypeID())
26277 return false;
26278 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26279 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26280 return !IsCompatibility;
26281 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26282 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26283 return false;
26284 CmpInst::Predicate Pred1 = CI1->getPredicate();
26285 CmpInst::Predicate Pred2 = CI2->getPredicate();
26286 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
26287 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
26288 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
26289 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
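// Taking the min of each predicate and its swapped form canonicalizes the
// pair, e.g. 'icmp sgt %a, %b' and 'icmp slt %b, %a' map to the same base
// predicate.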
26290 if (BasePred1 < BasePred2)
26291 return !IsCompatibility;
26292 if (BasePred1 > BasePred2)
26293 return false;
26294 // Compare operands.
26295 bool CI1Preds = Pred1 == BasePred1;
26296 bool CI2Preds = Pred2 == BasePred1;
26297 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26298 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26299 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
26300 if (Op1 == Op2)
26301 continue;
26302 if (Op1->getValueID() < Op2->getValueID())
26303 return !IsCompatibility;
26304 if (Op1->getValueID() > Op2->getValueID())
26305 return false;
26306 if (auto *I1 = dyn_cast<Instruction>(Op1))
26307 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
26308 if (IsCompatibility) {
26309 if (I1->getParent() != I2->getParent())
26310 return false;
26311 } else {
26312 // Try to compare nodes with same parent.
26313 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
26314 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
26315 if (!NodeI1)
26316 return NodeI2 != nullptr;
26317 if (!NodeI2)
26318 return false;
26319 assert((NodeI1 == NodeI2) ==
26320 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26321 "Different nodes should have different DFS numbers");
26322 if (NodeI1 != NodeI2)
26323 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26324 }
26325 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26326 if (S && (IsCompatibility || !S.isAltShuffle()))
26327 continue;
26328 if (IsCompatibility)
26329 return false;
26330 if (I1->getOpcode() != I2->getOpcode())
26331 return I1->getOpcode() < I2->getOpcode();
26332 }
26333 }
26334 return IsCompatibility;
26335}
26336
26337template <typename ItT>
26338bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26339 BasicBlock *BB, BoUpSLP &R) {
26340 bool Changed = false;
26341 // Try to find reductions first.
26342 for (CmpInst *I : CmpInsts) {
26343 if (R.isDeleted(I))
26344 continue;
26345 for (Value *Op : I->operands())
26346 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26347 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26348 if (R.isDeleted(I))
26349 break;
26350 }
26351 }
26352 // Try to vectorize operands as vector bundles.
26353 for (CmpInst *I : CmpInsts) {
26354 if (R.isDeleted(I))
26355 continue;
26356 Changed |= tryToVectorize(I, R);
26357 }
26358 // Try to vectorize list of compares.
26359 // Sort by type, compare predicate, etc.
26360 auto CompareSorter = [&](Value *V, Value *V2) {
26361 if (V == V2)
26362 return false;
26363 return compareCmp<false>(V, V2, *TLI, *DT);
26364 };
26365
26366 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
26367 if (VL.empty() || VL.back() == V1)
26368 return true;
26369 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
26370 };
26371
26372 SmallVector<Value *> Vals;
26373 for (Instruction *V : CmpInsts)
26374 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
26375 Vals.push_back(V);
26376 if (Vals.size() <= 1)
26377 return Changed;
26378 Changed |= tryToVectorizeSequence<Value>(
26379 Vals, CompareSorter, AreCompatibleCompares,
26380 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26381 // Exclude possible reductions from other blocks.
26382 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26383 return any_of(V->users(), [V](User *U) {
26384 auto *Select = dyn_cast<SelectInst>(U);
26385 return Select &&
26386 Select->getParent() != cast<Instruction>(V)->getParent();
26387 });
26388 });
26389 if (ArePossiblyReducedInOtherBlock)
26390 return false;
26391 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26392 },
26393 /*MaxVFOnly=*/true, R);
26394 return Changed;
26395}
26396
26397bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
26398 BasicBlock *BB, BoUpSLP &R) {
26400 "This function only accepts Insert instructions");
26401 bool OpsChanged = false;
26402 SmallVector<WeakTrackingVH> PostponedInsts;
26403 for (auto *I : reverse(Instructions)) {
26404 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
26405 if (R.isDeleted(I) || isa<CmpInst>(I))
26406 continue;
26407 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26408 OpsChanged |=
26409 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
26410 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26411 OpsChanged |=
26412 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
26413 }
26414 // pass2 - try to vectorize reductions only
26415 if (R.isDeleted(I))
26416 continue;
26417 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26418 if (R.isDeleted(I) || isa<CmpInst>(I))
26419 continue;
26420 // pass3 - try to match and vectorize a buildvector sequence.
26421 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26422 OpsChanged |=
26423 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
26424 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26425 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26426 /*MaxVFOnly=*/false);
26427 }
26428 }
26429 // Now try to vectorize postponed instructions.
26430 OpsChanged |= tryToVectorize(PostponedInsts, R);
26431
26432 Instructions.clear();
26433 return OpsChanged;
26434}
26435
26436bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26437 bool Changed = false;
26438 SmallVector<Value *, 4> Incoming;
26439 SmallPtrSet<Value *, 16> VisitedInstrs;
26440 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
26441 // node. This helps to better identify the chains that can be vectorized in
26442 // the best way.
26443 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26444 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
26445 assert(isValidElementType(V1->getType()) &&
26446 isValidElementType(V2->getType()) &&
26447 "Expected vectorizable types only.");
26448 if (V1 == V2)
26449 return false;
26450 // It is fine to compare type IDs here, since we expect only vectorizable
26451 // types, like ints, floats and pointers; we don't care about other types.
26452 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
26453 return true;
26454 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
26455 return false;
26456 if (V1->getType()->getScalarSizeInBits() <
26457 V2->getType()->getScalarSizeInBits())
26458 return true;
26459 if (V1->getType()->getScalarSizeInBits() >
26460 V2->getType()->getScalarSizeInBits())
26461 return false;
26462 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26463 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26464 if (Opcodes1.size() < Opcodes2.size())
26465 return true;
26466 if (Opcodes1.size() > Opcodes2.size())
26467 return false;
26468 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26469 {
26470 // Instructions come first.
26471 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26472 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26473 if (I1 && I2) {
26474 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26475 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26476 if (!NodeI1)
26477 return NodeI2 != nullptr;
26478 if (!NodeI2)
26479 return false;
26480 assert((NodeI1 == NodeI2) ==
26481 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26482 "Different nodes should have different DFS numbers");
26483 if (NodeI1 != NodeI2)
26484 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26485 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26486 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26487 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26488 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26489 if (!E1 || !E2)
26490 continue;
26491
26492 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26493 // program order of the vector operands.
26494 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26495 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26496 if (V1 != V2) {
26497 if (V1 && !V2)
26498 return true;
26499 if (!V1 && V2)
26500 return false;
26501 const DomTreeNodeBase<BasicBlock> *NodeI1 =
26502 DT->getNode(V1->getParent());
26503 const DomTreeNodeBase<BasicBlock> *NodeI2 =
26504 DT->getNode(V2->getParent());
26505 if (!NodeI1)
26506 return NodeI2 != nullptr;
26507 if (!NodeI2)
26508 return false;
26509 assert((NodeI1 == NodeI2) ==
26510 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26511 "Different nodes should have different DFS numbers");
26512 if (NodeI1 != NodeI2)
26513 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26514 return V1->comesBefore(V2);
26515 }
26516 // If we have the same vector operand, try to sort by constant
26517 // index.
26518 std::optional<unsigned> Id1 = getExtractIndex(E1);
26519 std::optional<unsigned> Id2 = getExtractIndex(E2);
26520 // Bring constants to the top
26521 if (Id1 && !Id2)
26522 return true;
26523 if (!Id1 && Id2)
26524 return false;
26525 // First elements come first.
26526 if (Id1 && Id2)
26527 return *Id1 < *Id2;
26528
26529 continue;
26530 }
26531 if (I1->getOpcode() == I2->getOpcode())
26532 continue;
26533 return I1->getOpcode() < I2->getOpcode();
26534 }
26535 if (I1)
26536 return true;
26537 if (I2)
26538 return false;
26539 }
26540 {
26541 // Non-undef constants come next.
26542 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26543 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26544 if (C1 && C2)
26545 continue;
26546 if (C1)
26547 return true;
26548 if (C2)
26549 return false;
26550 }
26551 bool U1 = isa<UndefValue>(Opcodes1[I]);
26552 bool U2 = isa<UndefValue>(Opcodes2[I]);
26553 {
26554 // Non-constant non-instructions come next.
26555 if (!U1 && !U2) {
26556 auto ValID1 = Opcodes1[I]->getValueID();
26557 auto ValID2 = Opcodes2[I]->getValueID();
26558 if (ValID1 == ValID2)
26559 continue;
26560 if (ValID1 < ValID2)
26561 return true;
26562 if (ValID1 > ValID2)
26563 return false;
26564 }
26565 if (!U1)
26566 return true;
26567 if (!U2)
26568 return false;
26569 }
26570 // Undefs come last.
26571 assert(U1 && U2 && "The only thing left should be undef & undef.");
26572 }
26573 return false;
26574 };
26575 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26576 Value *V1) {
26577 if (VL.empty() || V1 == VL.back())
26578 return true;
26579 Value *V2 = VL.back();
26580 if (V1->getType() != V2->getType())
26581 return false;
26582 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26583 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26584 if (Opcodes1.size() != Opcodes2.size())
26585 return false;
26586 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26587 // Undefs are compatible with any other value.
26588 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26589 continue;
26590 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26591 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26592 if (R.isDeleted(I1) || R.isDeleted(I2))
26593 return false;
26594 if (I1->getParent() != I2->getParent())
26595 return false;
26596 if (getSameOpcode({I1, I2}, *TLI))
26597 continue;
26598 return false;
26599 }
26600 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26601 continue;
26602 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26603 return false;
26604 }
26605 return true;
26606 };
26607
26608 bool HaveVectorizedPhiNodes = false;
26609 do {
26610 // Collect the incoming values from the PHIs.
26611 Incoming.clear();
26612 for (Instruction &I : *BB) {
26613 auto *P = dyn_cast<PHINode>(&I);
26614 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26615 break;
26616
26617 // No need to analyze deleted, vectorized and non-vectorizable
26618 // instructions.
26619 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26620 isValidElementType(P->getType()))
26621 Incoming.push_back(P);
26622 }
26623
26624 if (Incoming.size() <= 1)
26625 break;
26626
26627 // Find the corresponding non-phi nodes for better matching when trying to
26628 // build the tree.
26629 for (Value *V : Incoming) {
26630 SmallVectorImpl<Value *> &Opcodes =
26631 PHIToOpcodes.try_emplace(V).first->getSecond();
26632 if (!Opcodes.empty())
26633 continue;
26634 SmallVector<Value *, 4> Nodes(1, V);
26635 SmallPtrSet<Value *, 4> Visited;
26636 while (!Nodes.empty()) {
26637 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26638 if (!Visited.insert(PHI).second)
26639 continue;
26640 for (Value *V : PHI->incoming_values()) {
26641 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26642 Nodes.push_back(PHI1);
26643 continue;
26644 }
26645 Opcodes.emplace_back(V);
26646 }
26647 }
26648 }
26649
26650 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26651 Incoming, PHICompare, AreCompatiblePHIs,
26652 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26653 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26654 },
26655 /*MaxVFOnly=*/true, R);
26656 Changed |= HaveVectorizedPhiNodes;
26657 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26658 auto *PHI = dyn_cast<PHINode>(P.first);
26659 return !PHI || R.isDeleted(PHI);
26660 }))
26661 PHIToOpcodes.clear();
26662 VisitedInstrs.insert_range(Incoming);
26663 } while (HaveVectorizedPhiNodes);
26664
26665 VisitedInstrs.clear();
26666
26667 InstSetVector PostProcessInserts;
26668 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26669 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26670 // also vectorizes `PostProcessCmps`.
26671 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26672 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26673 if (VectorizeCmps) {
26674 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26675 PostProcessCmps.clear();
26676 }
26677 PostProcessInserts.clear();
26678 return Changed;
26679 };
26680 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26681 auto IsInPostProcessInstrs = [&](Instruction *I) {
26682 if (auto *Cmp = dyn_cast<CmpInst>(I))
26683 return PostProcessCmps.contains(Cmp);
26684 return isa<InsertElementInst, InsertValueInst>(I) &&
26685 PostProcessInserts.contains(I);
26686 };
26687 // Returns true if `I` is an instruction without users, such as a terminator,
26688 // a store, or a function call with an ignored return value. Only void-typed
26689 // instructions qualify, except for CallInst and InvokeInst.
26690 auto HasNoUsers = [](Instruction *I) {
26691 return I->use_empty() &&
26692 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26693 };
26694 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26695 // Skip instructions with scalable vector types. The number of elements is
26696 // unknown at compile time for scalable types.
26697 if (isa<ScalableVectorType>(It->getType()))
26698 continue;
26699
26700 // Skip instructions marked for deletion.
26701 if (R.isDeleted(&*It))
26702 continue;
26703 // We may go through BB multiple times, so skip instructions we have already checked.
26704 if (!VisitedInstrs.insert(&*It).second) {
26705 if (HasNoUsers(&*It) &&
26706 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26707 // We would like to start over since some instructions are deleted
26708 // and the iterator may become invalid.
26709 Changed = true;
26710 It = BB->begin();
26711 E = BB->end();
26712 }
26713 continue;
26714 }
26715
26716 // Try to vectorize reductions that use PHINodes.
26717 if (PHINode *P = dyn_cast<PHINode>(It)) {
26718 // Check that the PHI is a reduction PHI.
26719 if (P->getNumIncomingValues() == 2) {
26720 // Try to match and vectorize a horizontal reduction.
26721 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26722 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26723 Changed = true;
26724 It = BB->begin();
26725 E = BB->end();
26726 continue;
26727 }
26728 }
26729 // Try to vectorize the incoming values of the PHI, to catch reductions
26730 // that feed into PHIs.
26731 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26732 // Skip if the incoming block is the current BB for now. Also, bypass
26733 // unreachable IR for efficiency and to avoid crashing.
26734 // TODO: Collect the skipped incoming values and try to vectorize them
26735 // after processing BB.
26736 if (BB == P->getIncomingBlock(I) ||
26737 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26738 continue;
26739
26740 // Postponed instructions should not be vectorized here; delay their
26741 // vectorization.
26742 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26743 PI && !IsInPostProcessInstrs(PI)) {
26744 bool Res =
26745 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26746 Changed |= Res;
26747 if (Res && R.isDeleted(P)) {
26748 It = BB->begin();
26749 E = BB->end();
26750 break;
26751 }
26752 }
26753 }
26754 continue;
26755 }
26756
26757 if (HasNoUsers(&*It)) {
26758 bool OpsChanged = false;
26759 auto *SI = dyn_cast<StoreInst>(It);
26760 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26761 if (SI) {
26762 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26763 // Try to vectorize the chain feeding this store, if it is the only store to the
26764 // address in the block.
26765 // TODO: This is just a temporary solution to save compile time. Need
26766 // to investigate if we can safely turn on slp-vectorize-hor-store
26767 // instead to allow lookup for reduction chains in all non-vectorized
26768 // stores (need to check side effects and compile time).
26769 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26770 SI->getValueOperand()->hasOneUse();
26771 }
26772 if (TryToVectorizeRoot) {
26773 for (auto *V : It->operand_values()) {
26774 // Postponed instructions should not be vectorized here; delay their
26775 // vectorization.
26776 if (auto *VI = dyn_cast<Instruction>(V);
26777 VI && !IsInPostProcessInstrs(VI))
26778 // Try to match and vectorize a horizontal reduction.
26779 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26780 }
26781 }
26782 // Start vectorization of the post-process list of instructions from the
26783 // top-tree instructions, to try to vectorize as many instructions as
26784 // possible.
26785 OpsChanged |=
26786 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26787 if (OpsChanged) {
26788 // We would like to start over since some instructions are deleted
26789 // and the iterator may become invalid.
26790 Changed = true;
26791 It = BB->begin();
26792 E = BB->end();
26793 continue;
26794 }
26795 }
26796
26797 if (isa<InsertElementInst, InsertValueInst>(It))
26798 PostProcessInserts.insert(&*It);
26799 else if (isa<CmpInst>(It))
26800 PostProcessCmps.insert(cast<CmpInst>(&*It));
26801 }
26802
26803 return Changed;
26804}
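For context, a hypothetical scalar pattern, illustrative only and not code from this pass, of the kind the loop above feeds into vectorizeRootInstruction: an add chain rooted at a store, which the horizontal-reduction matching can turn into a vector reduction. The function name storeSum4 is invented.

// Hypothetical input: a horizontal add rooted at a store. The store's value
// operand is walked above and handed to vectorizeRootInstruction.
static void storeSum4(const int *A, int *Out) {
  Out[0] = A[0] + A[1] + A[2] + A[3]; // add chain rooted at the store
}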
26805
26806bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26807 auto Changed = false;
26808 for (auto &Entry : GEPs) {
26809 // If the getelementptr list has fewer than two elements, there's nothing
26810 // to do.
26811 if (Entry.second.size() < 2)
26812 continue;
26813
26814 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26815 << Entry.second.size() << ".\n");
26816
26817 // Process the GEP list in chunks suitable for the target's supported
26818 // vector size. If a vector register can't hold 1 element, we are done. We
26819 // are trying to vectorize the index computations, so the maximum number of
26820 // elements is based on the size of the index expression, rather than the
26821 // size of the GEP itself (the target's pointer size).
26822 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26823 return !R.isDeleted(GEP);
26824 });
26825 if (It == Entry.second.end())
26826 continue;
26827 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26828 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26829 if (MaxVecRegSize < EltSize)
26830 continue;
26831
26832 unsigned MaxElts = MaxVecRegSize / EltSize;
26833 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26834 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26835 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26836
26837 // Initialize a set of candidate getelementptrs. Note that we use a
26838 // SetVector here to preserve program order. If the index computations
26839 // are vectorizable and begin with loads, we want to minimize the chance
26840 // of having to reorder them later.
26841 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26842
26843 // Some of the candidates may have already been vectorized after we
26844 // initially collected them, or their index may have been optimized to a constant.
26845 // If so, they are marked as deleted, so remove them from the set of
26846 // candidates.
26847 Candidates.remove_if([&R](Value *I) {
26848 return R.isDeleted(cast<Instruction>(I)) ||
26849 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26850 });
26851
26852 // Remove from the set of candidates all pairs of getelementptrs with
26853 // constant differences. Such getelementptrs are likely not good
26854 // candidates for vectorization in a bottom-up phase since one can be
26855 // computed from the other. We also ensure all candidate getelementptr
26856 // indices are unique.
26857 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26858 auto *GEPI = GEPList[I];
26859 if (!Candidates.count(GEPI))
26860 continue;
26861 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26862 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26863 auto *GEPJ = GEPList[J];
26864 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26865 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26866 Candidates.remove(GEPI);
26867 Candidates.remove(GEPJ);
26868 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26869 Candidates.remove(GEPJ);
26870 }
26871 }
26872 }
26873
26874 // We break out of the above computation as soon as we know there are
26875 // fewer than two candidates remaining.
26876 if (Candidates.size() < 2)
26877 continue;
26878
26879 // Add the single, non-constant index of each candidate to the bundle. We
26880 // ensured the indices met these constraints when we originally collected
26881 // the getelementptrs.
26882 SmallVector<Value *, 16> Bundle(Candidates.size());
26883 auto BundleIndex = 0u;
26884 for (auto *V : Candidates) {
26885 auto *GEP = cast<GetElementPtrInst>(V);
26886 auto *GEPIdx = GEP->idx_begin()->get();
26887 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26888 Bundle[BundleIndex++] = GEPIdx;
26889 }
26890
26891 // Try to vectorize the indices. We are currently only interested in
26892 // gather-like cases of the form:
26893 //
26894 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26895 //
26896 // where the loads of "a", the loads of "b", and the subtractions can be
26897 // performed in parallel. It's likely that detecting this pattern in a
26898 // bottom-up phase will be simpler and less costly than building a
26899 // full-blown top-down phase beginning at the consecutive loads.
26900 Changed |= tryToVectorizeList(Bundle, R);
26901 }
26902 }
26903 return Changed;
26904}
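Written out as plain C++, an illustrative sketch of the gather-like pattern named in the comment above (the function name gatherSum4 is invented): note that it is the index computations A[i] - B[i] that are bundled and vectorized here, while the loads of G remain scalar.

// Hypothetical gather-like pattern: the subtractions forming the indices can
// be vectorized even though the dependent loads of G stay scalar.
static int gatherSum4(const int *G, const int *A, const int *B) {
  return G[A[0] - B[0]] + G[A[1] - B[1]] + G[A[2] - B[2]] + G[A[3] - B[3]];
}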
26905
26906bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26907 bool Changed = false;
26908 // Sort by type, base pointer and value operand. Value operands must be
26909 // compatible (have the same opcode and the same parent); otherwise it is
26910 // definitely not profitable to try to vectorize them.
26911 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26912 if (V->getValueOperand()->getType()->getTypeID() <
26913 V2->getValueOperand()->getType()->getTypeID())
26914 return true;
26915 if (V->getValueOperand()->getType()->getTypeID() >
26916 V2->getValueOperand()->getType()->getTypeID())
26917 return false;
26918 if (V->getPointerOperandType()->getTypeID() <
26919 V2->getPointerOperandType()->getTypeID())
26920 return true;
26921 if (V->getPointerOperandType()->getTypeID() >
26922 V2->getPointerOperandType()->getTypeID())
26923 return false;
26924 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26925 V2->getValueOperand()->getType()->getScalarSizeInBits())
26926 return true;
26927 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26928 V2->getValueOperand()->getType()->getScalarSizeInBits())
26929 return false;
26930 // UndefValues are compatible with all other values.
26931 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26932 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26933 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26934 DT->getNode(I1->getParent());
26935 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26936 DT->getNode(I2->getParent());
26937 assert(NodeI1 && "Should only process reachable instructions");
26938 assert(NodeI2 && "Should only process reachable instructions");
26939 assert((NodeI1 == NodeI2) ==
26940 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26941 "Different nodes should have different DFS numbers");
26942 if (NodeI1 != NodeI2)
26943 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26944 return I1->getOpcode() < I2->getOpcode();
26945 }
26946 return V->getValueOperand()->getValueID() <
26947 V2->getValueOperand()->getValueID();
26948 };
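The comparator above is essentially a lexicographic ordering. Below is a simplified sketch of the same idea using std::tie; it is my own illustration, with the StoreKey struct and lessByStoreKey name invented, and it omits the dominator-tree DFS-number and opcode tie-breakers that the real comparator applies to instruction operands.

#include <tuple>

// Simplified illustration of StoreSorter's primary key: type ID of the value
// operand, type ID of the pointer operand, then scalar size in bits.
struct StoreKey {
  unsigned ValueTypeID;
  unsigned PointerTypeID;
  unsigned ScalarSizeInBits;
};

static bool lessByStoreKey(const StoreKey &L, const StoreKey &R) {
  return std::tie(L.ValueTypeID, L.PointerTypeID, L.ScalarSizeInBits) <
         std::tie(R.ValueTypeID, R.PointerTypeID, R.ScalarSizeInBits);
}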
26949
26950 bool SameParent = true;
26951 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26952 if (VL.empty()) {
26953 SameParent = true;
26954 return true;
26955 }
26956 StoreInst *V2 = VL.back();
26957 if (V1 == V2)
26958 return true;
26959 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26960 return false;
26961 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26962 return false;
26963 // Undefs are compatible with any other value.
26964 if (isa<UndefValue>(V1->getValueOperand()) ||
26965 isa<UndefValue>(V2->getValueOperand()))
26966 return true;
26967 if (isa<Constant>(V1->getValueOperand()) &&
26968 isa<Constant>(V2->getValueOperand()))
26969 return true;
26970 // Check if the operands of the stores can be vectorized. They can be
26971 // vectorized if they have compatible operands, or operands that can be
26972 // vectorized as copyables.
26973 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26974 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26975 if (I1 || I2) {
26976 // Accept only tail-following non-compatible values for now.
26977 // TODO: investigate if it is possible to vectorize incompatible values,
26978 // if the copyables are first in the list.
26979 if (I1 && !I2)
26980 return false;
26981 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26982 SmallVector<Value *> NewVL(VL.size() + 1);
26983 for (auto [SI, V] : zip(VL, NewVL))
26984 V = SI->getValueOperand();
26985 NewVL.back() = V1->getValueOperand();
26986 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26987 InstructionsState S = Analysis.buildInstructionsState(
26988 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26989 /*SkipSameCodeCheck=*/!SameParent);
26990 if (S)
26991 return true;
26992 if (!SameParent)
26993 return false;
26994 }
26995 return V1->getValueOperand()->getValueID() ==
26996 V2->getValueOperand()->getValueID();
26997 };
26998
26999 // Attempt to sort and vectorize each of the store-groups.
27000 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
27001 for (auto &Pair : Stores) {
27002 if (Pair.second.size() < 2)
27003 continue;
27004
27005 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
27006 << Pair.second.size() << ".\n");
27007
27008 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
27009 continue;
27010
27011 // Reverse stores to do bottom-to-top analysis. This is important if values
27012 // are stored to the same address several times; in this case we need to
27013 // follow the store order (reversed to meet the memory dependencies).
27014 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
27015 Pair.second.rend());
27016 Changed |= tryToVectorizeSequence<StoreInst>(
27017 ReversedStores, StoreSorter, AreCompatibleStores,
27018 [&](ArrayRef<StoreInst *> Candidates, bool) {
27019 return vectorizeStores(Candidates, R, Attempted);
27020 },
27021 /*MaxVFOnly=*/false, R);
27022 }
27023 return Changed;
27024}
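To make the store-chain grouping concrete, here is a hypothetical scalar pattern, illustrative only, of the kind this function collects per underlying base object, sorts with StoreSorter, and hands to vectorizeStores: four consecutive stores whose value operands share an opcode, which can become a single vector store. The function name addStore4 is invented.

// Hypothetical store chain: four consecutive stores through the same base
// pointer whose value operands share an opcode, vectorizable as a single
// <4 x i32> store.
static void addStore4(const int *A, const int *B, int *Out) {
  Out[0] = A[0] + B[0];
  Out[1] = A[1] + B[1];
  Out[2] = A[2] + B[2];
  Out[3] = A[3] + B[3];
}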
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
Definition ExpandFp.cpp:994
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
static Type * getIndexType(Value *In)
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1407
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1397
void negate()
Negate this APInt in place.
Definition APInt.h:1469
unsigned logBase2() const
Definition APInt.h:1762
void setAllBits()
Set every bit to 1.
Definition APInt.h:1320
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1368
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:178
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:219
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:195
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:157
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static bool shouldExecute(CounterInfo &Counter)
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.

Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:154
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
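MapVector matters here because it preserves insertion order, giving deterministic iteration where a plain hash map would not. A small illustrative sketch of the try_emplace/iteration idiom (the helper name is made up):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Count occurrences while remembering the first-seen order of the keys,
// so the printed order is deterministic regardless of hashing.
static void countInOrder(ArrayRef<int> Keys) {
  MapVector<int, unsigned> Counts;
  for (int K : Keys)
    ++Counts.try_emplace(K, 0u).first->second;
  for (const auto &[Key, Count] : Counts)
    errs() << Key << " -> " << Count << "\n";
}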
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory referenced by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
T & front() const
front - Get the first element.
Definition ArrayRef.h:349
iterator end() const
Definition ArrayRef.h:343
iterator begin() const
Definition ArrayRef.h:342
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
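These ScalarEvolution queries allow pointer distances to be reasoned about symbolically when simple constant-offset analysis is not enough. A hedged sketch of comparing two pointers through SCEV (the helper name is illustrative):

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Value.h"
#include <optional>

using namespace llvm;

// Return the byte distance PtrB - PtrA if SCEV can prove it is a constant.
static std::optional<int64_t> constantPtrDistance(ScalarEvolution &SE,
                                                  Value *PtrA, Value *PtrB) {
  const SCEV *Dist = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (const auto *C = dyn_cast<SCEVConstant>(Dist))
    return C->getAPInt().getSExtValue();
  return std::nullopt;
}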
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:91
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:132
void insert_range(Range &&R)
Definition SetVector.h:176
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:94
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
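SetVector combines set semantics with stable iteration order, which is why it is the usual container for worklists in this kind of code. A tiny sketch of the insert/takeVector idiom (the helper is made up for illustration):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Deduplicate while preserving the order in which values were first seen.
static SmallVector<int, 8> uniqueInOrder(ArrayRef<int> In) {
  SmallSetVector<int, 8> Seen;
  for (int V : In)
    Seen.insert(V); // insert() returns false and does nothing for duplicates
  return Seen.takeVector();
}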
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
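These static mask classifiers let shuffle masks be categorized without materializing instructions, which is how the cost model recognizes cheap shuffles. A minimal illustration on hand-written masks (values chosen only for the example):

#include "llvm/IR/Instructions.h"

using namespace llvm;

static void classifyMasks() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[]  = {3, 2, 1, 0};
  int Extract[]  = {4, 5, 6, 7};

  bool IsId  = ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);
  bool IsRev = ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4);

  int Index = 0;
  // Recognizes <4,5,6,7> as extracting a 4-wide subvector starting at lane 4
  // of an 8-element source, reporting the start lane in Index.
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/8, Index);
  (void)IsId; (void)IsRev; (void)IsExtract;
}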
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
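SmallBitVector backs much of the per-lane bookkeeping. The find_first/find_next pair is the standard way to walk only the set bits, as in this small sketch:

#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void dumpSetLanes() {
  SmallBitVector Lanes(8);
  Lanes.set(1);
  Lanes.set(5);
  // Visit only the set bits: prints lane 1 and lane 5.
  for (int Bit = Lanes.find_first(); Bit != -1; Bit = Lanes.find_next(Bit))
    errs() << "lane " << Bit << " is live\n";
}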
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
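These TargetTransformInfo hooks form the cost-model backbone: vectorization pays off only when the vector cost, including any shuffles, beats the summed scalar cost. The following is a rough, hedged sketch of such a comparison; the helper name and the single-permute assumption are illustrative, not the pass's actual logic:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Very rough profitability check: VF scalar adds vs. one vector add plus a
// single-source permute.
static bool vectorAddLooksProfitable(const TargetTransformInfo &TTIRef,
                                     Type *EltTy, unsigned VF) {
  auto *VecTy = FixedVectorType::get(EltTy, VF);
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;

  InstructionCost ScalarCost = 0;
  for (unsigned I = 0; I < VF; ++I)
    ScalarCost += TTIRef.getArithmeticInstrCost(Instruction::Add, EltTy, CostKind);

  InstructionCost VectorCost =
      TTIRef.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind) +
      TTIRef.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
                            VecTy, /*Mask=*/{}, CostKind);
  return VectorCost < ScalarCost;
}

In practice the comparison also has to account for gather/insert overhead (getScalarizationOverhead) and external uses, which is what makes the real cost computation considerably more involved.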
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:180
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
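The Type predicates above guard most element-type legality checks. A compact illustration (the helper name and the 64-bit cutoff are made up for the example):

#include "llvm/IR/Type.h"

using namespace llvm;

// Loose stand-in for an "is this element type worth vectorizing" check.
static bool looksVectorizableElementType(Type *Ty) {
  Type *ScalarTy = Ty->getScalarType();
  if (ScalarTy->isIntegerTy())
    return ScalarTy->getScalarSizeInBits() <= 64;
  return ScalarTy->isFloatingPointTy() || ScalarTy->isPointerTy();
}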
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:24
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
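The use-list queries above drive the bookkeeping when a scalar is replaced by an extract from a vectorized value. A short sketch of the common use_empty/replaceAllUsesWith pattern (the helper name is illustrative):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// If the old scalar has no remaining users it can simply be dropped;
// otherwise reroute every user to the replacement value.
static void replaceAndMaybeErase(Instruction *OldScalar, Value *Replacement) {
  if (OldScalar->use_empty()) {
    OldScalar->eraseFromParent();
    return;
  }
  OldScalar->replaceAllUsesWith(Replacement);
}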
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks how the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
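Taken together, the BoUpSLP members listed above form the driver sequence of the pass. The following is a heavily condensed sketch of how that public API can be chained; the helper name and the threshold parameter are illustrative, the real pass interleaves additional checks and orders a few of these steps differently, and the code would have to live inside SLPVectorizer.cpp, where BoUpSLP and the necessary headers are already available.

// Condensed, illustrative sketch only.
static bool tryVectorizeSketch(BoUpSLP &R, ArrayRef<Value *> Roots,
                               int CostThreshold) {
  SmallDenseSet<Value *> UserIgnoreList; // no extra ignored users in this sketch
  R.buildTree(Roots, UserIgnoreList);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  R.transformNodes();
  InstructionCost Cost = R.getTreeCost();
  // CostThreshold stands in for the pass's cost-threshold option.
  if (Cost >= -CostThreshold)
    return false;
  R.vectorizeTree();
  return true;
}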
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
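The PatternMatch helpers listed here are the building blocks for recognizing reduction and min/max shapes. A small, self-contained illustration of the match() idiom (the predicate name is made up):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// True if V is an 'add' whose right-hand side is a constant integer
// (or a splat of one), capturing the left-hand operand in LHS.
static bool matchAddOfConstant(Value *V, Value *&LHS) {
  const APInt *C;
  return match(V, m_Add(m_Value(LHS), m_APInt(C)));
}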
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
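createSimpleReduction is the utility that collapses a vector of partial results into a single scalar. A hedged one-liner of its use, assuming the builder is already positioned at the desired insertion point:

#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Collapse a vector of partial sums into a single scalar with an add reduction.
static Value *emitAddReduction(IRBuilderBase &Builder, Value *VecOfPartialSums) {
  return createSimpleReduction(Builder, VecOfPartialSums, RecurKind::Add);
}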
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2070
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2494
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1729
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2253
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1993
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2140
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1980
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1775
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:359
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1932
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
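getPointersDiff and sortPtrAccesses are the LoopAccessAnalysis helpers behind consecutive-access detection. A hedged sketch of checking whether two loads are adjacent in memory (assuming both load the same element type; the helper name is illustrative):

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>

using namespace llvm;

// True if LB loads the element immediately after LA.
static bool areConsecutiveLoads(LoadInst *LA, LoadInst *LB,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int64_t> Diff =
      getPointersDiff(LA->getType(), LA->getPointerOperand(), LB->getType(),
                      LB->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1; // distance is measured in elements of the type
}

A distance of 1 element is what makes two loads candidates for a single wide load; sortPtrAccesses is used first when the inputs are not already in address order.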
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1966
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2042
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1847
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2100
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1437
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1446
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)