//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> SplitAlternateInstructions(
    "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
    cl::desc("Improve the code quality by splitting alternate instructions"));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
                             cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand reordering this one is less frequently used,
// hence the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
                       cl::desc("Disable tree reordering even if it is "
                                "profitable. Used for testing only."));

static cl::opt<bool>
    ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
                      cl::desc("Generate strided loads even if they are not "
                               "profitable. Used for testing only."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

/// Enables vectorization of copyable elements.
static cl::opt<bool> VectorizeCopyableElements(
    "slp-copyable-elements", cl::init(true), cl::Hidden,
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for a Cmp - the type of the compare
/// operands; and for an insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}
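
// Illustrative note (not from the original source): with REVEC, ScalarTy may
// itself be a fixed vector, in which case the element counts multiply. E.g.
// getWidenedType(<2 x i16>, 4) yields <8 x i16>, while for a plain scalar
// getWidenedType(float, 4) yields <4 x float>.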

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}
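
// Worked example (assuming, for illustration, a target with 128-bit vector
// registers): for Ty = i32 and Sz = 6, <6 x i32> legalizes into NumParts = 2
// registers, so the result is bit_ceil(divideCeil(6, 2)) * 2 = 4 * 2 = 8,
// i.e. 6 elements are rounded up to two whole <4 x i32> registers.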

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
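
// Worked example (same illustrative 128-bit-register assumption): for Ty = i32
// and Sz = 7, NumParts = 2 and RegVF = bit_ceil(divideCeil(7, 2)) = 4, so the
// result is (7 / 4) * 4 = 4, i.e. 7 elements are rounded down to one whole
// <4 x i32> register.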

static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an "element".
  // But the element has a different meaning for SLP (scalar) and REVEC
  // (vector). We need to expand Mask into masks which shufflevector can use
  // directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
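
// Worked example (illustrative): for VecTyNumElements = 2 and Mask = [1, 0],
// each scalar index expands into a run of vector-element indices, producing
// NewMask = [2, 3, 0, 1].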

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevector instructions.
/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
/// 3. Together, the masks of the shufflevectors use all of the elements of the
///    source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 group
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// Returns power-of-2 number of elements in a single register (part), given the
/// total number of elements \p Size and number of registers (parts) \p
/// NumParts.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
}
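
// Worked example (illustrative): Size = 12 and NumParts = 3 gives
// min(12, bit_ceil(divideCeil(12, 3))) = min(12, 4) = 4 elements per part.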

/// Returns correct remaining number of elements, considering total amount \p
/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
/// and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
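
// Worked example (illustrative): Size = 10 and PartNumElems = 4 splits into
// parts of 4, 4 and 2 elements; for Part = 2 the result is min(4, 10 - 8) = 2.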

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  Instruction *I0 = cast<Instruction>(*It);
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal integer/FP
  // constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
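
// Example (illustrative): {%x, undef, %x} is a splat of %x, while
// {undef, undef} is not, since there is no non-undef value to splat.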

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
/// patterns that make it effectively commutative (like equality comparisons
/// with zero).
/// In most cases, users should not call this function directly (since \p I and
/// \p ValWithUses are the same). However, when analyzing interchangeable
/// instructions, we need to use the converted opcode along with the original
/// uses.
/// \param I The instruction to check for commutativity
/// \param ValWithUses The value whose uses are analyzed for special
/// patterns
static bool isCommutative(Instruction *I, Value *ValWithUses,
                          bool IsCopyable = false) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !ValWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(
                ValWithUses->uses(),
                [&](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  auto *I = dyn_cast<BinaryOperator>(U.get());
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !ValWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(ValWithUses->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
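
// Example (illustrative IR): the sub below may be treated as commutative,
// because its only user is an equality comparison with zero, and
// (%a - %b == 0) iff (%b - %a == 0):
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0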

/// This is a helper function to check whether \p I is commutative.
/// This is a convenience wrapper that calls the two-parameter version of
/// isCommutative with the same instruction for both parameters. This is
/// the common case where the instruction being checked for commutativity
/// is the same as the instruction whose uses are analyzed for special
/// patterns (see the two-parameter version above for details).
/// \param I The instruction to check for commutativity
/// \returns true if the instruction is commutative, false otherwise
static bool isCommutative(Instruction *I) { return isCommutative(I, I); }

/// \returns number of operands of \p I, considering commutativity. Returns 2
/// for commutative intrinsics.
/// \param I The instruction to check for commutativity
static unsigned getNumberOfOperands(Instruction *I) {
  if (isa<IntrinsicInst>(I) && isCommutative(I)) {
    // IntrinsicInst::isCommutative returns true if swapping the first "two"
    // arguments to the intrinsic produces the same result.
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  }
  return I->getNumOperands();
}

template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
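
// Worked example (illustrative): for an insertvalue into {i32, [2 x i32]} with
// indices {1, 1}, the walk computes Index = (0 * 2 + 1) * 2 + 1 = 3, the
// flattened position of the inserted scalar.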

/// \returns true if all of the values in \p VL use the same opcode.
/// For comparison instructions, also checks if predicates match.
/// PoisonValues are considered matching.
/// Interchangeable instructions are not considered.
static bool allSameOpcode(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return true;
  Instruction *MainOp = cast<Instruction>(*It);
  unsigned Opcode = MainOp->getOpcode();
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
  });
}

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
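
// Worked example (illustrative): with VF = 4, Mask = [0, 5, 1, -1] and
// UseMask::FirstArg, mask values 0 and 1 refer to the first argument, so bits
// 0 and 1 are cleared and the returned bitset keeps only bits 2 and 3 set.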

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V);
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from the different blocks.
static bool isUsedOutsideBlock(Value *V);
/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V);

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  return !Instruction::isIntDivRem(Opcode);
}

namespace {

/// Helper class that determines whether VL can use the same opcode.
/// Alternate instruction is supported. In addition, it supports interchangeable
/// instructions. An interchangeable instruction is an instruction that can be
/// converted to another instruction with the same semantics. For example,
/// x << 1 is equal to x * 2. x * 1 is equal to x | 0.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  /// Sort SupportedOp because it is used by binary_search.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  enum : MaskType {
    ShlBIT = 0b1,
    AShrBIT = 0b10,
    MulBIT = 0b100,
    AddBIT = 0b1000,
    SubBIT = 0b10000,
    AndBIT = 0b100000,
    OrBIT = 0b1000000,
    XorBIT = 0b10000000,
    MainOpBIT = 0b100000000,
    LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position. We check the
  /// right-hand side first (1). If the right hand side is not a ConstantInt and
  /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
  /// side (0).
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    (void)SupportedOp;
    auto *BinOp = cast<BinaryOperator>(I);
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
      return {CI, 1};
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// Each set bit represents an opcode that MainOp can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
    /// 1]. SeenBefore is used to know what operations have been seen before.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Returning false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask would destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
    }
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    /// Return true if the instruction can be converted to \p Opcode.
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        return false;
      default:
        break;
      }
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      APInt ToCIValue;
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
                                          FromCIValue.getZExtValue());
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Mul:
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          ToCIValue = APInt::getZero(FromCIValueBitWidth);
        } else {
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
          ToCIValue.negate();
        }
        break;
      case Instruction::And:
        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::Mul
                        ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
                        : APInt::getZero(FromCIValueBitWidth);
        break;
      default:
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = APInt::getZero(FromCIValueBitWidth);
        break;
      }
      Value *LHS = I->getOperand(1 - Pos);
      Constant *RHS =
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      // constant + x cannot be -constant - x
      // instead, it should be x - -constant
      if (Pos == 1 ||
          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
            FromOpcode == Instruction::Xor) &&
           ToOpcode == Instruction::Sub))
        return SmallVector<Value *>({LHS, RHS});
      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  }
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
    assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
  }
  bool add(const Instruction *I) {
    assert(I->isBinaryOp() &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        if (CIValue.ult(CIValue.getBitWidth()))
          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      case Instruction::Xor:
        if (CIValue.isZero())
          InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  /// Checks if the list of potential opcodes includes \p Opcode.
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(I);
  }
};
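
// Example (illustrative): for VL = [a << 1, b * 2], both instructions admit the
// interchangeable mask MulBIT | ShlBIT, so the helper unifies them under one
// opcode; getMainOpcode() prefers Shl, and b * 2 is rewritten as b << 1.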

/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the resulting
  /// vectorized output ultimately comes from a shufflevector. For example,
  /// given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///                        <4 x i32> intermediated_1,
  ///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the cost
  /// (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  /// Whether the instruction state represents copyable instructions.
  bool HasCopyables = false;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Checks if the instruction matches either the main or alternate opcode.
  /// \returns
  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
  /// to it
  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
  /// it
  /// - nullptr if \param I cannot be matched or converted to either opcode
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    // Prefer AltOp instead of interchangeable instruction of MainOp.
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (!Converter.add(I) || !Converter.add(MainOp))
      return nullptr;
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
        return AltOp;
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      return nullptr;
    return Converter.hasAltOp() ? AltOp : MainOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul,  Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(MulDiv, getOpcode()) &&
           is_contained(MulDiv, getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(AddSub, getOpcode()) &&
           is_contained(AddSub, getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  /// Checks if the value is a copyable element.
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (!HasCopyables)
      return false;
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      return false;
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return !isa<PoisonValue>(V);
    if (I->getParent() != MainOp->getParent() &&
        (!isVectorLikeInstWithConstOps(I) ||
         !isVectorLikeInstWithConstOps(MainOp)))
      return true;
    if (I->getOpcode() == MainOp->getOpcode())
      return false;
    if (!I->isBinaryOp())
      return true;
    BinOpSameOpcodeHelper Converter(MainOp);
    return !Converter.add(I) || !Converter.add(MainOp) ||
           Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
  }

  /// Checks if the value is non-schedulable.
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    auto *I = dyn_cast<Instruction>(V);
    if (!HasCopyables)
      return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
             doesNotNeedToBeScheduled(V);
    // MainOp for copyables is always schedulable, to correctly identify
    // non-schedulable copyables.
    if (getMainOp() == V)
      return false;
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
               (doesNotNeedToBeScheduled(V) &&
                // If the copyable instruction comes after MainOp
                // (non-schedulable, but used in the block) - cannot vectorize
                // it, will possibly generate use before def.
                !MainOp->comesBefore(I));
      };

      return IsNonSchedulableCopyableElement(V);
    }
    return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
           doesNotNeedToBeScheduled(V);
  }

  /// Checks if the state represents copyable instructions.
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};

std::pair<Instruction *, SmallVector<Value *>>
convertTo(Instruction *I, const InstructionsState &S) {
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    BinOpSameOpcodeHelper Converter(I);
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  }
  return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
}
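
// Example (illustrative): if S unifies a bundle under Mul and I is `shl %x, 1`,
// convertTo(I, S) returns the matching Mul instruction together with the
// rewritten operands {%x, 2}, since x << 1 == x * 2.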

} // end anonymous namespace

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);

/// Find an instruction with a specific opcode in VL.
/// \param VL Array of values to search through. Must contain only Instructions
/// and PoisonValues.
/// \param Opcode The instruction opcode to search for
/// \returns
/// - The first instruction found with matching opcode
/// - nullptr if no matching instruction is found
static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
                                              unsigned Opcode) {
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
    auto *Inst = cast<Instruction>(V);
    if (Inst->getOpcode() == Opcode)
      return Inst;
  }
  return nullptr;
}

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
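
// Example (illustrative): `icmp sgt i32 %a, %b` is "same or swapped" relative
// to a base `icmp slt i32 %b, %a`: swapping the operands and predicate of the
// sgt yields the base comparison.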

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // The total number of predicates is > 2, but only 2 remain once swapped
    // predicates are considered compatible; treat the swappable predicates as
    // compatible opcodes, not as alternates.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
1528 // Check for one alternate opcode from another BinaryOperator.
1529 // TODO - generalize to support all operators (types, calls etc.).
1530 Intrinsic::ID BaseID = 0;
1531 SmallVector<VFInfo> BaseMappings;
1532 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1533 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1534 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1535 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1536 return InstructionsState::invalid();
1537 }
1538 bool AnyPoison = InstCnt != VL.size();
1539 // Check MainOp too to be sure that it matches the requirements for the
1540 // instructions.
1541 for (Value *V : iterator_range(It, VL.end())) {
1542 auto *I = dyn_cast<Instruction>(V);
1543 if (!I)
1544 continue;
1545
1546 // Cannot combine poison and divisions.
1547 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1548 // intrinsics/functions only.
1549 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1550 return InstructionsState::invalid();
1551 unsigned InstOpcode = I->getOpcode();
1552 if (IsBinOp && isa<BinaryOperator>(I)) {
1553 if (BinOpHelper.add(I))
1554 continue;
1555 } else if (IsCastOp && isa<CastInst>(I)) {
1556 Value *Op0 = MainOp->getOperand(0);
1557 Type *Ty0 = Op0->getType();
1558 Value *Op1 = I->getOperand(0);
1559 Type *Ty1 = Op1->getType();
1560 if (Ty0 == Ty1) {
1561 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1562 continue;
1563 if (Opcode == AltOpcode) {
1564 assert(isValidForAlternation(Opcode) &&
1565 isValidForAlternation(InstOpcode) &&
1566 "Cast isn't safe for alternation, logic needs to be updated!");
1567 AltOpcode = InstOpcode;
1568 AltOp = I;
1569 continue;
1570 }
1571 }
1572 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1573 auto *BaseInst = cast<CmpInst>(MainOp);
1574 Type *Ty0 = BaseInst->getOperand(0)->getType();
1575 Type *Ty1 = Inst->getOperand(0)->getType();
1576 if (Ty0 == Ty1) {
1577 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1578 assert(InstOpcode == AltOpcode &&
1579 "Alternate instructions are only supported by BinaryOperator "
1580 "and CastInst.");
1581 // Check for compatible operands. If the corresponding operands are not
1582 // compatible - need to perform alternate vectorization.
1583 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1584 CmpInst::Predicate SwappedCurrentPred =
1585 CmpInst::getSwappedPredicate(CurrentPred);
1586
1587 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1588 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1589 continue;
1590
1591 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1592 continue;
1593 auto *AltInst = cast<CmpInst>(AltOp);
1594 if (MainOp != AltOp) {
1595 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1596 continue;
1597 } else if (BasePred != CurrentPred) {
1598 assert(
1599 isValidForAlternation(InstOpcode) &&
1600 "CmpInst isn't safe for alternation, logic needs to be updated!");
1601 AltOp = I;
1602 continue;
1603 }
1604 CmpInst::Predicate AltPred = AltInst->getPredicate();
1605 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1606 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1607 continue;
1608 }
1609 } else if (InstOpcode == Opcode) {
1610 assert(InstOpcode == AltOpcode &&
1611 "Alternate instructions are only supported by BinaryOperator and "
1612 "CastInst.");
1613 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1614 if (Gep->getNumOperands() != 2 ||
1615 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1616 return InstructionsState::invalid();
1617 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1618 if (!isVectorLikeInstWithConstOps(EI))
1619 return InstructionsState::invalid();
1620 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1621 auto *BaseLI = cast<LoadInst>(MainOp);
1622 if (!LI->isSimple() || !BaseLI->isSimple())
1623 return InstructionsState::invalid();
1624 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1625 auto *CallBase = cast<CallInst>(MainOp);
1626 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1627 return InstructionsState::invalid();
1628 if (Call->hasOperandBundles() &&
1629 (!CallBase->hasOperandBundles() ||
1630 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1631 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1632 CallBase->op_begin() +
1633 CallBase->getBundleOperandsStartIndex())))
1634 return InstructionsState::invalid();
1635 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1636 if (ID != BaseID)
1637 return InstructionsState::invalid();
1638 if (!ID) {
1639 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1640 if (Mappings.size() != BaseMappings.size() ||
1641 Mappings.front().ISA != BaseMappings.front().ISA ||
1642 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1643 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1644 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1645 Mappings.front().Shape.Parameters !=
1646 BaseMappings.front().Shape.Parameters)
1647 return InstructionsState::invalid();
1648 }
1649 }
1650 continue;
1651 }
1652 return InstructionsState::invalid();
1653 }
1654
1655 if (IsBinOp) {
1656 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1657 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1658 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1659 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1660 }
1661 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1662 "Incorrect implementation of allSameOpcode.");
1663 InstructionsState S(MainOp, AltOp);
1664 assert(all_of(VL,
1665 [&](Value *V) {
1666 return isa<PoisonValue>(V) ||
1667 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1668 }) &&
1669 "Invalid InstructionsState.");
1670 return S;
1671}
1672
1673/// \returns true if all of the values in \p VL have the same type or false
1674/// otherwise.
1675 static bool allSameType(ArrayRef<Value *> VL) {
1676 Type *Ty = VL.consume_front()->getType();
1677 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1678}
1679
1680/// \returns True if in-tree use also needs extract. This refers to
1681 /// a possible scalar operand in a vectorized instruction.
1682static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1683 TargetLibraryInfo *TLI,
1684 const TargetTransformInfo *TTI) {
1685 if (!UserInst)
1686 return false;
1687 unsigned Opcode = UserInst->getOpcode();
1688 switch (Opcode) {
1689 case Instruction::Load: {
1690 LoadInst *LI = cast<LoadInst>(UserInst);
1691 return (LI->getPointerOperand() == Scalar);
1692 }
1693 case Instruction::Store: {
1694 StoreInst *SI = cast<StoreInst>(UserInst);
1695 return (SI->getPointerOperand() == Scalar);
1696 }
1697 case Instruction::Call: {
1698 CallInst *CI = cast<CallInst>(UserInst);
1699 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1700 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1701 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1702 Arg.value().get() == Scalar;
1703 });
1704 }
1705 default:
1706 return false;
1707 }
1708}
1709
1710 /// \returns the AA location that is being accessed by the instruction.
1711 static MemoryLocation getLocation(Instruction *I) {
1712 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1713 return MemoryLocation::get(SI);
1714 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1715 return MemoryLocation::get(LI);
1716 return MemoryLocation();
1717}
1718
1719/// \returns True if the instruction is not a volatile or atomic load/store.
1720static bool isSimple(Instruction *I) {
1721 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1722 return LI->isSimple();
1723 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1724 return SI->isSimple();
1725 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1726 return !MI->isVolatile();
1727 return true;
1728}
1729
1730/// Shuffles \p Mask in accordance with the given \p SubMask.
1731 /// \param ExtendingManyInputs If true, supports reshuffling of the mask with
1732 /// not only one but two input vectors.
1733static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1734 bool ExtendingManyInputs = false) {
1735 if (SubMask.empty())
1736 return;
1737 assert(
1738 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1739 // Check if input scalars were extended to match the size of other node.
1740 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1741 "SubMask with many inputs support must be larger than the mask.");
1742 if (Mask.empty()) {
1743 Mask.append(SubMask.begin(), SubMask.end());
1744 return;
1745 }
1746 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1747 int TermValue = std::min(Mask.size(), SubMask.size());
1748 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1749 if (SubMask[I] == PoisonMaskElem ||
1750 (!ExtendingManyInputs &&
1751 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1752 continue;
1753 NewMask[I] = Mask[SubMask[I]];
1754 }
1755 Mask.swap(NewMask);
1756}
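// E.g., composing Mask = {1, 0, 3, 2} with SubMask = {2, 0, PoisonMaskElem, 1}
// (with ExtendingManyInputs == false) yields {3, 1, PoisonMaskElem, 0}: each
// SubMask element selects a position in the old Mask, and poison entries stay
// poison.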
1757
1758 /// Order may have elements assigned a special value (the size of the order),
1759 /// which is out of bounds. Such indices appear only at positions corresponding
1760 /// to undef values (see canReuseExtract for details) and are used to keep
1761 /// undef values from affecting the ordering of the operands.
1762/// The first loop below simply finds all unused indices and then the next loop
1763/// nest assigns these indices for undef values positions.
1764/// As an example below Order has two undef positions and they have assigned
1765/// values 3 and 7 respectively:
1766/// before: 6 9 5 4 9 2 1 0
1767/// after: 6 3 5 4 7 2 1 0
1768 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1769 const size_t Sz = Order.size();
1770 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1771 SmallBitVector MaskedIndices(Sz);
1772 for (unsigned I = 0; I < Sz; ++I) {
1773 if (Order[I] < Sz)
1774 UnusedIndices.reset(Order[I]);
1775 else
1776 MaskedIndices.set(I);
1777 }
1778 if (MaskedIndices.none())
1779 return;
1780 assert(UnusedIndices.count() == MaskedIndices.count() &&
1781 "Non-synced masked/available indices.");
1782 int Idx = UnusedIndices.find_first();
1783 int MIdx = MaskedIndices.find_first();
1784 while (MIdx >= 0) {
1785 assert(Idx >= 0 && "Indices must be synced.");
1786 Order[MIdx] = Idx;
1787 Idx = UnusedIndices.find_next(Idx);
1788 MIdx = MaskedIndices.find_next(MIdx);
1789 }
1790}
1791
1792/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1793/// Opcode1.
1794 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1795 unsigned Opcode0, unsigned Opcode1) {
1796 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1797 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1798 for (unsigned Lane : seq<unsigned>(VL.size())) {
1799 if (isa<PoisonValue>(VL[Lane]))
1800 continue;
1801 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1802 OpcodeMask.set(Lane * ScalarTyNumElements,
1803 Lane * ScalarTyNumElements + ScalarTyNumElements);
1804 }
1805 return OpcodeMask;
1806}
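// E.g., for VL = {add, sub, add, sub} with Opcode0 == Add, Opcode1 == Sub and
// a scalar element type, the returned bitset is {0, 1, 0, 1}.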
1807
1808/// Replicates the given \p Val \p VF times.
1810 unsigned VF) {
1811 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1812 "Expected scalar constants.");
1813 SmallVector<Constant *> NewVal(Val.size() * VF);
1814 for (auto [I, V] : enumerate(Val))
1815 std::fill_n(NewVal.begin() + I * VF, VF, V);
1816 return NewVal;
1817}
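// E.g., replicating {C0, C1} with VF == 2 yields {C0, C0, C1, C1}.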
1818
1819 static void inversePermutation(ArrayRef<unsigned> Indices,
1820 SmallVectorImpl<int> &Mask) {
1821 Mask.clear();
1822 const unsigned E = Indices.size();
1823 Mask.resize(E, PoisonMaskElem);
1824 for (unsigned I = 0; I < E; ++I)
1825 Mask[Indices[I]] = I;
1826}
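// E.g., Indices = {2, 0, 1} (element I is taken from position Indices[I])
// yields Mask = {1, 2, 0}, since Mask[Indices[I]] == I.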
1827
1828/// Reorders the list of scalars in accordance with the given \p Mask.
1829 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1830 ArrayRef<int> Mask) {
1831 assert(!Mask.empty() && "Expected non-empty mask.");
1832 SmallVector<Value *> Prev(Scalars.size(),
1833 PoisonValue::get(Scalars.front()->getType()));
1834 Prev.swap(Scalars);
1835 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1836 if (Mask[I] != PoisonMaskElem)
1837 Scalars[Mask[I]] = Prev[I];
1838}
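// E.g., Scalars = {a, b, c, d} with Mask = {3, 0, 1, 2} moves the element at
// position I to position Mask[I], producing {b, c, d, a}.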
1839
1840/// Checks if the provided value does not require scheduling. It does not
1841/// require scheduling if this is not an instruction or it is an instruction
1842/// that does not read/write memory and all operands are either not instructions
1843/// or phi nodes or instructions from different blocks.
1844 static bool areAllOperandsNonInsts(Value *V) {
1845 auto *I = dyn_cast<Instruction>(V);
1846 if (!I)
1847 return true;
1848 return !mayHaveNonDefUseDependency(*I) &&
1849 all_of(I->operands(), [I](Value *V) {
1850 auto *IO = dyn_cast<Instruction>(V);
1851 if (!IO)
1852 return true;
1853 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1854 });
1855}
1856
1857/// Checks if the provided value does not require scheduling. It does not
1858/// require scheduling if this is not an instruction or it is an instruction
1859/// that does not read/write memory and all users are phi nodes or instructions
1860 /// from different blocks.
1861static bool isUsedOutsideBlock(Value *V) {
1862 auto *I = dyn_cast<Instruction>(V);
1863 if (!I)
1864 return true;
1865 // Limits the number of uses to save compile time.
1866 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1867 all_of(I->users(), [I](User *U) {
1868 auto *IU = dyn_cast<Instruction>(U);
1869 if (!IU)
1870 return true;
1871 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1872 });
1873}
1874
1875/// Checks if the specified value does not require scheduling. It does not
1876/// require scheduling if all operands and all users do not need to be scheduled
1877/// in the current basic block.
1878 static bool doesNotNeedToBeScheduled(Value *V) {
1879 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1880}
1881
1882/// Checks if the specified array of instructions does not require scheduling.
1883 /// It is so if all instructions either have operands that do not require
1884 /// scheduling, or have users that do not require scheduling because they are
1885 /// phis or belong to other basic blocks.
1886 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1887 return !VL.empty() &&
1888 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1889}
1890
1891/// Returns true if widened type of \p Ty elements with size \p Sz represents
1892/// full vector type, i.e. adding extra element results in extra parts upon type
1893/// legalization.
1894 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1895 unsigned Sz) {
1896 if (Sz <= 1)
1897 return false;
1898 if (!isValidElementType(Ty))
1899 return false;
1900 if (has_single_bit(Sz))
1901 return true;
1902 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1903 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1904 Sz % NumParts == 0;
1905}
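// E.g., Sz == 8 is accepted outright (power of 2), and Sz == 24 is accepted on
// a target where <24 x Ty> legalizes into 3 parts of 8 elements each; Sz == 7
// is rejected.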
1906
1907/// Returns number of parts, the type \p VecTy will be split at the codegen
1908 /// phase. If the type is going to be scalarized or does not use whole
1909/// registers, returns 1.
1910static unsigned
1911 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1912 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1913 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1914 if (NumParts == 0 || NumParts >= Limit)
1915 return 1;
1916 unsigned Sz = getNumElements(VecTy);
1917 if (NumParts >= Sz || Sz % NumParts != 0 ||
1918 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1919 return 1;
1920 return NumParts;
1921}
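// E.g., a <16 x i32> vector on a target with 128-bit vector registers is split
// into 4 parts; a type that is scalarized or does not fill whole registers
// reports 1.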
1922
1923/// Bottom Up SLP Vectorizer.
1924 class BoUpSLP {
1925 class TreeEntry;
1926 class ScheduleEntity;
1927 class ScheduleData;
1928 class ScheduleCopyableData;
1929 class ScheduleBundle;
1930 class ShuffleCostEstimator;
1931 class ShuffleInstructionBuilder;
1932
1933 /// If we decide to generate strided load / store, this struct contains all
1934 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1935 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1936 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1937 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1938 /// the size of an element of the FixedVectorType.
1939 struct StridedPtrInfo {
1940 Value *StrideVal = nullptr;
1941 const SCEV *StrideSCEV = nullptr;
1942 FixedVectorType *Ty = nullptr;
1943 };
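// E.g., with Ty == <4 x i32> and StrideVal == 2, the byte stride fed to the
// strided load is 2 * 4 == 8 bytes.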
1944 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1945
1946public:
1947 /// Tracks the state we can represent the loads in the given sequence.
1948 enum class LoadsState {
1949 Gather,
1950 Vectorize,
1951 ScatterVectorize,
1952 StridedVectorize,
1953 CompressVectorize
1954 };
1955
1956 using ValueList = SmallVector<Value *, 8>;
1957 using InstrList = SmallVector<Instruction *, 4>;
1958 using ValueSet = SmallPtrSet<Value *, 16>;
1959 using StoreList = SmallVector<StoreInst *, 8>;
1960 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
1961 using OrdersType = SmallVector<unsigned, 4>;
1962
1963 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1964 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1965 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1966 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1967 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1968 AC(AC), DB(DB), DL(DL), ORE(ORE),
1969 Builder(Se->getContext(), TargetFolder(*DL)) {
1970 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1971 // Use the vector register size specified by the target unless overridden
1972 // by a command-line option.
1973 // TODO: It would be better to limit the vectorization factor based on
1974 // data type rather than just register size. For example, x86 AVX has
1975 // 256-bit registers, but it does not support integer operations
1976 // at that width (that requires AVX2).
1977 if (MaxVectorRegSizeOption.getNumOccurrences())
1978 MaxVecRegSize = MaxVectorRegSizeOption;
1979 else
1980 MaxVecRegSize =
1981 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1982 .getFixedValue();
1983
1984 if (MinVectorRegSizeOption.getNumOccurrences())
1985 MinVecRegSize = MinVectorRegSizeOption;
1986 else
1987 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1988 }
1989
1990 /// Vectorize the tree that starts with the elements in \p VL.
1991 /// Returns the vectorized root.
1992 Value *vectorizeTree();
1993
1994 /// Vectorize the tree but with the list of externally used values \p
1995 /// ExternallyUsedValues. Values in this map can be replaced by the
1996 /// generated extractelement instructions.
1997 Value *vectorizeTree(
1998 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1999 Instruction *ReductionRoot = nullptr,
2000 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2001
2002 /// \returns the cost incurred by unwanted spills and fills, caused by
2003 /// holding live values over call sites.
2004 InstructionCost getSpillCost();
2005
2006 /// \returns the vectorization cost of the subtree that starts at \p VL.
2007 /// A negative number means that this is profitable.
2008 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2009 InstructionCost ReductionCost = TTI::TCC_Free);
2010
2011 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2012 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2013 void buildTree(ArrayRef<Value *> Roots,
2014 const SmallDenseSet<Value *> &UserIgnoreLst);
2015
2016 /// Construct a vectorizable tree that starts at \p Roots.
2017 void buildTree(ArrayRef<Value *> Roots);
2018
2019 /// Return the scalars of the root node.
2020 ArrayRef<Value *> getRootNodeScalars() const {
2021 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2022 return VectorizableTree.front()->Scalars;
2023 }
2024
2025 /// Returns the type/is-signed info for the root node in the graph without
2026 /// casting.
2027 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2028 const TreeEntry &Root = *VectorizableTree.front();
2029 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2030 !Root.Scalars.front()->getType()->isIntegerTy())
2031 return std::nullopt;
2032 auto It = MinBWs.find(&Root);
2033 if (It != MinBWs.end())
2034 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2035 It->second.first),
2036 It->second.second);
2037 if (Root.getOpcode() == Instruction::ZExt ||
2038 Root.getOpcode() == Instruction::SExt)
2039 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2040 Root.getOpcode() == Instruction::SExt);
2041 return std::nullopt;
2042 }
2043
2044 /// Checks if the root graph node can be emitted with narrower bitwidth at
2045 /// codegen and returns its signedness, if so.
2046 bool isSignedMinBitwidthRootNode() const {
2047 return MinBWs.at(VectorizableTree.front().get()).second;
2048 }
2049
2050 /// Returns the reduction type after minbitwidth analysis.
2051 FixedVectorType *getReductionType() const {
2052 if (ReductionBitWidth == 0 ||
2053 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2054 ReductionBitWidth >=
2055 DL->getTypeSizeInBits(
2056 VectorizableTree.front()->Scalars.front()->getType()))
2057 return getWidenedType(
2058 VectorizableTree.front()->Scalars.front()->getType(),
2059 VectorizableTree.front()->getVectorFactor());
2060 return getWidenedType(
2061 IntegerType::get(
2062 VectorizableTree.front()->Scalars.front()->getContext(),
2063 ReductionBitWidth),
2064 VectorizableTree.front()->getVectorFactor());
2065 }
2066
2067 /// Builds external uses of the vectorized scalars, i.e. the list of
2068 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2069 /// ExternallyUsedValues contains additional list of external uses to handle
2070 /// vectorization of reductions.
2071 void
2072 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2073
2074 /// Transforms graph nodes to target specific representations, if profitable.
2075 void transformNodes();
2076
2077 /// Clear the internal data structures that are created by 'buildTree'.
2078 void deleteTree() {
2079 VectorizableTree.clear();
2080 ScalarToTreeEntries.clear();
2081 OperandsToTreeEntry.clear();
2082 ScalarsInSplitNodes.clear();
2083 MustGather.clear();
2084 NonScheduledFirst.clear();
2085 EntryToLastInstruction.clear();
2086 LastInstructionToPos.clear();
2087 LoadEntriesToVectorize.clear();
2088 IsGraphTransformMode = false;
2089 GatheredLoadsEntriesFirst.reset();
2090 CompressEntryToData.clear();
2091 ExternalUses.clear();
2092 ExternalUsesAsOriginalScalar.clear();
2093 ExternalUsesWithNonUsers.clear();
2094 for (auto &Iter : BlocksSchedules) {
2095 BlockScheduling *BS = Iter.second.get();
2096 BS->clear();
2097 }
2098 MinBWs.clear();
2099 ReductionBitWidth = 0;
2100 BaseGraphSize = 1;
2101 CastMaxMinBWSizes.reset();
2102 ExtraBitWidthNodes.clear();
2103 InstrElementSize.clear();
2104 UserIgnoreList = nullptr;
2105 PostponedGathers.clear();
2106 ValueToGatherNodes.clear();
2107 TreeEntryToStridedPtrInfoMap.clear();
2108 }
2109
2110 unsigned getTreeSize() const { return VectorizableTree.size(); }
2111
2112 /// Returns the base graph size, before any transformations.
2113 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2114
2115 /// Perform LICM and CSE on the newly generated gather sequences.
2116 void optimizeGatherSequence();
2117
2118 /// Does this non-empty order represent an identity order? Identity
2119 /// should be represented as an empty order, so this is used to
2120 /// decide if we can canonicalize a computed order. Undef elements
2121 /// (represented as size) are ignored.
2122 bool isIdentityOrder(ArrayRef<unsigned> Order) const {
2123 assert(!Order.empty() && "expected non-empty order");
2124 const unsigned Sz = Order.size();
2125 return all_of(enumerate(Order), [&](const auto &P) {
2126 return P.value() == P.index() || P.value() == Sz;
2127 });
2128 }
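// E.g., with Sz == 4, Order = {0, 1, 4, 3} is still an identity order: the
// value 4 at index 2 is the out-of-bounds undef marker and is ignored.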
2129
2130 /// Checks if the specified gather tree entry \p TE can be represented as a
2131 /// shuffled vector entry + (possibly) permutation with other gathers. It
2132 /// implements the checks only for possibly ordered scalars (Loads,
2133 /// ExtractElement, ExtractValue), which can be part of the graph.
2134 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2135 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2136 /// node might be ignored.
2137 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2138 bool TopToBottom,
2139 bool IgnoreReorder);
2140
2141 /// Sort loads into increasing pointers offsets to allow greater clustering.
2142 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2143
2144 /// Gets reordering data for the given tree entry. If the entry is vectorized
2145 /// - just return ReorderIndices, otherwise check if the scalars can be
2146 /// reordered and return the most optimal order.
2147 /// \return std::nullopt if ordering is not important, empty order, if
2148 /// identity order is important, or the actual order.
2149 /// \param TopToBottom If true, include the order of vectorized stores and
2150 /// insertelement nodes, otherwise skip them.
2151 /// \param IgnoreReorder true, if the root node order can be ignored.
2152 std::optional<OrdersType>
2153 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2154
2155 /// Checks if it is profitable to reorder the current tree.
2156 /// If the tree does not contain many profitable reorderable nodes, it is
2157 /// better to skip it to save compile time.
2158 bool isProfitableToReorder() const;
2159
2160 /// Reorders the current graph to the most profitable order starting from the
2161 /// root node to the leaf nodes. The best order is chosen only from the nodes
2162 /// of the same size (vectorization factor). Smaller nodes are considered
2163 /// parts of subgraph with smaller VF and they are reordered independently. We
2164 /// parts of a subgraph with a smaller VF and they are reordered independently.
2165 /// We can do this because we still need to extend smaller nodes to the wider VF
2166 void reorderTopToBottom();
2167
2168 /// Reorders the current graph to the most profitable order starting from
2169 /// leaves to the root. It allows rotating small subgraphs and reducing the
2170 /// number of reshuffles if the leaf nodes use the same order. In this case we
2171 /// can merge the orders and just shuffle the user node instead of shuffling
2172 /// its operands. Plus, even if the leaf nodes have different orders, it allows
2173 /// sinking the reordering in the graph closer to the root node and merging it
2174 /// later during analysis.
2175 void reorderBottomToTop(bool IgnoreReorder = false);
2176
2177 /// \return The vector element size in bits to use when vectorizing the
2178 /// expression tree ending at \p V. If V is a store, the size is the width of
2179 /// the stored value. Otherwise, the size is the width of the largest loaded
2180 /// value reaching V. This method is used by the vectorizer to calculate
2181 /// vectorization factors.
2182 unsigned getVectorElementSize(Value *V);
2183
2184 /// Compute the minimum type sizes required to represent the entries in a
2185 /// vectorizable tree.
2186 void computeMinimumValueSizes();
2187
2188 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2189 unsigned getMaxVecRegSize() const {
2190 return MaxVecRegSize;
2191 }
2192
2193 // \returns minimum vector register size as set by cl::opt.
2194 unsigned getMinVecRegSize() const {
2195 return MinVecRegSize;
2196 }
2197
2198 unsigned getMinVF(unsigned Sz) const {
2199 return std::max(2U, getMinVecRegSize() / Sz);
2200 }
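// E.g., with a minimum vector register size of 128 bits and Sz == 32 (the
// element width in bits), getMinVF returns max(2, 128 / 32) == 4.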
2201
2202 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2203 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2204 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2205 return MaxVF ? MaxVF : UINT_MAX;
2206 }
2207
2208 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2209 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2210 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2211 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2212 ///
2213 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2214 unsigned canMapToVector(Type *T) const;
2215
2216 /// \returns True if the VectorizableTree is both tiny and not fully
2217 /// vectorizable. We do not vectorize such trees.
2218 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2219
2220 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2221 /// It may happen, if all gather nodes are loads and they cannot be
2222 /// "clusterized". In this case even subgraphs cannot be vectorized more
2223 /// effectively than the base graph.
2224 bool isTreeNotExtendable() const;
2225
2226 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2227 /// can be load combined in the backend. Load combining may not be allowed in
2228 /// the IR optimizer, so we do not want to alter the pattern. For example,
2229 /// partially transforming a scalar bswap() pattern into vector code is
2230 /// effectively impossible for the backend to undo.
2231 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2232 /// may not be necessary.
2233 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2234
2235 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2236 /// can be load combined in the backend. Load combining may not be allowed in
2237 /// the IR optimizer, so we do not want to alter the pattern. For example,
2238 /// partially transforming a scalar bswap() pattern into vector code is
2239 /// effectively impossible for the backend to undo.
2240 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2241 /// may not be necessary.
2242 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2243 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2244 Align Alignment, const int64_t Diff,
2245 const size_t Sz) const;
2246
2247 /// Return true if an array of scalar loads can be replaced with a strided
2248 /// load (with constant stride).
2249 ///
2250 /// It is possible that the load gets "widened". Suppose that originally each
2251 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2252 /// constant):
/// %b + 0 * %s + 0
/// %b + 0 * %s + 1
/// %b + 0 * %s + 2
2253 /// ...
2254 /// %b + 0 * %s + (w - 1)
2255 ///
2256 /// %b + 1 * %s + 0
2257 /// %b + 1 * %s + 1
2258 /// %b + 1 * %s + 2
2259 /// ...
2260 /// %b + 1 * %s + (w - 1)
2261 /// ...
2262 ///
2263 /// %b + (n - 1) * %s + 0
2264 /// %b + (n - 1) * %s + 1
2265 /// %b + (n - 1) * %s + 2
2266 /// ...
2267 /// %b + (n - 1) * %s + (w - 1)
2268 ///
2269 /// In this case we will generate a strided load of type `<n x (k * w)>`.
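/// For instance, with k == 1, w == 4, n == 2 and %s == 16, the eight loads at
/// %b, %b+1, %b+2, %b+3, %b+16, %b+17, %b+18, %b+19 become one strided load
/// of type <2 x i32> with a 16-byte stride.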
2270 ///
2271 /// \param PointerOps list of pointer arguments of loads.
2272 /// \param ElemTy original scalar type of loads.
2273 /// \param Alignment alignment of the first load.
2274 /// \param SortedIndices is the order of PointerOps as returned by
2275 /// `sortPtrAccesses`
2276 /// \param Diff Pointer difference between the lowest and the highest pointer
2277 /// in `PointerOps` as returned by `getPointersDiff`.
2278 /// \param Ptr0 first pointer in `PointersOps`.
2279 /// \param PtrN last pointer in `PointersOps`.
2280 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2281 /// of `SPtrInfo` necessary to generate the strided load later.
2282 bool analyzeConstantStrideCandidate(
2283 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2284 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2285 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2286
2287 /// Return true if an array of scalar loads can be replaced with a strided
2288 /// load (with run-time stride).
2289 /// \param PointerOps list of pointer arguments of loads.
2290 /// \param ScalarTy type of loads.
2291 /// \param CommonAlignment common alignment of loads as computed by
2292 /// `computeCommonAlignment<LoadInst>`.
2293 /// \param SortedIndices is a list of indices computed by this function such
2294 /// that the sequence `PointerOps[SortedIndices[0]],
2295 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2296 /// ordered by the coefficient of the stride. For example, if PointerOps is
2297 /// `%base + %stride, %base, %base + 2 * %stride` the `SortedIndices` will be
2298 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` would be
2299 /// `0, 1, 2, 3, ...` we return an empty vector instead.
2300 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2301 /// of `SPtrInfo` necessary to generate the strided load later.
2302 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2303 Align CommonAlignment,
2304 SmallVectorImpl<unsigned> &SortedIndices,
2305 StridedPtrInfo &SPtrInfo) const;
2306
2307 /// Checks if the given array of loads can be represented as a vectorized,
2308 /// scatter or just simple gather.
2309 /// \param VL list of loads.
2310 /// \param VL0 main load value.
2311 /// \param Order returned order of load instructions.
2312 /// \param PointerOps returned list of pointer operands.
2313 /// \param BestVF return best vector factor, if recursive check found better
2314 /// vectorization sequences rather than masked gather.
2315 /// \param TryRecursiveCheck used to check if long masked gather can be
2316 /// represented as a series of loads/insert subvector, if profitable.
2317 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2318 SmallVectorImpl<unsigned> &Order,
2319 SmallVectorImpl<Value *> &PointerOps,
2320 StridedPtrInfo &SPtrInfo,
2321 unsigned *BestVF = nullptr,
2322 bool TryRecursiveCheck = true) const;
2323
2324 /// Registers non-vectorizable sequence of loads
2325 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2326 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2327 }
2328
2329 /// Checks if the given loads sequence is known as not vectorizable
2330 template <typename T>
2331 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2332 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2333 }
2334
2335 OptimizationRemarkEmitter *getORE() { return ORE; }
2336
2337 /// This structure holds any data we need about the edges being traversed
2338 /// during buildTreeRec(). We keep track of:
2339 /// (i) the user TreeEntry index, and
2340 /// (ii) the index of the edge.
2341 struct EdgeInfo {
2342 EdgeInfo() = default;
2343 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2344 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2345 /// The user TreeEntry.
2346 TreeEntry *UserTE = nullptr;
2347 /// The operand index of the use.
2348 unsigned EdgeIdx = UINT_MAX;
2349#ifndef NDEBUG
2350 friend inline raw_ostream &operator<<(raw_ostream &OS,
2351 const BoUpSLP::EdgeInfo &EI) {
2352 EI.dump(OS);
2353 return OS;
2354 }
2355 /// Debug print.
2356 void dump(raw_ostream &OS) const {
2357 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2358 << " EdgeIdx:" << EdgeIdx << "}";
2359 }
2360 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2361#endif
2362 bool operator == (const EdgeInfo &Other) const {
2363 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2364 }
2365
2366 operator bool() const { return UserTE != nullptr; }
2367 };
2368 friend struct DenseMapInfo<EdgeInfo>;
2369
2370 /// A helper class used for scoring candidates for two consecutive lanes.
2371 class LookAheadHeuristics {
2372 const TargetLibraryInfo &TLI;
2373 const DataLayout &DL;
2374 ScalarEvolution &SE;
2375 const BoUpSLP &R;
2376 int NumLanes; // Total number of lanes (aka vectorization factor).
2377 int MaxLevel; // The maximum recursion depth for accumulating score.
2378
2379 public:
2380 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2381 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2382 int MaxLevel)
2383 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2384 MaxLevel(MaxLevel) {}
2385
2386 // The hard-coded scores listed here are not very important, though it shall
2387 // be higher for better matches to improve the resulting cost. When
2388 // computing the scores of matching one sub-tree with another, we are
2389 // basically counting the number of values that are matching. So even if all
2390 // scores are set to 1, we would still get a decent matching result.
2391 // However, sometimes we have to break ties. For example we may have to
2392 // choose between matching loads vs matching opcodes. This is what these
2393 // scores are helping us with: they provide the order of preference. Also,
2394 // this is important if the scalar is externally used or used in another
2395 // tree entry node in the different lane.
2396
2397 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2398 static const int ScoreConsecutiveLoads = 4;
2399 /// The same load multiple times. This should have a better score than
2400 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2401 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2402 /// a vector load plus 1.0 for a broadcast.
2403 static const int ScoreSplatLoads = 3;
2404 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2405 static const int ScoreReversedLoads = 3;
2406 /// A load candidate for masked gather.
2407 static const int ScoreMaskedGatherCandidate = 1;
2408 /// ExtractElementInst from same vector and consecutive indexes.
2409 static const int ScoreConsecutiveExtracts = 4;
2410 /// ExtractElementInst from same vector and reversed indices.
2411 static const int ScoreReversedExtracts = 3;
2412 /// Constants.
2413 static const int ScoreConstants = 2;
2414 /// Instructions with the same opcode.
2415 static const int ScoreSameOpcode = 2;
2416 /// Instructions with alt opcodes (e.g, add + sub).
2417 static const int ScoreAltOpcodes = 1;
2418 /// Identical instructions (a.k.a. splat or broadcast).
2419 static const int ScoreSplat = 1;
2420 /// Matching with an undef is preferable to failing.
2421 static const int ScoreUndef = 1;
2422 /// Score for failing to find a decent match.
2423 static const int ScoreFail = 0;
2424 /// Score if all users are vectorized.
2425 static const int ScoreAllUserVectorized = 1;
2426
2427 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2428 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2429 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2430 /// MainAltOps.
2431 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2432 ArrayRef<Value *> MainAltOps) const {
2433 if (!isValidElementType(V1->getType()) ||
2434 !isValidElementType(V2->getType()))
2435 return LookAheadHeuristics::ScoreFail;
2436
2437 if (V1 == V2) {
2438 if (isa<LoadInst>(V1)) {
2439 // Returns true if the users of V1 and V2 won't need to be extracted.
2440 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2441 // Bail out if we have too many uses to save compilation time.
2442 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2443 return false;
2444
2445 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2446 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2447 return U == U1 || U == U2 || R.isVectorized(U);
2448 });
2449 };
2450 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2451 };
2452 // A broadcast of a load can be cheaper on some targets.
2453 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2454 ElementCount::getFixed(NumLanes)) &&
2455 ((int)V1->getNumUses() == NumLanes ||
2456 AllUsersAreInternal(V1, V2)))
2457 return LookAheadHeuristics::ScoreSplatLoads;
2458 }
2459 return LookAheadHeuristics::ScoreSplat;
2460 }
2461
2462 auto CheckSameEntryOrFail = [&]() {
2463 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2464 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2465 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2466 !TEs2.empty() &&
2467 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2468 return LookAheadHeuristics::ScoreSplatLoads;
2469 }
2470 return LookAheadHeuristics::ScoreFail;
2471 };
2472
2473 auto *LI1 = dyn_cast<LoadInst>(V1);
2474 auto *LI2 = dyn_cast<LoadInst>(V2);
2475 if (LI1 && LI2) {
2476 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2477 !LI2->isSimple())
2478 return CheckSameEntryOrFail();
2479
2480 std::optional<int64_t> Dist = getPointersDiff(
2481 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2482 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2483 if (!Dist || *Dist == 0) {
2484 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2485 getUnderlyingObject(LI2->getPointerOperand()) &&
2486 R.TTI->isLegalMaskedGather(
2487 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2488 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2489 return CheckSameEntryOrFail();
2490 }
2491 // The distance is too large - still may be profitable to use masked
2492 // loads/gathers.
2493 if (std::abs(*Dist) > NumLanes / 2)
2494 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2495 // This still will detect consecutive loads, but we might have "holes"
2496 // in some cases. It is ok for non-power-2 vectorization and may produce
2497 // better results. It should not affect current vectorization.
2498 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2499 : LookAheadHeuristics::ScoreReversedLoads;
2500 }
2501
2502 auto *C1 = dyn_cast<Constant>(V1);
2503 auto *C2 = dyn_cast<Constant>(V2);
2504 if (C1 && C2)
2505 return LookAheadHeuristics::ScoreConstants;
2506
2507 // Consider constants and buildvector compatible.
2508 if ((C1 && isa<InsertElementInst>(V2)) ||
2509 (C2 && isa<InsertElementInst>(V1)))
2510 return LookAheadHeuristics::ScoreConstants;
2511
2512 // Extracts from consecutive indexes of the same vector better score as
2513 // the extracts could be optimized away.
2514 Value *EV1;
2515 ConstantInt *Ex1Idx;
2516 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2517 // Undefs are always profitable for extractelements.
2518 // Compiler can easily combine poison and extractelement <non-poison> or
2519 // undef and extractelement <poison>. But combining undef +
2520 // extractelement <non-poison-but-may-produce-poison> requires some
2521 // extra operations.
2522 if (isa<UndefValue>(V2))
2523 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2524 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2525 : LookAheadHeuristics::ScoreSameOpcode;
2526 Value *EV2 = nullptr;
2527 ConstantInt *Ex2Idx = nullptr;
2528 if (match(V2,
2529 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2530 m_Undef())))) {
2531 // Undefs are always profitable for extractelements.
2532 if (!Ex2Idx)
2533 return LookAheadHeuristics::ScoreSameOpcode;
2534 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2535 return LookAheadHeuristics::ScoreSameOpcode;
2536 if (EV2 == EV1) {
2537 int Idx1 = Ex1Idx->getZExtValue();
2538 int Idx2 = Ex2Idx->getZExtValue();
2539 int Dist = Idx2 - Idx1;
2540 // The distance is too large - still may be profitable to use
2541 // shuffles.
2542 if (std::abs(Dist) == 0)
2543 return LookAheadHeuristics::ScoreSplat;
2544 if (std::abs(Dist) > NumLanes / 2)
2545 return LookAheadHeuristics::ScoreSameOpcode;
2546 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2547 : LookAheadHeuristics::ScoreReversedExtracts;
2548 }
2549 return LookAheadHeuristics::ScoreAltOpcodes;
2550 }
2551 return CheckSameEntryOrFail();
2552 }
2553
2554 auto *I1 = dyn_cast<Instruction>(V1);
2555 auto *I2 = dyn_cast<Instruction>(V2);
2556 if (I1 && I2) {
2557 if (I1->getParent() != I2->getParent())
2558 return CheckSameEntryOrFail();
2559 SmallVector<Value *, 4> Ops(MainAltOps);
2560 Ops.push_back(I1);
2561 Ops.push_back(I2);
2562 InstructionsState S = getSameOpcode(Ops, TLI);
2563 // Note: Only consider instructions with <= 2 operands to avoid
2564 // complexity explosion.
2565 if (S &&
2566 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2567 !S.isAltShuffle()) &&
2568 all_of(Ops, [&S](Value *V) {
2569 return isa<PoisonValue>(V) ||
2570 cast<Instruction>(V)->getNumOperands() ==
2571 S.getMainOp()->getNumOperands();
2572 }))
2573 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2574 : LookAheadHeuristics::ScoreSameOpcode;
2575 }
2576
2577 if (I1 && isa<PoisonValue>(V2))
2578 return LookAheadHeuristics::ScoreSameOpcode;
2579
2580 if (isa<UndefValue>(V2))
2581 return LookAheadHeuristics::ScoreUndef;
2582
2583 return CheckSameEntryOrFail();
2584 }
2585
2586 /// Go through the operands of \p LHS and \p RHS recursively until
2587 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2588 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2589 /// of \p U1 and \p U2), except at the beginning of the recursion where
2590 /// these are set to nullptr.
2591 ///
2592 /// For example:
2593 /// \verbatim
2594 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2595 /// \ / \ / \ / \ /
2596 /// + + + +
2597 /// G1 G2 G3 G4
2598 /// \endverbatim
2599 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2600 /// each level recursively, accumulating the score. It starts from matching
2601 /// the additions at level 0, then moves on to the loads (level 1). The
2602 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2603 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2604 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2605 /// Please note that the order of the operands does not matter, as we
2606 /// evaluate the score of all profitable combinations of operands. In
2607 /// other words the score of G1 and G4 is the same as G1 and G2. This
2608 /// heuristic is based on ideas described in:
2609 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2610 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2611 /// Luís F. W. Góes
2612 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2613 Instruction *U2, int CurrLevel,
2614 ArrayRef<Value *> MainAltOps) const {
2615
2616 // Get the shallow score of V1 and V2.
2617 int ShallowScoreAtThisLevel =
2618 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2619
2620 // If reached MaxLevel,
2621 // or if V1 and V2 are not instructions,
2622 // or if they are SPLAT,
2623 // or if they are not consecutive,
2624 // or if profitable to vectorize loads or extractelements, early return
2625 // the current cost.
2626 auto *I1 = dyn_cast<Instruction>(LHS);
2627 auto *I2 = dyn_cast<Instruction>(RHS);
2628 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2629 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2630 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2631 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2632 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2633 ShallowScoreAtThisLevel))
2634 return ShallowScoreAtThisLevel;
2635 assert(I1 && I2 && "Should have early exited.");
2636
2637 // Contains the I2 operand indexes that got matched with I1 operands.
2638 SmallSet<unsigned, 4> Op2Used;
2639
2640 // Recursion towards the operands of I1 and I2. We are trying all possible
2641 // operand pairs, and keeping track of the best score.
2642 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2643 OpIdx1 != NumOperands1; ++OpIdx1) {
2644 // Try to pair the operand at OpIdx1 with the best operand of I2.
2645 int MaxTmpScore = 0;
2646 unsigned MaxOpIdx2 = 0;
2647 bool FoundBest = false;
2648 // If I2 is commutative try all combinations.
2649 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2650 unsigned ToIdx = isCommutative(I2)
2651 ? I2->getNumOperands()
2652 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2653 assert(FromIdx <= ToIdx && "Bad index");
2654 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2655 // Skip operands already paired with OpIdx1.
2656 if (Op2Used.count(OpIdx2))
2657 continue;
2658 // Recursively calculate the cost at each level
2659 int TmpScore =
2660 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2661 I1, I2, CurrLevel + 1, {});
2662 // Look for the best score.
2663 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2664 TmpScore > MaxTmpScore) {
2665 MaxTmpScore = TmpScore;
2666 MaxOpIdx2 = OpIdx2;
2667 FoundBest = true;
2668 }
2669 }
2670 if (FoundBest) {
2671 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2672 Op2Used.insert(MaxOpIdx2);
2673 ShallowScoreAtThisLevel += MaxTmpScore;
2674 }
2675 }
2676 return ShallowScoreAtThisLevel;
2677 }
2678 };
2679 /// A helper data structure to hold the operands of a vector of instructions.
2680 /// This supports a fixed vector length for all operand vectors.
2681 class VLOperands {
2682 /// For each operand we need (i) the value, and (ii) the opcode that it
2683 /// would be attached to if the expression was in a left-linearized form.
2684 /// This is required to avoid illegal operand reordering.
2685 /// For example:
2686 /// \verbatim
2687 /// 0 Op1
2688 /// |/
2689 /// Op1 Op2 Linearized + Op2
2690 /// \ / ----------> |/
2691 /// - -
2692 ///
2693 /// Op1 - Op2 (0 + Op1) - Op2
2694 /// \endverbatim
2695 ///
2696 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2697 ///
2698 /// Another way to think of this is to track all the operations across the
2699 /// path from the operand all the way to the root of the tree and to
2700 /// calculate the operation that corresponds to this path. For example, the
2701 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2702 /// corresponding operation is a '-' (which matches the one in the
2703 /// linearized tree, as shown above).
2704 ///
2705 /// For lack of a better term, we refer to this operation as Accumulated
2706 /// Path Operation (APO).
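/// E.g., in (a + b) - (c - d) the left-linearized form is a + b - c + d, so
/// a, b, and d have APO 'false' while c has APO 'true'.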
2707 struct OperandData {
2708 OperandData() = default;
2709 OperandData(Value *V, bool APO, bool IsUsed)
2710 : V(V), APO(APO), IsUsed(IsUsed) {}
2711 /// The operand value.
2712 Value *V = nullptr;
2713 /// TreeEntries only allow a single opcode, or an alternate sequence of
2714 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2715 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2716 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2717 /// (e.g., Add/Mul)
2718 bool APO = false;
2719 /// Helper data for the reordering function.
2720 bool IsUsed = false;
2721 };
2722
2723 /// During operand reordering, we are trying to select the operand at lane
2724 /// that matches best with the operand at the neighboring lane. Our
2725 /// selection is based on the type of value we are looking for. For example,
2726 /// if the neighboring lane has a load, we need to look for a load that is
2727 /// accessing a consecutive address. These strategies are summarized in the
2728 /// 'ReorderingMode' enumerator.
2729 enum class ReorderingMode {
2730 Load, ///< Matching loads to consecutive memory addresses
2731 Opcode, ///< Matching instructions based on opcode (same or alternate)
2732 Constant, ///< Matching constants
2733 Splat, ///< Matching the same instruction multiple times (broadcast)
2734 Failed, ///< We failed to create a vectorizable group
2735 };
2736
2737 using OperandDataVec = SmallVector<OperandData, 2>;
2738
2739 /// A vector of operand vectors.
2740 SmallVector<OperandDataVec, 4> OpsVec;
2741 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2742 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2743 unsigned ArgSize = 0;
2744
2745 const TargetLibraryInfo &TLI;
2746 const DataLayout &DL;
2747 ScalarEvolution &SE;
2748 const BoUpSLP &R;
2749 const Loop *L = nullptr;
2750
2751 /// \returns the operand data at \p OpIdx and \p Lane.
2752 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2753 return OpsVec[OpIdx][Lane];
2754 }
2755
2756 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2757 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2758 return OpsVec[OpIdx][Lane];
2759 }
2760
2761 /// Clears the used flag for all entries.
2762 void clearUsed() {
2763 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2764 OpIdx != NumOperands; ++OpIdx)
2765 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2766 ++Lane)
2767 OpsVec[OpIdx][Lane].IsUsed = false;
2768 }
2769
2770 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2771 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2772 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2773 }
2774
2775 /// \param Lane lane of the operands under analysis.
2776 /// \param OpIdx operand index in \p Lane lane we're looking the best
2777 /// candidate for.
2778 /// \param Idx operand index of the current candidate value.
2779 /// \returns The additional score due to possible broadcasting of the
2780 /// elements in the lane. It is more profitable to have a power-of-2 number
2781 /// of unique elements in the lane, since such a lane will be vectorized with
2782 /// higher probability after removing duplicates. Currently the SLP vectorizer
2783 /// supports only vectorization of a power-of-2 number of unique scalars.
2784 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2785 const SmallBitVector &UsedLanes) const {
2786 Value *IdxLaneV = getData(Idx, Lane).V;
2787 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2788 isa<ExtractElementInst>(IdxLaneV))
2789 return 0;
2790 SmallDenseMap<Value *, unsigned, 4> Uniques;
2791 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2792 if (Ln == Lane)
2793 continue;
2794 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2795 if (!isa<Instruction>(OpIdxLnV))
2796 return 0;
2797 Uniques.try_emplace(OpIdxLnV, Ln);
2798 }
2799 unsigned UniquesCount = Uniques.size();
2800 auto IdxIt = Uniques.find(IdxLaneV);
2801 unsigned UniquesCntWithIdxLaneV =
2802 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2803 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2804 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2805 unsigned UniquesCntWithOpIdxLaneV =
2806 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2807 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2808 return 0;
2809 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2810 UniquesCntWithOpIdxLaneV,
2811 UniquesCntWithOpIdxLaneV -
2812 bit_floor(UniquesCntWithOpIdxLaneV)) -
2813 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2814 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2815 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2816 }
2817
2818 /// \param Lane lane of the operands under analysis.
2819 /// \param OpIdx operand index in \p Lane lane we're looking the best
2820 /// candidate for.
2821 /// \param Idx operand index of the current candidate value.
2822 /// \returns The additional score for the scalar which users are all
2823 /// vectorized.
2824 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2825 Value *IdxLaneV = getData(Idx, Lane).V;
2826 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2827 // Do not care about number of uses for vector-like instructions
2828 // (extractelement/extractvalue with constant indices), they are extracts
2829 // themselves and already externally used. Vectorization of such
2830 // instructions does not add extra extractelement instruction, just may
2831 // remove it.
2832 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2833 isVectorLikeInstWithConstOps(OpIdxLaneV))
2834 return LookAheadHeuristics::ScoreAllUserVectorized;
2835 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2836 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2837 return 0;
2838 return R.areAllUsersVectorized(IdxLaneI)
2839 ? LookAheadHeuristics::ScoreAllUserVectorized
2840 : 0;
2841 }
2842
2843 /// Score scaling factor for fully compatible instructions but with
2844 /// different number of external uses. Allows better selection of the
2845 /// instructions with less external uses.
2846 static const int ScoreScaleFactor = 10;
2847
2848 /// \Returns the look-ahead score, which tells us how much the sub-trees
2849 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2850 /// score. This helps break ties in an informed way when we cannot decide on
2851 /// the order of the operands by just considering the immediate
2852 /// predecessors.
2853 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2854 int Lane, unsigned OpIdx, unsigned Idx,
2855 bool &IsUsed, const SmallBitVector &UsedLanes) {
2856 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2857 LookAheadMaxDepth);
2858 // Keep track of the instruction stack as we recurse into the operands
2859 // during the look-ahead score exploration.
2860 int Score =
2861 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2862 /*CurrLevel=*/1, MainAltOps);
2863 if (Score) {
2864 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2865 if (Score <= -SplatScore) {
2866 // Failed score.
2867 Score = 0;
2868 } else {
2869 Score += SplatScore;
2870 // Scale score to see the difference between different operands
2871 // and similar operands but all vectorized/not all vectorized
2872 // uses. It does not affect actual selection of the best
2873 // compatible operand in general, just allows to select the
2874 // operand with all vectorized uses.
2875 Score *= ScoreScaleFactor;
2876 Score += getExternalUseScore(Lane, OpIdx, Idx);
2877 IsUsed = true;
2878 }
2879 }
2880 return Score;
2881 }
2882
2883 /// Best defined scores per lanes between the passes. Used to choose the
2884 /// best operand (with the highest score) between the passes.
2885 /// The key - {Operand Index, Lane}.
2886 /// The value - the best score between the passes for the lane and the
2887 /// operand.
2888 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2889 BestScoresPerLanes;
2890
2891 // Search all operands in Ops[*][Lane] for the one that matches best
2892 // Ops[OpIdx][LastLane] and return its operand index.
2893 // If no good match can be found, return std::nullopt.
2894 std::optional<unsigned>
2895 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2896 ArrayRef<ReorderingMode> ReorderingModes,
2897 ArrayRef<Value *> MainAltOps,
2898 const SmallBitVector &UsedLanes) {
2899 unsigned NumOperands = getNumOperands();
2900
2901 // The operand of the previous lane at OpIdx.
2902 Value *OpLastLane = getData(OpIdx, LastLane).V;
2903
2904 // Our strategy mode for OpIdx.
2905 ReorderingMode RMode = ReorderingModes[OpIdx];
2906 if (RMode == ReorderingMode::Failed)
2907 return std::nullopt;
2908
2909 // The linearized opcode of the operand at OpIdx, Lane.
2910 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2911
2912 // The best operand index and its score.
2913 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2914 // are using the score to differentiate between the two.
2915 struct BestOpData {
2916 std::optional<unsigned> Idx;
2917 unsigned Score = 0;
2918 } BestOp;
2919 BestOp.Score =
2920 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2921 .first->second;
2922
2923 // Track if the operand must be marked as used. If the operand is set to
2924 // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
2925 // want to reestimate the operands again on the following iterations.
2926 bool IsUsed = RMode == ReorderingMode::Splat ||
2927 RMode == ReorderingMode::Constant ||
2928 RMode == ReorderingMode::Load;
2929 // Iterate through all unused operands and look for the best.
2930 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2931 // Get the operand at Idx and Lane.
2932 OperandData &OpData = getData(Idx, Lane);
2933 Value *Op = OpData.V;
2934 bool OpAPO = OpData.APO;
2935
2936 // Skip already selected operands.
2937 if (OpData.IsUsed)
2938 continue;
2939
2940 // Skip if we are trying to move the operand to a position with a
2941 // different opcode in the linearized tree form. This would break the
2942 // semantics.
2943 if (OpAPO != OpIdxAPO)
2944 continue;
2945
2946 // Look for an operand that matches the current mode.
2947 switch (RMode) {
2948 case ReorderingMode::Load:
2949 case ReorderingMode::Opcode: {
2950 bool LeftToRight = Lane > LastLane;
2951 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2952 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2953 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2954 OpIdx, Idx, IsUsed, UsedLanes);
2955 if (Score > static_cast<int>(BestOp.Score) ||
2956 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2957 Idx == OpIdx)) {
2958 BestOp.Idx = Idx;
2959 BestOp.Score = Score;
2960 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2961 }
2962 break;
2963 }
2964 case ReorderingMode::Constant:
2965 if (isa<Constant>(Op) ||
2966 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2967 BestOp.Idx = Idx;
2968 if (isa<Constant>(Op)) {
2969 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2970 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2971 LookAheadHeuristics::ScoreConstants;
2972 }
2973 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2974 IsUsed = false;
2975 }
2975 }
2976 break;
2977 case ReorderingMode::Splat:
2978 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2979 IsUsed = Op == OpLastLane;
2980 if (Op == OpLastLane) {
2981 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2982 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2983 LookAheadHeuristics::ScoreSplat;
2984 }
2985 BestOp.Idx = Idx;
2986 }
2987 break;
2988 case ReorderingMode::Failed:
2989 llvm_unreachable("Not expected Failed reordering mode.");
2990 }
2991 }
2992
2993 if (BestOp.Idx) {
2994 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2995 return BestOp.Idx;
2996 }
2997 // If we could not find a good match return std::nullopt.
2998 return std::nullopt;
2999 }
3000
3001 /// Helper for reorderOperandVecs.
3002 /// \returns the lane that we should start reordering from. This is the one
3003 /// which has the fewest operands that can freely move about, or is the
3004 /// least profitable because it already has the most optimal set of operands.
3005 unsigned getBestLaneToStartReordering() const {
3006 unsigned Min = UINT_MAX;
3007 unsigned SameOpNumber = 0;
3008 // std::pair<unsigned, unsigned> is used to implement a simple voting
3009 // algorithm and choose the lane with the least number of operands that
3010 // can freely move about or less profitable because it already has the
3011 // most optimal set of operands. The first unsigned is a counter for
3012 // voting, the second unsigned is the counter of lanes with instructions
3013 // with same/alternate opcodes and same parent basic block.
3014 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
3014 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
3015 // Try to be closer to the original results, if we have multiple lanes
3016 // with same cost. If 2 lanes have the same cost, use the one with the
3017 // highest index.
3018 for (int I = getNumLanes(); I > 0; --I) {
3019 unsigned Lane = I - 1;
3020 OperandsOrderData NumFreeOpsHash =
3021 getMaxNumOperandsThatCanBeReordered(Lane);
3022 // Compare the number of operands that can move and choose the one with
3023 // the least number.
3024 if (NumFreeOpsHash.NumOfAPOs < Min) {
3025 Min = NumFreeOpsHash.NumOfAPOs;
3026 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3027 HashMap.clear();
3028 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3029 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3030 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3031 // Select the most optimal lane in terms of number of operands that
3032 // should be moved around.
3033 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3034 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3035 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3036 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3037 auto [It, Inserted] =
3038 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3039 if (!Inserted)
3040 ++It->second.first;
3041 }
3042 }
3043 // Select the lane with the minimum counter.
3044 unsigned BestLane = 0;
3045 unsigned CntMin = UINT_MAX;
3046 for (const auto &Data : reverse(HashMap)) {
3047 if (Data.second.first < CntMin) {
3048 CntMin = Data.second.first;
3049 BestLane = Data.second.second;
3050 }
3051 }
3052 return BestLane;
3053 }
3054
3055 /// Data structure that helps to reorder operands.
3056 struct OperandsOrderData {
3057 /// The best number of operands with the same APOs, which can be
3058 /// reordered.
3059 unsigned NumOfAPOs = UINT_MAX;
3060 /// Number of operands with the same/alternate instruction opcode and
3061 /// parent.
3062 unsigned NumOpsWithSameOpcodeParent = 0;
3063 /// Hash for the actual operands ordering.
3064 /// Used to fingerprint the operands, actually their position ids and
3065 /// opcode values. It is used in the voting mechanism to find the lane with
3066 /// the least number of operands that can freely move about, or that is less
3067 /// profitable because it already has the most optimal set of operands. It
3068 /// could be replaced with a SmallVector<unsigned>, but the hash code is
3069 /// faster and requires less memory.
3070 unsigned Hash = 0;
3071 };
3072 /// \returns the maximum number of operands that are allowed to be reordered
3073 /// for \p Lane and the number of compatible instructions (with the same
3074 /// parent/opcode). This is used as a heuristic for selecting the first lane
3075 /// to start operand reordering.
3076 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3077 unsigned CntTrue = 0;
3078 unsigned NumOperands = getNumOperands();
3079 // Operands with the same APO can be reordered. We therefore need to count
3080 // how many of them we have for each APO, like this: Cnt[APO] = x.
3081 // Since we only have two APOs, namely true and false, we can avoid using
3082 // a map. Instead we can simply count the number of operands that
3083 // correspond to one of them (in this case the 'true' APO), and calculate
3084 // the other by subtracting it from the total number of operands.
3085 // Operands with the same instruction opcode and parent are more
3086 // profitable since we don't need to move them in many cases; with high
3087 // probability such a lane can already be vectorized effectively.
3088 bool AllUndefs = true;
3089 unsigned NumOpsWithSameOpcodeParent = 0;
3090 Instruction *OpcodeI = nullptr;
3091 BasicBlock *Parent = nullptr;
3092 unsigned Hash = 0;
3093 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3094 const OperandData &OpData = getData(OpIdx, Lane);
3095 if (OpData.APO)
3096 ++CntTrue;
3097 // Use Boyer-Moore majority voting for finding the majority opcode and
3098 // the number of times it occurs.
3099 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3100 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3101 I->getParent() != Parent) {
3102 if (NumOpsWithSameOpcodeParent == 0) {
3103 NumOpsWithSameOpcodeParent = 1;
3104 OpcodeI = I;
3105 Parent = I->getParent();
3106 } else {
3107 --NumOpsWithSameOpcodeParent;
3108 }
3109 } else {
3110 ++NumOpsWithSameOpcodeParent;
3111 }
3112 }
3113 Hash = hash_combine(
3114 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3115 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3116 }
3117 if (AllUndefs)
3118 return {};
3119 OperandsOrderData Data;
3120 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3121 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3122 Data.Hash = Hash;
3123 return Data;
3124 }
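// For illustration, the NumOpsWithSameOpcodeParent bookkeeping above is the
// classic Boyer-Moore majority vote; a self-contained sketch over plain ints
// (the pass effectively votes on (opcode, parent block) pairs instead):
//
//   #include <vector>
//   int majorityCandidate(const std::vector<int> &Xs) {
//     int Candidate = 0, Count = 0;
//     for (int X : Xs) {
//       if (Count == 0) {
//         Candidate = X; // Adopt a new candidate.
//         Count = 1;
//       } else if (X == Candidate) {
//         ++Count; // A vote for the current candidate.
//       } else {
//         --Count; // A vote against it.
//       }
//     }
//     return Candidate; // Majority element, if one exists.
//   }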
3125
3126 /// Go through the instructions in VL and append their operands.
3127 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3128 const InstructionsState &S) {
3129 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3130 assert((empty() || all_of(Operands,
3131 [this](const ValueList &VL) {
3132 return VL.size() == getNumLanes();
3133 })) &&
3134 "Expected same number of lanes");
3135 assert(S.valid() && "InstructionsState is invalid.");
3136 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3137 // arguments to the intrinsic produces the same result.
3138 Instruction *MainOp = S.getMainOp();
3139 unsigned NumOperands = MainOp->getNumOperands();
3140 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3141 OpsVec.resize(ArgSize);
3142 unsigned NumLanes = VL.size();
3143 for (OperandDataVec &Ops : OpsVec)
3144 Ops.resize(NumLanes);
3145 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3146 // Our tree has just 3 nodes: the root and two operands.
3147 // It is therefore trivial to get the APO. We only need to check the
3148 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3149 // operand. The LHS operand of both add and sub is never attached to an
3150 // inverse operation in the linearized form, therefore its APO is
3151 // false. The RHS is true only if V is an inverse operation.
3152
3153 // Since operand reordering is performed on groups of commutative
3154 // operations or alternating sequences (e.g., +, -), we can safely tell
3155 // the inverse operations by checking commutativity.
3156 auto *I = dyn_cast<Instruction>(VL[Lane]);
3157 if (!I && isa<PoisonValue>(VL[Lane])) {
3158 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3159 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3160 continue;
3161 }
3162 bool IsInverseOperation = false;
3163 if (S.isCopyableElement(VL[Lane])) {
3164 // The value is a copyable element.
3165 IsInverseOperation =
3166 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3167 } else {
3168 assert(I && "Expected instruction");
3169 auto [SelectedOp, Ops] = convertTo(I, S);
3170 // We cannot check commutativity by the converted instruction
3171 // (SelectedOp) because isCommutative also examines def-use
3172 // relationships.
3173 IsInverseOperation = !isCommutative(SelectedOp, I);
3174 }
3175 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3176 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3177 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3178 }
3179 }
3180 }
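// A worked example of the APO rules above (illustrative, simplified to the
// single-argument commutativity check): for the bundle {a0 + b0, a1 - b1}
// with main opcode '+', lane 1 holds the inverse operation, so roughly
//
//   bool IsInverseOperation = !isCommutative(I); // '-' is not commutative.
//   bool APO = (OpIdx == 0) ? false : IsInverseOperation;
//
// gives operand 0 the APOs {false, false} and operand 1 the APOs
// {false, true}; only operands with equal APO may later trade places across
// lanes without changing the semantics of the linearized form.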
3181
3182 /// \returns the number of operands.
3183 unsigned getNumOperands() const { return ArgSize; }
3184
3185 /// \returns the number of lanes.
3186 unsigned getNumLanes() const { return OpsVec[0].size(); }
3187
3188 /// \returns the operand value at \p OpIdx and \p Lane.
3189 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3190 return getData(OpIdx, Lane).V;
3191 }
3192
3193 /// \returns true if the data structure is empty.
3194 bool empty() const { return OpsVec.empty(); }
3195
3196 /// Clears the data.
3197 void clear() { OpsVec.clear(); }
3198
3199 /// \returns true if there are enough operands identical to \p Op to fill
3200 /// the whole vector (possibly mixed with constants or loop-invariant values).
3201 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3202 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3203 assert(Op == getValue(OpIdx, Lane) &&
3204 "Op is expected to be getValue(OpIdx, Lane).");
3205 // Small number of loads - try load matching.
3206 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3207 return false;
3208 bool OpAPO = getData(OpIdx, Lane).APO;
3209 bool IsInvariant = L && L->isLoopInvariant(Op);
3210 unsigned Cnt = 0;
3211 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3212 if (Ln == Lane)
3213 continue;
3214 // This is set to true if we found a candidate for broadcast at Lane.
3215 bool FoundCandidate = false;
3216 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3217 OperandData &Data = getData(OpI, Ln);
3218 if (Data.APO != OpAPO || Data.IsUsed)
3219 continue;
3220 Value *OpILane = getValue(OpI, Lane);
3221 bool IsConstantOp = isa<Constant>(OpILane);
3222 // Consider the broadcast candidate if:
3223 // 1. Same value is found in one of the operands.
3224 if (Data.V == Op ||
3225 // 2. The operand in the given lane is not constant but there is a
3226 // constant operand in another lane (which can be moved to the
3227 // given lane). In this case we can represent it as a simple
3228 // permutation of constant and broadcast.
3229 (!IsConstantOp &&
3230 ((Lns > 2 && isa<Constant>(Data.V)) ||
3231 // 2.1. If we have only 2 lanes, need to check that value in the
3232 // next lane does not build same opcode sequence.
3233 (Lns == 2 &&
3234 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3235 isa<Constant>(Data.V)))) ||
3236 // 3. The operand in the current lane is loop invariant (can be
3237 // hoisted out) and another operand is also a loop invariant
3238 // (though not a constant). In this case the whole vector can be
3239 // hoisted out.
3240 // FIXME: need to teach the cost model about this case for better
3241 // estimation.
3242 (IsInvariant && !isa<Constant>(Data.V) &&
3243 !getSameOpcode({Op, Data.V}, TLI) &&
3244 L->isLoopInvariant(Data.V))) {
3245 FoundCandidate = true;
3246 Data.IsUsed = Data.V == Op;
3247 if (Data.V == Op)
3248 ++Cnt;
3249 break;
3250 }
3251 }
3252 if (!FoundCandidate)
3253 return false;
3254 }
3255 return getNumLanes() == 2 || Cnt > 1;
3256 }
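// An illustrative case: for the lanes {x + 1, x + y, x + 2} the value 'x'
// (or a movable constant) is found in every other lane, so shouldBroadcast
// returns true for 'x' and the operand can be emitted as a splat plus a
// cheap shuffle, conceptually <x, x, x> + <1, y, 2>.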
3257
3258 /// Checks if there is at least a single compatible operand in lanes other
3259 /// than \p Lane, compatible with the operand \p Op.
3260 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3261 assert(Op == getValue(OpIdx, Lane) &&
3262 "Op is expected to be getValue(OpIdx, Lane).");
3263 bool OpAPO = getData(OpIdx, Lane).APO;
3264 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3265 if (Ln == Lane)
3266 continue;
3267 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3268 const OperandData &Data = getData(OpI, Ln);
3269 if (Data.APO != OpAPO || Data.IsUsed)
3270 return true;
3271 Value *OpILn = getValue(OpI, Ln);
3272 return (L && L->isLoopInvariant(OpILn)) ||
3273 (getSameOpcode({Op, OpILn}, TLI) &&
3274 allSameBlock({Op, OpILn}));
3275 }))
3276 return true;
3277 }
3278 return false;
3279 }
3280
3281 public:
3282 /// Initialize with all the operands of the instruction vector \p RootVL.
3283 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3284 const InstructionsState &S, const BoUpSLP &R)
3285 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3286 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3287 // Append all the operands of RootVL.
3288 appendOperands(RootVL, Operands, S);
3289 }
3290
3291 /// \returns a value vector with the operands across all lanes for the
3292 /// operand at \p OpIdx.
3293 ValueList getVL(unsigned OpIdx) const {
3294 ValueList OpVL(OpsVec[OpIdx].size());
3295 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3296 "Expected same num of lanes across all operands");
3297 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3298 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3299 return OpVL;
3300 }
3301
3302 // Performs operand reordering for 2 or more operands.
3303 // The original operands are in OpsVec[OpIdx][Lane]; they are reordered
3304 // in place, so afterwards OpsVec[OpIdx][Lane] holds the sorted operands.
3305 void reorder() {
3306 unsigned NumOperands = getNumOperands();
3307 unsigned NumLanes = getNumLanes();
3308 // Each operand has its own mode. We are using this mode to help us select
3309 // the instructions for each lane, so that they match best with the ones
3310 // we have selected so far.
3311 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3312
3313 // This is a greedy single-pass algorithm. We are going over each lane
3314 // once and deciding on the best order right away with no back-tracking.
3315 // However, in order to increase its effectiveness, we start with the lane
3316 // that has operands that can move the least. For example, given the
3317 // following lanes:
3318 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3319 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3320 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3321 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3322 // we will start at Lane 1, since the operands of the subtraction cannot
3323 // be reordered. Then we will visit the rest of the lanes in a circular
3324 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3325
3326 // Find the first lane that we will start our search from.
3327 unsigned FirstLane = getBestLaneToStartReordering();
3328
3329 // Initialize the modes.
3330 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3331 Value *OpLane0 = getValue(OpIdx, FirstLane);
3332 // Keep track if we have instructions with all the same opcode on one
3333 // side.
3334 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3335 // Check if OpLane0 should be broadcast.
3336 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3337 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3338 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3339 else if (isa<LoadInst>(OpILane0))
3340 ReorderingModes[OpIdx] = ReorderingMode::Load;
3341 else
3342 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3343 } else if (isa<Constant>(OpLane0)) {
3344 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3345 } else if (isa<Argument>(OpLane0)) {
3346 // Our best hope is a Splat. It may save some cost in some cases.
3347 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3348 } else {
3349 llvm_unreachable("Unexpected value kind.");
3350 }
3351 }
3352
3353 // Check that we don't have the same operands. No need to reorder if the
3354 // operands are just a perfect or shuffled diamond match. Do not apply this
3355 // shortcut for possible broadcasts or a non-power-of-2 number of scalars
3356 // (just for now).
3357 auto &&SkipReordering = [this]() {
3358 SmallPtrSet<Value *, 4> UniqueValues;
3359 ArrayRef<OperandData> Op0 = OpsVec.front();
3360 for (const OperandData &Data : Op0)
3361 UniqueValues.insert(Data.V);
3362 for (ArrayRef<OperandData> Op :
3363 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3364 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3365 return !UniqueValues.contains(Data.V);
3366 }))
3367 return false;
3368 }
3369 // TODO: Check if we can remove a check for non-power-2 number of
3370 // scalars after full support of non-power-2 vectorization.
3371 return UniqueValues.size() != 2 &&
3372 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3373 UniqueValues.size());
3374 };
3375
3376 // If the initial strategy fails for any of the operand indexes, then we
3377 // perform reordering again in a second pass. This helps avoid assigning
3378 // high priority to the failed strategy, and should improve reordering for
3379 // the non-failed operand indexes.
3380 for (int Pass = 0; Pass != 2; ++Pass) {
3381 // Check if there is no need to reorder operands because they form a
3382 // perfect or shuffled diamond match.
3383 // Need to do it to avoid extra external use cost counting for
3384 // shuffled matches, which may cause regressions.
3385 if (SkipReordering())
3386 break;
3387 // Skip the second pass if the first pass did not fail.
3388 bool StrategyFailed = false;
3389 // Mark all operand data as free to use.
3390 clearUsed();
3391 // We keep the original operand order for the FirstLane, so reorder the
3392 // rest of the lanes. We are visiting the nodes in a circular fashion,
3393 // using FirstLane as the center point and increasing the radius
3394 // distance.
3395 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3396 for (unsigned I = 0; I < NumOperands; ++I)
3397 MainAltOps[I].push_back(getData(I, FirstLane).V);
3398
3399 SmallBitVector UsedLanes(NumLanes);
3400 UsedLanes.set(FirstLane);
3401 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3402 // Visit the lane on the right and then the lane on the left.
3403 for (int Direction : {+1, -1}) {
3404 int Lane = FirstLane + Direction * Distance;
3405 if (Lane < 0 || Lane >= (int)NumLanes)
3406 continue;
3407 UsedLanes.set(Lane);
3408 int LastLane = Lane - Direction;
3409 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3410 "Out of bounds");
3411 // Look for a good match for each operand.
3412 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3413 // Search for the operand that best matches the one chosen for LastLane.
3414 std::optional<unsigned> BestIdx =
3415 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3416 MainAltOps[OpIdx], UsedLanes);
3417 // By not selecting a value, we allow the operands that follow to
3418 // select a better matching value. We will get a non-null value in
3419 // the next run of getBestOperand().
3420 if (BestIdx) {
3421 // Swap the current operand with the one returned by
3422 // getBestOperand().
3423 swap(OpIdx, *BestIdx, Lane);
3424 } else {
3425 // Enable the second pass.
3426 StrategyFailed = true;
3427 }
3428 // Try to get the alternate opcode and follow it during analysis.
3429 if (MainAltOps[OpIdx].size() != 2) {
3430 OperandData &AltOp = getData(OpIdx, Lane);
3431 InstructionsState OpS =
3432 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3433 if (OpS && OpS.isAltShuffle())
3434 MainAltOps[OpIdx].push_back(AltOp.V);
3435 }
3436 }
3437 }
3438 }
3439 // Skip second pass if the strategy did not fail.
3440 if (!StrategyFailed)
3441 break;
3442 }
3443 }
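// For illustration, a minimal sketch of the circular visit order used by
// reorder() above: starting from FirstLane, alternately step right and left
// with growing distance, skipping out-of-range lanes.
//
//   #include <vector>
//   std::vector<int> circularOrder(int NumLanes, int FirstLane) {
//     std::vector<int> Order{FirstLane};
//     for (int Distance = 1; Distance != NumLanes; ++Distance)
//       for (int Direction : {+1, -1}) {
//         int Lane = FirstLane + Direction * Distance;
//         if (Lane >= 0 && Lane < NumLanes)
//           Order.push_back(Lane); // Next lane whose operands are matched.
//       }
//     return Order; // E.g. NumLanes = 4, FirstLane = 1 -> {1, 2, 0, 3}.
//   }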
3444
3445#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3446 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3447 switch (RMode) {
3448 case ReorderingMode::Load:
3449 return "Load";
3450 case ReorderingMode::Opcode:
3451 return "Opcode";
3452 case ReorderingMode::Constant:
3453 return "Constant";
3454 case ReorderingMode::Splat:
3455 return "Splat";
3456 case ReorderingMode::Failed:
3457 return "Failed";
3458 }
3459 llvm_unreachable("Unimplemented Reordering Type");
3460 }
3461
3462 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3463 raw_ostream &OS) {
3464 return OS << getModeStr(RMode);
3465 }
3466
3467 /// Debug print.
3468 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3469 printMode(RMode, dbgs());
3470 }
3471
3472 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3473 return printMode(RMode, OS);
3474 }
3475
3476 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3477 const unsigned Indent = 2;
3478 unsigned Cnt = 0;
3479 for (const OperandDataVec &OpDataVec : OpsVec) {
3480 OS << "Operand " << Cnt++ << "\n";
3481 for (const OperandData &OpData : OpDataVec) {
3482 OS.indent(Indent) << "{";
3483 if (Value *V = OpData.V)
3484 OS << *V;
3485 else
3486 OS << "null";
3487 OS << ", APO:" << OpData.APO << "}\n";
3488 }
3489 OS << "\n";
3490 }
3491 return OS;
3492 }
3493
3494 /// Debug print.
3495 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3496#endif
3497 };
3498
3499 /// Evaluate each pair in \p Candidates and return the index into
3500 /// \p Candidates of the pair with the highest score, deemed to have the best
3501 /// chance to form the root of a profitable tree to vectorize. Return
3502 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
3503 /// \param Limit Lower limit of the cost, considered to be a good enough score.
3504 std::optional<int>
3505 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3506 int Limit = LookAheadHeuristics::ScoreFail) const {
3507 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3508 RootLookAheadMaxDepth);
3509 int BestScore = Limit;
3510 std::optional<int> Index;
3511 for (int I : seq<int>(0, Candidates.size())) {
3512 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3513 Candidates[I].second,
3514 /*U1=*/nullptr, /*U2=*/nullptr,
3515 /*CurrLevel=*/1, {});
3516 if (Score > BestScore) {
3517 BestScore = Score;
3518 Index = I;
3519 }
3520 }
3521 return Index;
3522 }
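// A hypothetical usage sketch (the names below are illustrative only):
// findBestRootPair is a plain arg-max with a score floor over candidate
// root pairs.
//
//   // Given a BoUpSLP instance R and some candidate pairs:
//   SmallVector<std::pair<Value *, Value *>> Candidates = /* ... */;
//   if (std::optional<int> Best = R.findBestRootPair(Candidates)) {
//     // Candidates[*Best] is the most promising pair to seed a tree.
//   } // std::nullopt: nothing scored above LookAheadHeuristics::ScoreFail.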
3523
3524 /// Checks if the instruction is marked for deletion.
3525 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3526
3527 /// Removes an instruction from its block and eventually deletes it.
3528 /// It's like Instruction::eraseFromParent() except that the actual deletion
3529 /// is delayed until BoUpSLP is destructed.
3530 void eraseInstruction(Instruction *I) {
3531 DeletedInstructions.insert(I);
3532 }
3533
3534 /// Remove instructions from the parent function and clear the operands of \p
3535 /// DeadVals instructions, marking for deletion trivially dead operands.
3536 template <typename T>
3537 void removeInstructionsAndOperands(
3538 ArrayRef<T *> DeadVals,
3539 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3540 SmallVector<WeakTrackingVH> DeadInsts;
3541 for (T *V : DeadVals) {
3542 auto *I = cast<Instruction>(V);
3543 DeletedInstructions.insert(I);
3544 }
3545 DenseSet<Value *> Processed;
3546 for (T *V : DeadVals) {
3547 if (!V || !Processed.insert(V).second)
3548 continue;
3549 auto *I = cast<Instruction>(V);
3550 salvageDebugInfo(*I);
3551 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3552 for (Use &U : I->operands()) {
3553 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3554 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3555 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3556 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3557 return Entry->VectorizedValue == OpI;
3558 })))
3559 DeadInsts.push_back(OpI);
3560 }
3561 I->dropAllReferences();
3562 }
3563 for (T *V : DeadVals) {
3564 auto *I = cast<Instruction>(V);
3565 if (!I->getParent())
3566 continue;
3567 assert((I->use_empty() || all_of(I->uses(),
3568 [&](Use &U) {
3569 return isDeleted(
3570 cast<Instruction>(U.getUser()));
3571 })) &&
3572 "trying to erase instruction with users.");
3573 I->removeFromParent();
3574 SE->forgetValue(I);
3575 }
3576 // Process the dead instruction list until empty.
3577 while (!DeadInsts.empty()) {
3578 Value *V = DeadInsts.pop_back_val();
3579 Instruction *VI = cast_or_null<Instruction>(V);
3580 if (!VI || !VI->getParent())
3581 continue;
3582 assert(isInstructionTriviallyDead(VI, TLI) &&
3583 "Live instruction found in dead worklist!");
3584 assert(VI->use_empty() && "Instructions with uses are not dead.");
3585
3586 // Don't lose the debug info while deleting the instructions.
3587 salvageDebugInfo(*VI);
3588
3589 // Null out all of the instruction's operands to see if any operand
3590 // becomes dead as we go.
3591 for (Use &OpU : VI->operands()) {
3592 Value *OpV = OpU.get();
3593 if (!OpV)
3594 continue;
3595 OpU.set(nullptr);
3596
3597 if (!OpV->use_empty())
3598 continue;
3599
3600 // If the operand is an instruction that became dead as we nulled out
3601 // the operand, and if it is 'trivially' dead, delete it in a future
3602 // loop iteration.
3603 if (auto *OpI = dyn_cast<Instruction>(OpV))
3604 if (!DeletedInstructions.contains(OpI) &&
3605 (!OpI->getType()->isVectorTy() ||
3606 none_of(VectorValuesAndScales,
3607 [&](const std::tuple<Value *, unsigned, bool> &V) {
3608 return std::get<0>(V) == OpI;
3609 })) &&
3610 wouldInstructionBeTriviallyDead(OpI, TLI))
3611 DeadInsts.push_back(OpI);
3612 }
3613
3614 VI->removeFromParent();
3615 eraseInstruction(VI);
3616 SE->forgetValue(VI);
3617 }
3618 }
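// The loop above follows the standard "null out operands, then sweep"
// worklist pattern for delayed DCE. A generic sketch, ignoring the
// SLP-specific bookkeeping (tree entries, vector values, scheduling):
//
//   SmallVector<WeakTrackingVH> Worklist = /* known-dead instructions */;
//   while (!Worklist.empty()) {
//     Value *V = Worklist.pop_back_val();
//     auto *I = dyn_cast_or_null<Instruction>(V);
//     if (!I || !I->getParent())
//       continue; // Already deleted or never materialized.
//     for (Use &U : I->operands()) {
//       Value *Op = U.get();
//       U.set(nullptr); // Drop the use.
//       if (auto *OpI = dyn_cast_or_null<Instruction>(Op))
//         if (OpI->use_empty() && isInstructionTriviallyDead(OpI))
//           Worklist.push_back(OpI); // The operand just became dead.
//     }
//     I->eraseFromParent();
//   }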
3619
3620 /// Checks if the instruction was already analyzed for being possible
3621 /// reduction root.
3622 bool isAnalyzedReductionRoot(Instruction *I) const {
3623 return AnalyzedReductionsRoots.count(I);
3624 }
3625 /// Register given instruction as already analyzed for being possible
3626 /// reduction root.
3627 void analyzedReductionRoot(Instruction *I) {
3628 AnalyzedReductionsRoots.insert(I);
3629 }
3630 /// Checks if the provided list of reduced values was checked already for
3631 /// vectorization.
3632 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3633 return AnalyzedReductionVals.contains(hash_value(VL));
3634 }
3635 /// Adds the list of reduced values to list of already checked values for the
3636 /// vectorization.
3637 void analyzedReductionVals(ArrayRef<Value *> VL) {
3638 AnalyzedReductionVals.insert(hash_value(VL));
3639 }
3640 /// Clear the list of the analyzed reduction root instructions.
3641 void clearReductionData() {
3642 AnalyzedReductionsRoots.clear();
3643 AnalyzedReductionVals.clear();
3644 AnalyzedMinBWVals.clear();
3645 }
3646 /// Checks if the given value is gathered in one of the nodes.
3647 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3648 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3649 }
3650 /// Checks if the given value is gathered in one of the nodes.
3651 bool isGathered(const Value *V) const {
3652 return MustGather.contains(V);
3653 }
3654 /// Checks if the specified value was not scheduled.
3655 bool isNotScheduled(const Value *V) const {
3656 return NonScheduledFirst.contains(V);
3657 }
3658
3659 /// Check if the value is vectorized in the tree.
3660 bool isVectorized(const Value *V) const {
3661 assert(V && "V cannot be nullptr.");
3662 return ScalarToTreeEntries.contains(V);
3663 }
3664
3665 ~BoUpSLP();
3666
3667private:
3668 /// Determine if a node \p E can be demoted to a smaller type with a
3669 /// truncation. We collect the entries that will be demoted in ToDemote.
3670 /// \param E Node for analysis
3671 /// \param ToDemote indices of the nodes to be demoted.
3672 bool collectValuesToDemote(
3673 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3674 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3675 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3676 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3677
3678 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3679 /// UserTE, which allow reordering (i.e. the operands can be reordered
3680 /// because they have only one user and are reorderable).
3681 /// \param ReorderableGathers List of all gather nodes that require reordering
3682 /// (e.g., gather of extractelements or partially vectorizable loads).
3683 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3684 /// reordering, subset of \p NonVectorized.
3685 void buildReorderableOperands(
3686 TreeEntry *UserTE,
3687 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3688 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3689 SmallVectorImpl<TreeEntry *> &GatherOps);
3690
3691 /// Checks if the given \p TE is a gather node with clustered reused scalars
3692 /// and reorders it per given \p Mask.
3693 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3694
3695 /// Checks if all users of \p I are the part of the vectorization tree.
3696 bool areAllUsersVectorized(
3697 Instruction *I,
3698 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3699
3700 /// Return information about the vector formed for the specified index
3701 /// of a vector of (the same) instruction.
3702 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3703
3704 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3705 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3706 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3707 return const_cast<TreeEntry *>(
3708 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3709 }
3710
3711 /// Gets the root instruction for the given node. If the node is a strided
3712 /// load/store node with the reverse order, the root instruction is the last
3713 /// one.
3714 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3715
3716 /// \returns Cast context for the given graph node.
3717 TargetTransformInfo::CastContextHint
3718 getCastContextHint(const TreeEntry &TE) const;
3719
3720 /// \returns the cost of the vectorizable entry.
3721 InstructionCost getEntryCost(const TreeEntry *E,
3722 ArrayRef<Value *> VectorizedVals,
3723 SmallPtrSetImpl<Value *> &CheckedExtracts);
3724
3725 /// Checks if it is legal and profitable to build SplitVectorize node for the
3726 /// given \p VL.
3727 /// \param Op1 first homogeneous scalars.
3728 /// \param Op2 second homogeneous scalars.
3729 /// \param ReorderIndices indices to reorder the scalars.
3730 /// \returns true if the node was successfully built.
3731 bool canBuildSplitNode(ArrayRef<Value *> VL,
3732 const InstructionsState &LocalState,
3733 SmallVectorImpl<Value *> &Op1,
3734 SmallVectorImpl<Value *> &Op2,
3735 OrdersType &ReorderIndices) const;
3736
3737 /// This is the recursive part of buildTree.
3738 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3739 unsigned InterleaveFactor = 0);
3740
3741 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3742 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3743 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3744 /// returns false, setting \p CurrentOrder to either an empty vector or a
3745 /// non-identity permutation that allows reusing extract instructions.
3746 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3747 /// extract order.
3748 bool canReuseExtract(ArrayRef<Value *> VL,
3749 SmallVectorImpl<unsigned> &CurrentOrder,
3750 bool ResizeAllowed = false) const;
3751
3752 /// Vectorize a single entry in the tree.
3753 Value *vectorizeTree(TreeEntry *E);
3754
3755 /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
3756 /// entry \p E.
3757 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3758
3759 /// Create a new vector from a list of scalar values. Produces a sequence
3760 /// which exploits values reused across lanes, and arranges the inserts
3761 /// for ease of later optimization.
3762 template <typename BVTy, typename ResTy, typename... Args>
3763 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3764
3765 /// Create a new vector from a list of scalar values. Produces a sequence
3766 /// which exploits values reused across lanes, and arranges the inserts
3767 /// for ease of later optimization.
3768 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3769
3770 /// Returns the instruction in the bundle, which can be used as a base point
3771 /// for scheduling. Usually it is the last instruction in the bundle, except
3772 /// for the case when all operands are external (in this case, it is the first
3773 /// instruction in the list).
3774 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3775
3776 /// Tries to find extractelement instructions with constant indices from fixed
3777 /// vector type and gather such instructions into a bunch, which will most
3778 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3779 /// was successful, the matched scalars are replaced by poison values in \p VL
3780 /// for future analysis.
3781 std::optional<TargetTransformInfo::ShuffleKind>
3782 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3783 SmallVectorImpl<int> &Mask) const;
3784
3785 /// Tries to find extractelement instructions with constant indices from fixed
3786 /// vector type and gather such instructions into a bunch, which will most
3787 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3788 /// was successful, the matched scalars are replaced by poison values in \p VL
3789 /// for future analysis.
3790 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3791 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3792 SmallVectorImpl<int> &Mask,
3793 unsigned NumParts) const;
3794
3795 /// Checks if the gathered \p VL can be represented as a single register
3796 /// shuffle(s) of previous tree entries.
3797 /// \param TE Tree entry checked for permutation.
3798 /// \param VL List of scalars (a subset of the TE scalar), checked for
3799 /// permutations. Must form single-register vector.
3800 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3801 /// commands to build the mask using the original vector value, without
3802 /// relying on the potential reordering.
3803 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3804 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3805 std::optional<TargetTransformInfo::ShuffleKind>
3806 isGatherShuffledSingleRegisterEntry(
3807 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3808 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3809 bool ForOrder);
3810
3811 /// Checks if the gathered \p VL can be represented as multi-register
3812 /// shuffle(s) of previous tree entries.
3813 /// \param TE Tree entry checked for permutation.
3814 /// \param VL List of scalars (a subset of the TE scalar), checked for
3815 /// permutations.
3816 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3817 /// commands to build the mask using the original vector value, without
3818 /// relying on the potential reordering.
3819 /// \returns per-register series of ShuffleKind, if gathered values can be
3820 /// represented as shuffles of previous tree entries. \p Mask is filled with
3821 /// the shuffle mask (also on a per-register basis).
3822 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3823 isGatherShuffledEntry(
3824 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3825 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3826 unsigned NumParts, bool ForOrder = false);
3827
3828 /// \returns the cost of gathering (inserting) the values in \p VL into a
3829 /// vector.
3830 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3831 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3832 Type *ScalarTy) const;
3833
3834 /// Set the Builder insert point to one after the last instruction in
3835 /// the bundle
3836 void setInsertPointAfterBundle(const TreeEntry *E);
3837
3838 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3839 /// specified, the starting vector value is poison.
3840 Value *
3841 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3842 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3843
3844 /// \returns whether the VectorizableTree is fully vectorizable and will
3845 /// be beneficial even if the tree height is tiny.
3846 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3847
3848 /// Run through the list of all gathered loads in the graph and try to find
3849 /// vector loads/masked gathers instead of regular gathers. Later these loads
3850 /// are reshuffled to build final gathered nodes.
3851 void tryToVectorizeGatheredLoads(
3852 const SmallMapVector<
3853 std::tuple<BasicBlock *, Value *, Type *>,
3854 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3855 &GatheredLoads);
3856
3857 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3858 /// users of \p TE and collects the stores. It returns the map from the store
3859 /// pointers to the collected stores.
3860 SmallVector<SmallVector<StoreInst *>>
3861 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3862
3863 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3864 /// stores in \p StoresVec can form a vector instruction. If so it returns
3865 /// true and populates \p ReorderIndices with the shuffle indices of the
3866 /// stores when compared to the sorted vector.
3867 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3868 OrdersType &ReorderIndices) const;
3869
3870 /// Iterates through the users of \p TE, looking for scalar stores that can be
3871 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3872 /// their order and builds an order index vector for each store bundle. It
3873 /// returns all these order vectors found.
3874 /// We run this after the tree has formed, otherwise we may come across user
3875 /// instructions that are not yet in the tree.
3876 SmallVector<OrdersType, 1>
3877 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3878
3879 /// Tries to reorder the gathering node for better vectorization
3880 /// opportunities.
3881 void reorderGatherNode(TreeEntry &TE);
3882
3883 class TreeEntry {
3884 public:
3885 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3886 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3887
3888 /// \returns Common mask for reorder indices and reused scalars.
3889 SmallVector<int> getCommonMask() const {
3890 if (State == TreeEntry::SplitVectorize)
3891 return {};
3892 SmallVector<int> Mask;
3893 inversePermutation(ReorderIndices, Mask);
3894 ::addMask(Mask, ReuseShuffleIndices);
3895 return Mask;
3896 }
3897
3898 /// \returns The mask for split nodes.
3899 SmallVector<int> getSplitMask() const {
3900 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3901 "Expected only split vectorize node.");
3902 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3903 unsigned CommonVF = std::max<unsigned>(
3904 CombinedEntriesWithIndices.back().second,
3905 Scalars.size() - CombinedEntriesWithIndices.back().second);
3906 for (auto [Idx, I] : enumerate(ReorderIndices))
3907 Mask[I] =
3908 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3909 ? CommonVF - CombinedEntriesWithIndices.back().second
3910 : 0);
3911 return Mask;
3912 }
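// A worked example for getSplitMask: with 6 scalars split as 2 + 4 (the
// second subnode starts at offset 2), CommonVF = max(2, 4) = 4, and with
// identity ReorderIndices the mask becomes {0, 1, 4, 5, 6, 7}. In the usual
// two-input shuffle numbering, the first subnode's elements select from the
// first source vector and the second subnode's elements select lanes [0, 4)
// of the second source.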
3913
3914 /// Updates (reorders) SplitVectorize node according to the given mask \p
3915 /// Mask and order \p MaskOrder.
3916 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3917 ArrayRef<int> MaskOrder);
3918
3919 /// \returns true if the scalars in VL are equal to this entry.
3920 bool isSame(ArrayRef<Value *> VL) const {
3921 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3922 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3923 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3924 return VL.size() == Mask.size() &&
3925 std::equal(VL.begin(), VL.end(), Mask.begin(),
3926 [Scalars](Value *V, int Idx) {
3927 return (isa<UndefValue>(V) &&
3928 Idx == PoisonMaskElem) ||
3929 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3930 });
3931 };
3932 if (!ReorderIndices.empty()) {
3933 // TODO: implement matching if the nodes are just reordered, still can
3934 // treat the vector as the same if the list of scalars matches VL
3935 // directly, without reordering.
3936 SmallVector<int> Mask;
3937 inversePermutation(ReorderIndices, Mask);
3938 if (VL.size() == Scalars.size())
3939 return IsSame(Scalars, Mask);
3940 if (VL.size() == ReuseShuffleIndices.size()) {
3941 ::addMask(Mask, ReuseShuffleIndices);
3942 return IsSame(Scalars, Mask);
3943 }
3944 return false;
3945 }
3946 return IsSame(Scalars, ReuseShuffleIndices);
3947 }
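// An example for isSame: with Scalars = {a, b}, empty ReorderIndices and
// ReuseShuffleIndices = {0, 1, 0, 1}, isSame({a, b, a, b}) is true, since
// lane I of VL is compared against Scalars[Mask[I]], and undef lanes are
// allowed to match PoisonMaskElem.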
3948
3949 /// \returns true if current entry has same operands as \p TE.
3950 bool hasEqualOperands(const TreeEntry &TE) const {
3951 if (TE.getNumOperands() != getNumOperands())
3952 return false;
3953 SmallBitVector Used(getNumOperands());
3954 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3955 unsigned PrevCount = Used.count();
3956 for (unsigned K = 0; K < E; ++K) {
3957 if (Used.test(K))
3958 continue;
3959 if (getOperand(K) == TE.getOperand(I)) {
3960 Used.set(K);
3961 break;
3962 }
3963 }
3964 // Check if we actually found the matching operand.
3965 if (PrevCount == Used.count())
3966 return false;
3967 }
3968 return true;
3969 }
3970
3971 /// \return Final vectorization factor for the node. Defined by the total
3972 /// number of vectorized scalars, including those used several times in the
3973 /// entry and counted in the \a ReuseShuffleIndices, if any.
3974 unsigned getVectorFactor() const {
3975 if (!ReuseShuffleIndices.empty())
3976 return ReuseShuffleIndices.size();
3977 return Scalars.size();
3978 };
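// For example, a node with Scalars = {a, b} reused as {a, b, a, b} has
// ReuseShuffleIndices = {0, 1, 0, 1}, so its vector factor is 4 even though
// only 2 distinct scalars are vectorized.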
3979
3980 /// Checks if the current node is a gather node.
3981 bool isGather() const { return State == NeedToGather; }
3982
3983 /// A vector of scalars.
3984 ValueList Scalars;
3985
3986 /// The Scalars are vectorized into this value. It is initialized to Null.
3987 WeakTrackingVH VectorizedValue = nullptr;
3988
3989 /// Do we need to gather this sequence or vectorize it
3990 /// (either with vector instruction or with scatter/gather
3991 /// intrinsics for store/load)?
3992 enum EntryState {
3993 Vectorize, ///< The node is regularly vectorized.
3994 ScatterVectorize, ///< Masked scatter/gather node.
3995 StridedVectorize, ///< Strided loads (and stores)
3996 CompressVectorize, ///< (Masked) load with compress.
3997 NeedToGather, ///< Gather/buildvector node.
3998 CombinedVectorize, ///< Vectorized node, combined with its user into more
3999 ///< complex node like select/cmp to minmax, mul/add to
4000 ///< fma, etc. Must be used for the following nodes in
4001 ///< the pattern, not the very first one.
4002 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4003 ///< independently and then combines back.
4004 };
4005 EntryState State;
4006
4007 /// List of combined opcodes supported by the vectorizer.
4008 enum CombinedOpcode {
4009 NotCombinedOp = -1,
4010 MinMax = Instruction::OtherOpsEnd + 1,
4011 FMulAdd,
4012 };
4013 CombinedOpcode CombinedOp = NotCombinedOp;
4014
4015 /// Does this sequence require some shuffling?
4016 SmallVector<int, 4> ReuseShuffleIndices;
4017
4018 /// Does this entry require reordering?
4019 SmallVector<unsigned, 4> ReorderIndices;
4020
4021 /// Points back to the VectorizableTree.
4022 ///
4023 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4024 /// to be a pointer and needs to be able to initialize the child iterator.
4025 /// Thus we need a reference back to the container to translate the indices
4026 /// to entries.
4027 VecTreeTy &Container;
4028
4029 /// The TreeEntry index containing the user of this entry.
4030 EdgeInfo UserTreeIndex;
4031
4032 /// The index of this treeEntry in VectorizableTree.
4033 unsigned Idx = 0;
4034
4035 /// For gather/buildvector/alt opcode nodes, which are combined from
4036 /// other nodes as a series of insertvector instructions.
4037 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4038
4039 private:
4040 /// The operands of each instruction in each lane Operands[op_index][lane].
4041 /// Note: This helps avoid the replication of the code that performs the
4042 /// reordering of operands during buildTreeRec() and vectorizeTree().
4043 SmallVector<ValueList, 2> Operands;
4044
4045 /// Copyable elements of the entry node.
4046 SmallPtrSet<const Value *, 4> CopyableElements;
4047
4048 /// MainOp and AltOp are recorded inside. S should be obtained from
4049 /// newTreeEntry.
4050 InstructionsState S = InstructionsState::invalid();
4051
4052 /// Interleaving factor for interleaved loads Vectorize nodes.
4053 unsigned InterleaveFactor = 0;
4054
4055 /// True if the node does not require scheduling.
4056 bool DoesNotNeedToSchedule = false;
4057
4058 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4059 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4060 if (Operands.size() < OpIdx + 1)
4061 Operands.resize(OpIdx + 1);
4062 assert(Operands[OpIdx].empty() && "Already resized?");
4063 assert(OpVL.size() <= Scalars.size() &&
4064 "Number of operands is greater than the number of scalars.");
4065 Operands[OpIdx].resize(OpVL.size());
4066 copy(OpVL, Operands[OpIdx].begin());
4067 }
4068
4069 public:
4070 /// Returns interleave factor for interleave nodes.
4071 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4072 /// Sets interleaving factor for the interleaving nodes.
4073 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4074
4075 /// Marks the node as one that does not require scheduling.
4076 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4077 /// Returns true if the node is marked as one that does not require
4078 /// scheduling.
4079 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4080
4081 /// Set this bundle's operands from \p Operands.
4082 void setOperands(ArrayRef<ValueList> Operands) {
4083 for (unsigned I : seq<unsigned>(Operands.size()))
4084 setOperand(I, Operands[I]);
4085 }
4086
4087 /// Reorders operands of the node to the given mask \p Mask.
4088 void reorderOperands(ArrayRef<int> Mask) {
4089 for (ValueList &Operand : Operands)
4090 reorderScalars(Operand, Mask);
4091 }
4092
4093 /// \returns the \p OpIdx operand of this TreeEntry.
4094 ValueList &getOperand(unsigned OpIdx) {
4095 assert(OpIdx < Operands.size() && "Off bounds");
4096 return Operands[OpIdx];
4097 }
4098
4099 /// \returns the \p OpIdx operand of this TreeEntry.
4100 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4101 assert(OpIdx < Operands.size() && "Off bounds");
4102 return Operands[OpIdx];
4103 }
4104
4105 /// \returns the number of operands.
4106 unsigned getNumOperands() const { return Operands.size(); }
4107
4108 /// \return the single \p OpIdx operand.
4109 Value *getSingleOperand(unsigned OpIdx) const {
4110 assert(OpIdx < Operands.size() && "Off bounds");
4111 assert(!Operands[OpIdx].empty() && "No operand available");
4112 return Operands[OpIdx][0];
4113 }
4114
4115 /// Some of the instructions in the list have alternate opcodes.
4116 bool isAltShuffle() const { return S.isAltShuffle(); }
4117
4118 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4119 return S.getMatchingMainOpOrAltOp(I);
4120 }
4121
4122 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4123 /// alternate) opcode as the main operation of this entry, the key is \p Op.
4124 /// Otherwise the key is the main operation.
4125 Value *isOneOf(Value *Op) const {
4126 auto *I = dyn_cast<Instruction>(Op);
4127 if (I && getMatchingMainOpOrAltOp(I))
4128 return Op;
4129 return S.getMainOp();
4130 }
4131
4132 void setOperations(const InstructionsState &S) {
4133 assert(S && "InstructionsState is invalid.");
4134 this->S = S;
4135 }
4136
4137 Instruction *getMainOp() const { return S.getMainOp(); }
4138
4139 Instruction *getAltOp() const { return S.getAltOp(); }
4140
4141 /// The main/alternate opcodes for the list of instructions.
4142 unsigned getOpcode() const { return S.getOpcode(); }
4143
4144 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4145
4146 bool hasState() const { return S.valid(); }
4147
4148 /// Add \p V to the list of copyable elements.
4149 void addCopyableElement(Value *V) {
4150 assert(S.isCopyableElement(V) && "Not a copyable element.");
4151 CopyableElements.insert(V);
4152 }
4153
4154 /// Returns true if \p V is a copyable element.
4155 bool isCopyableElement(Value *V) const {
4156 return CopyableElements.contains(V);
4157 }
4158
4159 /// Returns true if any scalar in the list is a copyable element.
4160 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4161
4162 /// Returns the state of the operations.
4163 const InstructionsState &getOperations() const { return S; }
4164
4165 /// When ReuseShuffleIndices is empty it just returns the position of \p V
4166 /// within the vector of Scalars. Otherwise, tries to remap via the reuse index.
4167 unsigned findLaneForValue(Value *V) const {
4168 unsigned FoundLane = getVectorFactor();
4169 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4170 std::advance(It, 1)) {
4171 if (*It != V)
4172 continue;
4173 FoundLane = std::distance(Scalars.begin(), It);
4174 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4175 if (!ReorderIndices.empty())
4176 FoundLane = ReorderIndices[FoundLane];
4177 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4178 if (ReuseShuffleIndices.empty())
4179 break;
4180 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4181 RIt != ReuseShuffleIndices.end()) {
4182 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4183 break;
4184 }
4185 }
4186 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4187 return FoundLane;
4188 }
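// A worked example: with Scalars = {a, b}, ReorderIndices = {1, 0} and
// ReuseShuffleIndices = {0, 1, 0, 1}, findLaneForValue(b) finds b at
// position 1, remaps it through ReorderIndices to lane 0, and returns the
// first reuse slot that refers to lane 0, i.e. lane 0 of the final vector.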
4189
4190 /// Build a shuffle mask for graph entry which represents a merge of main
4191 /// and alternate operations.
4192 void
4193 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4194 SmallVectorImpl<int> &Mask,
4195 SmallVectorImpl<Value *> *OpScalars = nullptr,
4196 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4197
4198 /// Return true if this is a non-power-of-2 node.
4199 bool isNonPowOf2Vec() const {
4200 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4201 return IsNonPowerOf2;
4202 }
4203
4204 /// Return true if this node vectorizes a number of elements that neither
4205 /// forms whole vector registers nor is a power of 2.
4206 bool
4207 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4208 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4209 TTI, getValueType(Scalars.front()), Scalars.size());
4210 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4211 "Reshuffling not supported with non-power-of-2 vectors yet.");
4212 return IsNonPowerOf2;
4213 }
4214
4215 Value *getOrdered(unsigned Idx) const {
4216 assert(isGather() && "Must be used only for buildvectors/gathers.");
4217 if (ReorderIndices.empty())
4218 return Scalars[Idx];
4219 SmallVector<int> Mask;
4220 inversePermutation(ReorderIndices, Mask);
4221 return Scalars[Mask[Idx]];
4222 }
4223
4224#ifndef NDEBUG
4225 /// Debug printer.
4226 LLVM_DUMP_METHOD void dump() const {
4227 dbgs() << Idx << ".\n";
4228 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4229 dbgs() << "Operand " << OpI << ":\n";
4230 for (const Value *V : Operands[OpI])
4231 dbgs().indent(2) << *V << "\n";
4232 }
4233 dbgs() << "Scalars: \n";
4234 for (Value *V : Scalars)
4235 dbgs().indent(2) << *V << "\n";
4236 dbgs() << "State: ";
4237 if (S && hasCopyableElements())
4238 dbgs() << "[[Copyable]] ";
4239 switch (State) {
4240 case Vectorize:
4241 if (InterleaveFactor > 0) {
4242 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4243 << "\n";
4244 } else {
4245 dbgs() << "Vectorize\n";
4246 }
4247 break;
4248 case ScatterVectorize:
4249 dbgs() << "ScatterVectorize\n";
4250 break;
4251 case StridedVectorize:
4252 dbgs() << "StridedVectorize\n";
4253 break;
4254 case CompressVectorize:
4255 dbgs() << "CompressVectorize\n";
4256 break;
4257 case NeedToGather:
4258 dbgs() << "NeedToGather\n";
4259 break;
4260 case CombinedVectorize:
4261 dbgs() << "CombinedVectorize\n";
4262 break;
4263 case SplitVectorize:
4264 dbgs() << "SplitVectorize\n";
4265 break;
4266 }
4267 if (S) {
4268 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4269 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4270 } else {
4271 dbgs() << "MainOp: NULL\n";
4272 dbgs() << "AltOp: NULL\n";
4273 }
4274 dbgs() << "VectorizedValue: ";
4275 if (VectorizedValue)
4276 dbgs() << *VectorizedValue << "\n";
4277 else
4278 dbgs() << "NULL\n";
4279 dbgs() << "ReuseShuffleIndices: ";
4280 if (ReuseShuffleIndices.empty())
4281 dbgs() << "Empty";
4282 else
4283 for (int ReuseIdx : ReuseShuffleIndices)
4284 dbgs() << ReuseIdx << ", ";
4285 dbgs() << "\n";
4286 dbgs() << "ReorderIndices: ";
4287 for (unsigned ReorderIdx : ReorderIndices)
4288 dbgs() << ReorderIdx << ", ";
4289 dbgs() << "\n";
4290 dbgs() << "UserTreeIndex: ";
4291 if (UserTreeIndex)
4292 dbgs() << UserTreeIndex;
4293 else
4294 dbgs() << "<invalid>";
4295 dbgs() << "\n";
4296 if (!CombinedEntriesWithIndices.empty()) {
4297 dbgs() << "Combined entries: ";
4298 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4299 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4300 });
4301 dbgs() << "\n";
4302 }
4303 }
4304#endif
4305 };
4306
4307#ifndef NDEBUG
4308 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4309 InstructionCost VecCost, InstructionCost ScalarCost,
4310 StringRef Banner) const {
4311 dbgs() << "SLP: " << Banner << ":\n";
4312 E->dump();
4313 dbgs() << "SLP: Costs:\n";
4314 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4315 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4316 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4317 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4318 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4319 }
4320#endif
4321
4322 /// Create a new gather TreeEntry
4323 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4324 const InstructionsState &S,
4325 const EdgeInfo &UserTreeIdx,
4326 ArrayRef<int> ReuseShuffleIndices = {}) {
4327 auto Invalid = ScheduleBundle::invalid();
4328 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4329 }
4330
4331 /// Create a new VectorizableTree entry.
4332 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4333 const InstructionsState &S,
4334 const EdgeInfo &UserTreeIdx,
4335 ArrayRef<int> ReuseShuffleIndices = {},
4336 ArrayRef<unsigned> ReorderIndices = {},
4337 unsigned InterleaveFactor = 0) {
4338 TreeEntry::EntryState EntryState =
4339 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4340 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4341 ReuseShuffleIndices, ReorderIndices);
4342 if (E && InterleaveFactor > 0)
4343 E->setInterleave(InterleaveFactor);
4344 return E;
4345 }
4346
4347 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4348 TreeEntry::EntryState EntryState,
4349 ScheduleBundle &Bundle, const InstructionsState &S,
4350 const EdgeInfo &UserTreeIdx,
4351 ArrayRef<int> ReuseShuffleIndices = {},
4352 ArrayRef<unsigned> ReorderIndices = {}) {
4353 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4354 EntryState == TreeEntry::SplitVectorize)) ||
4355 (Bundle && EntryState != TreeEntry::NeedToGather &&
4356 EntryState != TreeEntry::SplitVectorize)) &&
4357 "Need to vectorize gather entry?");
4358 // Gathered loads still gathered? Do not create entry, use the original one.
4359 if (GatheredLoadsEntriesFirst.has_value() &&
4360 EntryState == TreeEntry::NeedToGather && S &&
4361 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4362 !UserTreeIdx.UserTE)
4363 return nullptr;
4364 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4365 TreeEntry *Last = VectorizableTree.back().get();
4366 Last->Idx = VectorizableTree.size() - 1;
4367 Last->State = EntryState;
4368 if (UserTreeIdx.UserTE)
4369 OperandsToTreeEntry.try_emplace(
4370 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4371 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4372 // for non-power-of-two vectors.
4373 assert(
4374 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4375 ReuseShuffleIndices.empty()) &&
4376 "Reshuffling scalars not yet supported for nodes with padding");
4377 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4378 ReuseShuffleIndices.end());
4379 if (ReorderIndices.empty()) {
4380 Last->Scalars.assign(VL.begin(), VL.end());
4381 if (S)
4382 Last->setOperations(S);
4383 } else {
4384 // Reorder scalars and build final mask.
4385 Last->Scalars.assign(VL.size(), nullptr);
4386 transform(ReorderIndices, Last->Scalars.begin(),
4387 [VL](unsigned Idx) -> Value * {
4388 if (Idx >= VL.size())
4389 return UndefValue::get(VL.front()->getType());
4390 return VL[Idx];
4391 });
4392 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4393 if (S)
4394 Last->setOperations(S);
4395 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4396 }
4397 if (EntryState == TreeEntry::SplitVectorize) {
4398 assert(S && "Split nodes must have operations.");
4399 Last->setOperations(S);
4400 SmallPtrSet<Value *, 4> Processed;
4401 for (Value *V : VL) {
4402 auto *I = dyn_cast<Instruction>(V);
4403 if (!I)
4404 continue;
4405 auto It = ScalarsInSplitNodes.find(V);
4406 if (It == ScalarsInSplitNodes.end()) {
4407 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4408 (void)Processed.insert(V);
4409 } else if (Processed.insert(V).second) {
4410 assert(!is_contained(It->getSecond(), Last) &&
4411 "Value already associated with the node.");
4412 It->getSecond().push_back(Last);
4413 }
4414 }
4415 } else if (!Last->isGather()) {
4416 if (isa<PHINode>(S.getMainOp()) ||
4417 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4418 (!S.areInstructionsWithCopyableElements() &&
4419 doesNotNeedToSchedule(VL)) ||
4420 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4421 Last->setDoesNotNeedToSchedule();
4422 SmallPtrSet<Value *, 4> Processed;
4423 for (Value *V : VL) {
4424 if (isa<PoisonValue>(V))
4425 continue;
4426 if (S.isCopyableElement(V)) {
4427 Last->addCopyableElement(V);
4428 continue;
4429 }
4430 auto It = ScalarToTreeEntries.find(V);
4431 if (It == ScalarToTreeEntries.end()) {
4432 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4433 (void)Processed.insert(V);
4434 } else if (Processed.insert(V).second) {
4435 assert(!is_contained(It->getSecond(), Last) &&
4436 "Value already associated with the node.");
4437 It->getSecond().push_back(Last);
4438 }
4439 }
4440 // Update the scheduler bundle to point to this TreeEntry.
4441 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4442 "Bundle and VL out of sync");
4443 if (!Bundle.getBundle().empty()) {
4444#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4445 auto *BundleMember = Bundle.getBundle().begin();
4446 SmallPtrSet<Value *, 4> Processed;
4447 for (Value *V : VL) {
4448 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4449 continue;
4450 ++BundleMember;
4451 }
4452 assert(BundleMember == Bundle.getBundle().end() &&
4453 "Bundle and VL out of sync");
4454#endif
4455 Bundle.setTreeEntry(Last);
4456 }
4457 } else {
4458 // Build a map for gathered scalars to the nodes where they are used.
4459 bool AllConstsOrCasts = true;
4460 for (Value *V : VL) {
4461 if (S && S.areInstructionsWithCopyableElements() &&
4462 S.isCopyableElement(V))
4463 Last->addCopyableElement(V);
4464 if (!isConstant(V)) {
4465 auto *I = dyn_cast<CastInst>(V);
4466 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4467 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4468 !UserTreeIdx.UserTE->isGather())
4469 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4470 }
4471 }
4472 if (AllConstsOrCasts)
4473 CastMaxMinBWSizes =
4474 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4475 MustGather.insert_range(VL);
4476 }
4477
4478 if (UserTreeIdx.UserTE)
4479 Last->UserTreeIndex = UserTreeIdx;
4480 return Last;
4481 }
4482
4483 /// -- Vectorization State --
4484 /// Holds all of the tree entries.
4485 TreeEntry::VecTreeTy VectorizableTree;
4486
4487#ifndef NDEBUG
4488 /// Debug printer.
4489 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4490 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4491 VectorizableTree[Id]->dump();
4492 dbgs() << "\n";
4493 }
4494 }
4495#endif
4496
  /// Get the list of vector entries associated with the value \p V.
4498 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4499 assert(V && "V cannot be nullptr.");
4500 auto It = ScalarToTreeEntries.find(V);
4501 if (It == ScalarToTreeEntries.end())
4502 return {};
4503 return It->getSecond();
4504 }
4505
  /// Get the list of split vector entries associated with the value \p V.
4507 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4508 assert(V && "V cannot be nullptr.");
4509 auto It = ScalarsInSplitNodes.find(V);
4510 if (It == ScalarsInSplitNodes.end())
4511 return {};
4512 return It->getSecond();
4513 }
4514
  /// Returns the first vector node for value \p V which matches values \p VL.
4516 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4517 bool SameVF = false) const {
4518 assert(V && "V cannot be nullptr.");
4519 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4520 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4521 return TE;
4522 return nullptr;
4523 }
4524
  /// Check that the operand node of an alternate node does not generate a
  /// buildvector sequence. If it does, it is probably not worth building an
  /// alternate shuffle, i.e. when the number of buildvector operands plus the
  /// alternate instruction exceeds the number of buildvector instructions.
4529 /// \param S the instructions state of the analyzed values.
4530 /// \param VL list of the instructions with alternate opcodes.
4531 bool areAltOperandsProfitable(const InstructionsState &S,
4532 ArrayRef<Value *> VL) const;
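  // Illustrative example (an assumption, not from this file): for
  // VL = {add, sub, add, sub} whose left operands would each require their
  // own buildvector sequence, the combined cost of those buildvectors plus
  // the alternate shuffle can exceed the cost of a single buildvector of the
  // scalar results, in which case this returns false.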
4533
4534 /// Contains all the outputs of legality analysis for a list of values to
4535 /// vectorize.
4536 class ScalarsVectorizationLegality {
4537 InstructionsState S;
4538 bool IsLegal;
4539 bool TryToFindDuplicates;
4540 bool TrySplitVectorize;
4541
4542 public:
4543 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4544 bool TryToFindDuplicates = true,
4545 bool TrySplitVectorize = false)
4546 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4547 TrySplitVectorize(TrySplitVectorize) {
4548 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4549 "Inconsistent state");
4550 }
4551 const InstructionsState &getInstructionsState() const { return S; };
4552 bool isLegal() const { return IsLegal; }
4553 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4554 bool trySplitVectorize() const { return TrySplitVectorize; }
4555 };
4556
4557 /// Checks if the specified list of the instructions/values can be vectorized
4558 /// in general.
4559 ScalarsVectorizationLegality
4560 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4561 const EdgeInfo &UserTreeIdx,
4562 bool TryCopyableElementsVectorization) const;
4563
4564 /// Checks if the specified list of the instructions/values can be vectorized
4565 /// and fills required data before actual scheduling of the instructions.
4566 TreeEntry::EntryState getScalarsVectorizationState(
4567 const InstructionsState &S, ArrayRef<Value *> VL,
4568 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4569 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4570
4571 /// Maps a specific scalar to its tree entry(ies).
4572 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4573
4574 /// Maps the operand index and entry to the corresponding tree entry.
4575 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4576 OperandsToTreeEntry;
4577
4578 /// Scalars, used in split vectorize nodes.
4579 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4580
4581 /// Maps a value to the proposed vectorizable size.
4582 SmallDenseMap<Value *, unsigned> InstrElementSize;
4583
4584 /// A list of scalars that we found that we need to keep as scalars.
4585 ValueSet MustGather;
4586
4587 /// A set of first non-schedulable values.
4588 ValueSet NonScheduledFirst;
4589
  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions. So, we cannot rely directly on the last instruction in the
  /// bundle being the last instruction in program order during the
  /// vectorization process, since the basic blocks are modified; the last
  /// instructions need to be pre-gathered beforehand.
4596 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4597
4598 /// Keeps the mapping between the last instructions and their insertion
4599 /// points, which is an instruction-after-the-last-instruction.
4600 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4601
  /// List of gather nodes that depend on other gather/vector nodes and should
  /// be emitted after the vector instruction emission process, to correctly
  /// handle the order of the vector instructions and shuffles.
4605 SetVector<const TreeEntry *> PostponedGathers;
4606
4607 using ValueToGatherNodesMap =
4608 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4609 ValueToGatherNodesMap ValueToGatherNodes;
4610
  /// A list of the load entries (node indices), which can be vectorized using
  /// the strided or masked gather approach, but which we first attempt to
  /// represent as contiguous loads.
4614 SetVector<unsigned> LoadEntriesToVectorize;
4615
  /// True if the graph-nodes-transforming mode is on.
4617 bool IsGraphTransformMode = false;
4618
4619 /// The index of the first gathered load entry in the VectorizeTree.
4620 std::optional<unsigned> GatheredLoadsEntriesFirst;
4621
4622 /// Maps compress entries to their mask data for the final codegen.
4623 SmallDenseMap<const TreeEntry *,
4624 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4625 CompressEntryToData;
4626
4627 /// This POD struct describes one external user in the vectorized tree.
4628 struct ExternalUser {
4629 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4630 : Scalar(S), User(U), E(E), Lane(L) {}
4631
4632 /// Which scalar in our function.
4633 Value *Scalar = nullptr;
4634
    /// The user that uses the scalar.
    llvm::User *User = nullptr;

    /// The vector node that the value is part of.
    const TreeEntry &E;

    /// The lane that the scalar belongs to.
    unsigned Lane;
4643 };
4644 using UserList = SmallVector<ExternalUser, 16>;
4645
4646 /// Checks if two instructions may access the same memory.
4647 ///
4648 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4649 /// is invariant in the calling loop.
4650 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4651 Instruction *Inst2) {
4652 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4653 // First check if the result is already in the cache.
4654 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4655 auto Res = AliasCache.try_emplace(Key);
4656 if (!Res.second)
4657 return Res.first->second;
4658 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4659 // Store the result in the cache.
4660 Res.first->getSecond() = Aliased;
4661 return Aliased;
4662 }
4663
4664 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4665
4666 /// Cache for alias results.
4667 /// TODO: consider moving this to the AliasAnalysis itself.
4668 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4669
4670 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4671 // globally through SLP because we don't perform any action which
4672 // invalidates capture results.
4673 BatchAAResults BatchAA;
4674
4675 /// Temporary store for deleted instructions. Instructions will be deleted
4676 /// eventually when the BoUpSLP is destructed. The deferral is required to
4677 /// ensure that there are no incorrect collisions in the AliasCache, which
4678 /// can happen if a new instruction is allocated at the same address as a
4679 /// previously deleted instruction.
4680 DenseSet<Instruction *> DeletedInstructions;
4681
  /// Set of instructions already analyzed for reductions.
4683 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4684
4685 /// Set of hashes for the list of reduction values already being analyzed.
4686 DenseSet<size_t> AnalyzedReductionVals;
4687
  /// Values that have already been analyzed for minimal bitwidth and found
  /// to be non-profitable.
4690 DenseSet<Value *> AnalyzedMinBWVals;
4691
  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). The External
  /// User can be nullptr, which means that this Internal Scalar will be used
  /// later, after vectorization.
4696 UserList ExternalUses;
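  // For illustration (assumed semantics, not from this file): if a scalar %s
  // is vectorized but also feeds a call that stays scalar, ExternalUses
  // records (%s, that call, lane) so an extract for that lane can be emitted;
  // a nullptr user means the scalar may still be used outside the tree later.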
4697
  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
4700 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4701
  /// A list of scalars to be extracted without a specific user, because of
  /// too many uses.
4704 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4705
4706 /// Values used only by @llvm.assume calls.
4707 SmallPtrSet<const Value *, 32> EphValues;
4708
  /// Holds all of the instructions that we gathered: shuffle instructions and
  /// extractelements.
4711 SetVector<Instruction *> GatherShuffleExtractSeq;
4712
4713 /// A list of blocks that we are going to CSE.
4714 DenseSet<BasicBlock *> CSEBlocks;
4715
  /// List of hashes of vectors of loads which are known to be non-vectorizable.
4717 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4718
  /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
  /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
  /// single instruction, while ScheduleBundle represents a batch of
  /// instructions that are going to be grouped together. ScheduleCopyableData
  /// models an extra user for "copyable" instructions.
4724 class ScheduleEntity {
4725 friend class ScheduleBundle;
4726 friend class ScheduleData;
4727 friend class ScheduleCopyableData;
4728
4729 protected:
4730 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4731 Kind getKind() const { return K; }
4732 ScheduleEntity(Kind K) : K(K) {}
4733
4734 private:
4735 /// Used for getting a "good" final ordering of instructions.
4736 int SchedulingPriority = 0;
4737 /// True if this instruction (or bundle) is scheduled (or considered as
4738 /// scheduled in the dry-run).
4739 bool IsScheduled = false;
4740 /// The kind of the ScheduleEntity.
4741 const Kind K = Kind::ScheduleData;
4742
4743 public:
4744 ScheduleEntity() = delete;
4745 /// Gets/sets the scheduling priority.
4746 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4747 int getSchedulingPriority() const { return SchedulingPriority; }
4748 bool isReady() const {
4749 if (const auto *SD = dyn_cast<ScheduleData>(this))
4750 return SD->isReady();
4751 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4752 return CD->isReady();
4753 return cast<ScheduleBundle>(this)->isReady();
4754 }
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
4758 bool hasValidDependencies() const {
4759 if (const auto *SD = dyn_cast<ScheduleData>(this))
4760 return SD->hasValidDependencies();
4761 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4762 return CD->hasValidDependencies();
4763 return cast<ScheduleBundle>(this)->hasValidDependencies();
4764 }
4765 /// Gets the number of unscheduled dependencies.
4766 int getUnscheduledDeps() const {
4767 if (const auto *SD = dyn_cast<ScheduleData>(this))
4768 return SD->getUnscheduledDeps();
4769 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4770 return CD->getUnscheduledDeps();
4771 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4772 }
4773 /// Increments the number of unscheduled dependencies.
4774 int incrementUnscheduledDeps(int Incr) {
4775 if (auto *SD = dyn_cast<ScheduleData>(this))
4776 return SD->incrementUnscheduledDeps(Incr);
4777 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4778 }
4779 /// Gets the number of dependencies.
4780 int getDependencies() const {
4781 if (const auto *SD = dyn_cast<ScheduleData>(this))
4782 return SD->getDependencies();
4783 return cast<ScheduleCopyableData>(this)->getDependencies();
4784 }
4785 /// Gets the instruction.
4786 Instruction *getInst() const {
4787 if (const auto *SD = dyn_cast<ScheduleData>(this))
4788 return SD->getInst();
4789 return cast<ScheduleCopyableData>(this)->getInst();
4790 }
4791
4792 /// Gets/sets if the bundle is scheduled.
4793 bool isScheduled() const { return IsScheduled; }
4794 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4795
4796 static bool classof(const ScheduleEntity *) { return true; }
4797
4798#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4799 void dump(raw_ostream &OS) const {
4800 if (const auto *SD = dyn_cast<ScheduleData>(this))
4801 return SD->dump(OS);
4802 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4803 return CD->dump(OS);
4804 return cast<ScheduleBundle>(this)->dump(OS);
4805 }
4806
4807 LLVM_DUMP_METHOD void dump() const {
4808 dump(dbgs());
4809 dbgs() << '\n';
4810 }
4811#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4812 };
4813
4814#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
4817 SE.dump(OS);
4818 return OS;
4819 }
4820#endif
4821
4822 /// Contains all scheduling relevant data for an instruction.
4823 /// A ScheduleData either represents a single instruction or a member of an
4824 /// instruction bundle (= a group of instructions which is combined into a
4825 /// vector instruction).
4826 class ScheduleData final : public ScheduleEntity {
4827 public:
4828 // The initial value for the dependency counters. It means that the
4829 // dependencies are not calculated yet.
4830 enum { InvalidDeps = -1 };
4831
4832 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4833 static bool classof(const ScheduleEntity *Entity) {
4834 return Entity->getKind() == Kind::ScheduleData;
4835 }
4836
4837 void init(int BlockSchedulingRegionID, Instruction *I) {
4838 NextLoadStore = nullptr;
4839 IsScheduled = false;
4840 SchedulingRegionID = BlockSchedulingRegionID;
4841 clearDependencies();
4842 Inst = I;
4843 }
4844
    /// Verify basic self-consistency properties.
4846 void verify() {
4847 if (hasValidDependencies()) {
4848 assert(UnscheduledDeps <= Dependencies && "invariant");
4849 } else {
4850 assert(UnscheduledDeps == Dependencies && "invariant");
4851 }
4852
4853 if (IsScheduled) {
4854 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4855 "unexpected scheduled state");
4856 }
4857 }
4858
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
4862 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4863
4864 /// Returns true if it is ready for scheduling, i.e. it has no more
4865 /// unscheduled depending instructions/bundles.
4866 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4867
4868 /// Modifies the number of unscheduled dependencies for this instruction,
4869 /// and returns the number of remaining dependencies for the containing
4870 /// bundle.
4871 int incrementUnscheduledDeps(int Incr) {
4872 assert(hasValidDependencies() &&
4873 "increment of unscheduled deps would be meaningless");
4874 UnscheduledDeps += Incr;
4875 assert(UnscheduledDeps >= 0 &&
4876 "Expected valid number of unscheduled deps");
4877 return UnscheduledDeps;
4878 }
4879
4880 /// Sets the number of unscheduled dependencies to the number of
4881 /// dependencies.
4882 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4883
4884 /// Clears all dependency information.
4885 void clearDependencies() {
4886 clearDirectDependencies();
4887 MemoryDependencies.clear();
4888 ControlDependencies.clear();
4889 }
4890
    /// Clears only the direct dependencies, keeping the control and memory
    /// dependencies.
    /// Required for copyable elements to correctly handle control/memory deps
    /// and avoid extra recalculation of such deps.
4895 void clearDirectDependencies() {
4896 Dependencies = InvalidDeps;
4897 resetUnscheduledDeps();
4898 IsScheduled = false;
4899 }
4900
4901 /// Gets the number of unscheduled dependencies.
4902 int getUnscheduledDeps() const { return UnscheduledDeps; }
4903 /// Gets the number of dependencies.
4904 int getDependencies() const { return Dependencies; }
4905 /// Initializes the number of dependencies.
4906 void initDependencies() { Dependencies = 0; }
4907 /// Increments the number of dependencies.
4908 void incDependencies() { Dependencies++; }
4909
4910 /// Gets scheduling region ID.
4911 int getSchedulingRegionID() const { return SchedulingRegionID; }
4912
4913 /// Gets the instruction.
4914 Instruction *getInst() const { return Inst; }
4915
4916 /// Gets the list of memory dependencies.
4917 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4918 return MemoryDependencies;
4919 }
4920 /// Adds a memory dependency.
4921 void addMemoryDependency(ScheduleData *Dep) {
4922 MemoryDependencies.push_back(Dep);
4923 }
4924 /// Gets the list of control dependencies.
4925 ArrayRef<ScheduleData *> getControlDependencies() const {
4926 return ControlDependencies;
4927 }
4928 /// Adds a control dependency.
4929 void addControlDependency(ScheduleData *Dep) {
4930 ControlDependencies.push_back(Dep);
4931 }
4932 /// Gets/sets the next load/store instruction in the block.
4933 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4934 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4935
4936 void dump(raw_ostream &OS) const { OS << *Inst; }
4937
4938 LLVM_DUMP_METHOD void dump() const {
4939 dump(dbgs());
4940 dbgs() << '\n';
4941 }
4942
4943 private:
4944 Instruction *Inst = nullptr;
4945
4946 /// Single linked list of all memory instructions (e.g. load, store, call)
4947 /// in the block - until the end of the scheduling region.
4948 ScheduleData *NextLoadStore = nullptr;
4949
4950 /// The dependent memory instructions.
4951 /// This list is derived on demand in calculateDependencies().
4952 SmallVector<ScheduleData *> MemoryDependencies;
4953
4954 /// List of instructions which this instruction could be control dependent
4955 /// on. Allowing such nodes to be scheduled below this one could introduce
4956 /// a runtime fault which didn't exist in the original program.
4957 /// ex: this is a load or udiv following a readonly call which inf loops
4958 SmallVector<ScheduleData *> ControlDependencies;
4959
4960 /// This ScheduleData is in the current scheduling region if this matches
4961 /// the current SchedulingRegionID of BlockScheduling.
4962 int SchedulingRegionID = 0;
4963
    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
4966 /// This value is calculated on demand.
4967 /// If InvalidDeps, the number of dependencies is not calculated yet.
4968 int Dependencies = InvalidDeps;
4969
4970 /// The number of dependencies minus the number of dependencies of scheduled
4971 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4972 /// for scheduling.
4973 /// Note that this is negative as long as Dependencies is not calculated.
4974 int UnscheduledDeps = InvalidDeps;
4975 };
4976
4977#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
4980 SD.dump(OS);
4981 return OS;
4982 }
4983#endif
4984
4985 class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
4988 /// True if this bundle is valid.
4989 bool IsValid = true;
4990 /// The TreeEntry that this instruction corresponds to.
4991 TreeEntry *TE = nullptr;
4992 ScheduleBundle(bool IsValid)
4993 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4994
4995 public:
4996 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4997 static bool classof(const ScheduleEntity *Entity) {
4998 return Entity->getKind() == Kind::ScheduleBundle;
4999 }
5000
    /// Verify basic self-consistency properties.
5002 void verify() const {
5003 for (const ScheduleEntity *SD : Bundle) {
5004 if (SD->hasValidDependencies()) {
5005 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5006 "invariant");
5007 } else {
5008 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5009 "invariant");
5010 }
5011
5012 if (isScheduled()) {
5013 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5014 "unexpected scheduled state");
5015 }
5016 }
5017 }
5018
5019 /// Returns the number of unscheduled dependencies in the bundle.
5020 int unscheduledDepsInBundle() const {
5021 assert(*this && "bundle must not be empty");
5022 int Sum = 0;
5023 for (const ScheduleEntity *BundleMember : Bundle) {
5024 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5025 return ScheduleData::InvalidDeps;
5026 Sum += BundleMember->getUnscheduledDeps();
5027 }
5028 return Sum;
5029 }
5030
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
5034 bool hasValidDependencies() const {
5035 return all_of(Bundle, [](const ScheduleEntity *SD) {
5036 return SD->hasValidDependencies();
5037 });
5038 }
5039
5040 /// Returns true if it is ready for scheduling, i.e. it has no more
5041 /// unscheduled depending instructions/bundles.
5042 bool isReady() const {
5043 assert(*this && "bundle must not be empty");
5044 return unscheduledDepsInBundle() == 0 && !isScheduled();
5045 }
5046
5047 /// Returns the bundle of scheduling data, associated with the current
5048 /// instruction.
5049 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5050 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5051 /// Adds an instruction to the bundle.
5052 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5053
5054 /// Gets/sets the associated tree entry.
5055 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5056 TreeEntry *getTreeEntry() const { return TE; }
5057
5058 static ScheduleBundle invalid() { return {false}; }
5059
5060 operator bool() const { return IsValid; }
5061
5062#ifndef NDEBUG
5063 void dump(raw_ostream &OS) const {
5064 if (!*this) {
5065 OS << "[]";
5066 return;
5067 }
5068 OS << '[';
5069 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5071 OS << "<Copyable>";
5072 OS << *SD->getInst();
5073 });
5074 OS << ']';
5075 }
5076
5077 LLVM_DUMP_METHOD void dump() const {
5078 dump(dbgs());
5079 dbgs() << '\n';
5080 }
5081#endif // NDEBUG
5082 };
5083
5084#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleBundle &Bundle) {
5087 Bundle.dump(OS);
5088 return OS;
5089 }
5090#endif
5091
5092 /// Contains all scheduling relevant data for the copyable instruction.
5093 /// It models the virtual instructions, supposed to replace the original
5094 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5095 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5096 /// instruction %virt = add %0, 0.
5097 class ScheduleCopyableData final : public ScheduleEntity {
5098 /// The source schedule data for the instruction.
5099 Instruction *Inst = nullptr;
5100 /// The edge information for the instruction.
5101 const EdgeInfo EI;
5102 /// This ScheduleData is in the current scheduling region if this matches
5103 /// the current SchedulingRegionID of BlockScheduling.
5104 int SchedulingRegionID = 0;
5105 /// Bundle, this data is part of.
5106 ScheduleBundle &Bundle;
5107
5108 public:
5109 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5110 const EdgeInfo &EI, ScheduleBundle &Bundle)
5111 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5112 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5113 static bool classof(const ScheduleEntity *Entity) {
5114 return Entity->getKind() == Kind::ScheduleCopyableData;
5115 }
5116
    /// Verify basic self-consistency properties.
5118 void verify() {
5119 if (hasValidDependencies()) {
5120 assert(UnscheduledDeps <= Dependencies && "invariant");
5121 } else {
5122 assert(UnscheduledDeps == Dependencies && "invariant");
5123 }
5124
5125 if (IsScheduled) {
5126 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5127 "unexpected scheduled state");
5128 }
5129 }
5130
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
5134 bool hasValidDependencies() const {
5135 return Dependencies != ScheduleData::InvalidDeps;
5136 }
5137
5138 /// Returns true if it is ready for scheduling, i.e. it has no more
5139 /// unscheduled depending instructions/bundles.
5140 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5141
5142 /// Modifies the number of unscheduled dependencies for this instruction,
5143 /// and returns the number of remaining dependencies for the containing
5144 /// bundle.
5145 int incrementUnscheduledDeps(int Incr) {
5146 assert(hasValidDependencies() &&
5147 "increment of unscheduled deps would be meaningless");
5148 UnscheduledDeps += Incr;
5149 assert(UnscheduledDeps >= 0 && "invariant");
5150 return UnscheduledDeps;
5151 }
5152
5153 /// Sets the number of unscheduled dependencies to the number of
5154 /// dependencies.
5155 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5156
5157 /// Gets the number of unscheduled dependencies.
5158 int getUnscheduledDeps() const { return UnscheduledDeps; }
5159 /// Gets the number of dependencies.
5160 int getDependencies() const { return Dependencies; }
5161 /// Initializes the number of dependencies.
5162 void initDependencies() { Dependencies = 0; }
5163 /// Increments the number of dependencies.
5164 void incDependencies() { Dependencies++; }
5165
5166 /// Gets scheduling region ID.
5167 int getSchedulingRegionID() const { return SchedulingRegionID; }
5168
5169 /// Gets the instruction.
5170 Instruction *getInst() const { return Inst; }
5171
5172 /// Clears all dependency information.
5173 void clearDependencies() {
5174 Dependencies = ScheduleData::InvalidDeps;
5175 UnscheduledDeps = ScheduleData::InvalidDeps;
5176 IsScheduled = false;
5177 }
5178
5179 /// Gets the edge information.
5180 const EdgeInfo &getEdgeInfo() const { return EI; }
5181
5182 /// Gets the bundle.
5183 ScheduleBundle &getBundle() { return Bundle; }
5184 const ScheduleBundle &getBundle() const { return Bundle; }
5185
5186#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5187 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5188
5189 LLVM_DUMP_METHOD void dump() const {
5190 dump(dbgs());
5191 dbgs() << '\n';
5192 }
5193#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5194
5195 private:
    /// The number of dependencies. These nodes always have only a single
    /// dependency.
5198 int Dependencies = ScheduleData::InvalidDeps;
5199
5200 /// The number of dependencies minus the number of dependencies of scheduled
5201 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5202 /// for scheduling.
5203 /// Note that this is negative as long as Dependencies is not calculated.
5204 int UnscheduledDeps = ScheduleData::InvalidDeps;
5205 };
5206
5207#ifndef NDEBUG
5208 friend inline raw_ostream &
5209 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5210 SD.dump(OS);
5211 return OS;
5212 }
5213#endif
5214
5215 friend struct GraphTraits<BoUpSLP *>;
5216 friend struct DOTGraphTraits<BoUpSLP *>;
5217
  /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, arguments, phis,
  /// or instructions from other blocks, or whose users are phis or in other
  /// blocks. The resulting vector instructions can be placed at the beginning
  /// of the basic block without scheduling (if the operands do not need to be
  /// scheduled) or at the end of the block (if the users are outside of the
  /// block). This saves some compile time and memory used by the compiler.
  /// ScheduleData is assigned to each instruction in between the boundaries
  /// of the tree entry, even to those which are not part of the graph. It is
  /// required to correctly follow the dependencies between the instructions
  /// and to ensure their correct scheduling. ScheduleData is not allocated
  /// for instructions which do not require scheduling, like phis, nodes with
  /// extractelements/insertelements only, or nodes whose instructions have
  /// uses/operands outside of the block.
5234 struct BlockScheduling {
5235 BlockScheduling(BasicBlock *BB)
5236 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5237
5238 void clear() {
5239 ScheduledBundles.clear();
5240 ScheduledBundlesList.clear();
5241 ScheduleCopyableDataMap.clear();
5242 ScheduleCopyableDataMapByInst.clear();
5243 ScheduleCopyableDataMapByInstUser.clear();
5244 ScheduleCopyableDataMapByUsers.clear();
5245 ReadyInsts.clear();
5246 ScheduleStart = nullptr;
5247 ScheduleEnd = nullptr;
5248 FirstLoadStoreInRegion = nullptr;
5249 LastLoadStoreInRegion = nullptr;
5250 RegionHasStackSave = false;
5251
5252 // Reduce the maximum schedule region size by the size of the
5253 // previous scheduling run.
5254 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5255 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5256 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5257 ScheduleRegionSize = 0;
5258
5259 // Make a new scheduling region, i.e. all existing ScheduleData is not
5260 // in the new region yet.
5261 ++SchedulingRegionID;
5262 }
5263
5264 ScheduleData *getScheduleData(Instruction *I) {
5265 if (!I)
5266 return nullptr;
5267 if (BB != I->getParent())
5268 // Avoid lookup if can't possibly be in map.
5269 return nullptr;
5270 ScheduleData *SD = ScheduleDataMap.lookup(I);
5271 if (SD && isInSchedulingRegion(*SD))
5272 return SD;
5273 return nullptr;
5274 }
5275
5276 ScheduleData *getScheduleData(Value *V) {
5277 return getScheduleData(dyn_cast<Instruction>(V));
5278 }
5279
5280 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5281 /// operand number) and value.
5282 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5283 const Value *V) const {
5284 if (ScheduleCopyableDataMap.empty())
5285 return nullptr;
5286 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5287 if (It == ScheduleCopyableDataMap.end())
5288 return nullptr;
5289 ScheduleCopyableData *SD = It->getSecond().get();
5290 if (!isInSchedulingRegion(*SD))
5291 return nullptr;
5292 return SD;
5293 }
5294
5295 /// Returns the ScheduleCopyableData for the given user \p User, operand
5296 /// number and operand \p V.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5299 const Value *V) {
5300 if (ScheduleCopyableDataMapByInstUser.empty())
5301 return {};
5302 const auto It = ScheduleCopyableDataMapByInstUser.find(
5303 std::make_pair(std::make_pair(User, OperandIdx), V));
5304 if (It == ScheduleCopyableDataMapByInstUser.end())
5305 return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
5308 if (isInSchedulingRegion(*SD))
5309 Res.push_back(SD);
5310 }
5311 return Res;
5312 }
5313
5314 /// Returns true if all operands of the given instruction \p User are
5315 /// replaced by copyable data.
5316 /// \param User The user instruction.
5317 /// \param Op The operand, which might be replaced by the copyable data.
5318 /// \param SLP The SLP tree.
5319 /// \param NumOps The number of operands used. If the instruction uses the
5320 /// same operand several times, check for the first use, then the second,
5321 /// etc.
5322 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5323 Instruction *Op, BoUpSLP &SLP,
5324 unsigned NumOps) const {
5325 assert(NumOps > 0 && "No operands");
5326 if (ScheduleCopyableDataMap.empty())
5327 return false;
5328 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5329 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5330 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5331 if (Entries.empty())
5332 return false;
5333 for (const Use &U : User->operands()) {
5334 if (U.get() != Op)
5335 continue;
5336 // Check all tree entries, if they have operands replaced by copyable
5337 // data.
5338 for (TreeEntry *TE : Entries) {
5339 unsigned Inc = 0;
5340 bool IsNonSchedulableWithParentPhiNode =
5341 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5342 TE->UserTreeIndex.UserTE->hasState() &&
5343 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5344 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
          // Count the number of unique phi nodes, which are the parents of
          // the parent entry, and exit if all the unique phis are processed.
5347 if (IsNonSchedulableWithParentPhiNode) {
5348 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5349 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5350 for (Value *V : ParentTE->Scalars) {
5351 auto *PHI = dyn_cast<PHINode>(V);
5352 if (!PHI)
5353 continue;
5354 if (ParentsUniqueUsers.insert(PHI).second &&
5355 is_contained(PHI->incoming_values(), User))
5356 ++Inc;
5357 }
5358 } else {
5359 Inc = 1;
5360 }
5361
5362 // Check if the user is commutative.
5363 // The commutatives are handled later, as their operands can be
5364 // reordered.
5365 // Same applies even for non-commutative cmps, because we can invert
5366 // their predicate potentially and, thus, reorder the operands.
5367 bool IsCommutativeUser =
5368 ::isCommutative(User) ||
5369 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5370 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5371 unsigned &OpCnt =
5372 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5373 EdgeInfo EI(TE, U.getOperandNo());
5374 if (!getScheduleCopyableData(EI, Op))
5375 continue;
5376 // Found copyable operand - continue.
5377 OpCnt += Inc;
5378 continue;
5379 }
5380 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5381 .first->getSecond() += Inc;
5382 }
5383 }
5384 if (PotentiallyReorderedEntriesCount.empty())
5385 return all_of(OrderedEntriesCount,
5386 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5387 return P.second == NumOps;
5388 });
5389 // Check the commutative/cmp entries.
5390 for (auto &P : PotentiallyReorderedEntriesCount) {
5391 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5392 bool IsNonSchedulableWithParentPhiNode =
5393 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5394 P.first->UserTreeIndex.UserTE->hasState() &&
5395 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5396 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5397 auto *It = find(P.first->Scalars, User);
5398 do {
5399 assert(It != P.first->Scalars.end() &&
5400 "User is not in the tree entry");
5401 int Lane = std::distance(P.first->Scalars.begin(), It);
5402 assert(Lane >= 0 && "Lane is not found");
5403 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5404 Lane = P.first->ReorderIndices[Lane];
5405 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5406 "Couldn't find extract lane");
        // Count the number of unique phi nodes, which are the parents of
        // the parent entry, and exit if all the unique phis are processed.
5409 if (IsNonSchedulableWithParentPhiNode) {
5410 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5411 Value *User = ParentTE->Scalars[Lane];
5412 if (!ParentsUniqueUsers.insert(User).second) {
5413 It =
5414 find(make_range(std::next(It), P.first->Scalars.end()), User);
5415 continue;
5416 }
5417 }
5418 for (unsigned OpIdx :
             seq<unsigned>(::getNumberOfPotentiallyCommutativeOps(
                 P.first->getMainOp()))) {
5421 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5422 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5423 --P.getSecond();
5424 }
5425 // If parent node is schedulable, it will be handled correctly.
5426 if (!IsNonSchedulableWithParentPhiNode)
5427 break;
5428 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5429 } while (It != P.first->Scalars.end());
5430 }
5431 return all_of(PotentiallyReorderedEntriesCount,
5432 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5433 return P.second == NumOps - 1;
5434 }) &&
5435 all_of(OrderedEntriesCount,
5436 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5437 return P.second == NumOps;
5438 });
5439 }
5440
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Instruction *I) const {
5443 if (ScheduleCopyableDataMapByInst.empty())
5444 return {};
5445 const auto It = ScheduleCopyableDataMapByInst.find(I);
5446 if (It == ScheduleCopyableDataMapByInst.end())
5447 return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
5450 if (isInSchedulingRegion(*SD))
5451 Res.push_back(SD);
5452 }
5453 return Res;
5454 }
5455
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableDataUsers(const Instruction *User) const {
5458 if (ScheduleCopyableDataMapByUsers.empty())
5459 return {};
5460 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5461 if (It == ScheduleCopyableDataMapByUsers.end())
5462 return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
5465 if (isInSchedulingRegion(*SD))
5466 Res.push_back(SD);
5467 }
5468 return Res;
5469 }
5470
5471 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5472 Instruction *I,
5473 int SchedulingRegionID,
5474 ScheduleBundle &Bundle) {
5475 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5476 ScheduleCopyableData *CD =
5477 ScheduleCopyableDataMap
5478 .try_emplace(std::make_pair(EI, I),
5479 std::make_unique<ScheduleCopyableData>(
5480 SchedulingRegionID, I, EI, Bundle))
5481 .first->getSecond()
5482 .get();
5483 ScheduleCopyableDataMapByInst[I].push_back(CD);
5484 if (EI.UserTE) {
5485 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5486 const auto *It = find(Op, I);
5487 assert(It != Op.end() && "Lane not set");
5488 SmallPtrSet<Instruction *, 4> Visited;
5489 do {
5490 int Lane = std::distance(Op.begin(), It);
5491 assert(Lane >= 0 && "Lane not set");
5492 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5493 !EI.UserTE->ReorderIndices.empty())
5494 Lane = EI.UserTE->ReorderIndices[Lane];
5495 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5496 "Couldn't find extract lane");
5497 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5498 if (!Visited.insert(In).second) {
5499 It = find(make_range(std::next(It), Op.end()), I);
5500 continue;
5501 }
5502 ScheduleCopyableDataMapByInstUser
5503 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5504 .first->getSecond()
5505 .push_back(CD);
5506 ScheduleCopyableDataMapByUsers.try_emplace(I)
5507 .first->getSecond()
5508 .insert(CD);
          // Remove extra deps for users that become non-immediate users of
          // the instruction. This may happen if a chain of the same copyable
          // elements appears in the tree.
5512 if (In == I) {
5513 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5514 if (ScheduleCopyableData *UserCD =
5515 getScheduleCopyableData(UserEI, In))
5516 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5517 }
5518 It = find(make_range(std::next(It), Op.end()), I);
5519 } while (It != Op.end());
5520 } else {
5521 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5522 CD);
5523 }
5524 return *CD;
5525 }
5526
5527 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5528 auto *I = dyn_cast<Instruction>(V);
5529 if (!I)
5530 return {};
5531 auto It = ScheduledBundles.find(I);
5532 if (It == ScheduledBundles.end())
5533 return {};
5534 return It->getSecond();
5535 }
5536
5537 /// Returns true if the entity is in the scheduling region.
5538 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5539 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5540 return Data->getSchedulingRegionID() == SchedulingRegionID;
5541 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5542 return CD->getSchedulingRegionID() == SchedulingRegionID;
5543 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5544 [&](const ScheduleEntity *BundleMember) {
5545 return isInSchedulingRegion(*BundleMember);
5546 });
5547 }
5548
5549 /// Marks an instruction as scheduled and puts all dependent ready
5550 /// instructions into the ready-list.
5551 template <typename ReadyListType>
5552 void schedule(const BoUpSLP &R, const InstructionsState &S,
5553 const EdgeInfo &EI, ScheduleEntity *Data,
5554 ReadyListType &ReadyList) {
5555 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ArrayRef<ScheduleBundle *> Bundles) {
        // Handle the def-use chain dependencies.
5558
5559 // Decrement the unscheduled counter and insert to ready list if ready.
5560 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5561 if ((IsControl || Data->hasValidDependencies()) &&
5562 Data->incrementUnscheduledDeps(-1) == 0) {
5563 // There are no more unscheduled dependencies after
5564 // decrementing, so we can put the dependent instruction
5565 // into the ready list.
5566 SmallVector<ScheduleBundle *, 1> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5569 CopyableBundle.push_back(&CD->getBundle());
5570 Bundles = CopyableBundle;
5571 } else {
5572 Bundles = getScheduleBundles(Data->getInst());
5573 }
5574 if (!Bundles.empty()) {
5575 for (ScheduleBundle *Bundle : Bundles) {
5576 if (Bundle->unscheduledDepsInBundle() == 0) {
5577 assert(!Bundle->isScheduled() &&
5578 "already scheduled bundle gets ready");
5579 ReadyList.insert(Bundle);
5581 << "SLP: gets ready: " << *Bundle << "\n");
5582 }
5583 }
5584 return;
5585 }
5586 assert(!Data->isScheduled() &&
5587 "already scheduled bundle gets ready");
5589 "Expected non-copyable data");
5590 ReadyList.insert(Data);
5591 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5592 }
5593 };
5594
5595 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5596 Instruction *I) {
5597 if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OpIdx, I);
5600 for (ScheduleCopyableData *CD : CopyableData)
5601 DecrUnsched(CD, /*IsControl=*/false);
5602 if (!CopyableData.empty())
5603 return;
5604 }
5605 if (ScheduleData *OpSD = getScheduleData(I))
5606 DecrUnsched(OpSD, /*IsControl=*/false);
5607 };
5608
5609 // If BundleMember is a vector bundle, its operands may have been
5610 // reordered during buildTree(). We therefore need to get its operands
5611 // through the TreeEntry.
5612 if (!Bundles.empty()) {
5613 auto *In = BundleMember->getInst();
5614 // Count uses of each instruction operand.
5615 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5616 unsigned TotalOpCount = 0;
5617 if (isa<ScheduleCopyableData>(BundleMember)) {
5618 // Copyable data is used only once (uses itself).
5619 TotalOpCount = OperandsUses[In] = 1;
5620 } else {
5621 for (const Use &U : In->operands()) {
5622 if (auto *I = dyn_cast<Instruction>(U.get())) {
5623 auto Res = OperandsUses.try_emplace(I, 0);
5624 ++Res.first->getSecond();
5625 ++TotalOpCount;
5626 }
5627 }
5628 }
5629 // Decrement the unscheduled counter and insert to ready list if
5630 // ready.
5631 auto DecrUnschedForInst =
5632 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5633 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5634 &Checked) {
5635 if (!ScheduleCopyableDataMap.empty()) {
5636 const EdgeInfo EI = {UserTE, OpIdx};
5637 if (ScheduleCopyableData *CD =
5638 getScheduleCopyableData(EI, I)) {
5639 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5640 return;
5641 DecrUnsched(CD, /*IsControl=*/false);
5642 return;
5643 }
5644 }
5645 auto It = OperandsUses.find(I);
5646 assert(It != OperandsUses.end() && "Operand not found");
5647 if (It->second > 0) {
5648 --It->getSecond();
5649 assert(TotalOpCount > 0 && "No more operands to decrement");
5650 --TotalOpCount;
5651 if (ScheduleData *OpSD = getScheduleData(I)) {
5652 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5653 return;
5654 DecrUnsched(OpSD, /*IsControl=*/false);
5655 }
5656 }
5657 };
5658
5659 for (ScheduleBundle *Bundle : Bundles) {
5660 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5661 break;
5662 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5663 // Need to search for the lane since the tree entry can be
5664 // reordered.
5665 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5666 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5667 bool IsNonSchedulableWithParentPhiNode =
5668 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5669 Bundle->getTreeEntry()->UserTreeIndex &&
5670 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5671 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5672 TreeEntry::SplitVectorize &&
5673 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5674 Instruction::PHI;
5675 do {
5676 int Lane =
5677 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5678 assert(Lane >= 0 && "Lane not set");
5679 if (isa<StoreInst>(In) &&
5680 !Bundle->getTreeEntry()->ReorderIndices.empty())
5681 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5682 assert(Lane < static_cast<int>(
5683 Bundle->getTreeEntry()->Scalars.size()) &&
5684 "Couldn't find extract lane");
5685
              // Since the vectorization tree is being built recursively, this
              // assertion ensures that the tree entry has all operands set
              // before reaching this code. A couple of exceptions known at
              // the moment are extracts, where the second (immediate) operand
              // is not added. Since immediates do not affect scheduler
              // behavior, this is considered okay.
5692 assert(In &&
                     (isa<ExtractValueInst, ExtractElementInst>(In) ||
                      In->getNumOperands() ==
5695 Bundle->getTreeEntry()->getNumOperands() ||
5696 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5697 "Missed TreeEntry operands?");
5698
              // Count the number of unique phi nodes, which are the parents
              // of the parent entry, and exit if all the unique phis are
              // processed.
5701 if (IsNonSchedulableWithParentPhiNode) {
5702 const TreeEntry *ParentTE =
5703 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5704 Value *User = ParentTE->Scalars[Lane];
5705 if (!ParentsUniqueUsers.insert(User).second) {
5706 It = std::find(std::next(It),
5707 Bundle->getTreeEntry()->Scalars.end(), In);
5708 continue;
5709 }
5710 }
5711
5712 for (unsigned OpIdx :
5713 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5714 if (auto *I = dyn_cast<Instruction>(
5715 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5716 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5717 << *I << "\n");
5718 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5719 }
5720 // If parent node is schedulable, it will be handled correctly.
5721 if (!IsNonSchedulableWithParentPhiNode)
5722 break;
5723 It = std::find(std::next(It),
5724 Bundle->getTreeEntry()->Scalars.end(), In);
5725 } while (It != Bundle->getTreeEntry()->Scalars.end());
5726 }
5727 } else {
5728 // If BundleMember is a stand-alone instruction, no operand reordering
5729 // has taken place, so we directly access its operands.
5730 for (Use &U : BundleMember->getInst()->operands()) {
5731 if (auto *I = dyn_cast<Instruction>(U.get())) {
5733 << "SLP: check for readiness (def): " << *I << "\n");
5734 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5735 }
5736 }
5737 }
5738 // Handle the memory dependencies.
5739 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5740 if (!SD)
5741 return;
5742 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5743 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5744 if (!VisitedMemory.insert(MemoryDep).second)
5745 continue;
5746 // There are no more unscheduled dependencies after decrementing,
5747 // so we can put the dependent instruction into the ready list.
5748 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5749 << *MemoryDep << "\n");
5750 DecrUnsched(MemoryDep);
5751 }
5752 // Handle the control dependencies.
5753 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5754 for (ScheduleData *Dep : SD->getControlDependencies()) {
5755 if (!VisitedControl.insert(Dep).second)
5756 continue;
5757 // There are no more unscheduled dependencies after decrementing,
5758 // so we can put the dependent instruction into the ready list.
5760 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5761 DecrUnsched(Dep, /*IsControl=*/true);
5762 }
5763 };
5764 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5765 SD->setScheduled(/*Scheduled=*/true);
5766 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
        SmallVector<ScheduleBundle *> Bundles;
        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
        Instruction *In = SD->getInst();
5770 if (R.isVectorized(In)) {
5771 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5772 for (TreeEntry *TE : Entries) {
            if (!TE->isCopyableElement(In) &&
                In->getNumOperands() != TE->getNumOperands())
5775 continue;
5776 auto &BundlePtr =
5777 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5778 BundlePtr->setTreeEntry(TE);
5779 BundlePtr->add(SD);
5780 Bundles.push_back(BundlePtr.get());
5781 }
5782 }
5783 ProcessBundleMember(SD, Bundles);
5784 } else {
5785 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5786 Bundle.setScheduled(/*Scheduled=*/true);
5787 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5788 auto AreAllBundlesScheduled =
5789 [&](const ScheduleEntity *SD,
5790 ArrayRef<ScheduleBundle *> SDBundles) {
              if (isa<ScheduleCopyableData>(SD))
                return true;
5793 return !SDBundles.empty() &&
5794 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5795 return SDBundle->isScheduled();
5796 });
5797 };
5798 for (ScheduleEntity *SD : Bundle.getBundle()) {
          ArrayRef<ScheduleBundle *> SDBundles;
          if (!isa<ScheduleCopyableData>(SD))
            SDBundles = getScheduleBundles(SD->getInst());
5802 if (AreAllBundlesScheduled(SD, SDBundles)) {
5803 SD->setScheduled(/*Scheduled=*/true);
5804 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5805 : SDBundles);
5806 }
5807 }
5808 }
5809 }
5810
    /// Verify basic self-consistency properties of the data structure.
5812 void verify() {
5813 if (!ScheduleStart)
5814 return;
5815
5816 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5817 ScheduleStart->comesBefore(ScheduleEnd) &&
5818 "Not a valid scheduling region?");
5819
5820 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5821 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5822 if (!Bundles.empty()) {
5823 for (ScheduleBundle *Bundle : Bundles) {
5824 assert(isInSchedulingRegion(*Bundle) &&
5825 "primary schedule data not in window?");
5826 Bundle->verify();
5827 }
5828 continue;
5829 }
5830 auto *SD = getScheduleData(I);
5831 if (!SD)
5832 continue;
5833 assert(isInSchedulingRegion(*SD) &&
5834 "primary schedule data not in window?");
5835 SD->verify();
5836 }
5837
5838 assert(all_of(ReadyInsts,
5839 [](const ScheduleEntity *Bundle) {
5840 return Bundle->isReady();
5841 }) &&
5842 "item in ready list not ready?");
5843 }
5844
5845 /// Put all instructions into the ReadyList which are ready for scheduling.
5846 template <typename ReadyListType>
5847 void initialFillReadyList(ReadyListType &ReadyList) {
5848 SmallPtrSet<ScheduleBundle *, 16> Visited;
5849 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5850 ScheduleData *SD = getScheduleData(I);
5851 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5852 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5853 !Bundles.empty()) {
5854 for (ScheduleBundle *Bundle : Bundles) {
5855 if (!Visited.insert(Bundle).second)
5856 continue;
5857 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5858 ReadyList.insert(Bundle);
5859 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5860 << *Bundle << "\n");
5861 }
5862 }
5863 continue;
5864 }
5865 ReadyList.insert(SD);
5867 << "SLP: initially in ready list: " << *SD << "\n");
5868 }
5869 }
5870 }
5871
5872 /// Build a bundle from the ScheduleData nodes corresponding to the
5873 /// scalar instruction for each lane.
5874 /// \param VL The list of scalar instructions.
5875 /// \param S The state of the instructions.
5876 /// \param EI The edge in the SLP graph or the user node/operand number.
5877 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5878 const InstructionsState &S, const EdgeInfo &EI);
5879
5880 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5881 /// cyclic dependencies. This is only a dry-run, no instructions are
5882 /// actually moved at this stage.
5883 /// \returns the scheduling bundle. The returned Optional value is not
5884 /// std::nullopt if \p VL is allowed to be scheduled.
5885 std::optional<ScheduleBundle *>
5886 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5887 const InstructionsState &S, const EdgeInfo &EI);
5888
5889 /// Allocates schedule data chunk.
5890 ScheduleData *allocateScheduleDataChunks();
5891
5892 /// Extends the scheduling region so that V is inside the region.
5893 /// \returns true if the region size is within the limit.
5894 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5895
5896 /// Initialize the ScheduleData structures for new instructions in the
5897 /// scheduling region.
5898 void initScheduleData(Instruction *FromI, Instruction *ToI,
5899 ScheduleData *PrevLoadStore,
5900 ScheduleData *NextLoadStore);
5901
5902 /// Updates the dependency information of a bundle and of all instructions/
5903 /// bundles which depend on the original bundle.
5904 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5905 BoUpSLP *SLP,
5906 ArrayRef<ScheduleData *> ControlDeps = {});
5907
    /// Sets all instructions in the scheduling region to un-scheduled.
5909 void resetSchedule();
5910
5911 BasicBlock *BB;
5912
    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5915
5916 /// The size of a ScheduleData array in ScheduleDataChunks.
5917 int ChunkSize;
5918
5919 /// The allocator position in the current chunk, which is the last entry
5920 /// of ScheduleDataChunks.
5921 int ChunkPos;
5922
5923 /// Attaches ScheduleData to Instruction.
5924 /// Note that the mapping survives during all vectorization iterations, i.e.
5925 /// ScheduleData structures are recycled.
5926 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5927
5928 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5929 /// number) and the operand instruction, represented as copyable element.
5930 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5931 std::unique_ptr<ScheduleCopyableData>>
5932 ScheduleCopyableDataMap;
5933
    /// Represents the mapping between an instruction and all related
    /// ScheduleCopyableData (for all uses in the tree represented as a
    /// copyable element). The SLP tree may contain several representations of
    /// the same instruction.
5938 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5939 ScheduleCopyableDataMapByInst;
5940
5941 /// Represents mapping between user value and operand number, the operand
5942 /// value and all related ScheduleCopyableData. The relation is 1:n, because
5943 /// the same user may reference the same operand in different tree entries
5944 /// and the operand may be modeled by different copyable data elements.
5945 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5946 SmallVector<ScheduleCopyableData *>>
5947 ScheduleCopyableDataMapByInstUser;
5948
5949 /// Represents mapping between instruction and all related
5950 /// ScheduleCopyableData. It represents the mapping between the actual
5951 /// instruction and the last copyable data element in the chain. E.g., if
5952 /// the graph models the following instructions:
5953 /// %0 = non-add instruction ...
5954 /// ...
5955 /// %4 = add %3, 1
5956 /// %5 = add %4, 1
5957 /// %6 = insertelement poison, %0, 0
5958 /// %7 = insertelement %6, %5, 1
5959 /// And the graph is modeled as:
5960 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5961 /// -> [1, 0] -> [%1, 0]
5962 ///
5963 /// this map will map %0 only to the copyable element <1>, which is the last
5964 /// user (the direct user of the actual instruction). <0> uses <1>, so <1>
5965 /// keeps the mapping to <0>, not to %0.
5966 SmallDenseMap<const Instruction *,
5967 SmallSetVector<ScheduleCopyableData *, 4>>
5968 ScheduleCopyableDataMapByUsers;
5969
5970 /// Attaches ScheduleBundle to Instruction.
5971 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5972 ScheduledBundles;
5973 /// The list of ScheduleBundles.
5974 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5975
5976 /// The ready-list for scheduling (only used for the dry-run).
5977 SetVector<ScheduleEntity *> ReadyInsts;
5978
5979 /// The first instruction of the scheduling region.
5980 Instruction *ScheduleStart = nullptr;
5981
5982 /// The first instruction _after_ the scheduling region.
5983 Instruction *ScheduleEnd = nullptr;
5984
5985 /// The first memory accessing instruction in the scheduling region
5986 /// (can be null).
5987 ScheduleData *FirstLoadStoreInRegion = nullptr;
5988
5989 /// The last memory accessing instruction in the scheduling region
5990 /// (can be null).
5991 ScheduleData *LastLoadStoreInRegion = nullptr;
5992
5993 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5994 /// region? Used to optimize the dependence calculation for the
5995 /// common case where there isn't.
5996 bool RegionHasStackSave = false;
5997
5998 /// The current size of the scheduling region.
5999 int ScheduleRegionSize = 0;
6000
6001 /// The maximum size allowed for the scheduling region.
6002 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6003
6004 /// The ID of the scheduling region. For a new vectorization iteration this
6005 /// is incremented which "removes" all ScheduleData from the region.
6006 /// Make sure that the initial SchedulingRegionID is greater than the
6007 /// initial SchedulingRegionID in ScheduleData (which is 0).
6008 int SchedulingRegionID = 1;
6009 };
6010
6011 /// Attaches the BlockScheduling structures to basic blocks.
6012 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6013
6014 /// Performs the "real" scheduling. Done before vectorization is actually
6015 /// performed in a basic block.
6016 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6017
6018 /// List of users to ignore during scheduling and that don't need extracting.
6019 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6020
6021 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6022 /// sorted SmallVectors of unsigned.
6023 struct OrdersTypeDenseMapInfo {
6024 static OrdersType getEmptyKey() {
6025 OrdersType V;
6026 V.push_back(~1U);
6027 return V;
6028 }
6029
6030 static OrdersType getTombstoneKey() {
6031 OrdersType V;
6032 V.push_back(~2U);
6033 return V;
6034 }
6035
6036 static unsigned getHashValue(const OrdersType &V) {
6037 return static_cast<unsigned>(hash_combine_range(V));
6038 }
6039
6040 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6041 return LHS == RHS;
6042 }
6043 };
6044
6045 // Analysis and block reference.
6046 Function *F;
6047 ScalarEvolution *SE;
6048 TargetTransformInfo *TTI;
6049 TargetLibraryInfo *TLI;
6050 LoopInfo *LI;
6051 DominatorTree *DT;
6052 AssumptionCache *AC;
6053 DemandedBits *DB;
6054 const DataLayout *DL;
6055 OptimizationRemarkEmitter *ORE;
6056
6057 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6058 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6059
6060 /// Instruction builder to construct the vectorized tree.
6061 IRBuilder<TargetFolder> Builder;
6062
6063 /// A map of scalar integer values to the smallest bit width with which they
6064 /// can legally be represented. The values map to (width, signed) pairs,
6065 /// where "width" indicates the minimum bit width and "signed" is True if the
6066 /// value must be signed-extended, rather than zero-extended, back to its
6067 /// original width.
6068 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6069
6070 /// Final size of the reduced vector, if the current graph represents the
6071 /// input for the reduction and it was possible to narrow the size of the
6072 /// reduction.
6073 unsigned ReductionBitWidth = 0;
6074
6075 /// Canonical graph size before the transformations.
6076 unsigned BaseGraphSize = 1;
6077
6078 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6079 /// type sizes, used in the tree.
6080 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6081
6082 /// Indices of the vectorized nodes, which are supposed to be the roots of a new
6083 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6084 DenseSet<unsigned> ExtraBitWidthNodes;
6085};
6086
6087template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6088 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6089 using SecondInfo = DenseMapInfo<unsigned>;
6090 static BoUpSLP::EdgeInfo getEmptyKey() {
6091 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6092 SecondInfo::getEmptyKey());
6093 }
6094
6095 static BoUpSLP::EdgeInfo getTombstoneKey() {
6096 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6097 SecondInfo::getTombstoneKey());
6098 }
6099
6100 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6101 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6102 SecondInfo::getHashValue(Val.EdgeIdx));
6103 }
6104
6105 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6106 const BoUpSLP::EdgeInfo &RHS) {
6107 return LHS == RHS;
6108 }
6109};
6110
6111template <> struct llvm::GraphTraits<BoUpSLP *> {
6112 using TreeEntry = BoUpSLP::TreeEntry;
6113
6114 /// NodeRef has to be a pointer per the GraphWriter.
6115 using NodeRef = TreeEntry *;
6116
6117 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6118
6119 /// Add the VectorizableTree to the index iterator to be able to return
6120 /// TreeEntry pointers.
6121 struct ChildIteratorType
6122 : public iterator_adaptor_base<
6123 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6124 ContainerTy &VectorizableTree;
6125
6126 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
6127 ContainerTy &VT)
6128 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
6129
6130 NodeRef operator*() { return I->UserTE; }
6131 };
6132
6133 static NodeRef getEntryNode(BoUpSLP &R) {
6134 return R.VectorizableTree[0].get();
6135 }
6136
6137 static ChildIteratorType child_begin(NodeRef N) {
6138 return {&N->UserTreeIndex, N->Container};
6139 }
6140
6141 static ChildIteratorType child_end(NodeRef N) {
6142 return {&N->UserTreeIndex + 1, N->Container};
6143 }
6144
6145 /// For the node iterator we just need to turn the TreeEntry iterator into a
6146 /// TreeEntry* iterator so that it dereferences to NodeRef.
6147 class nodes_iterator {
6148 using ItTy = ContainerTy::iterator;
6149 ItTy It;
6150
6151 public:
6152 nodes_iterator(const ItTy &It2) : It(It2) {}
6153 NodeRef operator*() { return It->get(); }
6154 nodes_iterator operator++() {
6155 ++It;
6156 return *this;
6157 }
6158 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6159 };
6160
6161 static nodes_iterator nodes_begin(BoUpSLP *R) {
6162 return nodes_iterator(R->VectorizableTree.begin());
6163 }
6164
6165 static nodes_iterator nodes_end(BoUpSLP *R) {
6166 return nodes_iterator(R->VectorizableTree.end());
6167 }
6168
6169 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6170};
6171
6172template <>
6173 struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6174 using TreeEntry = BoUpSLP::TreeEntry;
6175
6176 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6177
6178 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6179 std::string Str;
6180 raw_string_ostream OS(Str);
6181 OS << Entry->Idx << ".\n";
6182 if (isSplat(Entry->Scalars))
6183 OS << "<splat> ";
6184 for (auto *V : Entry->Scalars) {
6185 OS << *V;
6186 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6187 return EU.Scalar == V;
6188 }))
6189 OS << " <extract>";
6190 OS << "\n";
6191 }
6192 return Str;
6193 }
6194
6195 static std::string getNodeAttributes(const TreeEntry *Entry,
6196 const BoUpSLP *) {
6197 if (Entry->isGather())
6198 return "color=red";
6199 if (Entry->State == TreeEntry::ScatterVectorize ||
6200 Entry->State == TreeEntry::StridedVectorize ||
6201 Entry->State == TreeEntry::CompressVectorize)
6202 return "color=blue";
6203 return "";
6204 }
6205};
6206
6207 BoUpSLP::~BoUpSLP() {
6208 SmallVector<WeakTrackingVH> DeadInsts;
6209 for (auto *I : DeletedInstructions) {
6210 if (!I->getParent()) {
6211 // Temporarily insert instructions back to erase them from the parent and
6212 // from memory later.
6213 if (isa<PHINode>(I))
6214 // Phi nodes must be the very first instructions in the block.
6215 I->insertBefore(F->getEntryBlock(),
6216 F->getEntryBlock().getFirstNonPHIIt());
6217 else
6218 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6219 continue;
6220 }
6221 for (Use &U : I->operands()) {
6222 auto *Op = dyn_cast<Instruction>(U.get());
6223 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6224 wouldInstructionBeTriviallyDead(Op, TLI))
6225 DeadInsts.emplace_back(Op);
6226 }
6227 I->dropAllReferences();
6228 }
6229 for (auto *I : DeletedInstructions) {
6230 assert(I->use_empty() &&
6231 "trying to erase instruction with users.");
6232 I->eraseFromParent();
6233 }
6234
6235 // Cleanup any dead scalar code feeding the vectorized instructions
6236 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6237
6238#ifdef EXPENSIVE_CHECKS
6239 // If we could guarantee that this call is not extremely slow, we could
6240 // remove the ifdef limitation (see PR47712).
6241 assert(!verifyFunction(*F, &dbgs()));
6242#endif
6243}
6244
6245/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6246 /// contains the original mask for the scalars reused in the node. The
6247 /// procedure transforms this mask in accordance with the given \p Mask.
6248 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6249 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6250 "Expected non-empty mask.");
6251 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6252 Prev.swap(Reuses);
6253 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6254 if (Mask[I] != PoisonMaskElem)
6255 Reuses[Mask[I]] = Prev[I];
6256}
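// For illustration: with Reuses = [0, 1, 2, 3] and Mask = [2, 3, 0, 1], each
// previous element Prev[I] moves to slot Mask[I], producing [2, 3, 0, 1].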
6257
6258 /// Reorders the given \p Order according to the given \p Mask. \p Order is
6259 /// the original order of the scalars. The procedure transforms the provided
6260 /// order in accordance with the given \p Mask. If the resulting \p Order is
6261 /// just an identity order, \p Order is cleared.
6262 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6263 bool BottomOrder = false) {
6264 assert(!Mask.empty() && "Expected non-empty mask.");
6265 unsigned Sz = Mask.size();
6266 if (BottomOrder) {
6267 SmallVector<unsigned> PrevOrder;
6268 if (Order.empty()) {
6269 PrevOrder.resize(Sz);
6270 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6271 } else {
6272 PrevOrder.swap(Order);
6273 }
6274 Order.assign(Sz, Sz);
6275 for (unsigned I = 0; I < Sz; ++I)
6276 if (Mask[I] != PoisonMaskElem)
6277 Order[I] = PrevOrder[Mask[I]];
6278 if (all_of(enumerate(Order), [&](const auto &Data) {
6279 return Data.value() == Sz || Data.index() == Data.value();
6280 })) {
6281 Order.clear();
6282 return;
6283 }
6284 fixupOrderingIndices(Order);
6285 return;
6286 }
6287 SmallVector<int> MaskOrder;
6288 if (Order.empty()) {
6289 MaskOrder.resize(Sz);
6290 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6291 } else {
6292 inversePermutation(Order, MaskOrder);
6293 }
6294 reorderReuses(MaskOrder, Mask);
6295 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6296 Order.clear();
6297 return;
6298 }
6299 Order.assign(Sz, Sz);
6300 for (unsigned I = 0; I < Sz; ++I)
6301 if (MaskOrder[I] != PoisonMaskElem)
6302 Order[MaskOrder[I]] = I;
6303 fixupOrderingIndices(Order);
6304}
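// For illustration: with an initially empty (identity) Order and
// Mask = [1, 0, 3, 2], the resulting Order is [1, 0, 3, 2]; a result that is
// an identity order would be cleared instead.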
6305
6306std::optional<BoUpSLP::OrdersType>
6307BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6308 bool TopToBottom, bool IgnoreReorder) {
6309 assert(TE.isGather() && "Expected gather node only.");
6310 // Try to find subvector extract/insert patterns and reorder only such
6311 // patterns.
6312 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6313 Type *ScalarTy = GatheredScalars.front()->getType();
6314 size_t NumScalars = GatheredScalars.size();
6315 if (!isValidElementType(ScalarTy))
6316 return std::nullopt;
6317 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6318 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6319 SmallVector<int> ExtractMask;
6320 SmallVector<int> Mask;
6321 SmallVector<SmallVector<const TreeEntry *>> Entries;
6322 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
6323 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6324 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
6325 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6326 /*ForOrder=*/true);
6327 // No shuffled operands - ignore.
6328 if (GatherShuffles.empty() && ExtractShuffles.empty())
6329 return std::nullopt;
6330 OrdersType CurrentOrder(NumScalars, NumScalars);
6331 if (GatherShuffles.size() == 1 &&
6332 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6333 Entries.front().front()->isSame(TE.Scalars)) {
6334 // If the fully matched node participates in a whole-tree rotation - no need
6335 // to consider the matching order, the whole tree is rotated.
6336 if (TopToBottom)
6337 return std::nullopt;
6338 // No need to keep the order for the same user node.
6339 if (Entries.front().front()->UserTreeIndex.UserTE ==
6340 TE.UserTreeIndex.UserTE)
6341 return std::nullopt;
6342 // No need to keep the order for the matched root node, if it can be freely
6343 // reordered.
6344 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6345 return std::nullopt;
6346 // If shuffling 2 elements only and the matching node has reverse reuses -
6347 // no need to count order, both work fine.
6348 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6349 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6350 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6351 [](const auto &P) {
6352 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6353 }))
6354 return std::nullopt;
6355
6356 // Perfect match in the graph, will reuse the previously vectorized
6357 // node. Cost is 0.
6358 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6359 return CurrentOrder;
6360 }
6361 auto IsSplatMask = [](ArrayRef<int> Mask) {
6362 int SingleElt = PoisonMaskElem;
6363 return all_of(Mask, [&](int I) {
6364 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6365 SingleElt = I;
6366 return I == PoisonMaskElem || I == SingleElt;
6367 });
6368 };
6369 // Exclusive broadcast mask - ignore.
6370 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6371 (Entries.size() != 1 ||
6372 Entries.front().front()->ReorderIndices.empty())) ||
6373 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6374 return std::nullopt;
6375 SmallBitVector ShuffledSubMasks(NumParts);
6376 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6377 ArrayRef<int> Mask, int PartSz, int NumParts,
6378 function_ref<unsigned(unsigned)> GetVF) {
6379 for (int I : seq<int>(0, NumParts)) {
6380 if (ShuffledSubMasks.test(I))
6381 continue;
6382 const int VF = GetVF(I);
6383 if (VF == 0)
6384 continue;
6385 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6386 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6387 // Shuffle of at least 2 vectors - ignore.
6388 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6389 llvm::fill(Slice, NumScalars);
6390 ShuffledSubMasks.set(I);
6391 continue;
6392 }
6393 // Try to include as many elements from the mask as possible.
6394 int FirstMin = INT_MAX;
6395 bool SecondVecFound = false;
6396 for (int K : seq<int>(Limit)) {
6397 int Idx = Mask[I * PartSz + K];
6398 if (Idx == PoisonMaskElem) {
6399 Value *V = GatheredScalars[I * PartSz + K];
6400 if (isConstant(V) && !isa<PoisonValue>(V)) {
6401 SecondVecFound = true;
6402 break;
6403 }
6404 continue;
6405 }
6406 if (Idx < VF) {
6407 if (FirstMin > Idx)
6408 FirstMin = Idx;
6409 } else {
6410 SecondVecFound = true;
6411 break;
6412 }
6413 }
6414 FirstMin = (FirstMin / PartSz) * PartSz;
6415 // Shuffle of at least 2 vectors - ignore.
6416 if (SecondVecFound) {
6417 llvm::fill(Slice, NumScalars);
6418 ShuffledSubMasks.set(I);
6419 continue;
6420 }
6421 for (int K : seq<int>(Limit)) {
6422 int Idx = Mask[I * PartSz + K];
6423 if (Idx == PoisonMaskElem)
6424 continue;
6425 Idx -= FirstMin;
6426 if (Idx >= PartSz) {
6427 SecondVecFound = true;
6428 break;
6429 }
6430 if (CurrentOrder[I * PartSz + Idx] >
6431 static_cast<unsigned>(I * PartSz + K) &&
6432 CurrentOrder[I * PartSz + Idx] !=
6433 static_cast<unsigned>(I * PartSz + Idx))
6434 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6435 }
6436 // Shuffle of at least 2 vectors - ignore.
6437 if (SecondVecFound) {
6438 llvm::fill(Slice, NumScalars);
6439 ShuffledSubMasks.set(I);
6440 continue;
6441 }
6442 }
6443 };
6444 int PartSz = getPartNumElems(NumScalars, NumParts);
6445 if (!ExtractShuffles.empty())
6446 TransformMaskToOrder(
6447 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6448 if (!ExtractShuffles[I])
6449 return 0U;
6450 unsigned VF = 0;
6451 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6452 for (unsigned Idx : seq<unsigned>(Sz)) {
6453 int K = I * PartSz + Idx;
6454 if (ExtractMask[K] == PoisonMaskElem)
6455 continue;
6456 if (!TE.ReuseShuffleIndices.empty())
6457 K = TE.ReuseShuffleIndices[K];
6458 if (K == PoisonMaskElem)
6459 continue;
6460 if (!TE.ReorderIndices.empty())
6461 K = std::distance(TE.ReorderIndices.begin(),
6462 find(TE.ReorderIndices, K));
6463 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6464 if (!EI)
6465 continue;
6466 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6467 ->getElementCount()
6468 .getKnownMinValue());
6469 }
6470 return VF;
6471 });
6472 // Check special corner case - single shuffle of the same entry.
6473 if (GatherShuffles.size() == 1 && NumParts != 1) {
6474 if (ShuffledSubMasks.any())
6475 return std::nullopt;
6476 PartSz = NumScalars;
6477 NumParts = 1;
6478 }
6479 if (!Entries.empty())
6480 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6481 if (!GatherShuffles[I])
6482 return 0U;
6483 return std::max(Entries[I].front()->getVectorFactor(),
6484 Entries[I].back()->getVectorFactor());
6485 });
6486 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6487 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6488 return std::nullopt;
6489 return std::move(CurrentOrder);
6490}
6491
6492static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6493 const TargetLibraryInfo &TLI,
6494 bool CompareOpcodes = true) {
6495 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6496 getUnderlyingObject(Ptr2, RecursionMaxDepth))
6497 return false;
6498 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6499 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6500 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6501 (!GEP2 || GEP2->getNumOperands() == 2) &&
6502 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6503 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6504 !CompareOpcodes ||
6505 (GEP1 && GEP2 &&
6506 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6507}
6508
6509/// Calculates minimal alignment as a common alignment.
6510template <typename T>
6511 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6512 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6513 for (Value *V : VL)
6514 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6515 return CommonAlignment;
6516}
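// For illustration: computeCommonAlignment<LoadInst>(VL) over loads aligned
// to 16, 8 and 4 bytes returns Align(4).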
6517
6518/// Check if \p Order represents reverse order.
6519 static bool isReverseOrder(ArrayRef<unsigned> Order) {
6520 assert(!Order.empty() &&
6521 "Order is empty. Please check it before using isReverseOrder.");
6522 unsigned Sz = Order.size();
6523 return all_of(enumerate(Order), [&](const auto &Pair) {
6524 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6525 });
6526}
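// For illustration: Order = [3, 2, 1, 0] is a reverse order; entries equal to
// the order size (unset positions) are accepted as well.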
6527
6528 /// Checks if the provided list of pointers \p PointerOps represents strided
6529/// pointers for type ElemTy. If they are not, nullptr is returned.
6530/// Otherwise, SCEV* of the stride value is returned.
6531static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6532 const DataLayout &DL, ScalarEvolution &SE,
6533 SmallVectorImpl<unsigned> &SortedIndices) {
6534 SmallVector<const SCEV *> SCEVs;
6535 const SCEV *PtrSCEVLowest = nullptr;
6536 const SCEV *PtrSCEVHighest = nullptr;
6537 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6538 // addresses).
6539 for (Value *Ptr : PointerOps) {
6540 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6541 if (!PtrSCEV)
6542 return nullptr;
6543 SCEVs.push_back(PtrSCEV);
6544 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6545 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6546 continue;
6547 }
6548 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6549 if (isa<SCEVCouldNotCompute>(Diff))
6550 return nullptr;
6551 if (Diff->isNonConstantNegative()) {
6552 PtrSCEVLowest = PtrSCEV;
6553 continue;
6554 }
6555 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6556 if (isa<SCEVCouldNotCompute>(Diff1))
6557 return nullptr;
6558 if (Diff1->isNonConstantNegative()) {
6559 PtrSCEVHighest = PtrSCEV;
6560 continue;
6561 }
6562 }
6563 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6564 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6565 if (isa<SCEVCouldNotCompute>(Dist))
6566 return nullptr;
6567 int Size = DL.getTypeStoreSize(ElemTy);
6568 auto TryGetStride = [&](const SCEV *Dist,
6569 const SCEV *Multiplier) -> const SCEV * {
6570 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6571 if (M->getOperand(0) == Multiplier)
6572 return M->getOperand(1);
6573 if (M->getOperand(1) == Multiplier)
6574 return M->getOperand(0);
6575 return nullptr;
6576 }
6577 if (Multiplier == Dist)
6578 return SE.getConstant(Dist->getType(), 1);
6579 return SE.getUDivExactExpr(Dist, Multiplier);
6580 };
6581 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6582 const SCEV *Stride = nullptr;
6583 if (Size != 1 || SCEVs.size() > 2) {
6584 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6585 Stride = TryGetStride(Dist, Sz);
6586 if (!Stride)
6587 return nullptr;
6588 }
6589 if (!Stride || isa<SCEVConstant>(Stride))
6590 return nullptr;
6591 // Iterate through all pointers and check if all distances are
6592 // unique multiples of Stride.
6593 using DistOrdPair = std::pair<int64_t, int>;
6594 auto Compare = llvm::less_first();
6595 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6596 int Cnt = 0;
6597 bool IsConsecutive = true;
6598 for (const SCEV *PtrSCEV : SCEVs) {
6599 unsigned Dist = 0;
6600 if (PtrSCEV != PtrSCEVLowest) {
6601 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6602 const SCEV *Coeff = TryGetStride(Diff, Stride);
6603 if (!Coeff)
6604 return nullptr;
6605 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6606 if (!SC || isa<SCEVCouldNotCompute>(SC))
6607 return nullptr;
6608 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6609 SE.getMulExpr(Stride, SC)))
6610 ->isZero())
6611 return nullptr;
6612 Dist = SC->getAPInt().getZExtValue();
6613 }
6614 // If the strides are not the same or repeated, we can't vectorize.
6615 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6616 return nullptr;
6617 auto Res = Offsets.emplace(Dist, Cnt);
6618 if (!Res.second)
6619 return nullptr;
6620 // Consecutive order if the inserted element is the last one.
6621 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6622 ++Cnt;
6623 }
6624 if (Offsets.size() != SCEVs.size())
6625 return nullptr;
6626 SortedIndices.clear();
6627 if (!IsConsecutive) {
6628 // Fill SortedIndices array only if it is non-consecutive.
6629 SortedIndices.resize(PointerOps.size());
6630 Cnt = 0;
6631 for (const std::pair<int64_t, int> &Pair : Offsets) {
6632 SortedIndices[Cnt] = Pair.second;
6633 ++Cnt;
6634 }
6635 }
6636 return Stride;
6637}
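// For illustration (assuming a stride %s that is not a compile-time
// constant): for i8 pointers {%p, %p + 2 * %s, %p + %s, %p + 3 * %s} the
// returned stride is %s and SortedIndices becomes [0, 2, 1, 3].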
6638
6639static std::pair<InstructionCost, InstructionCost>
6640 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6641 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6642 Type *ScalarTy, VectorType *VecTy);
6643
6644/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6645 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6646/// subvector pattern.
6647static InstructionCost
6648 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6649 VectorType *Tp, ArrayRef<int> Mask = {},
6650 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6651 int Index = 0, VectorType *SubTp = nullptr,
6652 ArrayRef<Value *> Args = {}) {
6653 VectorType *DstTy = Tp;
6654 if (!Mask.empty())
6655 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6656
6657 if (Kind != TTI::SK_PermuteTwoSrc)
6658 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6659 Args);
6660 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6661 int NumSubElts;
6662 if (ShuffleVectorInst::isInsertSubvectorMask(
6663 Mask, NumSrcElts, NumSubElts, Index)) {
6664 if (Index + NumSubElts > NumSrcElts &&
6665 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6666 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6667 TTI::TCK_RecipThroughput, Index, Tp);
6668 }
6669 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6670 Args);
6671}
6672
6673/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6674/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6675/// instead of a scalar.
6676static InstructionCost
6677 getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6678 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6679 bool Extract, TTI::TargetCostKind CostKind,
6680 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6682 "ScalableVectorType is not supported.");
6683 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6684 getNumElements(Ty) &&
6685 "Incorrect usage.");
6686 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6687 assert(SLPReVec && "Only supported by REVEC.");
6688 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6689 // of CreateInsertElement.
6690 unsigned ScalarTyNumElements = VecTy->getNumElements();
6691 InstructionCost Cost = 0;
6692 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6693 if (!DemandedElts[I])
6694 continue;
6695 if (Insert)
6696 Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
6697 I * ScalarTyNumElements, VecTy);
6698 if (Extract)
6699 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
6700 I * ScalarTyNumElements, VecTy);
6701 }
6702 return Cost;
6703 }
6704 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6705 CostKind, ForPoisonSrc, VL);
6706}
6707
6708/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6709/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6710 static InstructionCost getVectorInstrCost(
6711 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6712 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6713 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6714 if (Opcode == Instruction::ExtractElement) {
6715 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6716 assert(SLPReVec && "Only supported by REVEC.");
6717 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6718 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
6719 cast<VectorType>(Val), {}, CostKind,
6720 Index * VecTy->getNumElements(), VecTy);
6721 }
6722 }
6723 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6724 ScalarUserAndIdx);
6725}
6726
6727/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6728/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6729 static InstructionCost getExtractWithExtendCost(
6730 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6731 VectorType *VecTy, unsigned Index,
6732 TTI::TargetCostKind CostKind) {
6733 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6734 assert(SLPReVec && "Only supported by REVEC.");
6735 auto *SubTp =
6736 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6737 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
6738 Index * ScalarTy->getNumElements(), SubTp) +
6739 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6740 CostKind);
6741 }
6742 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6743}
6744
6745/// Creates subvector insert. Generates shuffle using \p Generator or
6746/// using default shuffle.
6747 static Value *createInsertVector(
6748 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6749 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6750 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6751 return Vec;
6752 const unsigned SubVecVF = getNumElements(V->getType());
6753 // Create a shuffle; insertvector requires that the index is a multiple of
6754 // the subvector length.
6755 const unsigned VecVF = getNumElements(Vec->getType());
6756 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6757 if (isa<PoisonValue>(Vec)) {
6758 auto *Begin = std::next(Mask.begin(), Index);
6759 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6760 Vec = Builder.CreateShuffleVector(V, Mask);
6761 return Vec;
6762 }
6763 std::iota(Mask.begin(), Mask.end(), 0);
6764 std::iota(std::next(Mask.begin(), Index),
6765 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6766 if (Generator)
6767 return Generator(Vec, V, Mask);
6768 // 1. Resize V to the size of Vec.
6769 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6770 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6771 V = Builder.CreateShuffleVector(V, ResizeMask);
6772 // 2. Insert V into Vec.
6773 return Builder.CreateShuffleVector(Vec, V, Mask);
6774}
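// For illustration: inserting a 4-element subvector into an 8-element vector
// at Index 4 resizes V to 8 lanes and shuffles with the mask
// [0, 1, 2, 3, 8, 9, 10, 11].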
6775
6776/// Generates subvector extract using \p Generator or using default shuffle.
6777 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6778 unsigned SubVecVF, unsigned Index) {
6779 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6780 std::iota(Mask.begin(), Mask.end(), Index);
6781 return Builder.CreateShuffleVector(Vec, Mask);
6782}
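// For illustration: extracting SubVecVF = 4 elements at Index 2 uses
// Mask = [2, 3, 4, 5].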
6783
6784/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6785/// with \p Order.
6786/// \return true if the mask represents strided access, false - otherwise.
6787 static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6788 ArrayRef<unsigned> Order, Type *ScalarTy,
6789 const DataLayout &DL, ScalarEvolution &SE,
6790 SmallVectorImpl<int> &CompressMask) {
6791 const unsigned Sz = PointerOps.size();
6792 CompressMask.assign(Sz, PoisonMaskElem);
6793 // The first element is always set.
6794 CompressMask[0] = 0;
6795 // Check if the mask represents strided access.
6796 std::optional<unsigned> Stride = 0;
6797 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6798 for (unsigned I : seq<unsigned>(1, Sz)) {
6799 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6800 std::optional<int64_t> OptPos =
6801 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6802 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6803 return false;
6804 unsigned Pos = static_cast<unsigned>(*OptPos);
6805 CompressMask[I] = Pos;
6806 if (!Stride)
6807 continue;
6808 if (*Stride == 0) {
6809 *Stride = Pos;
6810 continue;
6811 }
6812 if (Pos != *Stride * I)
6813 Stride.reset();
6814 }
6815 return Stride.has_value();
6816}
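// For illustration: pointers at element offsets {0, 2, 4, 6} from the first
// pointer yield CompressMask = [0, 2, 4, 6] and report a stride (2), while
// offsets {0, 1, 5} yield CompressMask = [0, 1, 5] and report no stride.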
6817
6818/// Checks if the \p VL can be transformed to a (masked)load + compress or
6819/// (masked) interleaved load.
6820 static bool isMaskedLoadCompress(
6821 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6822 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6823 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6824 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6825 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6826 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6827 VectorType *&LoadVecTy) {
6828 InterleaveFactor = 0;
6829 Type *ScalarTy = VL.front()->getType();
6830 const size_t Sz = VL.size();
6831 auto *VecTy = getWidenedType(ScalarTy, Sz);
6832 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6833 SmallVector<int> Mask;
6834 if (!Order.empty())
6835 inversePermutation(Order, Mask);
6836 // Check external uses.
6837 for (const auto [I, V] : enumerate(VL)) {
6838 if (AreAllUsersVectorized(V))
6839 continue;
6840 InstructionCost ExtractCost =
6841 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6842 Mask.empty() ? I : Mask[I]);
6843 InstructionCost ScalarCost =
6844 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6845 if (ExtractCost <= ScalarCost)
6846 return false;
6847 }
6848 Value *Ptr0;
6849 Value *PtrN;
6850 if (Order.empty()) {
6851 Ptr0 = PointerOps.front();
6852 PtrN = PointerOps.back();
6853 } else {
6854 Ptr0 = PointerOps[Order.front()];
6855 PtrN = PointerOps[Order.back()];
6856 }
6857 std::optional<int64_t> Diff =
6858 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6859 if (!Diff)
6860 return false;
6861 const size_t MaxRegSize =
6862 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6863 .getFixedValue();
6864 // Check for very large distances between elements.
6865 if (*Diff / Sz >= MaxRegSize / 8)
6866 return false;
6867 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6868 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6869 Align CommonAlignment = LI->getAlign();
6870 IsMasked = !isSafeToLoadUnconditionally(
6871 Ptr0, LoadVecTy, CommonAlignment, DL,
6872 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6873 &TLI);
6874 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6875 LI->getPointerAddressSpace()))
6876 return false;
6877 // TODO: perform the analysis of each scalar load for better
6878 // safe-load-unconditionally analysis.
6879 bool IsStrided =
6880 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6881 assert(CompressMask.size() >= 2 && "At least two elements are required");
6882 SmallVector<Value *> OrderedPointerOps(PointerOps);
6883 if (!Order.empty())
6884 reorderScalars(OrderedPointerOps, Mask);
6885 auto [ScalarGEPCost, VectorGEPCost] =
6886 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6887 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6888 // The cost of scalar loads.
6889 InstructionCost ScalarLoadsCost =
6890 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6891 [&](InstructionCost C, Value *V) {
6892 return C + TTI.getInstructionCost(cast<Instruction>(V),
6893 CostKind);
6894 }) +
6895 ScalarGEPCost;
6896 APInt DemandedElts = APInt::getAllOnes(Sz);
6897 InstructionCost GatherCost =
6898 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6899 /*Insert=*/true,
6900 /*Extract=*/false, CostKind) +
6901 ScalarLoadsCost;
6902 InstructionCost LoadCost = 0;
6903 if (IsMasked) {
6904 LoadCost = TTI.getMemIntrinsicInstrCost(
6905 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
6906 CommonAlignment,
6907 LI->getPointerAddressSpace()),
6908 CostKind);
6909 } else {
6910 LoadCost =
6911 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6912 LI->getPointerAddressSpace(), CostKind);
6913 }
6914 if (IsStrided && !IsMasked && Order.empty()) {
6915 // Check for potential segmented (interleaved) loads.
6916 VectorType *AlignedLoadVecTy = getWidenedType(
6917 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6918 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6919 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6920 &TLI))
6921 AlignedLoadVecTy = LoadVecTy;
6922 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6923 CommonAlignment,
6924 LI->getPointerAddressSpace())) {
6925 InstructionCost InterleavedCost =
6926 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6927 Instruction::Load, AlignedLoadVecTy,
6928 CompressMask[1], {}, CommonAlignment,
6929 LI->getPointerAddressSpace(), CostKind, IsMasked);
6930 if (InterleavedCost < GatherCost) {
6931 InterleaveFactor = CompressMask[1];
6932 LoadVecTy = AlignedLoadVecTy;
6933 return true;
6934 }
6935 }
6936 }
6937 InstructionCost CompressCost = ::getShuffleCost(
6938 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6939 if (!Order.empty()) {
6940 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6941 for (unsigned I : seq<unsigned>(Sz)) {
6942 NewMask[I] = CompressMask[Mask[I]];
6943 }
6944 CompressMask.swap(NewMask);
6945 }
6946 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6947 return TotalVecCost < GatherCost;
6948}
6949
6950/// Checks if the \p VL can be transformed to a (masked)load + compress or
6951/// (masked) interleaved load.
6952static bool
6953 isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6954 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6955 const DataLayout &DL, ScalarEvolution &SE,
6956 AssumptionCache &AC, const DominatorTree &DT,
6957 const TargetLibraryInfo &TLI,
6958 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6959 bool IsMasked;
6960 unsigned InterleaveFactor;
6961 SmallVector<int> CompressMask;
6962 VectorType *LoadVecTy;
6963 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6964 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6965 CompressMask, LoadVecTy);
6966}
6967
6968/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6969/// PointerOps:
6970/// 1. Target with strided load support is detected.
6971/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6972/// potential stride <= MaxProfitableLoadStride and the potential stride is
6973/// power-of-2 (to avoid perf regressions for the very small number of loads)
6974/// and max distance > number of loads, or potential stride is -1.
6975/// 3. The loads are ordered, or number of unordered loads <=
6976/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6977/// to avoid extra costs for very expensive shuffles).
6978 /// 4. Any pointer operand is an instruction with users outside of the
6979/// current graph (for masked gathers extra extractelement instructions
6980/// might be required).
6981 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
6982 Align Alignment, const int64_t Diff,
6983 const size_t Sz) const {
6984 if (Diff % (Sz - 1) != 0)
6985 return false;
6986
6987 // Try to generate strided load node.
6988 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6989 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6990 return !isVectorized(U) && !MustGather.contains(U);
6991 });
6992 });
6993
6994 const uint64_t AbsoluteDiff = std::abs(Diff);
6995 auto *VecTy = getWidenedType(ScalarTy, Sz);
6996 if (IsAnyPointerUsedOutGraph ||
6997 (AbsoluteDiff > Sz &&
6998 (Sz > MinProfitableStridedLoads ||
6999 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7000 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7001 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7002 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7003 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7004 return false;
7005 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7006 return false;
7007 return true;
7008 }
7009 return false;
7010}
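// For illustration: four consecutive loads in reverse order give Diff == -3,
// which matches Diff == -(Sz - 1) and corresponds to a stride of -1 element.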
7011
7012 bool BoUpSLP::analyzeConstantStrideCandidate(
7013 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7014 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7015 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7016 const size_t Sz = PointerOps.size();
7017 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7018 // Go through `PointerOps` in sorted order and record offsets from `Ptr0`.
7019 for (unsigned I : seq<unsigned>(Sz)) {
7020 Value *Ptr =
7021 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7022 SortedOffsetsFromBase[I] =
7023 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
7024 }
7025
7026 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7027 // ```
7028 // [
7029 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7030 // (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7031 // ...
7032 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7033 // GroupSize - 1}), // last group
7034 // ]
7035 // ```
7036 // The distance between consecutive elements within each group should all be
7037 // the same `StrideWithinGroup`. The distance between the first elements of
7038 // consecutive groups should all be the same `StrideBetweenGroups`.
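// For illustration: SortedOffsetsFromBase = [0, 1, 16, 17, 32, 33] forms
// three groups of size 2 with StrideWithinGroup = 1 and
// StrideBetweenGroups = 16, so the six loads can be widened to three
// double-width strided elements.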
7039
7040 int64_t StrideWithinGroup =
7041 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7042 // Determine size of the first group. Later we will check that all other
7043 // groups have the same size.
7044 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7045 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7046 StrideWithinGroup;
7047 };
7048 auto Indices = seq<unsigned>(1, Sz);
7049 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7050 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7051
7052 unsigned VecSz = Sz;
7053 Type *NewScalarTy = ScalarTy;
7054
7055 // Quick detour: at this point we can say what the type of strided load would
7056 // be if all the checks pass. Check if this type is legal for the target.
7057 bool NeedsWidening = Sz != GroupSize;
7058 if (NeedsWidening) {
7059 if (Sz % GroupSize != 0)
7060 return false;
7061
7062 if (StrideWithinGroup != 1)
7063 return false;
7064 VecSz = Sz / GroupSize;
7065 NewScalarTy = Type::getIntNTy(
7066 SE->getContext(),
7067 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7068 }
7069
7070 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7071 return false;
7072
7073 int64_t StrideIntVal = StrideWithinGroup;
7074 if (NeedsWidening) {
7075 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7076 // Check that the strides between groups are all the same.
7077 unsigned CurrentGroupStartIdx = GroupSize;
7078 int64_t StrideBetweenGroups =
7079 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7080 StrideIntVal = StrideBetweenGroups;
7081 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7082 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7083 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7084 StrideBetweenGroups)
7085 return false;
7086 }
7087
7088 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7089 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7090 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7091 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7092 return GroupEndIdx - StartIdx == GroupSize;
7093 };
7094 for (unsigned I = 0; I < Sz; I += GroupSize) {
7095 if (!CheckGroup(I))
7096 return false;
7097 }
7098 }
7099
7100 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7101 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
7102 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7103 return true;
7104}
7105
7106 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
7107 Type *ScalarTy, Align CommonAlignment,
7108 SmallVectorImpl<unsigned> &SortedIndices,
7109 StridedPtrInfo &SPtrInfo) const {
7110 const unsigned Sz = PointerOps.size();
7111 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
7112 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7113 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7114 return false;
7115 if (const SCEV *Stride =
7116 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
7117 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
7118 SPtrInfo.StrideSCEV = Stride;
7119 return true;
7120 }
7121 return false;
7122}
7123
7124 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
7125 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7126 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7127 unsigned *BestVF, bool TryRecursiveCheck) const {
7128 // Check that a vectorized load would load the same memory as a scalar
7129 // load. For example, we don't want to vectorize loads that are smaller
7130 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7131 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7132 // from such a struct, we read/write packed bits disagreeing with the
7133 // unvectorized version.
7134 if (BestVF)
7135 *BestVF = 0;
7136 if (areKnownNonVectorizableLoads(VL))
7137 return LoadsState::Gather;
7138 Type *ScalarTy = VL0->getType();
7139
7140 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7141 return LoadsState::Gather;
7142
7143 // Make sure all loads in the bundle are simple - we can't vectorize
7144 // atomic or volatile loads.
7145 PointerOps.clear();
7146 const size_t Sz = VL.size();
7147 PointerOps.resize(Sz);
7148 auto *POIter = PointerOps.begin();
7149 for (Value *V : VL) {
7150 auto *L = dyn_cast<LoadInst>(V);
7151 if (!L || !L->isSimple())
7152 return LoadsState::Gather;
7153 *POIter = L->getPointerOperand();
7154 ++POIter;
7155 }
7156
7157 Order.clear();
7158 // Check the order of pointer operands or that all pointers are the same.
7159 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7160
7161 auto *VecTy = getWidenedType(ScalarTy, Sz);
7162 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7163 if (!IsSorted) {
7164 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7165 SPtrInfo))
7166 return LoadsState::StridedVectorize;
7167
7168 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7169 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7170 return LoadsState::Gather;
7171
7172 if (!all_of(PointerOps, [&](Value *P) {
7173 return arePointersCompatible(P, PointerOps.front(), *TLI);
7174 }))
7175 return LoadsState::Gather;
7176
7177 } else {
7178 Value *Ptr0;
7179 Value *PtrN;
7180 if (Order.empty()) {
7181 Ptr0 = PointerOps.front();
7182 PtrN = PointerOps.back();
7183 } else {
7184 Ptr0 = PointerOps[Order.front()];
7185 PtrN = PointerOps[Order.back()];
7186 }
7187 std::optional<int64_t> Diff =
7188 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7189 // Check that the sorted loads are consecutive.
7190 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7191 return LoadsState::Vectorize;
7192 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7193 *TLI, [&](Value *V) {
7194 return areAllUsersVectorized(
7195 cast<Instruction>(V), UserIgnoreList);
7196 }))
7197 return LoadsState::CompressVectorize;
7198 Align Alignment =
7199 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7200 ->getAlign();
7201 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7202 *Diff, Ptr0, PtrN, SPtrInfo))
7203 return LoadsState::StridedVectorize;
7204 }
7205 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7206 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7207 return LoadsState::Gather;
7208 // Correctly compare the cost of loads + shuffles with the cost of
7209 // strided/masked gather loads. Returns true if the vectorized + shuffles
7210 // representation is better than just gather.
7211 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7212 unsigned *BestVF,
7213 bool ProfitableGatherPointers) {
7214 if (BestVF)
7215 *BestVF = 0;
7216 // Compare masked gather cost and loads + insert subvector costs.
7217 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7218 auto [ScalarGEPCost, VectorGEPCost] =
7219 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7220 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7221 // Estimate the cost of masked gather GEP. If not a splat, roughly
7222 // estimate as a buildvector, otherwise estimate as splat.
7223 APInt DemandedElts = APInt::getAllOnes(Sz);
7224 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7225 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7226 if (static_cast<unsigned>(count_if(
7227 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7228 any_of(PointerOps, [&](Value *V) {
7229 return getUnderlyingObject(V) !=
7230 getUnderlyingObject(PointerOps.front());
7231 }))
7232 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7233 DemandedElts, /*Insert=*/true,
7234 /*Extract=*/false, CostKind);
7235 else
7236 VectorGEPCost +=
7237 getScalarizationOverhead(
7238 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7239 /*Insert=*/true, /*Extract=*/false, CostKind) +
7240 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7241 // The cost of scalar loads.
7242 InstructionCost ScalarLoadsCost =
7243 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7244 [&](InstructionCost C, Value *V) {
7245 return C + TTI.getInstructionCost(
7246 cast<Instruction>(V), CostKind);
7247 }) +
7248 ScalarGEPCost;
7249 // The cost of masked gather.
7250 InstructionCost MaskedGatherCost =
7251 TTI.getMemIntrinsicInstrCost(
7252 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7253 PointerOps.front(),
7254 /*VariableMask=*/false, CommonAlignment),
7255 CostKind) +
7256 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7257 InstructionCost GatherCost =
7258 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7259 /*Insert=*/true,
7260 /*Extract=*/false, CostKind) +
7261 ScalarLoadsCost;
7262 // The list of loads is small or a partial check was already performed -
7263 // directly compare the masked gather cost and the gather cost.
7264 constexpr unsigned ListLimit = 4;
7265 if (!TryRecursiveCheck || VL.size() < ListLimit)
7266 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7267
7268 // FIXME: The following code has not been updated for non-power-of-2
7269 // vectors (and not whole registers). The splitting logic here does not
7270 // cover the original vector if the vector factor is not a power of two.
7271 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7272 return false;
7273
7274 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7275 unsigned MinVF = getMinVF(2 * Sz);
7276 DemandedElts.clearAllBits();
7277 // Iterate through possible vectorization factors and check if vectorized +
7278 // shuffles is better than just gather.
7279 for (unsigned VF =
7280 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7281 VF >= MinVF;
7282 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7283 SmallVector<LoadsState> States;
7284 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7285 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7286 SmallVector<unsigned> Order;
7287 SmallVector<Value *> PointerOps;
7288 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7289 PointerOps, SPtrInfo, BestVF,
7290 /*TryRecursiveCheck=*/false);
7291 // Check that the sorted loads are consecutive.
7292 if (LS == LoadsState::Gather) {
7293 if (BestVF) {
7294 DemandedElts.setAllBits();
7295 break;
7296 }
7297 DemandedElts.setBits(Cnt, Cnt + VF);
7298 continue;
7299 }
7300 // If a reorder is needed - consider it as a high-cost masked gather for now.
7301 if ((LS == LoadsState::Vectorize ||
7302 LS == LoadsState::StridedVectorize ||
7303 LS == LoadsState::CompressVectorize) &&
7304 !Order.empty() && !isReverseOrder(Order))
7305 LS = LoadsState::ScatterVectorize;
7306 States.push_back(LS);
7307 }
7308 if (DemandedElts.isAllOnes())
7309 // All loads gathered - try smaller VF.
7310 continue;
7311 // Can be vectorized later as a series of loads/insertelements.
7312 InstructionCost VecLdCost = 0;
7313 if (!DemandedElts.isZero()) {
7314 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7315 /*Insert=*/true,
7316 /*Extract=*/false, CostKind) +
7317 ScalarGEPCost;
7318 for (unsigned Idx : seq<unsigned>(VL.size()))
7319 if (DemandedElts[Idx])
7320 VecLdCost +=
7321 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7322 }
7323 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7324 for (auto [I, LS] : enumerate(States)) {
7325 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7326 InstructionCost VectorGEPCost =
7327 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7328 ? 0
7329 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7330 LI0->getPointerOperand(),
7331 Instruction::GetElementPtr, CostKind, ScalarTy,
7332 SubVecTy)
7333 .second;
7334 if (LS == LoadsState::ScatterVectorize) {
7335 if (static_cast<unsigned>(
7336 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7337 PointerOps.size() - 1 ||
7338 any_of(PointerOps, [&](Value *V) {
7339 return getUnderlyingObject(V) !=
7340 getUnderlyingObject(PointerOps.front());
7341 }))
7342 VectorGEPCost += getScalarizationOverhead(
7343 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7344 /*Insert=*/true, /*Extract=*/false, CostKind);
7345 else
7346 VectorGEPCost +=
7347 getScalarizationOverhead(
7348 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7349 /*Insert=*/true, /*Extract=*/false, CostKind) +
7350 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7351 CostKind);
7352 }
7353 switch (LS) {
7354 case LoadsState::Vectorize:
7355 VecLdCost +=
7356 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7357 LI0->getPointerAddressSpace(), CostKind,
7358 TTI::OperandValueInfo()) +
7359 VectorGEPCost;
7360 break;
7361 case LoadsState::StridedVectorize:
7362 VecLdCost += TTI.getMemIntrinsicInstrCost(
7363 MemIntrinsicCostAttributes(
7364 Intrinsic::experimental_vp_strided_load,
7365 SubVecTy, LI0->getPointerOperand(),
7366 /*VariableMask=*/false, CommonAlignment),
7367 CostKind) +
7368 VectorGEPCost;
7369 break;
7370 case LoadsState::CompressVectorize:
7371 VecLdCost += TTI.getMemIntrinsicInstrCost(
7372 MemIntrinsicCostAttributes(
7373 Intrinsic::masked_load, SubVecTy,
7374 CommonAlignment, LI0->getPointerAddressSpace()),
7375 CostKind) +
7376 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7377 {}, CostKind);
7378 break;
7379 case LoadsState::ScatterVectorize:
7380 VecLdCost += TTI.getMemIntrinsicInstrCost(
7381 MemIntrinsicCostAttributes(
7382 Intrinsic::masked_gather, SubVecTy,
7383 LI0->getPointerOperand(),
7384 /*VariableMask=*/false, CommonAlignment),
7385 CostKind) +
7386 VectorGEPCost;
7387 break;
7388 case LoadsState::Gather:
7389 // Gathers are already calculated - ignore.
7390 continue;
7391 }
7392 SmallVector<int> ShuffleMask(VL.size());
7393 for (int Idx : seq<int>(0, VL.size()))
7394 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7395 if (I > 0)
7396 VecLdCost +=
7397 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7398 CostKind, I * VF, SubVecTy);
7399 }
7400 // If masked gather cost is higher - better to vectorize, so
7401 // consider it as a gather node. It will be better estimated
7402 // later.
7403 if (MaskedGatherCost >= VecLdCost &&
7404 VecLdCost - GatherCost < -SLPCostThreshold) {
7405 if (BestVF)
7406 *BestVF = VF;
7407 return true;
7408 }
7409 }
7410 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7411 };
7412 // TODO: need to improve analysis of the pointers, if not all of them are
7413 // GEPs or have > 2 operands, we end up with a gather node, which just
7414 // increases the cost.
7415 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7416 bool ProfitableGatherPointers =
7417 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7418 return L->isLoopInvariant(V);
7419 })) <= Sz / 2;
7420 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7421 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7422 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7423 (GEP && GEP->getNumOperands() == 2 &&
7424 isa<Constant, Instruction>(GEP->getOperand(1)));
7425 })) {
7426 // Check if potential masked gather can be represented as series
7427 // of loads + insertsubvectors.
7428 // If masked gather cost is higher - better to vectorize, so
7429 // consider it as a gather node. It will be better estimated
7430 // later.
7431 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7432 ProfitableGatherPointers))
7433 return LoadsState::ScatterVectorize;
7434 }
7435
7436 return LoadsState::Gather;
7437}
7438
7439 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7440 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7441 const DataLayout &DL, ScalarEvolution &SE,
7442 SmallVectorImpl<unsigned> &SortedIndices) {
7443 assert(
7444 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7445 "Expected list of pointer operands.");
7446 // Map from bases to vectors of (Ptr, Offset, OrigIdx). Each pointer is
7447 // inserted into the vector for its base; the vectors are sorted and the
7448 // sorted indices are returned so that related values sit next to one another.
7449 SmallMapVector<
7450 std::pair<BasicBlock *, Value *>,
7451 SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
7452 Bases;
7453 Bases
7454 .try_emplace(std::make_pair(
7455 BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
7456 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7457
7458 SortedIndices.clear();
7459 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7460 auto Key = std::make_pair(BBs[Cnt + 1],
7461 getUnderlyingObject(Ptr, RecursionMaxDepth));
7462 bool Found = any_of(Bases.try_emplace(Key).first->second,
7463 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7464 std::optional<int64_t> Diff =
7465 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7466 ElemTy, Ptr, DL, SE,
7467 /*StrictCheck=*/true);
7468 if (!Diff)
7469 return false;
7470
7471 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7472 return true;
7473 });
7474
7475 if (!Found) {
7476 // If we haven't found enough to usefully cluster, return early.
7477 if (Bases.size() > VL.size() / 2 - 1)
7478 return false;
7479
7480 // Not found already - add a new Base
7481 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7482 }
7483 }
7484
7485 if (Bases.size() == VL.size())
7486 return false;
7487
7488 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7489 Bases.front().second.size() == VL.size()))
7490 return false;
7491
7492 // For each of the bases sort the pointers by Offset and check if any of the
7493 // bases become consecutively allocated.
7494 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7495 SmallPtrSet<Value *, 13> FirstPointers;
7496 SmallPtrSet<Value *, 13> SecondPointers;
7497 Value *P1 = Ptr1;
7498 Value *P2 = Ptr2;
7499 unsigned Depth = 0;
7500 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7501 if (P1 == P2 || Depth > RecursionMaxDepth)
7502 return false;
7503 FirstPointers.insert(P1);
7504 SecondPointers.insert(P2);
7505 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7506 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7507 ++Depth;
7508 }
7509 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7510 "Unable to find matching root.");
7511 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7512 };
7513 for (auto &Base : Bases) {
7514 for (auto &Vec : Base.second) {
7515 if (Vec.size() > 1) {
7516 stable_sort(Vec, less_second());
7517 int64_t InitialOffset = std::get<1>(Vec[0]);
7518 bool AnyConsecutive =
7519 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7520 return std::get<1>(P.value()) ==
7521 int64_t(P.index()) + InitialOffset;
7522 });
7523 // Fill SortedIndices array only if it looks worthwhile to sort the
7524 // ptrs.
7525 if (!AnyConsecutive)
7526 return false;
7527 }
7528 }
7529 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7530 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7531 });
7532 }
7533
7534 for (auto &T : Bases)
7535 for (const auto &Vec : T.second)
7536 for (const auto &P : Vec)
7537 SortedIndices.push_back(std::get<2>(P));
7538
7539 assert(SortedIndices.size() == VL.size() &&
7540 "Expected SortedIndices to be the size of VL");
7541 return true;
7542}
7543
7544std::optional<BoUpSLP::OrdersType>
7545BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7546 assert(TE.isGather() && "Expected gather node only.");
7547 Type *ScalarTy = TE.Scalars[0]->getType();
7548
7550 Ptrs.reserve(TE.Scalars.size());
7551 SmallVector<BasicBlock *> BBs;
7552 BBs.reserve(TE.Scalars.size());
7553 for (Value *V : TE.Scalars) {
7554 auto *L = dyn_cast<LoadInst>(V);
7555 if (!L || !L->isSimple())
7556 return std::nullopt;
7557 Ptrs.push_back(L->getPointerOperand());
7558 BBs.push_back(L->getParent());
7559 }
7560
7561 BoUpSLP::OrdersType Order;
7562 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7563 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7564 return std::move(Order);
7565 return std::nullopt;
7566}
7567
7568 /// Check if two insertelement instructions are from the same buildvector.
7569 static bool areTwoInsertFromSameBuildVector(
7570 InsertElementInst *VU, InsertElementInst *V,
7571 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7572 // Instructions must be from the same basic blocks.
7573 if (VU->getParent() != V->getParent())
7574 return false;
7575 // Checks if 2 insertelements are from the same buildvector.
7576 if (VU->getType() != V->getType())
7577 return false;
7578 // Multiple used inserts are separate nodes.
7579 if (!VU->hasOneUse() && !V->hasOneUse())
7580 return false;
7581 auto *IE1 = VU;
7582 auto *IE2 = V;
7583 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7584 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7585 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7586 return false;
7587 // Go through the vector operand of insertelement instructions trying to find
7588 // either VU as the original vector for IE2 or V as the original vector for
7589 // IE1.
7590 SmallBitVector ReusedIdx(
7591 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7592 bool IsReusedIdx = false;
7593 do {
7594 if (IE2 == VU && !IE1)
7595 return VU->hasOneUse();
7596 if (IE1 == V && !IE2)
7597 return V->hasOneUse();
7598 if (IE1 && IE1 != V) {
7599 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7600 IsReusedIdx |= ReusedIdx.test(Idx1);
7601 ReusedIdx.set(Idx1);
7602 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7603 IE1 = nullptr;
7604 else
7605 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7606 }
7607 if (IE2 && IE2 != VU) {
7608 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7609 IsReusedIdx |= ReusedIdx.test(Idx2);
7610 ReusedIdx.set(Idx2);
7611 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7612 IE2 = nullptr;
7613 else
7614 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7615 }
7616 } while (!IsReusedIdx && (IE1 || IE2));
7617 return false;
7618}
7619
7620/// Checks if the specified instruction \p I is an alternate operation for
7621/// the given \p MainOp and \p AltOp instructions.
7622static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7623 Instruction *AltOp,
7624 const TargetLibraryInfo &TLI);
7625
7626std::optional<BoUpSLP::OrdersType>
7627BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7628 bool IgnoreReorder) {
7629 // No need to reorder if we need to shuffle reuses; the node still needs
7630 // to be shuffled anyway.
7631 if (!TE.ReuseShuffleIndices.empty()) {
7632 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7633 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7634 "Reshuffling scalars not yet supported for nodes with padding");
7635
7636 if (isSplat(TE.Scalars))
7637 return std::nullopt;
7638 // Check if reuse shuffle indices can be improved by reordering.
7639 // For this, check that the reuse mask is "clustered", i.e. each scalar
7640 // value is used once in each submask of size <number_of_scalars>.
7641 // Example: 4 scalar values.
7642 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7643 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7644 // element 3 is used twice in the second submask.
7645 unsigned Sz = TE.Scalars.size();
7646 if (TE.isGather()) {
7647 if (std::optional<OrdersType> CurrentOrder =
7648 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7649 SmallVector<int> Mask;
7650 fixupOrderingIndices(*CurrentOrder);
7651 inversePermutation(*CurrentOrder, Mask);
7652 ::addMask(Mask, TE.ReuseShuffleIndices);
7653 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7654 unsigned Sz = TE.Scalars.size();
7655 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7656 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7657 if (Idx != PoisonMaskElem)
7658 Res[Idx + K * Sz] = I + K * Sz;
7659 }
7660 return std::move(Res);
7661 }
7662 }
7663 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7664 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7665 2 * TE.getVectorFactor())) == 1)
7666 return std::nullopt;
7667 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7668 return std::nullopt;
7669 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7670 Sz)) {
7671 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7672 if (TE.ReorderIndices.empty())
7673 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7674 else
7675 inversePermutation(TE.ReorderIndices, ReorderMask);
7676 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7677 unsigned VF = ReorderMask.size();
7678 OrdersType ResOrder(VF, VF);
7679 unsigned NumParts = divideCeil(VF, Sz);
7680 SmallBitVector UsedVals(NumParts);
7681 for (unsigned I = 0; I < VF; I += Sz) {
7682 int Val = PoisonMaskElem;
7683 unsigned UndefCnt = 0;
7684 unsigned Limit = std::min(Sz, VF - I);
7685 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7686 [&](int Idx) {
7687 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7688 Val = Idx;
7689 if (Idx == PoisonMaskElem)
7690 ++UndefCnt;
7691 return Idx != PoisonMaskElem && Idx != Val;
7692 }) ||
7693 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7694 UndefCnt > Sz / 2)
7695 return std::nullopt;
7696 UsedVals.set(Val);
7697 for (unsigned K = 0; K < NumParts; ++K) {
7698 unsigned Idx = Val + Sz * K;
7699 if (Idx < VF && I + K < VF)
7700 ResOrder[Idx] = I + K;
7701 }
7702 }
7703 return std::move(ResOrder);
7704 }
7705 unsigned VF = TE.getVectorFactor();
7706 // Try to build the correct order for extractelement instructions.
7707 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7708 TE.ReuseShuffleIndices.end());
7709 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7710 all_of(TE.Scalars, [Sz](Value *V) {
7711 if (isa<PoisonValue>(V))
7712 return true;
7713 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7714 return Idx && *Idx < Sz;
7715 })) {
7716 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7717 "by BinaryOperator and CastInst.");
7718 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7719 if (TE.ReorderIndices.empty())
7720 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7721 else
7722 inversePermutation(TE.ReorderIndices, ReorderMask);
7723 for (unsigned I = 0; I < VF; ++I) {
7724 int &Idx = ReusedMask[I];
7725 if (Idx == PoisonMaskElem)
7726 continue;
7727 Value *V = TE.Scalars[ReorderMask[Idx]];
7728 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7729 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7730 }
7731 }
7732 // Build the order of the VF size; the reuses shuffles need to be
7733 // reordered too, as they are always of VF size.
7734 OrdersType ResOrder(VF);
7735 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7736 auto *It = ResOrder.begin();
7737 for (unsigned K = 0; K < VF; K += Sz) {
7738 OrdersType CurrentOrder(TE.ReorderIndices);
7739 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7740 if (SubMask.front() == PoisonMaskElem)
7741 std::iota(SubMask.begin(), SubMask.end(), 0);
7742 reorderOrder(CurrentOrder, SubMask);
7743 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7744 std::advance(It, Sz);
7745 }
7746 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7747 return Data.index() == Data.value();
7748 }))
7749 return std::nullopt; // No need to reorder.
7750 return std::move(ResOrder);
7751 }
7752 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7753 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7754 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7755 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7756 return std::nullopt;
7757 if (TE.State == TreeEntry::SplitVectorize ||
7758 ((TE.State == TreeEntry::Vectorize ||
7759 TE.State == TreeEntry::StridedVectorize ||
7760 TE.State == TreeEntry::CompressVectorize) &&
7761 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7762 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7763 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7764 "Alternate instructions are only supported by "
7765 "BinaryOperator and CastInst.");
7766 return TE.ReorderIndices;
7767 }
7768 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7769 TE.isAltShuffle()) {
7770 assert(TE.ReuseShuffleIndices.empty() &&
7771 "ReuseShuffleIndices should be "
7772 "empty for alternate instructions.");
7773 SmallVector<int> Mask;
7774 TE.buildAltOpShuffleMask(
7775 [&](Instruction *I) {
7776 assert(TE.getMatchingMainOpOrAltOp(I) &&
7777 "Unexpected main/alternate opcode");
7778 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7779 },
7780 Mask);
7781 const int VF = TE.getVectorFactor();
7782 OrdersType ResOrder(VF, VF);
7783 for (unsigned I : seq<unsigned>(VF)) {
7784 if (Mask[I] == PoisonMaskElem)
7785 continue;
7786 ResOrder[Mask[I] % VF] = I;
7787 }
7788 return std::move(ResOrder);
7789 }
7790 if (!TE.ReorderIndices.empty())
7791 return TE.ReorderIndices;
7792 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7793 if (!TE.ReorderIndices.empty())
7794 return TE.ReorderIndices;
7795
7796 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7797 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7798 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7799 continue;
7800 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7801 if (!II)
7802 continue;
7803 Instruction *BVHead = nullptr;
7804 BasicBlock *BB = II->getParent();
7805 while (II && II->hasOneUse() && II->getParent() == BB) {
7806 BVHead = II;
7807 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7808 }
7809 I = BVHead;
7810 }
7811
7812 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7813 assert(BB1 != BB2 && "Expected different basic blocks.");
7814 if (!DT->isReachableFromEntry(BB1))
7815 return false;
7816 if (!DT->isReachableFromEntry(BB2))
7817 return true;
7818 auto *NodeA = DT->getNode(BB1);
7819 auto *NodeB = DT->getNode(BB2);
7820 assert(NodeA && "Should only process reachable instructions");
7821 assert(NodeB && "Should only process reachable instructions");
7822 assert((NodeA == NodeB) ==
7823 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7824 "Different nodes should have different DFS numbers");
7825 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7826 };
7827 auto PHICompare = [&](unsigned I1, unsigned I2) {
7828 Value *V1 = TE.Scalars[I1];
7829 Value *V2 = TE.Scalars[I2];
7830 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7831 return false;
7832 if (isa<PoisonValue>(V1))
7833 return true;
7834 if (isa<PoisonValue>(V2))
7835 return false;
7836 if (V1->getNumUses() < V2->getNumUses())
7837 return true;
7838 if (V1->getNumUses() > V2->getNumUses())
7839 return false;
7840 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7841 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7842 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7843 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7844 FirstUserOfPhi2->getParent());
7845 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7846 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7847 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7848 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7849 if (IE1 && !IE2)
7850 return true;
7851 if (!IE1 && IE2)
7852 return false;
7853 if (IE1 && IE2) {
7854 if (UserBVHead[I1] && !UserBVHead[I2])
7855 return true;
7856 if (!UserBVHead[I1])
7857 return false;
7858 if (UserBVHead[I1] == UserBVHead[I2])
7859 return getElementIndex(IE1) < getElementIndex(IE2);
7860 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7861 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7862 UserBVHead[I2]->getParent());
7863 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7864 }
7865 if (EE1 && !EE2)
7866 return true;
7867 if (!EE1 && EE2)
7868 return false;
7869 if (EE1 && EE2) {
7870 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7871 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7872 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7873 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7874 if (!Inst2 && !P2)
7875 return Inst1 || P1;
7876 if (EE1->getOperand(0) == EE2->getOperand(0))
7877 return getElementIndex(EE1) < getElementIndex(EE2);
7878 if (!Inst1 && Inst2)
7879 return false;
7880 if (Inst1 && Inst2) {
7881 if (Inst1->getParent() != Inst2->getParent())
7882 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7883 return Inst1->comesBefore(Inst2);
7884 }
7885 if (!P1 && P2)
7886 return false;
7887 assert(P1 && P2 &&
7888 "Expected either instructions or arguments vector operands.");
7889 return P1->getArgNo() < P2->getArgNo();
7890 }
7891 return false;
7892 };
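// E.g. if %phi1 feeds "insertelement ... i32 0" and %phi2 feeds
// "insertelement ... i32 1" of the same buildvector chain (equal
// UserBVHead), %phi1 sorts first, so the vectorized PHI lanes match the
// buildvector lane order.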
7893 OrdersType Phis(TE.Scalars.size());
7894 std::iota(Phis.begin(), Phis.end(), 0);
7895 stable_sort(Phis, PHICompare);
7896 if (isIdentityOrder(Phis))
7897 return std::nullopt; // No need to reorder.
7898 return std::move(Phis);
7899 }
7900 if (TE.isGather() &&
7901 (!TE.hasState() || !TE.isAltShuffle() ||
7902 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7903 allSameType(TE.Scalars)) {
7904 // TODO: add analysis of other gather nodes with extractelement
7905 // instructions and other values/instructions, not only undefs.
7906 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7907 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7908 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7909 all_of(TE.Scalars, [](Value *V) {
7910 auto *EE = dyn_cast<ExtractElementInst>(V);
7911 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7912 })) {
7913 // Check that gather of extractelements can be represented as
7914 // just a shuffle of a single vector.
7915 OrdersType CurrentOrder;
7916 bool Reuse =
7917 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7918 if (Reuse || !CurrentOrder.empty())
7919 return std::move(CurrentOrder);
7920 }
7921 // If the gather node is <undef, v, .., poison> and
7922 // insertelement poison, v, 0 [+ permute]
7923 // is cheaper than
7924 // insertelement poison, v, n - try to reorder.
7925 // If rotating the whole graph, exclude the permute cost, the whole graph
7926 // might be transformed.
7927 int Sz = TE.Scalars.size();
7928 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7929 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7930 const auto *It = find_if_not(TE.Scalars, isConstant);
7931 if (It == TE.Scalars.begin())
7932 return OrdersType();
7933 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7934 if (It != TE.Scalars.end()) {
7935 OrdersType Order(Sz, Sz);
7936 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7937 Order[Idx] = 0;
7938 fixupOrderingIndices(Order);
7939 SmallVector<int> Mask;
7940 inversePermutation(Order, Mask);
7941 InstructionCost PermuteCost =
7942 TopToBottom
7943 ? 0
7944 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7945 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7946 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7947 PoisonValue::get(Ty), *It);
7948 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7949 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7950 PoisonValue::get(Ty), *It);
7951 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7952 OrdersType Order(Sz, Sz);
7953 Order[Idx] = 0;
7954 return std::move(Order);
7955 }
7956 }
7957 }
7958 if (isSplat(TE.Scalars))
7959 return std::nullopt;
7960 if (TE.Scalars.size() >= 3)
7961 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7962 return Order;
7963 // Check if we can include the order of vectorized loads. For masked
7964 // gathers, do extra analysis later, so include such nodes into a special list.
7965 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7966 SmallVector<Value *> PointerOps;
7967 StridedPtrInfo SPtrInfo;
7968 OrdersType CurrentOrder;
7969 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7970 CurrentOrder, PointerOps, SPtrInfo);
7971 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7972 Res == LoadsState::CompressVectorize)
7973 return std::move(CurrentOrder);
7974 }
7975 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7976 // has been audited for correctness with non-power-of-two vectors.
7977 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7978 if (std::optional<OrdersType> CurrentOrder =
7979 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7980 return CurrentOrder;
7981 }
7982 return std::nullopt;
7983}
7984
7985/// Checks if the given mask is a "clustered" mask with the same clusters of
7986/// size \p Sz, which are not identity submasks.
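/// E.g. with \p Sz == 2 the mask {1, 0, 1, 0} consists of the repeated
/// non-identity cluster {1, 0} (returns true), while {0, 1, 0, 1}
/// (identity first cluster) and {1, 0, 0, 1} (clusters differ) do not.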
7987 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7988 unsigned Sz) {
7989 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7990 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7991 return false;
7992 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7993 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7994 if (Cluster != FirstCluster)
7995 return false;
7996 }
7997 return true;
7998}
7999
8000void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8001 // Reorder reuses mask.
8002 reorderReuses(TE.ReuseShuffleIndices, Mask);
8003 const unsigned Sz = TE.Scalars.size();
8004 // For vectorized nodes and non-clustered reuses, no need to do anything else.
8005 if (!TE.isGather() ||
8006 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8007 Sz) ||
8008 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8009 return;
8010 SmallVector<int> NewMask;
8011 inversePermutation(TE.ReorderIndices, NewMask);
8012 addMask(NewMask, TE.ReuseShuffleIndices);
8013 // Clear reorder since it is going to be applied to the new mask.
8014 TE.ReorderIndices.clear();
8015 // Try to improve gathered nodes with clustered reuses, if possible.
8016 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8017 SmallVector<unsigned> NewOrder(Slice);
8018 inversePermutation(NewOrder, NewMask);
8019 reorderScalars(TE.Scalars, NewMask);
8020 // Fill the reuses mask with the identity submasks.
8021 for (auto *It = TE.ReuseShuffleIndices.begin(),
8022 *End = TE.ReuseShuffleIndices.end();
8023 It != End; std::advance(It, Sz))
8024 std::iota(It, std::next(It, Sz), 0);
8025}
8026
8027 static void combineOrders(MutableArrayRef<unsigned> Order,
8028 ArrayRef<unsigned> SecondaryOrder) {
8029 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8030 "Expected same size of orders");
8031 size_t Sz = Order.size();
8032 SmallBitVector UsedIndices(Sz);
8033 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8034 if (Order[Idx] != Sz)
8035 UsedIndices.set(Order[Idx]);
8036 }
8037 if (SecondaryOrder.empty()) {
8038 for (unsigned Idx : seq<unsigned>(0, Sz))
8039 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8040 Order[Idx] = Idx;
8041 } else {
8042 for (unsigned Idx : seq<unsigned>(0, Sz))
8043 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8044 !UsedIndices.test(SecondaryOrder[Idx]))
8045 Order[Idx] = SecondaryOrder[Idx];
8046 }
8047}
8048
8051 return false;
8052
8053 constexpr unsigned TinyVF = 2;
8054 constexpr unsigned TinyTree = 10;
8055 constexpr unsigned PhiOpsLimit = 12;
8056 constexpr unsigned GatherLoadsLimit = 2;
8057 if (VectorizableTree.size() <= TinyTree)
8058 return true;
8059 if (VectorizableTree.front()->hasState() &&
8060 !VectorizableTree.front()->isGather() &&
8061 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8062 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8063 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8064 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8065 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8066 VectorizableTree.front()->ReorderIndices.empty()) {
8067 // Check if the tree has only a single store and a single (unordered) load
8068 // node, while the other nodes are phis or geps/binops combined with phis,
8069 // and/or a single gather load node.
8070 if (VectorizableTree.front()->hasState() &&
8071 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8072 VectorizableTree.front()->Scalars.size() == TinyVF &&
8073 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8074 return false;
8075 // Skip a single node which requires reordering.
8076 if (VectorizableTree.front()->hasState() &&
8077 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8078 VectorizableTree.front()->ReorderIndices.empty()) {
8079 const unsigned ReorderedSplitsCnt =
8080 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8081 return TE->State == TreeEntry::SplitVectorize &&
8082 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8083 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8084 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8085 });
8086 if (ReorderedSplitsCnt <= 1 &&
8087 static_cast<unsigned>(count_if(
8088 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8089 return ((!TE->isGather() &&
8090 (TE->ReorderIndices.empty() ||
8091 (TE->UserTreeIndex.UserTE &&
8092 TE->UserTreeIndex.UserTE->State ==
8093 TreeEntry::Vectorize &&
8094 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8095 .empty()))) ||
8096 (TE->isGather() && TE->ReorderIndices.empty() &&
8097 (!TE->hasState() || TE->isAltShuffle() ||
8098 TE->getOpcode() == Instruction::Load ||
8099 TE->getOpcode() == Instruction::ZExt ||
8100 TE->getOpcode() == Instruction::SExt))) &&
8101 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8102 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8103 return !isConstant(V) && isVectorized(V);
8104 }));
8105 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8106 return false;
8107 }
8108 bool HasPhis = false;
8109 bool HasLoad = true;
8110 unsigned GatherLoads = 0;
8111 for (const std::unique_ptr<TreeEntry> &TE :
8112 ArrayRef(VectorizableTree).drop_front()) {
8113 if (TE->State == TreeEntry::SplitVectorize)
8114 continue;
8115 if (!TE->hasState()) {
8116 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8118 continue;
8119 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8121 continue;
8122 return true;
8123 }
8124 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8125 if (!TE->isGather()) {
8126 HasLoad = false;
8127 continue;
8128 }
8129 if (HasLoad)
8130 return true;
8131 ++GatherLoads;
8132 if (GatherLoads >= GatherLoadsLimit)
8133 return true;
8134 }
8135 if (TE->getOpcode() == Instruction::GetElementPtr ||
8136 Instruction::isBinaryOp(TE->getOpcode()))
8137 continue;
8138 if (TE->getOpcode() != Instruction::PHI &&
8139 (!TE->hasCopyableElements() ||
8140 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8141 TE->Scalars.size() / 2))
8142 return true;
8143 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8144 TE->getNumOperands() > PhiOpsLimit)
8145 return false;
8146 HasPhis = true;
8147 }
8148 return !HasPhis;
8149 }
8150 return true;
8151}
8152
8153void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8154 ArrayRef<int> MaskOrder) {
8155 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8156 SmallVector<int> NewMask(getVectorFactor());
8157 SmallVector<int> NewMaskOrder(getVectorFactor());
8158 std::iota(NewMask.begin(), NewMask.end(), 0);
8159 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8160 if (Idx == 0) {
8161 copy(Mask, NewMask.begin());
8162 copy(MaskOrder, NewMaskOrder.begin());
8163 } else {
8164 assert(Idx == 1 && "Expected either 0 or 1 index.");
8165 unsigned Offset = CombinedEntriesWithIndices.back().second;
8166 for (unsigned I : seq<unsigned>(Mask.size())) {
8167 NewMask[I + Offset] = Mask[I] + Offset;
8168 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8169 }
8170 }
8171 reorderScalars(Scalars, NewMask);
8172 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8173 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8174 ReorderIndices.clear();
8175}
8176
8177 void BoUpSLP::reorderTopToBottom() {
8178 // Maps VF to the graph nodes.
8179 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8180 // ExtractElement gather nodes which can be vectorized and need to handle
8181 // their ordering.
8182 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8183
8184 // Phi nodes can have preferred ordering based on their result users
8185 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8186
8187 // AltShuffles can also have a preferred ordering that leads to fewer
8188 // instructions, e.g., the addsub instruction in x86.
8189 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8190
8191 // Maps a TreeEntry to the reorder indices of external users.
8192 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8193 ExternalUserReorderMap;
8194 // Find all reorderable nodes with the given VF.
8195 // Currently these are vectorized stores, loads, extracts + some gathering
8196 // of extracts.
8197 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8198 const std::unique_ptr<TreeEntry> &TE) {
8199 // Look for external users that will probably be vectorized.
8200 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8201 findExternalStoreUsersReorderIndices(TE.get());
8202 if (!ExternalUserReorderIndices.empty()) {
8203 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8204 ExternalUserReorderMap.try_emplace(TE.get(),
8205 std::move(ExternalUserReorderIndices));
8206 }
8207
8208 // Patterns like [fadd,fsub] can be combined into a single instruction in
8209 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8210 // to take into account their order when looking for the most used order.
8211 if (TE->hasState() && TE->isAltShuffle() &&
8212 TE->State != TreeEntry::SplitVectorize) {
8213 Type *ScalarTy = TE->Scalars[0]->getType();
8214 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8215 unsigned Opcode0 = TE->getOpcode();
8216 unsigned Opcode1 = TE->getAltOpcode();
8217 SmallBitVector OpcodeMask(
8218 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8219 // If this pattern is supported by the target then we consider the order.
8220 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8221 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8222 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8223 }
8224 // TODO: Check the reverse order too.
8225 }
8226
8227 bool IgnoreReorder =
8228 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8229 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8230 VectorizableTree.front()->getOpcode() == Instruction::Store);
8231 if (std::optional<OrdersType> CurrentOrder =
8232 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8233 // Do not include ordering for nodes used in the alt opcode vectorization,
8234 // better to reorder them during bottom-to-top stage. If we followed the
8235 // order here, it would cause reordering of the whole graph, though it is
8236 // actually profitable just to reorder the subgraph that starts from the
8237 // alternate opcode vectorization node. Such nodes already end up with a
8238 // shuffle instruction and it is enough to change this shuffle rather than
8239 // rotate the scalars for the whole graph.
8240 unsigned Cnt = 0;
8241 const TreeEntry *UserTE = TE.get();
8242 while (UserTE && Cnt < RecursionMaxDepth) {
8243 if (!UserTE->UserTreeIndex)
8244 break;
8245 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8246 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8247 UserTE->UserTreeIndex.UserTE->Idx != 0)
8248 return;
8249 UserTE = UserTE->UserTreeIndex.UserTE;
8250 ++Cnt;
8251 }
8252 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8253 if (!(TE->State == TreeEntry::Vectorize ||
8254 TE->State == TreeEntry::StridedVectorize ||
8255 TE->State == TreeEntry::SplitVectorize ||
8256 TE->State == TreeEntry::CompressVectorize) ||
8257 !TE->ReuseShuffleIndices.empty())
8258 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8259 if (TE->State == TreeEntry::Vectorize &&
8260 TE->getOpcode() == Instruction::PHI)
8261 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8262 }
8263 });
8264
8265 // Reorder the graph nodes according to their vectorization factor.
8266 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8267 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8268 auto It = VFToOrderedEntries.find(VF);
8269 if (It == VFToOrderedEntries.end())
8270 continue;
8271 // Try to find the most profitable order. We are just looking for the
8272 // most used order and reorder the scalar elements in the nodes according
8273 // to this most used order.
8274 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8275 // Delete VF entry upon exit.
8276 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8277
8278 // All operands are reordered and used only in this node - propagate the
8279 // most used order to the user node.
8280 MapVector<OrdersType, unsigned,
8281 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8282 OrdersUses;
8283 for (const TreeEntry *OpTE : OrderedEntries) {
8284 // No need to reorder these nodes; we still need to extend and to use a
8285 // shuffle, just merging the reordering shuffle and the reuse shuffle.
8286 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8287 OpTE->State != TreeEntry::SplitVectorize)
8288 continue;
8289 // Count number of orders uses.
8290 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8291 &PhisToOrders]() -> const OrdersType & {
8292 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8293 auto It = GathersToOrders.find(OpTE);
8294 if (It != GathersToOrders.end())
8295 return It->second;
8296 }
8297 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8298 auto It = AltShufflesToOrders.find(OpTE);
8299 if (It != AltShufflesToOrders.end())
8300 return It->second;
8301 }
8302 if (OpTE->State == TreeEntry::Vectorize &&
8303 OpTE->getOpcode() == Instruction::PHI) {
8304 auto It = PhisToOrders.find(OpTE);
8305 if (It != PhisToOrders.end())
8306 return It->second;
8307 }
8308 return OpTE->ReorderIndices;
8309 }();
8310 // First consider the order of the external scalar users.
8311 auto It = ExternalUserReorderMap.find(OpTE);
8312 if (It != ExternalUserReorderMap.end()) {
8313 const auto &ExternalUserReorderIndices = It->second;
8314 // If the OpTE vector factor != number of scalars - use natural order,
8315 // it is an attempt to reorder node with reused scalars but with
8316 // external uses.
8317 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8318 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8319 ExternalUserReorderIndices.size();
8320 } else {
8321 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8322 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8323 }
8324 // No other useful reorder data in this entry.
8325 if (Order.empty())
8326 continue;
8327 }
8328 // Stores actually store the mask, not the order, need to invert.
8329 if (OpTE->State == TreeEntry::Vectorize &&
8330 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8331 assert(!OpTE->isAltShuffle() &&
8332 "Alternate instructions are only supported by BinaryOperator "
8333 "and CastInst.");
8334 SmallVector<int> Mask;
8335 inversePermutation(Order, Mask);
8336 unsigned E = Order.size();
8337 OrdersType CurrentOrder(E, E);
8338 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8339 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8340 });
8341 fixupOrderingIndices(CurrentOrder);
8342 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8343 } else {
8344 ++OrdersUses.try_emplace(Order, 0).first->second;
8345 }
8346 }
8347 if (OrdersUses.empty())
8348 continue;
8349 // Choose the most used order.
8350 unsigned IdentityCnt = 0;
8351 unsigned FilledIdentityCnt = 0;
8352 OrdersType IdentityOrder(VF, VF);
8353 for (auto &Pair : OrdersUses) {
8354 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8355 if (!Pair.first.empty())
8356 FilledIdentityCnt += Pair.second;
8357 IdentityCnt += Pair.second;
8358 combineOrders(IdentityOrder, Pair.first);
8359 }
8360 }
8361 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8362 unsigned Cnt = IdentityCnt;
8363 for (auto &Pair : OrdersUses) {
8364 // Prefer the identity order. But if a filled identity order (non-empty)
8365 // was found with the same number of uses as the new candidate order, we
8366 // can choose this candidate order instead.
8367 if (Cnt < Pair.second ||
8368 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8369 Cnt == Pair.second && !BestOrder.empty() &&
8370 isIdentityOrder(BestOrder))) {
8371 combineOrders(Pair.first, BestOrder);
8372 BestOrder = Pair.first;
8373 Cnt = Pair.second;
8374 } else {
8375 combineOrders(BestOrder, Pair.first);
8376 }
8377 }
8378 // Set order of the user node.
8379 if (isIdentityOrder(BestOrder))
8380 continue;
8381 fixupOrderingIndices(BestOrder);
8382 SmallVector<int> Mask;
8383 inversePermutation(BestOrder, Mask);
8384 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8385 unsigned E = BestOrder.size();
8386 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8387 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8388 });
8389 // Do an actual reordering, if profitable.
8390 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8391 // Just do the reordering for the nodes with the given VF.
8392 if (TE->Scalars.size() != VF) {
8393 if (TE->ReuseShuffleIndices.size() == VF) {
8394 assert(TE->State != TreeEntry::SplitVectorize &&
8395 "Split vectorized not expected.");
8396 // Need to reorder the reuses masks of the operands with smaller VF to
8397 // be able to find the match between the graph nodes and scalar
8398 // operands of the given node during vectorization/cost estimation.
8399 assert(
8400 (!TE->UserTreeIndex ||
8401 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8402 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8403 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8404 "All users must be of VF size.");
8405 if (SLPReVec) {
8406 assert(SLPReVec && "Only supported by REVEC.");
8407 // ShuffleVectorInst does not do reorderOperands (and it should not
8408 // because ShuffleVectorInst supports only a limited set of
8409 // patterns). Only do reorderNodeWithReuses if the user is not
8410 // ShuffleVectorInst.
8411 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8412 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8413 continue;
8414 }
8415 // Update ordering of the operands with the smaller VF than the given
8416 // one.
8417 reorderNodeWithReuses(*TE, Mask);
8418 // Update orders in user split vectorize nodes.
8419 if (TE->UserTreeIndex &&
8420 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8421 TE->UserTreeIndex.UserTE->reorderSplitNode(
8422 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8423 }
8424 continue;
8425 }
8426 if ((TE->State == TreeEntry::SplitVectorize &&
8427 TE->ReuseShuffleIndices.empty()) ||
8428 ((TE->State == TreeEntry::Vectorize ||
8429 TE->State == TreeEntry::StridedVectorize ||
8430 TE->State == TreeEntry::CompressVectorize) &&
8431 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8432 InsertElementInst>(TE->getMainOp()) ||
8433 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8434 assert(
8435 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8436 TE->ReuseShuffleIndices.empty())) &&
8437 "Alternate instructions are only supported by BinaryOperator "
8438 "and CastInst.");
8439 // Build correct orders for extract{element,value}, loads,
8440 // stores and alternate (split) nodes.
8441 reorderOrder(TE->ReorderIndices, Mask);
8442 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8443 TE->reorderOperands(Mask);
8444 } else {
8445 // Reorder the node and its operands.
8446 TE->reorderOperands(Mask);
8447 assert(TE->ReorderIndices.empty() &&
8448 "Expected empty reorder sequence.");
8449 reorderScalars(TE->Scalars, Mask);
8450 }
8451 if (!TE->ReuseShuffleIndices.empty()) {
8452 // Apply reversed order to keep the original ordering of the reused
8453 // elements to avoid extra reorder indices shuffling.
8454 OrdersType CurrentOrder;
8455 reorderOrder(CurrentOrder, MaskOrder);
8456 SmallVector<int> NewReuses;
8457 inversePermutation(CurrentOrder, NewReuses);
8458 addMask(NewReuses, TE->ReuseShuffleIndices);
8459 TE->ReuseShuffleIndices.swap(NewReuses);
8460 } else if (TE->UserTreeIndex &&
8461 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8462 // Update orders in user split vectorize nodes.
8463 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8464 Mask, MaskOrder);
8465 }
8466 }
8467}
8468
8469void BoUpSLP::buildReorderableOperands(
8470 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8471 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8472 SmallVectorImpl<TreeEntry *> &GatherOps) {
8473 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8474 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8475 return OpData.first == I &&
8476 (OpData.second->State == TreeEntry::Vectorize ||
8477 OpData.second->State == TreeEntry::StridedVectorize ||
8478 OpData.second->State == TreeEntry::CompressVectorize ||
8479 OpData.second->State == TreeEntry::SplitVectorize);
8480 }))
8481 continue;
8482 // Do not request operands, if they do not exist.
8483 if (UserTE->hasState()) {
8484 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8485 UserTE->getOpcode() == Instruction::ExtractValue)
8486 continue;
8487 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8488 continue;
8489 if (UserTE->getOpcode() == Instruction::Store &&
8490 UserTE->State == TreeEntry::Vectorize && I == 1)
8491 continue;
8492 if (UserTE->getOpcode() == Instruction::Load &&
8493 (UserTE->State == TreeEntry::Vectorize ||
8494 UserTE->State == TreeEntry::StridedVectorize ||
8495 UserTE->State == TreeEntry::CompressVectorize))
8496 continue;
8497 }
8498 TreeEntry *TE = getOperandEntry(UserTE, I);
8499 assert(TE && "Expected operand entry.");
8500 if (!TE->isGather()) {
8501 // Add the node to the list of the ordered nodes with the identity
8502 // order.
8503 Edges.emplace_back(I, TE);
8504 // Add ScatterVectorize nodes to the list of operands, where just
8505 // reordering of the scalars is required. Similar to the gathers, so
8506 // simply add to the list of gathered ops.
8507 // If there are reused scalars, process this node as a regular vectorize
8508 // node, just reorder reuses mask.
8509 if (TE->State == TreeEntry::ScatterVectorize &&
8510 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8511 GatherOps.push_back(TE);
8512 continue;
8513 }
8514 if (ReorderableGathers.contains(TE))
8515 GatherOps.push_back(TE);
8516 }
8517}
8518
8519void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
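// The comparator below makes the (max-based) priority queue pop entries
// whose user node has the largest Idx first, i.e. operands deepest in the
// graph are visited before their users, matching the bottom-to-top
// direction of this pass.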
8520 struct TreeEntryCompare {
8521 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8522 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8523 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8524 return LHS->Idx < RHS->Idx;
8525 }
8526 };
8527 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8528 DenseSet<const TreeEntry *> GathersToOrders;
8529 // Find all reorderable leaf nodes with the given VF.
8530 // Currently these are vectorized loads, extracts without alternate
8531 // operands + some gathering of extracts.
8532 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8533 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8534 if (TE->State != TreeEntry::Vectorize &&
8535 TE->State != TreeEntry::StridedVectorize &&
8536 TE->State != TreeEntry::CompressVectorize &&
8537 TE->State != TreeEntry::SplitVectorize)
8538 NonVectorized.insert(TE.get());
8539 if (std::optional<OrdersType> CurrentOrder =
8540 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8541 Queue.push(TE.get());
8542 if (!(TE->State == TreeEntry::Vectorize ||
8543 TE->State == TreeEntry::StridedVectorize ||
8544 TE->State == TreeEntry::CompressVectorize ||
8545 TE->State == TreeEntry::SplitVectorize) ||
8546 !TE->ReuseShuffleIndices.empty())
8547 GathersToOrders.insert(TE.get());
8548 }
8549 }
8550
8551 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8552 // I.e., if the node has operands that are reordered, try to keep at least
8553 // one operand in the natural order and reorder the others + reorder the
8554 // user node itself.
8555 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8556 while (!Queue.empty()) {
8557 // 1. Filter out only reordered nodes.
8558 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8559 TreeEntry *TE = Queue.top();
8560 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8561 Queue.pop();
8562 SmallVector<TreeEntry *> OrderedOps(1, TE);
8563 while (!Queue.empty()) {
8564 TE = Queue.top();
8565 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8566 break;
8567 Queue.pop();
8568 OrderedOps.push_back(TE);
8569 }
8570 for (TreeEntry *TE : OrderedOps) {
8571 if (!(TE->State == TreeEntry::Vectorize ||
8572 TE->State == TreeEntry::StridedVectorize ||
8573 TE->State == TreeEntry::CompressVectorize ||
8574 TE->State == TreeEntry::SplitVectorize ||
8575 (TE->isGather() && GathersToOrders.contains(TE))) ||
8576 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8577 !Visited.insert(TE).second)
8578 continue;
8579 // Build a map between user nodes and their operand orders to speed up
8580 // the search. The graph currently does not provide this dependency directly.
8581 Users.first = TE->UserTreeIndex.UserTE;
8582 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8583 }
8584 if (Users.first) {
8585 auto &Data = Users;
8586 if (Data.first->State == TreeEntry::SplitVectorize) {
8587 assert(
8588 Data.second.size() <= 2 &&
8589 "Expected not greater than 2 operands for split vectorize node.");
8590 if (any_of(Data.second,
8591 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8592 continue;
8593 // Update orders in user split vectorize nodes.
8594 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8595 "Expected exactly 2 entries.");
8596 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8597 TreeEntry &OpTE = *VectorizableTree[P.first];
8598 OrdersType Order = OpTE.ReorderIndices;
8599 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8600 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8601 continue;
8602 const auto BestOrder =
8603 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8604 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8605 continue;
8606 Order = *BestOrder;
8607 }
8608 fixupOrderingIndices(Order);
8609 SmallVector<int> Mask;
8610 inversePermutation(Order, Mask);
8611 const unsigned E = Order.size();
8612 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8613 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8614 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8615 });
8616 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8617 // Clear ordering of the operand.
8618 if (!OpTE.ReorderIndices.empty()) {
8619 OpTE.ReorderIndices.clear();
8620 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8621 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8622 } else {
8623 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8624 reorderScalars(OpTE.Scalars, Mask);
8625 }
8626 }
8627 if (Data.first->ReuseShuffleIndices.empty() &&
8628 !Data.first->ReorderIndices.empty()) {
8629 // Insert user node to the list to try to sink reordering deeper in
8630 // the graph.
8631 Queue.push(Data.first);
8632 }
8633 continue;
8634 }
8635 // Check that operands are used only in the User node.
8636 SmallVector<TreeEntry *> GatherOps;
8637 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8638 GatherOps);
8639 // All operands are reordered and used only in this node - propagate the
8640 // most used order to the user node.
8641 MapVector<OrdersType, unsigned,
8642 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8643 OrdersUses;
8644 // Do the analysis for each tree entry only once, otherwise the order of
8645 // the same node may be considered several times, though it might not be
8646 // profitable.
8647 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8648 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8649 for (const auto &Op : Data.second) {
8650 TreeEntry *OpTE = Op.second;
8651 if (!VisitedOps.insert(OpTE).second)
8652 continue;
8653 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8654 continue;
8655 const auto Order = [&]() -> const OrdersType {
8656 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8657 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8658 IgnoreReorder)
8659 .value_or(OrdersType(1));
8660 return OpTE->ReorderIndices;
8661 }();
8662 // The order is partially ordered, skip it in favor of fully non-ordered
8663 // orders.
8664 if (Order.size() == 1)
8665 continue;
8666
8667 // Check that the reordering does not increase the number of shuffles,
8668 // i.e. same-values nodes have the same parents or their parents have the same parents.
8669 if (!Order.empty() && !isIdentityOrder(Order)) {
8670 Value *Root = OpTE->hasState()
8671 ? OpTE->getMainOp()
8672 : *find_if_not(OpTE->Scalars, isConstant);
8673 auto GetSameNodesUsers = [&](Value *Root) {
8674 SmallSetVector<TreeEntry *, 4> Res;
8675 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8676 if (TE != OpTE && TE->UserTreeIndex &&
8677 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8678 TE->Scalars.size() == OpTE->Scalars.size() &&
8679 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8680 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8681 Res.insert(TE->UserTreeIndex.UserTE);
8682 }
8683 for (const TreeEntry *TE : getTreeEntries(Root)) {
8684 if (TE != OpTE && TE->UserTreeIndex &&
8685 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8686 TE->Scalars.size() == OpTE->Scalars.size() &&
8687 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8688 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8689 Res.insert(TE->UserTreeIndex.UserTE);
8690 }
8691 return Res.takeVector();
8692 };
8693 auto GetNumOperands = [](const TreeEntry *TE) {
8694 if (TE->State == TreeEntry::SplitVectorize)
8695 return TE->getNumOperands();
8696 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8697 return CI->arg_size();
8698 return TE->getNumOperands();
8699 };
8700 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8701 const TreeEntry *TE) {
8702 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8703 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8704 ID = getVectorIntrinsicIDForCall(CI, TLI);
8705 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8706 if (ID != Intrinsic::not_intrinsic &&
8707 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8708 continue;
8709 const TreeEntry *Op = getOperandEntry(TE, Idx);
8710 if (Op->isGather() && Op->hasState()) {
8711 const TreeEntry *VecOp =
8712 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8713 if (VecOp)
8714 Op = VecOp;
8715 }
8716 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8717 return false;
8718 }
8719 return true;
8720 };
8721 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8722 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8723 if (!RevisitedOps.insert(UTE).second)
8724 return false;
8725 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8726 !UTE->ReuseShuffleIndices.empty() ||
8727 (UTE->UserTreeIndex &&
8728 UTE->UserTreeIndex.UserTE == Data.first) ||
8729 (Data.first->UserTreeIndex &&
8730 Data.first->UserTreeIndex.UserTE == UTE) ||
8731 (IgnoreReorder && UTE->UserTreeIndex &&
8732 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8733 NodeShouldBeReorderedWithOperands(UTE);
8734 }))
8735 continue;
8736 for (TreeEntry *UTE : Users) {
8737 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8738 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8739 ID = getVectorIntrinsicIDForCall(CI, TLI);
8740 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8741 if (ID != Intrinsic::not_intrinsic &&
8742 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8743 continue;
8744 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8745 Visited.erase(Op);
8746 Queue.push(const_cast<TreeEntry *>(Op));
8747 }
8748 }
8749 }
8750 unsigned NumOps = count_if(
8751 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8752 return P.second == OpTE;
8753 });
8754 // Stores actually store the mask, not the order, need to invert.
8755 if (OpTE->State == TreeEntry::Vectorize &&
8756 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8757 assert(!OpTE->isAltShuffle() &&
8758 "Alternate instructions are only supported by BinaryOperator "
8759 "and CastInst.");
8760 SmallVector<int> Mask;
8761 inversePermutation(Order, Mask);
8762 unsigned E = Order.size();
8763 OrdersType CurrentOrder(E, E);
8764 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8765 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8766 });
8767 fixupOrderingIndices(CurrentOrder);
8768 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8769 } else {
8770 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8771 }
8772 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8773 const auto AllowsReordering = [&](const TreeEntry *TE) {
8774 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8775 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8776 (IgnoreReorder && TE->Idx == 0))
8777 return true;
8778 if (TE->isGather()) {
8779 if (GathersToOrders.contains(TE))
8780 return !getReorderingData(*TE, /*TopToBottom=*/false,
8781 IgnoreReorder)
8782 .value_or(OrdersType(1))
8783 .empty();
8784 return true;
8785 }
8786 return false;
8787 };
8788 if (OpTE->UserTreeIndex) {
8789 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8790 if (!VisitedUsers.insert(UserTE).second)
8791 continue;
8792 // May reorder user node if it requires reordering, has reused
8793 // scalars, is an alternate op vectorize node or its op nodes require
8794 // reordering.
8795 if (AllowsReordering(UserTE))
8796 continue;
8797 // Check if users allow reordering.
8798 // Currently look up just 1 level of operands to avoid an increase in
8799 // compile time.
8800 // It is profitable to reorder if definitely more operands allow
8801 // reordering than those that keep the natural order.
8802 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
8803 if (static_cast<unsigned>(count_if(
8804 Ops, [UserTE, &AllowsReordering](
8805 const std::pair<unsigned, TreeEntry *> &Op) {
8806 return AllowsReordering(Op.second) &&
8807 Op.second->UserTreeIndex.UserTE == UserTE;
8808 })) <= Ops.size() / 2)
8809 ++Res.first->second;
8810 }
8811 }
8812 if (OrdersUses.empty()) {
8813 Visited.insert_range(llvm::make_second_range(Data.second));
8814 continue;
8815 }
8816 // Choose the most used order.
8817 unsigned IdentityCnt = 0;
8818 unsigned VF = Data.second.front().second->getVectorFactor();
8819 OrdersType IdentityOrder(VF, VF);
8820 for (auto &Pair : OrdersUses) {
8821 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8822 IdentityCnt += Pair.second;
8823 combineOrders(IdentityOrder, Pair.first);
8824 }
8825 }
8826 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8827 unsigned Cnt = IdentityCnt;
8828 for (auto &Pair : OrdersUses) {
8829 // Prefer the identity order. But if a filled identity order (non-empty)
8830 // was found with the same number of uses as the new candidate order, we
8831 // can choose this candidate order instead.
8832 if (Cnt < Pair.second) {
8833 combineOrders(Pair.first, BestOrder);
8834 BestOrder = Pair.first;
8835 Cnt = Pair.second;
8836 } else {
8837 combineOrders(BestOrder, Pair.first);
8838 }
8839 }
8840 // Set order of the user node.
8841 if (isIdentityOrder(BestOrder)) {
8842 Visited.insert_range(llvm::make_second_range(Data.second));
8843 continue;
8844 }
8845 fixupOrderingIndices(BestOrder);
8846 // Erase operands from OrderedEntries list and adjust their orders.
8847 VisitedOps.clear();
8848 SmallVector<int> Mask;
8849 inversePermutation(BestOrder, Mask);
8850 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8851 unsigned E = BestOrder.size();
8852 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8853 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8854 });
8855 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8856 TreeEntry *TE = Op.second;
8857 if (!VisitedOps.insert(TE).second)
8858 continue;
8859 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8860 reorderNodeWithReuses(*TE, Mask);
8861 continue;
8862 }
8863 // Gathers are processed separately.
8864 if (TE->State != TreeEntry::Vectorize &&
8865 TE->State != TreeEntry::StridedVectorize &&
8866 TE->State != TreeEntry::CompressVectorize &&
8867 TE->State != TreeEntry::SplitVectorize &&
8868 (TE->State != TreeEntry::ScatterVectorize ||
8869 TE->ReorderIndices.empty()))
8870 continue;
8871 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8872 TE->ReorderIndices.empty()) &&
8873 "Non-matching sizes of user/operand entries.");
8874 reorderOrder(TE->ReorderIndices, Mask);
8875 if (IgnoreReorder && TE == VectorizableTree.front().get())
8876 IgnoreReorder = false;
8877 }
8878 // For gathers just need to reorder its scalars.
8879 for (TreeEntry *Gather : GatherOps) {
8880 assert(Gather->ReorderIndices.empty() &&
8881 "Unexpected reordering of gathers.");
8882 if (!Gather->ReuseShuffleIndices.empty()) {
8883 // Just reorder reuses indices.
8884 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8885 continue;
8886 }
8887 reorderScalars(Gather->Scalars, Mask);
8888 Visited.insert(Gather);
8889 }
8890 // Reorder operands of the user node and set the ordering for the user
8891 // node itself.
8892 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8893 return TE.isAltShuffle() &&
8894 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8895 TE.ReorderIndices.empty());
8896 };
8897 if (Data.first->State != TreeEntry::Vectorize ||
8898 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8899 Data.first->getMainOp()) ||
8900 IsNotProfitableAltCodeNode(*Data.first))
8901 Data.first->reorderOperands(Mask);
8902 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8903 IsNotProfitableAltCodeNode(*Data.first) ||
8904 Data.first->State == TreeEntry::StridedVectorize ||
8905 Data.first->State == TreeEntry::CompressVectorize) {
8906 reorderScalars(Data.first->Scalars, Mask);
8907 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8908 /*BottomOrder=*/true);
8909 if (Data.first->ReuseShuffleIndices.empty() &&
8910 !Data.first->ReorderIndices.empty() &&
8911 !IsNotProfitableAltCodeNode(*Data.first)) {
8912 // Insert user node to the list to try to sink reordering deeper in
8913 // the graph.
8914 Queue.push(Data.first);
8915 }
8916 } else {
8917 reorderOrder(Data.first->ReorderIndices, Mask);
8918 }
8919 }
8920 }
8921 // If the reordering is unnecessary, just remove the reorder.
8922 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8923 VectorizableTree.front()->ReuseShuffleIndices.empty())
8924 VectorizableTree.front()->ReorderIndices.clear();
8925}
8926
8927Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8928 if (Entry.hasState() &&
8929 (Entry.getOpcode() == Instruction::Store ||
8930 Entry.getOpcode() == Instruction::Load) &&
8931 Entry.State == TreeEntry::StridedVectorize &&
8932 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8933 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8934 return dyn_cast<Instruction>(Entry.Scalars.front());
8935}
8936
8937 void BoUpSLP::buildExternalUses(
8938 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8939 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
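// A scalar with at least NumVectScalars uses must have a user outside the
// tree (the tree provides at most ScalarToTreeEntries.size() in-tree
// users), so such scalars are conservatively recorded as externally used
// below.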
8940 DenseMap<Value *, unsigned> ScalarToExtUses;
8941 // Collect the values that we need to extract from the tree.
8942 for (auto &TEPtr : VectorizableTree) {
8943 TreeEntry *Entry = TEPtr.get();
8944
8945 // No need to handle users of gathered values.
8946 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8947 continue;
8948
8949 // For each lane:
8950 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8951 Value *Scalar = Entry->Scalars[Lane];
8952 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8953 continue;
8954
8955 // All uses must be replaced already? No need to do it again.
8956 auto It = ScalarToExtUses.find(Scalar);
8957 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8958 continue;
8959
8960 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8961 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8962 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8963 << " from " << *Scalar << " for many users.\n");
8964 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8965 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8966 ExternalUsesWithNonUsers.insert(Scalar);
8967 continue;
8968 }
8969
8970 // Check if the scalar is externally used as an extra arg.
8971 const auto ExtI = ExternallyUsedValues.find(Scalar);
8972 if (ExtI != ExternallyUsedValues.end()) {
8973 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8974 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8975 << FoundLane << " from " << *Scalar << ".\n");
8976 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8977 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8978 continue;
8979 }
8980 for (User *U : Scalar->users()) {
8981 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8982
8983 Instruction *UserInst = dyn_cast<Instruction>(U);
8984 if (!UserInst || isDeleted(UserInst))
8985 continue;
8986
8987 // Ignore users in the user ignore list.
8988 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8989 continue;
8990
8991 // Skip in-tree scalars that become vectors
8992 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8993 !UseEntries.empty()) {
8994 // Some in-tree scalars will remain as scalar in vectorized
8995 // instructions. If that is the case, the one in FoundLane will
8996 // be used.
8997 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8998 isa<LoadInst, StoreInst>(UserInst)) ||
8999 isa<CallInst>(UserInst)) ||
9000 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9001 return UseEntry->State == TreeEntry::ScatterVectorize ||
9002 !doesInTreeUserNeedToExtract(
9003 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9004 TTI);
9005 })) {
9006 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9007 << ".\n");
9008 assert(none_of(UseEntries,
9009 [](TreeEntry *UseEntry) {
9010 return UseEntry->isGather();
9011 }) &&
9012 "Bad state");
9013 continue;
9014 }
9015 U = nullptr;
9016 if (It != ScalarToExtUses.end()) {
9017 ExternalUses[It->second].User = nullptr;
9018 break;
9019 }
9020 }
9021
9022 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9023 U = nullptr;
9024 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9025 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9026 << " from lane " << FoundLane << " from " << *Scalar
9027 << ".\n");
9028 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9029 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9030 ExternalUsesWithNonUsers.insert(Scalar);
9031 if (!U)
9032 break;
9033 }
9034 }
9035 }
9036}
9037
9038SmallVector<SmallVector<StoreInst *>>
9039BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9040 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
9041 SmallVector<StoreInst *>, 8>
9042 PtrToStoresMap;
9043 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9044 Value *V = TE->Scalars[Lane];
9045 // Don't iterate over the users of constant data.
9046 if (!isa<Instruction>(V))
9047 continue;
9048 // To save compilation time we don't visit if we have too many users.
9049 if (V->hasNUsesOrMore(UsesLimit))
9050 break;
9051
9052 // Collect stores per pointer object.
9053 for (User *U : V->users()) {
9054 auto *SI = dyn_cast<StoreInst>(U);
9055 // Test whether we can handle the store. V might be a global, which could
9056 // be used in a different function.
9057 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9058 !isValidElementType(SI->getValueOperand()->getType()))
9059 continue;
9060 // Skip the entry if it is already vectorized.
9061 if (isVectorized(U))
9062 continue;
9063
9064 Value *Ptr =
9065 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9066 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9067 SI->getValueOperand()->getType(), Ptr}];
9068 // For now just keep one store per pointer object per lane.
9069 // TODO: Extend this to support multiple stores per pointer per lane
9070 if (StoresVec.size() > Lane)
9071 continue;
9072 if (!StoresVec.empty()) {
9073 std::optional<int64_t> Diff = getPointersDiff(
9074 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9075 SI->getValueOperand()->getType(),
9076 StoresVec.front()->getPointerOperand(), *DL, *SE,
9077 /*StrictCheck=*/true);
9078 // We failed to compare the pointers so just abandon this store.
9079 if (!Diff)
9080 continue;
9081 }
9082 StoresVec.push_back(SI);
9083 }
9084 }
9085 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9086 unsigned I = 0;
9087 for (auto &P : PtrToStoresMap) {
9088 Res[I].swap(P.second);
9089 ++I;
9090 }
9091 return Res;
9092}
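// Illustrative example: for a tree entry with lanes {A, B}, where A is used
// by "store A, q" and B by "store B, q + 1" (same block, same value type,
// same underlying object q), collectUserStores() returns one group
// {store A, store B}, which canFormVector() below can prove consecutive.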
9093
9094bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9095 OrdersType &ReorderIndices) const {
9096 // We check whether the stores in StoresVec can form a vector by sorting them
9097 // and checking whether they are consecutive.
9098
9099 // To avoid calling getPointersDiff() while sorting we create a vector of
9100 // pairs {store, offset from first} and sort this instead.
9101 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9102 StoreInst *S0 = StoresVec[0];
9103 StoreOffsetVec.emplace_back(0, 0);
9104 Type *S0Ty = S0->getValueOperand()->getType();
9105 Value *S0Ptr = S0->getPointerOperand();
9106 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9107 StoreInst *SI = StoresVec[Idx];
9108 std::optional<int64_t> Diff =
9109 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9110 SI->getPointerOperand(), *DL, *SE,
9111 /*StrictCheck=*/true);
9112 StoreOffsetVec.emplace_back(*Diff, Idx);
9113 }
9114
9115 // Check if the stores are consecutive by checking if their difference is 1.
9116 if (StoreOffsetVec.size() != StoresVec.size())
9117 return false;
9118 sort(StoreOffsetVec, llvm::less_first());
9119 unsigned Idx = 0;
9120 int64_t PrevDist = 0;
9121 for (const auto &P : StoreOffsetVec) {
9122 if (Idx > 0 && P.first != PrevDist + 1)
9123 return false;
9124 PrevDist = P.first;
9125 ++Idx;
9126 }
9127
9128 // Calculate the shuffle indices according to their offset against the sorted
9129 // StoreOffsetVec.
9130 ReorderIndices.assign(StoresVec.size(), 0);
9131 bool IsIdentity = true;
9132 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9133 ReorderIndices[P.second] = I;
9134 IsIdentity &= P.second == I;
9135 }
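// Illustrative example: stores whose per-lane offsets from the first store
// are {0, 2, 1, 3} sort to the consecutive run {0, 1, 2, 3} and produce
// ReorderIndices = {0, 2, 1, 3}: lane I of the entry is written to position
// ReorderIndices[I] of the sorted (memory) order.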
9136 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9137 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9138 // same convention here.
9139 if (IsIdentity)
9140 ReorderIndices.clear();
9141
9142 return true;
9143}
9144
9145#ifndef NDEBUG
9147 for (unsigned Idx : Order)
9148 dbgs() << Idx << ", ";
9149 dbgs() << "\n";
9150}
9151#endif
9152
9153SmallVector<BoUpSLP::OrdersType, 1>
9154BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9155 unsigned NumLanes = TE->Scalars.size();
9156
9157 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9158
9159 // Holds the reorder indices for each candidate store vector that is a user of
9160 // the current TreeEntry.
9161 SmallVector<OrdersType, 1> ExternalReorderIndices;
9162
9163 // Now inspect the stores collected per pointer and look for vectorization
9164 // candidates. For each candidate calculate the reorder index vector and push
9165 // it into `ExternalReorderIndices`
9166 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9167 // If we have fewer than NumLanes stores, then we can't form a vector.
9168 if (StoresVec.size() != NumLanes)
9169 continue;
9170
9171 // If the stores are not consecutive then abandon this StoresVec.
9172 OrdersType ReorderIndices;
9173 if (!canFormVector(StoresVec, ReorderIndices))
9174 continue;
9175
9176 // We now know that the scalars in StoresVec can form a vector instruction,
9177 // so set the reorder indices.
9178 ExternalReorderIndices.push_back(ReorderIndices);
9179 }
9180 return ExternalReorderIndices;
9181}
9182
9183void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9184 const SmallDenseSet<Value *> &UserIgnoreLst) {
9185 deleteTree();
9186 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9187 "TreeEntryToStridedPtrInfoMap is not cleared");
9188 UserIgnoreList = &UserIgnoreLst;
9189 if (!allSameType(Roots))
9190 return;
9191 buildTreeRec(Roots, 0, EdgeInfo());
9192}
9193
9194void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9195 deleteTree();
9196 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9197 "TreeEntryToStridedPtrInfoMap is not cleared");
9198 if (!allSameType(Roots))
9199 return;
9200 buildTreeRec(Roots, 0, EdgeInfo());
9201}
9202
9203/// Tries to find a subvector of loads and builds a new vector containing only
9204/// loads, if that can be profitable.
9205static void gatherPossiblyVectorizableLoads(
9206 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9207 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9208 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9209 bool AddNew = true) {
9210 if (VL.empty())
9211 return;
9212 Type *ScalarTy = getValueType(VL.front());
9213 if (!isValidElementType(ScalarTy))
9214 return;
9215 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9216 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9217 for (Value *V : VL) {
9218 auto *LI = dyn_cast<LoadInst>(V);
9219 if (!LI)
9220 continue;
9221 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9222 continue;
9223 bool IsFound = false;
9224 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9225 assert(LI->getParent() == Data.front().first->getParent() &&
9226 LI->getType() == Data.front().first->getType() &&
9227 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9228 getUnderlyingObject(Data.front().first->getPointerOperand(),
9229 RecursionMaxDepth) &&
9230 "Expected loads with the same type, same parent and same "
9231 "underlying pointer.");
9232 std::optional<int64_t> Dist = getPointersDiff(
9233 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9234 Data.front().first->getPointerOperand(), DL, SE,
9235 /*StrictCheck=*/true);
9236 if (!Dist)
9237 continue;
9238 auto It = Map.find(*Dist);
9239 if (It != Map.end() && It->second != LI)
9240 continue;
9241 if (It == Map.end()) {
9242 Data.emplace_back(LI, *Dist);
9243 Map.try_emplace(*Dist, LI);
9244 }
9245 IsFound = true;
9246 break;
9247 }
9248 if (!IsFound) {
9249 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9250 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9251 }
9252 }
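// The clusters built above are now merged into previously collected groups of
// gathered loads where possible. FindMatchingLoads only accepts a match if the
// cluster contributes new loads, and either all of its loads are new or at
// least half of them (and no fewer than two) already overlap the group while
// the addition lands on or grows past a power-of-two group size.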
9253 auto FindMatchingLoads =
9254 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9255 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9256 &GatheredLoads,
9257 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9258 int64_t &Offset, unsigned &Start) {
9259 if (Loads.empty())
9260 return GatheredLoads.end();
9261 LoadInst *LI = Loads.front().first;
9262 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9263 if (Idx < Start)
9264 continue;
9265 ToAdd.clear();
9266 if (LI->getParent() != Data.front().first->getParent() ||
9267 LI->getType() != Data.front().first->getType())
9268 continue;
9269 std::optional<int64_t> Dist =
9270 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9271 Data.front().first->getType(),
9272 Data.front().first->getPointerOperand(), DL, SE,
9273 /*StrictCheck=*/true);
9274 if (!Dist)
9275 continue;
9276 SmallSet<int64_t, 4> DataDists;
9277 SmallPtrSet<LoadInst *, 4> DataLoads;
9278 for (std::pair<LoadInst *, int64_t> P : Data) {
9279 DataDists.insert(P.second);
9280 DataLoads.insert(P.first);
9281 }
9282 // Found matching gathered loads - check if all loads are unique or
9283 // can be effectively vectorized.
9284 unsigned NumUniques = 0;
9285 for (auto [Cnt, Pair] : enumerate(Loads)) {
9286 bool Used = DataLoads.contains(Pair.first);
9287 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9288 ++NumUniques;
9289 ToAdd.insert(Cnt);
9290 } else if (Used) {
9291 Repeated.insert(Cnt);
9292 }
9293 }
9294 if (NumUniques > 0 &&
9295 (Loads.size() == NumUniques ||
9296 (Loads.size() - NumUniques >= 2 &&
9297 Loads.size() - NumUniques >= Loads.size() / 2 &&
9298 (has_single_bit(Data.size() + NumUniques) ||
9299 bit_ceil(Data.size()) <
9300 bit_ceil(Data.size() + NumUniques))))) {
9301 Offset = *Dist;
9302 Start = Idx + 1;
9303 return std::next(GatheredLoads.begin(), Idx);
9304 }
9305 }
9306 ToAdd.clear();
9307 return GatheredLoads.end();
9308 };
9309 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9310 unsigned Start = 0;
9311 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9312 int64_t Offset = 0;
9313 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9314 Offset, Start);
9315 while (It != GatheredLoads.end()) {
9316 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9317 for (unsigned Idx : LocalToAdd)
9318 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9319 ToAdd.insert_range(LocalToAdd);
9320 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9321 Start);
9322 }
9323 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9324 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9325 })) {
9326 auto AddNewLoads =
9327 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9328 for (unsigned Idx : seq<unsigned>(Data.size())) {
9329 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9330 continue;
9331 Loads.push_back(Data[Idx]);
9332 }
9333 };
9334 if (!AddNew) {
9335 LoadInst *LI = Data.front().first;
9336 It = find_if(
9337 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9338 return PD.front().first->getParent() == LI->getParent() &&
9339 PD.front().first->getType() == LI->getType();
9340 });
9341 while (It != GatheredLoads.end()) {
9342 AddNewLoads(*It);
9343 It = std::find_if(
9344 std::next(It), GatheredLoads.end(),
9345 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9346 return PD.front().first->getParent() == LI->getParent() &&
9347 PD.front().first->getType() == LI->getType();
9348 });
9349 }
9350 }
9351 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9352 AddNewLoads(GatheredLoads.emplace_back());
9353 }
9354 }
9355}
9356
9357void BoUpSLP::tryToVectorizeGatheredLoads(
9358 const SmallMapVector<
9359 std::tuple<BasicBlock *, Value *, Type *>,
9360 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9361 &GatheredLoads) {
9362 GatheredLoadsEntriesFirst = VectorizableTree.size();
9363
9364 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9365 LoadEntriesToVectorize.size());
9366 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9367 Set.insert_range(VectorizableTree[Idx]->Scalars);
9368
9369 // Sort loads by distance.
9370 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9371 const std::pair<LoadInst *, int64_t> &L2) {
9372 return L1.second > L2.second;
9373 };
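// Illustrative note: because the loads are sorted by decreasing offset, a run
// of consecutive addresses appears as LastDist - L.second growing by one per
// element, which is how ProcessGatheredLoads below computes
// MaxConsecutiveDistance.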
9374
9375 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9376 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9377 Loads.size());
9378 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9379 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9380 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9381 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9382 };
9383
9384 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9385 BoUpSLP::ValueSet &VectorizedLoads,
9386 SmallVectorImpl<LoadInst *> &NonVectorized,
9387 bool Final, unsigned MaxVF) {
9388 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9389 unsigned StartIdx = 0;
9390 SmallVector<int> CandidateVFs;
9391 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9392 CandidateVFs.push_back(MaxVF);
9393 for (int NumElts = getFloorFullVectorNumberOfElements(
9394 *TTI, Loads.front()->getType(), MaxVF);
9395 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9396 *TTI, Loads.front()->getType(), NumElts - 1)) {
9397 CandidateVFs.push_back(NumElts);
9398 if (VectorizeNonPowerOf2 && NumElts > 2)
9399 CandidateVFs.push_back(NumElts - 1);
9400 }
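// Illustrative example: with MaxVF = 16 and full vectors at 16, 8, 4 and 2
// elements, CandidateVFs is roughly {16, 8, 4, 2} (plus NumElts - 1 entries
// when non-power-of-2 vectorization is enabled), tried from the widest
// factor down.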
9401
9402 if (Final && CandidateVFs.empty())
9403 return Results;
9404
9405 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9406 for (unsigned NumElts : CandidateVFs) {
9407 if (Final && NumElts > BestVF)
9408 continue;
9409 SmallVector<unsigned> MaskedGatherVectorized;
9410 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9411 ++Cnt) {
9412 ArrayRef<LoadInst *> Slice =
9413 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9414 if (VectorizedLoads.count(Slice.front()) ||
9415 VectorizedLoads.count(Slice.back()) ||
9416 areKnownNonVectorizableLoads(Slice))
9417 continue;
9418 // Check if it is profitable to try vectorizing gathered loads. It is
9419 // profitable if we have more than 3 consecutive loads or if we have
9420 // fewer, but all users are vectorized or deleted.
9421 bool AllowToVectorize = false;
9422 // Check if it is profitable to vectorize 2-elements loads.
9423 if (NumElts == 2) {
9424 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9425 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9426 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9427 for (LoadInst *LI : Slice) {
9428 // If single use/user - allow to vectorize.
9429 if (LI->hasOneUse())
9430 continue;
9431 // 1. Check if number of uses equals number of users.
9432 // 2. All users are deleted.
9433 // 3. The load broadcasts are not allowed or the load is not
9434 // broadcasted.
9435 if (static_cast<unsigned int>(std::distance(
9436 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9437 return false;
9438 if (!IsLegalBroadcastLoad)
9439 continue;
9440 if (LI->hasNUsesOrMore(UsesLimit))
9441 return false;
9442 for (User *U : LI->users()) {
9443 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9444 continue;
9445 for (const TreeEntry *UTE : getTreeEntries(U)) {
9446 for (int I : seq<int>(UTE->getNumOperands())) {
9447 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9448 return V == LI || isa<PoisonValue>(V);
9449 }))
9450 // Found legal broadcast - do not vectorize.
9451 return false;
9452 }
9453 }
9454 }
9455 }
9456 return true;
9457 };
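// Illustrative note: a pair of loads is therefore only widened when every use
// of each load is accounted for and the pair is not better served by
// broadcasting a single (legally broadcastable) reused load.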
9458 AllowToVectorize = CheckIfAllowed(Slice);
9459 } else {
9460 AllowToVectorize =
9461 (NumElts >= 3 ||
9462 any_of(ValueToGatherNodes.at(Slice.front()),
9463 [=](const TreeEntry *TE) {
9464 return TE->Scalars.size() == 2 &&
9465 ((TE->Scalars.front() == Slice.front() &&
9466 TE->Scalars.back() == Slice.back()) ||
9467 (TE->Scalars.front() == Slice.back() &&
9468 TE->Scalars.back() == Slice.front()));
9469 })) &&
9470 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9471 Slice.size());
9472 }
9473 if (AllowToVectorize) {
9474 SmallVector<Value *> PointerOps;
9475 OrdersType CurrentOrder;
9476 // Try to build vector load.
9477 ArrayRef<Value *> Values(
9478 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9479 StridedPtrInfo SPtrInfo;
9480 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9481 PointerOps, SPtrInfo, &BestVF);
9482 if (LS != LoadsState::Gather ||
9483 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9484 if (LS == LoadsState::ScatterVectorize) {
9485 if (MaskedGatherVectorized.empty() ||
9486 Cnt >= MaskedGatherVectorized.back() + NumElts)
9487 MaskedGatherVectorized.push_back(Cnt);
9488 continue;
9489 }
9490 if (LS != LoadsState::Gather) {
9491 Results.emplace_back(Values, LS);
9492 VectorizedLoads.insert_range(Slice);
9493 // If we vectorized initial block, no need to try to vectorize it
9494 // again.
9495 if (Cnt == StartIdx)
9496 StartIdx += NumElts;
9497 }
9498 // Check if the whole array was vectorized already - exit.
9499 if (StartIdx >= Loads.size())
9500 break;
9501 // Erase last masked gather candidate, if another candidate within
9502 // the range is found to be better.
9503 if (!MaskedGatherVectorized.empty() &&
9504 Cnt < MaskedGatherVectorized.back() + NumElts)
9505 MaskedGatherVectorized.pop_back();
9506 Cnt += NumElts - 1;
9507 continue;
9508 }
9509 }
9510 if (!AllowToVectorize || BestVF == 0)
9511 registerNonVectorizableLoads(Slice);
9512 }
9513 // Mark masked gathers candidates as vectorized, if any.
9514 for (unsigned Cnt : MaskedGatherVectorized) {
9515 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9516 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9517 ArrayRef<Value *> Values(
9518 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9519 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9520 VectorizedLoads.insert_range(Slice);
9521 // If we vectorized initial block, no need to try to vectorize it again.
9522 if (Cnt == StartIdx)
9523 StartIdx += NumElts;
9524 }
9525 }
9526 for (LoadInst *LI : Loads) {
9527 if (!VectorizedLoads.contains(LI))
9528 NonVectorized.push_back(LI);
9529 }
9530 return Results;
9531 };
9532 auto ProcessGatheredLoads =
9533 [&, &TTI = *TTI](
9534 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9535 bool Final = false) {
9536 SmallVector<LoadInst *> NonVectorized;
9537 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9538 GatheredLoads) {
9539 if (LoadsDists.size() <= 1) {
9540 NonVectorized.push_back(LoadsDists.back().first);
9541 continue;
9542 }
9543 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9544 LoadsDists);
9545 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9546 stable_sort(LocalLoadsDists, LoadSorter);
9547 SmallVector<LoadInst *> Loads;
9548 unsigned MaxConsecutiveDistance = 0;
9549 unsigned CurrentConsecutiveDist = 1;
9550 int64_t LastDist = LocalLoadsDists.front().second;
9551 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9552 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9553 if (isVectorized(L.first))
9554 continue;
9555 assert(LastDist >= L.second &&
9556 "Expected first distance always not less than second");
9557 if (static_cast<uint64_t>(LastDist - L.second) ==
9558 CurrentConsecutiveDist) {
9559 ++CurrentConsecutiveDist;
9560 MaxConsecutiveDistance =
9561 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9562 Loads.push_back(L.first);
9563 continue;
9564 }
9565 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9566 !Loads.empty())
9567 Loads.pop_back();
9568 CurrentConsecutiveDist = 1;
9569 LastDist = L.second;
9570 Loads.push_back(L.first);
9571 }
9572 if (Loads.size() <= 1)
9573 continue;
9574 if (AllowMaskedGather)
9575 MaxConsecutiveDistance = Loads.size();
9576 else if (MaxConsecutiveDistance < 2)
9577 continue;
9578 BoUpSLP::ValueSet VectorizedLoads;
9579 SmallVector<LoadInst *> SortedNonVectorized;
9580 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9581 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9582 Final, MaxConsecutiveDistance);
9583 if (!Results.empty() && !SortedNonVectorized.empty() &&
9584 OriginalLoads.size() == Loads.size() &&
9585 MaxConsecutiveDistance == Loads.size() &&
9586 any_of(Results,
9587 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9588 return P.second == LoadsState::ScatterVectorize;
9589 })) {
9590 VectorizedLoads.clear();
9591 SmallVector<LoadInst *> UnsortedNonVectorized;
9592 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9593 UnsortedResults =
9594 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9595 UnsortedNonVectorized, Final,
9596 OriginalLoads.size());
9597 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9598 SortedNonVectorized.swap(UnsortedNonVectorized);
9599 Results.swap(UnsortedResults);
9600 }
9601 }
9602 for (auto [Slice, _] : Results) {
9603 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9604 << Slice.size() << ")\n");
9605 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9606 for (Value *L : Slice)
9607 if (!isVectorized(L))
9608 SortedNonVectorized.push_back(cast<LoadInst>(L));
9609 continue;
9610 }
9611
9612 // Select the maximum VF as the maximum over the user gathered nodes and the
9613 // distance between scalar loads in these nodes.
9614 unsigned MaxVF = Slice.size();
9615 unsigned UserMaxVF = 0;
9616 unsigned InterleaveFactor = 0;
9617 if (MaxVF == 2) {
9618 UserMaxVF = MaxVF;
9619 } else {
9620 // Find the distance between segments of the interleaved loads.
9621 std::optional<unsigned> InterleavedLoadsDistance = 0;
9622 unsigned Order = 0;
9623 std::optional<unsigned> CommonVF = 0;
9624 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9625 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9626 for (auto [Idx, V] : enumerate(Slice)) {
9627 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9628 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9629 unsigned Pos =
9630 EntryToPosition.try_emplace(E, Idx).first->second;
9631 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9632 if (CommonVF) {
9633 if (*CommonVF == 0) {
9634 CommonVF = E->Scalars.size();
9635 continue;
9636 }
9637 if (*CommonVF != E->Scalars.size())
9638 CommonVF.reset();
9639 }
9640 // Check if the load is the part of the interleaved load.
9641 if (Pos != Idx && InterleavedLoadsDistance) {
9642 if (!DeinterleavedNodes.contains(E) &&
9643 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9644 if (isa<Constant>(V))
9645 return false;
9646 if (isVectorized(V))
9647 return true;
9648 const auto &Nodes = ValueToGatherNodes.at(V);
9649 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9650 !is_contained(Slice, V);
9651 })) {
9652 InterleavedLoadsDistance.reset();
9653 continue;
9654 }
9655 DeinterleavedNodes.insert(E);
9656 if (*InterleavedLoadsDistance == 0) {
9657 InterleavedLoadsDistance = Idx - Pos;
9658 continue;
9659 }
9660 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9661 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9662 InterleavedLoadsDistance.reset();
9663 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9664 }
9665 }
9666 }
9667 DeinterleavedNodes.clear();
9668 // Check if the large load represents an interleaved load operation.
9669 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9670 CommonVF.value_or(0) != 0) {
9671 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9672 unsigned VF = *CommonVF;
9673 OrdersType Order;
9674 SmallVector<Value *> PointerOps;
9675 StridedPtrInfo SPtrInfo;
9676 // Segmented load detected - vectorize at maximum vector factor.
9677 if (InterleaveFactor <= Slice.size() &&
9678 TTI.isLegalInterleavedAccessType(
9679 getWidenedType(Slice.front()->getType(), VF),
9680 InterleaveFactor,
9681 cast<LoadInst>(Slice.front())->getAlign(),
9682 cast<LoadInst>(Slice.front())
9683 ->getPointerAddressSpace()) &&
9684 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9685 SPtrInfo) == LoadsState::Vectorize) {
9686 UserMaxVF = InterleaveFactor * VF;
9687 } else {
9688 InterleaveFactor = 0;
9689 }
9690 }
9691 // Cannot represent the loads as consecutive vectorizable nodes -
9692 // just exit.
9693 unsigned ConsecutiveNodesSize = 0;
9694 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9695 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9696 [&, Slice = Slice](const auto &P) {
9697 const auto *It = find_if(Slice, [&](Value *V) {
9698 return std::get<1>(P).contains(V);
9699 });
9700 if (It == Slice.end())
9701 return false;
9702 const TreeEntry &TE =
9703 *VectorizableTree[std::get<0>(P)];
9704 ArrayRef<Value *> VL = TE.Scalars;
9705 OrdersType Order;
9706 SmallVector<Value *> PointerOps;
9707 StridedPtrInfo SPtrInfo;
9708 LoadsState State = canVectorizeLoads(
9709 VL, VL.front(), Order, PointerOps, SPtrInfo);
9710 if (State == LoadsState::ScatterVectorize ||
9712 return false;
9713 ConsecutiveNodesSize += VL.size();
9714 size_t Start = std::distance(Slice.begin(), It);
9715 size_t Sz = Slice.size() - Start;
9716 return Sz < VL.size() ||
9717 Slice.slice(Start, VL.size()) != VL;
9718 }))
9719 continue;
9720 // Try to build long masked gather loads.
9721 UserMaxVF = bit_ceil(UserMaxVF);
9722 if (InterleaveFactor == 0 &&
9723 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9724 [&, Slice = Slice](unsigned Idx) {
9725 OrdersType Order;
9726 SmallVector<Value *> PointerOps;
9727 StridedPtrInfo SPtrInfo;
9728 return canVectorizeLoads(
9729 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9730 Slice[Idx * UserMaxVF], Order, PointerOps,
9731 SPtrInfo) == LoadsState::ScatterVectorize;
9732 }))
9733 UserMaxVF = MaxVF;
9734 if (Slice.size() != ConsecutiveNodesSize)
9735 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9736 }
9737 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9738 bool IsVectorized = true;
9739 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9740 ArrayRef<Value *> SubSlice =
9741 Slice.slice(I, std::min(VF, E - I));
9742 if (isVectorized(SubSlice.front()))
9743 continue;
9744 // Check if the subslice is a to-be-vectorized entry which is not
9745 // equal to this entry.
9746 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9747 [&](const auto &P) {
9748 return !SubSlice.equals(
9749 VectorizableTree[std::get<0>(P)]
9750 ->Scalars) &&
9751 set_is_subset(SubSlice, std::get<1>(P));
9752 }))
9753 continue;
9754 unsigned Sz = VectorizableTree.size();
9755 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9756 if (Sz == VectorizableTree.size()) {
9757 IsVectorized = false;
9758 // Try non-interleaved vectorization with smaller vector
9759 // factor.
9760 if (InterleaveFactor > 0) {
9761 VF = 2 * (MaxVF / InterleaveFactor);
9762 InterleaveFactor = 0;
9763 }
9764 continue;
9765 }
9766 }
9767 if (IsVectorized)
9768 break;
9769 }
9770 }
9771 NonVectorized.append(SortedNonVectorized);
9772 }
9773 return NonVectorized;
9774 };
9775 for (const auto &GLs : GatheredLoads) {
9776 const auto &Ref = GLs.second;
9777 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9778 if (!Ref.empty() && !NonVectorized.empty() &&
9779 std::accumulate(
9780 Ref.begin(), Ref.end(), 0u,
9781 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9782 -> unsigned { return S + LoadsDists.size(); }) !=
9783 NonVectorized.size() &&
9784 IsMaskedGatherSupported(NonVectorized)) {
9785 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9786 FinalGatheredLoads;
9787 for (LoadInst *LI : NonVectorized) {
9788 // Reinsert non-vectorized loads to other list of loads with the same
9789 // base pointers.
9790 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9791 FinalGatheredLoads,
9792 /*AddNew=*/false);
9793 }
9794 // Final attempt to vectorize non-vectorized loads.
9795 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9796 }
9797 }
9798 // Try to vectorize postponed load entries, previously marked as gathered.
9799 for (unsigned Idx : LoadEntriesToVectorize) {
9800 const TreeEntry &E = *VectorizableTree[Idx];
9801 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9802 // Avoid reordering, if possible.
9803 if (!E.ReorderIndices.empty()) {
9804 // Build a mask out of the reorder indices and reorder scalars per this
9805 // mask.
9806 SmallVector<int> ReorderMask;
9807 inversePermutation(E.ReorderIndices, ReorderMask);
9808 reorderScalars(GatheredScalars, ReorderMask);
9809 }
9810 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9811 }
9812 // If no new entries were created, treat it as if no gathered-loads entries
9813 // need to be handled.
9814 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9815 VectorizableTree.size())
9816 GatheredLoadsEntriesFirst.reset();
9817}
9818
9819/// Generates a key/subkey pair for the given value to provide effective sorting
9820/// of the values and better detection of vectorizable value sequences. The
9821/// keys can be used to sort the values themselves, and the subkeys to sort
9822/// within value subgroups.
9823static std::pair<size_t, size_t> generateKeySubkey(
9824 Value *V, const TargetLibraryInfo *TLI,
9825 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9826 bool AllowAlternate) {
9827 hash_code Key = hash_value(V->getValueID() + 2);
9828 hash_code SubKey = hash_value(0);
9829 // Sort the loads by the distance between the pointers.
9830 if (auto *LI = dyn_cast<LoadInst>(V)) {
9831 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9832 if (LI->isSimple())
9833 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9834 else
9835 Key = SubKey = hash_value(LI);
9836 } else if (isVectorLikeInstWithConstOps(V)) {
9837 // Sort extracts by the vector operands.
9838 if (isa<ExtractElementInst, UndefValue>(V))
9839 Key = hash_value(Value::UndefValueVal + 1);
9840 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9841 if (!isUndefVector(EI->getVectorOperand()).all() &&
9842 !isa<UndefValue>(EI->getIndexOperand()))
9843 SubKey = hash_value(EI->getVectorOperand());
9844 }
9845 } else if (auto *I = dyn_cast<Instruction>(V)) {
9846 // Sort other instructions just by the opcodes except for CMPInst.
9847 // For CMP also sort by the predicate kind.
9848 if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
9849 isValidForAlternation(I->getOpcode())) {
9850 if (AllowAlternate)
9851 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9852 else
9853 Key = hash_combine(hash_value(I->getOpcode()), Key);
9854 SubKey = hash_combine(
9855 hash_value(I->getOpcode()), hash_value(I->getType()),
9856 hash_value(isa<BinaryOperator>(I)
9857 ? I->getType()
9858 : cast<CastInst>(I)->getOperand(0)->getType()));
9859 // For casts, look through the only operand to improve compile time.
9860 if (isa<CastInst>(I)) {
9861 std::pair<size_t, size_t> OpVals =
9862 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9863 /*AllowAlternate=*/true);
9864 Key = hash_combine(OpVals.first, Key);
9865 SubKey = hash_combine(OpVals.first, SubKey);
9866 }
9867 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9868 CmpInst::Predicate Pred = CI->getPredicate();
9869 if (CI->isCommutative())
9870 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9871 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9872 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9873 hash_value(SwapPred),
9874 hash_value(CI->getOperand(0)->getType()));
9875 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9876 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9877 if (isTriviallyVectorizable(ID)) {
9878 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9879 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9880 SubKey = hash_combine(hash_value(I->getOpcode()),
9881 hash_value(Call->getCalledFunction()));
9882 } else {
9884 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9885 }
9886 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9887 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9888 hash_value(Op.Tag), SubKey);
9889 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9890 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9891 SubKey = hash_value(Gep->getPointerOperand());
9892 else
9893 SubKey = hash_value(Gep);
9894 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9895 !isa<ConstantInt>(I->getOperand(1))) {
9896 // Do not try to vectorize instructions with potentially high cost.
9897 SubKey = hash_value(I);
9898 } else {
9899 SubKey = hash_value(I->getOpcode());
9900 }
9901 Key = hash_combine(hash_value(I->getParent()), Key);
9902 }
9903 return std::make_pair(Key, SubKey);
9904}
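// Illustrative example: two simple loads of the same type always share a Key
// (combining the load type and opcode), while their SubKey comes from
// LoadsSubkeyGenerator, typically grouping loads by pointer distance so that
// related loads sort next to each other.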
9905
9906/// Checks if the specified instruction \p I is a main operation for the given
9907/// \p MainOp and \p AltOp instructions.
9908static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9909 Instruction *AltOp, const TargetLibraryInfo &TLI);
9910
9911bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9912 ArrayRef<Value *> VL) const {
9913 Type *ScalarTy = S.getMainOp()->getType();
9914 unsigned Opcode0 = S.getOpcode();
9915 unsigned Opcode1 = S.getAltOpcode();
9916 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9917 // If this pattern is supported by the target then consider it profitable.
9918 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9919 Opcode1, OpcodeMask))
9920 return true;
9921 SmallVector<ValueList> Operands;
9922 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9923 Operands.emplace_back();
9924 // Prepare the operand vector.
9925 for (Value *V : VL) {
9926 if (isa<PoisonValue>(V)) {
9927 Operands.back().push_back(
9928 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9929 continue;
9930 }
9931 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9932 }
9933 }
9934 if (Operands.size() == 2) {
9935 // Try to find the best operand candidates.
9936 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9937 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9938 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9939 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9940 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9941 std::optional<int> Res = findBestRootPair(Candidates);
9942 switch (Res.value_or(0)) {
9943 case 0:
9944 break;
9945 case 1:
9946 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9947 break;
9948 case 2:
9949 std::swap(Operands[0][I], Operands[1][I]);
9950 break;
9951 default:
9952 llvm_unreachable("Unexpected index.");
9953 }
9954 }
9955 }
9956 DenseSet<unsigned> UniqueOpcodes;
9957 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9958 unsigned NonInstCnt = 0;
9959 // Estimate the number of instructions required for the vectorized node and
9960 // for the buildvector node.
9961 unsigned UndefCnt = 0;
9962 // Count the number of extra shuffles required for vector nodes.
9963 unsigned ExtraShuffleInsts = 0;
9964 // Check that operands do not contain same values and create either perfect
9965 // diamond match or shuffled match.
9966 if (Operands.size() == 2) {
9967 // Do not count same operands twice.
9968 if (Operands.front() == Operands.back()) {
9969 Operands.erase(Operands.begin());
9970 } else if (!allConstant(Operands.front()) &&
9971 all_of(Operands.front(), [&](Value *V) {
9972 return is_contained(Operands.back(), V);
9973 })) {
9974 Operands.erase(Operands.begin());
9975 ++ExtraShuffleInsts;
9976 }
9977 }
9978 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9979 // Vectorize the node if:
9980 // 1. at least a single operand is constant or splat;
9981 // 2. operands have many loop invariants (the instructions are not loop
9982 // invariants);
9983 // 3. at least a single unique operand is supposed to be vectorized.
9984 return none_of(Operands,
9985 [&](ArrayRef<Value *> Op) {
9986 if (allConstant(Op) ||
9987 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9988 getSameOpcode(Op, *TLI)))
9989 return false;
9990 DenseMap<Value *, unsigned> Uniques;
9991 for (Value *V : Op) {
9992 if (isa<Constant, ExtractElementInst>(V) ||
9993 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9994 if (isa<UndefValue>(V))
9995 ++UndefCnt;
9996 continue;
9997 }
9998 auto Res = Uniques.try_emplace(V, 0);
9999 // Found first duplicate - need to add shuffle.
10000 if (!Res.second && Res.first->second == 1)
10001 ++ExtraShuffleInsts;
10002 ++Res.first->getSecond();
10003 if (auto *I = dyn_cast<Instruction>(V))
10004 UniqueOpcodes.insert(I->getOpcode());
10005 else if (Res.second)
10006 ++NonInstCnt;
10007 }
10008 return none_of(Uniques, [&](const auto &P) {
10009 return P.first->hasNUsesOrMore(P.second + 1) &&
10010 none_of(P.first->users(), [&](User *U) {
10011 return isVectorized(U) || Uniques.contains(U);
10012 });
10013 });
10014 }) ||
10015 // Do not vectorize node, if estimated number of vector instructions is
10016 // more than estimated number of buildvector instructions. Number of
10017 // vector operands is number of vector instructions + number of vector
10018 // instructions for operands (buildvectors). Number of buildvector
10019 // instructions is just number_of_operands * number_of_scalars.
10020 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10021 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10022 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10023}
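// Illustrative summary: an alternate-opcode node is kept only when its
// operands are not dominated by undefs and the rough vector-side estimate
// (unique operand opcodes + non-instruction operands + extra shuffles +
// main/alt/blend) stays below the buildvector estimate of roughly
// NumOperands * VL.size() scalar insertions.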
10024
10025/// Builds the argument types vector for the given call instruction with the
10026/// given \p ID for the specified vector factor.
10027static SmallVector<Type *>
10028buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10029 const unsigned VF, unsigned MinBW,
10030 const TargetTransformInfo *TTI) {
10031 SmallVector<Type *> ArgTys;
10032 for (auto [Idx, Arg] : enumerate(CI->args())) {
10033 if (ID != Intrinsic::not_intrinsic) {
10034 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
10035 ArgTys.push_back(Arg->getType());
10036 continue;
10037 }
10038 if (MinBW > 0) {
10039 ArgTys.push_back(
10040 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10041 continue;
10042 }
10043 }
10044 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10045 }
10046 return ArgTys;
10047}
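// Illustrative example: for a 4-wide llvm.smax call on i32 values with
// MinBW == 0, both arguments map to <4 x i32>; an argument flagged by
// isVectorIntrinsicWithScalarOpAtArg() keeps its scalar type, and a nonzero
// MinBW overrides the widened integer element width instead.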
10048
10049/// Calculates the costs of the vectorized intrinsic (if possible) and the
10050/// vectorized library function (if possible) calls. Returns an invalid cost
10051/// for the corresponding calls if they cannot be vectorized / will be scalarized.
10052static std::pair<InstructionCost, InstructionCost>
10053getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10054 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10055 ArrayRef<Type *> ArgTys) {
10056 auto Shape = VFShape::get(CI->getFunctionType(),
10057 ElementCount::getFixed(VecTy->getNumElements()),
10058 false /*HasGlobalPred*/);
10059 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10060 auto LibCost = InstructionCost::getInvalid();
10061 if (!CI->isNoBuiltin() && VecFunc) {
10062 // Calculate the cost of the vector library call.
10063 // If the corresponding vector call is cheaper, return its cost.
10064 LibCost =
10065 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10066 }
10067 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10068
10069 // Calculate the cost of the vector intrinsic call.
10070 FastMathFlags FMF;
10071 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10072 FMF = FPCI->getFastMathFlags();
10073 const InstructionCost ScalarLimit = 10000;
10074 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10075 LibCost.isValid() ? LibCost : ScalarLimit);
10076 auto IntrinsicCost =
10077 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10078 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10079 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10080 IntrinsicCost = InstructionCost::getInvalid();
10081
10082 return {IntrinsicCost, LibCost};
10083}
10084
10085BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10086 const InstructionsState &S, ArrayRef<Value *> VL,
10087 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10088 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10089 assert(S.getMainOp() &&
10090 "Expected instructions with same/alternate opcodes only.");
10091
10092 unsigned ShuffleOrOp =
10093 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10094 Instruction *VL0 = S.getMainOp();
10095 switch (ShuffleOrOp) {
10096 case Instruction::PHI: {
10097 // Too many operands - gather, most probably won't be vectorized.
10098 if (VL0->getNumOperands() > MaxPHINumOperands)
10099 return TreeEntry::NeedToGather;
10100 // Check for terminator values (e.g. invoke).
10101 for (Value *V : VL) {
10102 auto *PHI = dyn_cast<PHINode>(V);
10103 if (!PHI)
10104 continue;
10105 for (Value *Incoming : PHI->incoming_values()) {
10106 Instruction *Term = dyn_cast<Instruction>(Incoming);
10107 if (Term && Term->isTerminator()) {
10108 LLVM_DEBUG(dbgs()
10109 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10110 return TreeEntry::NeedToGather;
10111 }
10112 }
10113 }
10114
10115 return TreeEntry::Vectorize;
10116 }
10117 case Instruction::ExtractElement:
10118 if (any_of(VL, [&](Value *V) {
10119 auto *EI = dyn_cast<ExtractElementInst>(V);
10120 if (!EI)
10121 return true;
10122 return isVectorized(EI->getOperand(0));
10123 }))
10124 return TreeEntry::NeedToGather;
10125 [[fallthrough]];
10126 case Instruction::ExtractValue: {
10127 bool Reuse = canReuseExtract(VL, CurrentOrder);
10128 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10129 // non-full registers).
10130 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10131 return TreeEntry::NeedToGather;
10132 if (Reuse || !CurrentOrder.empty())
10133 return TreeEntry::Vectorize;
10134 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10135 return TreeEntry::NeedToGather;
10136 }
10137 case Instruction::InsertElement: {
10138 // Check that we have a buildvector and not a shuffle of 2 or more
10139 // different vectors.
10140 ValueSet SourceVectors;
10141 for (Value *V : VL) {
10142 if (isa<PoisonValue>(V)) {
10143 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10144 return TreeEntry::NeedToGather;
10145 }
10146 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10147 assert(getElementIndex(V) != std::nullopt &&
10148 "Non-constant or undef index?");
10149 }
10150
10151 if (count_if(VL, [&SourceVectors](Value *V) {
10152 return !SourceVectors.contains(V);
10153 }) >= 2) {
10154 // Found 2nd source vector - cancel.
10155 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10156 "different source vectors.\n");
10157 return TreeEntry::NeedToGather;
10158 }
10159
10160 if (any_of(VL, [&SourceVectors](Value *V) {
10161 // The last InsertElement can have multiple uses.
10162 return SourceVectors.contains(V) && !V->hasOneUse();
10163 })) {
10164 assert(SLPReVec && "Only supported by REVEC.");
10165 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10166 "multiple uses.\n");
10167 return TreeEntry::NeedToGather;
10168 }
10169
10170 return TreeEntry::Vectorize;
10171 }
10172 case Instruction::Load: {
10173 // Check that a vectorized load would load the same memory as a scalar
10174 // load. For example, we don't want to vectorize loads that are smaller
10175 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10176 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10177 // from such a struct, we read/write packed bits disagreeing with the
10178 // unvectorized version.
10179 auto IsGatheredNode = [&]() {
10180 if (!GatheredLoadsEntriesFirst)
10181 return false;
10182 return all_of(VL, [&](Value *V) {
10183 if (isa<PoisonValue>(V))
10184 return true;
10185 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10186 return TE->Idx >= *GatheredLoadsEntriesFirst;
10187 });
10188 });
10189 };
10190 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10191 case LoadsState::Vectorize:
10192 return TreeEntry::Vectorize;
10193 case LoadsState::CompressVectorize:
10194 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10195 // Delay slow vectorized nodes for better vectorization attempts.
10196 LoadEntriesToVectorize.insert(VectorizableTree.size());
10197 return TreeEntry::NeedToGather;
10198 }
10199 return IsGatheredNode() ? TreeEntry::NeedToGather
10200 : TreeEntry::CompressVectorize;
10201 case LoadsState::ScatterVectorize:
10202 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10203 // Delay slow vectorized nodes for better vectorization attempts.
10204 LoadEntriesToVectorize.insert(VectorizableTree.size());
10205 return TreeEntry::NeedToGather;
10206 }
10207 return IsGatheredNode() ? TreeEntry::NeedToGather
10208 : TreeEntry::ScatterVectorize;
10209 case LoadsState::StridedVectorize:
10210 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10211 // Delay slow vectorized nodes for better vectorization attempts.
10212 LoadEntriesToVectorize.insert(VectorizableTree.size());
10213 return TreeEntry::NeedToGather;
10214 }
10215 return IsGatheredNode() ? TreeEntry::NeedToGather
10216 : TreeEntry::StridedVectorize;
10217 case LoadsState::Gather:
10218#ifndef NDEBUG
10219 Type *ScalarTy = VL0->getType();
10220 if (DL->getTypeSizeInBits(ScalarTy) !=
10221 DL->getTypeAllocSizeInBits(ScalarTy))
10222 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10223 else if (any_of(VL, [](Value *V) {
10224 auto *LI = dyn_cast<LoadInst>(V);
10225 return !LI || !LI->isSimple();
10226 }))
10227 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10228 else
10229 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10230#endif // NDEBUG
10231 registerNonVectorizableLoads(VL);
10232 return TreeEntry::NeedToGather;
10233 }
10234 llvm_unreachable("Unexpected state of loads");
10235 }
10236 case Instruction::ZExt:
10237 case Instruction::SExt:
10238 case Instruction::FPToUI:
10239 case Instruction::FPToSI:
10240 case Instruction::FPExt:
10241 case Instruction::PtrToInt:
10242 case Instruction::IntToPtr:
10243 case Instruction::SIToFP:
10244 case Instruction::UIToFP:
10245 case Instruction::Trunc:
10246 case Instruction::FPTrunc:
10247 case Instruction::BitCast: {
10248 Type *SrcTy = VL0->getOperand(0)->getType();
10249 for (Value *V : VL) {
10250 if (isa<PoisonValue>(V))
10251 continue;
10252 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10253 if (Ty != SrcTy || !isValidElementType(Ty)) {
10254 LLVM_DEBUG(
10255 dbgs() << "SLP: Gathering casts with different src types.\n");
10256 return TreeEntry::NeedToGather;
10257 }
10258 }
10259 return TreeEntry::Vectorize;
10260 }
10261 case Instruction::ICmp:
10262 case Instruction::FCmp: {
10263 // Check that all of the compares have the same predicate.
10264 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10265 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10266 Type *ComparedTy = VL0->getOperand(0)->getType();
10267 for (Value *V : VL) {
10268 if (isa<PoisonValue>(V))
10269 continue;
10270 auto *Cmp = cast<CmpInst>(V);
10271 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10272 Cmp->getOperand(0)->getType() != ComparedTy) {
10273 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10274 return TreeEntry::NeedToGather;
10275 }
10276 }
10277 return TreeEntry::Vectorize;
10278 }
10279 case Instruction::Select:
10280 case Instruction::FNeg:
10281 case Instruction::Add:
10282 case Instruction::FAdd:
10283 case Instruction::Sub:
10284 case Instruction::FSub:
10285 case Instruction::Mul:
10286 case Instruction::FMul:
10287 case Instruction::UDiv:
10288 case Instruction::SDiv:
10289 case Instruction::FDiv:
10290 case Instruction::URem:
10291 case Instruction::SRem:
10292 case Instruction::FRem:
10293 case Instruction::Shl:
10294 case Instruction::LShr:
10295 case Instruction::AShr:
10296 case Instruction::And:
10297 case Instruction::Or:
10298 case Instruction::Xor:
10299 case Instruction::Freeze:
10300 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10301 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10302 auto *I = dyn_cast<Instruction>(V);
10303 return I && I->isBinaryOp() && !I->isFast();
10304 }))
10305 return TreeEntry::NeedToGather;
10306 return TreeEntry::Vectorize;
10307 case Instruction::GetElementPtr: {
10308 // We don't combine GEPs with complicated (nested) indexing.
10309 for (Value *V : VL) {
10310 auto *I = dyn_cast<GetElementPtrInst>(V);
10311 if (!I)
10312 continue;
10313 if (I->getNumOperands() != 2) {
10314 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10315 return TreeEntry::NeedToGather;
10316 }
10317 }
10318
10319 // We can't combine several GEPs into one vector if they operate on
10320 // different types.
10321 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10322 for (Value *V : VL) {
10323 auto *GEP = dyn_cast<GEPOperator>(V);
10324 if (!GEP)
10325 continue;
10326 Type *CurTy = GEP->getSourceElementType();
10327 if (Ty0 != CurTy) {
10328 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10329 return TreeEntry::NeedToGather;
10330 }
10331 }
10332
10333 // We don't combine GEPs with non-constant indexes.
10334 Type *Ty1 = VL0->getOperand(1)->getType();
10335 for (Value *V : VL) {
10336 auto *I = dyn_cast<GetElementPtrInst>(V);
10337 if (!I)
10338 continue;
10339 auto *Op = I->getOperand(1);
10340 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10341 (Op->getType() != Ty1 &&
10342 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10343 Op->getType()->getScalarSizeInBits() >
10344 DL->getIndexSizeInBits(
10345 V->getType()->getPointerAddressSpace())))) {
10346 LLVM_DEBUG(
10347 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10348 return TreeEntry::NeedToGather;
10349 }
10350 }
10351
10352 return TreeEntry::Vectorize;
10353 }
10354 case Instruction::Store: {
10355 // Check if the stores are consecutive or if we need to swizzle them.
10356 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10357 // Avoid types that are padded when being allocated as scalars, while
10358 // being packed together in a vector (such as i1).
10359 if (DL->getTypeSizeInBits(ScalarTy) !=
10360 DL->getTypeAllocSizeInBits(ScalarTy)) {
10361 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10362 return TreeEntry::NeedToGather;
10363 }
10364 // Make sure all stores in the bundle are simple - we can't vectorize
10365 // atomic or volatile stores.
10366 for (Value *V : VL) {
10367 auto *SI = cast<StoreInst>(V);
10368 if (!SI->isSimple()) {
10369 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10370 return TreeEntry::NeedToGather;
10371 }
10372 PointerOps.push_back(SI->getPointerOperand());
10373 }
10374
10375 // Check the order of pointer operands.
10376 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10377 Value *Ptr0;
10378 Value *PtrN;
10379 if (CurrentOrder.empty()) {
10380 Ptr0 = PointerOps.front();
10381 PtrN = PointerOps.back();
10382 } else {
10383 Ptr0 = PointerOps[CurrentOrder.front()];
10384 PtrN = PointerOps[CurrentOrder.back()];
10385 }
10386 std::optional<int64_t> Dist =
10387 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10388 // Check that the sorted pointer operands are consecutive.
10389 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10390 return TreeEntry::Vectorize;
10391 }
10392
10393 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10394 return TreeEntry::NeedToGather;
10395 }
10396 case Instruction::Call: {
10397 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10398 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10399 auto *I = dyn_cast<Instruction>(V);
10400 return I && !I->isFast();
10401 }))
10402 return TreeEntry::NeedToGather;
10403 // Check if the calls are all to the same vectorizable intrinsic or
10404 // library function.
10405 CallInst *CI = cast<CallInst>(VL0);
10406 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10407
10408 VFShape Shape = VFShape::get(
10409 CI->getFunctionType(),
10410 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10411 false /*HasGlobalPred*/);
10412 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10413
10414 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10415 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10416 return TreeEntry::NeedToGather;
10417 }
10418 Function *F = CI->getCalledFunction();
10419 unsigned NumArgs = CI->arg_size();
10420 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10421 for (unsigned J = 0; J != NumArgs; ++J)
10422 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10423 ScalarArgs[J] = CI->getArgOperand(J);
10424 for (Value *V : VL) {
10425 CallInst *CI2 = dyn_cast<CallInst>(V);
10426 if (!CI2 || CI2->getCalledFunction() != F ||
10427 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10428 (VecFunc &&
10429 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10430 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10431 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10432 << "\n");
10433 return TreeEntry::NeedToGather;
10434 }
10435 // Some intrinsics have scalar arguments and should be same in order for
10436 // them to be vectorized.
10437 for (unsigned J = 0; J != NumArgs; ++J) {
10438 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10439 Value *A1J = CI2->getArgOperand(J);
10440 if (ScalarArgs[J] != A1J) {
10441 LLVM_DEBUG(dbgs()
10442 << "SLP: mismatched arguments in call:" << *CI
10443 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10444 return TreeEntry::NeedToGather;
10445 }
10446 }
10447 }
10448 // Verify that the bundle operands are identical between the two calls.
10449 if (CI->hasOperandBundles() &&
10450 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10451 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10452 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10453 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10454 << "!=" << *V << '\n');
10455 return TreeEntry::NeedToGather;
10456 }
10457 }
10458 SmallVector<Type *> ArgTys =
10459 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10460 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10461 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10462 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10463 return TreeEntry::NeedToGather;
10464
10465 return TreeEntry::Vectorize;
10466 }
10467 case Instruction::ShuffleVector: {
10468 if (!S.isAltShuffle()) {
10469 // REVEC can support non alternate shuffle.
10470 if (SLPReVec && getShufflevectorNumGroups(VL))
10471 return TreeEntry::Vectorize;
10472 // If this is not an alternate sequence of opcode like add-sub
10473 // then do not vectorize this instruction.
10474 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10475 return TreeEntry::NeedToGather;
10476 }
10477 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10478 LLVM_DEBUG(
10479 dbgs()
10480 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10481 "the whole alt sequence is not profitable.\n");
10482 return TreeEntry::NeedToGather;
10483 }
10484
10485 return TreeEntry::Vectorize;
10486 }
10487 default:
10488 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10489 return TreeEntry::NeedToGather;
10490 }
10491}
10492
10493namespace {
10494/// Allows correct handling of the operands of PHI nodes, following the \p Main
10495/// PHINode's order of incoming basic blocks/values.
10496class PHIHandler {
10497 DominatorTree &DT;
10498 PHINode *Main = nullptr;
10499 SmallVector<Value *> Phis;
10500 SmallVector<SmallVector<Value *>> Operands;
10501
10502public:
10503 PHIHandler() = delete;
10504 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10505 : DT(DT), Main(Main), Phis(Phis),
10506 Operands(Main->getNumIncomingValues(),
10507 SmallVector<Value *>(Phis.size(), nullptr)) {}
10508 void buildOperands() {
10509 constexpr unsigned FastLimit = 4;
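// Illustrative note: with at most FastLimit incoming blocks, the per-lane
// lookup below (which may call the linear-time getIncomingValueForBlock)
// stays cheap; larger PHIs take the map-based path that first groups incoming
// indices per block to avoid quadratic behavior.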
10510 if (Main->getNumIncomingValues() <= FastLimit) {
10511 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10512 BasicBlock *InBB = Main->getIncomingBlock(I);
10513 if (!DT.isReachableFromEntry(InBB)) {
10514 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10515 continue;
10516 }
10517 // Prepare the operand vector.
10518 for (auto [Idx, V] : enumerate(Phis)) {
10519 auto *P = dyn_cast<PHINode>(V);
10520 if (!P) {
10521 assert(isa<PoisonValue>(V) &&
10522 "Expected isa instruction or poison value.");
10523 Operands[I][Idx] = V;
10524 continue;
10525 }
10526 if (P->getIncomingBlock(I) == InBB)
10527 Operands[I][Idx] = P->getIncomingValue(I);
10528 else
10529 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10530 }
10531 }
10532 return;
10533 }
10534 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10535 Blocks;
10536 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10537 BasicBlock *InBB = Main->getIncomingBlock(I);
10538 if (!DT.isReachableFromEntry(InBB)) {
10539 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10540 continue;
10541 }
10542 Blocks.try_emplace(InBB).first->second.push_back(I);
10543 }
10544 for (auto [Idx, V] : enumerate(Phis)) {
10545 if (isa<PoisonValue>(V)) {
10546 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10547 Operands[I][Idx] = V;
10548 continue;
10549 }
10550 auto *P = cast<PHINode>(V);
10551 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10552 BasicBlock *InBB = P->getIncomingBlock(I);
10553 if (InBB == Main->getIncomingBlock(I)) {
10554 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10555 continue;
10556 Operands[I][Idx] = P->getIncomingValue(I);
10557 continue;
10558 }
10559 auto *It = Blocks.find(InBB);
10560 if (It == Blocks.end())
10561 continue;
10562 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10563 }
10564 }
10565 for (const auto &P : Blocks) {
10566 ArrayRef<unsigned> IncomingValues = P.second;
10567 if (IncomingValues.size() <= 1)
10568 continue;
10569 unsigned BasicI = IncomingValues.consume_front();
10570 for (unsigned I : IncomingValues) {
10571 assert(all_of(enumerate(Operands[I]),
10572 [&](const auto &Data) {
10573 return !Data.value() ||
10574 Data.value() == Operands[BasicI][Data.index()];
10575 }) &&
10576 "Expected empty operands list.");
10577 Operands[I] = Operands[BasicI];
10578 }
10579 }
10580 }
10581 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10582};
10583} // namespace
10584
10585 /// Returns the main/alternate instructions for the given \p VL. Unlike
10586 /// getSameOpcode, supports non-compatible instructions for better
10587 /// SplitVectorize node support.
10588 /// \returns the first main/alt instructions if the list contains only poison
10589 /// values and instructions with exactly 2 opcodes; a pair of nullptrs otherwise.
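/// Illustrative example: {add, sub, poison, add}, all in one block, yields
/// {the first add, the sub}; a third opcode, or same-opcode instructions from
/// different blocks, yields {nullptr, nullptr}.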
10590 static std::pair<Instruction *, Instruction *>
10591 getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10592 Instruction *MainOp = nullptr;
10593 Instruction *AltOp = nullptr;
10594 for (Value *V : VL) {
10595 if (isa<PoisonValue>(V))
10596 continue;
10597 auto *I = dyn_cast<Instruction>(V);
10598 if (!I)
10599 return {};
10600 if (!MainOp) {
10601 MainOp = I;
10602 continue;
10603 }
10604 if (MainOp->getOpcode() == I->getOpcode()) {
10605 if (I->getParent() != MainOp->getParent())
10606 return {};
10607 continue;
10608 }
10609 if (!AltOp) {
10610 AltOp = I;
10611 continue;
10612 }
10613 if (AltOp->getOpcode() == I->getOpcode()) {
10614 if (I->getParent() != AltOp->getParent())
10615 return {};
10616 continue;
10617 }
10618 return {};
10619 }
10620 if (!AltOp)
10621 return {};
10622 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10623 "Expected different main and alt instructions.");
10624 return std::make_pair(MainOp, AltOp);
10625}
10626
10627/// Checks that every instruction appears once in the list and if not, packs
10628/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10629/// unique scalars is extended by poison values to the whole register size.
10630///
10631/// \returns false if \p VL could not be uniquified, in which case \p VL is
10632/// unchanged and \p ReuseShuffleIndices is empty.
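/// Illustrative example (hypothetical values): VL = {%x, %y, %x, poison} is
/// packed to {%x, %y, poison} with ReuseShuffleIndices = {0, 1, 0,
/// PoisonMaskElem}; the unique list may then be padded with poison values up
/// to a full register size.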
10633 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10634 SmallVectorImpl<int> &ReuseShuffleIndices,
10635 const TargetTransformInfo &TTI,
10636 const TargetLibraryInfo &TLI,
10637 const InstructionsState &S,
10638 const BoUpSLP::EdgeInfo &UserTreeIdx,
10639 bool TryPad = false) {
10640 // Check that every instruction appears once in this bundle.
10641 SmallVector<Value *> UniqueValues;
10642 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10643 for (Value *V : VL) {
10644 if (isConstant(V)) {
10645 // Constants are always considered distinct, even if the same constant
10646 // appears multiple times in VL.
10647 ReuseShuffleIndices.emplace_back(
10648 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10649 UniqueValues.emplace_back(V);
10650 continue;
10651 }
10652 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10653 ReuseShuffleIndices.emplace_back(Res.first->second);
10654 if (Res.second)
10655 UniqueValues.emplace_back(V);
10656 }
10657
10658 // Easy case: VL has unique values and a "natural" size
10659 size_t NumUniqueScalarValues = UniqueValues.size();
10660 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10661 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10662 if (NumUniqueScalarValues == VL.size() &&
10663 (VectorizeNonPowerOf2 || IsFullVectors)) {
10664 ReuseShuffleIndices.clear();
10665 return true;
10666 }
10667
10668 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10669 if ((UserTreeIdx.UserTE &&
10670 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10671 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10672 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10673 "for nodes with padding.\n");
10674 ReuseShuffleIndices.clear();
10675 return false;
10676 }
10677
10678 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10679 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10680 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10681 return isa<UndefValue>(V) || !isConstant(V);
10682 }))) {
10683 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10684 S.getMainOp()->isSafeToRemove() &&
10685 (S.areInstructionsWithCopyableElements() ||
10686 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10687 // Find the number of elements, which forms full vectors.
10688 unsigned PWSz = getFullVectorNumberOfElements(
10689 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10690 PWSz = std::min<unsigned>(PWSz, VL.size());
10691 if (PWSz == VL.size()) {
10692 // We ended up with the same size after removing duplicates and
10693 // upgrading the resulting vector size to a "nice size". Just keep
10694 // the initial VL then.
10695 ReuseShuffleIndices.clear();
10696 } else {
10697 // Pad unique values with poison to grow the vector to a "nice" size
10698 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10699 UniqueValues.end());
10700 PaddedUniqueValues.append(
10701 PWSz - UniqueValues.size(),
10702 PoisonValue::get(UniqueValues.front()->getType()));
10703 // Check that the operations, extended with poisons/copyables, are still
10704 // valid for vectorization (div/rem are not allowed).
10705 if ((!S.areInstructionsWithCopyableElements() &&
10706 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10707 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10708 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10709 isa<CallInst>(S.getMainOp())))) {
10710 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10711 ReuseShuffleIndices.clear();
10712 return false;
10713 }
10714 VL = std::move(PaddedUniqueValues);
10715 }
10716 return true;
10717 }
10718 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10719 ReuseShuffleIndices.clear();
10720 return false;
10721 }
10722 VL = std::move(UniqueValues);
10723 return true;
10724}
10725
10726bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10727 const InstructionsState &LocalState,
10728 SmallVectorImpl<Value *> &Op1,
10729 SmallVectorImpl<Value *> &Op2,
10730 OrdersType &ReorderIndices) const {
10731 constexpr unsigned SmallNodeSize = 4;
10732 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10733 !SplitAlternateInstructions)
10734 return false;
10735
10736 // Check if this is a duplicate of another split entry.
10737 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10738 << ".\n");
10739 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10740 if (E->isSame(VL)) {
10741 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10742 << *LocalState.getMainOp() << ".\n");
10743 return false;
10744 }
10745 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10746 if (all_of(VL, [&](Value *V) {
10747 return isa<PoisonValue>(V) || Values.contains(V);
10748 })) {
10749 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10750 return false;
10751 }
10752 }
10753
10754 ReorderIndices.assign(VL.size(), VL.size());
10755 SmallBitVector Op1Indices(VL.size());
10756 for (auto [Idx, V] : enumerate(VL)) {
10757 auto *I = dyn_cast<Instruction>(V);
10758 if (!I) {
10759 Op1.push_back(V);
10760 Op1Indices.set(Idx);
10761 continue;
10762 }
10763 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10764 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10765 *TLI)) ||
10766 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10767 !isAlternateInstruction(I, LocalState.getMainOp(),
10768 LocalState.getAltOp(), *TLI))) {
10769 Op1.push_back(V);
10770 Op1Indices.set(Idx);
10771 continue;
10772 }
10773 Op2.push_back(V);
10774 }
10775 Type *ScalarTy = getValueType(VL.front());
10776 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10777 unsigned Opcode0 = LocalState.getOpcode();
10778 unsigned Opcode1 = LocalState.getAltOpcode();
10779 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10780 // Enable split node, only if all nodes do not form legal alternate
10781 // instruction (like X86 addsub).
10782 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10783 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10784 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10785 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10786 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10787 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10788 return false;
10789 // Enable split node, only if all nodes are power-of-2/full registers.
10790 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10791 for (unsigned Idx : seq<unsigned>(VL.size())) {
10792 if (Op1Indices.test(Idx)) {
10793 ReorderIndices[Op1Cnt] = Idx;
10794 ++Op1Cnt;
10795 } else {
10796 ReorderIndices[Op2Cnt] = Idx;
10797 ++Op2Cnt;
10798 }
10799 }
10800 if (isIdentityOrder(ReorderIndices))
10801 ReorderIndices.clear();
10802 SmallVector<int> Mask;
10803 if (!ReorderIndices.empty())
10804 inversePermutation(ReorderIndices, Mask);
10805 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10806 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10807 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10808 // Check for non-profitable single-register ops, which are better
10809 // represented as alternate ops.
10810 if (NumParts >= VL.size())
10811 return false;
10812 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10813 InstructionCost InsertCost = ::getShuffleCost(
10814 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10815 FixedVectorType *SubVecTy =
10816 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10817 InstructionCost NewShuffleCost =
10818 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10819 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10820 (Mask.empty() || InsertCost >= NewShuffleCost))
10821 return false;
10822 if ((LocalState.getMainOp()->isBinaryOp() &&
10823 LocalState.getAltOp()->isBinaryOp() &&
10824 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10825 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10826 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10827 (LocalState.getMainOp()->isUnaryOp() &&
10828 LocalState.getAltOp()->isUnaryOp())) {
10829 InstructionCost OriginalVecOpsCost =
10830 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10831 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10832 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10833 for (unsigned Idx : seq<unsigned>(VL.size())) {
10834 if (isa<PoisonValue>(VL[Idx]))
10835 continue;
10836 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10837 }
10838 InstructionCost OriginalCost =
10839 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10840 VecTy, OriginalMask, Kind);
10841 InstructionCost NewVecOpsCost =
10842 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10843 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10844 InstructionCost NewCost =
10845 NewVecOpsCost + InsertCost +
10846 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10847 VectorizableTree.front()->getOpcode() == Instruction::Store
10848 ? NewShuffleCost
10849 : 0);
10850 // If not profitable to split - exit.
10851 if (NewCost >= OriginalCost)
10852 return false;
10853 }
10854 return true;
10855}
10856
10857namespace {
10858 /// Class accepts an incoming list of values, checks if it is able to model
10859 /// "copyable" values as compatible operations, and generates the list of
10860 /// values for scheduling and the list of operands for the new nodes.
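/// Illustrative example: for VL = {(add %x, %y), %z} with main opcode Add,
/// the plain value %z is modeled as the idempotent copy "add %z, 0", so both
/// lanes share a single vectorizable opcode.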
10861class InstructionsCompatibilityAnalysis {
10862 DominatorTree &DT;
10863 const DataLayout &DL;
10864 const TargetTransformInfo &TTI;
10865 const TargetLibraryInfo &TLI;
10866 unsigned MainOpcode = 0;
10867 Instruction *MainOp = nullptr;
10868
10869 /// Checks if the opcode is supported as the main opcode for copyable
10870 /// elements.
10871 static bool isSupportedOpcode(const unsigned Opcode) {
10872 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
10873 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
10874 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
10875 Opcode == Instruction::And || Opcode == Instruction::Or ||
10876 Opcode == Instruction::Xor;
10877 }
10878
10879 /// Identifies the best candidate value, which represents the main opcode
10880 /// operation.
10881 /// Currently the best candidate is the Add instruction whose parent block
10882 /// has the highest DFS incoming number (the block that dominates the others).
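/// Illustrative example: for {add, add, shl} in a single block it picks the
/// first add (the opcode with the most candidates); the shl lane is later
/// modeled as a copyable element.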
10883 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10884 BasicBlock *Parent = nullptr;
10885 // Checks if the instruction has supported opcode.
10886 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10887 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10888 return false;
10889 return I && isSupportedOpcode(I->getOpcode()) &&
10890 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10891 };
10892 // Exclude operand instructions immediately to improve compile time; they
10893 // would be unschedulable anyway.
10894 SmallDenseSet<Value *, 8> Operands;
10895 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10896 bool AnyUndef = false;
10897 for (Value *V : VL) {
10898 auto *I = dyn_cast<Instruction>(V);
10899 if (!I) {
10900 AnyUndef |= isa<UndefValue>(V);
10901 continue;
10902 }
10903 if (!DT.isReachableFromEntry(I->getParent()))
10904 continue;
10905 if (Candidates.empty()) {
10906 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10907 Parent = I->getParent();
10908 Operands.insert(I->op_begin(), I->op_end());
10909 continue;
10910 }
10911 if (Parent == I->getParent()) {
10912 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10913 Operands.insert(I->op_begin(), I->op_end());
10914 continue;
10915 }
10916 auto *NodeA = DT.getNode(Parent);
10917 auto *NodeB = DT.getNode(I->getParent());
10918 assert(NodeA && "Should only process reachable instructions");
10919 assert(NodeB && "Should only process reachable instructions");
10920 assert((NodeA == NodeB) ==
10921 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10922 "Different nodes should have different DFS numbers");
10923 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10924 Candidates.clear();
10925 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10926 Parent = I->getParent();
10927 Operands.clear();
10928 Operands.insert(I->op_begin(), I->op_end());
10929 }
10930 }
10931 unsigned BestOpcodeNum = 0;
10932 MainOp = nullptr;
10933 bool UsedOutside = false;
10934 for (const auto &P : Candidates) {
10935 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
10936 if (UsedOutside && !PUsedOutside)
10937 continue;
10938 if (!UsedOutside && PUsedOutside)
10939 BestOpcodeNum = 0;
10940 if (P.second.size() < BestOpcodeNum)
10941 continue;
10942 // If there are inner dependencies - skip.
10943 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
10944 return Operands.contains(I);
10945 }))
10946 continue;
10947 UsedOutside = PUsedOutside;
10948 for (Instruction *I : P.second) {
10949 if (IsSupportedInstruction(I, AnyUndef)) {
10950 MainOp = I;
10951 BestOpcodeNum = P.second.size();
10952 break;
10953 }
10954 }
10955 }
10956 if (MainOp) {
10957 // Do not match, if any copyable is a terminator from the same block as
10958 // the main operation.
10959 if (any_of(VL, [&](Value *V) {
10960 auto *I = dyn_cast<Instruction>(V);
10961 return I && I->getParent() == MainOp->getParent() &&
10962 I->isTerminator();
10963 })) {
10964 MainOp = nullptr;
10965 return;
10966 }
10967 MainOpcode = MainOp->getOpcode();
10968 }
10969 }
10970
10971 /// Returns the idempotent value for the \p MainOp with the detected \p
10972 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10973 /// the operand itself, since V or V == V.
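/// E.g. for Add it returns 0 and for And it returns allones; for the
/// non-commutative Sub/Shl/LShr it requests the right-hand identity (0) via
/// ConstantExpr::getBinOpIdentity with AllowRHSConstant set.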
10974 Value *selectBestIdempotentValue() const {
10975 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10976 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10977 !MainOp->isCommutative());
10978 }
10979
10980 /// Returns the value and operands for \p V, considering whether it is an
10981 /// original instruction whose actual operands should be returned, or a
10982 /// copyable element to be represented as an idempotent instruction.
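/// Illustrative example: if %v is a copyable element under main opcode Add,
/// the result is {%v, 0}, i.e. %v is treated as "add %v, 0".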
10983 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10984 if (isa<PoisonValue>(V))
10985 return {V, V};
10986 if (!S.isCopyableElement(V))
10987 return convertTo(cast<Instruction>(V), S).second;
10988 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10989 return {V, selectBestIdempotentValue()};
10990 }
10991
10992 /// Builds operands for the original instructions.
10993 void
10994 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10995 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10996
10997 unsigned ShuffleOrOp =
10998 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10999 Instruction *VL0 = S.getMainOp();
11000
11001 switch (ShuffleOrOp) {
11002 case Instruction::PHI: {
11003 auto *PH = cast<PHINode>(VL0);
11004
11005 // Keeps the reordered operands to avoid code duplication.
11006 PHIHandler Handler(DT, PH, VL);
11007 Handler.buildOperands();
11008 Operands.assign(PH->getNumOperands(), {});
11009 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11010 Operands[I].assign(Handler.getOperands(I).begin(),
11011 Handler.getOperands(I).end());
11012 return;
11013 }
11014 case Instruction::ExtractValue:
11015 case Instruction::ExtractElement:
11016 // This is a special case, as it does not gather, but at the same time
11017 // we are not extending buildTree_rec() towards the operands.
11018 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11019 return;
11020 case Instruction::InsertElement:
11021 Operands.assign(2, {VL.size(), nullptr});
11022 for (auto [Idx, V] : enumerate(VL)) {
11023 auto *IE = cast<InsertElementInst>(V);
11024 for (auto [OpIdx, Ops] : enumerate(Operands))
11025 Ops[Idx] = IE->getOperand(OpIdx);
11026 }
11027 return;
11028 case Instruction::Load:
11029 Operands.assign(
11030 1, {VL.size(),
11031 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11032 for (auto [V, Op] : zip(VL, Operands.back())) {
11033 auto *LI = dyn_cast<LoadInst>(V);
11034 if (!LI)
11035 continue;
11036 Op = LI->getPointerOperand();
11037 }
11038 return;
11039 case Instruction::ZExt:
11040 case Instruction::SExt:
11041 case Instruction::FPToUI:
11042 case Instruction::FPToSI:
11043 case Instruction::FPExt:
11044 case Instruction::PtrToInt:
11045 case Instruction::IntToPtr:
11046 case Instruction::SIToFP:
11047 case Instruction::UIToFP:
11048 case Instruction::Trunc:
11049 case Instruction::FPTrunc:
11050 case Instruction::BitCast:
11051 case Instruction::ICmp:
11052 case Instruction::FCmp:
11053 case Instruction::Select:
11054 case Instruction::FNeg:
11055 case Instruction::Add:
11056 case Instruction::FAdd:
11057 case Instruction::Sub:
11058 case Instruction::FSub:
11059 case Instruction::Mul:
11060 case Instruction::FMul:
11061 case Instruction::UDiv:
11062 case Instruction::SDiv:
11063 case Instruction::FDiv:
11064 case Instruction::URem:
11065 case Instruction::SRem:
11066 case Instruction::FRem:
11067 case Instruction::Shl:
11068 case Instruction::LShr:
11069 case Instruction::AShr:
11070 case Instruction::And:
11071 case Instruction::Or:
11072 case Instruction::Xor:
11073 case Instruction::Freeze:
11074 case Instruction::Store:
11075 case Instruction::ShuffleVector:
11076 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11077 for (auto [Idx, V] : enumerate(VL)) {
11078 auto *I = dyn_cast<Instruction>(V);
11079 if (!I) {
11080 for (auto [OpIdx, Ops] : enumerate(Operands))
11081 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11082 continue;
11083 }
11084 auto [Op, ConvertedOps] = convertTo(I, S);
11085 for (auto [OpIdx, Ops] : enumerate(Operands))
11086 Ops[Idx] = ConvertedOps[OpIdx];
11087 }
11088 return;
11089 case Instruction::GetElementPtr: {
11090 Operands.assign(2, {VL.size(), nullptr});
11091 // Need to cast all indices to the same type before vectorization to
11092 // avoid crash.
11093 // Required to be able to find correct matches between different gather
11094 // nodes and reuse the vectorized values rather than trying to gather them
11095 // again.
11096 const unsigned IndexIdx = 1;
11097 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11098 Type *Ty =
11099 all_of(VL,
11100 [&](Value *V) {
11101 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11102 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11103 })
11104 ? VL0Ty
11105 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11106 ->getPointerOperandType()
11107 ->getScalarType());
11108 for (auto [Idx, V] : enumerate(VL)) {
11109 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11110 if (!GEP) {
11111 Operands[0][Idx] = V;
11112 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11113 continue;
11114 }
11115 Operands[0][Idx] = GEP->getPointerOperand();
11116 auto *Op = GEP->getOperand(IndexIdx);
11117 auto *CI = dyn_cast<ConstantInt>(Op);
11118 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11119 CI, Ty, CI->getValue().isSignBitSet(), DL)
11120 : Op;
11121 }
11122 return;
11123 }
11124 case Instruction::Call: {
11125 auto *CI = cast<CallInst>(VL0);
11126 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
11127 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11128 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
11129 continue;
11130 auto &Ops = Operands.emplace_back();
11131 for (Value *V : VL) {
11132 auto *I = dyn_cast<Instruction>(V);
11133 Ops.push_back(I ? I->getOperand(Idx)
11134 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11135 }
11136 }
11137 return;
11138 }
11139 default:
11140 break;
11141 }
11142 llvm_unreachable("Unexpected vectorization of the instructions.");
11143 }
11144
11145public:
11146 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11147 const TargetTransformInfo &TTI,
11148 const TargetLibraryInfo &TLI)
11149 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11150
11151 InstructionsState
11152 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11153 bool TryCopyableElementsVectorization,
11154 bool WithProfitabilityCheck = false,
11155 bool SkipSameCodeCheck = false) {
11156 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11157 ? InstructionsState::invalid()
11158 : getSameOpcode(VL, TLI);
11159 if (S)
11160 return S;
11161 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11162 return S;
11163 findAndSetMainInstruction(VL, R);
11164 if (!MainOp)
11165 return InstructionsState::invalid();
11166 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11167 if (!WithProfitabilityCheck)
11168 return S;
11169 // Check if it is profitable to vectorize the instruction.
11170 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11171 auto BuildCandidates =
11172 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11173 Value *V2) {
11174 if (V1 != V2 && isa<PHINode>(V1))
11175 return;
11176 auto *I1 = dyn_cast<Instruction>(V1);
11177 auto *I2 = dyn_cast<Instruction>(V2);
11178 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11179 I1->getParent() != I2->getParent())
11180 return;
11181 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11182 };
11183 if (VL.size() == 2) {
11184 // Check if the operands allow better vectorization.
11185 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11186 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11187 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11188 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11189 R.findBestRootPair(Candidates1) &&
11190 R.findBestRootPair(Candidates2);
11191 if (!Res && isCommutative(MainOp)) {
11192 Candidates1.clear();
11193 Candidates2.clear();
11194 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11195 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11196 Res = !Candidates1.empty() && !Candidates2.empty() &&
11197 R.findBestRootPair(Candidates1) &&
11198 R.findBestRootPair(Candidates2);
11199 }
11200 if (!Res)
11201 return InstructionsState::invalid();
11202 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11203 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11204 InstructionCost VectorCost;
11205 FixedVectorType *VecTy =
11206 getWidenedType(S.getMainOp()->getType(), VL.size());
11207 switch (MainOpcode) {
11208 case Instruction::Add:
11209 case Instruction::Sub:
11210 case Instruction::LShr:
11211 case Instruction::Shl:
11212 case Instruction::SDiv:
11213 case Instruction::UDiv:
11214 case Instruction::And:
11215 case Instruction::Or:
11216 case Instruction::Xor:
11217 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11218 break;
11219 default:
11220 llvm_unreachable("Unexpected instruction.");
11221 }
11222 if (VectorCost > ScalarCost)
11223 return InstructionsState::invalid();
11224 return S;
11225 }
11226 assert(Operands.size() == 2 && "Unexpected number of operands!");
11227 unsigned CopyableNum =
11228 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11229 if (CopyableNum < VL.size() / 2)
11230 return S;
11231 // Too many phi copyables - exit.
11232 const unsigned Limit = VL.size() / 24;
11233 if ((CopyableNum >= VL.size() - Limit ||
11234 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11235 CopyableNum >= MaxPHINumOperands) &&
11236 all_of(VL, [&](Value *V) {
11237 return isa<PHINode>(V) || !S.isCopyableElement(V);
11238 }))
11239 return InstructionsState::invalid();
11240 // Check profitability if number of copyables > VL.size() / 2.
11241 // 1. Reorder operands for better matching.
11242 if (isCommutative(MainOp)) {
11243 for (auto &Ops : Operands) {
11244 // Make instructions the first operands.
11245 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11246 std::swap(Ops.front(), Ops.back());
11247 continue;
11248 }
11249 // Make constants the second operands.
11250 if (isa<Constant>(Ops.front())) {
11251 std::swap(Ops.front(), Ops.back());
11252 continue;
11253 }
11254 }
11255 }
11256 // 2. Check, if operands can be vectorized.
11257 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11258 return InstructionsState::invalid();
11259 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11260 if (allConstant(Ops) || isSplat(Ops))
11261 return true;
11262 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
11263 // one is different.
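// E.g. {%a, %a, %a, %b} qualifies (two distinct values, one of them used
// exactly once), while {%a, %a, %b, %b} does not.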
11264 constexpr unsigned Limit = 4;
11265 if (Operands.front().size() >= Limit) {
11266 SmallDenseMap<const Value *, unsigned> Counters;
11267 for (Value *V : Ops) {
11268 if (isa<UndefValue>(V))
11269 continue;
11270 ++Counters[V];
11271 }
11272 if (Counters.size() == 2 &&
11273 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11274 return C.second == 1;
11275 }))
11276 return true;
11277 }
11278 // First operand not a constant or splat? Last attempt - check for
11279 // potential vectorization.
11280 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11281 InstructionsState OpS = Analysis.buildInstructionsState(
11282 Ops, R, /*TryCopyableElementsVectorization=*/true);
11283 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11284 return false;
11285 unsigned CopyableNum =
11286 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11287 return CopyableNum <= VL.size() / 2;
11288 };
11289 if (!CheckOperand(Operands.front()))
11290 return InstructionsState::invalid();
11291
11292 return S;
11293 }
11294
11295 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11296 ArrayRef<Value *> VL) {
11297 assert(S && "Invalid state!");
11298 SmallVector<BoUpSLP::ValueList> Operands;
11299 if (S.areInstructionsWithCopyableElements()) {
11300 MainOp = S.getMainOp();
11301 MainOpcode = S.getOpcode();
11302 Operands.assign(MainOp->getNumOperands(),
11303 BoUpSLP::ValueList(VL.size(), nullptr));
11304 for (auto [Idx, V] : enumerate(VL)) {
11305 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11306 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11307 Operands[OperandIdx][Idx] = Operand;
11308 }
11309 } else {
11310 buildOriginalOperands(S, VL, Operands);
11311 }
11312 return Operands;
11313 }
11314};
11315} // namespace
11316
11317BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11318 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11319 bool TryCopyableElementsVectorization) const {
11320 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11321
11322 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11323 InstructionsState S = Analysis.buildInstructionsState(
11324 VL, *this, TryCopyableElementsVectorization,
11325 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11326
11327 bool AreScatterAllGEPSameBlock = false;
11328 if (!S) {
11329 SmallVector<unsigned> SortedIndices;
11330 BasicBlock *BB = nullptr;
11331 bool IsScatterVectorizeUserTE =
11332 UserTreeIdx.UserTE &&
11333 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11334 AreScatterAllGEPSameBlock =
11335 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11336 VL.size() > 2 &&
11337 all_of(VL,
11338 [&BB](Value *V) {
11339 auto *I = dyn_cast<GetElementPtrInst>(V);
11340 if (!I)
11341 return doesNotNeedToBeScheduled(V);
11342 if (!BB)
11343 BB = I->getParent();
11344 return BB == I->getParent() && I->getNumOperands() == 2;
11345 }) &&
11346 BB &&
11347 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
11348 *SE, SortedIndices));
11349 if (!AreScatterAllGEPSameBlock) {
11350 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11351 "C,S,B,O, small shuffle. \n";
11352 dbgs() << "[";
11353 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11354 dbgs() << "]\n");
11355 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11356 /*TryToFindDuplicates=*/true,
11357 /*TrySplitVectorize=*/true);
11358 }
11359 // Reset S to make it GetElementPtr kind of node.
11360 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11361 assert(It != VL.end() && "Expected at least one GEP.");
11362 S = getSameOpcode(*It, *TLI);
11363 }
11364 assert(S && "Must be valid.");
11365
11366 // Don't handle vectors.
11367 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11368 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11369 // Do not try to pack to avoid extra instructions here.
11370 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11371 /*TryToFindDuplicates=*/false);
11372 }
11373
11374 // Check that all of the users of the scalars that we want to vectorize are
11375 // schedulable.
11376 BasicBlock *BB = S.getMainOp()->getParent();
11377
11378 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11379 !DT->isReachableFromEntry(BB)) {
11380 // Don't go into unreachable blocks. They may contain instructions with
11381 // dependency cycles which confuse the final scheduling.
11382 // Do not vectorize EH and non-returning blocks, not profitable in most
11383 // cases.
11384 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11385 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11386 }
11387
11388 // Don't go into catchswitch blocks, which can happen with PHIs.
11389 // Such blocks can only have PHIs and the catchswitch. There is no
11390 // place to insert a shuffle if we need to, so just avoid that issue.
11391 if (isa<CatchSwitchInst>(BB->getTerminator())) {
11392 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11393 // Do not try to pack to avoid extra instructions here.
11394 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11395 /*TryToFindDuplicates=*/false);
11396 }
11397
11398 // Don't handle scalable vectors
11399 if (S.getOpcode() == Instruction::ExtractElement &&
11400 isa<ScalableVectorType>(
11401 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11402 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11403 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11404 }
11405
11406 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11407 // a load), in which case peek through to include it in the tree, without
11408 // ballooning over-budget.
11409 if (Depth >= RecursionMaxDepth &&
11410 (S.isAltShuffle() || VL.size() < 4 ||
11411 !(match(S.getMainOp(), m_Load(m_Value())) ||
11412 all_of(VL, [&S](const Value *I) {
11413 return match(I,
11414 m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
11415 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11416 })))) {
11417 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11418 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11419 }
11420
11421 // Check if this is a duplicate of another entry.
11422 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11423 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11424 if (E->isSame(VL)) {
11425 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11426 << ".\n");
11427 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11428 }
11429 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11430 if (all_of(VL, [&](Value *V) {
11431 return isa<PoisonValue>(V) || Values.contains(V) ||
11432 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11433 LI->getLoopFor(S.getMainOp()->getParent()) &&
11434 isVectorized(V));
11435 })) {
11436 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11437 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11438 }
11439 }
11440
11441 // If all of the operands are identical or constant we have a simple solution.
11442 // If we deal with insert/extract instructions, they all must have constant
11443 // indices, otherwise we should gather them, not try to vectorize.
11444 // If alternate op node with 2 elements with gathered operands - do not
11445 // vectorize.
11446 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11447 if (!S || !S.isAltShuffle() || VL.size() > 2)
11448 return false;
11449 if (VectorizableTree.size() < MinTreeSize)
11450 return false;
11451 if (Depth >= RecursionMaxDepth - 1)
11452 return true;
11453 // Check if all operands are extracts, part of vector node or can build a
11454 // regular vectorize node.
11455 SmallVector<unsigned, 8> InstsCount;
11456 for (Value *V : VL) {
11457 auto *I = cast<Instruction>(V);
11458 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11459 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11460 }));
11461 }
11462 bool IsCommutative =
11463 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11464 if ((IsCommutative &&
11465 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11466 (!IsCommutative &&
11467 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11468 return true;
11469 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11470 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11471 auto *I1 = cast<Instruction>(VL.front());
11472 auto *I2 = cast<Instruction>(VL.back());
11473 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11474 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11475 I2->getOperand(Op));
11476 if (static_cast<unsigned>(count_if(
11477 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11478 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11479 })) >= S.getMainOp()->getNumOperands() / 2)
11480 return false;
11481 if (S.getMainOp()->getNumOperands() > 2)
11482 return true;
11483 if (IsCommutative) {
11484 // Check permuted operands.
11485 Candidates.clear();
11486 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11487 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11488 I2->getOperand((Op + 1) % E));
11489 if (any_of(
11490 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11491 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11492 }))
11493 return false;
11494 }
11495 return true;
11496 };
11497 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11498 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11499 if (!AreAllSameInsts || isSplat(VL) ||
11500 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11501 S.getMainOp()) &&
11502 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11503 NotProfitableForVectorization(VL)) {
11504 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11505 dbgs() << "[";
11506 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11507 dbgs() << "]\n");
11508 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11509 }
11510
11511 // Don't vectorize ephemeral values.
11512 if (!EphValues.empty()) {
11513 for (Value *V : VL) {
11514 if (EphValues.count(V)) {
11515 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11516 << ") is ephemeral.\n");
11517 // Do not try to pack to avoid extra instructions here.
11518 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11519 /*TryToFindDuplicates=*/false);
11520 }
11521 }
11522 }
11523
11524 // We now know that this is a vector of instructions of the same type from
11525 // the same block.
11526
11527 // Check that none of the instructions in the bundle are already in the tree
11528 // and the node may be not profitable for the vectorization as the small
11529 // alternate node.
11530 if (S.isAltShuffle()) {
11531 auto GetNumVectorizedExtracted = [&]() {
11532 APInt Extracted = APInt::getZero(VL.size());
11533 APInt Vectorized = APInt::getAllOnes(VL.size());
11534 for (auto [Idx, V] : enumerate(VL)) {
11535 auto *I = dyn_cast<Instruction>(V);
11536 if (!I || doesNotNeedToBeScheduled(I) ||
11537 all_of(I->operands(), [&](const Use &U) {
11538 return isa<ExtractElementInst>(U.get());
11539 }))
11540 continue;
11541 if (isVectorized(I))
11542 Vectorized.clearBit(Idx);
11543 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11544 Extracted.setBit(Idx);
11545 }
11546 return std::make_pair(Vectorized, Extracted);
11547 };
11548 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11549 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11550 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11551 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11552 // Rough cost estimation, if the vector code (+ potential extracts) is
11553 // more profitable than the scalar + buildvector.
11554 Type *ScalarTy = VL.front()->getType();
11555 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11556 InstructionCost VectorizeCostEstimate =
11557 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11558 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11559 /*Insert=*/false, /*Extract=*/true, Kind);
11560 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11561 *TTI, ScalarTy, VecTy, Vectorized,
11562 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11563 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11564 }
11565 if (PreferScalarize) {
11566 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11567 "node is not profitable.\n");
11568 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11569 }
11570 }
11571
11572 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11573 if (UserIgnoreList && !UserIgnoreList->empty()) {
11574 for (Value *V : VL) {
11575 if (UserIgnoreList->contains(V)) {
11576 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11577 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11578 }
11579 }
11580 }
11581
11582 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11583}
11584
11585void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11586 const EdgeInfo &UserTreeIdx,
11587 unsigned InterleaveFactor) {
11588 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11589
11590 SmallVector<int> ReuseShuffleIndices;
11591 SmallVector<Value *> VL(VLRef);
11592
11593 // Tries to build split node.
11594 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11595 SmallVector<Value *> Op1, Op2;
11596 OrdersType ReorderIndices;
11597 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11598 return false;
11599
11600 auto Invalid = ScheduleBundle::invalid();
11601 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11602 UserTreeIdx, {}, ReorderIndices);
11603 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11604 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11605 InstructionsState S = getSameOpcode(Op, *TLI);
11606 if (S && (isa<LoadInst>(S.getMainOp()) ||
11607 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11608 // Build gather node for loads, they will be gathered later.
11609 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11610 Idx == 0 ? 0 : Op1.size());
11611 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11612 } else {
11613 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11614 Idx == 0 ? 0 : Op1.size());
11615 buildTreeRec(Op, Depth, {TE, Idx});
11616 }
11617 };
11618 AddNode(Op1, 0);
11619 AddNode(Op2, 1);
11620 return true;
11621 };
11622
11623 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11624 bool AreConsts = false;
11625 for (Value *V : VL) {
11626 if (isa<PoisonValue>(V))
11627 continue;
11628 if (isa<Constant>(V)) {
11629 AreConsts = true;
11630 continue;
11631 }
11632 if (!isa<PHINode>(V))
11633 return false;
11634 }
11635 return AreConsts;
11636 };
11637 if (AreOnlyConstsWithPHIs(VL)) {
11638 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11639 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11640 return;
11641 }
11642
11643 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11644 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11645 InstructionsState S = Legality.getInstructionsState();
11646 if (!Legality.isLegal()) {
11647 if (Legality.trySplitVectorize()) {
11648 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11649 // Last chance to try to vectorize alternate node.
11650 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11651 return;
11652 }
11653 if (!S)
11654 Legality = getScalarsVectorizationLegality(
11655 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11656 if (!Legality.isLegal()) {
11657 if (Legality.tryToFindDuplicates())
11658 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11659 UserTreeIdx);
11660
11661 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11662 return;
11663 }
11664 S = Legality.getInstructionsState();
11665 }
11666
11667 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11668 if (S.isAltShuffle() && TrySplitNode(S))
11669 return;
11670
11671 // Check that every instruction appears once in this bundle.
11672 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11673 /*TryPad=*/true)) {
11674 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11675 return;
11676 }
11677
11678 // Perform specific checks for each particular instruction kind.
11679 bool IsScatterVectorizeUserTE =
11680 UserTreeIdx.UserTE &&
11681 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11682 OrdersType CurrentOrder;
11683 SmallVector<Value *> PointerOps;
11684 StridedPtrInfo SPtrInfo;
11685 TreeEntry::EntryState State = getScalarsVectorizationState(
11686 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11687 if (State == TreeEntry::NeedToGather) {
11688 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11689 return;
11690 }
11691
11692 Instruction *VL0 = S.getMainOp();
11693 BasicBlock *BB = VL0->getParent();
11694 auto &BSRef = BlocksSchedules[BB];
11695 if (!BSRef)
11696 BSRef = std::make_unique<BlockScheduling>(BB);
11697
11698 BlockScheduling &BS = *BSRef;
11699
11700 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11701 std::optional<ScheduleBundle *> BundlePtr =
11702 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11703#ifdef EXPENSIVE_CHECKS
11704 // Make sure we didn't break any internal invariants
11705 BS.verify();
11706#endif
11707 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11708 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11709 // Last chance to try to vectorize alternate node.
11710 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11711 return;
11712 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11713 NonScheduledFirst.insert(VL.front());
11714 if (S.getOpcode() == Instruction::Load &&
11715 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11716 registerNonVectorizableLoads(ArrayRef(VL));
11717 return;
11718 }
11719 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11720 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11721 ScheduleBundle Empty;
11722 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11723 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11724
11725 unsigned ShuffleOrOp =
11726 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11727 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11728 // Postpone PHI nodes creation
11729 SmallVector<unsigned> PHIOps;
11730 for (unsigned I : seq<unsigned>(Operands.size())) {
11731 ArrayRef<Value *> Op = Operands[I];
11732 if (Op.empty())
11733 continue;
11734 InstructionsState S = getSameOpcode(Op, *TLI);
11735 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11736 buildTreeRec(Op, Depth + 1, {TE, I});
11737 else
11738 PHIOps.push_back(I);
11739 }
11740 for (unsigned I : PHIOps)
11741 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11742 };
11743 switch (ShuffleOrOp) {
11744 case Instruction::PHI: {
11745 TreeEntry *TE =
11746 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11747 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11748 TE->dump());
11749
11750 TE->setOperands(Operands);
11751 CreateOperandNodes(TE, Operands);
11752 return;
11753 }
11754 case Instruction::ExtractValue:
11755 case Instruction::ExtractElement: {
11756 if (CurrentOrder.empty()) {
11757 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11758 } else {
11759 LLVM_DEBUG({
11760 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11761 "with order";
11762 for (unsigned Idx : CurrentOrder)
11763 dbgs() << " " << Idx;
11764 dbgs() << "\n";
11765 });
11766 fixupOrderingIndices(CurrentOrder);
11767 }
11768 // Insert new order with initial value 0, if it does not exist,
11769 // otherwise return the iterator to the existing one.
11770 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11771 ReuseShuffleIndices, CurrentOrder);
11772 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11773 "(ExtractValueInst/ExtractElementInst).\n";
11774 TE->dump());
11775 // This is a special case, as it does not gather, but at the same time
11776 // we are not extending buildTreeRec() towards the operands.
11777 TE->setOperands(Operands);
11778 return;
11779 }
11780 case Instruction::InsertElement: {
11781 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11782
11783 auto OrdCompare = [](const std::pair<int, int> &P1,
11784 const std::pair<int, int> &P2) {
11785 return P1.first > P2.first;
11786 };
11787 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11788 decltype(OrdCompare)>
11789 Indices(OrdCompare);
11790 for (int I = 0, E = VL.size(); I < E; ++I) {
11791 unsigned Idx = *getElementIndex(VL[I]);
11792 Indices.emplace(Idx, I);
11793 }
11794 OrdersType CurrentOrder(VL.size(), VL.size());
11795 bool IsIdentity = true;
11796 for (int I = 0, E = VL.size(); I < E; ++I) {
11797 CurrentOrder[Indices.top().second] = I;
11798 IsIdentity &= Indices.top().second == I;
11799 Indices.pop();
11800 }
11801 if (IsIdentity)
11802 CurrentOrder.clear();
11803 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11804 {}, CurrentOrder);
11805 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11806 TE->dump());
11807
11808 TE->setOperands(Operands);
11809 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11810 return;
11811 }
11812 case Instruction::Load: {
11813 // Check that a vectorized load would load the same memory as a scalar
11814 // load. For example, we don't want to vectorize loads that are smaller
11815 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11816 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11817 // from such a struct, we read/write packed bits disagreeing with the
11818 // unvectorized version.
11819 TreeEntry *TE = nullptr;
11820 fixupOrderingIndices(CurrentOrder);
11821 switch (State) {
11822 case TreeEntry::Vectorize:
11823 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11824 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11825 if (CurrentOrder.empty())
11826 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11827 TE->dump());
11828 else
11829 LLVM_DEBUG(dbgs()
11830 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11831 TE->dump());
11832 break;
11833 case TreeEntry::CompressVectorize:
11834 // Vectorizing non-consecutive loads with (masked)load + compress.
11835 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11836 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11837 LLVM_DEBUG(
11838 dbgs()
11839 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11840 TE->dump());
11841 break;
11842 case TreeEntry::StridedVectorize:
11843 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11844 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11845 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11846 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11847 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11848 TE->dump());
11849 break;
11850 case TreeEntry::ScatterVectorize:
11851 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11852 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11853 UserTreeIdx, ReuseShuffleIndices);
11854 LLVM_DEBUG(
11855 dbgs()
11856 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11857 TE->dump());
11858 break;
11859 case TreeEntry::CombinedVectorize:
11860 case TreeEntry::SplitVectorize:
11861 case TreeEntry::NeedToGather:
11862 llvm_unreachable("Unexpected loads state.");
11863 }
11864 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11865 assert(Operands.size() == 1 && "Expected a single operand only");
11866 SmallVector<int> Mask;
11867 inversePermutation(CurrentOrder, Mask);
11868 reorderScalars(Operands.front(), Mask);
11869 }
11870 TE->setOperands(Operands);
11871 if (State == TreeEntry::ScatterVectorize)
11872 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11873 return;
11874 }
11875 case Instruction::ZExt:
11876 case Instruction::SExt:
11877 case Instruction::FPToUI:
11878 case Instruction::FPToSI:
11879 case Instruction::FPExt:
11880 case Instruction::PtrToInt:
11881 case Instruction::IntToPtr:
11882 case Instruction::SIToFP:
11883 case Instruction::UIToFP:
11884 case Instruction::Trunc:
11885 case Instruction::FPTrunc:
11886 case Instruction::BitCast: {
11887 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11888 std::make_pair(std::numeric_limits<unsigned>::min(),
11889 std::numeric_limits<unsigned>::max()));
11890 if (ShuffleOrOp == Instruction::ZExt ||
11891 ShuffleOrOp == Instruction::SExt) {
11892 CastMaxMinBWSizes = std::make_pair(
11893 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11894 PrevMaxBW),
11895 std::min<unsigned>(
11896 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11897 PrevMinBW));
11898 } else if (ShuffleOrOp == Instruction::Trunc) {
11899 CastMaxMinBWSizes = std::make_pair(
11900 std::max<unsigned>(
11901 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11902 PrevMaxBW),
11903 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11904 PrevMinBW));
11905 }
11906 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11907 ReuseShuffleIndices);
11908 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11909 TE->dump());
11910
11911 TE->setOperands(Operands);
11912 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11913 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11914 if (ShuffleOrOp == Instruction::Trunc) {
11915 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11916 } else if (ShuffleOrOp == Instruction::SIToFP ||
11917 ShuffleOrOp == Instruction::UIToFP) {
11918 unsigned NumSignBits =
11919 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11920 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11921 APInt Mask = DB->getDemandedBits(OpI);
11922 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11923 }
11924 if (NumSignBits * 2 >=
11925 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11926 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11927 }
11928 return;
11929 }
11930 case Instruction::ICmp:
11931 case Instruction::FCmp: {
11932 // Check that all of the compares have the same predicate.
11933 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11934 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11935 ReuseShuffleIndices);
11936 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11937 TE->dump());
11938
11939 VLOperands Ops(VL, Operands, S, *this);
11940 if (cast<CmpInst>(VL0)->isCommutative()) {
11941 // Commutative predicate - collect + sort operands of the instructions
11942 // so that each side is more likely to have the same opcode.
11944 "Commutative Predicate mismatch");
11945 Ops.reorder();
11946 Operands.front() = Ops.getVL(0);
11947 Operands.back() = Ops.getVL(1);
11948 } else {
11949 // Collect operands - commute if it uses the swapped predicate.
11950 for (auto [Idx, V] : enumerate(VL)) {
11951 if (isa<PoisonValue>(V))
11952 continue;
11953 auto *Cmp = cast<CmpInst>(V);
11954 if (Cmp->getPredicate() != P0)
11955 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11956 }
11957 }
11958 TE->setOperands(Operands);
11959 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11960 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11961 if (ShuffleOrOp == Instruction::ICmp) {
11962 unsigned NumSignBits0 =
11963 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11964 if (NumSignBits0 * 2 >=
11965 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11966 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11967 unsigned NumSignBits1 =
11968 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11969 if (NumSignBits1 * 2 >=
11970 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11971 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11972 }
11973 return;
11974 }
11975 case Instruction::Select:
11976 case Instruction::FNeg:
11977 case Instruction::Add:
11978 case Instruction::FAdd:
11979 case Instruction::Sub:
11980 case Instruction::FSub:
11981 case Instruction::Mul:
11982 case Instruction::FMul:
11983 case Instruction::UDiv:
11984 case Instruction::SDiv:
11985 case Instruction::FDiv:
11986 case Instruction::URem:
11987 case Instruction::SRem:
11988 case Instruction::FRem:
11989 case Instruction::Shl:
11990 case Instruction::LShr:
11991 case Instruction::AShr:
11992 case Instruction::And:
11993 case Instruction::Or:
11994 case Instruction::Xor:
11995 case Instruction::Freeze: {
11996 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11997 ReuseShuffleIndices);
11998 LLVM_DEBUG(
11999 dbgs() << "SLP: added a new TreeEntry "
12000 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12001 TE->dump());
12002
12003 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
12004 VLOperands Ops(VL, Operands, S, *this);
12005 Ops.reorder();
12006 Operands[0] = Ops.getVL(0);
12007 Operands[1] = Ops.getVL(1);
12008 }
12009 TE->setOperands(Operands);
12010 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12011 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12012 return;
12013 }
12014 case Instruction::GetElementPtr: {
12015 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12016 ReuseShuffleIndices);
12017 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12018 TE->dump());
12019 TE->setOperands(Operands);
12020
12021 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12022 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12023 return;
12024 }
12025 case Instruction::Store: {
12026 bool Consecutive = CurrentOrder.empty();
12027 if (!Consecutive)
12028 fixupOrderingIndices(CurrentOrder);
12029 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12030 ReuseShuffleIndices, CurrentOrder);
12031 if (Consecutive)
12032 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12033 TE->dump());
12034 else
12035 LLVM_DEBUG(
12036 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12037 TE->dump());
12038 TE->setOperands(Operands);
12039 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
12040 return;
12041 }
12042 case Instruction::Call: {
12043 // Check if the calls are all to the same vectorizable intrinsic or
12044 // library function.
12045 CallInst *CI = cast<CallInst>(VL0);
12046 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12047
12048 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12049 ReuseShuffleIndices);
12050 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12051 TE->dump());
12052 if (isCommutative(VL0)) {
12053 VLOperands Ops(VL, Operands, S, *this);
12054 Ops.reorder();
12055 Operands[0] = Ops.getVL(0);
12056 Operands[1] = Ops.getVL(1);
12057 }
12058 TE->setOperands(Operands);
12059 for (unsigned I : seq<unsigned>(CI->arg_size())) {
12060 // For scalar operands there is no need to create an entry, since they
12061 // do not need to be vectorized.
12062 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
12063 continue;
12064 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12065 }
12066 return;
12067 }
12068 case Instruction::ShuffleVector: {
12069 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12070 ReuseShuffleIndices);
12071 if (S.isAltShuffle()) {
12072 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12073 TE->dump());
12074 } else {
12075 assert(SLPReVec && "Only supported by REVEC.");
12076 LLVM_DEBUG(
12077 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12078 TE->dump());
12079 }
12080
12081 // Reorder operands if reordering would enable vectorization.
12082 auto *CI = dyn_cast<CmpInst>(VL0);
12083 if (CI && any_of(VL, [](Value *V) {
12084 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
12085 })) {
12086 auto *MainCI = cast<CmpInst>(S.getMainOp());
12087 auto *AltCI = cast<CmpInst>(S.getAltOp());
12088 CmpInst::Predicate MainP = MainCI->getPredicate();
12089 CmpInst::Predicate AltP = AltCI->getPredicate();
12090 assert(MainP != AltP &&
12091 "Expected different main/alternate predicates.");
12092 // Collect operands - commute if it uses the swapped predicate or
12093 // alternate operation.
12094 for (auto [Idx, V] : enumerate(VL)) {
12095 if (isa<PoisonValue>(V))
12096 continue;
12097 auto *Cmp = cast<CmpInst>(V);
12098
12099 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
12100 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12101 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12102 } else {
12103 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12104 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12105 }
12106 }
12107 TE->setOperands(Operands);
12108 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12109 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12110 return;
12111 }
12112
12113 if (isa<BinaryOperator>(VL0) || CI) {
12114 VLOperands Ops(VL, Operands, S, *this);
12115 Ops.reorder();
12116 Operands[0] = Ops.getVL(0);
12117 Operands[1] = Ops.getVL(1);
12118 }
12119 TE->setOperands(Operands);
12120 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12121 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12122 return;
12123 }
12124 default:
12125 break;
12126 }
12127 llvm_unreachable("Unexpected vectorization of the instructions.");
12128}
12129
12130unsigned BoUpSLP::canMapToVector(Type *T) const {
12131 unsigned N = 1;
12132 Type *EltTy = T;
12133
12134 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12135 if (EltTy->isEmptyTy())
12136 return 0;
12137 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12138 // Check that struct is homogeneous.
12139 for (const auto *Ty : ST->elements())
12140 if (Ty != *ST->element_begin())
12141 return 0;
12142 N *= ST->getNumElements();
12143 EltTy = *ST->element_begin();
12144 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12145 N *= AT->getNumElements();
12146 EltTy = AT->getElementType();
12147 } else {
12148 auto *VT = cast<FixedVectorType>(EltTy);
12149 N *= VT->getNumElements();
12150 EltTy = VT->getElementType();
12151 }
12152 }
12153
12154 if (!isValidElementType(EltTy))
12155 return 0;
12156 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12157 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12158 VTSize != DL->getTypeStoreSizeInBits(T))
12159 return 0;
12160 return N;
12161}
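// Editor's note (illustrative, not part of the LLVM source): for a
// homogeneous aggregate such as T = [4 x float] (or a struct of four floats),
// canMapToVector returns 4, assuming a target where the widened 128-bit type
// <4 x float> falls inside the [MinVecRegSize, MaxVecRegSize] range and
// matches the store size of T; a mixed struct such as {float, i32} returns 0.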
12162
12163bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12164 SmallVectorImpl<unsigned> &CurrentOrder,
12165 bool ResizeAllowed) const {
12166 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12167 assert(It != VL.end() && "Expected at least one extract instruction.");
12168 auto *E0 = cast<Instruction>(*It);
12169 assert(
12170 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
12171 "Invalid opcode");
12172 // Check if all of the extracts come from the same vector and from the
12173 // correct offset.
12174 Value *Vec = E0->getOperand(0);
12175
12176 CurrentOrder.clear();
12177
12178 // We have to extract from a vector/aggregate with the same number of elements.
12179 unsigned NElts;
12180 if (E0->getOpcode() == Instruction::ExtractValue) {
12181 NElts = canMapToVector(Vec->getType());
12182 if (!NElts)
12183 return false;
12184 // Check if load can be rewritten as load of vector.
12185 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12186 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12187 return false;
12188 } else {
12189 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12190 }
12191
12192 unsigned E = VL.size();
12193 if (!ResizeAllowed && NElts != E)
12194 return false;
12195 SmallVector<int> Indices(E, PoisonMaskElem);
12196 unsigned MinIdx = NElts, MaxIdx = 0;
12197 for (auto [I, V] : enumerate(VL)) {
12198 auto *Inst = dyn_cast<Instruction>(V);
12199 if (!Inst)
12200 continue;
12201 if (Inst->getOperand(0) != Vec)
12202 return false;
12203 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12204 if (isa<UndefValue>(EE->getIndexOperand()))
12205 continue;
12206 std::optional<unsigned> Idx = getExtractIndex(Inst);
12207 if (!Idx)
12208 return false;
12209 const unsigned ExtIdx = *Idx;
12210 if (ExtIdx >= NElts)
12211 continue;
12212 Indices[I] = ExtIdx;
12213 if (MinIdx > ExtIdx)
12214 MinIdx = ExtIdx;
12215 if (MaxIdx < ExtIdx)
12216 MaxIdx = ExtIdx;
12217 }
12218 if (MaxIdx - MinIdx + 1 > E)
12219 return false;
12220 if (MaxIdx + 1 <= E)
12221 MinIdx = 0;
12222
12223 // Check that all of the indices extract from the correct offset.
12224 bool ShouldKeepOrder = true;
12225 // Assign to all items the initial value E so we can check if the extract
12226 // instruction index was used already.
12227 // Also, later we can check that all the indices are used and we have a
12228 // consecutive access in the extract instructions, by checking that no
12229 // element of CurrentOrder still has value E.
12230 CurrentOrder.assign(E, E);
12231 for (unsigned I = 0; I < E; ++I) {
12232 if (Indices[I] == PoisonMaskElem)
12233 continue;
12234 const unsigned ExtIdx = Indices[I] - MinIdx;
12235 if (CurrentOrder[ExtIdx] != E) {
12236 CurrentOrder.clear();
12237 return false;
12238 }
12239 ShouldKeepOrder &= ExtIdx == I;
12240 CurrentOrder[ExtIdx] = I;
12241 }
12242 if (ShouldKeepOrder)
12243 CurrentOrder.clear();
12244
12245 return ShouldKeepOrder;
12246}
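// Editor's note (illustrative, not part of the LLVM source): for extracts
// from one <4 x i32> vector with indices {2, 3, 0, 1}, canReuseExtract
// returns false and leaves the permutation {2, 3, 0, 1} in CurrentOrder;
// for the in-order indices {0, 1, 2, 3} it returns true and CurrentOrder
// stays empty, meaning the extracts can be reused as-is.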
12247
12248bool BoUpSLP::areAllUsersVectorized(
12249 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12250 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12251 all_of(I->users(), [this](User *U) {
12252 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12253 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12254 });
12255}
12256
12257void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12258 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12259 SmallVectorImpl<Value *> *OpScalars,
12260 SmallVectorImpl<Value *> *AltScalars) const {
12261 unsigned Sz = Scalars.size();
12262 Mask.assign(Sz, PoisonMaskElem);
12263 SmallVector<int> OrderMask;
12264 if (!ReorderIndices.empty())
12265 inversePermutation(ReorderIndices, OrderMask);
12266 for (unsigned I = 0; I < Sz; ++I) {
12267 unsigned Idx = I;
12268 if (!ReorderIndices.empty())
12269 Idx = OrderMask[I];
12270 if (isa<PoisonValue>(Scalars[Idx]))
12271 continue;
12272 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12273 if (IsAltOp(OpInst)) {
12274 Mask[I] = Sz + Idx;
12275 if (AltScalars)
12276 AltScalars->push_back(OpInst);
12277 } else {
12278 Mask[I] = Idx;
12279 if (OpScalars)
12280 OpScalars->push_back(OpInst);
12281 }
12282 }
12283 if (!ReuseShuffleIndices.empty()) {
12284 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12285 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12286 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12287 });
12288 Mask.swap(NewMask);
12289 }
12290}
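// Editor's note (illustrative, not part of the LLVM source): for
// Scalars = {add, sub, add, sub} with IsAltOp matching the subs, the
// resulting mask is <0, 5, 2, 7>: lanes 0 and 2 select from the main (add)
// vector and lanes 1 and 3 select from the alternate (sub) vector, whose
// lanes are offset by Sz = 4.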
12291
12292 static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12293 Instruction *AltOp,
12294 const TargetLibraryInfo &TLI) {
12295 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12296}
12297
12298 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12299 Instruction *AltOp,
12300 const TargetLibraryInfo &TLI) {
12301 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12302 auto *AltCI = cast<CmpInst>(AltOp);
12303 CmpInst::Predicate MainP = MainCI->getPredicate();
12304 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12305 assert(MainP != AltP && "Expected different main/alternate predicates.");
12306 auto *CI = cast<CmpInst>(I);
12307 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12308 return false;
12309 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12310 return true;
12311 CmpInst::Predicate P = CI->getPredicate();
12312 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12313
12314 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12315 "CmpInst expected to match either main or alternate predicate or "
12316 "their swap.");
12317 return MainP != P && MainP != SwappedP;
12318 }
12319 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12320}
12321
12322TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12323 assert(!Ops.empty());
12324 const auto *Op0 = Ops.front();
12325
12326 const bool IsConstant = all_of(Ops, [](Value *V) {
12327 // TODO: We should allow undef elements here
12328 return isConstant(V) && !isa<UndefValue>(V);
12329 });
12330 const bool IsUniform = all_of(Ops, [=](Value *V) {
12331 // TODO: We should allow undef elements here
12332 return V == Op0;
12333 });
12334 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12335 // TODO: We should allow undef elements here
12336 if (auto *CI = dyn_cast<ConstantInt>(V))
12337 return CI->getValue().isPowerOf2();
12338 return false;
12339 });
12340 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12341 // TODO: We should allow undef elements here
12342 if (auto *CI = dyn_cast<ConstantInt>(V))
12343 return CI->getValue().isNegatedPowerOf2();
12344 return false;
12345 });
12346
12347 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12348 if (IsConstant && IsUniform)
12349 VK = TTI::OK_UniformConstantValue;
12350 else if (IsConstant)
12351 VK = TTI::OK_NonUniformConstantValue;
12352 else if (IsUniform)
12353 VK = TTI::OK_UniformValue;
12354
12355 TTI::OperandValueProperties VP = TTI::OP_None;
12356 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12357 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12358
12359 return {VK, VP};
12360}
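// Editor's note (illustrative, not part of the LLVM source): operands
// {4, 4, 4, 4} are classified as {OK_UniformConstantValue, OP_PowerOf2},
// {1, 2, 4, 8} as {OK_NonUniformConstantValue, OP_PowerOf2}, and a mix of
// distinct non-constant values as {OK_AnyValue, OP_None}.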
12361
12362namespace {
12363/// The base class for shuffle instruction emission and shuffle cost estimation.
12364class BaseShuffleAnalysis {
12365protected:
12366 Type *ScalarTy = nullptr;
12367
12368 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12369
12370 /// V is expected to be a vectorized value.
12371 /// When REVEC is disabled, there is no difference between VF and
12372 /// VNumElements.
12373 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12374 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12375 /// of 8.
12376 unsigned getVF(Value *V) const {
12377 assert(V && "V cannot be nullptr");
12378 assert(isa<FixedVectorType>(V->getType()) &&
12379 "V does not have FixedVectorType");
12380 assert(ScalarTy && "ScalarTy cannot be nullptr");
12381 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12382 unsigned VNumElements =
12383 cast<FixedVectorType>(V->getType())->getNumElements();
12384 assert(VNumElements > ScalarTyNumElements &&
12385 "the number of elements of V is not large enough");
12386 assert(VNumElements % ScalarTyNumElements == 0 &&
12387 "the number of elements of V is not a vectorized value");
12388 return VNumElements / ScalarTyNumElements;
12389 }
12390
12391 /// Checks if the mask is an identity mask.
12392 /// \param IsStrict if it is true, the function returns false if the mask
12393 /// size does not match the vector size.
12394 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12395 bool IsStrict) {
12396 int Limit = Mask.size();
12397 int VF = VecTy->getNumElements();
12398 int Index = -1;
12399 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12400 return true;
12401 if (!IsStrict) {
12402 // Consider extract subvector starting from index 0.
12403 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12404 Index == 0)
12405 return true;
12406 // All VF-size submasks are identity (e.g.
12407 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12408 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12409 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12410 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12411 ShuffleVectorInst::isIdentityMask(Slice, VF);
12412 }))
12413 return true;
12414 }
12415 return false;
12416 }
12417
12418 /// Tries to combine 2 different masks into single one.
12419 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12420 /// change the size of the vector, \p LocalVF is the original size of the
12421 /// shuffled vector.
12422 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12423 ArrayRef<int> ExtMask) {
12424 unsigned VF = Mask.size();
12425 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12426 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12427 if (ExtMask[I] == PoisonMaskElem)
12428 continue;
12429 int MaskedIdx = Mask[ExtMask[I] % VF];
12430 NewMask[I] =
12431 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12432 }
12433 Mask.swap(NewMask);
12434 }
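// Editor's note (worked example, not part of the LLVM source): with
// LocalVF = 2, an inner Mask = <1, 0> and ExtMask = <1, 0, 3, 2>, the
// combined mask is <0, 1, 0, 1>: each ExtMask lane is redirected through the
// inner swap, so the two-step permutation collapses into a single shuffle of
// the original two-element vector.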
12435
12436 /// Looks through shuffles trying to reduce final number of shuffles in the
12437 /// code. The function looks through the previously emitted shuffle
12438 /// instructions and properly marks indices in the mask as undef.
12439 /// For example, given the code
12440 /// \code
12441 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12442 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12443 /// \endcode
12444 /// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12445 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12446 /// <0, 1, 2, 3> for the shuffle.
12447 /// If 2 operands are of different size, the smallest one will be resized and
12448 /// the mask recalculated properly.
12449 /// For example, given the code
12450 /// \code
12451 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12452 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12453 /// \endcode
12454 /// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12455 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12456 /// <0, 1, 2, 3> for the shuffle.
12457 /// So, it tries to transform permutations to simple vector merge, if
12458 /// possible.
12459 /// \param V The input vector which must be shuffled using the given \p Mask.
12460 /// If the better candidate is found, \p V is set to this best candidate
12461 /// vector.
12462 /// \param Mask The input mask for the shuffle. If the best candidate is found
12463 /// during looking-through-shuffles attempt, it is updated accordingly.
12464 /// \param SinglePermute true if the shuffle operation is originally a
12465 /// single-value-permutation. In this case the look-through-shuffles procedure
12466 /// may look for resizing shuffles as the best candidates.
12467 /// \return true if the shuffle results in the non-resizing identity shuffle
12468 /// (and thus can be ignored), false - otherwise.
12469 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12470 bool SinglePermute) {
12471 Value *Op = V;
12472 ShuffleVectorInst *IdentityOp = nullptr;
12473 SmallVector<int> IdentityMask;
12474 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12475 // Exit if it is not a fixed vector type or a size-changing shuffle.
12476 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12477 if (!SVTy)
12478 break;
12479 // Remember the identity or broadcast mask, if it is not a resizing
12480 // shuffle. If no better candidates are found, this Op and Mask will be
12481 // used in the final shuffle.
12482 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12483 if (!IdentityOp || !SinglePermute ||
12484 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12485 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12486 IdentityMask.size()))) {
12487 IdentityOp = SV;
12488 // Store the current mask in IdentityMask so we do not lose this info
12489 // later if IdentityOp is selected as the best candidate for the
12490 // permutation.
12491 IdentityMask.assign(Mask);
12492 }
12493 }
12494 // Remember the broadcast mask. If no better candidates are found, this Op
12495 // and Mask will be used in the final shuffle.
12496 // Zero splat can be used as identity too, since it might be used with
12497 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12498 // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
12499 // is expensive, and the analysis finds out that the source vector is just a
12500 // broadcast, the original mask can be transformed to the identity mask <0,
12501 // 1, 2, 3>.
12502 // \code
12503 // %0 = shuffle %v, poison, zeroinitializer
12504 // %res = shuffle %0, poison, <3, 1, 2, 0>
12505 // \endcode
12506 // may be transformed to
12507 // \code
12508 // %0 = shuffle %v, poison, zeroinitializer
12509 // %res = shuffle %0, poison, <0, 1, 2, 3>
12510 // \endcode
12511 if (SV->isZeroEltSplat()) {
12512 IdentityOp = SV;
12513 IdentityMask.assign(Mask);
12514 }
12515 int LocalVF = Mask.size();
12516 if (auto *SVOpTy =
12517 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12518 LocalVF = SVOpTy->getNumElements();
12519 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12520 for (auto [Idx, I] : enumerate(Mask)) {
12521 if (I == PoisonMaskElem ||
12522 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12523 continue;
12524 ExtMask[Idx] = SV->getMaskValue(I);
12525 }
12526 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12527 SV->getOperand(0),
12528 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12529 .all();
12530 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12531 SV->getOperand(1),
12532 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12533 .all();
12534 if (!IsOp1Undef && !IsOp2Undef) {
12535 // Update mask and mark undef elems.
12536 for (int &I : Mask) {
12537 if (I == PoisonMaskElem)
12538 continue;
12539 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12540 PoisonMaskElem)
12541 I = PoisonMaskElem;
12542 }
12543 break;
12544 }
12545 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12546 combineMasks(LocalVF, ShuffleMask, Mask);
12547 Mask.swap(ShuffleMask);
12548 if (IsOp2Undef)
12549 Op = SV->getOperand(0);
12550 else
12551 Op = SV->getOperand(1);
12552 }
12553 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12554 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12555 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12556 if (IdentityOp) {
12557 V = IdentityOp;
12558 assert(Mask.size() == IdentityMask.size() &&
12559 "Expected masks of same sizes.");
12560 // Clear known poison elements.
12561 for (auto [I, Idx] : enumerate(Mask))
12562 if (Idx == PoisonMaskElem)
12563 IdentityMask[I] = PoisonMaskElem;
12564 Mask.swap(IdentityMask);
12565 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12566 return SinglePermute &&
12567 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12568 /*IsStrict=*/true) ||
12569 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12570 Shuffle->isZeroEltSplat() &&
12571 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12572 all_of(enumerate(Mask), [&](const auto &P) {
12573 return P.value() == PoisonMaskElem ||
12574 Shuffle->getShuffleMask()[P.index()] == 0;
12575 })));
12576 }
12577 V = Op;
12578 return false;
12579 }
12580 V = Op;
12581 return true;
12582 }
12583
12584 /// Smart shuffle instruction emission, walks through shuffle trees and
12585 /// tries to find the best matching vector for the actual shuffle
12586 /// instruction.
12587 template <typename T, typename ShuffleBuilderTy>
12588 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12589 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12590 assert(V1 && "Expected at least one vector value.");
12591 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12592 SmallVector<int> NewMask(Mask);
12593 if (ScalarTyNumElements != 1) {
12594 assert(SLPReVec && "FixedVectorType is not expected.");
12595 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12596 Mask = NewMask;
12597 }
12598 if (V2)
12599 Builder.resizeToMatch(V1, V2);
12600 int VF = Mask.size();
12601 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12602 VF = FTy->getNumElements();
12603 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
12604 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12605 .all()) {
12606 // Peek through shuffles.
12607 Value *Op1 = V1;
12608 Value *Op2 = V2;
12609 int VF =
12610 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12611 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12612 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12613 for (int I = 0, E = Mask.size(); I < E; ++I) {
12614 if (Mask[I] < VF)
12615 CombinedMask1[I] = Mask[I];
12616 else
12617 CombinedMask2[I] = Mask[I] - VF;
12618 }
12619 Value *PrevOp1;
12620 Value *PrevOp2;
12621 do {
12622 PrevOp1 = Op1;
12623 PrevOp2 = Op2;
12624 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12625 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12626 // Check if we have 2 resizing shuffles - need to peek through operands
12627 // again.
12628 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12629 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12630 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12631 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12632 if (I == PoisonMaskElem)
12633 continue;
12634 ExtMask1[Idx] = SV1->getMaskValue(I);
12635 }
12636 SmallBitVector UseMask1 = buildUseMask(
12637 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12638 ->getNumElements(),
12639 ExtMask1, UseMask::SecondArg);
12640 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12641 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12642 if (I == PoisonMaskElem)
12643 continue;
12644 ExtMask2[Idx] = SV2->getMaskValue(I);
12645 }
12646 SmallBitVector UseMask2 = buildUseMask(
12647 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12648 ->getNumElements(),
12649 ExtMask2, UseMask::SecondArg);
12650 if (SV1->getOperand(0)->getType() ==
12651 SV2->getOperand(0)->getType() &&
12652 SV1->getOperand(0)->getType() != SV1->getType() &&
12653 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12654 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12655 Op1 = SV1->getOperand(0);
12656 Op2 = SV2->getOperand(0);
12657 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12658 int LocalVF = ShuffleMask1.size();
12659 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12660 LocalVF = FTy->getNumElements();
12661 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12662 CombinedMask1.swap(ShuffleMask1);
12663 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12664 LocalVF = ShuffleMask2.size();
12665 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12666 LocalVF = FTy->getNumElements();
12667 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12668 CombinedMask2.swap(ShuffleMask2);
12669 }
12670 }
12671 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12672 Builder.resizeToMatch(Op1, Op2);
12673 VF = std::max(cast<VectorType>(Op1->getType())
12674 ->getElementCount()
12675 .getKnownMinValue(),
12676 cast<VectorType>(Op2->getType())
12677 ->getElementCount()
12678 .getKnownMinValue());
12679 for (int I = 0, E = Mask.size(); I < E; ++I) {
12680 if (CombinedMask2[I] != PoisonMaskElem) {
12681 assert(CombinedMask1[I] == PoisonMaskElem &&
12682 "Expected undefined mask element");
12683 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12684 }
12685 }
12686 if (Op1 == Op2 &&
12687 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12688 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12689 isa<ShuffleVectorInst>(Op1) &&
12690 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12691 ArrayRef(CombinedMask1))))
12692 return Builder.createIdentity(Op1);
12693 return Builder.createShuffleVector(
12694 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12695 CombinedMask1);
12696 }
12697 if (isa<PoisonValue>(V1))
12698 return Builder.createPoison(
12699 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12700 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12701 assert(V1 && "Expected non-null value after looking through shuffles.");
12702
12703 if (!IsIdentity)
12704 return Builder.createShuffleVector(V1, NewMask);
12705 return Builder.createIdentity(V1);
12706 }
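// Editor's note (illustrative, not part of the LLVM source): shuffling
// %s1 = shufflevector %a, poison, <1, 0> and %s2 = shufflevector %b, poison,
// <1, 0> with mask <1, 0, 3, 2> peeks through both operands and emits a
// single shufflevector of %a and %b with mask <0, 1, 2, 3> (a plain
// concatenation) instead of a shuffle of the two intermediate shuffles.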
12707
12708 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12709 /// shuffle emission.
12710 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12711 ArrayRef<int> Mask) {
12712 for (unsigned I : seq<unsigned>(CommonMask.size()))
12713 if (Mask[I] != PoisonMaskElem)
12714 CommonMask[I] = I;
12715 }
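// Editor's note (illustrative, not part of the LLVM source): if the emitted
// shuffle used Mask = <P, 5, P, 7> (P = poison), lanes 1 and 3 of CommonMask
// are rewritten to 1 and 3, since those lanes now live at their own
// positions in the newly produced vector.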
12716};
12717} // namespace
12718
12719 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
12720static std::pair<InstructionCost, InstructionCost>
12721 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12722 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12723 Type *ScalarTy, VectorType *VecTy) {
12724 InstructionCost ScalarCost = 0;
12725 InstructionCost VecCost = 0;
12726 // Here we differentiate two cases: (1) when Ptrs represent a regular
12727 // vectorization tree node (as they are pointer arguments of scattered
12728 // loads) or (2) when Ptrs are the arguments of loads or stores being
12729 // vectorized as a plain wide unit-stride load/store since all the
12730 // loads/stores are known to be from/to adjacent locations.
12731 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12732 // Case 2: estimate the pointer-related costs when vectorizing to
12733 // a wide load/store.
12734 // Scalar cost is estimated as a set of pointers with known relationship
12735 // between them.
12736 // For vector code we will use BasePtr as argument for the wide load/store
12737 // but we also need to account for all the instructions which are going to
12738 // stay in vectorized code due to uses outside of these scalar
12739 // loads/stores.
12740 ScalarCost = TTI.getPointersChainCost(
12741 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12742 CostKind);
12743
12744 SmallVector<const Value *> PtrsRetainedInVecCode;
12745 for (Value *V : Ptrs) {
12746 if (V == BasePtr) {
12747 PtrsRetainedInVecCode.push_back(V);
12748 continue;
12749 }
12750 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12751 // For simplicity, assume Ptr stays in vectorized code if it's not a
12752 // GEP instruction. We don't care, since its cost is considered free.
12753 // TODO: We should check for any uses outside of vectorizable tree
12754 // rather than just single use.
12755 if (!Ptr || !Ptr->hasOneUse())
12756 PtrsRetainedInVecCode.push_back(V);
12757 }
12758
12759 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12760 // If all pointers stay in vectorized code then we don't have
12761 // any savings on that.
12762 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12763 }
12764 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12765 TTI::PointersChainInfo::getKnownStride(),
12766 VecTy, CostKind);
12767 } else {
12768 // Case 1: Ptrs are the arguments of loads that we are going to transform
12769 // into masked gather load intrinsic.
12770 // All the scalar GEPs will be removed as a result of vectorization.
12771 // For any external uses of some lanes extract element instructions will
12772 // be generated (whose cost is estimated separately).
12773 TTI::PointersChainInfo PtrsInfo =
12774 all_of(Ptrs,
12775 [](const Value *V) {
12776 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12777 return Ptr && !Ptr->hasAllConstantIndices();
12778 })
12779 ? TTI::PointersChainInfo::getUnknownStride()
12780 : TTI::PointersChainInfo::getKnownStride();
12781
12782 ScalarCost =
12783 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12784 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12785 if (!BaseGEP) {
12786 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12787 if (It != Ptrs.end())
12788 BaseGEP = cast<GEPOperator>(*It);
12789 }
12790 if (BaseGEP) {
12791 SmallVector<const Value *> Indices(BaseGEP->indices());
12792 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12793 BaseGEP->getPointerOperand(), Indices, VecTy,
12794 CostKind);
12795 }
12796 }
12797
12798 return std::make_pair(ScalarCost, VecCost);
12799}
12800
12801void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12802 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12803 "Expected gather node without reordering.");
12804 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12805 SmallSet<size_t, 2> LoadKeyUsed;
12806
12807 // Do not reorder a node if it is small (just 2 elements), all-constant,
12808 // or all its instructions already have the same opcode.
12809 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12810 all_of(TE.Scalars, isConstant))
12811 return;
12812
12813 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12814 return VectorizableTree[Idx]->isSame(TE.Scalars);
12815 }))
12816 return;
12817
12818 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12819 Key = hash_combine(hash_value(LI->getParent()), Key);
12820 Value *Ptr =
12821 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12822 if (LoadKeyUsed.contains(Key)) {
12823 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12824 if (LIt != LoadsMap.end()) {
12825 for (LoadInst *RLI : LIt->second) {
12826 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12827 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12828 /*StrictCheck=*/true))
12829 return hash_value(RLI->getPointerOperand());
12830 }
12831 for (LoadInst *RLI : LIt->second) {
12832 if (arePointersCompatible(RLI->getPointerOperand(),
12833 LI->getPointerOperand(), *TLI)) {
12834 hash_code SubKey = hash_value(RLI->getPointerOperand());
12835 return SubKey;
12836 }
12837 }
12838 if (LIt->second.size() > 2) {
12839 hash_code SubKey =
12840 hash_value(LIt->second.back()->getPointerOperand());
12841 return SubKey;
12842 }
12843 }
12844 }
12845 LoadKeyUsed.insert(Key);
12846 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12847 return hash_value(LI->getPointerOperand());
12848 };
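// Editor's note (not part of the LLVM source): the subkey above groups loads
// whose pointers are a known constant distance apart or otherwise
// compatible, so loads from the same underlying base hash into one cluster
// that the reordering below can turn into a vectorizable subsequence.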
12849 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12850 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12851 bool IsOrdered = true;
12852 unsigned NumInstructions = 0;
12853 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12854 // nodes.
12855 for (auto [I, V] : enumerate(TE.Scalars)) {
12856 size_t Key = 1, Idx = 1;
12857 if (auto *Inst = dyn_cast<Instruction>(V);
12858 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
12859 !isDeleted(Inst) && !isVectorized(V)) {
12860 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12861 /*AllowAlternate=*/false);
12862 ++NumInstructions;
12863 }
12864 auto &Container = SortedValues[Key];
12865 if (IsOrdered && !KeyToIndex.contains(V) &&
12866 !(isa<Constant, ExtractElementInst>(V) ||
12867 isVectorLikeInstWithConstOps(V)) &&
12868 ((Container.contains(Idx) &&
12869 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12870 (!Container.empty() && !Container.contains(Idx) &&
12871 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12872 IsOrdered = false;
12873 auto &KTI = KeyToIndex[V];
12874 if (KTI.empty())
12875 Container[Idx].push_back(V);
12876 KTI.push_back(I);
12877 }
12878 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12879 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12880 if (!IsOrdered && NumInstructions > 1) {
12881 unsigned Cnt = 0;
12882 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12883 for (const auto &D : SortedValues) {
12884 for (const auto &P : D.second) {
12885 unsigned Sz = 0;
12886 for (Value *V : P.second) {
12887 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12888 for (auto [K, Idx] : enumerate(Indices)) {
12889 TE.ReorderIndices[Cnt + K] = Idx;
12890 TE.Scalars[Cnt + K] = V;
12891 }
12892 Sz += Indices.size();
12893 Cnt += Indices.size();
12894 }
12895 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12896 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12897 *TTI, TE.Scalars.front()->getType(), Sz);
12898 SubVectors.emplace_back(Cnt - Sz, SubVF);
12899 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12900 DemandedElts.clearBit(I);
12901 } else if (!P.second.empty() && isConstant(P.second.front())) {
12902 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12903 DemandedElts.clearBit(I);
12904 }
12905 }
12906 }
12907 }
12908 // Reuses always require shuffles, so consider the reordering profitable.
12909 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12910 return;
12911 // Do simple cost estimation.
12912 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12913 InstructionCost Cost = 0;
12914 auto *ScalarTy = TE.Scalars.front()->getType();
12915 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12916 for (auto [Idx, Sz] : SubVectors) {
12917 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12918 Idx, getWidenedType(ScalarTy, Sz));
12919 }
12920 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12921 /*Insert=*/true,
12922 /*Extract=*/false, CostKind);
12923 int Sz = TE.Scalars.size();
12924 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12925 TE.ReorderIndices.end());
12926 for (unsigned I : seq<unsigned>(Sz)) {
12927 Value *V = TE.getOrdered(I);
12928 if (isa<PoisonValue>(V)) {
12929 ReorderMask[I] = PoisonMaskElem;
12930 } else if (isConstant(V) || DemandedElts[I]) {
12931 ReorderMask[I] = I + TE.ReorderIndices.size();
12932 }
12933 }
12934 Cost += ::getShuffleCost(*TTI,
12935 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12936 ? TTI::SK_PermuteTwoSrc
12937 : TTI::SK_PermuteSingleSrc,
12938 VecTy, ReorderMask);
12939 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12940 ReorderMask.assign(Sz, PoisonMaskElem);
12941 for (unsigned I : seq<unsigned>(Sz)) {
12942 Value *V = TE.getOrdered(I);
12943 if (isConstant(V)) {
12944 DemandedElts.clearBit(I);
12945 if (!isa<PoisonValue>(V))
12946 ReorderMask[I] = I;
12947 } else {
12948 ReorderMask[I] = I + Sz;
12949 }
12950 }
12951 InstructionCost BVCost =
12952 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12953 /*Insert=*/true, /*Extract=*/false, CostKind);
12954 if (!DemandedElts.isAllOnes())
12955 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12956 if (Cost >= BVCost) {
12957 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12958 reorderScalars(TE.Scalars, Mask);
12959 TE.ReorderIndices.clear();
12960 }
12961}
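// Editor's note (illustrative, not part of the LLVM source): a gather node
// with scalars {load A, add X, load B, add Y} may be reordered to
// {load A, load B, add X, add Y} so that each cluster forms a contiguous
// subvector; the cost comparison above then decides whether the recorded
// reorder indices are kept or dropped relative to a plain build-vector.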
12962
12963 /// Check if we can convert a fadd/fsub sequence to FMA.
12964 /// \returns The cost of the FMA if the conversion is possible, an invalid cost otherwise.
12965 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12966 const InstructionsState &S,
12967 DominatorTree &DT, const DataLayout &DL,
12968 const TargetTransformInfo &TTI,
12969 const TargetLibraryInfo &TLI) {
12970 assert(all_of(VL,
12971 [](Value *V) {
12972 return V->getType()->getScalarType()->isFloatingPointTy();
12973 }) &&
12974 "Can only convert to FMA for floating point types");
12975 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12976
12977 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12978 FastMathFlags FMF;
12979 FMF.set();
12980 for (Value *V : VL) {
12981 auto *I = dyn_cast<Instruction>(V);
12982 if (!I)
12983 continue;
12984 if (S.isCopyableElement(I))
12985 continue;
12986 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12987 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12988 continue;
12989 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12990 FMF &= FPCI->getFastMathFlags();
12991 }
12992 return FMF.allowContract();
12993 };
12994 if (!CheckForContractable(VL))
12995 return InstructionCost::getInvalid();
12996 // The fmul should also be contractable.
12997 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12998 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12999
13000 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
13001 if (!OpS.valid())
13002 return InstructionCost::getInvalid();
13003
13004 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13005 return InstructionCost::getInvalid();
13006 if (!CheckForContractable(Operands.front()))
13007 return InstructionCost::getInvalid();
13008 // Compare the costs.
13009 InstructionCost FMulPlusFAddCost = 0;
13010 InstructionCost FMACost = 0;
13011 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13012 FastMathFlags FMF;
13013 FMF.set();
13014 for (Value *V : VL) {
13015 auto *I = dyn_cast<Instruction>(V);
13016 if (!I)
13017 continue;
13018 if (!S.isCopyableElement(I))
13019 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13020 FMF &= FPCI->getFastMathFlags();
13021 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13022 }
13023 unsigned NumOps = 0;
13024 for (auto [V, Op] : zip(VL, Operands.front())) {
13025 if (S.isCopyableElement(V))
13026 continue;
13027 auto *I = dyn_cast<Instruction>(Op);
13028 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13029 if (auto *OpI = dyn_cast<Instruction>(V))
13030 FMACost += TTI.getInstructionCost(OpI, CostKind);
13031 if (I)
13032 FMACost += TTI.getInstructionCost(I, CostKind);
13033 continue;
13034 }
13035 ++NumOps;
13036 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13037 FMF &= FPCI->getFastMathFlags();
13038 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13039 }
13040 Type *Ty = VL.front()->getType();
13041 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13042 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13043 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13044}
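// Editor's note (illustrative, not part of the LLVM source): for
// VL = {fadd (fmul a, b), c; fadd (fmul d, e), f}, where every fmul/fadd
// carries the 'contract' fast-math flag and each fmul has a single use, the
// bundle is costed as llvm.fmuladd calls and reported convertible when that
// is cheaper than the separate fmul + fadd instructions.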
13045
13046 void BoUpSLP::transformNodes() {
13047 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13048 BaseGraphSize = VectorizableTree.size();
13049 // Turn graph transforming mode on and off, when done.
13050 class GraphTransformModeRAAI {
13051 bool &SavedIsGraphTransformMode;
13052
13053 public:
13054 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13055 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13056 IsGraphTransformMode = true;
13057 }
13058 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13059 } TransformContext(IsGraphTransformMode);
13060 // Operands are profitable if they are:
13061 // 1. At least one constant
13062 // or
13063 // 2. Splats
13064 // or
13065 // 3. Results in good vectorization opportunity, i.e. may generate vector
13066 // nodes and reduce cost of the graph.
13067 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13068 const InstructionsState &S) {
13069 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13070 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13071 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13072 I2->getOperand(Op));
13073 return all_of(
13074 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13075 return all_of(Cand,
13076 [](const std::pair<Value *, Value *> &P) {
13077 return isa<Constant>(P.first) ||
13078 isa<Constant>(P.second) || P.first == P.second;
13079 }) ||
13080 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
13081 });
13082 };
13083
13084 // Try to reorder gather nodes for better vectorization opportunities.
13085 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13086 TreeEntry &E = *VectorizableTree[Idx];
13087 if (E.isGather())
13088 reorderGatherNode(E);
13089 }
13090
13091 // Better to use the full gathered-loads analysis if there are only 2
13092 // gathered-load nodes, each having fewer than 16 elements.
13093 constexpr unsigned VFLimit = 16;
13094 bool ForceLoadGather =
13095 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13096 return TE->isGather() && TE->hasState() &&
13097 TE->getOpcode() == Instruction::Load &&
13098 TE->getVectorFactor() < VFLimit;
13099 }) == 2;
13100
13101 // Checks if the scalars are used in another node.
13102 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13103 function_ref<bool(Value *)> CheckContainer) {
13104 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13105 if (isa<PoisonValue>(V))
13106 return true;
13107 auto *I = dyn_cast<Instruction>(V);
13108 if (!I)
13109 return false;
13110 return is_contained(TE->Scalars, I) || CheckContainer(I);
13111 });
13112 };
13113 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13114 if (E.hasState()) {
13115 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13116 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13117 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13118 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13119 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13120 return is_contained(TEs, TE);
13121 });
13122 });
13123 }))
13124 return true;
13125
13126 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13127 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13128 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13129 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13130 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13131 return is_contained(TEs, TE);
13132 });
13133 });
13134 }))
13135 return true;
13136 } else {
13137 // Check if the gather node is a full copy of a split node.
13138 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13139 if (It != E.Scalars.end()) {
13140 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13141 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13142 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13143 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13144 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13145 return is_contained(TEs, TE);
13146 });
13147 });
13148 }))
13149 return true;
13150 }
13151 }
13152 return false;
13153 };
13154 // The tree may grow here, so iterate only over the nodes built before.
13155 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13156 TreeEntry &E = *VectorizableTree[Idx];
13157 if (E.isGather()) {
13158 ArrayRef<Value *> VL = E.Scalars;
13159 const unsigned Sz = getVectorElementSize(VL.front());
13160 unsigned MinVF = getMinVF(2 * Sz);
13161 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13162 // same opcode and same parent block or all constants.
13163 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13164 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13165 // We use allSameOpcode instead of isAltShuffle because we don't
13166 // want to use interchangeable instruction here.
13167 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13168 allConstant(VL) || isSplat(VL))
13169 continue;
13170 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13171 continue;
13172 // Check if the node is a copy of other vector nodes.
13173 if (CheckForSameVectorNodes(E))
13174 continue;
13175 // Try to find vectorizable sequences and transform them into a series of
13176 // insertvector instructions.
13177 unsigned StartIdx = 0;
13178 unsigned End = VL.size();
13179 for (unsigned VF = getFloorFullVectorNumberOfElements(
13180 *TTI, VL.front()->getType(), VL.size() - 1);
13181 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13182 *TTI, VL.front()->getType(), VF - 1)) {
13183 if (StartIdx + VF > End)
13184 continue;
13185 SmallVector<std::pair<unsigned, unsigned>> Slices;
13186 bool AllStrided = true;
13187 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13188 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13189 // If any instruction is vectorized already - do not try again.
13190 // Reuse the existing node, if it fully matches the slice.
13191 if (isVectorized(Slice.front()) &&
13192 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13193 continue;
13194 // Constants are already handled effectively - skip.
13195 if (allConstant(Slice))
13196 continue;
13197 // Do not try to vectorize small splats (smaller than a vector register
13198 // and with only a single non-undef element).
13199 bool IsSplat = isSplat(Slice);
13200 bool IsTwoRegisterSplat = true;
13201 if (IsSplat && VF == 2) {
13202 unsigned NumRegs2VF = ::getNumberOfParts(
13203 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13204 IsTwoRegisterSplat = NumRegs2VF == 2;
13205 }
13206 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13207 count(Slice, Slice.front()) ==
13208 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13209 : 1)) {
13210 if (IsSplat)
13211 continue;
13212 InstructionsState S = getSameOpcode(Slice, *TLI);
13213 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13214 (S.getOpcode() == Instruction::Load &&
13215 areKnownNonVectorizableLoads(Slice)) ||
13216 (S.getOpcode() != Instruction::Load &&
13217 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13218 continue;
13219 if (VF == 2) {
13220 // Try to vectorize reduced values or if all users are vectorized.
13221 // For expensive instructions extra extracts might be profitable.
13222 if ((!UserIgnoreList || E.Idx != 0) &&
13223 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13224 TTI::TCC_Expensive &&
13225 !all_of(Slice, [&](Value *V) {
13226 if (isa<PoisonValue>(V))
13227 return true;
13228 return areAllUsersVectorized(cast<Instruction>(V),
13229 UserIgnoreList);
13230 }))
13231 continue;
13232 if (S.getOpcode() == Instruction::Load) {
13233 OrdersType Order;
13234 SmallVector<Value *> PointerOps;
13235 StridedPtrInfo SPtrInfo;
13236 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13237 PointerOps, SPtrInfo);
13238 AllStrided &= Res == LoadsState::StridedVectorize ||
13239 Res == LoadsState::ScatterVectorize ||
13240 Res == LoadsState::Gather;
13241 // Do not vectorize gathers.
13242 if (Res == LoadsState::ScatterVectorize ||
13243 Res == LoadsState::Gather) {
13244 if (Res == LoadsState::Gather) {
13245 registerNonVectorizableLoads(Slice);
13246 // If reductions and the scalars from the root node are
13247 // analyzed - mark as non-vectorizable reduction.
13248 if (UserIgnoreList && E.Idx == 0)
13249 analyzedReductionVals(Slice);
13250 }
13251 continue;
13252 }
13253 } else if (S.getOpcode() == Instruction::ExtractElement ||
13254 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13255 TTI::TCC_Expensive &&
13256 !CheckOperandsProfitability(
13257 S.getMainOp(),
13258 cast<Instruction>(*find_if(reverse(Slice),
13259 IsaPred<Instruction>)),
13260 S))) {
13261 // Do not vectorize extractelements (handled effectively
13262 // already). Do not vectorize non-profitable instructions (with
13263 // low cost and non-vectorizable operands).
13264 continue;
13265 }
13266 }
13267 }
13268 Slices.emplace_back(Cnt, Slice.size());
13269 }
13270 // Do not try to vectorize if all slices are strided or gathered with
13271 // vector factor 2 and there are more than 2 slices. Better to handle
13272 // them in the gathered-loads analysis; it may result in better vectorization.
13273 if (VF == 2 && AllStrided && Slices.size() > 2)
13274 continue;
13275 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13276 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13277 if (StartIdx == Cnt)
13278 StartIdx = Cnt + Sz;
13279 if (End == Cnt + Sz)
13280 End = Cnt;
13281 };
13282 for (auto [Cnt, Sz] : Slices) {
13283 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13284 const TreeEntry *SameTE = nullptr;
13285 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13286 It != Slice.end()) {
13287 // If any instruction is vectorized already - do not try again.
13288 SameTE = getSameValuesTreeEntry(*It, Slice);
13289 }
13290 unsigned PrevSize = VectorizableTree.size();
13291 [[maybe_unused]] unsigned PrevEntriesSize =
13292 LoadEntriesToVectorize.size();
13293 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13294 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13295 VectorizableTree[PrevSize]->isGather() &&
13296 VectorizableTree[PrevSize]->hasState() &&
13297 VectorizableTree[PrevSize]->getOpcode() !=
13298 Instruction::ExtractElement &&
13299 !isSplat(Slice)) {
13300 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13301 analyzedReductionVals(Slice);
13302 VectorizableTree.pop_back();
13303 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13304 "LoadEntriesToVectorize expected to remain the same");
13305 continue;
13306 }
13307 AddCombinedNode(PrevSize, Cnt, Sz);
13308 }
13309 }
13310 // Restore ordering, if no extra vectorization happened.
13311 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13312 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13313 reorderScalars(E.Scalars, Mask);
13314 E.ReorderIndices.clear();
13315 }
13316 }
13317 if (!E.hasState())
13318 continue;
13319 switch (E.getOpcode()) {
13320 case Instruction::Load: {
13321 // No need to reorder masked gather loads, just reorder the scalar
13322 // operands.
13323 if (E.State != TreeEntry::Vectorize)
13324 break;
13325 Type *ScalarTy = E.getMainOp()->getType();
13326 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13327 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13328 // Check if it is profitable to represent a consecutive load + reverse as a
13329 // strided load with stride -1.
13330 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13331 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13332 SmallVector<int> Mask;
13333 inversePermutation(E.ReorderIndices, Mask);
13334 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13335 InstructionCost OriginalVecCost =
13336 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13337 BaseLI->getPointerAddressSpace(), CostKind,
13338 TTI::OperandValueInfo()) +
13339 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13340 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13341 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13342 VecTy, BaseLI->getPointerOperand(),
13343 /*VariableMask=*/false, CommonAlignment,
13344 BaseLI),
13345 CostKind);
13346 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13347 // Strided load is more profitable than consecutive load + reverse -
13348 // transform the node to strided load.
13349 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13350 ->getPointerOperand()
13351 ->getType());
13352 StridedPtrInfo SPtrInfo;
13353 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13354 SPtrInfo.Ty = VecTy;
13355 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13356 E.State = TreeEntry::StridedVectorize;
13357 }
13358 }
13359 break;
13360 }
13361 case Instruction::Store: {
13362 Type *ScalarTy =
13363 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13364 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13365 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13366 // Check if it is profitable to represent a consecutive store + reverse as a
13367 // strided store with stride -1.
13368 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13369 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13370 SmallVector<int> Mask;
13371 inversePermutation(E.ReorderIndices, Mask);
13372 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13373 InstructionCost OriginalVecCost =
13374 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13375 BaseSI->getPointerAddressSpace(), CostKind,
13376 TTI::OperandValueInfo()) +
13377 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13378 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13379 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13380 VecTy, BaseSI->getPointerOperand(),
13381 /*VariableMask=*/false, CommonAlignment,
13382 BaseSI),
13383 CostKind);
13384 if (StridedCost < OriginalVecCost)
13385 // Strided store is more profitable than reverse + consecutive store -
13386 // transform the node to strided store.
13387 E.State = TreeEntry::StridedVectorize;
13388 } else if (!E.ReorderIndices.empty()) {
13389 // Check for interleaved stores.
13390 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13391 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13392 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13393 if (Mask.size() < 4)
13394 return 0u;
13395 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13396 if (ShuffleVectorInst::isInterleaveMask(
13397 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13398 TTI.isLegalInterleavedAccessType(
13399 VecTy, Factor, BaseSI->getAlign(),
13400 BaseSI->getPointerAddressSpace()))
13401 return Factor;
13402 }
13403
13404 return 0u;
13405 };
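// Editor's note (illustrative, not part of the LLVM source): with 4 stores
// reordered by mask <0, 2, 1, 3>, ShuffleVectorInst::isInterleaveMask
// reports factor 2, so the node can be emitted as an interleaved store
// group when the target reports that access type as legal.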
13406 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13407 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13408 if (InterleaveFactor != 0)
13409 E.setInterleave(InterleaveFactor);
13410 }
13411 break;
13412 }
13413 case Instruction::Select: {
13414 if (E.State != TreeEntry::Vectorize)
13415 break;
13416 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13417 if (MinMaxID == Intrinsic::not_intrinsic)
13418 break;
13419 // This node is a minmax node.
13420 E.CombinedOp = TreeEntry::MinMax;
13421 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13422 if (SelectOnly && CondEntry->UserTreeIndex &&
13423 CondEntry->State == TreeEntry::Vectorize) {
13424 // The condition node is part of the combined minmax node.
13425 CondEntry->State = TreeEntry::CombinedVectorize;
13426 }
13427 break;
13428 }
13429 case Instruction::FSub:
13430 case Instruction::FAdd: {
13431 // Check if it is possible to convert (a*b)+c to fma.
13432 if (E.State != TreeEntry::Vectorize ||
13433 !E.getOperations().isAddSubLikeOp())
13434 break;
13435 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13436 .isValid())
13437 break;
13438 // This node is a fmuladd node.
13439 E.CombinedOp = TreeEntry::FMulAdd;
13440 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13441 if (FMulEntry->UserTreeIndex &&
13442 FMulEntry->State == TreeEntry::Vectorize) {
13443 // The FMul node is part of the combined fmuladd node.
13444 FMulEntry->State = TreeEntry::CombinedVectorize;
13445 }
13446 break;
13447 }
13448 default:
13449 break;
13450 }
13451 }
13452
13453 if (LoadEntriesToVectorize.empty()) {
13454 // Single load node - exit.
13455 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13456 VectorizableTree.front()->getOpcode() == Instruction::Load)
13457 return;
13458 // Small graph with small VF - exit.
13459 constexpr unsigned SmallTree = 3;
13460 constexpr unsigned SmallVF = 2;
13461 if ((VectorizableTree.size() <= SmallTree &&
13462 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13463 (VectorizableTree.size() <= 2 && UserIgnoreList))
13464 return;
13465
13466 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13467 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13468 getCanonicalGraphSize() <= SmallTree &&
13469 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13470 [](const std::unique_ptr<TreeEntry> &TE) {
13471 return TE->isGather() && TE->hasState() &&
13472 TE->getOpcode() == Instruction::Load &&
13473 !allSameBlock(TE->Scalars);
13474 }) == 1)
13475 return;
13476 }
13477
13478 // A list of loads to be gathered during the vectorization process. We can
13479 // try to vectorize them at the end, if profitable.
13480 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13481 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13482 GatheredLoads;
13483
13484 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13485 TreeEntry &E = *TE;
13486 if (E.isGather() &&
13487 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13488 (!E.hasState() && any_of(E.Scalars,
13489 [&](Value *V) {
13490 return isa<LoadInst>(V) &&
13491 !isVectorized(V) &&
13492 !isDeleted(cast<Instruction>(V));
13493 }))) &&
13494 !isSplat(E.Scalars)) {
13495 for (Value *V : E.Scalars) {
13496 auto *LI = dyn_cast<LoadInst>(V);
13497 if (!LI)
13498 continue;
13499 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13500 continue;
13501 gatherPossiblyVectorizableLoads(
13502 *this, V, *DL, *SE, *TTI,
13503 GatheredLoads[std::make_tuple(
13504 LI->getParent(),
13505 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13506 LI->getType())]);
13507 }
13508 }
13509 }
13510 // Try to vectorize gathered loads if this is not just a gather of loads.
13511 if (!GatheredLoads.empty())
13512 tryToVectorizeGatheredLoads(GatheredLoads);
13513}
13514
13515/// Merges shuffle masks and emits the final shuffle instruction, if required.
13516/// It supports shuffling of 2 input vectors. It implements lazy shuffle
13517/// emission: the actual shuffle instruction is generated only if it is really
13518/// required. Otherwise, shuffle emission is delayed until the end of the
13519/// process, to reduce the number of emitted instructions and to simplify
13520/// further analysis/transformations.
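// Illustrative usage sketch (an assumption about typical callers, not code
// from this file): nodes are fed in via add(E1, Mask1), add(E2, Mask2), ...,
// and only the concluding finalize(...) call forces the accumulated
// CommonMask to be costed, so repeated shuffles of the same nodes collapse
// into a single estimate.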
13521class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13522 bool IsFinalized = false;
13523 SmallVector<int> CommonMask;
13525 const TargetTransformInfo &TTI;
13526 InstructionCost Cost = 0;
13527 SmallDenseSet<Value *> VectorizedVals;
13528 BoUpSLP &R;
13529 SmallPtrSetImpl<Value *> &CheckedExtracts;
13530 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13531 /// While set, still trying to estimate the cost for the same nodes and we
13532 /// can delay actual cost estimation (virtual shuffle instruction emission).
13533 /// May help better estimate the cost if same nodes must be permuted + allows
13534 /// to move most of the long shuffles cost estimation to TTI.
13535 bool SameNodesEstimated = true;
13536
13537 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13538 if (Ty->getScalarType()->isPointerTy()) {
13539 Constant *Res = ConstantExpr::getIntToPtr(
13540 Constant::getAllOnesValue(
13541 IntegerType::get(Ty->getContext(),
13542 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13543 Ty->getScalarType());
13544 if (auto *VTy = dyn_cast<VectorType>(Ty))
13545 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13546 return Res;
13547 }
13548 return Constant::getAllOnesValue(Ty);
13549 }
13550
13551 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13552 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13553 return TTI::TCC_Free;
13554 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13555 InstructionCost GatherCost = 0;
13556 SmallVector<Value *> Gathers(VL);
13557 if (!Root && isSplat(VL)) {
13558 // Found a broadcast of a single scalar; calculate the cost as the cost
13559 // of the broadcast.
13560 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13561 assert(It != VL.end() && "Expected at least one non-undef value.");
13562 // Add broadcast for non-identity shuffle only.
13563 bool NeedShuffle =
13564 count(VL, *It) > 1 &&
13565 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13566 if (!NeedShuffle) {
13567 if (isa<FixedVectorType>(ScalarTy)) {
13568 assert(SLPReVec && "FixedVectorType is not expected.");
13569 return TTI.getShuffleCost(
13570 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13571 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13572 cast<FixedVectorType>(ScalarTy));
13573 }
13574 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13575 CostKind, std::distance(VL.begin(), It),
13576 PoisonValue::get(VecTy), *It);
13577 }
13578
13579 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13580 transform(VL, ShuffleMask.begin(), [](Value *V) {
13581 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13582 });
13583 InstructionCost InsertCost =
13584 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13585 PoisonValue::get(VecTy), *It);
13586 return InsertCost + ::getShuffleCost(TTI,
13587 TTI::SK_Broadcast,
13588 VecTy, ShuffleMask, CostKind,
13589 /*Index=*/0, /*SubTp=*/nullptr,
13590 /*Args=*/*It);
13591 }
13592 return GatherCost +
13593 (all_of(Gathers, IsaPred<UndefValue>)
13594 ? TTI::TCC_Free
13595 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13596 ScalarTy));
13597 };
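  // Illustrative examples: for VL = {%x, %x, %x, %x} the code above charges
  // one insertelement plus a broadcast shuffle; for
  // VL = {%x, poison, poison, poison} only the single insertelement is
  // charged, since the remaining poison lanes need no shuffle.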
13598
13599 /// Compute the cost of creating a vector containing the extracted values from
13600 /// \p VL.
13601 InstructionCost
13602 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13603 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13604 unsigned NumParts) {
13605 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13606 unsigned NumElts =
13607 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13608 auto *EE = dyn_cast<ExtractElementInst>(V);
13609 if (!EE)
13610 return Sz;
13611 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13612 if (!VecTy)
13613 return Sz;
13614 return std::max(Sz, VecTy->getNumElements());
13615 });
13616 // FIXME: this must be moved to TTI for better estimation.
13617 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13618 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13619 SmallVectorImpl<unsigned> &Indices,
13620 SmallVectorImpl<unsigned> &SubVecSizes)
13621 -> std::optional<TTI::ShuffleKind> {
13622 if (NumElts <= EltsPerVector)
13623 return std::nullopt;
13624 int OffsetReg0 =
13625 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13626 [](int S, int I) {
13627 if (I == PoisonMaskElem)
13628 return S;
13629 return std::min(S, I);
13630 }),
13631 EltsPerVector);
13632 int OffsetReg1 = OffsetReg0;
13633 DenseSet<int> RegIndices;
13634 // Check if this permutes the same single input vector or 2 input vectors.
13635 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13636 int FirstRegId = -1;
13637 Indices.assign(1, OffsetReg0);
13638 for (auto [Pos, I] : enumerate(Mask)) {
13639 if (I == PoisonMaskElem)
13640 continue;
13641 int Idx = I - OffsetReg0;
13642 int RegId =
13643 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13644 if (FirstRegId < 0)
13645 FirstRegId = RegId;
13646 RegIndices.insert(RegId);
13647 if (RegIndices.size() > 2)
13648 return std::nullopt;
13649 if (RegIndices.size() == 2) {
13650 ShuffleKind = TTI::SK_PermuteTwoSrc;
13651 if (Indices.size() == 1) {
13652 OffsetReg1 = alignDown(
13653 std::accumulate(
13654 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13655 [&](int S, int I) {
13656 if (I == PoisonMaskElem)
13657 return S;
13658 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13659 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13660 if (RegId == FirstRegId)
13661 return S;
13662 return std::min(S, I);
13663 }),
13664 EltsPerVector);
13665 unsigned Index = OffsetReg1 % NumElts;
13666 Indices.push_back(Index);
13667 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13668 }
13669 Idx = I - OffsetReg1;
13670 }
13671 I = (Idx % NumElts) % EltsPerVector +
13672 (RegId == FirstRegId ? 0 : EltsPerVector);
13673 }
13674 return ShuffleKind;
13675 };
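    // Worked example (illustrative): with NumElts = 8 and EltsPerVector = 4,
    // a sub-mask such as <8,9,1,0> touches two 4-wide registers, so the
    // lambda reports SK_PermuteTwoSrc and rebases the indices onto those two
    // registers; a mask spanning three or more registers yields std::nullopt
    // and falls back to a full-width shuffle estimate.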
13676 InstructionCost Cost = 0;
13677
13678 // Process extracts in blocks of EltsPerVector to check if the source vector
13679 // operand can be re-used directly. If not, add the cost of creating a
13680 // shuffle to extract the values into a vector register.
13681 for (unsigned Part : seq<unsigned>(NumParts)) {
13682 if (!ShuffleKinds[Part])
13683 continue;
13684 ArrayRef<int> MaskSlice = Mask.slice(
13685 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13686 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13687 copy(MaskSlice, SubMask.begin());
13688 SmallVector<unsigned, 2> Indices;
13689 SmallVector<unsigned, 2> SubVecSizes;
13690 std::optional<TTI::ShuffleKind> RegShuffleKind =
13691 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13692 if (!RegShuffleKind) {
13693 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13694 !ShuffleVectorInst::isIdentityMask(
13695 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13696 Cost +=
13697 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13698 getWidenedType(ScalarTy, NumElts), MaskSlice);
13699 continue;
13700 }
13701 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13702 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13703 Cost +=
13704 ::getShuffleCost(TTI, *RegShuffleKind,
13705 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13706 }
13707 const unsigned BaseVF = getFullVectorNumberOfElements(
13708 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13709 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13710 assert((Idx + SubVecSize) <= BaseVF &&
13711 "SK_ExtractSubvector index out of range");
13713 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13714 Idx, getWidenedType(ScalarTy, SubVecSize));
13715 }
13716 // Second check: see whether a single permute is estimated to be cheaper
13717 // than the subvector extracts.
13718 SubMask.assign(NumElts, PoisonMaskElem);
13719 copy(MaskSlice, SubMask.begin());
13720 InstructionCost OriginalCost = ::getShuffleCost(
13721 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13722 if (OriginalCost < Cost)
13723 Cost = OriginalCost;
13724 }
13725 return Cost;
13726 }
13727 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13728 /// mask \p Mask, for register number \p Part, which includes \p SliceSize
13729 /// elements.
13730 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13731 ArrayRef<int> Mask, unsigned Part,
13732 unsigned SliceSize) {
13733 if (SameNodesEstimated) {
13734 // Delay the cost estimation if the same nodes are being reshuffled.
13735 // If we already requested the cost of reshuffling of E1 and E2 before, no
13736 // need to estimate another cost with the sub-Mask, instead include this
13737 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13738 // estimation.
13739 if ((InVectors.size() == 2 &&
13740 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13741 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13742 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13743 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13744 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13745 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13746 "Expected all poisoned elements.");
13747 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13748 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13749 return;
13750 }
13751 // Found non-matching nodes; need to estimate the cost for the matched
13752 // nodes and transform the mask.
13753 Cost += createShuffle(InVectors.front(),
13754 InVectors.size() == 1 ? nullptr : InVectors.back(),
13755 CommonMask);
13756 transformMaskAfterShuffle(CommonMask, CommonMask);
13757 } else if (InVectors.size() == 2) {
13758 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13759 transformMaskAfterShuffle(CommonMask, CommonMask);
13760 }
13761 SameNodesEstimated = false;
13762 if (!E2 && InVectors.size() == 1) {
13763 unsigned VF = E1.getVectorFactor();
13764 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13765 VF = std::max(VF, getVF(V1));
13766 } else {
13767 const auto *E = cast<const TreeEntry *>(InVectors.front());
13768 VF = std::max(VF, E->getVectorFactor());
13769 }
13770 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13771 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13772 CommonMask[Idx] = Mask[Idx] + VF;
13773 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13774 transformMaskAfterShuffle(CommonMask, CommonMask);
13775 } else {
13776 auto P = InVectors.front();
13777 Cost += createShuffle(&E1, E2, Mask);
13778 unsigned VF = Mask.size();
13779 if (Value *V1 = dyn_cast<Value *>(P)) {
13780 VF = std::max(VF,
13781 getNumElements(V1->getType()));
13782 } else {
13783 const auto *E = cast<const TreeEntry *>(P);
13784 VF = std::max(VF, E->getVectorFactor());
13785 }
13786 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13787 if (Mask[Idx] != PoisonMaskElem)
13788 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13789 Cost += createShuffle(P, InVectors.front(), CommonMask);
13790 transformMaskAfterShuffle(CommonMask, CommonMask);
13791 }
13792 }
13793
13794 class ShuffleCostBuilder {
13795 const TargetTransformInfo &TTI;
13796
13797 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13798 int Index = -1;
13799 return Mask.empty() ||
13800 (VF == Mask.size() &&
13801 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13802 (ShuffleVectorInst::isExtractSubvectorMask(Mask, 2 * VF, Index) &&
13803 Index == 0);
13804 }
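    // Illustrative: for VF = 4, an empty mask and the identity mask
    // <0,1,2,3> are free, as is a mask that merely extracts the leading
    // subvector (Index == 0).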
13805
13806 public:
13807 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13808 ~ShuffleCostBuilder() = default;
13809 InstructionCost createShuffleVector(Value *V1, Value *,
13810 ArrayRef<int> Mask) const {
13811 // Empty mask or identity mask are free.
13812 unsigned VF =
13813 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13814 if (isEmptyOrIdentity(Mask, VF))
13815 return TTI::TCC_Free;
13816 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13817 cast<VectorType>(V1->getType()), Mask);
13818 }
13819 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13820 // Empty mask or identity mask are free.
13821 unsigned VF =
13822 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13823 if (isEmptyOrIdentity(Mask, VF))
13824 return TTI::TCC_Free;
13825 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13826 cast<VectorType>(V1->getType()), Mask);
13827 }
13828 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13829 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13830 return TTI::TCC_Free;
13831 }
13832 void resizeToMatch(Value *&, Value *&) const {}
13833 };
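  // Design note: this builder allows the shared
  // BaseShuffleAnalysis::createShuffle template to drive cost estimation
  // with the same mask-simplification logic used when emitting real
  // shufflevector instructions; each "emitted" value becomes a TTI query.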
13834
13835 /// Smart shuffle instruction emission: walks through shuffle trees and
13836 /// tries to find the best matching vector for the actual shuffle
13837 /// instruction.
13838 InstructionCost
13839 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13840 const PointerUnion<Value *, const TreeEntry *> &P2,
13841 ArrayRef<int> Mask) {
13842 ShuffleCostBuilder Builder(TTI);
13843 SmallVector<int> CommonMask(Mask);
13844 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13845 unsigned CommonVF = Mask.size();
13846 InstructionCost ExtraCost = 0;
13847 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13848 unsigned VF) -> InstructionCost {
13849 if (E.isGather() && allConstant(E.Scalars))
13850 return TTI::TCC_Free;
13851 Type *EScalarTy = E.Scalars.front()->getType();
13852 bool IsSigned = true;
13853 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13854 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13855 IsSigned = It->second.second;
13856 }
13857 if (EScalarTy != ScalarTy) {
13858 unsigned CastOpcode = Instruction::Trunc;
13859 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13860 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13861 if (DstSz > SrcSz)
13862 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13863 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13864 getWidenedType(EScalarTy, VF),
13865 TTI::CastContextHint::None, CostKind);
13866 }
13867 return TTI::TCC_Free;
13868 };
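    // Illustrative example: if a node was demoted to i16 in MinBWs while the
    // common ScalarTy is i32, this lambda charges a vector sext or zext
    // (depending on the recorded signedness); matching types cost TCC_Free.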
13869 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13870 if (isa<Constant>(V))
13871 return TTI::TCC_Free;
13872 auto *VecTy = cast<VectorType>(V->getType());
13873 Type *EScalarTy = VecTy->getElementType();
13874 if (EScalarTy != ScalarTy) {
13875 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13876 unsigned CastOpcode = Instruction::Trunc;
13877 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13878 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13879 if (DstSz > SrcSz)
13880 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13881 return TTI.getCastInstrCost(
13882 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13883 VecTy, TTI::CastContextHint::None, CostKind);
13884 }
13885 return TTI::TCC_Free;
13886 };
13887 if (!V1 && !V2 && !P2.isNull()) {
13888 // Shuffle 2 entry nodes.
13889 const TreeEntry *E = cast<const TreeEntry *>(P1);
13890 unsigned VF = E->getVectorFactor();
13891 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13892 CommonVF = std::max(VF, E2->getVectorFactor());
13893 assert(all_of(Mask,
13894 [=](int Idx) {
13895 return Idx < 2 * static_cast<int>(CommonVF);
13896 }) &&
13897 "All elements in mask must be less than 2 * CommonVF.");
13898 if (E->Scalars.size() == E2->Scalars.size()) {
13899 SmallVector<int> EMask = E->getCommonMask();
13900 SmallVector<int> E2Mask = E2->getCommonMask();
13901 if (!EMask.empty() || !E2Mask.empty()) {
13902 for (int &Idx : CommonMask) {
13903 if (Idx == PoisonMaskElem)
13904 continue;
13905 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13906 Idx = EMask[Idx];
13907 else if (Idx >= static_cast<int>(CommonVF))
13908 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13909 E->Scalars.size();
13910 }
13911 }
13912 CommonVF = E->Scalars.size();
13913 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13914 GetNodeMinBWAffectedCost(*E2, CommonVF);
13915 } else {
13916 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13917 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13918 }
13919 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13920 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13921 } else if (!V1 && P2.isNull()) {
13922 // Shuffle single entry node.
13923 const TreeEntry *E = cast<const TreeEntry *>(P1);
13924 unsigned VF = E->getVectorFactor();
13925 CommonVF = VF;
13926 assert(
13927 all_of(Mask,
13928 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13929 "All elements in mask must be less than CommonVF.");
13930 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13931 SmallVector<int> EMask = E->getCommonMask();
13932 assert(!EMask.empty() && "Expected non-empty common mask.");
13933 for (int &Idx : CommonMask) {
13934 if (Idx != PoisonMaskElem)
13935 Idx = EMask[Idx];
13936 }
13937 CommonVF = E->Scalars.size();
13938 } else if (unsigned Factor = E->getInterleaveFactor();
13939 Factor > 0 && E->Scalars.size() != Mask.size() &&
13940 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13941 Factor)) {
13942 // Deinterleaved nodes are free.
13943 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13944 }
13945 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13946 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13947 // Not identity/broadcast? Try to see if the original vector is better.
13948 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13949 CommonVF == CommonMask.size() &&
13950 any_of(enumerate(CommonMask),
13951 [](const auto &&P) {
13952 return P.value() != PoisonMaskElem &&
13953 static_cast<unsigned>(P.value()) != P.index();
13954 }) &&
13955 any_of(CommonMask,
13956 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13957 SmallVector<int> ReorderMask;
13958 inversePermutation(E->ReorderIndices, ReorderMask);
13959 ::addMask(CommonMask, ReorderMask);
13960 }
13961 } else if (V1 && P2.isNull()) {
13962 // Shuffle single vector.
13963 ExtraCost += GetValueMinBWAffectedCost(V1);
13964 CommonVF = getVF(V1);
13965 assert(
13966 all_of(Mask,
13967 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13968 "All elements in mask must be less than CommonVF.");
13969 } else if (V1 && !V2) {
13970 // Shuffle vector and tree node.
13971 unsigned VF = getVF(V1);
13972 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13973 CommonVF = std::max(VF, E2->getVectorFactor());
13974 assert(all_of(Mask,
13975 [=](int Idx) {
13976 return Idx < 2 * static_cast<int>(CommonVF);
13977 }) &&
13978 "All elements in mask must be less than 2 * CommonVF.");
13979 if (E2->Scalars.size() == VF && VF != CommonVF) {
13980 SmallVector<int> E2Mask = E2->getCommonMask();
13981 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13982 for (int &Idx : CommonMask) {
13983 if (Idx == PoisonMaskElem)
13984 continue;
13985 if (Idx >= static_cast<int>(CommonVF))
13986 Idx = E2Mask[Idx - CommonVF] + VF;
13987 }
13988 CommonVF = VF;
13989 }
13990 ExtraCost += GetValueMinBWAffectedCost(V1);
13991 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13992 ExtraCost += GetNodeMinBWAffectedCost(
13993 *E2, std::min(CommonVF, E2->getVectorFactor()));
13994 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13995 } else if (!V1 && V2) {
13996 // Shuffle vector and tree node.
13997 unsigned VF = getVF(V2);
13998 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13999 CommonVF = std::max(VF, E1->getVectorFactor());
14000 assert(all_of(Mask,
14001 [=](int Idx) {
14002 return Idx < 2 * static_cast<int>(CommonVF);
14003 }) &&
14004 "All elements in mask must be less than 2 * CommonVF.");
14005 if (E1->Scalars.size() == VF && VF != CommonVF) {
14006 SmallVector<int> E1Mask = E1->getCommonMask();
14007 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14008 for (int &Idx : CommonMask) {
14009 if (Idx == PoisonMaskElem)
14010 continue;
14011 if (Idx >= static_cast<int>(CommonVF))
14012 Idx = E1Mask[Idx - CommonVF] + VF;
14013 else
14014 Idx = E1Mask[Idx];
14015 }
14016 CommonVF = VF;
14017 }
14018 ExtraCost += GetNodeMinBWAffectedCost(
14019 *E1, std::min(CommonVF, E1->getVectorFactor()));
14020 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14021 ExtraCost += GetValueMinBWAffectedCost(V2);
14022 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14023 } else {
14024 assert(V1 && V2 && "Expected both vectors.");
14025 unsigned VF = getVF(V1);
14026 CommonVF = std::max(VF, getVF(V2));
14027 assert(all_of(Mask,
14028 [=](int Idx) {
14029 return Idx < 2 * static_cast<int>(CommonVF);
14030 }) &&
14031 "All elements in mask must be less than 2 * CommonVF.");
14032 ExtraCost +=
14033 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14034 if (V1->getType() != V2->getType()) {
14035 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14036 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14037 } else {
14038 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14039 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14040 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14041 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14042 }
14043 }
14044 InVectors.front() =
14045 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14046 if (InVectors.size() == 2)
14047 InVectors.pop_back();
14048 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14049 V1, V2, CommonMask, Builder, ScalarTy);
14050 }
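  // Design note: both operands are normalized above to placeholder constants
  // (null and all-ones vectors) of the common width, so the cost model
  // reasons purely about types and masks rather than about real IR values.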
14051
14052public:
14053 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
14054 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14055 SmallPtrSetImpl<Value *> &CheckedExtracts)
14056 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14057 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14058 CheckedExtracts(CheckedExtracts) {}
14059 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14060 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14061 unsigned NumParts, bool &UseVecBaseAsInput) {
14062 UseVecBaseAsInput = false;
14063 if (Mask.empty())
14064 return nullptr;
14065 Value *VecBase = nullptr;
14066 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14067 if (!E->ReorderIndices.empty()) {
14068 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14069 E->ReorderIndices.end());
14070 reorderScalars(VL, ReorderMask);
14071 }
14072 // Check if the extracts can be considered reused when the same
14073 // extractelements were already vectorized in an earlier node.
14074 bool PrevNodeFound = any_of(
14075 ArrayRef(R.VectorizableTree).take_front(E->Idx),
14076 [&](const std::unique_ptr<TreeEntry> &TE) {
14077 return ((TE->hasState() && !TE->isAltShuffle() &&
14078 TE->getOpcode() == Instruction::ExtractElement) ||
14079 TE->isGather()) &&
14080 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
14081 return VL.size() > Data.index() &&
14082 (Mask[Data.index()] == PoisonMaskElem ||
14083 isa<UndefValue>(VL[Data.index()]) ||
14084 Data.value() == VL[Data.index()]);
14085 });
14086 });
14087 SmallPtrSet<Value *, 4> UniqueBases;
14088 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14089 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14090 for (unsigned Part : seq<unsigned>(NumParts)) {
14091 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14092 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14093 for (auto [I, V] :
14094 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
14095 // Ignore non-extractelement scalars.
14096 if (isa<UndefValue>(V) ||
14097 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14098 continue;
14099 // If all users of instruction are going to be vectorized and this
14100 // instruction itself is not going to be vectorized, consider this
14101 // instruction as dead and remove its cost from the final cost of the
14102 // vectorized tree.
14103 // Also, avoid adjusting the cost for extractelements with multiple uses
14104 // in different graph entries.
14105 auto *EE = cast<ExtractElementInst>(V);
14106 VecBase = EE->getVectorOperand();
14107 UniqueBases.insert(VecBase);
14108 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14109 if (!CheckedExtracts.insert(V).second ||
14110 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
14111 any_of(EE->users(),
14112 [&](User *U) {
14113 return isa<GetElementPtrInst>(U) &&
14114 !R.areAllUsersVectorized(cast<Instruction>(U),
14115 &VectorizedVals);
14116 }) ||
14117 (!VEs.empty() && !is_contained(VEs, E)))
14118 continue;
14119 std::optional<unsigned> EEIdx = getExtractIndex(EE);
14120 if (!EEIdx)
14121 continue;
14122 unsigned Idx = *EEIdx;
14123 // Take credit for instruction that will become dead.
14124 if (EE->hasOneUse() || !PrevNodeFound) {
14125 Instruction *Ext = EE->user_back();
14126 if (isa<SExtInst, ZExtInst>(Ext) &&
14127 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14128 // Use getExtractWithExtendCost() to calculate the cost of
14129 // extractelement/ext pair.
14130 Cost -= TTI.getExtractWithExtendCost(
14131 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
14132 Idx, CostKind);
14133 // Add back the cost of s|zext which is subtracted separately.
14134 Cost += TTI.getCastInstrCost(
14135 Ext->getOpcode(), Ext->getType(), EE->getType(),
14136 TTI::CastContextHint::None, CostKind);
14137 continue;
14138 }
14139 }
14140 APInt &DemandedElts =
14141 VectorOpsToExtracts
14142 .try_emplace(VecBase,
14143 APInt::getZero(getNumElements(VecBase->getType())))
14144 .first->getSecond();
14145 DemandedElts.setBit(Idx);
14146 }
14147 }
14148 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14149 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
14150 DemandedElts, /*Insert=*/false,
14151 /*Extract=*/true, CostKind);
14152 // Check that the gather of extractelements can be represented as just a
14153 // shuffle of one or two vectors that the scalars are extracted from.
14154 // We found a bunch of extractelement instructions that must be gathered
14155 // into a vector and can be represented as a permutation of elements of a
14156 // single input vector or of 2 input vectors.
14157 // Already accounted for if the same extractelements were vectorized.
14158 if (!PrevNodeFound)
14159 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14160 InVectors.assign(1, E);
14161 CommonMask.assign(Mask.begin(), Mask.end());
14162 transformMaskAfterShuffle(CommonMask, CommonMask);
14163 SameNodesEstimated = false;
14164 if (NumParts != 1 && UniqueBases.size() != 1) {
14165 UseVecBaseAsInput = true;
14166 VecBase =
14167 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14168 }
14169 return VecBase;
14170 }
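  // Illustrative example: for a gather of
  // {extractelement <4 x i32> %v, i32 0..3} whose extracts all die once the
  // tree is vectorized, the code above subtracts the scalarization overhead
  // of the demanded lanes of %v, so reusing %v directly can be cheaper than
  // rebuilding the vector element by element.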
14171 /// Checks if the specified entry \p E needs to be delayed because of its
14172 /// dependency nodes.
14173 std::optional<InstructionCost>
14174 needToDelay(const TreeEntry *,
14175 ArrayRef<SmallVector<const TreeEntry *>>) const {
14176 // No need to delay the cost estimation during analysis.
14177 return std::nullopt;
14178 }
14179 /// Reset the builder to handle a perfect diamond match.
14180 void resetForSameNode() {
14181 IsFinalized = false;
14182 CommonMask.clear();
14183 InVectors.clear();
14184 Cost = 0;
14185 VectorizedVals.clear();
14186 SameNodesEstimated = true;
14187 }
14188 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14189 if (&E1 == &E2) {
14190 assert(all_of(Mask,
14191 [&](int Idx) {
14192 return Idx < static_cast<int>(E1.getVectorFactor());
14193 }) &&
14194 "Expected single vector shuffle mask.");
14195 add(E1, Mask);
14196 return;
14197 }
14198 if (InVectors.empty()) {
14199 CommonMask.assign(Mask.begin(), Mask.end());
14200 InVectors.assign({&E1, &E2});
14201 return;
14202 }
14203 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14204 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14205 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14206 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14207 const auto *It =
14208 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14209 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14210 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14211 }
14212 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14213 if (InVectors.empty()) {
14214 CommonMask.assign(Mask.begin(), Mask.end());
14215 InVectors.assign(1, &E1);
14216 return;
14217 }
14218 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14219 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14220 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14221 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14222 const auto *It =
14223 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14224 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14225 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14226 if (!SameNodesEstimated && InVectors.size() == 1)
14227 InVectors.emplace_back(&E1);
14228 }
14229 /// Adds 2 input vectors and the mask for their shuffling.
14230 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14231 // May come only for shuffling of 2 vectors with extractelements, already
14232 // handled in adjustExtracts.
14233 assert(InVectors.size() == 1 &&
14234 all_of(enumerate(CommonMask),
14235 [&](auto P) {
14236 if (P.value() == PoisonMaskElem)
14237 return Mask[P.index()] == PoisonMaskElem;
14238 auto *EI = cast<ExtractElementInst>(
14239 cast<const TreeEntry *>(InVectors.front())
14240 ->getOrdered(P.index()));
14241 return EI->getVectorOperand() == V1 ||
14242 EI->getVectorOperand() == V2;
14243 }) &&
14244 "Expected extractelement vectors.");
14245 }
14246 /// Adds one more input vector and the mask for the shuffling.
14247 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14248 if (InVectors.empty()) {
14249 assert(CommonMask.empty() && !ForExtracts &&
14250 "Expected empty input mask/vectors.");
14251 CommonMask.assign(Mask.begin(), Mask.end());
14252 InVectors.assign(1, V1);
14253 return;
14254 }
14255 if (ForExtracts) {
14256 // No need to add vectors here, already handled them in adjustExtracts.
14257 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14258 !CommonMask.empty() &&
14259 all_of(enumerate(CommonMask),
14260 [&](auto P) {
14261 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14262 ->getOrdered(P.index());
14263 if (P.value() == PoisonMaskElem)
14264 return P.value() == Mask[P.index()] ||
14265 isa<UndefValue>(Scalar);
14266 if (isa<Constant>(V1))
14267 return true;
14268 auto *EI = cast<ExtractElementInst>(Scalar);
14269 return EI->getVectorOperand() == V1;
14270 }) &&
14271 "Expected only tree entry for extractelement vectors.");
14272 return;
14273 }
14274 assert(!InVectors.empty() && !CommonMask.empty() &&
14275 "Expected only tree entries from extracts/reused buildvectors.");
14276 unsigned VF = getVF(V1);
14277 if (InVectors.size() == 2) {
14278 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14279 transformMaskAfterShuffle(CommonMask, CommonMask);
14280 VF = std::max<unsigned>(VF, CommonMask.size());
14281 } else if (const auto *InTE =
14282 InVectors.front().dyn_cast<const TreeEntry *>()) {
14283 VF = std::max(VF, InTE->getVectorFactor());
14284 } else {
14285 VF = std::max(
14286 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14287 ->getNumElements());
14288 }
14289 InVectors.push_back(V1);
14290 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14291 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14292 CommonMask[Idx] = Mask[Idx] + VF;
14293 }
14294 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14295 Value *Root = nullptr) {
14296 Cost += getBuildVectorCost(VL, Root);
14297 if (!Root) {
14298 // FIXME: Need to find a way to avoid use of getNullValue here.
14299 SmallVector<Constant *> Vals;
14300 unsigned VF = VL.size();
14301 if (MaskVF != 0)
14302 VF = std::min(VF, MaskVF);
14303 Type *VLScalarTy = VL.front()->getType();
14304 for (Value *V : VL.take_front(VF)) {
14305 Type *ScalarTy = VLScalarTy->getScalarType();
14306 if (isa<PoisonValue>(V)) {
14307 Vals.push_back(PoisonValue::get(ScalarTy));
14308 continue;
14309 }
14310 if (isa<UndefValue>(V)) {
14311 Vals.push_back(UndefValue::get(ScalarTy));
14312 continue;
14313 }
14314 Vals.push_back(Constant::getNullValue(ScalarTy));
14315 }
14316 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14317 assert(SLPReVec && "FixedVectorType is not expected.");
14318 // When REVEC is enabled, we need to expand vector types into scalar
14319 // types.
14320 Vals = replicateMask(Vals, VecTy->getNumElements());
14321 }
14322 return ConstantVector::get(Vals);
14323 }
14324 return ConstantVector::getSplat(
14325 ElementCount::getFixed(
14326 cast<FixedVectorType>(Root->getType())->getNumElements()),
14327 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14328 }
14330 /// Finalize emission of the shuffles.
14331 InstructionCost finalize(
14332 ArrayRef<int> ExtMask,
14333 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14334 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14335 function_ref<void(Value *&, SmallVectorImpl<int> &,
14336 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14337 Action = {}) {
14338 IsFinalized = true;
14339 if (Action) {
14340 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14341 if (InVectors.size() == 2)
14342 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14343 else
14344 Cost += createShuffle(Vec, nullptr, CommonMask);
14345 transformMaskAfterShuffle(CommonMask, CommonMask);
14346 assert(VF > 0 &&
14347 "Expected vector length for the final value before action.");
14348 Value *V = cast<Value *>(Vec);
14349 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14350 Cost += createShuffle(V1, V2, Mask);
14351 return V1;
14352 });
14353 InVectors.front() = V;
14354 }
14355 if (!SubVectors.empty()) {
14356 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14357 if (InVectors.size() == 2)
14358 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14359 else
14360 Cost += createShuffle(Vec, nullptr, CommonMask);
14361 transformMaskAfterShuffle(CommonMask, CommonMask);
14362 // Add subvectors permutation cost.
14363 if (!SubVectorsMask.empty()) {
14364 assert(SubVectorsMask.size() <= CommonMask.size() &&
14365 "Expected same size of masks for subvectors and common mask.");
14366 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14367 copy(SubVectorsMask, SVMask.begin());
14368 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14369 if (I2 != PoisonMaskElem) {
14370 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14371 I1 = I2 + CommonMask.size();
14372 }
14373 }
14374 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14375 getWidenedType(ScalarTy, CommonMask.size()),
14376 SVMask, CostKind);
14377 }
14378 for (auto [E, Idx] : SubVectors) {
14379 Type *EScalarTy = E->Scalars.front()->getType();
14380 bool IsSigned = true;
14381 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14382 EScalarTy =
14383 IntegerType::get(EScalarTy->getContext(), It->second.first);
14384 IsSigned = It->second.second;
14385 }
14386 if (ScalarTy != EScalarTy) {
14387 unsigned CastOpcode = Instruction::Trunc;
14388 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14389 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14390 if (DstSz > SrcSz)
14391 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14392 Cost += TTI.getCastInstrCost(
14393 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14394 getWidenedType(EScalarTy, E->getVectorFactor()),
14395 TTI::CastContextHint::None, CostKind);
14396 }
14397 Cost += ::getShuffleCost(
14398 TTI, TTI::SK_InsertSubvector,
14399 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14400 getWidenedType(ScalarTy, E->getVectorFactor()));
14401 if (!CommonMask.empty()) {
14402 std::iota(std::next(CommonMask.begin(), Idx),
14403 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14404 Idx);
14405 }
14406 }
14407 }
14408
14409 if (!ExtMask.empty()) {
14410 if (CommonMask.empty()) {
14411 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14412 } else {
14413 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14414 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14415 if (ExtMask[I] == PoisonMaskElem)
14416 continue;
14417 NewMask[I] = CommonMask[ExtMask[I]];
14418 }
14419 CommonMask.swap(NewMask);
14420 }
14421 }
14422 if (CommonMask.empty()) {
14423 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14424 return Cost;
14425 }
14426 return Cost +
14427 createShuffle(InVectors.front(),
14428 InVectors.size() == 2 ? InVectors.back() : nullptr,
14429 CommonMask);
14430 }
14431
14432 ~ShuffleCostEstimator() {
14433 assert((IsFinalized || CommonMask.empty()) &&
14434 "Shuffle construction must be finalized.");
14435 }
14436};
14437
14438const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14439 unsigned Idx) const {
14440 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14441 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14442 return Op;
14443}
14444
14445TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14446 if (TE.State == TreeEntry::ScatterVectorize ||
14447 TE.State == TreeEntry::StridedVectorize)
14448 return TTI::CastContextHint::GatherScatter;
14449 if (TE.State == TreeEntry::CompressVectorize)
14450 return TTI::CastContextHint::Masked;
14451 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14452 !TE.isAltShuffle()) {
14453 if (TE.ReorderIndices.empty())
14454 return TTI::CastContextHint::Normal;
14455 SmallVector<int> Mask;
14456 inversePermutation(TE.ReorderIndices, Mask);
14457 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14458 return TTI::CastContextHint::Reversed;
14459 }
14460 return TTI::CastContextHint::None;
14461}
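// Summary (illustrative): scattered/strided loads map to GatherScatter,
// compressed loads to Masked, an in-order vectorized load to Normal, and a
// load whose reorder mask is a reverse (e.g. <3,2,1,0>) to Reversed; all
// other entries fall through to None.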
14462
14463InstructionCost
14464BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14465 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14466 ArrayRef<Value *> VL = E->Scalars;
14467
14468 Type *ScalarTy = getValueType(VL[0]);
14469 if (!isValidElementType(ScalarTy))
14470 return InstructionCost::getInvalid();
14471 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14472
14473 // If we have computed a smaller type for the expression, update VecTy so
14474 // that the costs will be accurate.
14475 auto It = MinBWs.find(E);
14476 Type *OrigScalarTy = ScalarTy;
14477 if (It != MinBWs.end()) {
14478 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14479 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14480 if (VecTy)
14481 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14482 }
14483 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14484 unsigned EntryVF = E->getVectorFactor();
14485 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14486
14487 if (E->isGather()) {
14488 if (allConstant(VL))
14489 return 0;
14490 if (isa<InsertElementInst>(VL[0]))
14491 return InstructionCost::getInvalid();
14492 if (isa<CmpInst>(VL.front()))
14493 ScalarTy = VL.front()->getType();
14494 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14495 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14496 }
14497 if (E->State == TreeEntry::SplitVectorize) {
14498 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14499 "Expected exactly 2 combined entries.");
14500 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14501 InstructionCost VectorCost = 0;
14502 if (E->ReorderIndices.empty()) {
14503 VectorCost = ::getShuffleCost(
14504 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14505 E->CombinedEntriesWithIndices.back().second,
14507 ScalarTy,
14508 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14509 ->getVectorFactor()));
14510 } else {
14511 unsigned CommonVF =
14512 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14513 ->getVectorFactor(),
14514 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14515 ->getVectorFactor());
14516 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14517 getWidenedType(ScalarTy, CommonVF),
14518 E->getSplitMask(), CostKind);
14519 }
14520 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14521 return VectorCost;
14522 }
14523 InstructionCost CommonCost = 0;
14524 SmallVector<int> Mask;
14525 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14526 (E->State != TreeEntry::StridedVectorize ||
14527 !isReverseOrder(E->ReorderIndices))) {
14528 SmallVector<int> NewMask;
14529 if (E->getOpcode() == Instruction::Store) {
14530 // For stores the order is actually a mask.
14531 NewMask.resize(E->ReorderIndices.size());
14532 copy(E->ReorderIndices, NewMask.begin());
14533 } else {
14534 inversePermutation(E->ReorderIndices, NewMask);
14535 }
14536 ::addMask(Mask, NewMask);
14537 }
14538 if (!E->ReuseShuffleIndices.empty())
14539 ::addMask(Mask, E->ReuseShuffleIndices);
14540 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14541 CommonCost =
14542 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14543 assert((E->State == TreeEntry::Vectorize ||
14544 E->State == TreeEntry::ScatterVectorize ||
14545 E->State == TreeEntry::StridedVectorize ||
14546 E->State == TreeEntry::CompressVectorize) &&
14547 "Unhandled state");
14548 assert(E->getOpcode() &&
14549 ((allSameType(VL) && allSameBlock(VL)) ||
14550 (E->getOpcode() == Instruction::GetElementPtr &&
14551 E->getMainOp()->getType()->isPointerTy()) ||
14552 E->hasCopyableElements()) &&
14553 "Invalid VL");
14554 Instruction *VL0 = E->getMainOp();
14555 unsigned ShuffleOrOp =
14556 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14557 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14558 ShuffleOrOp = E->CombinedOp;
14559 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14560 const unsigned Sz = UniqueValues.size();
14561 SmallBitVector UsedScalars(Sz, false);
14562 for (unsigned I = 0; I < Sz; ++I) {
14563 if (isa<Instruction>(UniqueValues[I]) &&
14564 !E->isCopyableElement(UniqueValues[I]) &&
14565 getTreeEntries(UniqueValues[I]).front() == E)
14566 continue;
14567 UsedScalars.set(I);
14568 }
14569 auto GetCastContextHint = [&](Value *V) {
14570 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14571 return getCastContextHint(*OpTEs.front());
14572 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14573 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14574 !SrcState.isAltShuffle())
14575 return TTI::CastContextHint::GatherScatter;
14576 return TTI::CastContextHint::None;
14577 };
14578 auto GetCostDiff =
14579 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14580 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14581 // Calculate the cost of this instruction.
14582 InstructionCost ScalarCost = 0;
14583 if (isa<CastInst, CallInst>(VL0)) {
14584 // For some of the instructions no need to calculate cost for each
14585 // particular instruction, we can use the cost of the single
14586 // instruction x total number of scalar instructions.
14587 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14588 } else {
14589 for (unsigned I = 0; I < Sz; ++I) {
14590 if (UsedScalars.test(I))
14591 continue;
14592 ScalarCost += ScalarEltCost(I);
14593 }
14594 }
14595
14596 InstructionCost VecCost = VectorCost(CommonCost);
14597 // Check if the current node must be resized, if the parent node is not
14598 // resized.
14599 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14600 E->Idx != 0 &&
14601 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14602 const EdgeInfo &EI = E->UserTreeIndex;
14603 if (!EI.UserTE->hasState() ||
14604 EI.UserTE->getOpcode() != Instruction::Select ||
14605 EI.EdgeIdx != 0) {
14606 auto UserBWIt = MinBWs.find(EI.UserTE);
14607 Type *UserScalarTy =
14608 (EI.UserTE->isGather() ||
14609 EI.UserTE->State == TreeEntry::SplitVectorize)
14610 ? EI.UserTE->Scalars.front()->getType()
14611 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14612 if (UserBWIt != MinBWs.end())
14613 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14614 UserBWIt->second.first);
14615 if (ScalarTy != UserScalarTy) {
14616 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14617 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14618 unsigned VecOpcode;
14619 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14620 if (BWSz > SrcBWSz)
14621 VecOpcode = Instruction::Trunc;
14622 else
14623 VecOpcode =
14624 It->second.second ? Instruction::SExt : Instruction::ZExt;
14625 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14626 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14627 CostKind);
14628 }
14629 }
14630 }
14631 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14632 ScalarCost, "Calculated costs for Tree"));
14633 return VecCost - ScalarCost;
14634 };
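  // Usage note: each opcode case below supplies GetCostDiff with a
  // per-element scalar cost and a whole-vector cost callback; the returned
  // VecCost - ScalarCost is negative exactly when vectorizing this node is
  // expected to be profitable.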
14635 // Calculate cost difference from vectorizing set of GEPs.
14636 // Negative value means vectorizing is profitable.
14637 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14638 assert((E->State == TreeEntry::Vectorize ||
14639 E->State == TreeEntry::StridedVectorize ||
14640 E->State == TreeEntry::CompressVectorize) &&
14641 "Entry state expected to be Vectorize, StridedVectorize or "
14642 "MaskedLoadCompressVectorize here.");
14643 InstructionCost ScalarCost = 0;
14644 InstructionCost VecCost = 0;
14645 std::tie(ScalarCost, VecCost) = getGEPCosts(
14646 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14647 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14648 "Calculated GEPs cost for Tree"));
14649
14650 return VecCost - ScalarCost;
14651 };
14652
14653 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14654 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14655 if (MinMaxID == Intrinsic::not_intrinsic)
14656 return InstructionCost::getInvalid();
14657 Type *CanonicalType = Ty;
14658 if (CanonicalType->isPtrOrPtrVectorTy())
14659 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14660 CanonicalType->getContext(),
14661 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14662
14663 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14664 {CanonicalType, CanonicalType});
14665 InstructionCost IntrinsicCost =
14666 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14667 // If the selects are the only uses of the compares, they will be
14668 // dead and we can adjust the cost by removing their cost.
14669 if (VI && SelectOnly) {
14670 assert((!Ty->isVectorTy() || SLPReVec) &&
14671 "Expected only for scalar type.");
14672 auto *CI = cast<CmpInst>(VI->getOperand(0));
14673 IntrinsicCost -= TTI->getCmpSelInstrCost(
14674 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14675 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14676 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14677 }
14678 return IntrinsicCost;
14679 };
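  // Illustrative example: the scalar pattern
  //   %c = icmp slt i32 %a, %b
  //   %s = select i1 %c, i32 %a, i32 %b
  // maps to llvm.smin.i32; when the select is the compare's only user, the
  // compare's cost is deducted above because it dies after the rewrite.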
14680 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14681 Instruction *VI) {
14682 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14683 return Cost;
14684 };
14685 switch (ShuffleOrOp) {
14686 case Instruction::PHI: {
14687 // Count reused scalars.
14688 InstructionCost ScalarCost = 0;
14689 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14690 for (Value *V : UniqueValues) {
14691 auto *PHI = dyn_cast<PHINode>(V);
14692 if (!PHI)
14693 continue;
14694
14695 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14696 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14697 Value *Op = PHI->getIncomingValue(I);
14698 Operands[I] = Op;
14699 }
14700 if (const TreeEntry *OpTE =
14701 getSameValuesTreeEntry(Operands.front(), Operands))
14702 if (CountedOps.insert(OpTE).second &&
14703 !OpTE->ReuseShuffleIndices.empty())
14704 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14705 OpTE->Scalars.size());
14706 }
14707
14708 return CommonCost - ScalarCost;
14709 }
14710 case Instruction::ExtractValue:
14711 case Instruction::ExtractElement: {
14712 APInt DemandedElts;
14713 VectorType *SrcVecTy = nullptr;
14714 auto GetScalarCost = [&](unsigned Idx) {
14715 if (isa<PoisonValue>(UniqueValues[Idx]))
14716 return TTI::TCC_Free;
14717
14718 auto *I = cast<Instruction>(UniqueValues[Idx]);
14719 if (!SrcVecTy) {
14720 if (ShuffleOrOp == Instruction::ExtractElement) {
14721 auto *EE = cast<ExtractElementInst>(I);
14722 SrcVecTy = EE->getVectorOperandType();
14723 } else {
14724 auto *EV = cast<ExtractValueInst>(I);
14725 Type *AggregateTy = EV->getAggregateOperand()->getType();
14726 unsigned NumElts;
14727 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14728 NumElts = ATy->getNumElements();
14729 else
14730 NumElts = AggregateTy->getStructNumElements();
14731 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14732 }
14733 }
14734 if (I->hasOneUse()) {
14735 Instruction *Ext = I->user_back();
14736 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14737 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14738 // Use getExtractWithExtendCost() to calculate the cost of
14739 // extractelement/ext pair.
14740 InstructionCost Cost = TTI->getExtractWithExtendCost(
14741 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14742 CostKind);
14743 // Subtract the cost of s|zext which is subtracted separately.
14744 Cost -= TTI->getCastInstrCost(
14745 Ext->getOpcode(), Ext->getType(), I->getType(),
14746 TTI::CastContextHint::None, CostKind);
14747 return Cost;
14748 }
14749 }
14750 if (DemandedElts.isZero())
14751 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14752 DemandedElts.setBit(*getExtractIndex(I));
14753 return TTI::TCC_Free;
14754 };
14755 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14756 return CommonCost - (DemandedElts.isZero()
14757 ? TTI::TCC_Free
14758 : TTI.getScalarizationOverhead(
14759 SrcVecTy, DemandedElts, /*Insert=*/false,
14760 /*Extract=*/true, CostKind));
14761 };
14762 return GetCostDiff(GetScalarCost, GetVectorCost);
14763 }
14764 case Instruction::InsertElement: {
14765 assert(E->ReuseShuffleIndices.empty() &&
14766 "Unique insertelements only are expected.");
14767 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14768 unsigned const NumElts = SrcVecTy->getNumElements();
14769 unsigned const NumScalars = VL.size();
14770
14771 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14772
14773 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14774 unsigned OffsetBeg = *getElementIndex(VL.front());
14775 unsigned OffsetEnd = OffsetBeg;
14776 InsertMask[OffsetBeg] = 0;
14777 for (auto [I, V] : enumerate(VL.drop_front())) {
14778 unsigned Idx = *getElementIndex(V);
14779 if (OffsetBeg > Idx)
14780 OffsetBeg = Idx;
14781 else if (OffsetEnd < Idx)
14782 OffsetEnd = Idx;
14783 InsertMask[Idx] = I + 1;
14784 }
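      // Worked example (illustrative): inserts at indices 2..5 of an 8-wide
      // destination give OffsetBeg = 2 and OffsetEnd = 5, so the node can be
      // modeled as building a narrow subvector and inserting it, rather than
      // costing each scalar insert separately.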
14785 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14786 if (NumOfParts > 0 && NumOfParts < NumElts)
14787 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14788 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14789 VecScalarsSz;
14790 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14791 unsigned InsertVecSz = std::min<unsigned>(
14792 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14793 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14794 bool IsWholeSubvector =
14795 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14796 // Check if we can safely insert a subvector. If it is not possible, just
14797 // generate a whole-sized vector and shuffle the source vector and the new
14798 // subvector.
14799 if (OffsetBeg + InsertVecSz > VecSz) {
14800 // Align OffsetBeg to generate correct mask.
14801 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14802 InsertVecSz = VecSz;
14803 }
14804
14805 APInt DemandedElts = APInt::getZero(NumElts);
14806 // TODO: Add support for Instruction::InsertValue.
14807 SmallVector<int> Mask;
14808 if (!E->ReorderIndices.empty()) {
14809 inversePermutation(E->ReorderIndices, Mask);
14810 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14811 } else {
14812 Mask.assign(VecSz, PoisonMaskElem);
14813 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14814 }
14815 bool IsIdentity = true;
14816 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14817 Mask.swap(PrevMask);
14818 for (unsigned I = 0; I < NumScalars; ++I) {
14819 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14820 DemandedElts.setBit(InsertIdx);
14821 IsIdentity &= InsertIdx - OffsetBeg == I;
14822 Mask[InsertIdx - OffsetBeg] = I;
14823 }
14824 assert(Offset < NumElts && "Failed to find vector index offset");
14825
14826 InstructionCost Cost = 0;
14827 Cost -=
14828 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14829 /*Insert*/ true, /*Extract*/ false, CostKind);
14830
14831 // First cost - resize to actual vector size if not identity shuffle or
14832 // need to shift the vector.
14833 // Do not calculate the cost if the actual size is the register size and
14834 // we can merge this shuffle with the following SK_Select.
14835 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14836 if (!IsIdentity)
14837 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14838 InsertVecTy, Mask);
14839 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14840 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14841 }));
14842 // Second cost - permutation with subvector, if some elements are from the
14843 // initial vector or inserting a subvector.
14844 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14845 // subvector of ActualVecTy.
14846 SmallBitVector InMask =
14847 isUndefVector(FirstInsert->getOperand(0),
14848 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14849 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14850 if (InsertVecSz != VecSz) {
14851 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14852 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14853 CostKind, OffsetBeg - Offset, InsertVecTy);
14854 } else {
14855 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14856 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14857 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14858 I <= End; ++I)
14859 if (Mask[I] != PoisonMaskElem)
14860 Mask[I] = I + VecSz;
14861 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14862 Mask[I] =
14863 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14864 Cost +=
14865 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14866 }
14867 }
14868 return Cost;
14869 }
14870 case Instruction::ZExt:
14871 case Instruction::SExt:
14872 case Instruction::FPToUI:
14873 case Instruction::FPToSI:
14874 case Instruction::FPExt:
14875 case Instruction::PtrToInt:
14876 case Instruction::IntToPtr:
14877 case Instruction::SIToFP:
14878 case Instruction::UIToFP:
14879 case Instruction::Trunc:
14880 case Instruction::FPTrunc:
14881 case Instruction::BitCast: {
14882 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14883 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14884 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14885 unsigned Opcode = ShuffleOrOp;
14886 unsigned VecOpcode = Opcode;
14887 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14888 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14889 // Check if the values are candidates to demote.
14890 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14891 if (SrcIt != MinBWs.end()) {
14892 SrcBWSz = SrcIt->second.first;
14893 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14894 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14895 SrcVecTy =
14896 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14897 }
14898 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14899 if (BWSz == SrcBWSz) {
14900 VecOpcode = Instruction::BitCast;
14901 } else if (BWSz < SrcBWSz) {
14902 VecOpcode = Instruction::Trunc;
14903 } else if (It != MinBWs.end()) {
14904 assert(BWSz > SrcBWSz && "Invalid cast!");
14905 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14906 } else if (SrcIt != MinBWs.end()) {
14907 assert(BWSz > SrcBWSz && "Invalid cast!");
14908 VecOpcode =
14909 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14910 }
14911 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14912 !SrcIt->second.second) {
14913 VecOpcode = Instruction::UIToFP;
14914 }
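    // Illustrative example: if both this entry and its source operand were
    // demoted to the same width (say a zext i8 -> i32 where MinBWs records
    // i16 on both sides), VecOpcode becomes BitCast and the vector-cost
    // callback below returns just CommonCost for the now-no-op cast.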
14915 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14916 assert(Idx == 0 && "Expected 0 index only");
14917 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14918 VL0->getOperand(0)->getType(),
14919 TTI::CastContextHint::None, CostKind);
14920 };
14921 auto GetVectorCost = [=](InstructionCost CommonCost) {
14922 // Do not count cost here if minimum bitwidth is in effect and it is just
14923 // a bitcast (here it is just a noop).
14924 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14925 return CommonCost;
14926 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14927 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14928
14929 bool IsArithmeticExtendedReduction =
14930 E->Idx == 0 && UserIgnoreList &&
14931 all_of(*UserIgnoreList, [](Value *V) {
14932 auto *I = cast<Instruction>(V);
14933 return is_contained({Instruction::Add, Instruction::FAdd,
14934 Instruction::Mul, Instruction::FMul,
14935 Instruction::And, Instruction::Or,
14936 Instruction::Xor},
14937 I->getOpcode());
14938 });
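// Annotation (not in the source): e.g. for an add reduction of zext'ed
// values, the extension is expected to fold into an extended-reduction
// pattern, so only CommonCost is returned for the zext/sext below.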
14939 if (IsArithmeticExtendedReduction &&
14940 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14941 return CommonCost;
14942 return CommonCost +
14943 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14944 VecOpcode == Opcode ? VI : nullptr);
14945 };
14946 return GetCostDiff(GetScalarCost, GetVectorCost);
14947 }
14948 case Instruction::FCmp:
14949 case Instruction::ICmp:
14950 case Instruction::Select: {
14951 CmpPredicate VecPred, SwappedVecPred;
14952 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14953 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14954 match(VL0, MatchCmp))
14955 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14956 else
14957 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14958 ? CmpInst::BAD_FCMP_PREDICATE
14959 : CmpInst::BAD_ICMP_PREDICATE;
14960 auto GetScalarCost = [&](unsigned Idx) {
14961 if (isa<PoisonValue>(UniqueValues[Idx]))
14962 return InstructionCost(TTI::TCC_Free);
14963
14964 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14965 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14966 ? CmpInst::BAD_FCMP_PREDICATE
14967 : CmpInst::BAD_ICMP_PREDICATE;
14968 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14969 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14970 !match(VI, MatchCmp)) ||
14971 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14972 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14973 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14974 ? CmpInst::BAD_FCMP_PREDICATE
14975 : CmpInst::BAD_ICMP_PREDICATE;
14976
14977 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14978 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14979 CostKind, getOperandInfo(VI->getOperand(0)),
14980 getOperandInfo(VI->getOperand(1)), VI);
14981 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14982 if (IntrinsicCost.isValid())
14983 ScalarCost = IntrinsicCost;
14984
14985 return ScalarCost;
14986 };
14987 auto GetVectorCost = [&](InstructionCost CommonCost) {
14988 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14989
14990 InstructionCost VecCost =
14991 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14992 CostKind, getOperandInfo(E->getOperand(0)),
14993 getOperandInfo(E->getOperand(1)), VL0);
14994 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14995 auto *CondType =
14996 getWidenedType(SI->getCondition()->getType(), VL.size());
14997 unsigned CondNumElements = CondType->getNumElements();
14998 unsigned VecTyNumElements = getNumElements(VecTy);
14999 assert(VecTyNumElements >= CondNumElements &&
15000 VecTyNumElements % CondNumElements == 0 &&
15001 "Cannot vectorize Instruction::Select");
15002 if (CondNumElements != VecTyNumElements) {
15003 // When the return type is i1 but the source is fixed vector type, we
15004 // need to duplicate the condition value.
15005 VecCost += ::getShuffleCost(
15006 *TTI, TTI::SK_PermuteSingleSrc, CondType,
15007 createReplicatedMask(VecTyNumElements / CondNumElements,
15008 CondNumElements));
15009 }
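// Annotation (not in the source): e.g. with an 8-element result and a
// 2-element condition, createReplicatedMask produces <0,0,0,0,1,1,1,1>,
// repeating each condition lane 4 times.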
15010 }
15011 return VecCost + CommonCost;
15012 };
15013 return GetCostDiff(GetScalarCost, GetVectorCost);
15014 }
15015 case TreeEntry::MinMax: {
15016 auto GetScalarCost = [&](unsigned Idx) {
15017 return GetMinMaxCost(OrigScalarTy);
15018 };
15019 auto GetVectorCost = [&](InstructionCost CommonCost) {
15020 InstructionCost VecCost = GetMinMaxCost(VecTy);
15021 return VecCost + CommonCost;
15022 };
15023 return GetCostDiff(GetScalarCost, GetVectorCost);
15024 }
15025 case TreeEntry::FMulAdd: {
15026 auto GetScalarCost = [&](unsigned Idx) {
15027 if (isa<PoisonValue>(UniqueValues[Idx]))
15028 return InstructionCost(TTI::TCC_Free);
15029 return GetFMulAddCost(E->getOperations(),
15030 cast<Instruction>(UniqueValues[Idx]));
15031 };
15032 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15033 FastMathFlags FMF;
15034 FMF.set();
15035 for (Value *V : E->Scalars) {
15036 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
15037 FMF &= FPCI->getFastMathFlags();
15038 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
15039 FMF &= FPCIOp->getFastMathFlags();
15040 }
15041 }
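// Annotation (not in the source): for scalars of the form
// fadd(fmul(a, b), c) the vector form is llvm.fmuladd(a, b, c); the flags
// computed above are the intersection of each scalar's flags with the
// flags of its feeding fmul.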
15042 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15043 {VecTy, VecTy, VecTy}, FMF);
15044 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15045 return VecCost + CommonCost;
15046 };
15047 return GetCostDiff(GetScalarCost, GetVectorCost);
15048 }
15049 case Instruction::FNeg:
15050 case Instruction::Add:
15051 case Instruction::FAdd:
15052 case Instruction::Sub:
15053 case Instruction::FSub:
15054 case Instruction::Mul:
15055 case Instruction::FMul:
15056 case Instruction::UDiv:
15057 case Instruction::SDiv:
15058 case Instruction::FDiv:
15059 case Instruction::URem:
15060 case Instruction::SRem:
15061 case Instruction::FRem:
15062 case Instruction::Shl:
15063 case Instruction::LShr:
15064 case Instruction::AShr:
15065 case Instruction::And:
15066 case Instruction::Or:
15067 case Instruction::Xor: {
15068 auto GetScalarCost = [&](unsigned Idx) {
15069 if (isa<PoisonValue>(UniqueValues[Idx]))
15070 return InstructionCost(TTI::TCC_Free);
15071
15072 // We cannot retrieve the operand from UniqueValues[Idx] because an
15073 // interchangeable instruction may be used. The order and the actual
15074 // operand might differ from what is retrieved from UniqueValues[Idx].
15075 Value *Op1 = E->getOperand(0)[Idx];
15076 Value *Op2;
15077 SmallVector<const Value *, 2> Operands(1, Op1);
15078 if (isa<UnaryOperator>(UniqueValues[Idx])) {
15079 Op2 = Op1;
15080 } else {
15081 Op2 = E->getOperand(1)[Idx];
15082 Operands.push_back(Op2);
15083 }
15086 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15087 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
15088 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
15089 I && (ShuffleOrOp == Instruction::FAdd ||
15090 ShuffleOrOp == Instruction::FSub)) {
15091 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15092 if (IntrinsicCost.isValid())
15093 ScalarCost = IntrinsicCost;
15094 }
15095 return ScalarCost;
15096 };
15097 auto GetVectorCost = [=](InstructionCost CommonCost) {
15098 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15099 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15100 ArrayRef<Value *> Ops = E->getOperand(I);
15101 if (all_of(Ops, [&](Value *Op) {
15102 auto *CI = dyn_cast<ConstantInt>(Op);
15103 return CI && CI->getValue().countr_one() >= It->second.first;
15104 }))
15105 return CommonCost;
15106 }
15107 }
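// Annotation (not in the source): e.g. if the node is demoted to i16, an
// 'and' with constants such as 0xFFFF (at least 16 trailing ones) cannot
// change the demanded bits, so the vector 'and' is free and only
// CommonCost is returned by the early exit above.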
15108 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
15109 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
15110 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
15111 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
15112 Op2Info, {}, nullptr, TLI) +
15113 CommonCost;
15114 };
15115 return GetCostDiff(GetScalarCost, GetVectorCost);
15116 }
15117 case Instruction::GetElementPtr: {
15118 return CommonCost + GetGEPCostDiff(VL, VL0);
15119 }
15120 case Instruction::Load: {
15121 auto GetScalarCost = [&](unsigned Idx) {
15122 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
15123 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15124 VI->getAlign(), VI->getPointerAddressSpace(),
15125 CostKind, TTI::OperandValueInfo(), VI);
15126 };
15127 auto *LI0 = cast<LoadInst>(VL0);
15128 auto GetVectorCost = [&](InstructionCost CommonCost) {
15129 InstructionCost VecLdCost;
15130 switch (E->State) {
15131 case TreeEntry::Vectorize:
15132 if (unsigned Factor = E->getInterleaveFactor()) {
15133 VecLdCost = TTI->getInterleavedMemoryOpCost(
15134 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15135 LI0->getPointerAddressSpace(), CostKind);
15136
15137 } else {
15138 VecLdCost = TTI->getMemoryOpCost(
15139 Instruction::Load, VecTy, LI0->getAlign(),
15140 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15141 }
15142 break;
15143 case TreeEntry::StridedVectorize: {
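// Annotation (not in the source): strided loads cover patterns like
// a[0], a[4], a[8], ... and are costed as a single
// llvm.experimental.vp.strided.load of the widened type.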
15144 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
15145 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15146 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15147 Align CommonAlignment =
15148 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15149 VecLdCost = TTI->getMemIntrinsicInstrCost(
15150 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15151 StridedLoadTy, LI0->getPointerOperand(),
15152 /*VariableMask=*/false, CommonAlignment),
15153 CostKind);
15154 if (StridedLoadTy != VecTy)
15155 VecLdCost +=
15156 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15157 getCastContextHint(*E), CostKind);
15158
15159 break;
15160 }
15161 case TreeEntry::CompressVectorize: {
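// Annotation (not in the source): e.g. loads of a[0], a[2], a[4], a[6]
// can be covered by one (possibly masked) wide load followed by a
// compressing shuffle with mask <0,2,4,6>.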
15162 bool IsMasked;
15163 unsigned InterleaveFactor;
15164 SmallVector<int> CompressMask;
15165 VectorType *LoadVecTy;
15166 SmallVector<Value *> Scalars(VL);
15167 if (!E->ReorderIndices.empty()) {
15168 SmallVector<int> Mask(E->ReorderIndices.begin(),
15169 E->ReorderIndices.end());
15170 reorderScalars(Scalars, Mask);
15171 }
15172 SmallVector<Value *> PointerOps(Scalars.size());
15173 for (auto [I, V] : enumerate(Scalars))
15174 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15175 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15176 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15177 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15178 CompressMask, LoadVecTy);
15179 assert(IsVectorized && "Failed to vectorize load");
15180 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15181 InterleaveFactor, IsMasked);
15182 Align CommonAlignment = LI0->getAlign();
15183 if (InterleaveFactor) {
15184 VecLdCost = TTI->getInterleavedMemoryOpCost(
15185 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15186 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15187 } else if (IsMasked) {
15188 VecLdCost = TTI->getMemIntrinsicInstrCost(
15189 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15190 CommonAlignment,
15191 LI0->getPointerAddressSpace()),
15192 CostKind);
15193 // TODO: include this cost into CommonCost.
15194 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15195 LoadVecTy, CompressMask, CostKind);
15196 } else {
15197 VecLdCost = TTI->getMemoryOpCost(
15198 Instruction::Load, LoadVecTy, CommonAlignment,
15199 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15200 // TODO: include this cost into CommonCost.
15201 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15202 LoadVecTy, CompressMask, CostKind);
15203 }
15204 break;
15205 }
15206 case TreeEntry::ScatterVectorize: {
15207 Align CommonAlignment =
15208 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15209 VecLdCost = TTI->getMemIntrinsicInstrCost(
15210 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15211 LI0->getPointerOperand(),
15212 /*VariableMask=*/false, CommonAlignment),
15213 CostKind);
15214 break;
15215 }
15216 case TreeEntry::CombinedVectorize:
15217 case TreeEntry::SplitVectorize:
15218 case TreeEntry::NeedToGather:
15219 llvm_unreachable("Unexpected vectorization state.");
15220 }
15221 return VecLdCost + CommonCost;
15222 };
15223
15224 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15225 // If this node generates a masked gather load, it is not a terminal node;
15226 // hence the address operand cost is estimated separately.
15227 if (E->State == TreeEntry::ScatterVectorize)
15228 return Cost;
15229
15230 // Estimate the cost of GEPs since this tree node is a terminal node.
15231 SmallVector<Value *> PointerOps(VL.size());
15232 for (auto [I, V] : enumerate(VL))
15233 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15234 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15235 }
15236 case Instruction::Store: {
15237 bool IsReorder = !E->ReorderIndices.empty();
15238 auto GetScalarCost = [=](unsigned Idx) {
15239 auto *VI = cast<StoreInst>(VL[Idx]);
15240 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15241 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15242 VI->getAlign(), VI->getPointerAddressSpace(),
15243 CostKind, OpInfo, VI);
15244 };
15245 auto *BaseSI =
15246 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15247 auto GetVectorCost = [=](InstructionCost CommonCost) {
15248 // We know that we can merge the stores. Calculate the cost.
15249 InstructionCost VecStCost;
15250 if (E->State == TreeEntry::StridedVectorize) {
15251 Align CommonAlignment =
15252 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15253 VecStCost = TTI->getMemIntrinsicInstrCost(
15254 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15255 VecTy, BaseSI->getPointerOperand(),
15256 /*VariableMask=*/false, CommonAlignment),
15257 CostKind);
15258 } else {
15259 assert(E->State == TreeEntry::Vectorize &&
15260 "Expected either strided or consecutive stores.");
15261 if (unsigned Factor = E->getInterleaveFactor()) {
15262 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15263 "No reused shuffles expected");
15264 CommonCost = 0;
15265 VecStCost = TTI->getInterleavedMemoryOpCost(
15266 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15267 BaseSI->getPointerAddressSpace(), CostKind);
15268 } else {
15269 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15270 VecStCost = TTI->getMemoryOpCost(
15271 Instruction::Store, VecTy, BaseSI->getAlign(),
15272 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15273 }
15274 }
15275 return VecStCost + CommonCost;
15276 };
15277 SmallVector<Value *> PointerOps(VL.size());
15278 for (auto [I, V] : enumerate(VL)) {
15279 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15280 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15281 }
15282
15283 return GetCostDiff(GetScalarCost, GetVectorCost) +
15284 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15285 }
15286 case Instruction::Call: {
15287 auto GetScalarCost = [&](unsigned Idx) {
15288 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15289 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15290 if (ID != Intrinsic::not_intrinsic) {
15291 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15292 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15293 }
15294 return TTI->getCallInstrCost(CI->getCalledFunction(),
15295 CI->getFunctionType()->getReturnType(),
15296 CI->getFunctionType()->params(), CostKind);
15297 };
15298 auto GetVectorCost = [=](InstructionCost CommonCost) {
15299 auto *CI = cast<CallInst>(VL0);
15300 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15301 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15302 CI, ID, VecTy->getNumElements(),
15303 It != MinBWs.end() ? It->second.first : 0, TTI);
15304 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15305 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15306 };
15307 return GetCostDiff(GetScalarCost, GetVectorCost);
15308 }
15309 case Instruction::ShuffleVector: {
15310 if (!SLPReVec || E->isAltShuffle())
15311 assert(E->isAltShuffle() &&
15312 ((Instruction::isBinaryOp(E->getOpcode()) &&
15313 Instruction::isBinaryOp(E->getAltOpcode())) ||
15314 (Instruction::isCast(E->getOpcode()) &&
15315 Instruction::isCast(E->getAltOpcode())) ||
15316 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15317 "Invalid Shuffle Vector Operand");
15318 // Try to find the previous shuffle node with the same operands and same
15319 // main/alternate ops.
15320 auto TryFindNodeWithEqualOperands = [=]() {
15321 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15322 if (TE.get() == E)
15323 break;
15324 if (TE->hasState() && TE->isAltShuffle() &&
15325 ((TE->getOpcode() == E->getOpcode() &&
15326 TE->getAltOpcode() == E->getAltOpcode()) ||
15327 (TE->getOpcode() == E->getAltOpcode() &&
15328 TE->getAltOpcode() == E->getOpcode())) &&
15329 TE->hasEqualOperands(*E))
15330 return true;
15331 }
15332 return false;
15333 };
15334 auto GetScalarCost = [&](unsigned Idx) {
15335 if (isa<PoisonValue>(UniqueValues[Idx]))
15336 return InstructionCost(TTI::TCC_Free);
15337
15338 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15339 assert(E->getMatchingMainOpOrAltOp(VI) &&
15340 "Unexpected main/alternate opcode");
15341 (void)E;
15342 return TTI->getInstructionCost(VI, CostKind);
15343 };
15344 // Need to clear CommonCost since the final shuffle cost is included into
15345 // vector cost.
15346 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15347 // VecCost is equal to sum of the cost of creating 2 vectors
15348 // and the cost of creating shuffle.
15349 InstructionCost VecCost = 0;
15350 if (TryFindNodeWithEqualOperands()) {
15351 LLVM_DEBUG({
15352 dbgs() << "SLP: diamond match for alternate node found.\n";
15353 E->dump();
15354 });
15355 // No need to add new vector costs here since we're going to reuse
15356 // same main/alternate vector ops, just do different shuffling.
15357 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15358 VecCost =
15359 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15360 VecCost +=
15361 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15362 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15363 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15364 VecCost = TTIRef.getCmpSelInstrCost(
15365 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15366 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15367 VL0);
15368 VecCost += TTIRef.getCmpSelInstrCost(
15369 E->getOpcode(), VecTy, MaskTy,
15370 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15371 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15372 E->getAltOp());
15373 } else {
15374 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15375 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15376 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15377 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15378 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15379 unsigned SrcBWSz =
15380 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15381 if (SrcIt != MinBWs.end()) {
15382 SrcBWSz = SrcIt->second.first;
15383 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15384 SrcTy = getWidenedType(SrcSclTy, VL.size());
15385 }
15386 if (BWSz <= SrcBWSz) {
15387 if (BWSz < SrcBWSz)
15388 VecCost =
15389 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15390 TTI::CastContextHint::None, CostKind);
15391 LLVM_DEBUG({
15392 dbgs()
15393 << "SLP: alternate extension, which should be truncated.\n";
15394 E->dump();
15395 });
15396 return VecCost;
15397 }
15398 }
15399 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15400 TTI::CastContextHint::None, CostKind);
15401 VecCost +=
15402 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15403 TTI::CastContextHint::None, CostKind);
15404 }
15405 SmallVector<int> Mask;
15406 E->buildAltOpShuffleMask(
15407 [&](Instruction *I) {
15408 assert(E->getMatchingMainOpOrAltOp(I) &&
15409 "Unexpected main/alternate opcode");
15410 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15411 *TLI);
15412 },
15413 Mask);
15414 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15415 FinalVecTy, Mask, CostKind);
15416 // Patterns like [fadd,fsub] can be combined into a single instruction
15417 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15418 // need to take their order into account when looking for the most used
15419 // order.
15420 unsigned Opcode0 = E->getOpcode();
15421 unsigned Opcode1 = E->getAltOpcode();
15422 SmallBitVector OpcodeMask(
15423 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15424 // If this pattern is supported by the target then we consider the
15425 // order.
15426 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15427 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15428 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15429 return AltVecCost < VecCost ? AltVecCost : VecCost;
15430 }
15431 // TODO: Check the reverse order too.
15432 return VecCost;
15433 };
15434 if (SLPReVec && !E->isAltShuffle())
15435 return GetCostDiff(
15436 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15437 // If a group uses mask in order, the shufflevector can be
15438 // eliminated by instcombine. Then the cost is 0.
15439 assert(isa<ShuffleVectorInst>(VL.front()) &&
15440 "Not supported shufflevector usage.");
15441 auto *SV = cast<ShuffleVectorInst>(VL.front());
15442 unsigned SVNumElements =
15443 cast<FixedVectorType>(SV->getOperand(0)->getType())
15444 ->getNumElements();
15445 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15446 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15447 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15448 int NextIndex = 0;
15449 if (!all_of(Group, [&](Value *V) {
15450 assert(isa<ShuffleVectorInst>(V) &&
15451 "Not supported shufflevector usage.");
15452 auto *SV = cast<ShuffleVectorInst>(V);
15453 int Index;
15454 [[maybe_unused]] bool IsExtractSubvectorMask =
15455 SV->isExtractSubvectorMask(Index);
15456 assert(IsExtractSubvectorMask &&
15457 "Not supported shufflevector usage.");
15458 if (NextIndex != Index)
15459 return false;
15460 NextIndex += SV->getShuffleMask().size();
15461 return true;
15462 }))
15463 return ::getShuffleCost(
15464 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15465 calculateShufflevectorMask(E->Scalars));
15466 }
15467 return TTI::TCC_Free;
15468 });
15469 return GetCostDiff(GetScalarCost, GetVectorCost);
15470 }
15471 case Instruction::Freeze:
15472 return CommonCost;
15473 default:
15474 llvm_unreachable("Unknown instruction");
15475 }
15476}
15477
15478 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15479 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15480 << VectorizableTree.size() << " is fully vectorizable.\n");
15481
15482 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15483 SmallVector<int> Mask;
15484 return TE->isGather() &&
15485 !any_of(TE->Scalars,
15486 [this](Value *V) { return EphValues.contains(V); }) &&
15487 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15488 TE->Scalars.size() < Limit ||
15489 (((TE->hasState() &&
15490 TE->getOpcode() == Instruction::ExtractElement) ||
15491 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15492 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15493 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15494 !TE->isAltShuffle()) ||
15495 any_of(TE->Scalars, IsaPred<LoadInst>));
15496 };
15497
15498 // We only handle trees of heights 1 and 2.
15499 if (VectorizableTree.size() == 1 &&
15500 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15501 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15502 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15503 (ForReduction &&
15504 AreVectorizableGathers(VectorizableTree[0].get(),
15505 VectorizableTree[0]->Scalars.size()) &&
15506 VectorizableTree[0]->getVectorFactor() > 2)))
15507 return true;
15508
15509 if (VectorizableTree.size() != 2)
15510 return false;
15511
15512 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15513 // with a second gather node if it has fewer scalar operands than the
15514 // initial tree element (it may be profitable to shuffle the second gather)
15515 // or its scalars are extractelements that form a shuffle.
15516 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15517 AreVectorizableGathers(VectorizableTree[1].get(),
15518 VectorizableTree[0]->Scalars.size()))
15519 return true;
15520
15521 // Gathering cost would be too much for tiny trees.
15522 if (VectorizableTree[0]->isGather() ||
15523 (VectorizableTree[1]->isGather() &&
15524 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15525 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15526 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15527 return false;
15528
15529 return true;
15530}
15531
15532static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15533 TargetTransformInfo *TTI,
15534 bool MustMatchOrInst) {
15535 // Look past the root to find a source value. Arbitrarily follow the
15536 // path through operand 0 of any 'or'. Also, peek through optional
15537 // shift-left-by-multiple-of-8-bits.
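// A typical candidate (illustrative IR, not from the source), merging
// loaded bytes into a wider integer:
//   %z0 = zext i8 %b0 to i32
//   %z1 = zext i8 %b1 to i32
//   %s1 = shl i32 %z1, 8
//   %or = or i32 %s1, %z0
// Walking operand 0 of each 'or'/'shl' eventually reaches a zext'ed load.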
15538 Value *ZextLoad = Root;
15539 const APInt *ShAmtC;
15540 bool FoundOr = false;
15541 while (!isa<ConstantExpr>(ZextLoad) &&
15542 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15543 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15544 ShAmtC->urem(8) == 0))) {
15545 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15546 ZextLoad = BinOp->getOperand(0);
15547 if (BinOp->getOpcode() == Instruction::Or)
15548 FoundOr = true;
15549 }
15550 // Check if the input is an extended load of the required or/shift expression.
15551 Value *Load;
15552 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15553 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15554 return false;
15555
15556 // Require that the total load bit width is a legal integer type.
15557 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15558 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15559 Type *SrcTy = Load->getType();
15560 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15561 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15562 return false;
15563
15564 // Everything matched - assume that we can fold the whole sequence using
15565 // load combining.
15566 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15567 << *(cast<Instruction>(Root)) << "\n");
15568
15569 return true;
15570}
15571
15572 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15573 if (RdxKind != RecurKind::Or)
15574 return false;
15575
15576 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15577 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15578 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15579 /*MustMatchOrInst=*/false);
15580}
15581
15582 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15583 // Peek through a final sequence of stores and check if all operations are
15584 // likely to be load-combined.
15585 unsigned NumElts = Stores.size();
15586 for (Value *Scalar : Stores) {
15587 Value *X;
15588 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15589 !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
15590 return false;
15591 }
15592 return true;
15593}
15594
15595 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15596 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15597 return true;
15598
15599 // Graph is empty - do nothing.
15600 if (VectorizableTree.empty()) {
15601 assert(ExternalUses.empty() && "We shouldn't have any external users");
15602
15603 return true;
15604 }
15605
15606 // No need to vectorize inserts of gathered values.
15607 if (VectorizableTree.size() == 2 &&
15608 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15609 VectorizableTree[1]->isGather() &&
15610 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15611 !(isSplat(VectorizableTree[1]->Scalars) ||
15612 allConstant(VectorizableTree[1]->Scalars))))
15613 return true;
15614
15615 // If the graph includes only PHI nodes and gathers, it is definitely not
15616 // profitable for vectorization; we can skip it, if the cost threshold is
15617 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15618 // gathers/buildvectors.
15619 constexpr int Limit = 4;
15620 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15621 !VectorizableTree.empty() &&
15622 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15623 return (TE->isGather() &&
15624 (!TE->hasState() ||
15625 TE->getOpcode() != Instruction::ExtractElement) &&
15626 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15627 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15628 }))
15629 return true;
15630
15631 // Do not vectorize small tree of phis only, if all vector phis are also
15632 // gathered.
15633 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15634 VectorizableTree.size() <= Limit &&
15635 all_of(VectorizableTree,
15636 [&](const std::unique_ptr<TreeEntry> &TE) {
15637 return (TE->isGather() &&
15638 (!TE->hasState() ||
15639 TE->getOpcode() != Instruction::ExtractElement) &&
15640 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15641 Limit) ||
15642 (TE->hasState() &&
15643 (TE->getOpcode() == Instruction::InsertElement ||
15644 (TE->getOpcode() == Instruction::PHI &&
15645 all_of(TE->Scalars, [&](Value *V) {
15646 return isa<PoisonValue>(V) || MustGather.contains(V);
15647 }))));
15648 }) &&
15649 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15650 return TE->State == TreeEntry::Vectorize &&
15651 TE->getOpcode() == Instruction::PHI;
15652 }))
15653 return true;
15654
15655 // If the tree contains only phis, buildvectors, split nodes and
15656 // small nodes with reuses, we can skip it.
15657 SmallVector<const TreeEntry *> StoreLoadNodes;
15658 unsigned NumGathers = 0;
15659 constexpr int LimitTreeSize = 36;
15660 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15661 all_of(VectorizableTree,
15662 [&](const std::unique_ptr<TreeEntry> &TE) {
15663 if (!TE->isGather() && TE->hasState() &&
15664 (TE->getOpcode() == Instruction::Load ||
15665 TE->getOpcode() == Instruction::Store)) {
15666 StoreLoadNodes.push_back(TE.get());
15667 return true;
15668 }
15669 if (TE->isGather())
15670 ++NumGathers;
15671 return TE->State == TreeEntry::SplitVectorize ||
15672 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15673 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15674 VectorizableTree.size() > LimitTreeSize) ||
15675 (TE->isGather() &&
15676 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15677 (TE->hasState() &&
15678 (TE->getOpcode() == Instruction::PHI ||
15679 (TE->hasCopyableElements() &&
15680 static_cast<unsigned>(count_if(
15681 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15682 TE->Scalars.size() / 2) ||
15683 ((!TE->ReuseShuffleIndices.empty() ||
15684 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15685 TE->Scalars.size() == 2)));
15686 }) &&
15687 (StoreLoadNodes.empty() ||
15688 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15689 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15690 return TE->getOpcode() == Instruction::Store ||
15691 all_of(TE->Scalars, [&](Value *V) {
15692 return !isa<LoadInst>(V) ||
15693 areAllUsersVectorized(cast<Instruction>(V));
15694 });
15695 })))))
15696 return true;
15697
15698 // If the tree contains only a buildvector, 2 non-buildvector nodes (whose
15699 // user is the root tree node) and other buildvectors, we can skip it.
15700 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15701 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15702 VectorizableTree.size() >= Limit &&
15703 count_if(ArrayRef(VectorizableTree).drop_front(),
15704 [&](const std::unique_ptr<TreeEntry> &TE) {
15705 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15706 TE->UserTreeIndex.UserTE->Idx == 0;
15707 }) == 2)
15708 return true;
15709
15710 // If the tree contains only the vectorization of a phi node fed by
15711 // buildvectors, skip it.
15712 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15713 VectorizableTree.size() > 2 &&
15714 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15715 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15716 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15717 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15718 all_of(
15719 ArrayRef(VectorizableTree).drop_front(2),
15720 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15721 return true;
15722
15723 // We can vectorize the tree if its size is greater than or equal to the
15724 // minimum size specified by the MinTreeSize command line option.
15725 if (VectorizableTree.size() >= MinTreeSize)
15726 return false;
15727
15728 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15729 // can vectorize it if we can prove it fully vectorizable.
15730 if (isFullyVectorizableTinyTree(ForReduction))
15731 return false;
15732
15733 // Check if any of the gather nodes forms an insertelement buildvector
15734 // somewhere.
15735 bool IsAllowedSingleBVNode =
15736 VectorizableTree.size() > 1 ||
15737 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15738 !VectorizableTree.front()->isAltShuffle() &&
15739 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15740 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15741 allSameBlock(VectorizableTree.front()->Scalars));
15742 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15743 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15744 return isa<ExtractElementInst, Constant>(V) ||
15745 (IsAllowedSingleBVNode &&
15746 !V->hasNUsesOrMore(UsesLimit) &&
15747 any_of(V->users(), IsaPred<InsertElementInst>));
15748 });
15749 }))
15750 return false;
15751
15752 if (VectorizableTree.back()->isGather() &&
15753 VectorizableTree.back()->hasState() &&
15754 VectorizableTree.back()->isAltShuffle() &&
15755 VectorizableTree.back()->getVectorFactor() > 2 &&
15756 allSameBlock(VectorizableTree.back()->Scalars) &&
15757 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15758 TTI->getScalarizationOverhead(
15759 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15760 VectorizableTree.back()->getVectorFactor()),
15761 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15762 /*Insert=*/true, /*Extract=*/false,
15763 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
15764 return false;
15765
15766 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15767 // vectorizable.
15768 return true;
15769}
15770
15771 bool BoUpSLP::isTreeNotExtendable() const {
15772 if (getCanonicalGraphSize() != getTreeSize()) {
15773 constexpr unsigned SmallTree = 3;
15774 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15775 getCanonicalGraphSize() <= SmallTree &&
15776 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15777 [](const std::unique_ptr<TreeEntry> &TE) {
15778 return TE->isGather() && TE->hasState() &&
15779 TE->getOpcode() == Instruction::Load &&
15780 !allSameBlock(TE->Scalars);
15781 }) == 1)
15782 return true;
15783 return false;
15784 }
15785 bool Res = false;
15786 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15787 TreeEntry &E = *VectorizableTree[Idx];
15788 if (E.State == TreeEntry::SplitVectorize)
15789 return false;
15790 if (!E.isGather())
15791 continue;
15792 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15793 (!E.hasState() &&
15794 all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
15795 (isa<ExtractElementInst>(E.Scalars.front()) &&
15796 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15797 return false;
15798 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15799 continue;
15800 Res = true;
15801 }
15802 return Res;
15803}
15804
15805 InstructionCost BoUpSLP::getSpillCost() {
15806 // Walk from the bottom of the tree to the top, tracking which values are
15807 // live. When we see a call instruction that is not part of our tree,
15808 // query TTI to see if there is a cost to keeping values live over it
15809 // (for example, if spills and fills are required).
15810
15811 const TreeEntry *Root = VectorizableTree.front().get();
15812 if (Root->isGather())
15813 return 0;
15814
15815 InstructionCost Cost = 0;
15816 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15817 EntriesToOperands;
15818 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15819 SmallPtrSet<const Instruction *, 8> LastInstructions;
15820 for (const auto &TEPtr : VectorizableTree) {
15821 if (!TEPtr->isGather()) {
15822 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15823 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15824 LastInstructions.insert(LastInst);
15825 }
15826 if (TEPtr->UserTreeIndex)
15827 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15828 }
15829
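// Annotation (not in the source): the lambda below treats an intrinsic
// call as harmless for spill purposes if it is assume-like or if lowering
// it as an intrinsic is cheaper than the equivalent call, i.e. it is
// expected to be expanded inline.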
15830 auto NoCallIntrinsic = [this](const Instruction *I) {
15831 const auto *II = dyn_cast<IntrinsicInst>(I);
15832 if (!II)
15833 return false;
15834 if (II->isAssumeLikeIntrinsic())
15835 return true;
15836 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15837 InstructionCost IntrCost =
15838 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15839 InstructionCost CallCost = TTI->getCallInstrCost(
15840 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15841 return IntrCost < CallCost;
15842 };
15843
15844 // Maps the last instruction of an entry to the last instruction of one of
15845 // its operand entries, plus a flag. If the flag is true, there are no calls
15846 // in between these instructions.
15847 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15848 CheckedInstructions;
15849 unsigned Budget = 0;
15850 const unsigned BudgetLimit =
15851 ScheduleRegionSizeBudget / VectorizableTree.size();
15852 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15853 const Instruction *Last) {
15854 assert(First->getParent() == Last->getParent() &&
15855 "Expected instructions in same block.");
15856 if (auto It = CheckedInstructions.find(Last);
15857 It != CheckedInstructions.end()) {
15858 const Instruction *Checked = It->second.getPointer();
15859 if (Checked == First || Checked->comesBefore(First))
15860 return It->second.getInt() != 0;
15861 Last = Checked;
15862 } else if (Last == First || Last->comesBefore(First)) {
15863 return true;
15864 }
15865 BasicBlock::reverse_iterator InstIt =
15866 ++First->getIterator().getReverse(),
15867 PrevInstIt =
15868 Last->getIterator().getReverse();
15869 SmallVector<const Instruction *> LastInstsInRange;
15870 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15871 // Debug information does not impact spill cost.
15872 // Vectorized calls, represented as vector intrinsics, do not impact spill
15873 // cost.
15874 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15875 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15876 for (const Instruction *LastInst : LastInstsInRange)
15877 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15878 return false;
15879 }
15880 if (LastInstructions.contains(&*PrevInstIt))
15881 LastInstsInRange.push_back(&*PrevInstIt);
15882
15883 ++PrevInstIt;
15884 ++Budget;
15885 }
15886 for (const Instruction *LastInst : LastInstsInRange)
15887 CheckedInstructions.try_emplace(
15888 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15889 Budget <= BudgetLimit ? 1 : 0);
15890 return Budget <= BudgetLimit;
15891 };
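// Annotation (not in the source): the lambda below adds the spill/fill
// cost of keeping the operand entry's widened vector type live across a
// call; for revectorized (ReVec) entries the keep-alive cost of the
// original small vectors is subtracted.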
15892 auto AddCosts = [&](const TreeEntry *Op) {
15893 Type *ScalarTy = Op->Scalars.front()->getType();
15894 auto It = MinBWs.find(Op);
15895 if (It != MinBWs.end())
15896 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15897 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15898 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15899 if (ScalarTy->isVectorTy()) {
15900 // Handle revec dead vector instructions.
15901 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15902 }
15903 };
15904 // Memoize the relationship between blocks, i.e. whether there is (at least
15905 // one) non-vectorized call between the blocks. This allows us to skip the
15906 // analysis of the same block paths multiple times.
15907 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15908 ParentOpParentToPreds;
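// Annotation (not in the source): the lambda below walks the CFG
// backwards from Root (or the given phi predecessor) towards OpParent,
// checking every intervening block for non-vectorized calls.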
15909 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15910 BasicBlock *OpParent) {
15911 auto Key = std::make_pair(Root, OpParent);
15912 if (auto It = ParentOpParentToPreds.find(Key);
15913 It != ParentOpParentToPreds.end())
15914 return It->second;
15915 SmallVector<BasicBlock *> Worklist;
15916 if (Pred)
15917 Worklist.push_back(Pred);
15918 else
15919 Worklist.append(pred_begin(Root), pred_end(Root));
15920 SmallPtrSet<BasicBlock *, 16> Visited;
15921 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 16>
15922 ParentsPairsToAdd;
15923 bool Res = false;
15924 auto Cleanup = make_scope_exit([&]() {
15925 for (const auto &KeyPair : ParentsPairsToAdd) {
15926 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15927 "Should not have been added before.");
15928 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15929 }
15930 });
15931 while (!Worklist.empty()) {
15932 BasicBlock *BB = Worklist.pop_back_val();
15933 if (BB == OpParent || !Visited.insert(BB).second)
15934 continue;
15935 auto Pair = std::make_pair(BB, OpParent);
15936 if (auto It = ParentOpParentToPreds.find(Pair);
15937 It != ParentOpParentToPreds.end()) {
15938 Res = It->second;
15939 return Res;
15940 }
15941 ParentsPairsToAdd.insert(Pair);
15942 unsigned BlockSize = BB->size();
15943 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15944 return Res;
15945 Budget += BlockSize;
15946 if (Budget > BudgetLimit)
15947 return Res;
15948 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15949 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15950 BB->getTerminator()))
15951 return Res;
15952 Worklist.append(pred_begin(BB), pred_end(BB));
15953 }
15954 Res = true;
15955 return Res;
15956 };
15957 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15958 while (!LiveEntries.empty()) {
15959 const TreeEntry *Entry = LiveEntries.pop_back_val();
15960 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15961 if (Operands.empty())
15962 continue;
15963 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15964 BasicBlock *Parent = LastInst->getParent();
15965 for (const TreeEntry *Op : Operands) {
15966 if (!Op->isGather())
15967 LiveEntries.push_back(Op);
15968 if (Entry->State == TreeEntry::SplitVectorize ||
15969 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15970 (Op->isGather() && allConstant(Op->Scalars)))
15971 continue;
15972 Budget = 0;
15973 BasicBlock *Pred = nullptr;
15974 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15975 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15976 BasicBlock *OpParent;
15977 Instruction *OpLastInst;
15978 if (Op->isGather()) {
15979 assert(Entry->getOpcode() == Instruction::PHI &&
15980 "Expected phi node only.");
15981 OpParent = cast<PHINode>(Entry->getMainOp())
15982 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15983 OpLastInst = OpParent->getTerminator();
15984 for (Value *V : Op->Scalars) {
15985 auto *Inst = dyn_cast<Instruction>(V);
15986 if (!Inst)
15987 continue;
15988 if (isVectorized(V)) {
15989 OpParent = Inst->getParent();
15990 OpLastInst = Inst;
15991 break;
15992 }
15993 }
15994 } else {
15995 OpLastInst = EntriesToLastInstruction.at(Op);
15996 OpParent = OpLastInst->getParent();
15997 }
15998 // Check the call instructions within the same basic blocks.
15999 if (OpParent == Parent) {
16000 if (Entry->getOpcode() == Instruction::PHI) {
16001 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16002 AddCosts(Op);
16003 continue;
16004 }
16005 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16006 AddCosts(Op);
16007 continue;
16008 }
16009 // Check for call instruction in between blocks.
16010 // 1. Check entry's block to the head.
16011 if (Entry->getOpcode() != Instruction::PHI &&
16012 !CheckForNonVecCallsInSameBlock(
16013 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
16014 LastInst)) {
16015 AddCosts(Op);
16016 continue;
16017 }
16018 // 2. Check op's block from the end.
16019 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16020 OpParent->getTerminator())) {
16021 AddCosts(Op);
16022 continue;
16023 }
16024 // 3. Check the predecessors of entry's block till op's block.
16025 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16026 AddCosts(Op);
16027 continue;
16028 }
16029 }
16030 }
16031
16032 return Cost;
16033}
16034
16035 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
16036 /// the buildvector sequence.
16037 static bool isFirstInsertElement(const InsertElementInst *IE1,
16038 const InsertElementInst *IE2) {
16039 if (IE1 == IE2)
16040 return false;
16041 const auto *I1 = IE1;
16042 const auto *I2 = IE2;
16043 const InsertElementInst *PrevI1;
16044 const InsertElementInst *PrevI2;
16045 unsigned Idx1 = *getElementIndex(IE1);
16046 unsigned Idx2 = *getElementIndex(IE2);
16047 do {
16048 if (I2 == IE1)
16049 return true;
16050 if (I1 == IE2)
16051 return false;
16052 PrevI1 = I1;
16053 PrevI2 = I2;
16054 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16055 getElementIndex(I1).value_or(Idx2) != Idx2)
16056 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
16057 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16058 getElementIndex(I2).value_or(Idx1) != Idx1)
16059 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
16060 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16061 llvm_unreachable("Two different buildvectors not expected.");
16062}
16063
16064namespace {
16065 /// Returns the incoming Value *, if the requested type is Value * too, or a
16066 /// default value otherwise.
16067struct ValueSelect {
16068 template <typename U>
16069 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16070 return V;
16071 }
16072 template <typename U>
16073 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16074 return U();
16075 }
16076};
16077} // namespace
16078
16079 /// Does the analysis of the provided shuffle masks and performs the requested
16080 /// actions on the vectors with the given shuffle masks. It tries to do this in
16081 /// several steps:
16082 /// 1. If the Base vector is not an undef vector, resize the very first mask to
16083 /// have a common VF and perform the action for 2 input vectors (including the
16084 /// non-undef Base). Other shuffle masks are combined with the result of the
16085 /// first stage and processed as a shuffle of 2 elements.
16086 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
16087 /// the action only for 1 vector with the given mask, if it is not the identity
16088 /// mask.
16089 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
16090 /// vectors, combining the masks properly between the steps.
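/// For example (illustrative, not from the source): with an undef Base and
/// two 4-element inputs carrying masks <0,1,u,u> and <u,u,0,1>, the masks
/// are merged and a single two-source shuffle <0,1,4,5> is emitted.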
16091template <typename T>
16092 static T *performExtractsShuffleAction(
16093 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16094 function_ref<unsigned(T *)> GetVF,
16095 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16096 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16097 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16098 SmallVector<int> Mask(ShuffleMask.begin()->second);
16099 auto VMIt = std::next(ShuffleMask.begin());
16100 T *Prev = nullptr;
16101 SmallBitVector UseMask =
16102 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16103 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
16104 if (!IsBaseUndef.all()) {
16105 // Base is not undef, need to combine it with the next subvectors.
16106 std::pair<T *, bool> Res =
16107 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16108 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
16109 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16110 if (Mask[Idx] == PoisonMaskElem)
16111 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16112 else
16113 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16114 }
16115 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16116 assert((!V || GetVF(V) == Mask.size()) &&
16117 "Expected base vector of VF number of elements.");
16118 Prev = Action(Mask, {nullptr, Res.first});
16119 } else if (ShuffleMask.size() == 1) {
16120 // Base is undef and only 1 vector is shuffled - perform the action only for
16121 // a single vector, if the mask is not the identity mask.
16122 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16123 /*ForSingleMask=*/true);
16124 if (Res.second)
16125 // Identity mask is found.
16126 Prev = Res.first;
16127 else
16128 Prev = Action(Mask, {ShuffleMask.begin()->first});
16129 } else {
16130 // Base is undef and at least 2 input vectors are shuffled - perform the
16131 // 2-vector shuffles step by step, combining the masks between the steps.
16132 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16133 unsigned Vec2VF = GetVF(VMIt->first);
16134 if (Vec1VF == Vec2VF) {
16135 // No need to resize the input vectors since they are of the same size; we
16136 // can shuffle them directly.
16137 ArrayRef<int> SecMask = VMIt->second;
16138 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16139 if (SecMask[I] != PoisonMaskElem) {
16140 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16141 Mask[I] = SecMask[I] + Vec1VF;
16142 }
16143 }
16144 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16145 } else {
16146 // Vectors of different sizes - resize and reshuffle.
16147 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16148 /*ForSingleMask=*/false);
16149 std::pair<T *, bool> Res2 =
16150 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16151 ArrayRef<int> SecMask = VMIt->second;
16152 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16153 if (Mask[I] != PoisonMaskElem) {
16154 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16155 if (Res1.second)
16156 Mask[I] = I;
16157 } else if (SecMask[I] != PoisonMaskElem) {
16158 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16159 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16160 }
16161 }
16162 Prev = Action(Mask, {Res1.first, Res2.first});
16163 }
16164 VMIt = std::next(VMIt);
16165 }
16166 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16167 // Perform requested actions for the remaining masks/vectors.
16168 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16169 // Shuffle other input vectors, if any.
16170 std::pair<T *, bool> Res =
16171 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16172 ArrayRef<int> SecMask = VMIt->second;
16173 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16174 if (SecMask[I] != PoisonMaskElem) {
16175 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16176 "Multiple uses of scalars.");
16177 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16178 } else if (Mask[I] != PoisonMaskElem) {
16179 Mask[I] = I;
16180 }
16181 }
16182 Prev = Action(Mask, {Prev, Res.first});
16183 }
16184 return Prev;
16185}
16186
16187namespace {
16188/// Data type for handling buildvector sequences with the reused scalars from
16189/// other tree entries.
16190template <typename T> struct ShuffledInsertData {
16191 /// List of insertelements to be replaced by shuffles.
16192 SmallVector<InsertElementInst *> InsertElements;
16193 /// The parent vectors and shuffle mask for the given list of inserts.
16194 MapVector<T, SmallVector<int>> ValueMasks;
16195};
16196} // namespace
16197
16198 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
16199 InstructionCost ReductionCost) {
16200 InstructionCost Cost = ReductionCost;
16201 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16202 << VectorizableTree.size() << ".\n");
16203
16204 SmallPtrSet<Value *, 4> CheckedExtracts;
16205 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
16206 TreeEntry &TE = *VectorizableTree[I];
16207 // No need to count the cost for combined entries: they are combined with
16208 // other nodes, and their cost is just skipped.
16209 if (TE.State == TreeEntry::CombinedVectorize) {
16210 LLVM_DEBUG(
16211 dbgs() << "SLP: Skipping cost for combined node that starts with "
16212 << *TE.Scalars[0] << ".\n";
16213 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16214 continue;
16215 }
16216 if (TE.hasState() &&
16217 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16218 if (const TreeEntry *E =
16219 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16220 E && E->getVectorFactor() == TE.getVectorFactor()) {
16221 // Some gather nodes might be absolutely the same as some vectorizable
16222 // nodes after reordering, need to handle it.
16223 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16224 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16225 << "SLP: Current total cost = " << Cost << "\n");
16226 continue;
16227 }
16228 }
16229
16230 // Exclude the cost of gather-load nodes which are not used. These nodes
16231 // were built as part of the final attempt to vectorize gathered loads.
16232 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16233 "Expected gather nodes with users only.");
16234
16235 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16236 Cost += C;
16237 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16238 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16239 << "SLP: Current total cost = " << Cost << "\n");
16240 }
16241
16242 if (Cost >= -SLPCostThreshold &&
16243 none_of(ExternalUses, [](const ExternalUser &EU) {
16244 return isa_and_nonnull<InsertElementInst>(EU.User);
16245 }))
16246 return Cost;
16247
16248 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16249 InstructionCost ExtractCost = 0;
16250 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16251 SmallVector<APInt> DemandedElts;
16252 SmallDenseSet<Value *, 4> UsedInserts;
16253 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
16254 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16256 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16257 // Keep track of each {Scalar, Index, User} tuple.
16258 // On AArch64, this helps in fusing a mov instruction, associated with
16259 // extractelement, with fmul in the backend so that extractelement is free.
16260 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
16261 for (ExternalUser &EU : ExternalUses) {
16262 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16263 }
16264 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16265 for (ExternalUser &EU : ExternalUses) {
16266 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16267 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16268 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16269 else dbgs() << " User: nullptr\n");
16270 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16271
16272 // Uses by ephemeral values are free (because the ephemeral value will be
16273 // removed prior to code generation, and so the extraction will be
16274 // removed as well).
16275 if (EphValues.count(EU.User))
16276 continue;
16277
16278 // Check if the scalar for the given user or all users is accounted already.
16279 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16280 (EU.User &&
16281 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16282 continue;
16283
16284 // The user is in an unreachable block, in an EH pad (rarely executed), or
16285 // its block is terminated with an unreachable instruction.
16286 if (BasicBlock *UserParent =
16287 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16288 UserParent &&
16289 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16290 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16291 continue;
16292
16293 // We only add extract cost once for the same scalar.
16294 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16295 !ExtractCostCalculated.insert(EU.Scalar).second)
16296 continue;
16297
16298 // No extract cost for vector "scalar" if REVEC is disabled
16299 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16300 continue;
16301
16302 // If the found user is an insertelement, do not calculate the extract cost
16303 // but try to detect it as a final shuffled/identity match.
16304 // TODO: what if a user is insertvalue when REVEC is enabled?
16305 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16306 VU && VU->getOperand(1) == EU.Scalar) {
16307 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16308 if (!UsedInserts.insert(VU).second)
16309 continue;
16310 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16311 if (InsertIdx) {
16312 const TreeEntry *ScalarTE = &EU.E;
16313 auto *It = find_if(
16314 ShuffledInserts,
16315 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16316 // Checks if 2 insertelements are from the same buildvector.
16317 InsertElementInst *VecInsert = Data.InsertElements.front();
16318 return areTwoInsertFromSameBuildVector(
16319 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16320 Value *Op0 = II->getOperand(0);
16321 if (isVectorized(II) && !isVectorized(Op0))
16322 return nullptr;
16323 return Op0;
16324 });
16325 });
16326 int VecId = -1;
16327 if (It == ShuffledInserts.end()) {
16328 auto &Data = ShuffledInserts.emplace_back();
16329 Data.InsertElements.emplace_back(VU);
16330 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16331 VecId = ShuffledInserts.size() - 1;
16332 auto It = MinBWs.find(ScalarTE);
16333 if (It != MinBWs.end() &&
16334 VectorCasts
16335 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16336 .second) {
16337 unsigned BWSz = It->second.first;
16338 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16339 unsigned VecOpcode;
16340 if (DstBWSz < BWSz)
16341 VecOpcode = Instruction::Trunc;
16342 else
16343 VecOpcode =
16344 It->second.second ? Instruction::SExt : Instruction::ZExt;
16346 InstructionCost C = TTI->getCastInstrCost(
16347 VecOpcode, FTy,
16348 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16349 FTy->getNumElements()),
16350 TTI::CastContextHint::None, CostKind);
16351 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16352 << " for extending externally used vector with "
16353 "non-equal minimum bitwidth.\n");
16354 Cost += C;
16355 }
16356 } else {
16357 if (isFirstInsertElement(VU, It->InsertElements.front()))
16358 It->InsertElements.front() = VU;
16359 VecId = std::distance(ShuffledInserts.begin(), It);
16360 }
16361 int InIdx = *InsertIdx;
16362 SmallVectorImpl<int> &Mask =
16363 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16364 if (Mask.empty())
16365 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16366 Mask[InIdx] = EU.Lane;
16367 DemandedElts[VecId].setBit(InIdx);
16368 continue;
16369 }
16370 }
16371 }
16372
16373 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16374 // If we plan to rewrite the tree in a smaller type, we will need to sign
16375 // extend the extracted value back to the original type. Here, we account
16376 // for the extract and the added cost of the sign extend if needed.
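// Illustrative sketch (not part of the original source): if MinBWs demoted
// a tree of i32 values to i16, an external use of lane 1 is modeled as an
// extract plus a widening cast back to the original type, roughly:
//   %v = ... <4 x i16> ...                  ; vectorized tree in narrow type
//   %x = extractelement <4 x i16> %v, i32 1
//   %e = sext i16 %x to i32                 ; zext if known non-negative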
16377 InstructionCost ExtraCost = TTI::TCC_Free;
16378 auto *ScalarTy = EU.Scalar->getType();
16379 const unsigned BundleWidth = EU.E.getVectorFactor();
16380 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16381 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16382 const TreeEntry *Entry = &EU.E;
16383 auto It = MinBWs.find(Entry);
16384 if (It != MinBWs.end()) {
16385 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16386 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16387 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16388 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16389 ? Instruction::ZExt
16390 : Instruction::SExt;
16391 VecTy = getWidenedType(MinTy, BundleWidth);
16392 ExtraCost =
16393 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16394 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16395 << ExtraCost << "\n");
16396 } else {
16397 ExtraCost =
16398 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16399 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16400 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16401 << *VecTy << ": " << ExtraCost << "\n");
16402 }
16403 // Leave the scalar instructions as is if they are cheaper than extracts.
16404 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16405 Entry->getOpcode() == Instruction::Load) {
16406 // Checks if the user of the external scalar is a phi in the loop body.
16407 auto IsPhiInLoop = [&](const ExternalUser &U) {
16408 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16409 auto *I = cast<Instruction>(U.Scalar);
16410 const Loop *L = LI->getLoopFor(Phi->getParent());
16411 return L && (Phi->getParent() == I->getParent() ||
16412 L == LI->getLoopFor(I->getParent()));
16413 }
16414 return false;
16415 };
16416 if (!ValueToExtUses) {
16417 ValueToExtUses.emplace();
16418 for (const auto &P : enumerate(ExternalUses)) {
16419 // Ignore phis in loops.
16420 if (IsPhiInLoop(P.value()))
16421 continue;
16422
16423 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16424 }
16425 }
16426 // We can use the original instruction if no operands are vectorized or they
16427 // are already marked as externally used.
16428 auto *Inst = cast<Instruction>(EU.Scalar);
16429 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16430 auto OperandIsScalar = [&](Value *V) {
16431 if (!isVectorized(V)) {
16432 // Some extractelements might not be vectorized, but instead
16433 // transformed into a shuffle and removed from the function;
16434 // account for that here.
16435 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16436 return !EE->hasOneUse() || !MustGather.contains(EE);
16437 return true;
16438 }
16439 return ValueToExtUses->contains(V);
16440 };
16441 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16442 bool CanBeUsedAsScalarCast = false;
16443 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16444 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16445 Op && all_of(Op->operands(), OperandIsScalar)) {
16446 InstructionCost OpCost =
16447 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16448 ? TTI->getInstructionCost(Op, CostKind)
16449 : 0;
16450 if (ScalarCost + OpCost <= ExtraCost) {
16451 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16452 ScalarCost += OpCost;
16453 }
16454 }
16455 }
16456 if (CanBeUsedAsScalar) {
16457 bool KeepScalar = ScalarCost <= ExtraCost;
16458 // Try to keep the original scalar if the user is a phi node from the same
16459 // block as the root phis currently being vectorized. This preserves better
16460 // ordering info for the PHIs being vectorized.
16461 bool IsProfitablePHIUser =
16462 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16463 VectorizableTree.front()->Scalars.size() > 2)) &&
16464 VectorizableTree.front()->hasState() &&
16465 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16466 !Inst->hasNUsesOrMore(UsesLimit) &&
16467 none_of(Inst->users(),
16468 [&](User *U) {
16469 auto *PHIUser = dyn_cast<PHINode>(U);
16470 return (!PHIUser ||
16471 PHIUser->getParent() !=
16472 cast<Instruction>(
16473 VectorizableTree.front()->getMainOp())
16474 ->getParent()) &&
16475 !isVectorized(U);
16476 }) &&
16477 count_if(Entry->Scalars, [&](Value *V) {
16478 return ValueToExtUses->contains(V);
16479 }) <= 2;
16480 if (IsProfitablePHIUser) {
16481 KeepScalar = true;
16482 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16483 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16484 (!GatheredLoadsEntriesFirst.has_value() ||
16485 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16486 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16487 return ValueToExtUses->contains(V);
16488 });
16489 auto It = ExtractsCount.find(Entry);
16490 if (It != ExtractsCount.end()) {
16491 assert(ScalarUsesCount >= It->getSecond().size() &&
16492 "Expected total number of external uses not less than "
16493 "number of scalar uses.");
16494 ScalarUsesCount -= It->getSecond().size();
16495 }
16496 // Keep the original scalar if the number of externally used instructions
16497 // in the same entry is not a power of 2. This may enable some extra
16498 // vectorization for now.
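// For example (illustrative): with 3 externally used scalars out of a
// 4-wide entry, has_single_bit(3) is false and the scalars are kept;
// with 2 or 4 such scalars the extracts are used instead.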
16499 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16500 }
16501 if (KeepScalar) {
16502 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16503 for (Value *V : Inst->operands()) {
16504 auto It = ValueToExtUses->find(V);
16505 if (It != ValueToExtUses->end()) {
16506 // Replace all uses to avoid compiler crash.
16507 ExternalUses[It->second].User = nullptr;
16508 }
16509 }
16510 ExtraCost = ScalarCost;
16511 if (!IsPhiInLoop(EU))
16512 ExtractsCount[Entry].insert(Inst);
16513 if (CanBeUsedAsScalarCast) {
16514 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16515 // Update the users of the operands of the cast operand to avoid
16516 // compiler crash.
16517 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16518 for (Value *V : IOp->operands()) {
16519 auto It = ValueToExtUses->find(V);
16520 if (It != ValueToExtUses->end()) {
16521 // Replace all uses to avoid compiler crash.
16522 ExternalUses[It->second].User = nullptr;
16523 }
16524 }
16525 }
16526 }
16527 }
16528 }
16529 }
16530
16531 ExtractCost += ExtraCost;
16532 }
16533 // Insert externals for extract of operands of casts to be emitted as scalars
16534 // instead of extractelement.
16535 for (Value *V : ScalarOpsFromCasts) {
16536 ExternalUsesAsOriginalScalar.insert(V);
16537 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16538 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16539 TEs.front()->findLaneForValue(V));
16540 }
16541 }
16542 // Add reduced value cost, if resized.
16543 if (!VectorizedVals.empty()) {
16544 const TreeEntry &Root = *VectorizableTree.front();
16545 auto BWIt = MinBWs.find(&Root);
16546 if (BWIt != MinBWs.end()) {
16547 Type *DstTy = Root.Scalars.front()->getType();
16548 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16549 unsigned SrcSz =
16550 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16551 if (OriginalSz != SrcSz) {
16552 unsigned Opcode = Instruction::Trunc;
16553 if (OriginalSz > SrcSz)
16554 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16555 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16556 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16557 assert(SLPReVec && "Only supported by REVEC.");
16558 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16559 }
16560 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16561 TTI::CastContextHint::None,
16562 TTI::TCK_RecipThroughput);
16563 }
16564 }
16565 }
16566
16567 Cost += ExtractCost;
16568 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16569 bool ForSingleMask) {
16570 InstructionCost C = 0;
16571 unsigned VF = Mask.size();
16572 unsigned VecVF = TE->getVectorFactor();
16573 bool HasLargeIndex =
16574 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16575 if ((VF != VecVF && HasLargeIndex) ||
16576 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16577
16578 if (HasLargeIndex) {
16579 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16580 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16581 OrigMask.begin());
16582 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16583 getWidenedType(TE->getMainOp()->getType(), VecVF),
16584 OrigMask);
16585 LLVM_DEBUG(
16586 dbgs() << "SLP: Adding cost " << C
16587 << " for final shuffle of insertelement external users.\n";
16588 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16589 Cost += C;
16590 return std::make_pair(TE, true);
16591 }
16592
16593 if (!ForSingleMask) {
16594 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16595 for (unsigned I = 0; I < VF; ++I) {
16596 if (Mask[I] != PoisonMaskElem)
16597 ResizeMask[Mask[I]] = Mask[I];
16598 }
16599 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16600 C = ::getShuffleCost(
16601 *TTI, TTI::SK_PermuteSingleSrc,
16602 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16603 LLVM_DEBUG(
16604 dbgs() << "SLP: Adding cost " << C
16605 << " for final shuffle of insertelement external users.\n";
16606 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16607
16608 Cost += C;
16609 }
16610 }
16611 return std::make_pair(TE, false);
16612 };
16613 // Calculate the cost of the reshuffled vectors, if any.
16614 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16615 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16616 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16617 unsigned VF = 0;
16618 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16619 ArrayRef<const TreeEntry *> TEs) {
16620 assert((TEs.size() == 1 || TEs.size() == 2) &&
16621 "Expected exactly 1 or 2 tree entries.");
16622 if (TEs.size() == 1) {
16623 if (VF == 0)
16624 VF = TEs.front()->getVectorFactor();
16625 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16626 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16627 !all_of(enumerate(Mask), [=](const auto &Data) {
16628 return Data.value() == PoisonMaskElem ||
16629 (Data.index() < VF &&
16630 static_cast<int>(Data.index()) == Data.value());
16631 })) {
16632 InstructionCost C =
16633 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16634 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16635 << " for final shuffle of insertelement "
16636 "external users.\n";
16637 TEs.front()->dump();
16638 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16639 Cost += C;
16640 }
16641 } else {
16642 if (VF == 0) {
16643 if (TEs.front() &&
16644 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16645 VF = TEs.front()->getVectorFactor();
16646 else
16647 VF = Mask.size();
16648 }
16649 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16650 InstructionCost C =
16651 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16652 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16653 << " for final shuffle of vector node and external "
16654 "insertelement users.\n";
16655 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16656 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16657 Cost += C;
16658 }
16659 VF = Mask.size();
16660 return TEs.back();
16661 };
16662 (void)performExtractsShuffleAction<const TreeEntry>(
16663 MutableArrayRef(Vector.data(), Vector.size()), Base,
16664 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16665 EstimateShufflesCost);
16666 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16667 cast<FixedVectorType>(
16668 ShuffledInserts[I].InsertElements.front()->getType()),
16669 DemandedElts[I],
16670 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16671 Cost -= InsertCost;
16672 }
16673
16674 // Add the cost for reduced value resize (if required).
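// E.g. (illustrative): a reduction tree demoted to i16 via MinBWs whose
// reduced value is required at ReductionBitWidth == 32 pays here for a
// final <VF x i16> -> <VF x i32> sext/zext; for add/fadd/mul/fmul/and/or/
// xor reductions the extension is instead folded into the extended
// reduction cost, signalled by the BitCast opcode below.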
16675 if (ReductionBitWidth != 0) {
16676 assert(UserIgnoreList && "Expected reduction tree.");
16677 const TreeEntry &E = *VectorizableTree.front();
16678 auto It = MinBWs.find(&E);
16679 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16680 unsigned SrcSize = It->second.first;
16681 unsigned DstSize = ReductionBitWidth;
16682 unsigned Opcode = Instruction::Trunc;
16683 if (SrcSize < DstSize) {
16684 bool IsArithmeticExtendedReduction =
16685 all_of(*UserIgnoreList, [](Value *V) {
16686 auto *I = cast<Instruction>(V);
16687 return is_contained({Instruction::Add, Instruction::FAdd,
16688 Instruction::Mul, Instruction::FMul,
16689 Instruction::And, Instruction::Or,
16690 Instruction::Xor},
16691 I->getOpcode());
16692 });
16693 if (IsArithmeticExtendedReduction)
16694 Opcode =
16695 Instruction::BitCast; // Handle it by getExtendedReductionCost
16696 else
16697 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16698 }
16699 if (Opcode != Instruction::BitCast) {
16700 auto *SrcVecTy =
16701 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16702 auto *DstVecTy =
16703 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16704 TTI::CastContextHint CCH = getCastContextHint(E);
16705 InstructionCost CastCost;
16706 switch (E.getOpcode()) {
16707 case Instruction::SExt:
16708 case Instruction::ZExt:
16709 case Instruction::Trunc: {
16710 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16711 CCH = getCastContextHint(*OpTE);
16712 break;
16713 }
16714 default:
16715 break;
16716 }
16717 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16718 TTI::TCK_RecipThroughput);
16719 Cost += CastCost;
16720 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16721 << " for final resize for reduction from " << SrcVecTy
16722 << " to " << DstVecTy << "\n";
16723 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16724 }
16725 }
16726 }
16727
16728 std::optional<InstructionCost> SpillCost;
16729 if (Cost < -SLPCostThreshold) {
16730 SpillCost = getSpillCost();
16731 Cost += *SpillCost;
16732 }
16733#ifndef NDEBUG
16734 SmallString<256> Str;
16735 {
16736 raw_svector_ostream OS(Str);
16737 OS << "SLP: Spill Cost = ";
16738 if (SpillCost)
16739 OS << *SpillCost;
16740 else
16741 OS << "<skipped>";
16742 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16743 << "SLP: Total Cost = " << Cost << ".\n";
16744 }
16745 LLVM_DEBUG(dbgs() << Str);
16746 if (ViewSLPTree)
16747 ViewGraph(this, "SLP" + F->getName(), false, Str);
16748#endif
16749
16750 return Cost;
16751}
16752
16753 /// Tries to find extractelement instructions with constant indices from fixed
16754 /// vector type and gather such instructions into a bunch, which is highly
16755 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this
16756 /// attempt was successful, the matched scalars are replaced by poison values
16757 /// in \p VL for future analysis.
16758std::optional<TTI::ShuffleKind>
16759 BoUpSLP::tryToGatherSingleRegisterExtractElements(
16760 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16761 // Scan list of gathered scalars for extractelements that can be represented
16762 // as shuffles.
16763 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16764 SmallVector<int> UndefVectorExtracts;
16765 for (int I = 0, E = VL.size(); I < E; ++I) {
16766 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16767 if (!EI) {
16768 if (isa<UndefValue>(VL[I]))
16769 UndefVectorExtracts.push_back(I);
16770 continue;
16771 }
16772 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16773 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16774 continue;
16775 std::optional<unsigned> Idx = getExtractIndex(EI);
16776 // Undefined index.
16777 if (!Idx) {
16778 UndefVectorExtracts.push_back(I);
16779 continue;
16780 }
16781 if (Idx >= VecTy->getNumElements()) {
16782 UndefVectorExtracts.push_back(I);
16783 continue;
16784 }
16785 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16786 ExtractMask.reset(*Idx);
16787 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16788 UndefVectorExtracts.push_back(I);
16789 continue;
16790 }
16791 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16792 }
16793 // Sort the vector operands by the maximum number of uses in extractelements.
16794 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16795 VectorOpToIdx.takeVector();
16796 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16797 return P1.second.size() > P2.second.size();
16798 });
16799 // Find the best pair of the vectors or a single vector.
16800 const int UndefSz = UndefVectorExtracts.size();
16801 unsigned SingleMax = 0;
16802 unsigned PairMax = 0;
16803 if (!Vectors.empty()) {
16804 SingleMax = Vectors.front().second.size() + UndefSz;
16805 if (Vectors.size() > 1) {
16806 auto *ItNext = std::next(Vectors.begin());
16807 PairMax = SingleMax + ItNext->second.size();
16808 }
16809 }
16810 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16811 return std::nullopt;
16812 // Check if it is better to perform a shuffle of 2 vectors or just of a single
16813 // vector.
16814 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16815 SmallVector<Value *> GatheredExtracts(
16816 VL.size(), PoisonValue::get(VL.front()->getType()));
16817 if (SingleMax >= PairMax && SingleMax) {
16818 for (int Idx : Vectors.front().second)
16819 std::swap(GatheredExtracts[Idx], VL[Idx]);
16820 } else if (!Vectors.empty()) {
16821 for (unsigned Idx : {0, 1})
16822 for (int Idx : Vectors[Idx].second)
16823 std::swap(GatheredExtracts[Idx], VL[Idx]);
16824 }
16825 // Add extracts from undefs too.
16826 for (int Idx : UndefVectorExtracts)
16827 std::swap(GatheredExtracts[Idx], VL[Idx]);
16828 // Check that the gather of extractelements can be represented as just a
16829 // shuffle of one or two vectors from which the scalars are extracted.
16830 std::optional<TTI::ShuffleKind> Res =
16831 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16832 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16833 // TODO: try to check other subsets if possible.
16834 // Restore the original VL if attempt was not successful.
16835 copy(SavedVL, VL.begin());
16836 return std::nullopt;
16837 }
16838 // Restore unused scalars from mask, if some of the extractelements were not
16839 // selected for shuffle.
16840 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16841 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16842 isa<UndefValue>(GatheredExtracts[I])) {
16843 std::swap(VL[I], GatheredExtracts[I]);
16844 continue;
16845 }
16846 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16847 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16848 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16849 is_contained(UndefVectorExtracts, I))
16850 continue;
16851 }
16852 return Res;
16853}
16854
16855 /// Tries to find extractelement instructions with constant indices from fixed
16856 /// vector type and gather such instructions into a bunch, which is highly
16857 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this
16858 /// attempt was successful, the matched scalars are replaced by poison values
16859 /// in \p VL for future analysis.
16860 SmallVector<std::optional<TTI::ShuffleKind>>
16861 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16862 SmallVectorImpl<int> &Mask,
16863 unsigned NumParts) const {
16864 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16865 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16866 Mask.assign(VL.size(), PoisonMaskElem);
16867 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16868 for (unsigned Part : seq<unsigned>(NumParts)) {
16869 // Scan list of gathered scalars for extractelements that can be represented
16870 // as shuffles.
16871 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16872 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16873 SmallVector<int> SubMask;
16874 std::optional<TTI::ShuffleKind> Res =
16875 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16876 ShufflesRes[Part] = Res;
16877 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16878 }
16879 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16880 return Res.has_value();
16881 }))
16882 ShufflesRes.clear();
16883 return ShufflesRes;
16884}
16885
16886std::optional<TargetTransformInfo::ShuffleKind>
16887BoUpSLP::isGatherShuffledSingleRegisterEntry(
16888 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16889 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16890 Entries.clear();
16891 // TODO: currently checking only for Scalars in the tree entry, need to count
16892 // reused elements too for better cost estimation.
16893 auto GetUserEntry = [&](const TreeEntry *TE) {
16894 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16895 TE = TE->UserTreeIndex.UserTE;
16896 if (TE == VectorizableTree.front().get())
16897 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16898 return TE->UserTreeIndex;
16899 };
16900 auto HasGatherUser = [&](const TreeEntry *TE) {
16901 while (TE->Idx != 0 && TE->UserTreeIndex) {
16902 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16903 return true;
16904 TE = TE->UserTreeIndex.UserTE;
16905 }
16906 return false;
16907 };
16908 const EdgeInfo TEUseEI = GetUserEntry(TE);
16909 if (!TEUseEI)
16910 return std::nullopt;
16911 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16912 const BasicBlock *TEInsertBlock = nullptr;
16913 // Main node of PHI entries keeps the correct order of operands/incoming
16914 // blocks.
16915 if (auto *PHI = dyn_cast_or_null<PHINode>(
16916 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16917 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16918 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16919 TEInsertPt = TEInsertBlock->getTerminator();
16920 } else {
16921 TEInsertBlock = TEInsertPt->getParent();
16922 }
16923 if (!DT->isReachableFromEntry(TEInsertBlock))
16924 return std::nullopt;
16925 auto *NodeUI = DT->getNode(TEInsertBlock);
16926 assert(NodeUI && "Should only process reachable instructions");
16927 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16928 auto CheckOrdering = [&](const Instruction *InsertPt) {
16929 // Argument InsertPt is an instruction where vector code for some other
16930 // tree entry (one that shares one or more scalars with TE) is going to be
16931 // generated. This lambda returns true if that point dominates the insertion
16932 // point of the vector code for TE, so the other entry's result is available
16933 // there (otherwise the dependency is the other way around). The other node
16934 // is not limited to be of a gather kind. Gather nodes are not scheduled and
16935 // their vector code is inserted before their first user. If the user is a
16936 // PHI, that point is at the end of a predecessor block. Otherwise it is the
16937 // last instruction among the scalars of the user node. So, instead of
16938 // checking the dependency between the instructions themselves, we check the
16939 // dependency between their insertion points for vector code (since each
16940 // scalar instruction ends up as a lane of a vector instruction).
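// Illustrative sketch (not from the original source): if another entry's
// vector code is emitted in block A and TE's in block B, with A dominating
// B, CheckOrdering returns true and TE may reuse that entry's vector;
// within a single block the comesBefore() order decides instead.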
16941 const BasicBlock *InsertBlock = InsertPt->getParent();
16942 auto *NodeEUI = DT->getNode(InsertBlock);
16943 if (!NodeEUI)
16944 return false;
16945 assert((NodeUI == NodeEUI) ==
16946 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16947 "Different nodes should have different DFS numbers");
16948 // Check the order of the gather nodes users.
16949 if (TEInsertPt->getParent() != InsertBlock &&
16950 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16951 return false;
16952 if (TEInsertPt->getParent() == InsertBlock &&
16953 TEInsertPt->comesBefore(InsertPt))
16954 return false;
16955 return true;
16956 };
16957 // Find all tree entries used by the gathered values. If no common entries
16958 // found - not a shuffle.
16959 // Here we build a set of tree nodes for each gathered value and try to
16960 // find the intersection between these sets. If we have at least one common
16961 // tree node for each gathered value - we have just a permutation of a
16962 // single vector. If we have 2 different sets, we're in a situation where we
16963 // have a permutation of 2 input vectors.
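// Worked example (illustrative): for VL = {a, b, c, d} with {a, c} from
// tree entry E0 and {b, d} from entry E1, the per-value sets are {E0},
// {E1}, {E0}, {E1}; intersecting them yields UsedTEs == {{E0}, {E1}},
// i.e. a permutation of the 2 input vectors E0 and E1.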
16964 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16965 SmallDenseMap<Value *, int> UsedValuesEntry;
16966 SmallPtrSet<const Value *, 16> VisitedValue;
16967 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16968 // The node is reused - exit.
16969 if ((TEPtr->getVectorFactor() != VL.size() &&
16970 TEPtr->Scalars.size() != VL.size()) ||
16971 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16972 return false;
16973 UsedTEs.clear();
16974 UsedTEs.emplace_back().insert(TEPtr);
16975 for (Value *V : VL) {
16976 if (isConstant(V))
16977 continue;
16978 UsedValuesEntry.try_emplace(V, 0);
16979 }
16980 return true;
16981 };
16982 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16983 unsigned EdgeIdx) {
16984 const TreeEntry *Ptr1 = User1;
16985 const TreeEntry *Ptr2 = User2;
16986 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16987 while (Ptr2) {
16988 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16989 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16990 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16991 }
16992 while (Ptr1) {
16993 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16994 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16995 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16996 return Idx < It->second;
16997 }
16998 return false;
16999 };
17000 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
17001 Instruction *InsertPt) {
17002 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
17003 !TEUseEI.UserTE->isCopyableElement(
17004 const_cast<Instruction *>(TEInsertPt)) &&
17005 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17006 InsertPt->getNextNode() == TEInsertPt &&
17007 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
17008 !isUsedOutsideBlock(InsertPt));
17009 };
17010 for (Value *V : VL) {
17011 if (isConstant(V) || !VisitedValue.insert(V).second)
17012 continue;
17013 // Build a list of tree entries where V is used.
17014 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17015 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
17016 if (TEPtr == TE || TEPtr->Idx == 0)
17017 continue;
17018 assert(any_of(TEPtr->Scalars,
17019 [&](Value *V) { return GatheredScalars.contains(V); }) &&
17020 "Must contain at least single gathered value.");
17021 assert(TEPtr->UserTreeIndex &&
17022 "Expected only single user of a gather node.");
17023 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17024
17025 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17026 UseEI.UserTE->hasState())
17027 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
17028 : nullptr;
17029 Instruction *InsertPt =
17030 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
17031 : &getLastInstructionInBundle(UseEI.UserTE);
17032 if (TEInsertPt == InsertPt) {
17033 // Check nodes, which might be emitted first.
17034 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17035 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17036 TEUseEI.UserTE->isAltShuffle()) &&
17037 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
17038 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17039 (UseEI.UserTE->hasState() &&
17040 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17041 !UseEI.UserTE->isAltShuffle()) ||
17042 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
17043 continue;
17044 }
17045
17046 // If the schedulable insertion point is used in multiple entries - just
17047 // exit, no known ordering at this point, available only after real
17048 // scheduling.
17049 if (!doesNotNeedToBeScheduled(InsertPt) &&
17050 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17051 continue;
17052 // If the users are the PHI nodes with the same incoming blocks - skip.
17053 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17054 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17055 UseEI.UserTE->State == TreeEntry::Vectorize &&
17056 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17057 TEUseEI.UserTE != UseEI.UserTE)
17058 continue;
17059 // If 2 gathers are operands of the same entry (regardless of whether
17060 // user is a PHI or not), compare operand indices and use the earlier one
17061 // as the base.
17062 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17063 continue;
17064 // If the user instruction is used for some reason in different
17065 // vectorized nodes - make it depend on index.
17066 if (TEUseEI.UserTE != UseEI.UserTE &&
17067 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17068 HasGatherUser(TEUseEI.UserTE)))
17069 continue;
17070 // If the user node is the operand of the other user node - skip.
17071 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17072 continue;
17073 }
17074
17075 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17076 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17077 UseEI.UserTE->doesNotNeedToSchedule() &&
17078 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
17079 continue;
17080 // Check if the user node of the TE comes after user node of TEPtr,
17081 // otherwise TEPtr depends on TE.
17082 if ((TEInsertBlock != InsertPt->getParent() ||
17083 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17084 (!CheckOrdering(InsertPt) ||
17085 (UseEI.UserTE->hasCopyableElements() &&
17086 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17087 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
17088 continue;
17089 // The node is reused - exit.
17090 if (CheckAndUseSameNode(TEPtr))
17091 break;
17092 // If the parent node is copyable with its last instruction used outside
17093 // the block, and that instruction immediately follows TEPtr's last
17094 // instruction, exit to preserve the def-use chain.
17095 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17096 continue;
17097 VToTEs.insert(TEPtr);
17098 }
17099 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17100 const auto *It = find_if(
17101 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
17102 if (It != VTEs.end()) {
17103 const TreeEntry *VTE = *It;
17104 if (none_of(TE->CombinedEntriesWithIndices,
17105 [&](const auto &P) { return P.first == VTE->Idx; })) {
17106 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17107 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17108 continue;
17109 }
17110 // The node is reused - exit.
17111 if (CheckAndUseSameNode(VTE))
17112 break;
17113 VToTEs.insert(VTE);
17114 }
17115 }
17116 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17117 const TreeEntry *VTE = VTEs.front();
17118 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17119 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17120 VTEs = VTEs.drop_front();
17121 // Iterate through all vectorized nodes.
17122 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
17123 return MTE->State == TreeEntry::Vectorize;
17124 });
17125 if (MIt == VTEs.end())
17126 continue;
17127 VTE = *MIt;
17128 }
17129 if (none_of(TE->CombinedEntriesWithIndices,
17130 [&](const auto &P) { return P.first == VTE->Idx; })) {
17131 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17132 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
17133 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17134 continue;
17135 }
17136 // The node is reused - exit.
17137 if (CheckAndUseSameNode(VTE))
17138 break;
17139 VToTEs.insert(VTE);
17140 }
17141 if (VToTEs.empty())
17142 continue;
17143 if (UsedTEs.empty()) {
17144 // The first iteration, just insert the list of nodes to vector.
17145 UsedTEs.push_back(VToTEs);
17146 UsedValuesEntry.try_emplace(V, 0);
17147 } else {
17148 // Need to check if there are any previously used tree nodes which use V.
17149 // If there are no such nodes, consider that we have one more input
17150 // vector.
17151 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17152 unsigned Idx = 0;
17153 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17154 // Do we have a non-empty intersection of previously listed tree entries
17155 // and tree entries using current V?
17156 set_intersect(VToTEs, Set);
17157 if (!VToTEs.empty()) {
17158 // Yes, write the new subset and continue analysis for the next
17159 // scalar.
17160 Set.swap(VToTEs);
17161 break;
17162 }
17163 VToTEs = SavedVToTEs;
17164 ++Idx;
17165 }
17166 // No non-empty intersection found - need to add a second set of possible
17167 // source vectors.
17168 if (Idx == UsedTEs.size()) {
17169 // If the number of input vectors is greater than 2 - not a permutation,
17170 // fallback to the regular gather.
17171 // TODO: support multiple reshuffled nodes.
17172 if (UsedTEs.size() == 2)
17173 continue;
17174 UsedTEs.push_back(SavedVToTEs);
17175 Idx = UsedTEs.size() - 1;
17176 }
17177 UsedValuesEntry.try_emplace(V, Idx);
17178 }
17179 }
17180
17181 if (UsedTEs.empty()) {
17182 Entries.clear();
17183 return std::nullopt;
17184 }
17185
17186 unsigned VF = 0;
17187 if (UsedTEs.size() == 1) {
17188 // Keep the order to avoid non-determinism.
17189 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17190 UsedTEs.front().end());
17191 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17192 return TE1->Idx < TE2->Idx;
17193 });
17194 // Try to find the perfect match in another gather node at first.
17195 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17196 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17197 });
17198 if (It != FirstEntries.end() &&
17199 ((*It)->getVectorFactor() == VL.size() ||
17200 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17201 TE->ReuseShuffleIndices.size() == VL.size() &&
17202 (*It)->isSame(TE->Scalars)))) {
17203 Entries.push_back(*It);
17204 if ((*It)->getVectorFactor() == VL.size()) {
17205 std::iota(std::next(Mask.begin(), Part * VL.size()),
17206 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17207 } else {
17208 SmallVector<int> CommonMask = TE->getCommonMask();
17209 copy(CommonMask, Mask.begin());
17210 }
17211 // Clear undef scalars.
17212 for (unsigned I : seq<unsigned>(VL.size()))
17213 if (isa<PoisonValue>(VL[I]))
17214 Mask[Part * VL.size() + I] = PoisonMaskElem;
17215 return TargetTransformInfo::SK_PermuteSingleSrc;
17216 }
17217 // No perfect match, just shuffle, so choose the first tree node from the
17218 // tree.
17219 Entries.push_back(FirstEntries.front());
17220 // Update mapping between values and corresponding tree entries.
17221 for (auto &P : UsedValuesEntry)
17222 P.second = 0;
17223 VF = FirstEntries.front()->getVectorFactor();
17224 } else {
17225 // Try to find nodes with the same vector factor.
17226 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17227 // Keep the order of tree nodes to avoid non-determinism.
17228 DenseMap<int, const TreeEntry *> VFToTE;
17229 for (const TreeEntry *TE : UsedTEs.front()) {
17230 unsigned VF = TE->getVectorFactor();
17231 auto It = VFToTE.find(VF);
17232 if (It != VFToTE.end()) {
17233 if (It->second->Idx > TE->Idx)
17234 It->getSecond() = TE;
17235 continue;
17236 }
17237 VFToTE.try_emplace(VF, TE);
17238 }
17239 // Same, keep the order to avoid non-determinism.
17240 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17241 UsedTEs.back().end());
17242 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17243 return TE1->Idx < TE2->Idx;
17244 });
17245 for (const TreeEntry *TE : SecondEntries) {
17246 auto It = VFToTE.find(TE->getVectorFactor());
17247 if (It != VFToTE.end()) {
17248 VF = It->first;
17249 Entries.push_back(It->second);
17250 Entries.push_back(TE);
17251 break;
17252 }
17253 }
17254 // No 2 source vectors with the same vector factor - just choose 2 with max
17255 // index.
17256 if (Entries.empty()) {
17257 Entries.push_back(*llvm::max_element(
17258 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17259 return TE1->Idx < TE2->Idx;
17260 }));
17261 Entries.push_back(SecondEntries.front());
17262 VF = std::max(Entries.front()->getVectorFactor(),
17263 Entries.back()->getVectorFactor());
17264 } else {
17265 VF = Entries.front()->getVectorFactor();
17266 }
17267 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17268 for (const TreeEntry *E : Entries)
17269 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17270 E->Scalars.end());
17271 // Update mapping between values and corresponding tree entries.
17272 for (auto &P : UsedValuesEntry) {
17273 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17274 if (ValuesToEntries[Idx].contains(P.first)) {
17275 P.second = Idx;
17276 break;
17277 }
17278 }
17279 }
17280
17281 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17282 // Checks if the 2 PHIs are compatible, i.e., have a high possibility of
17283 // being vectorized together.
17284 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17285 auto *PHI = cast<PHINode>(V);
17286 auto *PHI1 = cast<PHINode>(V1);
17287 // Check that all incoming values are compatible/from same parent (if they
17288 // are instructions).
17289 // The incoming values are compatible if they all are constants, or
17290 // instructions with the same/alternate opcodes from the same basic block.
17291 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17292 Value *In = PHI->getIncomingValue(I);
17293 Value *In1 = PHI1->getIncomingValue(I);
17294 if (isConstant(In) && isConstant(In1))
17295 continue;
17296 if (!getSameOpcode({In, In1}, *TLI))
17297 return false;
17298 if (cast<Instruction>(In)->getParent() !=
17299 cast<Instruction>(In1)->getParent())
17300 return false;
17301 }
17302 return true;
17303 };
17304 // Check if the value can be ignored during analysis for shuffled gathers.
17305 // We suppose it is better to ignore instructions which do not form splats,
17306 // are not vectorized and not extractelements (these will be handled by
17307 // extractelements processing) or may form a vector node in the future.
17308 auto MightBeIgnored = [=](Value *V) {
17309 auto *I = dyn_cast<Instruction>(V);
17310 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17311 !isVectorLikeInstWithConstOps(I) &&
17312 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17313 };
17314 // Check that the neighbor instruction may form a full vector node with the
17315 // current instruction V. It is possible if they have the same/alternate
17316 // opcode and the same parent basic block.
17317 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17318 Value *V1 = VL[Idx];
17319 bool UsedInSameVTE = false;
17320 auto It = UsedValuesEntry.find(V1);
17321 if (It != UsedValuesEntry.end())
17322 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17323 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17324 getSameOpcode({V, V1}, *TLI) &&
17325 cast<Instruction>(V)->getParent() ==
17326 cast<Instruction>(V1)->getParent() &&
17327 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17328 };
17329 // Build a shuffle mask for better cost estimation and vector emission.
17330 SmallBitVector UsedIdxs(Entries.size());
17331 SmallVector<std::pair<unsigned, int>> EntryLanes;
17332 for (int I = 0, E = VL.size(); I < E; ++I) {
17333 Value *V = VL[I];
17334 auto It = UsedValuesEntry.find(V);
17335 if (It == UsedValuesEntry.end())
17336 continue;
17337 // Do not try to shuffle scalars, if they are constants, or instructions
17338 // that can be vectorized as a result of the following vector build
17339 // vectorization.
17340 if (isConstant(V) || (MightBeIgnored(V) &&
17341 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17342 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17343 continue;
17344 unsigned Idx = It->second;
17345 EntryLanes.emplace_back(Idx, I);
17346 UsedIdxs.set(Idx);
17347 }
17348 // Iterate through all shuffled scalars and select entries, which can be used
17349 // for final shuffle.
17350 SmallVector<const TreeEntry *> TempEntries;
17351 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17352 if (!UsedIdxs.test(I))
17353 continue;
17354 // Fix the entry number for the given scalar. If it is the first entry, set
17355 // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
17356 // These indices are used when calculating final shuffle mask as the vector
17357 // offset.
17358 for (std::pair<unsigned, int> &Pair : EntryLanes)
17359 if (Pair.first == I)
17360 Pair.first = TempEntries.size();
17361 TempEntries.push_back(Entries[I]);
17362 }
17363 Entries.swap(TempEntries);
17364 if (EntryLanes.size() == Entries.size() &&
17365 !VL.equals(ArrayRef(TE->Scalars)
17366 .slice(Part * VL.size(),
17367 std::min<int>(VL.size(), TE->Scalars.size())))) {
17368 // We may have here 1 or 2 entries only. If the number of scalars is equal
17369 // to the number of entries, no need to do the analysis, it is not very
17370 // profitable. Since VL is not the same as TE->Scalars, it means we already
17371 // have some shuffles before. Cut off this unprofitable case.
17372 Entries.clear();
17373 return std::nullopt;
17374 }
17375 // Build the final mask, check for the identity shuffle, if possible.
17376 bool IsIdentity = Entries.size() == 1;
17377 // Pair.first is the offset to the vector, while Pair.second is the index of
17378 // scalar in the list.
17379 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17380 unsigned Idx = Part * VL.size() + Pair.second;
17381 Mask[Idx] =
17382 Pair.first * VF +
17383 (ForOrder ? std::distance(
17384 Entries[Pair.first]->Scalars.begin(),
17385 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17386 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17387 IsIdentity &= Mask[Idx] == Pair.second;
17388 }
17389 if (ForOrder || IsIdentity || Entries.empty()) {
17390 switch (Entries.size()) {
17391 case 1:
17392 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17393 return TargetTransformInfo::SK_PermuteSingleSrc;
17394 break;
17395 case 2:
17396 if (EntryLanes.size() > 2 || VL.size() <= 2)
17397 return TargetTransformInfo::SK_PermuteTwoSrc;
17398 break;
17399 default:
17400 break;
17401 }
17402 } else if (!isa<VectorType>(VL.front()->getType()) &&
17403 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17404 // Do the cost estimation if a shuffle is more beneficial than a buildvector.
17405 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17406 std::next(Mask.begin(), (Part + 1) * VL.size()));
17407 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17408 for (int Idx : SubMask) {
17409 if (Idx == PoisonMaskElem)
17410 continue;
17411 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17412 MinElement = Idx;
17413 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17414 MaxElement = Idx;
17415 }
17416 assert(MaxElement >= 0 && MinElement >= 0 &&
17417 MaxElement % VF >= MinElement % VF &&
17418 "Expected at least single element.");
17419 unsigned NewVF = std::max<unsigned>(
17420 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17421 (MaxElement % VF) -
17422 (MinElement % VF) + 1));
17423 if (NewVF < VF) {
17424 for (int &Idx : SubMask) {
17425 if (Idx == PoisonMaskElem)
17426 continue;
17427 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17428 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17429 }
17430 } else {
17431 NewVF = VF;
17432 }
17433
17434 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17435 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17436 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17437 auto GetShuffleCost = [&,
17438 &TTI = *TTI](ArrayRef<int> Mask,
17439 ArrayRef<const TreeEntry *> Entries,
17440 VectorType *VecTy) -> InstructionCost {
17441 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17442 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17443 Mask, Entries.front()->getInterleaveFactor()))
17444 return TTI::TCC_Free;
17445 return ::getShuffleCost(TTI,
17446 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17447 : TTI::SK_PermuteSingleSrc,
17448 VecTy, Mask, CostKind);
17449 };
17450 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17451 InstructionCost FirstShuffleCost = 0;
17452 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17453 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17454 FirstShuffleCost = ShuffleCost;
17455 } else {
17456 // Transform the mask to include only the first entry.
17457 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17458 bool IsIdentity = true;
17459 for (auto [I, Idx] : enumerate(FirstMask)) {
17460 if (Idx >= static_cast<int>(NewVF)) {
17461 Idx = PoisonMaskElem;
17462 } else {
17463 DemandedElts.clearBit(I);
17464 if (Idx != PoisonMaskElem)
17465 IsIdentity &= static_cast<int>(I) == Idx;
17466 }
17467 }
17468 if (!IsIdentity)
17469 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17470 FirstShuffleCost += getScalarizationOverhead(
17471 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17472 /*Extract=*/false, CostKind);
17473 }
17474 InstructionCost SecondShuffleCost = 0;
17475 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17476 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17477 SecondShuffleCost = ShuffleCost;
17478 } else {
17479 // Transform the mask to include only the second entry.
17480 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17481 bool IsIdentity = true;
17482 for (auto [I, Idx] : enumerate(SecondMask)) {
17483 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17484 Idx = PoisonMaskElem;
17485 } else {
17486 DemandedElts.clearBit(I);
17487 if (Idx != PoisonMaskElem) {
17488 Idx -= NewVF;
17489 IsIdentity &= static_cast<int>(I) == Idx;
17490 }
17491 }
17492 }
17493 if (!IsIdentity)
17494 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17495 SecondShuffleCost += getScalarizationOverhead(
17496 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17497 /*Extract=*/false, CostKind);
17498 }
17499 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17500 for (auto [I, Idx] : enumerate(SubMask))
17501 if (Idx == PoisonMaskElem)
17502 DemandedElts.clearBit(I);
17503 InstructionCost BuildVectorCost = getScalarizationOverhead(
17504 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17505 /*Extract=*/false, CostKind);
17506 const TreeEntry *BestEntry = nullptr;
17507 if (FirstShuffleCost < ShuffleCost) {
17508 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17509 std::next(Mask.begin(), (Part + 1) * VL.size()),
17510 [&](int &Idx) {
17511 if (Idx >= static_cast<int>(VF))
17512 Idx = PoisonMaskElem;
17513 });
17514 BestEntry = Entries.front();
17515 ShuffleCost = FirstShuffleCost;
17516 }
17517 if (SecondShuffleCost < ShuffleCost) {
17518 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17519 std::next(Mask.begin(), (Part + 1) * VL.size()),
17520 [&](int &Idx) {
17521 if (Idx < static_cast<int>(VF))
17522 Idx = PoisonMaskElem;
17523 else
17524 Idx -= VF;
17525 });
17526 BestEntry = Entries[1];
17527 ShuffleCost = SecondShuffleCost;
17528 }
17529 if (BuildVectorCost >= ShuffleCost) {
17530 if (BestEntry) {
17531 Entries.clear();
17532 Entries.push_back(BestEntry);
17533 }
17534 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17535 : TargetTransformInfo::SK_PermuteSingleSrc;
17536 }
17537 }
17538 Entries.clear();
17539 // Clear the corresponding mask elements.
17540 std::fill(std::next(Mask.begin(), Part * VL.size()),
17541 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17542 return std::nullopt;
17543}
17544
17545 SmallVector<std::optional<TTI::ShuffleKind>>
17546 BoUpSLP::isGatherShuffledEntry(
17547 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17548 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17549 bool ForOrder) {
17550 assert(NumParts > 0 && NumParts < VL.size() &&
17551 "Expected positive number of registers.");
17552 Entries.clear();
17553 // No need to check for the topmost gather node.
17554 if (TE == VectorizableTree.front().get() &&
17555 (!GatheredLoadsEntriesFirst.has_value() ||
17556 none_of(ArrayRef(VectorizableTree).drop_front(),
17557 [](const std::unique_ptr<TreeEntry> &TE) {
17558 return !TE->isGather();
17559 })))
17560 return {};
17561 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17562 // implemented yet.
17563 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17564 return {};
17565 Mask.assign(VL.size(), PoisonMaskElem);
17566 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17567 "Expected only single user of the gather node.");
17568 assert(VL.size() % NumParts == 0 &&
17569 "Number of scalars must be divisible by NumParts.");
17570 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17571 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17572 (TE->Idx == 0 ||
17573 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17574 isSplat(TE->Scalars) ||
17575 (TE->hasState() &&
17576 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17577 return {};
17578 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17579 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17580 for (unsigned Part : seq<unsigned>(NumParts)) {
17581 ArrayRef<Value *> SubVL =
17582 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17583 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17584 std::optional<TTI::ShuffleKind> SubRes =
17585 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17586 ForOrder);
17587 if (!SubRes)
17588 SubEntries.clear();
17589 Res.push_back(SubRes);
17590 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17591 SubEntries.front()->getVectorFactor() == VL.size() &&
17592 (SubEntries.front()->isSame(TE->Scalars) ||
17593 SubEntries.front()->isSame(VL))) {
17594 SmallVector<const TreeEntry *> LocalSubEntries;
17595 LocalSubEntries.swap(SubEntries);
17596 Entries.clear();
17597 Res.clear();
17598 std::iota(Mask.begin(), Mask.end(), 0);
17599 // Clear undef scalars.
17600 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17601 if (isa<PoisonValue>(VL[I]))
17602 Mask[I] = PoisonMaskElem;
17603 Entries.emplace_back(1, LocalSubEntries.front());
17604 Res.assign(1, TTI::SK_PermuteSingleSrc);
17605 return Res;
17606 }
17607 }
17608 if (all_of(Res,
17609 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17610 Entries.clear();
17611 return {};
17612 }
17613 return Res;
17614}
17615
17616InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17617 Type *ScalarTy) const {
17618 const unsigned VF = VL.size();
17619 auto *VecTy = getWidenedType(ScalarTy, VF);
17620 // Find the cost of inserting/extracting values from the vector.
17621 // Check if the same elements are inserted several times and count them as
17622 // shuffle candidates.
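// E.g. (illustrative): for VL = {%x, 7, %y, 7} with a non-poison source,
// the constants are gathered into a constant vector merged by one shuffle
// with mask <0, 5, 2, 7> (I + VF selects the constant operand), while %x
// and %y set DemandedElements bits 0 and 2 and are costed as inserts.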
17623 APInt DemandedElements = APInt::getZero(VF);
17624 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17625 InstructionCost Cost;
17626 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17627 DemandedElements.setBit(I);
17628 if (V->getType() != ScalarTy)
17629 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17630 TTI::CastContextHint::None, CostKind);
17631 };
17632 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17633 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17634 for (auto [I, V] : enumerate(VL)) {
17635 // No need to shuffle duplicates for constants.
17636 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17637 continue;
17638
17639 if (isConstant(V)) {
17640 ConstantShuffleMask[I] = I + VF;
17641 continue;
17642 }
17643 EstimateInsertCost(I, V);
17644 }
17645 // FIXME: add a cost for constant vector materialization.
17646 bool IsAnyNonUndefConst =
17647 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17648 // 1. Shuffle input source vector and constant vector.
17649 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17650 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
17651 ConstantShuffleMask);
17652 }
17653
17654 // 2. Insert unique non-constants.
17655 if (!DemandedElements.isZero())
17656 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17657 /*Insert=*/true,
17658 /*Extract=*/false, CostKind,
17659 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17660 return Cost;
17661}
17662
17663Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17664 auto It = EntryToLastInstruction.find(E);
17665 if (It != EntryToLastInstruction.end())
17666 return *cast<Instruction>(It->second);
17667 Instruction *Res = nullptr;
17668 // Get the basic block this bundle is in. All instructions in the bundle
17669 // should be in this block (except for extractelement-like instructions with
17670 // constant indices, gathered loads, or copyables).
17671 Instruction *Front;
17672 unsigned Opcode;
17673 if (E->hasState()) {
17674 Front = E->getMainOp();
17675 Opcode = E->getOpcode();
17676 } else {
17677 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17678 Opcode = Front->getOpcode();
17679 }
17680 auto *BB = Front->getParent();
17681 assert(
17682 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17683 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17684 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17685 all_of(E->Scalars,
17686 [=](Value *V) -> bool {
17687 if (Opcode == Instruction::GetElementPtr &&
17688 !isa<GetElementPtrInst>(V))
17689 return true;
17690 auto *I = dyn_cast<Instruction>(V);
17691 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17692 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17693 })) &&
17694 "Expected gathered loads or GEPs or instructions from same basic "
17695 "block.");
17696
17697 auto FindLastInst = [&]() {
17698 Instruction *LastInst = Front;
17699 for (Value *V : E->Scalars) {
17700 auto *I = dyn_cast<Instruction>(V);
17701 if (!I)
17702 continue;
17703 if (E->isCopyableElement(I))
17704 continue;
17705 if (LastInst->getParent() == I->getParent()) {
17706 if (LastInst->comesBefore(I))
17707 LastInst = I;
17708 continue;
17709 }
17710 assert(((Opcode == Instruction::GetElementPtr &&
17711 !isa<GetElementPtrInst>(I)) ||
17712 E->State == TreeEntry::SplitVectorize ||
17713 (isVectorLikeInstWithConstOps(LastInst) &&
17714 isVectorLikeInstWithConstOps(I)) ||
17715 (GatheredLoadsEntriesFirst.has_value() &&
17716 Opcode == Instruction::Load && E->isGather() &&
17717 E->Idx < *GatheredLoadsEntriesFirst)) &&
17718 "Expected vector-like or non-GEP in GEP node insts only.");
17719 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17720 LastInst = I;
17721 continue;
17722 }
17723 if (!DT->isReachableFromEntry(I->getParent()))
17724 continue;
17725 auto *NodeA = DT->getNode(LastInst->getParent());
17726 auto *NodeB = DT->getNode(I->getParent());
17727 assert(NodeA && "Should only process reachable instructions");
17728 assert(NodeB && "Should only process reachable instructions");
17729 assert((NodeA == NodeB) ==
17730 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17731 "Different nodes should have different DFS numbers");
17732 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17733 LastInst = I;
17734 }
17735 BB = LastInst->getParent();
17736 return LastInst;
17737 };
17738
17739 auto FindFirstInst = [&]() {
17740 Instruction *FirstInst = Front;
17741 for (Value *V : E->Scalars) {
17742 auto *I = dyn_cast<Instruction>(V);
17743 if (!I)
17744 continue;
17745 if (E->isCopyableElement(I))
17746 continue;
17747 if (FirstInst->getParent() == I->getParent()) {
17748 if (I->comesBefore(FirstInst))
17749 FirstInst = I;
17750 continue;
17751 }
17752 assert(((Opcode == Instruction::GetElementPtr &&
17753 !isa<GetElementPtrInst>(I)) ||
17754 (isVectorLikeInstWithConstOps(FirstInst) &&
17755 isVectorLikeInstWithConstOps(I))) &&
17756 "Expected vector-like or non-GEP in GEP node insts only.");
17757 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17758 FirstInst = I;
17759 continue;
17760 }
17761 if (!DT->isReachableFromEntry(I->getParent()))
17762 continue;
17763 auto *NodeA = DT->getNode(FirstInst->getParent());
17764 auto *NodeB = DT->getNode(I->getParent());
17765 assert(NodeA && "Should only process reachable instructions");
17766 assert(NodeB && "Should only process reachable instructions");
17767 assert((NodeA == NodeB) ==
17768 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17769 "Different nodes should have different DFS numbers");
17770 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17771 FirstInst = I;
17772 }
17773 return FirstInst;
17774 };
17775
17776 if (E->State == TreeEntry::SplitVectorize) {
17777 Res = FindLastInst();
17778 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17779 for (auto *E : Entries) {
17780 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17781 if (!I)
17782 I = &getLastInstructionInBundle(E);
17783 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17784 Res = I;
17785 }
17786 }
17787 EntryToLastInstruction.try_emplace(E, Res);
17788 return *Res;
17789 }
17790
17791 // Set the insert point for gathered loads to the very first load.
17792 if (GatheredLoadsEntriesFirst.has_value() &&
17793 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17794 Opcode == Instruction::Load) {
17795 Res = FindFirstInst();
17796 EntryToLastInstruction.try_emplace(E, Res);
17797 return *Res;
17798 }
17799
17800 // Set the insert point to the beginning of the basic block if the entry
17801 // should not be scheduled.
17802 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17803 if (E->isGather())
17804 return nullptr;
17805 // Found previously that the instructions do not need to be scheduled.
17806 const auto *It = BlocksSchedules.find(BB);
17807 if (It == BlocksSchedules.end())
17808 return nullptr;
17809 for (Value *V : E->Scalars) {
17810 auto *I = dyn_cast<Instruction>(V);
17811 if (!I || isa<PHINode>(I) ||
17812 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17813 continue;
17814 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17815 if (Bundles.empty())
17816 continue;
17817 const auto *It = find_if(
17818 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17819 if (It != Bundles.end())
17820 return *It;
17821 }
17822 return nullptr;
17823 };
17824 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17825 if (!E->isGather() && !Bundle) {
17826 if ((Opcode == Instruction::GetElementPtr &&
17827 any_of(E->Scalars,
17828 [](Value *V) {
17829 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17830 })) ||
17831 (all_of(E->Scalars,
17832 [&](Value *V) {
17833 return isa<PoisonValue>(V) ||
17834 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17835 E->isCopyableElement(V) ||
17836 (!isVectorLikeInstWithConstOps(V) &&
17837 isUsedOutsideBlock(V));
17838 }) &&
17839 (!E->doesNotNeedToSchedule() ||
17840 any_of(E->Scalars,
17841 [&](Value *V) {
17842 if (!isa<Instruction>(V) ||
17843 (E->hasCopyableElements() && E->isCopyableElement(V)))
17844 return false;
17845 return !areAllOperandsNonInsts(V);
17846 }) ||
17847 none_of(E->Scalars, [&](Value *V) {
17848 if (!isa<Instruction>(V) ||
17849 (E->hasCopyableElements() && E->isCopyableElement(V)))
17850 return false;
17851 return MustGather.contains(V);
17852 }))))
17853 Res = FindLastInst();
17854 else
17855 Res = FindFirstInst();
17856 EntryToLastInstruction.try_emplace(E, Res);
17857 return *Res;
17858 }
17859
17860 // Find the last instruction. The common case should be that BB has been
17861 // scheduled, and the last instruction is VL.back(). So we start with
17862 // VL.back() and iterate over schedule data until we reach the end of the
17863 // bundle. The end of the bundle is marked by null ScheduleData.
17864 if (Bundle) {
17865 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17866 Res = Bundle->getBundle().back()->getInst();
17867 EntryToLastInstruction.try_emplace(E, Res);
17868 return *Res;
17869 }
17870
17871 // Res can still be null at this point if there's either no entry
17872 // for BB in BlocksSchedules or there's no ScheduleData available for
17873 // VL.back(). This can be the case if buildTreeRec aborts for various
17874 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17875 // size is reached, etc.). ScheduleData is initialized in the scheduling
17876 // "dry-run".
17877 //
17878 // If this happens, we can still find the last instruction by brute force. We
17879 // iterate forwards from Front (inclusive) until we either see all
17880 // instructions in the bundle or reach the end of the block. If Front is the
17881 // last instruction in program order, LastInst will be set to Front, and we
17882 // will visit all the remaining instructions in the block.
17883 //
17884 // One of the reasons we exit early from buildTreeRec is to place an upper
17885 // bound on compile-time. Thus, taking an additional compile-time hit here is
17886 // not ideal. However, this should be exceedingly rare since it requires that
17887 // we both exit early from buildTreeRec and that the bundle be out-of-order
17888 // (causing us to iterate all the way to the end of the block).
17889 if (!Res)
17890 Res = FindLastInst();
17891 assert(Res && "Failed to find last instruction in bundle");
17892 EntryToLastInstruction.try_emplace(E, Res);
17893 return *Res;
17894}
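// Exposition only, not part of the pass: FindLastInst/FindFirstInst above
// order instructions from different blocks by dominator-tree DFS-in numbers,
// which gives a heuristic linearization of the CFG. A minimal sketch of that
// comparison, using a hypothetical helper name:
//
//   static Instruction *laterByDFSOrder(const DominatorTree &DT,
//                                       Instruction *X, Instruction *Y) {
//     if (X->getParent() == Y->getParent())
//       return X->comesBefore(Y) ? Y : X;
//     unsigned XN = DT.getNode(X->getParent())->getDFSNumIn();
//     unsigned YN = DT.getNode(Y->getParent())->getDFSNumIn();
//     // A larger DFS-in number means the block is visited later.
//     return XN < YN ? Y : X;
//   }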
17895
17896void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17897 auto *Front = E->getMainOp();
17898 Instruction *LastInst = &getLastInstructionInBundle(E);
17899 assert(LastInst && "Failed to find last instruction in bundle");
17900 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17901 // If the instruction is a PHI, set the insert point after all the PHIs.
17902 bool IsPHI = isa<PHINode>(LastInst);
17903 if (IsPHI) {
17904 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17905 if (LastInstIt != LastInst->getParent()->end() &&
17906 LastInstIt->getParent()->isLandingPad())
17907 LastInstIt = std::next(LastInstIt);
17908 }
17909 if (IsPHI ||
17910 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17911 (E->doesNotNeedToSchedule() ||
17912 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
17913 isUsedOutsideBlock(LastInst)))) ||
17914 (GatheredLoadsEntriesFirst.has_value() &&
17915 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17916 E->getOpcode() == Instruction::Load)) {
17917 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17918 } else {
17919 // Set the insertion point after the last instruction in the bundle. Set the
17920 // debug location to Front.
17921 Builder.SetInsertPoint(
17922 LastInst->getParent(),
17923 LastInst->getNextNode()->getIterator());
17924 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
17925 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
17926 } else {
17927 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
17928 PoisonValue::get(Builder.getPtrTy()),
17929 MaybeAlign());
17930 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
17931 eraseInstruction(Res);
17932 LastInstructionToPos.try_emplace(LastInst, Res);
17933 }
17934 }
17935 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17936}
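// Exposition only: the else-branch above that creates and immediately erases
// an aligned load of a poison pointer is a position-pinning trick, not code
// generation. The transient instruction exists solely so its iterator can be
// cached in LastInstructionToPos and reused as a stable insert point later:
//
//   // assuming an IRBuilderBase B already positioned after LastInst
//   Instruction *Pin = B.CreateAlignedLoad(
//       B.getPtrTy(), PoisonValue::get(B.getPtrTy()), MaybeAlign());
//   B.SetInsertPoint(Pin->getParent(), Pin->getIterator());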
17937
17938Value *BoUpSLP::gather(
17939 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17940 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17941 // List of instructions/lanes from current block and/or the blocks which are
17942 // part of the current loop. These instructions will be inserted at the end to
17943 // make it possible to optimize loops and hoist invariant instructions out of
17944 // the loop's body with better chances for success.
17945 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17946 SmallSet<int, 4> PostponedIndices;
17947 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17948 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17949 SmallPtrSet<BasicBlock *, 4> Visited;
17950 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17951 InsertBB = InsertBB->getSinglePredecessor();
17952 return InsertBB && InsertBB == InstBB;
17953 };
17954 for (int I = 0, E = VL.size(); I < E; ++I) {
17955 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17956 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17957 isVectorized(Inst) ||
17958 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17959 PostponedIndices.insert(I).second)
17960 PostponedInsts.emplace_back(Inst, I);
17961 }
17962
17963 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17964 Type *Ty) {
17965 Value *Scalar = V;
17966 if (Scalar->getType() != Ty) {
17967 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17968 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17969 Value *V = Scalar;
17970 if (auto *CI = dyn_cast<CastInst>(Scalar);
17971 isa_and_present<SExtInst, ZExtInst>(CI)) {
17972 Value *Op = CI->getOperand(0);
17973 if (auto *IOp = dyn_cast<Instruction>(Op);
17974 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17975 V = Op;
17976 }
17977 Scalar = Builder.CreateIntCast(
17978 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17979 }
17980
17981 Instruction *InsElt;
17982 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17983 assert(SLPReVec && "FixedVectorType is not expected.");
17984 Vec =
17985 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17986 auto *II = dyn_cast<Instruction>(Vec);
17987 if (!II)
17988 return Vec;
17989 InsElt = II;
17990 } else {
17991 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17992 InsElt = dyn_cast<InsertElementInst>(Vec);
17993 if (!InsElt)
17994 return Vec;
17995 }
17996 GatherShuffleExtractSeq.insert(InsElt);
17997 CSEBlocks.insert(InsElt->getParent());
17998 // Add to our 'need-to-extract' list.
17999 if (isa<Instruction>(V)) {
18000 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
18001 // Find which lane we need to extract.
18002 User *UserOp = nullptr;
18003 if (Scalar != V) {
18004 if (auto *SI = dyn_cast<Instruction>(Scalar))
18005 UserOp = SI;
18006 } else {
18007 if (V->getType()->isVectorTy()) {
18008 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
18009 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18010 // Find shufflevector, caused by resize.
18011 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18012 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
18013 if (SV->getOperand(0) == V)
18014 return SV;
18015 if (SV->getOperand(1) == V)
18016 return SV;
18017 }
18018 return nullptr;
18019 };
18020 InsElt = nullptr;
18021 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18022 InsElt = User;
18023 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18024 InsElt = User;
18025 assert(InsElt &&
18026 "Failed to find shufflevector, caused by resize.");
18027 }
18028 }
18029 UserOp = InsElt;
18030 }
18031 if (UserOp) {
18032 unsigned FoundLane = Entries.front()->findLaneForValue(V);
18033 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
18034 }
18035 }
18036 }
18037 return Vec;
18038 };
18039 auto *VecTy = getWidenedType(ScalarTy, VL.size());
18040 Value *Vec = PoisonValue::get(VecTy);
18041 SmallVector<int> NonConsts;
18042 SmallVector<int> Mask(VL.size());
18043 std::iota(Mask.begin(), Mask.end(), 0);
18044 Value *OriginalRoot = Root;
18045 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
18046 SV && isa<PoisonValue>(SV->getOperand(1)) &&
18047 SV->getOperand(0)->getType() == VecTy) {
18048 Root = SV->getOperand(0);
18049 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18050 }
18051 // Insert constant values first.
18052 for (int I = 0, E = VL.size(); I < E; ++I) {
18053 if (PostponedIndices.contains(I))
18054 continue;
18055 if (!isConstant(VL[I])) {
18056 NonConsts.push_back(I);
18057 continue;
18058 }
18059 if (isa<PoisonValue>(VL[I]))
18060 continue;
18061 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18062 Mask[I] = I + E;
18063 }
18064 if (Root) {
18065 if (isa<PoisonValue>(Vec)) {
18066 Vec = OriginalRoot;
18067 } else {
18068 Vec = CreateShuffle(Root, Vec, Mask);
18069 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
18070 OI && OI->use_empty() &&
18071 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18072 return TE->VectorizedValue == OI;
18073 }))
18074 eraseInstruction(OI);
18075 }
18076 }
18077 // Insert non-constant values.
18078 for (int I : NonConsts)
18079 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18080 // Append instructions, which are/may be part of the loop, at the end to make
18081 // it possible to hoist non-loop-based instructions.
18082 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18083 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18084
18085 return Vec;
18086}
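// Exposition only: with VL = {a, b, c, d}, no Root, and nothing postponed,
// gather() above degenerates to a plain insertelement chain:
//
//   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
//   %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
//   %v3 = insertelement <4 x i32> %v2, i32 %d, i32 3
//
// Constants are inserted first; instructions that live in the current loop
// are appended last so a loop-invariant prefix of the chain can be hoisted.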
18087
18088/// Merges shuffle masks and emits final shuffle instruction, if required. It
18089/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
18090/// when the actual shuffle instruction is generated only if this is actually
18091/// required. Otherwise, the shuffle instruction emission is delayed till the
18092/// end of the process, to reduce the number of emitted instructions and further
18093/// analysis/transformations.
18094/// The class will also look through the previously emitted shuffle instructions
18095/// and properly mark indices in mask as undef.
18096/// For example, given the code
18097/// \code
18098/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
18099/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
18100/// \endcode
18101/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
18102/// look through %s1 and %s2 and emit
18103/// \code
18104/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18105/// \endcode
18106/// instead.
18107/// If 2 operands are of different size, the smallest one will be resized and
18108/// the mask recalculated properly.
18109/// For example, given the code
18110/// \code
18111/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
18112/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
18113/// \endcode
18114/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
18115/// look through %s1 and %s2 and emit
18116/// \code
18117/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18118/// \endcode
18119/// instead.
18120class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
18121 bool IsFinalized = false;
18122 /// Combined mask for all applied operands and masks. It is built during
18123 /// analysis and actual emission of shuffle vector instructions.
18124 SmallVector<int> CommonMask;
18125 /// List of operands for the shuffle vector instruction. It holds at most 2
18126 /// operands; if a 3rd is going to be added, the first 2 are combined into a
18127 /// shuffle with the \p CommonMask mask, the first operand is set to be the
18128 /// resulting shuffle and the second operand is set to be the newly added
18129 /// operand. The \p CommonMask is transformed in the proper way after that.
18130 SmallVector<Value *, 2> InVectors;
18131 IRBuilderBase &Builder;
18132 BoUpSLP &R;
18133
18134 class ShuffleIRBuilder {
18135 IRBuilderBase &Builder;
18136 /// Holds all of the instructions that we gathered.
18137 SetVector<Instruction *> &GatherShuffleExtractSeq;
18138 /// A list of blocks that we are going to CSE.
18139 DenseSet<BasicBlock *> &CSEBlocks;
18140 /// Data layout.
18141 const DataLayout &DL;
18142
18143 public:
18144 ShuffleIRBuilder(IRBuilderBase &Builder,
18145 SetVector<Instruction *> &GatherShuffleExtractSeq,
18146 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
18147 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18148 CSEBlocks(CSEBlocks), DL(DL) {}
18149 ~ShuffleIRBuilder() = default;
18150 /// Creates shufflevector for the 2 operands with the given mask.
18151 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
18152 if (V1->getType() != V2->getType()) {
18153 assert(V1->getType()->isIntOrIntVectorTy() &&
18154 V2->getType()->isIntOrIntVectorTy() &&
18155 "Expected integer vector types only.");
18156 if (V1->getType() != V2->getType()) {
18157 if (cast<VectorType>(V2->getType())
18158 ->getElementType()
18159 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
18160 ->getElementType()
18161 ->getIntegerBitWidth())
18162 V2 = Builder.CreateIntCast(
18163 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
18164 else
18165 V1 = Builder.CreateIntCast(
18166 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
18167 }
18168 }
18169 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18170 if (auto *I = dyn_cast<Instruction>(Vec)) {
18171 GatherShuffleExtractSeq.insert(I);
18172 CSEBlocks.insert(I->getParent());
18173 }
18174 return Vec;
18175 }
18176 /// Creates a permutation of the single vector operand with the given mask,
18177 /// if it is not an identity mask.
18178 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18179 if (Mask.empty())
18180 return V1;
18181 unsigned VF = Mask.size();
18182 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18183 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18184 return V1;
18185 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18186 if (auto *I = dyn_cast<Instruction>(Vec)) {
18187 GatherShuffleExtractSeq.insert(I);
18188 CSEBlocks.insert(I->getParent());
18189 }
18190 return Vec;
18191 }
18192 Value *createIdentity(Value *V) { return V; }
18193 Value *createPoison(Type *Ty, unsigned VF) {
18194 return PoisonValue::get(getWidenedType(Ty, VF));
18195 }
18196 /// Resizes the 2 input vectors to match in size, if they are not equal
18197 /// yet. The smaller vector is resized to the size of the larger vector.
18198 void resizeToMatch(Value *&V1, Value *&V2) {
18199 if (V1->getType() == V2->getType())
18200 return;
18201 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18202 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18203 int VF = std::max(V1VF, V2VF);
18204 int MinVF = std::min(V1VF, V2VF);
18205 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18206 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18207 0);
18208 Value *&Op = MinVF == V1VF ? V1 : V2;
18209 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18210 if (auto *I = dyn_cast<Instruction>(Op)) {
18211 GatherShuffleExtractSeq.insert(I);
18212 CSEBlocks.insert(I->getParent());
18213 }
18214 if (MinVF == V1VF)
18215 V1 = Op;
18216 else
18217 V2 = Op;
18218 }
18219 };
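// Exposition only: resizeToMatch pads the narrower operand with poison lanes
// through an identity-prefix shuffle, e.g. widening <2 x i32> %v to 4 lanes:
//
//   %wide = shufflevector <2 x i32> %v, <2 x i32> poison,
//                         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>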
18220
18221 /// Smart shuffle instruction emission, walks through shuffle trees and
18222 /// tries to find the best matching vector for the actual shuffle
18223 /// instruction.
18224 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18225 assert(V1 && "Expected at least one vector value.");
18226 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18227 R.CSEBlocks, *R.DL);
18228 return BaseShuffleAnalysis::createShuffle<Value *>(
18229 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18230 }
18231
18232 /// Cast value \p V to the vector type with the same number of elements, but
18233 /// the base type \p ScalarTy.
18234 Value *castToScalarTyElem(Value *V,
18235 std::optional<bool> IsSigned = std::nullopt) {
18236 auto *VecTy = cast<VectorType>(V->getType());
18237 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18238 if (VecTy->getElementType() == ScalarTy->getScalarType())
18239 return V;
18240 return Builder.CreateIntCast(
18241 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18242 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18243 }
18244
18245 Value *getVectorizedValue(const TreeEntry &E) {
18246 Value *Vec = E.VectorizedValue;
18247 if (!Vec->getType()->isIntOrIntVectorTy())
18248 return Vec;
18249 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18250 return !isa<PoisonValue>(V) &&
18251 !isKnownNonNegative(
18252 V, SimplifyQuery(*R.DL));
18253 }));
18254 }
18255
18256public:
18257 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18258 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18259
18260 /// Adjusts extractelements after reusing them.
18261 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18262 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18263 unsigned NumParts, bool &UseVecBaseAsInput) {
18264 UseVecBaseAsInput = false;
18265 SmallPtrSet<Value *, 4> UniqueBases;
18266 Value *VecBase = nullptr;
18267 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18268 if (!E->ReorderIndices.empty()) {
18269 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18270 E->ReorderIndices.end());
18271 reorderScalars(VL, ReorderMask);
18272 }
18273 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18274 int Idx = Mask[I];
18275 if (Idx == PoisonMaskElem)
18276 continue;
18277 auto *EI = cast<ExtractElementInst>(VL[I]);
18278 VecBase = EI->getVectorOperand();
18279 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18280 VecBase = TEs.front()->VectorizedValue;
18281 assert(VecBase && "Expected vectorized value.");
18282 UniqueBases.insert(VecBase);
18283 // If the only use is vectorized, we can delete the extractelement
18284 // itself.
18285 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18286 (NumParts != 1 && count(VL, EI) > 1) ||
18287 any_of(EI->users(), [&](User *U) {
18288 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18289 return UTEs.empty() || UTEs.size() > 1 ||
18290 (isa<GetElementPtrInst>(U) &&
18291 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18292 (!UTEs.empty() &&
18293 count_if(R.VectorizableTree,
18294 [&](const std::unique_ptr<TreeEntry> &TE) {
18295 return TE->UserTreeIndex.UserTE ==
18296 UTEs.front() &&
18297 is_contained(VL, EI);
18298 }) != 1);
18299 }))
18300 continue;
18301 R.eraseInstruction(EI);
18302 }
18303 if (NumParts == 1 || UniqueBases.size() == 1) {
18304 assert(VecBase && "Expected vectorized value.");
18305 return castToScalarTyElem(VecBase);
18306 }
18307 UseVecBaseAsInput = true;
18308 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18309 for (auto [I, Idx] : enumerate(Mask))
18310 if (Idx != PoisonMaskElem)
18311 Idx = I;
18312 };
18313 // Perform multi-register vector shuffle, joining them into a single virtual
18314 // long vector.
18315 // Need to shuffle each part independently and then insert all these parts
18316 // into a long virtual vector register, forming the original vector.
18317 Value *Vec = nullptr;
18318 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18319 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18320 for (unsigned Part : seq<unsigned>(NumParts)) {
18321 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18322 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18323 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18324 constexpr int MaxBases = 2;
18325 SmallVector<Value *, MaxBases> Bases(MaxBases);
18326 auto VLMask = zip(SubVL, SubMask);
18327 const unsigned VF = std::accumulate(
18328 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18329 if (std::get<1>(D) == PoisonMaskElem)
18330 return S;
18331 Value *VecOp =
18332 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18333 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18334 !TEs.empty())
18335 VecOp = TEs.front()->VectorizedValue;
18336 assert(VecOp && "Expected vectorized value.");
18337 const unsigned Size =
18338 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18339 return std::max(S, Size);
18340 });
18341 for (const auto [V, I] : VLMask) {
18342 if (I == PoisonMaskElem)
18343 continue;
18344 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18345 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18346 VecOp = TEs.front()->VectorizedValue;
18347 assert(VecOp && "Expected vectorized value.");
18348 VecOp = castToScalarTyElem(VecOp);
18349 Bases[I / VF] = VecOp;
18350 }
18351 if (!Bases.front())
18352 continue;
18353 Value *SubVec;
18354 if (Bases.back()) {
18355 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18356 TransformToIdentity(SubMask);
18357 } else {
18358 SubVec = Bases.front();
18359 }
18360 if (!Vec) {
18361 Vec = SubVec;
18362 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18363 [&](unsigned P) {
18364 ArrayRef<int> SubMask =
18365 Mask.slice(P * SliceSize,
18366 getNumElems(Mask.size(),
18367 SliceSize, P));
18368 return all_of(SubMask, [](int Idx) {
18369 return Idx == PoisonMaskElem;
18370 });
18371 })) &&
18372 "Expected first part or all previous parts masked.");
18373 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18374 } else {
18375 unsigned NewVF =
18376 cast<FixedVectorType>(Vec->getType())->getNumElements();
18377 if (Vec->getType() != SubVec->getType()) {
18378 unsigned SubVecVF =
18379 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18380 NewVF = std::max(NewVF, SubVecVF);
18381 }
18382 // Adjust SubMask.
18383 for (int &Idx : SubMask)
18384 if (Idx != PoisonMaskElem)
18385 Idx += NewVF;
18386 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18387 Vec = createShuffle(Vec, SubVec, VecMask);
18388 TransformToIdentity(VecMask);
18389 }
18390 }
18391 copy(VecMask, Mask.begin());
18392 return Vec;
18393 }
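// Exposition only: for NumParts > 1 the loop above shuffles every slice from
// its own (at most two) source registers and then chains the slices into one
// long virtual register, conceptually:
//
//   %part0  = shufflevector %A, %B, <slice-0 mask, remaining lanes poison>
//   %joined = shufflevector %part0, %part1, <join mask, part1 lanes + NewVF>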
18394 /// Checks if the specified entry \p E needs to be delayed because of its
18395 /// dependency nodes.
18396 std::optional<Value *>
18397 needToDelay(const TreeEntry *E,
18398 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18399 // No need to delay emission if all deps are ready.
18400 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18401 return all_of(
18402 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18403 }))
18404 return std::nullopt;
18405 // Postpone gather emission, will be emitted after the end of the
18406 // process to keep correct order.
18407 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18408 return Builder.CreateAlignedLoad(
18409 ResVecTy,
18410 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18411 MaybeAlign());
18412 }
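// Exposition only: the value returned above in the delayed case is merely a
// correctly typed placeholder, conceptually
//
//   %placeholder = load <VF x Ty>, ptr poison
//
// The node is queued in PostponedGathers, and the placeholder is replaced by
// the real gather sequence once all dependencies have been vectorized.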
18413 /// Reset the builder to handle a perfect diamond match.
18414 void resetForSameNode() {
18415 IsFinalized = false;
18416 CommonMask.clear();
18417 InVectors.clear();
18418 }
18419 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18420 /// shuffling.
18421 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18422 Value *V1 = getVectorizedValue(E1);
18423 Value *V2 = getVectorizedValue(E2);
18424 add(V1, V2, Mask);
18425 }
18426 /// Adds single input vector (in form of tree entry) and the mask for its
18427 /// shuffling.
18428 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18429 Value *V1 = getVectorizedValue(E1);
18430 add(V1, Mask);
18431 }
18432 /// Adds 2 input vectors and the mask for their shuffling.
18433 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18434 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18435 assert(isa<FixedVectorType>(V1->getType()) &&
18436 isa<FixedVectorType>(V2->getType()) &&
18437 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18438 V1 = castToScalarTyElem(V1);
18439 V2 = castToScalarTyElem(V2);
18440 if (InVectors.empty()) {
18441 InVectors.push_back(V1);
18442 InVectors.push_back(V2);
18443 CommonMask.assign(Mask.begin(), Mask.end());
18444 return;
18445 }
18446 Value *Vec = InVectors.front();
18447 if (InVectors.size() == 2) {
18448 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18449 transformMaskAfterShuffle(CommonMask, CommonMask);
18450 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18451 Mask.size()) {
18452 Vec = createShuffle(Vec, nullptr, CommonMask);
18453 transformMaskAfterShuffle(CommonMask, CommonMask);
18454 }
18455 V1 = createShuffle(V1, V2, Mask);
18456 unsigned VF = std::max(getVF(V1), getVF(Vec));
18457 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18458 if (Mask[Idx] != PoisonMaskElem)
18459 CommonMask[Idx] = Idx + VF;
18460 InVectors.front() = Vec;
18461 if (InVectors.size() == 2)
18462 InVectors.back() = V1;
18463 else
18464 InVectors.push_back(V1);
18465 }
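// Exposition only: when a new pair (V1, V2) is folded into already
// accumulated operands, the pair is pre-shuffled into one value and each lane
// defined by the incoming Mask is redirected to it by storing Idx + VF into
// CommonMask. E.g. with VF = 4:
//
//   CommonMask before: <0, poison, 2, poison>
//   Mask (new pair):   <poison, 1, poison, 7>
//   CommonMask after:  <0, 5, 2, 7>  ; lanes 1 and 3 now read the new vector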
18466 /// Adds one more input vector and the mask for its shuffling.
18467 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18468 assert(isa<FixedVectorType>(V1->getType()) &&
18469 "castToScalarTyElem expects V1 to be FixedVectorType");
18470 V1 = castToScalarTyElem(V1);
18471 if (InVectors.empty()) {
18472 InVectors.push_back(V1);
18473 CommonMask.assign(Mask.begin(), Mask.end());
18474 return;
18475 }
18476 const auto *It = find(InVectors, V1);
18477 if (It == InVectors.end()) {
18478 if (InVectors.size() == 2 ||
18479 InVectors.front()->getType() != V1->getType()) {
18480 Value *V = InVectors.front();
18481 if (InVectors.size() == 2) {
18482 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18483 transformMaskAfterShuffle(CommonMask, CommonMask);
18484 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18485 CommonMask.size()) {
18486 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18487 transformMaskAfterShuffle(CommonMask, CommonMask);
18488 }
18489 unsigned VF = std::max(CommonMask.size(), Mask.size());
18490 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18491 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18492 CommonMask[Idx] = V->getType() != V1->getType()
18493 ? Idx + VF
18494 : Mask[Idx] + getVF(V1);
18495 if (V->getType() != V1->getType())
18496 V1 = createShuffle(V1, nullptr, Mask);
18497 InVectors.front() = V;
18498 if (InVectors.size() == 2)
18499 InVectors.back() = V1;
18500 else
18501 InVectors.push_back(V1);
18502 return;
18503 }
18504 // Add the second vector only if the incoming mask uses elements that are
18505 // not already used from the first one.
18506 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18507 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18508 InVectors.push_back(V1);
18509 break;
18510 }
18511 }
18512 unsigned VF = 0;
18513 for (Value *V : InVectors)
18514 VF = std::max(VF, getVF(V));
18515 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18516 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18517 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18518 }
18519 /// Adds one more input vector and the mask for its shuffling.
18520 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18521 SmallVector<int> NewMask;
18522 inversePermutation(Order, NewMask);
18523 add(V1, NewMask);
18524 }
18525 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18526 Value *Root = nullptr) {
18527 return R.gather(VL, Root, ScalarTy,
18528 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18529 return createShuffle(V1, V2, Mask);
18530 });
18531 }
18532 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18533 /// Finalize emission of the shuffles.
18534 /// \param Action the action (if any) to be performed before final applying of
18535 /// the \p ExtMask mask.
18536 Value *finalize(
18537 ArrayRef<int> ExtMask,
18538 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18539 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18540 function_ref<void(Value *&, SmallVectorImpl<int> &,
18541 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18542 Action = {}) {
18543 IsFinalized = true;
18544 if (Action) {
18545 Value *Vec = InVectors.front();
18546 if (InVectors.size() == 2) {
18547 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18548 InVectors.pop_back();
18549 } else {
18550 Vec = createShuffle(Vec, nullptr, CommonMask);
18551 }
18552 transformMaskAfterShuffle(CommonMask, CommonMask);
18553 assert(VF > 0 &&
18554 "Expected vector length for the final value before action.");
18555 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18556 if (VecVF < VF) {
18557 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18558 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18559 Vec = createShuffle(Vec, nullptr, ResizeMask);
18560 }
18561 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18562 return createShuffle(V1, V2, Mask);
18563 });
18564 InVectors.front() = Vec;
18565 }
18566 if (!SubVectors.empty()) {
18567 Value *Vec = InVectors.front();
18568 if (InVectors.size() == 2) {
18569 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18570 InVectors.pop_back();
18571 } else {
18572 Vec = createShuffle(Vec, nullptr, CommonMask);
18573 }
18574 transformMaskAfterShuffle(CommonMask, CommonMask);
18575 auto CreateSubVectors = [&](Value *Vec,
18576 SmallVectorImpl<int> &CommonMask) {
18577 for (auto [E, Idx] : SubVectors) {
18578 Value *V = getVectorizedValue(*E);
18579 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18580 // Use the scalar version of ScalarTy to correctly handle shuffles
18581 // for revectorization. The revectorization mode operates on the
18582 // vectors, but here we need to operate on the scalars, because the
18583 // masks were already transformed for the vector elements and we don't
18584 // need to do this transformation again.
18585 Type *OrigScalarTy = ScalarTy;
18586 ScalarTy = ScalarTy->getScalarType();
18587 Vec = createInsertVector(
18588 Builder, Vec, V, InsertionIndex,
18589 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18590 _3));
18591 ScalarTy = OrigScalarTy;
18592 if (!CommonMask.empty()) {
18593 std::iota(std::next(CommonMask.begin(), Idx),
18594 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18595 Idx);
18596 }
18597 }
18598 return Vec;
18599 };
18600 if (SubVectorsMask.empty()) {
18601 Vec = CreateSubVectors(Vec, CommonMask);
18602 } else {
18603 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18604 copy(SubVectorsMask, SVMask.begin());
18605 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18606 if (I2 != PoisonMaskElem) {
18607 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18608 I1 = I2 + CommonMask.size();
18609 }
18610 }
18611 Value *InsertVec =
18612 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18613 Vec = createShuffle(InsertVec, Vec, SVMask);
18614 transformMaskAfterShuffle(CommonMask, SVMask);
18615 }
18616 InVectors.front() = Vec;
18617 }
18618
18619 if (!ExtMask.empty()) {
18620 if (CommonMask.empty()) {
18621 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18622 } else {
18623 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18624 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18625 if (ExtMask[I] == PoisonMaskElem)
18626 continue;
18627 NewMask[I] = CommonMask[ExtMask[I]];
18628 }
18629 CommonMask.swap(NewMask);
18630 }
18631 }
18632 if (CommonMask.empty()) {
18633 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18634 return InVectors.front();
18635 }
18636 if (InVectors.size() == 2)
18637 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18638 return createShuffle(InVectors.front(), nullptr, CommonMask);
18639 }
18640
18641 ~ShuffleInstructionBuilder() {
18642 assert((IsFinalized || CommonMask.empty()) &&
18643 "Shuffle construction must be finalized.");
18644 }
18645};
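// Exposition only: a typical use of ShuffleInstructionBuilder, assuming two
// already vectorized tree entries E1 and E2 and their combined Mask:
//
//   ShuffleInstructionBuilder SB(ScalarTy, Builder, R);
//   SB.add(E1, E2, Mask);  // lazily records the operands and the mask
//   Value *Vec = SB.finalize(E->ReuseShuffleIndices, {}, {});
//
// No shufflevector is emitted until finalize() proves one is actually needed;
// an identity CommonMask over a single operand returns that operand as-is.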
18646
18647Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18648 return vectorizeTree(getOperandEntry(E, NodeIdx));
18649}
18650
18651template <typename BVTy, typename ResTy, typename... Args>
18652ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18653 Args &...Params) {
18654 assert(E->isGather() && "Expected gather node.");
18655 unsigned VF = E->getVectorFactor();
18656
18657 bool NeedFreeze = false;
18658 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18659 // Clear values, to be replaced by insertvector instructions.
18660 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18661 for_each(MutableArrayRef(GatheredScalars)
18662 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18663 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18664 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18665 E->CombinedEntriesWithIndices.size());
18666 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18667 [&](const auto &P) {
18668 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18669 });
18670 // Build a mask out of the reorder indices and reorder scalars per this
18671 // mask.
18672 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18673 E->ReorderIndices.end());
18674 if (!ReorderMask.empty())
18675 reorderScalars(GatheredScalars, ReorderMask);
18676 SmallVector<int> SubVectorsMask;
18677 inversePermutation(E->ReorderIndices, SubVectorsMask);
18678 // Transform non-clustered elements in the mask to poison (-1).
18679 // "Clustered" operations will be reordered using this mask later.
18680 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18681 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18682 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18683 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18684 } else {
18685 SubVectorsMask.clear();
18686 }
18687 SmallVector<Value *> StoredGS(GatheredScalars);
18688 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18689 unsigned I, unsigned SliceSize,
18690 bool IsNotPoisonous) {
18691 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18692 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18693 }))
18694 return false;
18695 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18696 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18697 if (UserTE->getNumOperands() != 2)
18698 return false;
18699 if (!IsNotPoisonous) {
18700 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18701 [=](const std::unique_ptr<TreeEntry> &TE) {
18702 return TE->UserTreeIndex.UserTE == UserTE &&
18703 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18704 });
18705 if (It == VectorizableTree.end())
18706 return false;
18707 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18708 if (!(*It)->ReorderIndices.empty()) {
18709 inversePermutation((*It)->ReorderIndices, ReorderMask);
18710 reorderScalars(GS, ReorderMask);
18711 }
18712 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18713 Value *V0 = std::get<0>(P);
18714 Value *V1 = std::get<1>(P);
18715 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18716 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18717 is_contained(E->Scalars, V1));
18718 }))
18719 return false;
18720 }
18721 int Idx;
18722 if ((Mask.size() < InputVF &&
18723 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18724 Idx == 0) ||
18725 (Mask.size() == InputVF &&
18726 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18727 std::iota(
18728 std::next(Mask.begin(), I * SliceSize),
18729 std::next(Mask.begin(),
18730 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18731 0);
18732 } else {
18733 unsigned IVal =
18734 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18735 std::fill(
18736 std::next(Mask.begin(), I * SliceSize),
18737 std::next(Mask.begin(),
18738 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18739 IVal);
18740 }
18741 return true;
18742 };
18743 BVTy ShuffleBuilder(ScalarTy, Params...);
18744 ResTy Res = ResTy();
18745 SmallVector<int> Mask;
18746 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18747 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18748 Value *ExtractVecBase = nullptr;
18749 bool UseVecBaseAsInput = false;
18750 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18751 SmallVector<SmallVector<const TreeEntry *>> Entries;
18752 Type *OrigScalarTy = GatheredScalars.front()->getType();
18753 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18754 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18755 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18756 // Check for gathered extracts.
18757 bool Resized = false;
18758 ExtractShuffles =
18759 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18760 if (!ExtractShuffles.empty()) {
18761 SmallVector<const TreeEntry *> ExtractEntries;
18762 for (auto [Idx, I] : enumerate(ExtractMask)) {
18763 if (I == PoisonMaskElem)
18764 continue;
18765 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18766 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18767 !TEs.empty())
18768 ExtractEntries.append(TEs.begin(), TEs.end());
18769 }
18770 if (std::optional<ResTy> Delayed =
18771 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18772 // Delay emission of gathers which are not ready yet.
18773 PostponedGathers.insert(E);
18774 // Postpone gather emission, will be emitted after the end of the
18775 // process to keep correct order.
18776 return *Delayed;
18777 }
18778 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18779 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18780 ExtractVecBase = VecBase;
18781 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18782 if (VF == VecBaseTy->getNumElements() &&
18783 GatheredScalars.size() != VF) {
18784 Resized = true;
18785 GatheredScalars.append(VF - GatheredScalars.size(),
18786 PoisonValue::get(OrigScalarTy));
18787 NumParts =
18788 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18789 }
18790 }
18791 }
18792 // Gather extracts after we check for fully-matched gathers only.
18793 if (!ExtractShuffles.empty() || !E->hasState() ||
18794 E->getOpcode() != Instruction::Load ||
18795 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18796 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18797 any_of(E->Scalars,
18798 [this](Value *V) {
18799 return isa<LoadInst>(V) && isVectorized(V);
18800 })) ||
18801 (E->hasState() && E->isAltShuffle()) ||
18802 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18803 isSplat(E->Scalars) ||
18804 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18805 GatherShuffles =
18806 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18807 }
18808 if (!GatherShuffles.empty()) {
18809 if (std::optional<ResTy> Delayed =
18810 ShuffleBuilder.needToDelay(E, Entries)) {
18811 // Delay emission of gathers which are not ready yet.
18812 PostponedGathers.insert(E);
18813 // Postpone gather emission, will be emitted after the end of the
18814 // process to keep correct order.
18815 return *Delayed;
18816 }
18817 if (GatherShuffles.size() == 1 &&
18818 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18819 Entries.front().front()->isSame(E->Scalars)) {
18820 // Perfect match in the graph, will reuse the previously vectorized
18821 // node. Cost is 0.
18822 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18823 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18824 // Restore the mask for previous partially matched values.
18825 Mask.resize(E->Scalars.size());
18826 const TreeEntry *FrontTE = Entries.front().front();
18827 if (FrontTE->ReorderIndices.empty() &&
18828 ((FrontTE->ReuseShuffleIndices.empty() &&
18829 E->Scalars.size() == FrontTE->Scalars.size()) ||
18830 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18831 std::iota(Mask.begin(), Mask.end(), 0);
18832 } else {
18833 for (auto [I, V] : enumerate(E->Scalars)) {
18834 if (isa<PoisonValue>(V)) {
18835 Mask[I] = PoisonMaskElem;
18836 continue;
18837 }
18838 Mask[I] = FrontTE->findLaneForValue(V);
18839 }
18840 }
18841 // Reset the builder(s) to correctly handle perfect diamond matched
18842 // nodes.
18843 ShuffleBuilder.resetForSameNode();
18844 ShuffleBuilder.add(*FrontTE, Mask);
18845 // Full matched entry found, no need to insert subvectors.
18846 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18847 return Res;
18848 }
18849 if (!Resized) {
18850 if (GatheredScalars.size() != VF &&
18851 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18852 return any_of(TEs, [&](const TreeEntry *TE) {
18853 return TE->getVectorFactor() == VF;
18854 });
18855 }))
18856 GatheredScalars.append(VF - GatheredScalars.size(),
18857 PoisonValue::get(OrigScalarTy));
18858 }
18859 // Remove shuffled elements from list of gathers.
18860 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18861 if (Mask[I] != PoisonMaskElem)
18862 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18863 }
18864 }
18865 }
18866 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18867 SmallVectorImpl<int> &ReuseMask,
18868 bool IsRootPoison) {
18869 // For splats we can emit broadcasts instead of gathers, so try to find
18870 // such sequences.
18871 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18872 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18873 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18874 SmallVector<int> UndefPos;
18875 DenseMap<Value *, unsigned> UniquePositions;
18876 // Gather unique non-const values and all constant values.
18877 // For repeated values, just shuffle them.
18878 int NumNonConsts = 0;
18879 int SinglePos = 0;
18880 for (auto [I, V] : enumerate(Scalars)) {
18881 if (isa<UndefValue>(V)) {
18882 if (!isa<PoisonValue>(V)) {
18883 ReuseMask[I] = I;
18884 UndefPos.push_back(I);
18885 }
18886 continue;
18887 }
18888 if (isConstant(V)) {
18889 ReuseMask[I] = I;
18890 continue;
18891 }
18892 ++NumNonConsts;
18893 SinglePos = I;
18894 Value *OrigV = V;
18895 Scalars[I] = PoisonValue::get(OrigScalarTy);
18896 if (IsSplat) {
18897 Scalars.front() = OrigV;
18898 ReuseMask[I] = 0;
18899 } else {
18900 const auto Res = UniquePositions.try_emplace(OrigV, I);
18901 Scalars[Res.first->second] = OrigV;
18902 ReuseMask[I] = Res.first->second;
18903 }
18904 }
18905 if (NumNonConsts == 1) {
18906 // Restore single insert element.
18907 if (IsSplat) {
18908 ReuseMask.assign(VF, PoisonMaskElem);
18909 std::swap(Scalars.front(), Scalars[SinglePos]);
18910 if (!UndefPos.empty() && UndefPos.front() == 0)
18911 Scalars.front() = UndefValue::get(OrigScalarTy);
18912 }
18913 ReuseMask[SinglePos] = SinglePos;
18914 } else if (!UndefPos.empty() && IsSplat) {
18915 // For undef values, try to replace them with a simple broadcast.
18916 // We can do it if the broadcasted value is guaranteed to be
18917 // non-poisonous, or by freezing the incoming scalar value first.
18918 auto *It = find_if(Scalars, [this, E](Value *V) {
18919 return !isa<UndefValue>(V) &&
18920 (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
18921 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18922 // Check if the value is already used in the same operation
18923 // in one of the nodes.
18924 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18925 is_contained(E->UserTreeIndex.UserTE->Scalars,
18926 U.getUser());
18927 })));
18928 });
18929 if (It != Scalars.end()) {
18930 // Replace undefs by the non-poisoned scalars and emit broadcast.
18931 int Pos = std::distance(Scalars.begin(), It);
18932 for (int I : UndefPos) {
18933 // Set the undef position to the non-poisoned scalar.
18934 ReuseMask[I] = Pos;
18935 // Replace the undef by the poison, in the mask it is replaced by
18936 // non-poisoned scalar already.
18937 if (I != Pos)
18938 Scalars[I] = PoisonValue::get(OrigScalarTy);
18939 }
18940 } else {
18941 // Replace undefs by the poisons, emit broadcast and then emit
18942 // freeze.
18943 for (int I : UndefPos) {
18944 ReuseMask[I] = PoisonMaskElem;
18945 if (isa<UndefValue>(Scalars[I]))
18946 Scalars[I] = PoisonValue::get(OrigScalarTy);
18947 }
18948 NeedFreeze = true;
18949 }
18950 }
18951 };
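// Exposition only: for a near-splat such as Scalars = {a, undef, a, a} the
// lambda above rewrites Scalars to {a, poison, poison, poison} with
// ReuseMask = <0, 0, 0, 0>, so one insertelement plus one broadcast shuffle
//
//   %b  = insertelement <4 x i32> poison, i32 %a, i32 0
//   %bs = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer
//
// replaces three redundant inserts. Undef lanes either reuse a lane proven
// non-poisonous or set NeedFreeze so the final vector is frozen.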
18952 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18953 bool IsNonPoisoned = true;
18954 bool IsUsedInExpr = true;
18955 Value *Vec1 = nullptr;
18956 if (!ExtractShuffles.empty()) {
18957 // Gather of extractelements can be represented as just a shuffle of
18958 // a single/two vectors the scalars are extracted from.
18959 // Find input vectors.
18960 Value *Vec2 = nullptr;
18961 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18962 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18963 ExtractMask[I] = PoisonMaskElem;
18964 }
18965 if (UseVecBaseAsInput) {
18966 Vec1 = ExtractVecBase;
18967 } else {
18968 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18969 if (ExtractMask[I] == PoisonMaskElem)
18970 continue;
18971 if (isa<UndefValue>(StoredGS[I]))
18972 continue;
18973 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18974 Value *VecOp = EI->getVectorOperand();
18975 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18976 !TEs.empty() && TEs.front()->VectorizedValue)
18977 VecOp = TEs.front()->VectorizedValue;
18978 if (!Vec1) {
18979 Vec1 = VecOp;
18980 } else if (Vec1 != VecOp) {
18981 assert((!Vec2 || Vec2 == VecOp) &&
18982 "Expected only 1 or 2 vectors shuffle.");
18983 Vec2 = VecOp;
18984 }
18985 }
18986 }
18987 if (Vec2) {
18988 IsUsedInExpr = false;
18989 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18990 isGuaranteedNotToBePoison(Vec2, AC);
18991 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18992 } else if (Vec1) {
18993 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18994 IsUsedInExpr &= FindReusedSplat(
18995 ExtractMask,
18996 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18997 ExtractMask.size(), IsNotPoisonedVec);
18998 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18999 IsNonPoisoned &= IsNotPoisonedVec;
19000 } else {
19001 IsUsedInExpr = false;
19002 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
19003 /*ForExtracts=*/true);
19004 }
19005 }
19006 if (!GatherShuffles.empty()) {
19007 unsigned SliceSize =
19008 getPartNumElems(E->Scalars.size(),
19009 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
19010 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19011 for (const auto [I, TEs] : enumerate(Entries)) {
19012 if (TEs.empty()) {
19013 assert(!GatherShuffles[I] &&
19014 "No shuffles with empty entries list expected.");
19015 continue;
19016 }
19017 assert((TEs.size() == 1 || TEs.size() == 2) &&
19018 "Expected shuffle of 1 or 2 entries.");
19019 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
19020 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
19021 VecMask.assign(VecMask.size(), PoisonMaskElem);
19022 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19023 if (TEs.size() == 1) {
19024 bool IsNotPoisonedVec =
19025 TEs.front()->VectorizedValue
19026 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
19027 : true;
19028 IsUsedInExpr &=
19029 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19030 SliceSize, IsNotPoisonedVec);
19031 ShuffleBuilder.add(*TEs.front(), VecMask);
19032 IsNonPoisoned &= IsNotPoisonedVec;
19033 } else {
19034 IsUsedInExpr = false;
19035 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19036 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19037 IsNonPoisoned &=
19038 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
19039 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
19040 }
19041 }
19042 }
19043 // Try to figure out the best way to combine values: build a shuffle and insert
19044 // elements or just build several shuffles.
19045 // Insert non-constant scalars.
19046 SmallVector<Value *> NonConstants(GatheredScalars);
19047 int EMSz = ExtractMask.size();
19048 int MSz = Mask.size();
19049 // Try to build a constant vector and shuffle with it only if currently we
19050 // have a single permutation and more than one scalar constant.
19051 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19052 bool IsIdentityShuffle =
19053 ((UseVecBaseAsInput ||
19054 all_of(ExtractShuffles,
19055 [](const std::optional<TTI::ShuffleKind> &SK) {
19056 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19057 TTI::SK_PermuteSingleSrc;
19058 })) &&
19059 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19060 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
19061 (!GatherShuffles.empty() &&
19062 all_of(GatherShuffles,
19063 [](const std::optional<TTI::ShuffleKind> &SK) {
19064 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19065 TTI::SK_PermuteSingleSrc;
19066 }) &&
19067 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19068 ShuffleVectorInst::isIdentityMask(Mask, MSz));
19069 bool EnoughConstsForShuffle =
19070 IsSingleShuffle &&
19071 (none_of(GatheredScalars,
19072 [](Value *V) {
19073 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19074 }) ||
19075 any_of(GatheredScalars,
19076 [](Value *V) {
19077 return isa<Constant>(V) && !isa<UndefValue>(V);
19078 })) &&
19079 (!IsIdentityShuffle ||
19080 (GatheredScalars.size() == 2 &&
19081 any_of(GatheredScalars,
19082 [](Value *V) { return !isa<UndefValue>(V); })) ||
19083 count_if(GatheredScalars, [](Value *V) {
19084 return isa<Constant>(V) && !isa<PoisonValue>(V);
19085 }) > 1);
19086 // The NonConstants array contains just non-constant values, GatheredScalars
19087 // contains only constants, used to build the final vector and then shuffle.
19088 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19089 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19090 NonConstants[I] = PoisonValue::get(OrigScalarTy);
19091 else
19092 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19093 }
19094 // Generate constants for final shuffle and build a mask for them.
19095 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
19096 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19097 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19098 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
19099 ShuffleBuilder.add(BV, BVMask);
19100 }
19101 if (all_of(NonConstants, [=](Value *V) {
19102 return isa<PoisonValue>(V) ||
19103 (IsSingleShuffle && ((IsIdentityShuffle &&
19104 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
19105 }))
19106 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19107 SubVectorsMask);
19108 else
19109 Res = ShuffleBuilder.finalize(
19110 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
19111 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
19112 bool IsSplat = isSplat(NonConstants);
19113 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19114 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
19115 auto CheckIfSplatIsProfitable = [&]() {
19116 // Estimate the cost of splatting + shuffle and compare with
19117 // insert + shuffle.
19118 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19119 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19120 if (isa<ExtractElementInst>(V) || isVectorized(V))
19121 return false;
19122 InstructionCost SplatCost = TTI->getVectorInstrCost(
19123 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
19124 PoisonValue::get(VecTy), V);
19125 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19126 for (auto [Idx, I] : enumerate(BVMask))
19127 if (I != PoisonMaskElem)
19128 NewMask[Idx] = Mask.size();
19129 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19130 NewMask, CostKind);
19131 InstructionCost BVCost = TTI->getVectorInstrCost(
19132 Instruction::InsertElement, VecTy, CostKind,
19133 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
19134 Vec, V);
19135 // Shuffle required?
19136 if (count(BVMask, PoisonMaskElem) <
19137 static_cast<int>(BVMask.size() - 1)) {
19138 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19139 for (auto [Idx, I] : enumerate(BVMask))
19140 if (I != PoisonMaskElem)
19141 NewMask[Idx] = I;
19142 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19143 VecTy, NewMask, CostKind);
19144 }
19145 return SplatCost <= BVCost;
19146 };
19147 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19148 for (auto [Idx, I] : enumerate(BVMask))
19149 if (I != PoisonMaskElem)
19150 Mask[Idx] = I;
19151 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19152 } else {
19153 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19154 SmallVector<Value *> Values(NonConstants.size(),
19155 PoisonValue::get(ScalarTy));
19156 Values[0] = V;
19157 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
19158 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
19159 transform(BVMask, SplatMask.begin(), [](int I) {
19160 return I == PoisonMaskElem ? PoisonMaskElem : 0;
19161 });
19162 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
19163 BV = CreateShuffle(BV, nullptr, SplatMask);
19164 for (auto [Idx, I] : enumerate(BVMask))
19165 if (I != PoisonMaskElem)
19166 Mask[Idx] = BVMask.size() + Idx;
19167 Vec = CreateShuffle(Vec, BV, Mask);
19168 for (auto [Idx, I] : enumerate(Mask))
19169 if (I != PoisonMaskElem)
19170 Mask[Idx] = Idx;
19171 }
19172 });
19173 } else if (!allConstant(GatheredScalars)) {
19174 // Gather unique scalars and all constants.
19175 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
19176 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
19177 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19178 ShuffleBuilder.add(BV, ReuseMask);
19179 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19180 SubVectorsMask);
19181 } else {
19182 // Gather all constants.
19183 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19184 for (auto [I, V] : enumerate(GatheredScalars)) {
19185 if (!isa<PoisonValue>(V))
19186 Mask[I] = I;
19187 }
19188 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19189 ShuffleBuilder.add(BV, Mask);
19190 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19191 SubVectorsMask);
19192 }
19193
19194 if (NeedFreeze)
19195 Res = ShuffleBuilder.createFreeze(Res);
19196 return Res;
19197}
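// Exposition only: the strategy above in miniature. For a gather node
// {x, 3, x, 5} the constants and the non-constants are split, the repeated x
// is broadcast, and the two halves are joined by one final shuffle:
//
//   %b  = insertelement <4 x i32> poison, i32 %x, i32 0
//   %bs = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer
//   %r  = shufflevector <4 x i32> <i32 poison, i32 3, i32 poison, i32 5>,
//                       <4 x i32> %bs, <4 x i32> <i32 4, i32 1, i32 6, i32 3>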
19198
19199Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19200 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19201 (void)vectorizeTree(VectorizableTree[EIdx].get());
19202 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19203 Builder, *this);
19204}
19205
19206/// \returns \p Inst after propagating metadata from \p VL only for instructions
19207/// in \p VL.
19208static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19209 SmallVector<Value *> Insts;
19210 for (Value *V : VL)
19211 if (isa<Instruction>(V))
19212 Insts.push_back(V);
19213 return llvm::propagateMetadata(Inst, Insts);
19214}
19215
19216static DebugLoc getDebugLocFromPHI(PHINode &PN) {
19217 if (DebugLoc DL = PN.getDebugLoc())
19218 return DL;
19219 return DebugLoc::getUnknown();
19220}
19221
19222Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19223 IRBuilderBase::InsertPointGuard Guard(Builder);
19224
19225 Value *V = E->Scalars.front();
19226 Type *ScalarTy = V->getType();
19227 if (!isa<CmpInst>(V))
19228 ScalarTy = getValueType(V);
19229 auto It = MinBWs.find(E);
19230 if (It != MinBWs.end()) {
19231 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19232 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19233 if (VecTy)
19234 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19235 }
19236 if (E->VectorizedValue)
19237 return E->VectorizedValue;
19238 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19239 if (E->isGather()) {
19240 // Set insert point for non-reduction initial nodes.
19241 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19242 setInsertPointAfterBundle(E);
19243 Value *Vec = createBuildVector(E, ScalarTy);
19244 E->VectorizedValue = Vec;
19245 return Vec;
19246 }
19247 if (E->State == TreeEntry::SplitVectorize) {
19248 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19249 "Expected exactly 2 combined entries.");
19250 setInsertPointAfterBundle(E);
19251 TreeEntry &OpTE1 =
19252 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19253 assert(OpTE1.isSame(
19254 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19255 "Expected same first part of scalars.");
19256 Value *Op1 = vectorizeTree(&OpTE1);
19257 TreeEntry &OpTE2 =
19258 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19259 assert(
19260 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19261 "Expected same second part of scalars.");
19262 Value *Op2 = vectorizeTree(&OpTE2);
19263 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19264 bool IsSigned = false;
19265 auto It = MinBWs.find(OpE);
19266 if (It != MinBWs.end())
19267 IsSigned = It->second.second;
19268 else
19269 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19270 if (isa<PoisonValue>(R))
19271 return false;
19272 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19273 });
19274 return IsSigned;
19275 };
19276 if (cast<VectorType>(Op1->getType())->getElementType() !=
19277 ScalarTy->getScalarType()) {
19278 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19279 Op1 = Builder.CreateIntCast(
19280 Op1,
19281 getWidenedType(
19282 ScalarTy,
19283 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19284 GetOperandSignedness(&OpTE1));
19285 }
19286 if (cast<VectorType>(Op2->getType())->getElementType() !=
19287 ScalarTy->getScalarType()) {
19288 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19289 Op2 = Builder.CreateIntCast(
19290 Op2,
19291 getWidenedType(
19292 ScalarTy,
19293 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19294 GetOperandSignedness(&OpTE2));
19295 }
19296 if (E->ReorderIndices.empty()) {
19297 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19298 std::iota(
19299 Mask.begin(),
19300 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19301 0);
19302 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19303 if (ScalarTyNumElements != 1) {
19304 assert(SLPReVec && "Only supported by REVEC.");
19305 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19306 }
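// For example, with a vector factor of 8 and Op2 starting at offset 4, the
// mask is <0, 1, 2, 3, poison, poison, poison, poison> and Op2 is inserted
// at element 4 (scaled by ScalarTy's element count under REVEC).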
19307 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19308 Vec = createInsertVector(Builder, Vec, Op2,
19309 E->CombinedEntriesWithIndices.back().second *
19310 ScalarTyNumElements);
19311 E->VectorizedValue = Vec;
19312 return Vec;
19313 }
19314 unsigned CommonVF =
19315 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19316 if (getNumElements(Op1->getType()) != CommonVF) {
19317 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19318 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19319 0);
19320 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19321 }
19322 if (getNumElements(Op2->getType()) != CommonVF) {
19323 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19324 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19325 0);
19326 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19327 }
19328 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19329 E->VectorizedValue = Vec;
19330 return Vec;
19331 }
19332
19333 bool IsReverseOrder =
19334 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19335 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19336 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19337 if (E->getOpcode() == Instruction::Store &&
19338 E->State == TreeEntry::Vectorize) {
19339 ArrayRef<int> Mask =
19340 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19341 E->ReorderIndices.size());
19342 ShuffleBuilder.add(V, Mask);
19343 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19344 E->State == TreeEntry::CompressVectorize) {
19345 ShuffleBuilder.addOrdered(V, {});
19346 } else {
19347 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19348 }
19349 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19350 E->CombinedEntriesWithIndices.size());
19351 transform(
19352 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19353 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19354 });
19355 assert(
19356 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19357 "Expected either combined subnodes or reordering");
19358 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19359 };
19360
19361 assert(!E->isGather() && "Unhandled state");
19362 unsigned ShuffleOrOp =
19363 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19364 Instruction *VL0 = E->getMainOp();
19365 auto GetOperandSignedness = [&](unsigned Idx) {
19366 const TreeEntry *OpE = getOperandEntry(E, Idx);
19367 bool IsSigned = false;
19368 auto It = MinBWs.find(OpE);
19369 if (It != MinBWs.end())
19370 IsSigned = It->second.second;
19371 else
19372 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19373 if (isa<PoisonValue>(R))
19374 return false;
19375 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19376 });
19377 return IsSigned;
19378 };
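// An operand is treated as signed if its minimized-bitwidth entry says so,
// or, when no entry exists, if any of its non-poison scalars is not known
// to be non-negative.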
19379 switch (ShuffleOrOp) {
19380 case Instruction::PHI: {
19381 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19382 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19383 "PHI reordering is free.");
19384 auto *PH = cast<PHINode>(VL0);
19385 Builder.SetInsertPoint(PH->getParent(),
19386 PH->getParent()->getFirstNonPHIIt());
19387 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19388 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19389 Value *V = NewPhi;
19390
19391 // Adjust insertion point once all PHIs have been generated.
19392 Builder.SetInsertPoint(PH->getParent(),
19393 PH->getParent()->getFirstInsertionPt());
19394 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19395
19396 V = FinalShuffle(V, E);
19397
19398 E->VectorizedValue = V;
19399 // If the phi node is fully emitted, exit.
19400 if (NewPhi->getNumIncomingValues() != 0)
19401 return NewPhi;
19402
19403 // PHINodes may have multiple entries from the same block. We want to
19404 // visit every block once.
19405 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19406
19407 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19408 BasicBlock *IBB = PH->getIncomingBlock(I);
19409
19410 // Stop emission if all incoming values are generated.
19411 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19412 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19413 return NewPhi;
19414 }
19415
19416 if (!VisitedBBs.insert(IBB).second) {
19417 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19418 NewPhi->addIncoming(VecOp, IBB);
19419 TreeEntry *OpTE = getOperandEntry(E, I);
19420 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19421 OpTE->VectorizedValue = VecOp;
19422 continue;
19423 }
19424
19425 Builder.SetInsertPoint(IBB->getTerminator());
19426 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19427 Value *Vec = vectorizeOperand(E, I);
19428 if (VecTy != Vec->getType()) {
19429 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19430 MinBWs.contains(getOperandEntry(E, I))) &&
19431 "Expected item in MinBWs.");
19432 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19433 }
19434 NewPhi->addIncoming(Vec, IBB);
19435 }
19436
19437 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19438 "Invalid number of incoming values");
19439 assert(E->VectorizedValue && "Expected vectorized value.");
19440 return E->VectorizedValue;
19441 }
19442
19443 case Instruction::ExtractElement: {
19444 Value *V = E->getSingleOperand(0);
19445 setInsertPointAfterBundle(E);
19446 V = FinalShuffle(V, E);
19447 E->VectorizedValue = V;
19448 return V;
19449 }
19450 case Instruction::ExtractValue: {
19451 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19452 Builder.SetInsertPoint(LI);
19453 Value *Ptr = LI->getPointerOperand();
19454 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19455 Value *NewV = ::propagateMetadata(V, E->Scalars);
19456 NewV = FinalShuffle(NewV, E);
19457 E->VectorizedValue = NewV;
19458 return NewV;
19459 }
19460 case Instruction::InsertElement: {
19461 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19462 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19463 OpE && !OpE->isGather() && OpE->hasState() &&
19464 !OpE->hasCopyableElements())
19465 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19466 else
19467 setInsertPointAfterBundle(E);
19468 Value *V = vectorizeOperand(E, 1);
19469 ArrayRef<Value *> Op = E->getOperand(1);
19470 Type *ScalarTy = Op.front()->getType();
19471 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19472 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19473 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19474 assert(Res.first > 0 && "Expected item in MinBWs.");
19475 V = Builder.CreateIntCast(
19476 V,
19477 getWidenedType(
19478 ScalarTy,
19479 cast<FixedVectorType>(V->getType())->getNumElements()),
19480 Res.second);
19481 }
19482
19483 // Create InsertVector shuffle if necessary
19484 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19485 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19486 }));
19487 const unsigned NumElts =
19488 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19489 const unsigned NumScalars = E->Scalars.size();
19490
19491 unsigned Offset = *getElementIndex(VL0);
19492 assert(Offset < NumElts && "Failed to find vector index offset");
19493
19494 // Create shuffle to resize vector
19495 SmallVector<int> Mask;
19496 if (!E->ReorderIndices.empty()) {
19497 inversePermutation(E->ReorderIndices, Mask);
19498 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19499 } else {
19500 Mask.assign(NumElts, PoisonMaskElem);
19501 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19502 }
19503 // Create InsertVector shuffle if necessary
19504 bool IsIdentity = true;
19505 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19506 Mask.swap(PrevMask);
19507 for (unsigned I = 0; I < NumScalars; ++I) {
19508 Value *Scalar = E->Scalars[PrevMask[I]];
19509 unsigned InsertIdx = *getElementIndex(Scalar);
19510 IsIdentity &= InsertIdx - Offset == I;
19511 Mask[InsertIdx - Offset] = I;
19512 }
19513 if (!IsIdentity || NumElts != NumScalars) {
19514 Value *V2 = nullptr;
19515 bool IsVNonPoisonous =
19516 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
19517 SmallVector<int> InsertMask(Mask);
19518 if (NumElts != NumScalars && Offset == 0) {
19519 // Follow all insert element instructions from the current buildvector
19520 // sequence.
19521 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19522 do {
19523 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19524 if (!InsertIdx)
19525 break;
19526 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19527 InsertMask[*InsertIdx] = *InsertIdx;
19528 if (!Ins->hasOneUse())
19529 break;
19530 Ins = dyn_cast_or_null<InsertElementInst>(
19531 Ins->getUniqueUndroppedUser());
19532 } while (Ins);
19533 SmallBitVector UseMask =
19534 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19535 SmallBitVector IsFirstPoison =
19536 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19537 SmallBitVector IsFirstUndef =
19538 isUndefVector(FirstInsert->getOperand(0), UseMask);
19539 if (!IsFirstPoison.all()) {
19540 unsigned Idx = 0;
19541 for (unsigned I = 0; I < NumElts; I++) {
19542 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19543 IsFirstUndef.test(I)) {
19544 if (IsVNonPoisonous) {
19545 InsertMask[I] = I < NumScalars ? I : 0;
19546 continue;
19547 }
19548 if (!V2)
19549 V2 = UndefValue::get(V->getType());
19550 if (Idx >= NumScalars)
19551 Idx = NumScalars - 1;
19552 InsertMask[I] = NumScalars + Idx;
19553 ++Idx;
19554 } else if (InsertMask[I] != PoisonMaskElem &&
19555 Mask[I] == PoisonMaskElem) {
19556 InsertMask[I] = PoisonMaskElem;
19557 }
19558 }
19559 } else {
19560 InsertMask = Mask;
19561 }
19562 }
19563 if (!V2)
19564 V2 = PoisonValue::get(V->getType());
19565 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19566 if (auto *I = dyn_cast<Instruction>(V)) {
19567 GatherShuffleExtractSeq.insert(I);
19568 CSEBlocks.insert(I->getParent());
19569 }
19570 }
19571
19572 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19573 for (unsigned I = 0; I < NumElts; I++) {
19574 if (Mask[I] != PoisonMaskElem)
19575 InsertMask[Offset + I] = I;
19576 }
19577 SmallBitVector UseMask =
19578 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19579 SmallBitVector IsFirstUndef =
19580 isUndefVector(FirstInsert->getOperand(0), UseMask);
19581 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19582 NumElts != NumScalars) {
19583 if (IsFirstUndef.all()) {
19584 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19585 SmallBitVector IsFirstPoison =
19586 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19587 if (!IsFirstPoison.all()) {
19588 for (unsigned I = 0; I < NumElts; I++) {
19589 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19590 InsertMask[I] = I + NumElts;
19591 }
19592 }
19593 V = Builder.CreateShuffleVector(
19594 V,
19595 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19596 : FirstInsert->getOperand(0),
19597 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19598 if (auto *I = dyn_cast<Instruction>(V)) {
19599 GatherShuffleExtractSeq.insert(I);
19600 CSEBlocks.insert(I->getParent());
19601 }
19602 }
19603 } else {
19604 SmallBitVector IsFirstPoison =
19605 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19606 for (unsigned I = 0; I < NumElts; I++) {
19607 if (InsertMask[I] == PoisonMaskElem)
19608 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19609 else
19610 InsertMask[I] += NumElts;
19611 }
19612 V = Builder.CreateShuffleVector(
19613 FirstInsert->getOperand(0), V, InsertMask,
19614 cast<Instruction>(E->Scalars.back())->getName());
19615 if (auto *I = dyn_cast<Instruction>(V)) {
19616 GatherShuffleExtractSeq.insert(I);
19617 CSEBlocks.insert(I->getParent());
19618 }
19619 }
19620 }
19621
19622 ++NumVectorInstructions;
19623 E->VectorizedValue = V;
19624 return V;
19625 }
19626 case Instruction::ZExt:
19627 case Instruction::SExt:
19628 case Instruction::FPToUI:
19629 case Instruction::FPToSI:
19630 case Instruction::FPExt:
19631 case Instruction::PtrToInt:
19632 case Instruction::IntToPtr:
19633 case Instruction::SIToFP:
19634 case Instruction::UIToFP:
19635 case Instruction::Trunc:
19636 case Instruction::FPTrunc:
19637 case Instruction::BitCast: {
19638 setInsertPointAfterBundle(E);
19639
19640 Value *InVec = vectorizeOperand(E, 0);
19641
19642 auto *CI = cast<CastInst>(VL0);
19643 Instruction::CastOps VecOpcode = CI->getOpcode();
19644 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19645 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19646 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19647 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19648 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19649 // Check if the values are candidates to demote.
19650 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19651 if (SrcIt != MinBWs.end())
19652 SrcBWSz = SrcIt->second.first;
19653 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19654 if (BWSz == SrcBWSz) {
19655 VecOpcode = Instruction::BitCast;
19656 } else if (BWSz < SrcBWSz) {
19657 VecOpcode = Instruction::Trunc;
19658 } else if (It != MinBWs.end()) {
19659 assert(BWSz > SrcBWSz && "Invalid cast!");
19660 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19661 } else if (SrcIt != MinBWs.end()) {
19662 assert(BWSz > SrcBWSz && "Invalid cast!");
19663 VecOpcode =
19664 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19665 }
19666 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19667 !SrcIt->second.second) {
19668 VecOpcode = Instruction::UIToFP;
19669 }
19670 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19671 ? InVec
19672 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19673 V = FinalShuffle(V, E);
19674
19675 E->VectorizedValue = V;
19676 ++NumVectorInstructions;
19677 return V;
19678 }
19679 case Instruction::FCmp:
19680 case Instruction::ICmp: {
19681 setInsertPointAfterBundle(E);
19682
19683 Value *L = vectorizeOperand(E, 0);
19684 Value *R = vectorizeOperand(E, 1);
19685 if (L->getType() != R->getType()) {
19686 assert((getOperandEntry(E, 0)->isGather() ||
19687 getOperandEntry(E, 1)->isGather() ||
19688 MinBWs.contains(getOperandEntry(E, 0)) ||
19689 MinBWs.contains(getOperandEntry(E, 1))) &&
19690 "Expected item in MinBWs.");
19691 if (cast<VectorType>(L->getType())
19692 ->getElementType()
19693 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19694 ->getElementType()
19695 ->getIntegerBitWidth()) {
19696 Type *CastTy = R->getType();
19697 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19698 } else {
19699 Type *CastTy = L->getType();
19700 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19701 }
19702 }
19703
19704 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19705 Value *V = Builder.CreateCmp(P0, L, R);
19706 propagateIRFlags(V, E->Scalars, VL0);
19707 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19708 ICmp->setSameSign(/*B=*/false);
19709 // Do not cast for cmps.
19710 VecTy = cast<FixedVectorType>(V->getType());
19711 V = FinalShuffle(V, E);
19712
19713 E->VectorizedValue = V;
19714 ++NumVectorInstructions;
19715 return V;
19716 }
19717 case Instruction::Select: {
19718 setInsertPointAfterBundle(E);
19719
19720 Value *Cond = vectorizeOperand(E, 0);
19721 Value *True = vectorizeOperand(E, 1);
19722 Value *False = vectorizeOperand(E, 2);
19723 if (True->getType() != VecTy || False->getType() != VecTy) {
19724 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19725 getOperandEntry(E, 2)->isGather() ||
19726 MinBWs.contains(getOperandEntry(E, 1)) ||
19727 MinBWs.contains(getOperandEntry(E, 2))) &&
19728 "Expected item in MinBWs.");
19729 if (True->getType() != VecTy)
19730 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19731 if (False->getType() != VecTy)
19732 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19733 }
19734
19735 unsigned CondNumElements = getNumElements(Cond->getType());
19736 unsigned TrueNumElements = getNumElements(True->getType());
19737 assert(TrueNumElements >= CondNumElements &&
19738 TrueNumElements % CondNumElements == 0 &&
19739 "Cannot vectorize Instruction::Select");
19740 assert(TrueNumElements == getNumElements(False->getType()) &&
19741 "Cannot vectorize Instruction::Select");
19742 if (CondNumElements != TrueNumElements) {
19743 // When the return type is i1 but the source is fixed vector type, we
19744 // need to duplicate the condition value.
19745 Cond = Builder.CreateShuffleVector(
19746 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19747 CondNumElements));
19748 }
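// For example, replicating a <4 x i1> condition for <8 x i32> operands
// uses the mask <0, 0, 1, 1, 2, 2, 3, 3>.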
19749 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19750 "Cannot vectorize Instruction::Select");
19751 Value *V =
19752 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
19753 V = FinalShuffle(V, E);
19754
19755 E->VectorizedValue = V;
19756 ++NumVectorInstructions;
19757 return V;
19758 }
19759 case Instruction::FNeg: {
19760 setInsertPointAfterBundle(E);
19761
19762 Value *Op = vectorizeOperand(E, 0);
19763
19764 Value *V = Builder.CreateUnOp(
19765 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19766 propagateIRFlags(V, E->Scalars, VL0);
19767 if (auto *I = dyn_cast<Instruction>(V))
19768 V = ::propagateMetadata(I, E->Scalars);
19769
19770 V = FinalShuffle(V, E);
19771
19772 E->VectorizedValue = V;
19773 ++NumVectorInstructions;
19774
19775 return V;
19776 }
19777 case Instruction::Freeze: {
19778 setInsertPointAfterBundle(E);
19779
19780 Value *Op = vectorizeOperand(E, 0);
19781
19782 if (Op->getType() != VecTy) {
19783 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19784 MinBWs.contains(getOperandEntry(E, 0))) &&
19785 "Expected item in MinBWs.");
19786 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19787 }
19788 Value *V = Builder.CreateFreeze(Op);
19789 V = FinalShuffle(V, E);
19790
19791 E->VectorizedValue = V;
19792 ++NumVectorInstructions;
19793
19794 return V;
19795 }
19796 case Instruction::Add:
19797 case Instruction::FAdd:
19798 case Instruction::Sub:
19799 case Instruction::FSub:
19800 case Instruction::Mul:
19801 case Instruction::FMul:
19802 case Instruction::UDiv:
19803 case Instruction::SDiv:
19804 case Instruction::FDiv:
19805 case Instruction::URem:
19806 case Instruction::SRem:
19807 case Instruction::FRem:
19808 case Instruction::Shl:
19809 case Instruction::LShr:
19810 case Instruction::AShr:
19811 case Instruction::And:
19812 case Instruction::Or:
19813 case Instruction::Xor: {
19814 setInsertPointAfterBundle(E);
19815
19816 Value *LHS = vectorizeOperand(E, 0);
19817 Value *RHS = vectorizeOperand(E, 1);
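// If this is a demoted 'and' and one operand is a constant whose low
// MinBW bits are all ones, the 'and' cannot change the demoted value, so
// the other operand is reused directly below.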
19818 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19819 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19820 ArrayRef<Value *> Ops = E->getOperand(I);
19821 if (all_of(Ops, [&](Value *Op) {
19822 auto *CI = dyn_cast<ConstantInt>(Op);
19823 return CI && CI->getValue().countr_one() >= It->second.first;
19824 })) {
19825 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19826 E->VectorizedValue = V;
19827 ++NumVectorInstructions;
19828 return V;
19829 }
19830 }
19831 }
19832 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19833 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19834 getOperandEntry(E, 1)->isGather() ||
19835 MinBWs.contains(getOperandEntry(E, 0)) ||
19836 MinBWs.contains(getOperandEntry(E, 1))) &&
19837 "Expected item in MinBWs.");
19838 if (LHS->getType() != VecTy)
19839 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19840 if (RHS->getType() != VecTy)
19841 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19842 }
19843
19844 Value *V = Builder.CreateBinOp(
19845 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19846 RHS);
19847 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19848 if (auto *I = dyn_cast<Instruction>(V)) {
19849 V = ::propagateMetadata(I, E->Scalars);
19850 // Drop nuw flags for abs(sub(commutative), true).
19851 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19852 any_of(E->Scalars, [E](Value *V) {
19853 return isa<PoisonValue>(V) ||
19854 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
19855 isCommutative(cast<Instruction>(V));
19856 }))
19857 I->setHasNoUnsignedWrap(/*b=*/false);
19858 }
19859
19860 V = FinalShuffle(V, E);
19861
19862 E->VectorizedValue = V;
19863 ++NumVectorInstructions;
19864
19865 return V;
19866 }
19867 case Instruction::Load: {
19868 // Loads are inserted at the head of the tree because we don't want to
19869 // sink them all the way down past store instructions.
19870 setInsertPointAfterBundle(E);
19871
19872 LoadInst *LI = cast<LoadInst>(VL0);
19873 Instruction *NewLI;
19874 FixedVectorType *StridedLoadTy = nullptr;
19875 Value *PO = LI->getPointerOperand();
19876 if (E->State == TreeEntry::Vectorize) {
19877 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19878 } else if (E->State == TreeEntry::CompressVectorize) {
19879 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19880 CompressEntryToData.at(E);
19881 Align CommonAlignment = LI->getAlign();
19882 if (IsMasked) {
19883 unsigned VF = getNumElements(LoadVecTy);
19884 SmallVector<Constant *> MaskValues(
19885 VF / getNumElements(LI->getType()),
19886 ConstantInt::getFalse(VecTy->getContext()));
19887 for (int I : CompressMask)
19888 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19889 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19890 assert(SLPReVec && "Only supported by REVEC.");
19891 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19892 }
19893 Constant *MaskValue = ConstantVector::get(MaskValues);
19894 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19895 MaskValue);
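// For example, a CompressMask of <0, 2, 3> over a <4 x i32> load type
// produces the load mask <true, false, true, true>.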
19896 } else {
19897 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19898 }
19899 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19900 // TODO: include this cost into CommonCost.
19901 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19902 assert(SLPReVec && "FixedVectorType is not expected.");
19903 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19904 CompressMask);
19905 }
19906 NewLI =
19907 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19908 } else if (E->State == TreeEntry::StridedVectorize) {
19909 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19910 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19911 PO = IsReverseOrder ? PtrN : Ptr0;
19912 Type *StrideTy = DL->getIndexType(PO->getType());
19913 Value *StrideVal;
19914 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19915 StridedLoadTy = SPtrInfo.Ty;
19916 assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry.");
19917 unsigned StridedLoadEC =
19918 StridedLoadTy->getElementCount().getKnownMinValue();
19919
19920 Value *Stride = SPtrInfo.StrideVal;
19921 if (!Stride) {
19922 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19923 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV was set.");
19924 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19925 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19926 &*Builder.GetInsertPoint());
19927 }
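// The stride below is converted from elements to bytes using the scalar
// type's allocation size and is negated when the loads are in reverse
// order.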
19928 Value *NewStride =
19929 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19930 StrideVal = Builder.CreateMul(
19931 NewStride, ConstantInt::get(
19932 StrideTy, (IsReverseOrder ? -1 : 1) *
19933 static_cast<int>(
19934 DL->getTypeAllocSize(ScalarTy))));
19935 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19936 auto *Inst = Builder.CreateIntrinsic(
19937 Intrinsic::experimental_vp_strided_load,
19938 {StridedLoadTy, PO->getType(), StrideTy},
19939 {PO, StrideVal,
19940 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19941 Builder.getInt32(StridedLoadEC)});
19942 Inst->addParamAttr(
19943 /*ArgNo=*/0,
19944 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19945 NewLI = Inst;
19946 } else {
19947 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19948 Value *VecPtr = vectorizeOperand(E, 0);
19949 if (isa<FixedVectorType>(ScalarTy)) {
19950 assert(SLPReVec && "FixedVectorType is not expected.");
19951 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19952 // to expand VecPtr if ScalarTy is a vector type.
19953 unsigned ScalarTyNumElements =
19954 cast<FixedVectorType>(ScalarTy)->getNumElements();
19955 unsigned VecTyNumElements =
19956 cast<FixedVectorType>(VecTy)->getNumElements();
19957 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19958 "Cannot expand getelementptr.");
19959 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19960 SmallVector<Constant *> Indices(VecTyNumElements);
19961 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19962 return Builder.getInt64(I % ScalarTyNumElements);
19963 });
19964 VecPtr = Builder.CreateGEP(
19965 VecTy->getElementType(),
19966 Builder.CreateShuffleVector(
19967 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19968 ConstantVector::get(Indices));
19969 }
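// For example, with ScalarTy = <2 x float> and four pointers, each pointer
// is repeated twice and offset by indices <0, 1, 0, 1, ...>, so the gather
// addresses every float element individually.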
19970 // Use the minimum alignment of the gathered loads.
19971 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19972 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19973 }
19974 Value *V = E->State == TreeEntry::CompressVectorize
19975 ? NewLI
19976 : ::propagateMetadata(NewLI, E->Scalars);
19977
19978 if (StridedLoadTy != VecTy)
19979 V = Builder.CreateBitOrPointerCast(V, VecTy);
19980 V = FinalShuffle(V, E);
19981 E->VectorizedValue = V;
19982 ++NumVectorInstructions;
19983 return V;
19984 }
19985 case Instruction::Store: {
19986 auto *SI = cast<StoreInst>(VL0);
19987
19988 setInsertPointAfterBundle(E);
19989
19990 Value *VecValue = vectorizeOperand(E, 0);
19991 if (VecValue->getType() != VecTy)
19992 VecValue =
19993 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19994 VecValue = FinalShuffle(VecValue, E);
19995
19996 Value *Ptr = SI->getPointerOperand();
19997 Instruction *ST;
19998 if (E->State == TreeEntry::Vectorize) {
19999 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
20000 } else {
20001 assert(E->State == TreeEntry::StridedVectorize &&
20002 "Expected either strided or consecutive stores.");
20003 if (!E->ReorderIndices.empty()) {
20004 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
20005 Ptr = SI->getPointerOperand();
20006 }
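// The strided store emitted here handles reversed consecutive stores: the
// stride is the negated element size, starting from the pointer of the
// first scalar in the store order.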
20007 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
20008 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
20009 auto *Inst = Builder.CreateIntrinsic(
20010 Intrinsic::experimental_vp_strided_store,
20011 {VecTy, Ptr->getType(), StrideTy},
20012 {VecValue, Ptr,
20013 ConstantInt::get(
20014 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20015 Builder.getAllOnesMask(VecTy->getElementCount()),
20016 Builder.getInt32(E->Scalars.size())});
20017 Inst->addParamAttr(
20018 /*ArgNo=*/1,
20019 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20020 ST = Inst;
20021 }
20022
20023 Value *V = ::propagateMetadata(ST, E->Scalars);
20024
20025 E->VectorizedValue = V;
20026 ++NumVectorInstructions;
20027 return V;
20028 }
20029 case Instruction::GetElementPtr: {
20030 auto *GEP0 = cast<GetElementPtrInst>(VL0);
20031 setInsertPointAfterBundle(E);
20032
20033 Value *Op0 = vectorizeOperand(E, 0);
20034
20035 SmallVector<Value *> OpVecs;
20036 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20037 Value *OpVec = vectorizeOperand(E, J);
20038 OpVecs.push_back(OpVec);
20039 }
20040
20041 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20042 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
20043 SmallVector<Value *> GEPs;
20044 for (Value *V : E->Scalars) {
20045 if (isa<GetElementPtrInst>(V))
20046 GEPs.push_back(V);
20047 }
20048 V = ::propagateMetadata(I, GEPs);
20049 }
20050
20051 V = FinalShuffle(V, E);
20052
20053 E->VectorizedValue = V;
20054 ++NumVectorInstructions;
20055
20056 return V;
20057 }
20058 case Instruction::Call: {
20059 CallInst *CI = cast<CallInst>(VL0);
20060 setInsertPointAfterBundle(E);
20061
20062 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20063
20064 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20065 CI, ID, VecTy->getNumElements(),
20066 It != MinBWs.end() ? It->second.first : 0, TTI);
20067 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
20068 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20069 VecCallCosts.first <= VecCallCosts.second;
20070
20071 Value *ScalarArg = nullptr;
20072 SmallVector<Value *> OpVecs;
20073 SmallVector<Type *, 2> TysForDecl;
20074 // Add return type if intrinsic is overloaded on it.
20075 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
20076 TysForDecl.push_back(VecTy);
20077 auto *CEI = cast<CallInst>(VL0);
20078 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
20079 // Some intrinsics have scalar arguments. This argument should not be
20080 // vectorized.
20081 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
20082 ScalarArg = CEI->getArgOperand(I);
20083 // If we decided to reduce the bitwidth of the abs intrinsic, its second
20084 // argument must be set to false (do not return poison if value is signed min).
20085 if (ID == Intrinsic::abs && It != MinBWs.end() &&
20086 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
20087 ScalarArg = Builder.getFalse();
20088 OpVecs.push_back(ScalarArg);
20089 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20090 TysForDecl.push_back(ScalarArg->getType());
20091 continue;
20092 }
20093
20094 Value *OpVec = vectorizeOperand(E, I);
20095 ScalarArg = CEI->getArgOperand(I);
20096 if (cast<VectorType>(OpVec->getType())->getElementType() !=
20097 ScalarArg->getType()->getScalarType() &&
20098 It == MinBWs.end()) {
20099 auto *CastTy =
20100 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
20101 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
20102 } else if (It != MinBWs.end()) {
20103 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
20104 }
20105 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
20106 OpVecs.push_back(OpVec);
20107 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
20108 TysForDecl.push_back(OpVec->getType());
20109 }
20110
20111 Function *CF;
20112 if (!UseIntrinsic) {
20113 VFShape Shape =
20114 VFShape::get(CI->getFunctionType(),
20115 ElementCount::getFixed(VecTy->getNumElements()),
20116 false /*HasGlobalPred*/);
20117 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
20118 } else {
20119 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
20120 }
20121
20122 SmallVector<OperandBundleDef, 1> OpBundles;
20123 CI->getOperandBundlesAsDefs(OpBundles);
20124 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
20125
20126 propagateIRFlags(V, E->Scalars, VL0);
20127 V = FinalShuffle(V, E);
20128
20129 E->VectorizedValue = V;
20130 ++NumVectorInstructions;
20131 return V;
20132 }
20133 case Instruction::ShuffleVector: {
20134 Value *V;
20135 if (SLPReVec && !E->isAltShuffle()) {
20136 setInsertPointAfterBundle(E);
20137 Value *Src = vectorizeOperand(E, 0);
20138 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
20139 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
20140 SmallVector<int> NewMask(ThisMask.size());
20141 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
20142 return SVSrc->getShuffleMask()[Mask];
20143 });
20144 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
20145 SVSrc->getOperand(1), NewMask);
20146 } else {
20147 V = Builder.CreateShuffleVector(Src, ThisMask);
20148 }
20149 propagateIRFlags(V, E->Scalars, VL0);
20150 if (auto *I = dyn_cast<Instruction>(V))
20151 V = ::propagateMetadata(I, E->Scalars);
20152 V = FinalShuffle(V, E);
20153 } else {
20154 assert(E->isAltShuffle() &&
20155 ((Instruction::isBinaryOp(E->getOpcode()) &&
20156 Instruction::isBinaryOp(E->getAltOpcode())) ||
20157 (Instruction::isCast(E->getOpcode()) &&
20158 Instruction::isCast(E->getAltOpcode())) ||
20159 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
20160 "Invalid Shuffle Vector Operand");
20161
20162 Value *LHS = nullptr, *RHS = nullptr;
20163 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
20164 setInsertPointAfterBundle(E);
20165 LHS = vectorizeOperand(E, 0);
20166 RHS = vectorizeOperand(E, 1);
20167 } else {
20168 setInsertPointAfterBundle(E);
20169 LHS = vectorizeOperand(E, 0);
20170 }
20171 if (LHS && RHS &&
20172 ((Instruction::isBinaryOp(E->getOpcode()) &&
20173 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
20174 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
20175 assert((It != MinBWs.end() ||
20176 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
20177 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
20178 MinBWs.contains(getOperandEntry(E, 0)) ||
20179 MinBWs.contains(getOperandEntry(E, 1))) &&
20180 "Expected item in MinBWs.");
20181 Type *CastTy = VecTy;
20182 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20183 if (cast<VectorType>(LHS->getType())
20184 ->getElementType()
20185 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20186 ->getElementType()
20187 ->getIntegerBitWidth())
20188 CastTy = RHS->getType();
20189 else
20190 CastTy = LHS->getType();
20191 }
20192 if (LHS->getType() != CastTy)
20193 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20194 if (RHS->getType() != CastTy)
20195 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20196 }
20197
20198 Value *V0, *V1;
20199 if (Instruction::isBinaryOp(E->getOpcode())) {
20200 V0 = Builder.CreateBinOp(
20201 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20202 V1 = Builder.CreateBinOp(
20203 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20204 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20205 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20206 auto *AltCI = cast<CmpInst>(E->getAltOp());
20207 CmpInst::Predicate AltPred = AltCI->getPredicate();
20208 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20209 } else {
20210 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20211 unsigned SrcBWSz = DL->getTypeSizeInBits(
20212 cast<VectorType>(LHS->getType())->getElementType());
20213 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20214 if (BWSz <= SrcBWSz) {
20215 if (BWSz < SrcBWSz)
20216 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20217 assert(LHS->getType() == VecTy &&
20218 "Expected same type as operand.");
20219 if (auto *I = dyn_cast<Instruction>(LHS))
20220 LHS = ::propagateMetadata(I, E->Scalars);
20221 LHS = FinalShuffle(LHS, E);
20222 E->VectorizedValue = LHS;
20223 ++NumVectorInstructions;
20224 return LHS;
20225 }
20226 }
20227 V0 = Builder.CreateCast(
20228 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20229 V1 = Builder.CreateCast(
20230 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20231 }
20232 // Add V0 and V1 to later analysis to try to find and remove matching
20233 // instruction, if any.
20234 for (Value *V : {V0, V1}) {
20235 if (auto *I = dyn_cast<Instruction>(V)) {
20236 GatherShuffleExtractSeq.insert(I);
20237 CSEBlocks.insert(I->getParent());
20238 }
20239 }
20240
20241 // Create shuffle to take alternate operations from the vector.
20242 // Also, gather up main and alt scalar ops to propagate IR flags to
20243 // each vector operation.
20244 ValueList OpScalars, AltScalars;
20245 SmallVector<int> Mask;
20246 E->buildAltOpShuffleMask(
20247 [E, this](Instruction *I) {
20248 assert(E->getMatchingMainOpOrAltOp(I) &&
20249 "Unexpected main/alternate opcode");
20250 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20251 *TLI);
20252 },
20253 Mask, &OpScalars, &AltScalars);
20254
20255 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20256 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20257 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20258 // Drop nuw flags for abs(sub(commutative), true).
20259 if (auto *I = dyn_cast<Instruction>(Vec);
20260 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20261 any_of(E->Scalars, [E](Value *V) {
20262 if (isa<PoisonValue>(V))
20263 return false;
20264 if (E->hasCopyableElements() && E->isCopyableElement(V))
20265 return false;
20266 auto *IV = cast<Instruction>(V);
20267 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20268 }))
20269 I->setHasNoUnsignedWrap(/*b=*/false);
20270 };
20271 DropNuwFlag(V0, E->getOpcode());
20272 DropNuwFlag(V1, E->getAltOpcode());
20273
20274 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20275 assert(SLPReVec && "FixedVectorType is not expected.");
20276 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20277 }
20278 V = Builder.CreateShuffleVector(V0, V1, Mask);
20279 if (auto *I = dyn_cast<Instruction>(V)) {
20280 V = ::propagateMetadata(I, E->Scalars);
20281 GatherShuffleExtractSeq.insert(I);
20282 CSEBlocks.insert(I->getParent());
20283 }
20284 }
20285
20286 E->VectorizedValue = V;
20287 ++NumVectorInstructions;
20288
20289 return V;
20290 }
20291 default:
20292 llvm_unreachable("unknown inst");
20293 }
20294 return nullptr;
20295}
20296
20297 Value *BoUpSLP::vectorizeTree() {
20298 ExtraValueToDebugLocsMap ExternallyUsedValues;
20299 return vectorizeTree(ExternallyUsedValues);
20300}
20301
20302 Value *BoUpSLP::vectorizeTree(
20303 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20304 Instruction *ReductionRoot,
20305 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20306 // Clear the Entry-to-LastInstruction table; it can be affected by
20307 // scheduling and needs to be rebuilt.
20308 EntryToLastInstruction.clear();
20309 // All blocks must be scheduled before any instructions are inserted.
20310 for (auto &BSIter : BlocksSchedules)
20311 scheduleBlock(*this, BSIter.second.get());
20312 // Cache the last instruction for each node to avoid side effects that may
20313 // appear during vectorization, such as extra uses.
20314 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20315 if (TE->isGather())
20316 continue;
20317 (void)getLastInstructionInBundle(TE.get());
20318 }
20319
20320 if (ReductionRoot)
20321 Builder.SetInsertPoint(ReductionRoot->getParent(),
20322 ReductionRoot->getIterator());
20323 else
20324 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20325
20326 // Vectorize gather operands of the nodes with the external uses only.
20327 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20328 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20329 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20330 TE->UserTreeIndex.UserTE->hasState() &&
20331 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20332 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20333 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20334 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20335 all_of(TE->UserTreeIndex.UserTE->Scalars,
20336 [](Value *V) { return isUsedOutsideBlock(V); })) {
20337 Instruction &LastInst =
20338 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20339 GatherEntries.emplace_back(TE.get(), &LastInst);
20340 }
20341 }
20342 for (auto &Entry : GatherEntries) {
20343 IRBuilderBase::InsertPointGuard Guard(Builder);
20344 Builder.SetInsertPoint(Entry.second);
20345 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20346 (void)vectorizeTree(Entry.first);
20347 }
20348 // Emit gathered loads first to generate better code for the users of
20349 // those gathered loads.
20350 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20351 if (GatheredLoadsEntriesFirst.has_value() &&
20352 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20353 (!TE->isGather() || TE->UserTreeIndex)) {
20354 assert((TE->UserTreeIndex ||
20355 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20356 "Expected gathered load node.");
20357 (void)vectorizeTree(TE.get());
20358 }
20359 }
20360 (void)vectorizeTree(VectorizableTree[0].get());
20361 // Run through the list of postponed gathers and emit them, replacing the temp
20362 // emitted allocas with actual vector instructions.
20363 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20364 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20365 for (const TreeEntry *E : PostponedNodes) {
20366 auto *TE = const_cast<TreeEntry *>(E);
20367 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20368 TE->VectorizedValue = nullptr;
20369 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20370 // If the user is a PHI node, its vector code has to be inserted right
20371 // before the block terminator. Since the node was delayed, there were
20372 // unresolved dependencies at the moment the stub instruction was emitted.
20373 // If any of these dependencies turns out to be an operand of another PHI
20374 // coming from this same block, the position of the stub instruction
20375 // becomes invalid: the source vector that is supposed to feed this gather
20376 // node was inserted at the end of the block [after the stub instruction].
20377 // So we need to adjust the insertion point again to the end of the block.
20378 if (isa<PHINode>(UserI) ||
20379 (TE->UserTreeIndex.UserTE->hasState() &&
20380 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20381 // Insert before all users.
20382 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20383 for (User *U : PrevVec->users()) {
20384 if (U == UserI)
20385 continue;
20386 auto *UI = dyn_cast<Instruction>(U);
20387 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20388 continue;
20389 if (UI->comesBefore(InsertPt))
20390 InsertPt = UI;
20391 }
20392 Builder.SetInsertPoint(InsertPt);
20393 } else {
20394 Builder.SetInsertPoint(PrevVec);
20395 }
20396 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20397 Value *Vec = vectorizeTree(TE);
20398 if (auto *VecI = dyn_cast<Instruction>(Vec);
20399 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20400 Builder.GetInsertPoint()->comesBefore(VecI))
20401 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20402 Builder.GetInsertPoint());
20403 if (Vec->getType() != PrevVec->getType()) {
20404 assert(Vec->getType()->isIntOrIntVectorTy() &&
20405 PrevVec->getType()->isIntOrIntVectorTy() &&
20406 "Expected integer vector types only.");
20407 std::optional<bool> IsSigned;
20408 for (Value *V : TE->Scalars) {
20409 if (isVectorized(V)) {
20410 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20411 auto It = MinBWs.find(MNTE);
20412 if (It != MinBWs.end()) {
20413 IsSigned = IsSigned.value_or(false) || It->second.second;
20414 if (*IsSigned)
20415 break;
20416 }
20417 }
20418 if (IsSigned.value_or(false))
20419 break;
20420 // Scan through gather nodes.
20421 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20422 auto It = MinBWs.find(BVE);
20423 if (It != MinBWs.end()) {
20424 IsSigned = IsSigned.value_or(false) || It->second.second;
20425 if (*IsSigned)
20426 break;
20427 }
20428 }
20429 if (IsSigned.value_or(false))
20430 break;
20431 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20432 IsSigned =
20433 IsSigned.value_or(false) ||
20434 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20435 continue;
20436 }
20437 if (IsSigned.value_or(false))
20438 break;
20439 }
20440 }
20441 if (IsSigned.value_or(false)) {
20442 // Final attempt - check user node.
20443 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20444 if (It != MinBWs.end())
20445 IsSigned = It->second.second;
20446 }
20447 assert(IsSigned &&
20448 "Expected user node or perfect diamond match in MinBWs.");
20449 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20450 }
20451 PrevVec->replaceAllUsesWith(Vec);
20452 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20453 // Replace the stub vector node, if it was used before for one of the
20454 // buildvector nodes already.
20455 auto It = PostponedValues.find(PrevVec);
20456 if (It != PostponedValues.end()) {
20457 for (TreeEntry *VTE : It->getSecond())
20458 VTE->VectorizedValue = Vec;
20459 }
20460 eraseInstruction(PrevVec);
20461 }
20462
20463 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20464 << " values.\n");
20465
20466 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20467 // Maps vector instruction to original insertelement instruction
20468 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20469 // Maps extract Scalar to the corresponding extractelement instruction in the
20470 // basic block. Only one extractelement per block should be emitted.
20471 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20472 ScalarToEEs;
20473 SmallDenseSet<Value *, 4> UsedInserts;
20474 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20475 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20476 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20477 // Extract all of the elements with the external uses.
20478 for (const auto &ExternalUse : ExternalUses) {
20479 Value *Scalar = ExternalUse.Scalar;
20480 llvm::User *User = ExternalUse.User;
20481
20482 // Skip users that we already RAUWed. This happens when one instruction
20483 // has multiple uses of the same value.
20484 if (User && !is_contained(Scalar->users(), User))
20485 continue;
20486 const TreeEntry *E = &ExternalUse.E;
20487 assert(E && "Invalid scalar");
20488 assert(!E->isGather() && "Extracting from a gather list");
20489 // Non-instruction pointers are not deleted, just skip them.
20490 if (E->getOpcode() == Instruction::GetElementPtr &&
20491 !isa<GetElementPtrInst>(Scalar))
20492 continue;
20493
20494 Value *Vec = E->VectorizedValue;
20495 assert(Vec && "Can't find vectorizable value");
20496
20497 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20498 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20499 if (Scalar->getType() != Vec->getType()) {
20500 Value *Ex = nullptr;
20501 Value *ExV = nullptr;
20502 auto *Inst = dyn_cast<Instruction>(Scalar);
20503 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20504 auto It = ScalarToEEs.find(Scalar);
20505 if (It != ScalarToEEs.end()) {
20506 // No need to emit many extracts, just move the only one in the
20507 // current block.
20508 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20509 : Builder.GetInsertBlock());
20510 if (EEIt != It->second.end()) {
20511 Value *PrevV = EEIt->second.first;
20512 if (auto *I = dyn_cast<Instruction>(PrevV);
20513 I && !ReplaceInst &&
20514 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20515 Builder.GetInsertPoint()->comesBefore(I)) {
20516 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20517 Builder.GetInsertPoint());
20518 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20519 CI->moveAfter(I);
20520 }
20521 Ex = PrevV;
20522 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20523 }
20524 }
20525 if (!Ex) {
20526 // "Reuse" the existing extract to improve final codegen.
20527 if (ReplaceInst) {
20528 // Leave the instruction as is if extraction is cheaper and all
20529 // operands are scalar.
20530 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20531 IgnoredExtracts.insert(EE);
20532 Ex = EE;
20533 } else {
20534 auto *CloneInst = Inst->clone();
20535 CloneInst->insertBefore(Inst->getIterator());
20536 if (Inst->hasName())
20537 CloneInst->takeName(Inst);
20538 Ex = CloneInst;
20539 }
20540 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20541 ES && isa<Instruction>(Vec)) {
20542 Value *V = ES->getVectorOperand();
20543 auto *IVec = cast<Instruction>(Vec);
20544 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20545 V = ETEs.front()->VectorizedValue;
20546 if (auto *IV = dyn_cast<Instruction>(V);
20547 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20548 IV->comesBefore(IVec))
20549 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20550 else
20551 Ex = Builder.CreateExtractElement(Vec, Lane);
20552 } else if (auto *VecTy =
20553 dyn_cast<FixedVectorType>(Scalar->getType())) {
20554 assert(SLPReVec && "FixedVectorType is not expected.");
20555 unsigned VecTyNumElements = VecTy->getNumElements();
20556 // When REVEC is enabled, we need to extract a vector.
20557 // Note: The element size of Scalar may be different from the
20558 // element size of Vec.
20559 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20560 ExternalUse.Lane * VecTyNumElements);
20561 } else {
20562 Ex = Builder.CreateExtractElement(Vec, Lane);
20563 }
20564 // If necessary, sign-extend or zero-extend the extracted value
20565 // back to the original scalar type.
20566 ExV = Ex;
20567 if (Scalar->getType() != Ex->getType())
20568 ExV = Builder.CreateIntCast(
20569 Ex, Scalar->getType(),
20570 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20571 auto *I = dyn_cast<Instruction>(Ex);
20572 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20573 : &F->getEntryBlock(),
20574 std::make_pair(Ex, ExV));
20575 }
20576 // The then branch of the previous if may produce constants, since 0
20577 // operand might be a constant.
20578 if (auto *ExI = dyn_cast<Instruction>(Ex);
20579 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20580 GatherShuffleExtractSeq.insert(ExI);
20581 CSEBlocks.insert(ExI->getParent());
20582 }
20583 return ExV;
20584 }
20585 assert(isa<FixedVectorType>(Scalar->getType()) &&
20586 isa<InsertElementInst>(Scalar) &&
20587 "In-tree scalar of vector type is not insertelement?");
20588 auto *IE = cast<InsertElementInst>(Scalar);
20589 VectorToInsertElement.try_emplace(Vec, IE);
20590 return Vec;
20591 };
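// ExtractAndExtendIfNeeded emits at most one extractelement per scalar per
// basic block (cached in ScalarToEEs) and, for demoted values, casts the
// extracted value back to the original scalar type.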
20592 // If User == nullptr, the Scalar remains a scalar in the vectorized
20593 // instructions or is used as an extra argument. Generate an ExtractElement
20594 // and update the record for this scalar in ExternallyUsedValues.
20595 if (!User) {
20596 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20597 continue;
20598 assert(
20599 (ExternallyUsedValues.count(Scalar) ||
20600 ExternalUsesWithNonUsers.count(Scalar) ||
20601 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20602 any_of(
20603 Scalar->users(),
20604 [&, TTI = TTI](llvm::User *U) {
20605 if (ExternalUsesAsOriginalScalar.contains(U))
20606 return true;
20607 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20608 return !UseEntries.empty() &&
20609 (E->State == TreeEntry::Vectorize ||
20610 E->State == TreeEntry::StridedVectorize ||
20611 E->State == TreeEntry::CompressVectorize) &&
20612 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20613 return (UseEntry->State == TreeEntry::Vectorize ||
20614 UseEntry->State ==
20615 TreeEntry::StridedVectorize ||
20616 UseEntry->State ==
20617 TreeEntry::CompressVectorize) &&
20618 doesInTreeUserNeedToExtract(
20619 Scalar, getRootEntryInstruction(*UseEntry),
20620 TLI, TTI);
20621 });
20622 })) &&
20623 "Scalar with nullptr User must be registered in "
20624 "ExternallyUsedValues map or remain as scalar in vectorized "
20625 "instructions");
20626 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20627 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20628 if (PHI->getParent()->isLandingPad())
20629 Builder.SetInsertPoint(
20630 PHI->getParent(),
20631 std::next(
20632 PHI->getParent()->getLandingPadInst()->getIterator()));
20633 else
20634 Builder.SetInsertPoint(PHI->getParent(),
20635 PHI->getParent()->getFirstNonPHIIt());
20636 } else {
20637 Builder.SetInsertPoint(VecI->getParent(),
20638 std::next(VecI->getIterator()));
20639 }
20640 } else {
20641 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20642 }
20643 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20644 // Required to update internally referenced instructions.
20645 if (Scalar != NewInst) {
20646 assert((!isa<ExtractElementInst>(Scalar) ||
20647 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20648 "Extractelements should not be replaced.");
20649 Scalar->replaceAllUsesWith(NewInst);
20650 }
20651 continue;
20652 }
20653
20654 if (auto *VU = dyn_cast<InsertElementInst>(User);
20655 VU && VU->getOperand(1) == Scalar) {
20656 // Skip if the scalar is another vector op or Vec is not an instruction.
20657 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20658 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20659 if (!UsedInserts.insert(VU).second)
20660 continue;
20661 // Need to use original vector, if the root is truncated.
20662 auto BWIt = MinBWs.find(E);
20663 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20664 auto *ScalarTy = FTy->getElementType();
20665 auto Key = std::make_pair(Vec, ScalarTy);
20666 auto VecIt = VectorCasts.find(Key);
20667 if (VecIt == VectorCasts.end()) {
20668 IRBuilderBase::InsertPointGuard Guard(Builder);
20669 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20670 if (IVec->getParent()->isLandingPad())
20671 Builder.SetInsertPoint(IVec->getParent(),
20672 std::next(IVec->getParent()
20673 ->getLandingPadInst()
20674 ->getIterator()));
20675 else
20676 Builder.SetInsertPoint(
20677 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20678 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20679 Builder.SetInsertPoint(IVec->getNextNode());
20680 }
20681 Vec = Builder.CreateIntCast(
20682 Vec,
20683 getWidenedType(
20684 ScalarTy,
20685 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20686 BWIt->second.second);
20687 VectorCasts.try_emplace(Key, Vec);
20688 } else {
20689 Vec = VecIt->second;
20690 }
20691 }
20692
20693 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20694 if (InsertIdx) {
20695 auto *It = find_if(
20696 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20697 // Checks if 2 insertelements are from the same buildvector.
20698 InsertElementInst *VecInsert = Data.InsertElements.front();
20699 return areTwoInsertFromSameBuildVector(
20700 VU, VecInsert,
20701 [](InsertElementInst *II) { return II->getOperand(0); });
20702 });
20703 unsigned Idx = *InsertIdx;
20704 if (It == ShuffledInserts.end()) {
20705 (void)ShuffledInserts.emplace_back();
20706 It = std::next(ShuffledInserts.begin(),
20707 ShuffledInserts.size() - 1);
20708 }
20709 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20710 if (Mask.empty())
20711 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20712 Mask[Idx] = ExternalUse.Lane;
20713 It->InsertElements.push_back(cast<InsertElementInst>(User));
20714 continue;
20715 }
20716 }
20717 }
20718 }
20719
20720 // Generate extracts for out-of-tree users.
20721 // Find the insertion point for the extractelement lane.
20722 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20723 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20724 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20725 if (PH->getIncomingValue(I) == Scalar) {
20726 Instruction *IncomingTerminator =
20727 PH->getIncomingBlock(I)->getTerminator();
20728 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20729 Builder.SetInsertPoint(VecI->getParent(),
20730 std::next(VecI->getIterator()));
20731 } else {
20732 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20733 }
20734 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20735 PH->setOperand(I, NewInst);
20736 }
20737 }
20738 } else {
20739 Builder.SetInsertPoint(cast<Instruction>(User));
20740 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20741 User->replaceUsesOfWith(Scalar, NewInst);
20742 }
20743 } else {
20744 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20745 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20746 User->replaceUsesOfWith(Scalar, NewInst);
20747 }
20748
20749 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20750 }
20751
20752 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20753 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20754 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20755 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20756 for (int I = 0, E = Mask.size(); I < E; ++I) {
20757 if (Mask[I] < VF)
20758 CombinedMask1[I] = Mask[I];
20759 else
20760 CombinedMask2[I] = Mask[I] - VF;
20761 }
20762 ShuffleInstructionBuilder ShuffleBuilder(
20763 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20764 ShuffleBuilder.add(V1, CombinedMask1);
20765 if (V2)
20766 ShuffleBuilder.add(V2, CombinedMask2);
20767 return ShuffleBuilder.finalize({}, {}, {});
20768 };
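  // The CreateShuffle helper above splits a combined two-input mask into
  // per-operand masks: indices < VF select from V1, and indices >= VF select
  // from V2 rebased by VF. E.g. with VF = 4, the mask <0, 5, 2, 7> becomes
  // <0, poison, 2, poison> for V1 and <poison, 1, poison, 3> for V2.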
20769
20770 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20771 bool ForSingleMask) {
20772 unsigned VF = Mask.size();
20773 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20774 if (VF != VecVF) {
20775 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20776 Vec = CreateShuffle(Vec, nullptr, Mask);
20777 return std::make_pair(Vec, true);
20778 }
20779 if (!ForSingleMask) {
20780 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20781 for (unsigned I = 0; I < VF; ++I) {
20782 if (Mask[I] != PoisonMaskElem)
20783 ResizeMask[Mask[I]] = Mask[I];
20784 }
20785 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20786 }
20787 }
20788
20789 return std::make_pair(Vec, false);
20790 };
20791 // Perform shuffling of the vectorized tree entries for better handling of
20792 // external extracts.
20793 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20794 // Find the first and the last instruction in the list of insertelements.
20795 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20796 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20797 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20798 Builder.SetInsertPoint(LastInsert);
20799 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20800 Value *NewInst = performExtractsShuffleAction<Value>(
20801 MutableArrayRef(Vector.data(), Vector.size()),
20802 FirstInsert->getOperand(0),
20803 [](Value *Vec) {
20804 return cast<VectorType>(Vec->getType())
20805 ->getElementCount()
20806 .getKnownMinValue();
20807 },
20808 ResizeToVF,
20809 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20810 ArrayRef<Value *> Vals) {
20811 assert((Vals.size() == 1 || Vals.size() == 2) &&
20812 "Expected exactly 1 or 2 input values.");
20813 if (Vals.size() == 1) {
20814 // Do not create shuffle if the mask is a simple identity
20815 // non-resizing mask.
20816 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20817 ->getNumElements() ||
20818 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20819 return CreateShuffle(Vals.front(), nullptr, Mask);
20820 return Vals.front();
20821 }
20822 return CreateShuffle(Vals.front() ? Vals.front()
20823 : FirstInsert->getOperand(0),
20824 Vals.back(), Mask);
20825 });
20826 auto It = ShuffledInserts[I].InsertElements.rbegin();
20827 // Rebuild buildvector chain.
20828 InsertElementInst *II = nullptr;
20829 if (It != ShuffledInserts[I].InsertElements.rend())
20830 II = *It;
20831 SmallVector<Instruction *> Inserts;
20832 while (It != ShuffledInserts[I].InsertElements.rend()) {
20833 assert(II && "Must be an insertelement instruction.");
20834 if (*It == II)
20835 ++It;
20836 else
20837 Inserts.push_back(cast<Instruction>(II));
20838 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20839 }
20840 for (Instruction *II : reverse(Inserts)) {
20841 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20842 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20843 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20844 II->moveAfter(NewI);
20845 NewInst = II;
20846 }
20847 LastInsert->replaceAllUsesWith(NewInst);
20848 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20849 IE->replaceUsesOfWith(IE->getOperand(0),
20850 PoisonValue::get(IE->getOperand(0)->getType()));
20851 IE->replaceUsesOfWith(IE->getOperand(1),
20852 PoisonValue::get(IE->getOperand(1)->getType()));
20853 eraseInstruction(IE);
20854 }
20855 CSEBlocks.insert(LastInsert->getParent());
20856 }
20857
20858 SmallVector<Instruction *> RemovedInsts;
20859 // For each vectorized value:
20860 for (auto &TEPtr : VectorizableTree) {
20861 TreeEntry *Entry = TEPtr.get();
20862
20863 // No need to handle users of gathered values.
20864 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20865 continue;
20866
20867 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20868
20869 // For each lane:
20870 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20871 Value *Scalar = Entry->Scalars[Lane];
20872
20873 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20874 !isa<GetElementPtrInst>(Scalar))
20875 continue;
20876 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20877 EE && IgnoredExtracts.contains(EE))
20878 continue;
20879 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20880 continue;
20881#ifndef NDEBUG
20882 Type *Ty = Scalar->getType();
20883 if (!Ty->isVoidTy()) {
20884 for (User *U : Scalar->users()) {
20885 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20886
20887 // It is legal to delete users in the ignorelist.
20888 assert((isVectorized(U) ||
20889 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20890 (isa_and_nonnull<Instruction>(U) &&
20891 isDeleted(cast<Instruction>(U)))) &&
20892 "Deleting out-of-tree value");
20893 }
20894 }
20895#endif
20896 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20897 auto *I = cast<Instruction>(Scalar);
20898 RemovedInsts.push_back(I);
20899 }
20900 }
20901
20902 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20903 // new vector instruction.
20904 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20905 V->mergeDIAssignID(RemovedInsts);
20906
20907 // Clear up reduction references, if any.
20908 if (UserIgnoreList) {
20909 for (Instruction *I : RemovedInsts) {
20910 const TreeEntry *IE = getTreeEntries(I).front();
20911 if (IE->Idx != 0 &&
20912 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20913 (ValueToGatherNodes.lookup(I).contains(
20914 VectorizableTree.front().get()) ||
20915 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20916 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20917 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20918 IE->UserTreeIndex &&
20919 is_contained(VectorizableTree.front()->Scalars, I)) &&
20920 !(GatheredLoadsEntriesFirst.has_value() &&
20921 IE->Idx >= *GatheredLoadsEntriesFirst &&
20922 VectorizableTree.front()->isGather() &&
20923 is_contained(VectorizableTree.front()->Scalars, I)) &&
20924 !(!VectorizableTree.front()->isGather() &&
20925 VectorizableTree.front()->isCopyableElement(I)))
20926 continue;
20927 SmallVector<SelectInst *> LogicalOpSelects;
20928 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20929 // Do not replace the condition of a logical op of the form select <cond>.
20930 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20931 (match(U.getUser(), m_LogicalAnd()) ||
20932 match(U.getUser(), m_LogicalOr())) &&
20933 U.getOperandNo() == 0;
20934 if (IsPoisoningLogicalOp) {
20935 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20936 return false;
20937 }
20938 return UserIgnoreList->contains(U.getUser());
20939 });
20940 // Replace conditions of the poisoning logical ops with the non-poison
20941 // constant value.
20942 for (SelectInst *SI : LogicalOpSelects)
20943 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20944 }
20945 }
20946 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20947 // cache correctness.
20948 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
20949 // - instructions are not deleted until later.
20950 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20951
20952 Builder.ClearInsertionPoint();
20953 InstrElementSize.clear();
20954
20955 const TreeEntry &RootTE = *VectorizableTree.front();
20956 Value *Vec = RootTE.VectorizedValue;
20957 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20958 It != MinBWs.end() &&
20959 ReductionBitWidth != It->second.first) {
20960 IRBuilder<>::InsertPointGuard Guard(Builder);
20961 Builder.SetInsertPoint(ReductionRoot->getParent(),
20962 ReductionRoot->getIterator());
20963 Vec = Builder.CreateIntCast(
20964 Vec,
20965 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20966 cast<VectorType>(Vec->getType())->getElementCount()),
20967 It->second.second);
20968 }
20969 return Vec;
20970}
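// A standalone sketch of the final root cast performed just above; the helper
// name is hypothetical and this is not part of the pass. When the root entry
// was narrowed via MinBWs but the reduction expects ReductionBitWidth, the
// vector is re-cast using the signedness recorded in MinBWs.
static Value *castRootToReductionWidthSketch(IRBuilderBase &Builder, Value *Vec,
                                             unsigned ReductionBitWidth,
                                             bool IsSigned) {
  auto *VecTy = cast<VectorType>(Vec->getType());
  Type *DstTy = VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                                VecTy->getElementCount());
  // CreateIntCast emits a sext, zext or trunc (or nothing) as appropriate.
  return Builder.CreateIntCast(Vec, DstTy, IsSigned);
}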
20971
20972void BoUpSLP::optimizeGatherSequence() {
20973 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20974 << " gather sequence instructions.\n");
20975 // LICM InsertElementInst sequences.
20976 for (Instruction *I : GatherShuffleExtractSeq) {
20977 if (isDeleted(I))
20978 continue;
20979
20980 // Check if this block is inside a loop.
20981 Loop *L = LI->getLoopFor(I->getParent());
20982 if (!L)
20983 continue;
20984
20985 // Check if it has a preheader.
20986 BasicBlock *PreHeader = L->getLoopPreheader();
20987 if (!PreHeader)
20988 continue;
20989
20990 // If the vector or the element that we insert into it is an
20991 // instruction that is defined inside the loop, then we can't
20992 // hoist this instruction.
20993 if (any_of(I->operands(), [L](Value *V) {
20994 auto *OpI = dyn_cast<Instruction>(V);
20995 return OpI && L->contains(OpI);
20996 }))
20997 continue;
20998
20999 // We can hoist this instruction. Move it to the pre-header.
21000 I->moveBefore(PreHeader->getTerminator()->getIterator());
21001 CSEBlocks.insert(PreHeader);
21002 }
21003
21004 // Make a list of all reachable blocks in our CSE queue.
21005 SmallVector<const DomTreeNode *, 8> CSEWorkList;
21006 CSEWorkList.reserve(CSEBlocks.size());
21007 for (BasicBlock *BB : CSEBlocks)
21008 if (DomTreeNode *N = DT->getNode(BB)) {
21009 assert(DT->isReachableFromEntry(N));
21010 CSEWorkList.push_back(N);
21011 }
21012
21013 // Sort blocks by domination. This ensures we visit a block after all blocks
21014 // dominating it are visited.
21015 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
21016 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21017 "Different nodes should have different DFS numbers");
21018 return A->getDFSNumIn() < B->getDFSNumIn();
21019 });
21020
21021 // Less defined shuffles can be replaced by the more defined copies.
21022 // Between two shuffles one is less defined if it has the same vector operands
21023 // and its mask indices are the same as in the first one or undefs. E.g.
21024 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
21025 // poison, <0, 0, 0, 0>.
21026 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
21027 Instruction *I2,
21028 SmallVectorImpl<int> &NewMask) {
21029 if (I1->getType() != I2->getType())
21030 return false;
21031 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
21032 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
21033 if (!SI1 || !SI2)
21034 return I1->isIdenticalTo(I2);
21035 if (SI1->isIdenticalTo(SI2))
21036 return true;
21037 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
21038 if (SI1->getOperand(I) != SI2->getOperand(I))
21039 return false;
21040 // Check if the second instruction is more defined than the first one.
21041 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
21042 ArrayRef<int> SM1 = SI1->getShuffleMask();
21043 // Count trailing undefs in the mask to check the final number of used
21044 // registers.
21045 unsigned LastUndefsCnt = 0;
21046 for (int I = 0, E = NewMask.size(); I < E; ++I) {
21047 if (SM1[I] == PoisonMaskElem)
21048 ++LastUndefsCnt;
21049 else
21050 LastUndefsCnt = 0;
21051 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
21052 NewMask[I] != SM1[I])
21053 return false;
21054 if (NewMask[I] == PoisonMaskElem)
21055 NewMask[I] = SM1[I];
21056 }
21057 // Check if the last undefs actually change the final number of used vector
21058 // registers.
21059 return SM1.size() - LastUndefsCnt > 1 &&
21060 ::getNumberOfParts(*TTI, SI1->getType()) ==
21061 ::getNumberOfParts(
21062 *TTI, getWidenedType(SI1->getType()->getElementType(),
21063 SM1.size() - LastUndefsCnt));
21064 };
21065 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
21066 // instructions. TODO: We can further optimize this scan if we split the
21067 // instructions into different buckets based on the insert lane.
21068 SmallVector<Instruction *, 16> Visited;
21069 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21070 assert(*I &&
21071 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
21072 "Worklist not sorted properly!");
21073 BasicBlock *BB = (*I)->getBlock();
21074 // For all instructions in blocks containing gather sequences:
21075 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
21076 if (isDeleted(&In))
21077 continue;
21078 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
21079 !GatherShuffleExtractSeq.contains(&In))
21080 continue;
21081
21082 // Check if we can replace this instruction with any of the
21083 // visited instructions.
21084 bool Replaced = false;
21085 for (Instruction *&V : Visited) {
21086 SmallVector<int> NewMask;
21087 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
21088 DT->dominates(V->getParent(), In.getParent())) {
21089 In.replaceAllUsesWith(V);
21090 eraseInstruction(&In);
21091 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
21092 if (!NewMask.empty())
21093 SI->setShuffleMask(NewMask);
21094 Replaced = true;
21095 break;
21096 }
21097 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
21098 GatherShuffleExtractSeq.contains(V) &&
21099 IsIdenticalOrLessDefined(V, &In, NewMask) &&
21100 DT->dominates(In.getParent(), V->getParent())) {
21101 In.moveAfter(V);
21102 V->replaceAllUsesWith(&In);
21103 eraseInstruction(V);
21104 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
21105 if (!NewMask.empty())
21106 SI->setShuffleMask(NewMask);
21107 V = &In;
21108 Replaced = true;
21109 break;
21110 }
21111 }
21112 if (!Replaced) {
21113 assert(!is_contained(Visited, &In));
21114 Visited.push_back(&In);
21115 }
21116 }
21117 }
21118 CSEBlocks.clear();
21119 GatherShuffleExtractSeq.clear();
21120}
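// A condensed sketch of the mask relation checked by IsIdenticalOrLessDefined
// in the function above (illustrative helper, assuming PoisonMaskElem encodes
// an undefined lane): given identical vector operands, a shuffle with mask M1
// can be replaced by one with mask M2 when every lane where both masks are
// defined agrees, e.g. <0, poison, 2, poison> is replaceable by <0, 1, 2, 3>
// but not by <1, 1, 2, 3>.
static bool isCompatibleMaskSketch(ArrayRef<int> M1, ArrayRef<int> M2) {
  if (M1.size() != M2.size())
    return false;
  for (auto [E1, E2] : zip(M1, M2))
    if (E1 != PoisonMaskElem && E2 != PoisonMaskElem && E1 != E2)
      return false; // Conflicting defined lanes: masks are incompatible.
  return true;
}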
21121
21122BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
21123 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
21124 auto &BundlePtr =
21125 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
21126 for (Value *V : VL) {
21127 if (S.isNonSchedulable(V))
21128 continue;
21129 auto *I = cast<Instruction>(V);
21130 if (S.isCopyableElement(V)) {
21131 // Add a copyable element model.
21132 ScheduleCopyableData &SD =
21133 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
21134 // Group the instructions to a bundle.
21135 BundlePtr->add(&SD);
21136 continue;
21137 }
21138 ScheduleData *BundleMember = getScheduleData(V);
21139 assert(BundleMember && "no ScheduleData for bundle member "
21140 "(maybe not in same basic block)");
21141 // Group the instructions to a bundle.
21142 BundlePtr->add(BundleMember);
21143 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
21144 BundlePtr.get());
21145 }
21146 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
21147 return *BundlePtr;
21148}
21149
21150// Groups the instructions to a bundle (which is then a single scheduling entity)
21151// and schedules instructions until the bundle gets ready.
21152std::optional<BoUpSLP::ScheduleBundle *>
21153BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
21154 const InstructionsState &S,
21155 const EdgeInfo &EI) {
21156 // No need to schedule PHIs, insertelement, extractelement and extractvalue
21157 // instructions.
21158 if (isa<PHINode>(S.getMainOp()) ||
21159 isVectorLikeInstWithConstOps(S.getMainOp()))
21160 return nullptr;
21161 // If the parent node is non-schedulable, the current node is copyable, and
21162 // any of the parent's instructions are used outside several basic blocks or in
21163 // a bin-op node, cancel scheduling: it may cause wrong def-use deps in the
21164 // analysis, leading to a crash.
21165 // Non-scheduled nodes may not have an associated ScheduleData model, which may
21166 // lead to skipped dep analysis.
21167 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21168 EI.UserTE->doesNotNeedToSchedule() &&
21169 EI.UserTE->getOpcode() != Instruction::PHI &&
21170 any_of(EI.UserTE->Scalars, [](Value *V) {
21171 auto *I = dyn_cast<Instruction>(V);
21172 if (!I || I->hasOneUser())
21173 return false;
21174 for (User *U : I->users()) {
21175 auto *UI = cast<Instruction>(U);
21176 if (isa<BinaryOperator>(UI))
21177 return true;
21178 }
21179 return false;
21180 }))
21181 return std::nullopt;
21182 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21183 EI.UserTE->hasCopyableElements() &&
21184 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21185 all_of(VL, [&](Value *V) {
21186 if (S.isCopyableElement(V))
21187 return true;
21188 return isUsedOutsideBlock(V);
21189 }))
21190 return std::nullopt;
21191 // If any instruction is only used outside the block and its operand is
21192 // placed immediately before it, do not schedule: it may cause a wrong def-use chain.
21193 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
21194 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
21195 return false;
21196 if (isUsedOutsideBlock(V)) {
21197 for (Value *Op : cast<Instruction>(V)->operands()) {
21198 auto *I = dyn_cast<Instruction>(Op);
21199 if (!I)
21200 continue;
21201 return SLP->isVectorized(I) && I->getNextNode() == V;
21202 }
21203 }
21204 return false;
21205 }))
21206 return std::nullopt;
21207 if (S.areInstructionsWithCopyableElements() && EI) {
21208 bool IsNonSchedulableWithParentPhiNode =
21209 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
21210 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
21211 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21212 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21213 if (IsNonSchedulableWithParentPhiNode) {
21214 SmallSet<std::pair<Value *, Value *>, 4> Values;
21215 for (const auto [Idx, V] :
21216 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
21217 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
21218 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
21219 auto *I = dyn_cast<Instruction>(Op);
21220 if (!I || !isCommutative(I))
21221 continue;
21222 if (!Values.insert(std::make_pair(V, Op)).second)
21223 return std::nullopt;
21224 }
21225 }
21226 }
21227 bool HasCopyables = S.areInstructionsWithCopyableElements();
21228 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21229 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21230 // If all operands were replaced by copyables, the operands of this node
21231 // might not be, so we need to recalculate dependencies for the schedule
21232 // data replaced by copyable schedule data.
21233 SmallVector<ScheduleData *> ControlDependentMembers;
21234 for (Value *V : VL) {
21235 auto *I = dyn_cast<Instruction>(V);
21236 if (!I || (HasCopyables && S.isCopyableElement(V)))
21237 continue;
21238 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21239 for (const Use &U : I->operands()) {
21240 unsigned &NumOps =
21241 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21242 .first->getSecond();
21243 ++NumOps;
21244 if (auto *Op = dyn_cast<Instruction>(U.get());
21245 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21246 if (ScheduleData *OpSD = getScheduleData(Op);
21247 OpSD && OpSD->hasValidDependencies()) {
21248 OpSD->clearDirectDependencies();
21249 if (RegionHasStackSave ||
21250 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
21251 ControlDependentMembers.push_back(OpSD);
21252 }
21253 }
21254 }
21255 }
21256 if (!ControlDependentMembers.empty()) {
21257 ScheduleBundle Invalid = ScheduleBundle::invalid();
21258 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
21259 ControlDependentMembers);
21260 }
21261 return nullptr;
21262 }
21263
21264 // Initialize the instruction bundle.
21265 Instruction *OldScheduleEnd = ScheduleEnd;
21266 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21267
21268 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21269 // Clear deps or recalculate the region, if the memory instruction is a
21270 // copyable. It may have memory deps, which must be recalculated.
21271 SmallVector<ScheduleData *> ControlDependentMembers;
21272 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21273 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21274 for (ScheduleEntity *SE : Bundle.getBundle()) {
21275 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21276 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21277 BundleMember && BundleMember->hasValidDependencies()) {
21278 BundleMember->clearDirectDependencies();
21279 if (RegionHasStackSave ||
21280 !isGuaranteedToTransferExecutionToSuccessor(
21281 BundleMember->getInst()))
21282 ControlDependentMembers.push_back(BundleMember);
21283 }
21284 continue;
21285 }
21286 auto *SD = cast<ScheduleData>(SE);
21287 if (SD->hasValidDependencies() &&
21288 (!S.areInstructionsWithCopyableElements() ||
21289 !S.isCopyableElement(SD->getInst())) &&
21290 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21291 EI.UserTE->hasState() &&
21292 (!EI.UserTE->hasCopyableElements() ||
21293 !EI.UserTE->isCopyableElement(SD->getInst())))
21294 SD->clearDirectDependencies();
21295 for (const Use &U : SD->getInst()->operands()) {
21296 unsigned &NumOps =
21297 UserOpToNumOps
21298 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21299 .first->getSecond();
21300 ++NumOps;
21301 if (auto *Op = dyn_cast<Instruction>(U.get());
21302 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21303 *SLP, NumOps)) {
21304 if (ScheduleData *OpSD = getScheduleData(Op);
21305 OpSD && OpSD->hasValidDependencies()) {
21306 OpSD->clearDirectDependencies();
21307 if (RegionHasStackSave ||
21308 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
21309 ControlDependentMembers.push_back(OpSD);
21310 }
21311 }
21312 }
21313 }
21314 };
21315 // The scheduling region got new instructions at the lower end (or it is a
21316 // new region for the first bundle). This makes it necessary to
21317 // recalculate all dependencies.
21318 // It is seldom that this needs to be done a second time after adding the
21319 // initial bundle to the region.
21320 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21321 for_each(ScheduleDataMap, [&](auto &P) {
21322 if (BB != P.first->getParent())
21323 return;
21324 ScheduleData *SD = P.second;
21325 if (isInSchedulingRegion(*SD))
21326 SD->clearDependencies();
21327 });
21328 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21329 for_each(P.second, [&](ScheduleCopyableData *SD) {
21330 if (isInSchedulingRegion(*SD))
21331 SD->clearDependencies();
21332 });
21333 });
21334 ReSchedule = true;
21335 }
21336 // Check if the bundle data already has deps for copyable elements. In
21337 // this case we need to reset the deps and recalculate them.
21338 if (Bundle && !Bundle.getBundle().empty()) {
21339 if (S.areInstructionsWithCopyableElements() ||
21340 !ScheduleCopyableDataMap.empty())
21341 CheckIfNeedToClearDeps(Bundle);
21342 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21343 << BB->getName() << "\n");
21344 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21345 ControlDependentMembers);
21346 } else if (!ControlDependentMembers.empty()) {
21347 ScheduleBundle Invalid = ScheduleBundle::invalid();
21348 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21349 ControlDependentMembers);
21350 }
21351
21352 if (ReSchedule) {
21353 resetSchedule();
21354 initialFillReadyList(ReadyInsts);
21355 }
21356
21357 // Now try to schedule the new bundle or (if no bundle) just calculate
21358 // dependencies. As soon as the bundle is "ready" it means that there are no
21359 // cyclic dependencies and we can schedule it. Note that it's important that we
21360 // don't "schedule" the bundle yet.
21361 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21362 !ReadyInsts.empty()) {
21363 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21364 assert(Picked->isReady() && "must be ready to schedule");
21365 schedule(*SLP, S, EI, Picked, ReadyInsts);
21366 if (Picked == &Bundle)
21367 break;
21368 }
21369 };
21370
21371 // Make sure that the scheduling region contains all
21372 // instructions of the bundle.
21373 for (Value *V : VL) {
21374 if (S.isNonSchedulable(V))
21375 continue;
21376 if (!extendSchedulingRegion(V, S)) {
21377 // If the scheduling region got new instructions at the lower end (or it
21378 // is a new region for the first bundle). This makes it necessary to
21379 // recalculate all dependencies.
21380 // Otherwise the compiler may crash trying to incorrectly calculate
21381 // dependencies and emit instruction in the wrong order at the actual
21382 // scheduling.
21383 ScheduleBundle Invalid = ScheduleBundle::invalid();
21384 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21385 return std::nullopt;
21386 }
21387 }
21388
21389 bool ReSchedule = false;
21390 for (Value *V : VL) {
21391 if (S.isNonSchedulable(V))
21392 continue;
21393 SmallVector<ScheduleCopyableData *> CopyableData =
21394 getScheduleCopyableData(cast<Instruction>(V));
21395 if (!CopyableData.empty()) {
21396 for (ScheduleCopyableData *SD : CopyableData)
21397 ReadyInsts.remove(SD);
21398 }
21399 ScheduleData *BundleMember = getScheduleData(V);
21400 assert((BundleMember || S.isCopyableElement(V)) &&
21401 "no ScheduleData for bundle member (maybe not in same basic block)");
21402 if (!BundleMember)
21403 continue;
21404
21405 // Make sure we don't leave the pieces of the bundle in the ready list when
21406 // the whole bundle might not be ready.
21407 ReadyInsts.remove(BundleMember);
21408 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21409 !Bundles.empty()) {
21410 for (ScheduleBundle *B : Bundles)
21411 ReadyInsts.remove(B);
21412 }
21413
21414 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21415 continue;
21416 // A bundle member was scheduled as a single instruction before and now
21417 // needs to be scheduled as part of the bundle. We just get rid of the
21418 // existing schedule.
21419 // A bundle member may also have had its deps calculated before it became a
21420 // copyable element - we need to reschedule.
21421 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21422 << " was already scheduled\n");
21423 ReSchedule = true;
21424 }
21425
21426 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21427 TryScheduleBundleImpl(ReSchedule, Bundle);
21428 if (!Bundle.isReady()) {
21429 for (ScheduleEntity *BD : Bundle.getBundle()) {
21430 // Copyable data scheduling is just removed.
21431 if (isa<ScheduleCopyableData>(BD))
21432 continue;
21433 if (BD->isReady()) {
21434 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21435 if (Bundles.empty()) {
21436 ReadyInsts.insert(BD);
21437 continue;
21438 }
21439 for (ScheduleBundle *B : Bundles)
21440 if (B->isReady())
21441 ReadyInsts.insert(B);
21442 }
21443 }
21444 ScheduledBundlesList.pop_back();
21445 SmallVector<ScheduleData *> ControlDependentMembers;
21446 for (Value *V : VL) {
21447 if (S.isNonSchedulable(V))
21448 continue;
21449 auto *I = cast<Instruction>(V);
21450 if (S.isCopyableElement(I)) {
21451 // Remove the copyable data from the scheduling region and restore
21452 // previous mappings.
21453 auto KV = std::make_pair(EI, I);
21454 assert(ScheduleCopyableDataMap.contains(KV) &&
21455 "no ScheduleCopyableData for copyable element");
21456 ScheduleCopyableData *SD =
21457 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21458 ScheduleCopyableDataMapByUsers[I].remove(SD);
21459 if (EI.UserTE) {
21460 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21461 const auto *It = find(Op, I);
21462 assert(It != Op.end() && "Lane not set");
21463 SmallPtrSet<Instruction *, 4> Visited;
21464 do {
21465 int Lane = std::distance(Op.begin(), It);
21466 assert(Lane >= 0 && "Lane not set");
21467 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21468 !EI.UserTE->ReorderIndices.empty())
21469 Lane = EI.UserTE->ReorderIndices[Lane];
21470 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21471 "Couldn't find extract lane");
21472 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21473 if (!Visited.insert(In).second) {
21474 It = find(make_range(std::next(It), Op.end()), I);
21475 break;
21476 }
21477 ScheduleCopyableDataMapByInstUser
21478 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21479 .pop_back();
21480 It = find(make_range(std::next(It), Op.end()), I);
21481 } while (It != Op.end());
21482 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21483 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21484 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21485 }
21486 if (ScheduleCopyableDataMapByUsers[I].empty())
21487 ScheduleCopyableDataMapByUsers.erase(I);
21488 ScheduleCopyableDataMap.erase(KV);
21489 // Need to recalculate dependencies for the actual schedule data.
21490 if (ScheduleData *OpSD = getScheduleData(I);
21491 OpSD && OpSD->hasValidDependencies()) {
21492 OpSD->clearDirectDependencies();
21493 if (RegionHasStackSave ||
21494 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
21495 ControlDependentMembers.push_back(OpSD);
21496 }
21497 continue;
21498 }
21499 ScheduledBundles.find(I)->getSecond().pop_back();
21500 }
21501 if (!ControlDependentMembers.empty()) {
21502 ScheduleBundle Invalid = ScheduleBundle::invalid();
21503 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21504 ControlDependentMembers);
21505 }
21506 return std::nullopt;
21507 }
21508 return &Bundle;
21509}
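// In outline, tryScheduleBundle above (1) extends the scheduling region so it
// covers every schedulable member of VL, (2) removes the members from the
// ready list, (3) builds the bundle and (re)calculates dependencies, and
// (4) schedules ready entities until the bundle itself becomes ready. If the
// bundle never becomes ready, a cyclic dependency was found: the bundle is
// torn down again and std::nullopt is returned so the caller treats the node
// as a gather.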
21510
21511BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21512 // Allocate a new ScheduleData for the instruction.
21513 if (ChunkPos >= ChunkSize) {
21514 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21515 ChunkPos = 0;
21516 }
21517 return &(ScheduleDataChunks.back()[ChunkPos++]);
21518}
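// The allocator above is a simple chunked pool: memory is grabbed ChunkSize
// objects at a time and handed out slot by slot, so ScheduleData addresses
// stay stable while the pool grows. A generic sketch of the same pattern
// (illustrative only; the class name is hypothetical):
template <typename T, unsigned PoolChunkSize = 256> class ChunkPoolSketch {
  SmallVector<std::unique_ptr<T[]>> Chunks;
  unsigned Pos = PoolChunkSize; // Forces a new chunk on first use.

public:
  T *allocate() {
    if (Pos >= PoolChunkSize) {
      Chunks.push_back(std::make_unique<T[]>(PoolChunkSize));
      Pos = 0;
    }
    return &Chunks.back()[Pos++];
  }
};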
21519
21520bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21521 Value *V, const InstructionsState &S) {
21522 auto *I = dyn_cast<Instruction>(V);
21523 assert(I && "bundle member must be an instruction");
21524 if (getScheduleData(I))
21525 return true;
21526 if (!ScheduleStart) {
21527 // It's the first instruction in the new region.
21528 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21529 ScheduleStart = I;
21530 ScheduleEnd = I->getNextNode();
21531 assert(ScheduleEnd && "tried to vectorize a terminator?");
21532 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21533 return true;
21534 }
21535 // Search up and down at the same time, because we don't know if the new
21536 // instruction is above or below the existing scheduling region.
21537 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are
21538 // not counted against the budget. Otherwise debug info could affect codegen.
21539 BasicBlock::reverse_iterator UpIter =
21540 ++ScheduleStart->getIterator().getReverse();
21541 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21542 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21543 BasicBlock::iterator LowerEnd = BB->end();
21544 auto IsAssumeLikeIntr = [](const Instruction &I) {
21545 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21546 return II->isAssumeLikeIntrinsic();
21547 return false;
21548 };
21549 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21550 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21551 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21552 &*DownIter != I) {
21553 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21554 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21555 return false;
21556 }
21557
21558 ++UpIter;
21559 ++DownIter;
21560
21561 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21562 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21563 }
21564 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21565 assert(I->getParent() == ScheduleStart->getParent() &&
21566 "Instruction is in wrong basic block.");
21567 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21568 ScheduleStart = I;
21569 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21570 << "\n");
21571 return true;
21572 }
21573 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21574 "Expected to reach top of the basic block or instruction down the "
21575 "lower end.");
21576 assert(I->getParent() == ScheduleEnd->getParent() &&
21577 "Instruction is in wrong basic block.");
21578 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21579 nullptr);
21580 ScheduleEnd = I->getNextNode();
21581 assert(ScheduleEnd && "tried to vectorize a terminator?");
21582 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21583 return true;
21584}
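// The loop above advances one cursor up and one cursor down per step, so the
// budget bounds the work regardless of which side of the region the new
// instruction lies on. A stripped-down sketch of the same two-cursor scan
// over a doubly linked list (hypothetical node type, illustrative only):
struct RegionNodeSketch {
  RegionNodeSketch *Prev = nullptr;
  RegionNodeSketch *Next = nullptr;
};
// Returns -1 if Target lies above the region, +1 if below, 0 if the budget is
// exhausted or Target is absent.
static int locateOutsideRegionSketch(RegionNodeSketch *Lo, RegionNodeSketch *Hi,
                                     const RegionNodeSketch *Target,
                                     unsigned Budget) {
  RegionNodeSketch *Up = Lo->Prev, *Down = Hi->Next;
  while ((Up || Down) && Budget-- > 0) {
    if (Up == Target)
      return -1; // Found above: extend the region start.
    if (Down == Target)
      return 1; // Found below: extend the region end.
    if (Up)
      Up = Up->Prev;
    if (Down)
      Down = Down->Next;
  }
  return 0;
}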
21585
21586void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21587 Instruction *ToI,
21588 ScheduleData *PrevLoadStore,
21589 ScheduleData *NextLoadStore) {
21590 ScheduleData *CurrentLoadStore = PrevLoadStore;
21591 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21592 // No need to allocate data for non-schedulable instructions.
21593 if (isa<PHINode>(I))
21594 continue;
21595 ScheduleData *SD = ScheduleDataMap.lookup(I);
21596 if (!SD) {
21597 SD = allocateScheduleDataChunks();
21598 ScheduleDataMap[I] = SD;
21599 }
21600 assert(!isInSchedulingRegion(*SD) &&
21601 "new ScheduleData already in scheduling region");
21602 SD->init(SchedulingRegionID, I);
21603
21604 auto CanIgnoreLoad = [](const Instruction *I) {
21605 const auto *LI = dyn_cast<LoadInst>(I);
21606 // If there is a simple load marked as invariant, we can ignore it.
21607 // But, in the (unlikely) case of non-simple invariant load,
21608 // we should not ignore it.
21609 return LI && LI->isSimple() &&
21610 LI->getMetadata(LLVMContext::MD_invariant_load);
21611 };
21612
21613 if (I->mayReadOrWriteMemory() &&
21614 // Simple InvariantLoad does not depend on other memory accesses.
21615 !CanIgnoreLoad(I) &&
21616 (!isa<IntrinsicInst>(I) ||
21617 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21618 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21619 Intrinsic::pseudoprobe))) {
21620 // Update the linked list of memory accessing instructions.
21621 if (CurrentLoadStore) {
21622 CurrentLoadStore->setNextLoadStore(SD);
21623 } else {
21624 FirstLoadStoreInRegion = SD;
21625 }
21626 CurrentLoadStore = SD;
21627 }
21628
21629 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21630 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21631 RegionHasStackSave = true;
21632 }
21633 if (NextLoadStore) {
21634 if (CurrentLoadStore)
21635 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21636 } else {
21637 LastLoadStoreInRegion = CurrentLoadStore;
21638 }
21639}
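// After initScheduleData runs, every memory-accessing instruction in the
// region is chained through FirstLoadStoreInRegion and setNextLoadStore, so
// the memory-dependence calculation in calculateDependencies below can walk
// just that chain instead of every instruction in the window.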
21640
21641void BoUpSLP::BlockScheduling::calculateDependencies(
21642 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21643 ArrayRef<ScheduleData *> ControlDeps) {
21644 SmallVector<ScheduleEntity *> WorkList;
21645 auto ProcessNode = [&](ScheduleEntity *SE) {
21646 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21647 if (CD->hasValidDependencies())
21648 return;
21649 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21650 CD->initDependencies();
21651 CD->resetUnscheduledDeps();
21652 const EdgeInfo &EI = CD->getEdgeInfo();
21653 if (EI.UserTE) {
21654 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21655 const auto *It = find(Op, CD->getInst());
21656 assert(It != Op.end() && "Lane not set");
21657 SmallPtrSet<Instruction *, 4> Visited;
21658 do {
21659 int Lane = std::distance(Op.begin(), It);
21660 assert(Lane >= 0 && "Lane not set");
21661 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21662 !EI.UserTE->ReorderIndices.empty())
21663 Lane = EI.UserTE->ReorderIndices[Lane];
21664 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21665 "Couldn't find extract lane");
21666 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21667 if (EI.UserTE->isCopyableElement(In)) {
21668 // We may not have related copyable scheduling data, if the
21669 // instruction is non-schedulable.
21670 if (ScheduleCopyableData *UseSD =
21671 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21672 CD->incDependencies();
21673 if (!UseSD->isScheduled())
21674 CD->incrementUnscheduledDeps(1);
21675 if (!UseSD->hasValidDependencies() ||
21676 (InsertInReadyList && UseSD->isReady()))
21677 WorkList.push_back(UseSD);
21678 }
21679 } else if (Visited.insert(In).second) {
21680 if (ScheduleData *UseSD = getScheduleData(In)) {
21681 CD->incDependencies();
21682 if (!UseSD->isScheduled())
21683 CD->incrementUnscheduledDeps(1);
21684 if (!UseSD->hasValidDependencies() ||
21685 (InsertInReadyList && UseSD->isReady()))
21686 WorkList.push_back(UseSD);
21687 }
21688 }
21689 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21690 } while (It != Op.end());
21691 if (CD->isReady() && CD->getDependencies() == 0 &&
21692 (EI.UserTE->hasState() &&
21693 (EI.UserTE->getMainOp()->getParent() !=
21694 CD->getInst()->getParent() ||
21695 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21696 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21697 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21698 auto *IU = dyn_cast<Instruction>(U);
21699 if (!IU)
21700 return true;
21701 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21702 })))))) {
21703 // If there are no uses in the block, mark the node as having a pseudo-use,
21704 // which cannot be scheduled.
21705 // This prevents incorrect def-use tracking between an external user and the
21706 // actual instruction.
21707 CD->incDependencies();
21708 CD->incrementUnscheduledDeps(1);
21709 }
21710 }
21711 return;
21712 }
21713 auto *BundleMember = cast<ScheduleData>(SE);
21714 if (BundleMember->hasValidDependencies())
21715 return;
21716 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21717 BundleMember->initDependencies();
21718 BundleMember->resetUnscheduledDeps();
21719 // Handle def-use chain dependencies.
21720 SmallDenseMap<Value *, unsigned> UserToNumOps;
21721 for (User *U : BundleMember->getInst()->users()) {
21722 if (isa<PHINode>(U))
21723 continue;
21724 if (ScheduleData *UseSD = getScheduleData(U)) {
21725 // The operand is a copyable element - skip.
21726 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21727 ++NumOps;
21728 if (areAllOperandsReplacedByCopyableData(
21729 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21730 continue;
21731 BundleMember->incDependencies();
21732 if (!UseSD->isScheduled())
21733 BundleMember->incrementUnscheduledDeps(1);
21734 if (!UseSD->hasValidDependencies() ||
21735 (InsertInReadyList && UseSD->isReady()))
21736 WorkList.push_back(UseSD);
21737 }
21738 }
21739 for (ScheduleCopyableData *UseSD :
21740 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21741 BundleMember->incDependencies();
21742 if (!UseSD->isScheduled())
21743 BundleMember->incrementUnscheduledDeps(1);
21744 if (!UseSD->hasValidDependencies() ||
21745 (InsertInReadyList && UseSD->isReady()))
21746 WorkList.push_back(UseSD);
21747 }
21748
21749 SmallPtrSet<const Instruction *, 4> Visited;
21750 auto MakeControlDependent = [&](Instruction *I) {
21751 // Do not mark control dependent twice.
21752 if (!Visited.insert(I).second)
21753 return;
21754 auto *DepDest = getScheduleData(I);
21755 assert(DepDest && "must be in schedule window");
21756 DepDest->addControlDependency(BundleMember);
21757 BundleMember->incDependencies();
21758 if (!DepDest->isScheduled())
21759 BundleMember->incrementUnscheduledDeps(1);
21760 if (!DepDest->hasValidDependencies() ||
21761 (InsertInReadyList && DepDest->isReady()))
21762 WorkList.push_back(DepDest);
21763 };
21764
21765 // Any instruction which isn't safe to speculate at the beginning of the
21766 // block is control dependent on any early exit or non-willreturn call
21767 // which precedes it.
21768 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21769 for (Instruction *I = BundleMember->getInst()->getNextNode();
21770 I != ScheduleEnd; I = I->getNextNode()) {
21771 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21772 continue;
21773
21774 // Add the dependency
21775 MakeControlDependent(I);
21776
21777 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21778 // Everything past here must be control dependent on I.
21779 break;
21780 }
21781 }
21782
21783 if (RegionHasStackSave) {
21784 // If we have an inalloca alloca instruction, it needs to be scheduled
21785 // after any preceding stacksave. We also need to prevent any alloca
21786 // from reordering above a preceding stackrestore.
21787 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21788 match(BundleMember->getInst(),
21789 m_Intrinsic<Intrinsic::stackrestore>())) {
21790 for (Instruction *I = BundleMember->getInst()->getNextNode();
21791 I != ScheduleEnd; I = I->getNextNode()) {
21792 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21793 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21794 // Any allocas past here must be control dependent on I, and I
21795 // must be memory dependent on BundleMember->Inst.
21796 break;
21797
21798 if (!isa<AllocaInst>(I))
21799 continue;
21800
21801 // Add the dependency
21802 MakeControlDependent(I);
21803 }
21804 }
21805
21806 // In addition to the cases handled just above, we need to prevent
21807 // allocas and loads/stores from moving below a stacksave or a
21808 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21809 // thought to be conservative. Moving loads/stores below a stackrestore
21810 // can lead to incorrect code.
21811 if (isa<AllocaInst>(BundleMember->getInst()) ||
21812 BundleMember->getInst()->mayReadOrWriteMemory()) {
21813 for (Instruction *I = BundleMember->getInst()->getNextNode();
21814 I != ScheduleEnd; I = I->getNextNode()) {
21815 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21816 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21817 continue;
21818
21819 // Add the dependency
21820 MakeControlDependent(I);
21821 break;
21822 }
21823 }
21824 }
21825
21826 // Handle the memory dependencies (if any).
21827 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21828 if (!NextLoadStore)
21829 return;
21830 Instruction *SrcInst = BundleMember->getInst();
21831 assert(SrcInst->mayReadOrWriteMemory() &&
21832 "NextLoadStore list for non-memory-affecting bundle?");
21833 MemoryLocation SrcLoc = getLocation(SrcInst);
21834 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21835 unsigned NumAliased = 0;
21836 unsigned DistToSrc = 1;
21837 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21838
21839 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21840 DepDest = DepDest->getNextLoadStore()) {
21841 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21842
21843 // We have two limits to reduce the complexity:
21844 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21845 // SLP->isAliased (which is the expensive part in this loop).
21846 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21847 // the whole loop (even if the loop is fast, it's quadratic).
21848 // It's important for the loop break condition (see below) to
21849 // check this limit even between two read-only instructions.
21850 if (DistToSrc >= MaxMemDepDistance ||
21851 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21852 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21853 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21854
21855 // We increment the counter only if the locations are aliased
21856 // (instead of counting all alias checks). This gives a better
21857 // balance between reduced runtime and accurate dependencies.
21858 NumAliased++;
21859
21860 DepDest->addMemoryDependency(BundleMember);
21861 BundleMember->incDependencies();
21862 if (!DepDest->isScheduled())
21863 BundleMember->incrementUnscheduledDeps(1);
21864 if (!DepDest->hasValidDependencies() ||
21865 (InsertInReadyList && DepDest->isReady()))
21866 WorkList.push_back(DepDest);
21867 }
21868
21869 // Example, explaining the loop break condition: Let's assume our
21870 // starting instruction is i0 and MaxMemDepDistance = 3.
21871 //
21872 // +--------v--v--v
21873 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21874 // +--------^--^--^
21875 //
21876 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21877 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21878 // Previously we already added dependencies from i3 to i6,i7,i8
21879 // (because of MaxMemDepDistance). As we added a dependency from
21880 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21881 // and we can abort this loop at i6.
21882 if (DistToSrc >= 2 * MaxMemDepDistance)
21883 break;
21884 DistToSrc++;
21885 }
21886 };
21887
21888 assert((Bundle || !ControlDeps.empty()) &&
21889 "expected at least one instruction to schedule");
21890 if (Bundle)
21891 WorkList.push_back(Bundle.getBundle().front());
21892 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21893 SmallPtrSet<ScheduleBundle *, 16> Visited;
21894 while (!WorkList.empty()) {
21895 ScheduleEntity *SD = WorkList.pop_back_val();
21896 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21897 ArrayRef<ScheduleBundle *> Bundles;
21898 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21899 CopyableBundle.push_back(&CD->getBundle());
21900 Bundles = CopyableBundle;
21901 } else {
21902 Bundles = getScheduleBundles(SD->getInst());
21903 }
21904 if (Bundles.empty()) {
21905 if (!SD->hasValidDependencies())
21906 ProcessNode(SD);
21907 if (InsertInReadyList && SD->isReady()) {
21908 ReadyInsts.insert(SD);
21909 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21910 }
21911 continue;
21912 }
21913 for (ScheduleBundle *Bundle : Bundles) {
21914 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21915 continue;
21916 assert(isInSchedulingRegion(*Bundle) &&
21917 "ScheduleData not in scheduling region");
21918 for_each(Bundle->getBundle(), ProcessNode);
21919 }
21920 if (InsertInReadyList && SD->isReady()) {
21921 for (ScheduleBundle *Bundle : Bundles) {
21922 assert(isInSchedulingRegion(*Bundle) &&
21923 "ScheduleData not in scheduling region");
21924 if (!Bundle->isReady())
21925 continue;
21926 ReadyInsts.insert(Bundle);
21927 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21928 << "\n");
21929 }
21930 }
21931 }
21932}
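// A condensed sketch of the windowed memory-dependence scan above
// (hypothetical node type; the real code also maintains scheduled state and a
// ready list). Within MaxDist of the source only genuinely aliasing pairs that
// involve a write get edges; from MaxDist onward every node gets an edge, so
// transitivity makes stopping at 2 * MaxDist safe.
struct MemNodeSketch {
  MemNodeSketch *Next = nullptr;
  bool MayWrite = false;
};
static void addMemDepsSketch(const MemNodeSketch &Src, unsigned MaxDist,
                             function_ref<bool(const MemNodeSketch &)> Aliases,
                             function_ref<void(MemNodeSketch &)> AddDep) {
  unsigned Dist = 1;
  for (MemNodeSketch *Dep = Src.Next; Dep; Dep = Dep->Next) {
    if (Dist >= MaxDist ||
        ((Src.MayWrite || Dep->MayWrite) && Aliases(*Dep)))
      AddDep(*Dep); // Conservative past the window, precise inside it.
    if (Dist >= 2 * MaxDist)
      break; // Earlier edges cover the rest transitively.
    ++Dist;
  }
}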
21933
21934void BoUpSLP::BlockScheduling::resetSchedule() {
21935 assert(ScheduleStart &&
21936 "tried to reset schedule on block which has not been scheduled");
21937 for_each(ScheduleDataMap, [&](auto &P) {
21938 if (BB != P.first->getParent())
21939 return;
21940 ScheduleData *SD = P.second;
21941 if (isInSchedulingRegion(*SD)) {
21942 SD->setScheduled(/*Scheduled=*/false);
21943 SD->resetUnscheduledDeps();
21944 }
21945 });
21946 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21947 for_each(P.second, [&](ScheduleCopyableData *SD) {
21948 if (isInSchedulingRegion(*SD)) {
21949 SD->setScheduled(/*Scheduled=*/false);
21950 SD->resetUnscheduledDeps();
21951 }
21952 });
21953 });
21954 for_each(ScheduledBundles, [&](auto &P) {
21955 for_each(P.second, [&](ScheduleBundle *Bundle) {
21956 if (isInSchedulingRegion(*Bundle))
21957 Bundle->setScheduled(/*Scheduled=*/false);
21958 });
21959 });
21960 // Reset schedule data for copyable elements.
21961 for (auto &P : ScheduleCopyableDataMap) {
21962 if (isInSchedulingRegion(*P.second)) {
21963 P.second->setScheduled(/*Scheduled=*/false);
21964 P.second->resetUnscheduledDeps();
21965 }
21966 }
21967 ReadyInsts.clear();
21968}
21969
21970void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21971 if (!BS->ScheduleStart)
21972 return;
21973
21974 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21975
21976 // A key point - if we got here, pre-scheduling was able to find a valid
21977 // scheduling of the sub-graph of the scheduling window which consists
21978 // of all vector bundles and their transitive users. As such, we do not
21979 // need to reschedule anything *outside of* that subgraph.
21980
21981 BS->resetSchedule();
21982
21983 // For the real scheduling we use a more sophisticated ready-list: it is
21984 // sorted by the original instruction location. This lets the final schedule
21985 // be as close as possible to the original instruction order.
21986 // WARNING: If changing this order causes a correctness issue, that means
21987 // there is some missing dependence edge in the schedule data graph.
21988 struct ScheduleDataCompare {
21989 bool operator()(const ScheduleEntity *SD1,
21990 const ScheduleEntity *SD2) const {
21991 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21992 }
21993 };
21994 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21995
21996 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21997 // and fill the ready-list with initial instructions.
21998 int Idx = 0;
21999 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22000 I = I->getNextNode()) {
22001 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22002 if (!Bundles.empty()) {
22003 for (ScheduleBundle *Bundle : Bundles) {
22004 Bundle->setSchedulingPriority(Idx++);
22005 if (!Bundle->hasValidDependencies())
22006 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
22007 }
22008 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
22009 for (ScheduleCopyableData *SD : reverse(SDs)) {
22010 ScheduleBundle &Bundle = SD->getBundle();
22011 Bundle.setSchedulingPriority(Idx++);
22012 if (!Bundle.hasValidDependencies())
22013 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22014 }
22015 continue;
22016 }
22017 SmallVector<ScheduleCopyableData *> CopyableData =
22018 BS->getScheduleCopyableDataUsers(I);
22019 if (ScheduleData *SD = BS->getScheduleData(I)) {
22020 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
22021 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
22022 SDTEs.front()->doesNotNeedToSchedule() ||
22023 SDTEs.front()->isCopyableElement(I)) &&
22024 "scheduler and vectorizer bundle mismatch");
22025 SD->setSchedulingPriority(Idx++);
22026 if (!SD->hasValidDependencies() &&
22027 (!CopyableData.empty() ||
22028 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
22029 assert(TE->isGather() && "expected gather node");
22030 return TE->hasState() && TE->hasCopyableElements() &&
22031 TE->isCopyableElement(I);
22032 }))) {
22033 // Need to calculate deps for these nodes to correctly handle copyable
22034 // dependencies, even if they were cancelled.
22035 // If the copyables bundle was cancelled, the deps were cleared and need
22036 // to be recalculated.
22037 ScheduleBundle Bundle;
22038 Bundle.add(SD);
22039 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22040 }
22041 }
22042 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
22043 ScheduleBundle &Bundle = SD->getBundle();
22044 Bundle.setSchedulingPriority(Idx++);
22045 if (!Bundle.hasValidDependencies())
22046 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
22047 }
22048 }
22049 BS->initialFillReadyList(ReadyInsts);
22050
22051 Instruction *LastScheduledInst = BS->ScheduleEnd;
22052
22053 // Do the "real" scheduling.
22054 SmallPtrSet<Instruction *, 16> Scheduled;
22055 while (!ReadyInsts.empty()) {
22056 auto *Picked = *ReadyInsts.begin();
22057 ReadyInsts.erase(ReadyInsts.begin());
22058
22059 // Move the scheduled instruction(s) to their dedicated places, if not
22060 // there yet.
22061 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
22062 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
22063 Instruction *PickedInst = BundleMember->getInst();
22064 // If a copyable must be scheduled as part of something else, skip it.
22065 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
22066 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
22067 (!IsCopyable && !Scheduled.insert(PickedInst).second))
22068 continue;
22069 if (PickedInst->getNextNode() != LastScheduledInst)
22070 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22071 LastScheduledInst = PickedInst;
22072 }
22073 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
22074 LastScheduledInst);
22075 } else {
22076 auto *SD = cast<ScheduleData>(Picked);
22077 Instruction *PickedInst = SD->getInst();
22078 if (PickedInst->getNextNode() != LastScheduledInst)
22079 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
22080 LastScheduledInst = PickedInst;
22081 }
22082 auto Invalid = InstructionsState::invalid();
22083 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
22084 }
22085
22086 // Check that we didn't break any of our invariants.
22087#ifdef EXPENSIVE_CHECKS
22088 BS->verify();
22089#endif
22090
22091#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
22092 // Check that all schedulable entities got scheduled
22093 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22094 I = I->getNextNode()) {
22095 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22096 assert(all_of(Bundles,
22097 [](const ScheduleBundle *Bundle) {
22098 return Bundle->isScheduled();
22099 }) &&
22100 "must be scheduled at this point");
22101 }
22102#endif
22103
22104 // Avoid duplicate scheduling of the block.
22105 BS->ScheduleStart = nullptr;
22106}
22107
22108unsigned BoUpSLP::getVectorElementSize(Value *V) {
22109 // If V is a store, just return the width of the stored value (or value
22110 // truncated just before storing) without traversing the expression tree.
22111 // This is the common case.
22112 if (auto *Store = dyn_cast<StoreInst>(V))
22113 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
22114
22115 if (auto *IEI = dyn_cast<InsertElementInst>(V))
22116 return getVectorElementSize(IEI->getOperand(1));
22117
22118 auto E = InstrElementSize.find(V);
22119 if (E != InstrElementSize.end())
22120 return E->second;
22121
22122 // If V is not a store, we can traverse the expression tree to find loads
22123 // that feed it. The type of the loaded value may indicate a more suitable
22124 // width than V's type. We want to base the vector element size on the width
22125 // of memory operations where possible.
22126 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
22127 SmallPtrSet<Instruction *, 16> Visited;
22128 if (auto *I = dyn_cast<Instruction>(V)) {
22129 Worklist.emplace_back(I, I->getParent(), 0);
22130 Visited.insert(I);
22131 }
22132
22133 // Traverse the expression tree in bottom-up order looking for loads. If we
22134 // encounter an instruction we don't yet handle, we give up.
22135 auto Width = 0u;
22136 Value *FirstNonBool = nullptr;
22137 while (!Worklist.empty()) {
22138 auto [I, Parent, Level] = Worklist.pop_back_val();
22139
22140 // We should only be looking at scalar instructions here. If the current
22141 // instruction has a vector type, skip.
22142 auto *Ty = I->getType();
22143 if (isa<VectorType>(Ty))
22144 continue;
22145 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
22146 FirstNonBool = I;
22147 if (Level > RecursionMaxDepth)
22148 continue;
22149
22150 // If the current instruction is a load, update MaxWidth to reflect the
22151 // width of the loaded value.
22152 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
22153 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
22154
22155 // Otherwise, we need to visit the operands of the instruction. We only
22156 // handle the interesting cases from buildTree here. If an operand is an
22157 // instruction we haven't yet visited and from the same basic block as the
22158 // user or the use is a PHI node, we add it to the worklist.
22159 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
22160 BinaryOperator, UnaryOperator>(I)) {
22161 for (Use &U : I->operands()) {
22162 if (auto *J = dyn_cast<Instruction>(U.get()))
22163 if (Visited.insert(J).second &&
22164 (isa<PHINode>(I) || J->getParent() == Parent)) {
22165 Worklist.emplace_back(J, J->getParent(), Level + 1);
22166 continue;
22167 }
22168 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
22169 FirstNonBool = U.get();
22170 }
22171 } else {
22172 break;
22173 }
22174 }
22175
22176 // If we didn't encounter a memory access in the expression tree, or if we
22177 // gave up for some reason, just return the width of V. Otherwise, return the
22178 // maximum width we found.
22179 if (!Width) {
22180 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
22181 V = FirstNonBool;
22182 Width = DL->getTypeSizeInBits(V->getType());
22183 }
22184
22185 for (Instruction *I : Visited)
22186 InstrElementSize[I] = Width;
22187
22188 return Width;
22189}
22190
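// Walks the tree entry E and its operands bottom-up, recording in ToDemote
// the entries whose scalars can be computed in a narrower type. Illustrative
// IR (not from this file):
//   %l = load i8, ptr %p
//   %z = zext i8 %l to i32
//   %a = add i32 %z, 3
//   %t = trunc i32 %a to i8
// Here the add needs only its low 8 bits, so its entry can be demoted and the
// zext/trunc pair shrunk away.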
22191bool BoUpSLP::collectValuesToDemote(
22192 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
22193 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
22194 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
22195 bool &IsProfitableToDemote, bool IsTruncRoot) const {
22196 // We can always demote constants.
22197 if (all_of(E.Scalars, IsaPred<Constant>))
22198 return true;
22199
22200 unsigned OrigBitWidth =
22201 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
22202 if (OrigBitWidth == BitWidth) {
22203 MaxDepthLevel = 1;
22204 return true;
22205 }
22206
22207 // Check if the node was analyzed already and must keep its original bitwidth.
22208 if (NodesToKeepBWs.contains(E.Idx))
22209 return false;
22210
22211 // If the value is not a vectorized instruction in the expression, is not
22212 // used by an insertelement instruction, and is not used in multiple vector
22213 // nodes, it cannot be demoted.
22214 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
22215 if (isa<PoisonValue>(R))
22216 return false;
22217 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22218 });
22219 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
22220 if (isa<PoisonValue>(V))
22221 return true;
22222 if (getTreeEntries(V).size() > 1)
22223 return false;
22224 // For the last shuffle of sext/zext with many uses, we need to check the
22225 // extra bit for unsigned values; otherwise we may have incorrect casting
22226 // for reused scalars.
22227 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
22228 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22229 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22230 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22231 return true;
22232 }
22233 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22234 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22235 if (IsSignedNode)
22236 ++BitWidth1;
22237 if (auto *I = dyn_cast<Instruction>(V)) {
22238 APInt Mask = DB->getDemandedBits(I);
22239 unsigned BitWidth2 =
22240 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22241 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22242 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22243 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22244 break;
22245 BitWidth2 *= 2;
22246 }
22247 BitWidth1 = std::min(BitWidth1, BitWidth2);
22248 }
22249 BitWidth = std::max(BitWidth, BitWidth1);
22250 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22251 };
22252 auto FinalAnalysis = [&, TTI = TTI]() {
22253 if (!IsProfitableToDemote)
22254 return false;
22255 bool Res = all_of(
22256 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22257 // Demote gathers.
22258 if (Res && E.isGather()) {
22259 if (E.hasState()) {
22260 if (const TreeEntry *SameTE =
22261 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22262 SameTE)
22263 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22264 ToDemote, Visited, NodesToKeepBWs,
22265 MaxDepthLevel, IsProfitableToDemote,
22266 IsTruncRoot)) {
22267 ToDemote.push_back(E.Idx);
22268 return true;
22269 }
22270 }
22271 // Check the bases of possible extractelement instructions and the final
22272 // vector length.
22273 SmallPtrSet<Value *, 4> UniqueBases;
22274 for (Value *V : E.Scalars) {
22275 auto *EE = dyn_cast<ExtractElementInst>(V);
22276 if (!EE)
22277 continue;
22278 UniqueBases.insert(EE->getVectorOperand());
22279 }
22280 const unsigned VF = E.Scalars.size();
22281 Type *OrigScalarTy = E.Scalars.front()->getType();
22282 if (UniqueBases.size() <= 2 ||
22283 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22284 ::getNumberOfParts(
22285 *TTI,
22286 getWidenedType(
22287 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22288 VF))) {
22289 ToDemote.push_back(E.Idx);
22290 return true;
22291 }
22292 }
22293 return Res;
22294 };
22295 if (E.isGather() || !Visited.insert(&E).second ||
22296 any_of(E.Scalars, [&](Value *V) {
22297 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22298 return isa<InsertElementInst>(U) && !isVectorized(U);
22299 });
22300 }))
22301 return FinalAnalysis();
22302
22303 if (any_of(E.Scalars, [&](Value *V) {
22304 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22305 return isVectorized(U) ||
22306 (E.Idx == 0 && UserIgnoreList &&
22307 UserIgnoreList->contains(U)) ||
22308 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22309 !U->getType()->isScalableTy() &&
22310 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22311 }) && !IsPotentiallyTruncated(V, BitWidth);
22312 }))
22313 return false;
22314
22315 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22316 bool &NeedToExit) {
22317 NeedToExit = false;
22318 unsigned InitLevel = MaxDepthLevel;
22319 for (const TreeEntry *Op : Operands) {
22320 unsigned Level = InitLevel;
22321 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22322 ToDemote, Visited, NodesToKeepBWs, Level,
22323 IsProfitableToDemote, IsTruncRoot)) {
22324 if (!IsProfitableToDemote)
22325 return false;
22326 NeedToExit = true;
22327 if (!FinalAnalysis())
22328 return false;
22329 continue;
22330 }
22331 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22332 }
22333 return true;
22334 };
22335 auto AttemptCheckBitwidth =
22336 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22337 // Try all bitwidths < OrigBitWidth.
22338 NeedToExit = false;
22339 unsigned BestFailBitwidth = 0;
22340 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22341 if (Checker(BitWidth, OrigBitWidth))
22342 return true;
22343 if (BestFailBitwidth == 0 && FinalAnalysis())
22344 BestFailBitwidth = BitWidth;
22345 }
22346 if (BitWidth >= OrigBitWidth) {
22347 if (BestFailBitwidth == 0) {
22348 BitWidth = OrigBitWidth;
22349 return false;
22350 }
22351 MaxDepthLevel = 1;
22352 BitWidth = BestFailBitwidth;
22353 NeedToExit = true;
22354 return true;
22355 }
22356 return false;
22357 };
22358 auto TryProcessInstruction =
22359 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22360 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22361 if (Operands.empty()) {
22362 if (!IsTruncRoot)
22363 MaxDepthLevel = 1;
22364 for (Value *V : E.Scalars)
22365 (void)IsPotentiallyTruncated(V, BitWidth);
22366 } else {
22367 // Several vectorized uses? Check if the value can be truncated;
22368 // otherwise, exit.
22369 if (any_of(E.Scalars, [&](Value *V) {
22370 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22371 }))
22372 return false;
22373 bool NeedToExit = false;
22374 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22375 return false;
22376 if (NeedToExit)
22377 return true;
22378 if (!ProcessOperands(Operands, NeedToExit))
22379 return false;
22380 if (NeedToExit)
22381 return true;
22382 }
22383
22384 ++MaxDepthLevel;
22385 // Record the entry that we can demote.
22386 ToDemote.push_back(E.Idx);
22387 return IsProfitableToDemote;
22388 };
22389
22390 if (E.State == TreeEntry::SplitVectorize)
22391 return TryProcessInstruction(
22392 BitWidth,
22393 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22394 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22395
22396 if (E.isAltShuffle()) {
22397 // Combining these opcodes may lead to incorrect analysis, skip for now.
22398 auto IsDangerousOpcode = [](unsigned Opcode) {
22399 switch (Opcode) {
22400 case Instruction::Shl:
22401 case Instruction::AShr:
22402 case Instruction::LShr:
22403 case Instruction::UDiv:
22404 case Instruction::SDiv:
22405 case Instruction::URem:
22406 case Instruction::SRem:
22407 return true;
22408 default:
22409 break;
22410 }
22411 return false;
22412 };
22413 if (IsDangerousOpcode(E.getAltOpcode()))
22414 return FinalAnalysis();
22415 }
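 // (The opcodes rejected above are those whose results depend on operand
 // bits beyond the demoted width; the per-opcode checkers in the switch
 // below only inspect the main opcode, so a dangerous alternate opcode
 // cannot be verified and demotion would be unsound.)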
22416
22417 switch (E.getOpcode()) {
22418
22419 // We can always demote truncations and extensions. Since truncations can
22420 // seed additional demotion, we save the truncated value.
22421 case Instruction::Trunc:
22422 if (IsProfitableToDemoteRoot)
22423 IsProfitableToDemote = true;
22424 return TryProcessInstruction(BitWidth);
22425 case Instruction::ZExt:
22426 case Instruction::SExt:
22427 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22428 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22429 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22430 return false;
22431 IsProfitableToDemote = true;
22432 return TryProcessInstruction(BitWidth);
22433
22434 // We can demote certain binary operations if we can demote both of their
22435 // operands.
22436 case Instruction::Add:
22437 case Instruction::Sub:
22438 case Instruction::Mul:
22439 case Instruction::And:
22440 case Instruction::Or:
22441 case Instruction::Xor: {
22442 return TryProcessInstruction(
22443 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22444 }
22445 case Instruction::Freeze:
22446 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22447 case Instruction::Shl: {
22448 // If we are truncating the result of this SHL, and if it's a shift of an
22449 // in-range amount, we can always perform a SHL in a smaller type.
22450 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22451 return all_of(E.Scalars, [&](Value *V) {
22452 if (isa<PoisonValue>(V))
22453 return true;
22454 if (E.isCopyableElement(V))
22455 return true;
22456 auto *I = cast<Instruction>(V);
22457 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22458 return AmtKnownBits.getMaxValue().ult(BitWidth);
22459 });
22460 };
22461 return TryProcessInstruction(
22462 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22463 }
22464 case Instruction::LShr: {
22465 // If this is a truncate of a logical shr, we can truncate it to a smaller
22466 // lshr iff we know that the bits we would otherwise be shifting in are
22467 // already zeros.
22468 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22469 return all_of(E.Scalars, [&](Value *V) {
22470 if (isa<PoisonValue>(V))
22471 return true;
22472 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22473 if (E.isCopyableElement(V))
22474 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22475 auto *I = cast<Instruction>(V);
22476 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22477 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22478 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22479 SimplifyQuery(*DL));
22480 });
22481 };
22482 return TryProcessInstruction(
22483 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22484 LShrChecker);
22485 }
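 // For the lshr case above, e.g. truncating %r = lshr i64 %x, 8 to i32 is
 // accepted only when the shift amount is provably < 32 and bits 32..63 of
 // %x are known zero (illustrative values).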
22486 case Instruction::AShr: {
22487 // If this is a truncate of an arithmetic shr, we can truncate it to a
22488 // smaller ashr iff we know that all the bits from the sign bit of the
22489 // original type and the sign bit of the truncate type are the same.
22490 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22491 return all_of(E.Scalars, [&](Value *V) {
22492 if (isa<PoisonValue>(V))
22493 return true;
22494 auto *I = cast<Instruction>(V);
22495 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22496 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22497 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22498 ShiftedBits <
22499 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22500 });
22501 };
22502 return TryProcessInstruction(
22503 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22504 AShrChecker);
22505 }
22506 case Instruction::UDiv:
22507 case Instruction::URem: {
22508 // UDiv and URem can be truncated if all the truncated bits are zero.
22509 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22510 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22511 return all_of(E.Scalars, [&](Value *V) {
22512 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22513 if (E.hasCopyableElements() && E.isCopyableElement(V))
22514 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22515 auto *I = cast<Instruction>(V);
22516 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22517 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22518 });
22519 };
22520 return TryProcessInstruction(
22521 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22522 }
22523
22524 // We can demote selects if we can demote their true and false values.
22525 case Instruction::Select: {
22526 return TryProcessInstruction(
22527 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22528 }
22529
22530 // We can demote phis if we can demote all their incoming operands.
22531 case Instruction::PHI: {
22532 const unsigned NumOps = E.getNumOperands();
22533 SmallVector<const TreeEntry *> Ops(NumOps);
22534 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22535 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22536
22537 return TryProcessInstruction(BitWidth, Ops);
22538 }
22539
22540 case Instruction::Call: {
22541 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22542 if (!IC)
22543 break;
22544 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22545 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22546 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22547 break;
22548 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22549 function_ref<bool(unsigned, unsigned)> CallChecker;
22550 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22551 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22552 return all_of(E.Scalars, [&](Value *V) {
22553 auto *I = cast<Instruction>(V);
22554 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22555 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22556 return MaskedValueIsZero(I->getOperand(0), Mask,
22557 SimplifyQuery(*DL)) &&
22558 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22559 }
22560 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22561 "Expected min/max intrinsics only.");
22562 unsigned SignBits = OrigBitWidth - BitWidth;
22563 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22564 unsigned Op0SignBits =
22565 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22566 unsigned Op1SignBits =
22567 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22568 return SignBits <= Op0SignBits &&
22569 ((SignBits != Op0SignBits &&
22570 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22571 MaskedValueIsZero(I->getOperand(0), Mask,
22572 SimplifyQuery(*DL))) &&
22573 SignBits <= Op1SignBits &&
22574 ((SignBits != Op1SignBits &&
22575 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22576 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22577 });
22578 };
22579 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22580 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22581 return all_of(E.Scalars, [&](Value *V) {
22582 auto *I = cast<Instruction>(V);
22583 unsigned SignBits = OrigBitWidth - BitWidth;
22584 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22585 unsigned Op0SignBits =
22586 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22587 return SignBits <= Op0SignBits &&
22588 ((SignBits != Op0SignBits &&
22589 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22590 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22591 });
22592 };
22593 if (ID != Intrinsic::abs) {
22594 Operands.push_back(getOperandEntry(&E, 1));
22595 CallChecker = CompChecker;
22596 } else {
22597 CallChecker = AbsChecker;
22598 }
22599 InstructionCost BestCost =
22600 std::numeric_limits<InstructionCost::CostType>::max();
22601 unsigned BestBitWidth = BitWidth;
22602 unsigned VF = E.Scalars.size();
22603 // Choose the best bitwidth based on cost estimations.
22604 auto Checker = [&](unsigned BitWidth, unsigned) {
22605 unsigned MinBW = PowerOf2Ceil(BitWidth);
22606 SmallVector<Type *> ArgTys =
22607 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22608 auto VecCallCosts = getVectorCallCosts(
22609 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22610 TTI, TLI, ArgTys);
22611 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22612 if (Cost < BestCost) {
22613 BestCost = Cost;
22614 BestBitWidth = BitWidth;
22615 }
22616 return false;
22617 };
22618 [[maybe_unused]] bool NeedToExit;
22619 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22620 BitWidth = BestBitWidth;
22621 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22622 }
22623
22624 // Otherwise, conservatively give up.
22625 default:
22626 break;
22627 }
22628 MaxDepthLevel = 1;
22629 return FinalAnalysis();
22630}
22631
22632static RecurKind getRdxKind(Value *V);
22633
22634void BoUpSLP::computeMinimumValueSizes() {
22635 // We only attempt to truncate integer expressions.
22636 bool IsStoreOrInsertElt =
22637 VectorizableTree.front()->hasState() &&
22638 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22639 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22640 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22641 ExtraBitWidthNodes.size() <= 1 &&
22642 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22643 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22644 return;
22645
22646 unsigned NodeIdx = 0;
22647 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22648 NodeIdx = 1;
22649
22650 // Ensure the roots of the vectorizable tree don't form a cycle.
22651 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22652 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22653 "Unexpected tree is graph.");
22654
22655 // If the first value node for a store/insertelement is a sext/zext/trunc,
22656 // skip it and resize to the final type.
22657 bool IsTruncRoot = false;
22658 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22659 SmallVector<unsigned> RootDemotes;
22660 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22661 if (NodeIdx != 0 &&
22662 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22663 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22664 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22665 IsTruncRoot = true;
22666 RootDemotes.push_back(NodeIdx);
22667 IsProfitableToDemoteRoot = true;
22668 ++NodeIdx;
22669 }
22670
22671 // The reduction was analyzed already and found unprofitable - exit.
22672 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22673 return;
22674
22675 SmallVector<unsigned> ToDemote;
22676 auto ComputeMaxBitWidth =
22677 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22678 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22679 ToDemote.clear();
22680 // If the root is a trunc and the next node is a gather/buildvector, keep
22681 // the trunc in scalars, which is free in most cases.
22682 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22683 !NodesToKeepBWs.contains(E.Idx) &&
22684 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22685 all_of(E.Scalars, [&](Value *V) {
22686 return V->hasOneUse() || isa<Constant>(V) ||
22687 (!V->hasNUsesOrMore(UsesLimit) &&
22688 none_of(V->users(), [&](User *U) {
22689 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22690 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22691 if (TEs.empty() || is_contained(TEs, UserTE))
22692 return false;
22693 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22694 SelectInst>(U) ||
22695 isa<SIToFPInst, UIToFPInst>(U) ||
22696 (UserTE->hasState() &&
22697 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22698 SelectInst>(UserTE->getMainOp()) ||
22699 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22700 return true;
22701 unsigned UserTESz = DL->getTypeSizeInBits(
22702 UserTE->Scalars.front()->getType());
22703 if (all_of(TEs, [&](const TreeEntry *TE) {
22704 auto It = MinBWs.find(TE);
22705 return It != MinBWs.end() &&
22706 It->second.first > UserTESz;
22707 }))
22708 return true;
22709 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22710 }));
22711 })) {
22712 ToDemote.push_back(E.Idx);
22713 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22714 auto It = MinBWs.find(UserTE);
22715 if (It != MinBWs.end())
22716 return It->second.first;
22717 unsigned MaxBitWidth =
22718 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22719 MaxBitWidth = bit_ceil(MaxBitWidth);
22720 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22721 MaxBitWidth = 8;
22722 return MaxBitWidth;
22723 }
22724
22725 if (!E.hasState())
22726 return 0u;
22727
22728 unsigned VF = E.getVectorFactor();
22729 Type *ScalarTy = E.Scalars.front()->getType();
22730 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22731 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22732 if (!TreeRootIT)
22733 return 0u;
22734
22735 if (any_of(E.Scalars,
22736 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22737 return 0u;
22738
22739 unsigned NumParts = ::getNumberOfParts(
22740 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22741
22742 // The maximum bit width required to represent all the values that can be
22743 // demoted without loss of precision. It would be safe to truncate the roots
22744 // of the expression to this width.
22745 unsigned MaxBitWidth = 1u;
22746
22747 // True if the roots can be zero-extended back to their original type,
22748 // rather than sign-extended. We know that if the leading bits are not
22749 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22750 // True.
22751 // Determine if the sign bit of all the roots is known to be zero. If not,
22752 // IsKnownPositive is set to False.
22753 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22754 if (isa<PoisonValue>(R))
22755 return true;
22756 KnownBits Known = computeKnownBits(R, *DL);
22757 return Known.isNonNegative();
22758 });
22759
22760 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22761 E.UserTreeIndex.UserTE->hasState() &&
22762 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22763 MaxBitWidth =
22764 std::min(DL->getTypeSizeInBits(
22765 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22766 DL->getTypeSizeInBits(ScalarTy));
22767
22768 // We first check if all the bits of the roots are demanded. If they're not,
22769 // we can truncate the roots to this narrower type.
22770 for (Value *Root : E.Scalars) {
22771 if (isa<PoisonValue>(Root))
22772 continue;
22773 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22774 TypeSize NumTypeBits =
22775 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22776 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22777 // If we can't prove that the sign bit is zero, we must add one to the
22778 // maximum bit width to account for the unknown sign bit. This preserves
22779 // the existing sign bit so we can safely sign-extend the root back to the
22780 // original type. Otherwise, if we know the sign bit is zero, we will
22781 // zero-extend the root instead.
22782 //
22783 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22784 // one to the maximum bit width will yield a larger-than-necessary
22785 // type. In general, we need to add an extra bit only if we can't
22786 // prove that the upper bit of the original type is equal to the
22787 // upper bit of the proposed smaller type. If these two bits are
22788 // the same (either zero or one) we know that sign-extending from
22789 // the smaller type will result in the same value. Here, since we
22790 // can't yet prove this, we are just making the proposed smaller
22791 // type larger to ensure correctness.
22792 if (!IsKnownPositive)
22793 ++BitWidth1;
22794
22795 auto *I = dyn_cast<Instruction>(Root);
22796 if (!I) {
22797 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22798 continue;
22799 }
22800 APInt Mask = DB->getDemandedBits(I);
22801 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22802 MaxBitWidth =
22803 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22804 }
22805
22806 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22807 MaxBitWidth = 8;
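 // Widths in the range (1, 8) are rounded up to a full byte here; vector
 // element types narrower than i8 (other than i1) are rarely legal or
 // profitable on real targets.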
22808
22809 // If the original type is large but the reduced type does not improve
22810 // register usage - ignore it.
22811 if (NumParts > 1 &&
22812 NumParts ==
22813 ::getNumberOfParts(
22814 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22815 bit_ceil(MaxBitWidth)),
22816 VF)))
22817 return 0u;
22818
22819 unsigned Opcode = E.getOpcode();
22820 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22821 Opcode == Instruction::SExt ||
22822 Opcode == Instruction::ZExt || NumParts > 1;
22823 // Conservatively determine if we can actually truncate the roots of the
22824 // expression. Collect the values that can be demoted in ToDemote and
22825 // additional roots that require investigating.
22826 DenseSet<const TreeEntry *> Visited;
22827 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22828 bool NeedToDemote = IsProfitableToDemote;
22829
22830 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22831 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22832 NeedToDemote, IsTruncRoot) ||
22833 (MaxDepthLevel <= Limit &&
22834 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22835 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22836 DL->getTypeSizeInBits(TreeRootIT) /
22837 DL->getTypeSizeInBits(
22838 E.getMainOp()->getOperand(0)->getType()) >
22839 2)))))
22840 return 0u;
22841 // Round MaxBitWidth up to the next power-of-two.
22842 MaxBitWidth = bit_ceil(MaxBitWidth);
22843
22844 return MaxBitWidth;
22845 };
22846
22847 // If we can truncate the root, we must collect additional values that might
22848 // be demoted as a result. That is, those seeded by truncations we will
22849 // modify.
22850 // Add reduction ops sizes, if any.
22851 if (UserIgnoreList &&
22852 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22853 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22854 // x i1> to iN)).
22855 if (all_of(*UserIgnoreList,
22856 [](Value *V) {
22857 return isa<PoisonValue>(V) ||
22858 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22859 }) &&
22860 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22861 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22862 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22863 Builder.getInt1Ty()) {
22864 ReductionBitWidth = 1;
22865 } else {
22866 for (Value *V : *UserIgnoreList) {
22867 if (isa<PoisonValue>(V))
22868 continue;
22869 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22870 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22871 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22872 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22873 ++BitWidth1;
22874 unsigned BitWidth2 = BitWidth1;
22875 if (!match(V, m_CombineOr(m_SMin(m_Value(), m_Value()), m_SMax(m_Value(), m_Value())))) {
22876 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22877 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22878 }
22879 ReductionBitWidth =
22880 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22881 }
22882 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22883 ReductionBitWidth = 8;
22884
22885 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22886 }
22887 }
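 // Worked example: if every reduced i32 value has 23 known sign bits (9
 // value bits) but may be negative, BitWidth1 = 32 - 23 = 9, plus one for
 // the possible sign bit = 10, and the reduction width is rounded up to
 // bit_ceil(10) = 16.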
22888 bool IsTopRoot = NodeIdx == 0;
22889 while (NodeIdx < VectorizableTree.size() &&
22890 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22891 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22892 RootDemotes.push_back(NodeIdx);
22893 ++NodeIdx;
22894 IsTruncRoot = true;
22895 }
22896 bool IsSignedCmp = false;
22897 if (UserIgnoreList &&
22898 all_of(*UserIgnoreList,
22899 match_fn(m_CombineOr(m_SMin(m_Value(), m_Value()),
22900 m_SMax(m_Value(), m_Value())))))
22901 IsSignedCmp = true;
22902 while (NodeIdx < VectorizableTree.size()) {
22903 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22904 unsigned Limit = 2;
22905 if (IsTopRoot &&
22906 ReductionBitWidth ==
22907 DL->getTypeSizeInBits(
22908 VectorizableTree.front()->Scalars.front()->getType()))
22909 Limit = 3;
22910 unsigned MaxBitWidth = ComputeMaxBitWidth(
22911 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22912 IsTruncRoot, IsSignedCmp);
22913 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22914 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22915 ReductionBitWidth = bit_ceil(MaxBitWidth);
22916 else if (MaxBitWidth == 0)
22917 ReductionBitWidth = 0;
22918 }
22919
22920 for (unsigned Idx : RootDemotes) {
22921 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22922 uint32_t OrigBitWidth =
22923 DL->getTypeSizeInBits(V->getType()->getScalarType());
22924 if (OrigBitWidth > MaxBitWidth) {
22925 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22926 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22927 }
22928 return false;
22929 }))
22930 ToDemote.push_back(Idx);
22931 }
22932 RootDemotes.clear();
22933 IsTopRoot = false;
22934 IsProfitableToDemoteRoot = true;
22935
22936 if (ExtraBitWidthNodes.empty()) {
22937 NodeIdx = VectorizableTree.size();
22938 } else {
22939 unsigned NewIdx = 0;
22940 do {
22941 NewIdx = *ExtraBitWidthNodes.begin();
22942 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22943 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22944 NodeIdx = NewIdx;
22945 IsTruncRoot =
22946 NodeIdx < VectorizableTree.size() &&
22947 VectorizableTree[NodeIdx]->UserTreeIndex &&
22948 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22949 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22950 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22951 Instruction::Trunc &&
22952 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22953 IsSignedCmp =
22954 NodeIdx < VectorizableTree.size() &&
22955 VectorizableTree[NodeIdx]->UserTreeIndex &&
22956 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22957 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22958 Instruction::ICmp &&
22959 any_of(
22960 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22961 [&](Value *V) {
22962 auto *IC = dyn_cast<ICmpInst>(V);
22963 return IC && (IC->isSigned() ||
22964 !isKnownNonNegative(IC->getOperand(0),
22965 SimplifyQuery(*DL)) ||
22966 !isKnownNonNegative(IC->getOperand(1),
22967 SimplifyQuery(*DL)));
22968 });
22969 }
22970
22971 // If the maximum bit width we compute is less than the width of the roots'
22972 // type, we can proceed with the narrowing. Otherwise, do nothing.
22973 if (MaxBitWidth == 0 ||
22974 MaxBitWidth >=
22975 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22976 ->getBitWidth()) {
22977 if (UserIgnoreList)
22978 AnalyzedMinBWVals.insert_range(TreeRoot);
22979 NodesToKeepBWs.insert_range(ToDemote);
22980 continue;
22981 }
22982
22983 // Finally, map the values we can demote to the maximum bit width we
22984 // computed.
22985 for (unsigned Idx : ToDemote) {
22986 TreeEntry *TE = VectorizableTree[Idx].get();
22987 if (MinBWs.contains(TE))
22988 continue;
22989 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22990 if (isa<PoisonValue>(R))
22991 return false;
22992 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22993 });
22994 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22995 }
22996 }
22997}
22998
22999PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
23000 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
23001 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
23002 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
23003 auto *AA = &AM.getResult<AAManager>(F);
23004 auto *LI = &AM.getResult<LoopAnalysis>(F);
23005 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
23006 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
23007 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
23008 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
23009
23010 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
23011 if (!Changed)
23012 return PreservedAnalyses::all();
23013
23014 PreservedAnalyses PA;
23015 PA.preserveSet<CFGAnalyses>();
23016 return PA;
23017}
23018
23019bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
23020 TargetTransformInfo *TTI_,
23021 TargetLibraryInfo *TLI_, AAResults *AA_,
23022 LoopInfo *LI_, DominatorTree *DT_,
23023 AssumptionCache *AC_, DemandedBits *DB_,
23024 OptimizationRemarkEmitter *ORE_) {
23025 if (!RunSLPVectorization)
23026 return false;
23027 SE = SE_;
23028 TTI = TTI_;
23029 TLI = TLI_;
23030 AA = AA_;
23031 LI = LI_;
23032 DT = DT_;
23033 AC = AC_;
23034 DB = DB_;
23035 DL = &F.getDataLayout();
23036
23037 Stores.clear();
23038 GEPs.clear();
23039 bool Changed = false;
23040
23041 // If the target claims to have no vector registers don't attempt
23042 // vectorization.
23043 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
23044 LLVM_DEBUG(
23045 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
23046 return false;
23047 }
23048
23049 // Don't vectorize when the attribute NoImplicitFloat is used.
23050 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
23051 return false;
23052
23053 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
23054
23055 // Use the bottom up slp vectorizer to construct chains that start with
23056 // store instructions.
23057 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
23058
23059 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
23060 // delete instructions.
23061
23062 // Update DFS numbers now so that we can use them for ordering.
23063 DT->updateDFSNumbers();
23064
23065 // Scan the blocks in the function in post order.
23066 for (auto *BB : post_order(&F.getEntryBlock())) {
23067 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
23068 continue;
23069
23070 // Start new block - clear the list of reduction roots.
23071 R.clearReductionData();
23072 collectSeedInstructions(BB);
23073
23074 // Vectorize trees that end at stores.
23075 if (!Stores.empty()) {
23076 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
23077 << " underlying objects.\n");
23078 Changed |= vectorizeStoreChains(R);
23079 }
23080
23081 // Vectorize trees that end at reductions.
23082 Changed |= vectorizeChainsInBlock(BB, R);
23083
23084 // Vectorize the index computations of getelementptr instructions. This
23085 // is primarily intended to catch gather-like idioms ending at
23086 // non-consecutive loads.
23087 if (!GEPs.empty()) {
23088 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
23089 << " underlying objects.\n");
23090 Changed |= vectorizeGEPIndices(BB, R);
23091 }
23092 }
23093
23094 if (Changed) {
23095 R.optimizeGatherSequence();
23096 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
23097 }
23098 return Changed;
23099}
23100
23101std::optional<bool>
23102SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
23103 unsigned Idx, unsigned MinVF,
23104 unsigned &Size) {
23105 Size = 0;
23106 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
23107 << "\n");
23108 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23109 unsigned VF = Chain.size();
23110
23111 if (!has_single_bit(Sz) ||
23112 !hasFullVectorsOrPowerOf2(
23113 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
23114 VF) ||
23115 VF < 2 || VF < MinVF) {
23116 // Check if vectorizing with a non-power-of-2 VF should be considered. At
23117 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
23118 // all vector lanes are used.
23119 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
23120 return false;
23121 }
23122
23123 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
23124 << "\n");
23125
23126 SetVector<Value *> ValOps;
23127 for (Value *V : Chain)
23128 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
23129 // Exit if the operands are not the same/alternate opcodes or form a non-power-of-2 number of unique values.
23130 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
23131 InstructionsState S = Analysis.buildInstructionsState(
23132 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
23133 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
23134 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
23135 bool IsAllowedSize =
23136 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
23137 ValOps.size()) ||
23138 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
23139 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23140 (!S.getMainOp()->isSafeToRemove() ||
23141 any_of(ValOps.getArrayRef(),
23142 [&](Value *V) {
23143 return !isa<ExtractElementInst>(V) &&
23144 (V->getNumUses() > Chain.size() ||
23145 any_of(V->users(), [&](User *U) {
23146 return !Stores.contains(U);
23147 }));
23148 }))) ||
23149 (ValOps.size() > Chain.size() / 2 && !S)) {
23150 Size = (!IsAllowedSize && S) ? 1 : 2;
23151 return false;
23152 }
23153 }
23154 if (R.isLoadCombineCandidate(Chain))
23155 return true;
23156 R.buildTree(Chain);
23157 // Check if the tree is tiny and the store itself or its value is not vectorized.
23158 if (R.isTreeTinyAndNotFullyVectorizable()) {
23159 if (R.isGathered(Chain.front()) ||
23160 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
23161 return std::nullopt;
23162 Size = R.getCanonicalGraphSize();
23163 return false;
23164 }
23165 if (R.isProfitableToReorder()) {
23166 R.reorderTopToBottom();
23167 R.reorderBottomToTop();
23168 }
23169 R.transformNodes();
23170 R.buildExternalUses();
23171
23172 R.computeMinimumValueSizes();
23173
23174 Size = R.getCanonicalGraphSize();
23175 if (S && S.getOpcode() == Instruction::Load)
23176 Size = 2; // cut off masked gather small trees
23177 InstructionCost Cost = R.getTreeCost();
23178
23179 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
23180 if (Cost < -SLPCostThreshold) {
23181 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
23182
23183 using namespace ore;
23184
23185 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
23186 cast<StoreInst>(Chain[0]))
23187 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
23188 << " and with tree size "
23189 << NV("TreeSize", R.getTreeSize()));
23190
23191 R.vectorizeTree();
23192 return true;
23193 }
23194
23195 return false;
23196}
23197
23198/// Checks that the tree sizes are roughly uniform: the variance must satisfy Dev * 96 < Mean * Mean, i.e. a standard deviation below roughly 10% of the mean size.
23199static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
23200 bool First) {
23201 unsigned Num = 0;
23202 uint64_t Sum = std::accumulate(
23203 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23204 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23205 unsigned Size = First ? Val.first : Val.second;
23206 if (Size == 1)
23207 return V;
23208 ++Num;
23209 return V + Size;
23210 });
23211 if (Num == 0)
23212 return true;
23213 uint64_t Mean = Sum / Num;
23214 if (Mean == 0)
23215 return true;
23216 uint64_t Dev = std::accumulate(
23217 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
23218 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
23219 unsigned P = First ? Val.first : Val.second;
23220 if (P == 1)
23221 return V;
23222 return V + (P - Mean) * (P - Mean);
23223 }) /
23224 Num;
23225 return Dev * 96 / (Mean * Mean) == 0;
23226}
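// Worked example: sizes {4, 4, 4, 5} give Mean = 17 / 4 = 4 and
// Dev = (0 + 0 + 0 + 1) / 4 = 0 in integer arithmetic, so the check passes
// and the sizes are considered uniform; sizes {2, 8} give Mean = 5, Dev = 9,
// and 9 * 96 / 25 != 0, so they are not.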
23227
23228namespace {
23229
23230/// A group of stores that we'll try to bundle together using vector ops.
23231/// They are ordered using the signed distance of their address operand to the
23232/// address of this group's BaseInstr.
23233class RelatedStoreInsts {
23234public:
23235 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23236 : AllStores(AllStores) {
23237 reset(BaseInstrIdx);
23238 }
23239
23240 void reset(unsigned NewBaseInstr) {
23241 assert(NewBaseInstr < AllStores.size() &&
23242 "Instruction index out of bounds");
23243 BaseInstrIdx = NewBaseInstr;
23244 Instrs.clear();
23245 insertOrLookup(NewBaseInstr, 0);
23246 }
23247
23248 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23249 /// \p PtrDist.
23250 /// Does nothing if there is already a store with that \p PtrDist.
23251 /// \returns The previously associated Instruction index, or std::nullopt
23252 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23253 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23254 return Inserted ? std::nullopt : std::make_optional(It->second);
23255 }
23256
23257 using DistToInstMap = std::map<int64_t, unsigned>;
23258 const DistToInstMap &getStores() const { return Instrs; }
23259
23260 /// If \p SI is related to this group of stores, return the distance of its
23261 /// pointer operand to that of the group's BaseInstr.
23262 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23263 ScalarEvolution &SE) const {
23264 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23265 return getPointersDiff(
23266 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23267 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23268 /*StrictCheck=*/true);
23269 }
23270
23271 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23272 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23273 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23274 int64_t DistFromCurBase) {
23275 DistToInstMap PrevSet = std::move(Instrs);
23276 reset(NewBaseInstIdx);
23277
23278 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23279 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23280 // reference.
23281 for (auto [Dist, InstIdx] : PrevSet) {
23282 if (InstIdx >= MinSafeIdx)
23283 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23284 }
23285 }
23286
23287 /// Remove all stores that have been vectorized from this group.
23288 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23289 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23290 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23291 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23292 });
23293
23294 // Get a forward iterator pointing after the last vectorized store and erase
23295 // all stores before it so we don't try to vectorize them again.
23296 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23297 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23298 }
23299
23300private:
23301 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23302 unsigned BaseInstrIdx;
23303
23304 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23305 DistToInstMap Instrs;
23306
23307 /// Reference to all the stores in the BB being analyzed.
23308 ArrayRef<StoreInst *> AllStores;
23309};
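// Usage sketch: for stores to %p, %p+1 and %p+2 collected in order, the
// group based at the first store maps distances {0, 1, 2} to instruction
// indices; a later store to %p collides at distance 0, which triggers
// vectorization of the collected group (see FillStoresSet below).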
23310
23311} // end anonymous namespace
23312
23313bool SLPVectorizerPass::vectorizeStores(
23314 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23315 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23316 &Visited) {
23317 // We may run into multiple chains that merge into a single chain. We mark the
23318 // stores that we vectorized so that we don't visit the same store twice.
23319 BoUpSLP::ValueSet VectorizedStores;
23320 bool Changed = false;
23321
23322 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23323 int64_t PrevDist = -1;
23324 BoUpSLP::ValueList Operands;
23325 // Collect the chain into a list.
23326 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23327 auto &[Dist, InstIdx] = Data;
23328 if (Operands.empty() || Dist - PrevDist == 1) {
23329 Operands.push_back(Stores[InstIdx]);
23330 PrevDist = Dist;
23331 if (Idx != StoreSeq.size() - 1)
23332 continue;
23333 }
23334 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
23335 Operands.clear();
23336 Operands.push_back(Stores[InstIdx]);
23337 PrevDist = Dist;
23338 });
23339
23340 if (Operands.size() <= 1 ||
23341 !Visited
23342 .insert({Operands.front(),
23343 cast<StoreInst>(Operands.front())->getValueOperand(),
23344 Operands.back(),
23345 cast<StoreInst>(Operands.back())->getValueOperand(),
23346 Operands.size()})
23347 .second)
23348 continue;
23349
23350 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23351 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23352 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23353
23354 unsigned MaxVF =
23355 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23356 auto *Store = cast<StoreInst>(Operands[0]);
23357 Type *StoreTy = Store->getValueOperand()->getType();
23358 Type *ValueTy = StoreTy;
23359 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23360 ValueTy = Trunc->getSrcTy();
23361 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23362 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23363 // we need to use the element type of StoreTy and ValueTy to retrieve the
23364 // VF and then transform it back.
23365 // Remember: VF is defined as the number of values we want to vectorize, not the
23366 // number of elements in the final vector.
23367 Type *StoreScalarTy = StoreTy->getScalarType();
23368 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23369 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23370 ValueTy->getScalarType()));
23371 MinVF /= getNumElements(StoreTy);
23372 MinVF = std::max<unsigned>(2, MinVF);
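 // Illustrative numbers (assumed target): if StoreTy is <4 x i8> under
 // REVEC and the target's minimum store VF for i8 is 16, MinVF becomes
 // 16 / 4 = 4 stores, i.e. the final vector still covers 16 i8 lanes.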
23373
23374 if (MaxVF < MinVF) {
23375 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23376 << ") < "
23377 << "MinVF (" << MinVF << ")\n");
23378 continue;
23379 }
23380
23381 unsigned NonPowerOf2VF = 0;
23382 if (VectorizeNonPowerOf2) {
23383 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23384 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23385 // lanes are used.
23386 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23387 if (has_single_bit(CandVF + 1)) {
23388 NonPowerOf2VF = CandVF;
23389 assert(NonPowerOf2VF != MaxVF &&
23390 "Non-power-of-2 VF should not be equal to MaxVF");
23391 }
23392 }
23393
23394 // MaxRegVF represents the number of instructions (scalar, or vector in
23395 // case of revec) that can be vectorized to naturally fit in a vector
23396 // register.
23397 unsigned MaxRegVF = MaxVF;
23398
23399 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23400 if (MaxVF < MinVF) {
23401 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23402 << ") < "
23403 << "MinVF (" << MinVF << ")\n");
23404 continue;
23405 }
23406
23407 SmallVector<unsigned> CandidateVFs;
23408 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23409 VF = divideCeil(VF, 2))
23410 CandidateVFs.push_back(VF);
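 // E.g. with MinVF = 2, MaxVF = 16 and no non-power-of-2 candidate, the
 // VFs tried are 16, 8, 4 and 2.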
23411
23412 unsigned End = Operands.size();
23413 unsigned Repeat = 0;
23414 constexpr unsigned MaxAttempts = 4;
23415 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23416 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23417 P.first = P.second = 1;
23418 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23419 auto IsNotVectorized = [](bool First,
23420 const std::pair<unsigned, unsigned> &P) {
23421 return First ? P.first > 0 : P.second > 0;
23422 };
23423 auto IsVectorized = [](bool First,
23424 const std::pair<unsigned, unsigned> &P) {
23425 return First ? P.first == 0 : P.second == 0;
23426 };
23427 auto VFIsProfitable = [](bool First, unsigned Size,
23428 const std::pair<unsigned, unsigned> &P) {
23429 return First ? Size >= P.first : Size >= P.second;
23430 };
23431 auto FirstSizeSame = [](unsigned Size,
23432 const std::pair<unsigned, unsigned> &P) {
23433 return Size == P.first;
23434 };
23435 while (true) {
23436 ++Repeat;
23437 bool RepeatChanged = false;
23438 bool AnyProfitableGraph = false;
23439 for (unsigned VF : CandidateVFs) {
23440 AnyProfitableGraph = false;
23441 unsigned FirstUnvecStore =
23442 std::distance(RangeSizes.begin(),
23443 find_if(RangeSizes, std::bind(IsNotVectorized,
23444 VF >= MaxRegVF, _1)));
23445
23446 // Form slices of size VF starting from FirstUnvecStore and try to
23447 // vectorize them.
23448 while (FirstUnvecStore < End) {
23449 unsigned FirstVecStore = std::distance(
23450 RangeSizes.begin(),
23451 find_if(RangeSizes.drop_front(FirstUnvecStore),
23452 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23453 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23454 for (unsigned SliceStartIdx = FirstUnvecStore;
23455 SliceStartIdx + VF <= MaxSliceEnd;) {
23456 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23457 VF >= MaxRegVF)) {
23458 ++SliceStartIdx;
23459 continue;
23460 }
23461 ArrayRef<Value *> Slice =
23462 ArrayRef(Operands).slice(SliceStartIdx, VF);
23463 assert(all_of(Slice,
23464 [&](Value *V) {
23465 return cast<StoreInst>(V)
23466 ->getValueOperand()
23467 ->getType() ==
23468 cast<StoreInst>(Slice.front())
23469 ->getValueOperand()
23470 ->getType();
23471 }) &&
23472 "Expected all operands of same type.");
23473 if (!NonSchedulable.empty()) {
23474 auto [NonSchedSizeMax, NonSchedSizeMin] =
23475 NonSchedulable.lookup(Slice.front());
23476 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23477 // VF is too ambitious. Try to vectorize another slice before
23478 // trying a smaller VF.
23479 SliceStartIdx += NonSchedSizeMax;
23480 continue;
23481 }
23482 }
23483 unsigned TreeSize;
23484 std::optional<bool> Res =
23485 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23486 if (!Res) {
23487 // Update the range of non-schedulable VFs for slices starting
23488 // at SliceStartIdx.
23489 NonSchedulable
23490 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23491 .first->getSecond()
23492 .second = VF;
23493 } else if (*Res) {
23494 // Mark the vectorized stores so that we don't vectorize them
23495 // again.
23496 VectorizedStores.insert_range(Slice);
23499 AnyProfitableGraph = RepeatChanged = Changed = true;
23500 // If we vectorized the initial block, there is no need to try to
23501 // vectorize it again.
23502 for (std::pair<unsigned, unsigned> &P :
23503 RangeSizes.slice(SliceStartIdx, VF))
23504 P.first = P.second = 0;
23505 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23506 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23507 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23508 P.first = P.second = 0;
23509 FirstUnvecStore = SliceStartIdx + VF;
23510 }
23511 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23512 for (std::pair<unsigned, unsigned> &P :
23513 RangeSizes.slice(SliceStartIdx + VF,
23514 MaxSliceEnd - (SliceStartIdx + VF)))
23515 P.first = P.second = 0;
23516 if (MaxSliceEnd == End)
23517 End = SliceStartIdx;
23518 MaxSliceEnd = SliceStartIdx;
23519 }
23520 SliceStartIdx += VF;
23521 continue;
23522 }
23523 if (VF > 2 && Res &&
23524 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23525 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23526 _1))) {
23527 SliceStartIdx += VF;
23528 continue;
23529 }
23530 // For very big VFs, check that we are not rebuilding the same trees,
23531 // just with a larger number of elements.
23532 if (VF > MaxRegVF && TreeSize > 1 &&
23533 all_of(RangeSizes.slice(SliceStartIdx, VF),
23534 std::bind(FirstSizeSame, TreeSize, _1))) {
23535 SliceStartIdx += VF;
23536 while (SliceStartIdx != MaxSliceEnd &&
23537 RangeSizes[SliceStartIdx].first == TreeSize)
23538 ++SliceStartIdx;
23539 continue;
23540 }
23541 if (TreeSize > 1) {
23542 for (std::pair<unsigned, unsigned> &P :
23543 RangeSizes.slice(SliceStartIdx, VF)) {
23544 if (VF >= MaxRegVF)
23545 P.second = std::max(P.second, TreeSize);
23546 else
23547 P.first = std::max(P.first, TreeSize);
23548 }
23549 }
23550 ++SliceStartIdx;
23551 AnyProfitableGraph = true;
23552 }
23553 if (FirstUnvecStore >= End)
23554 break;
23555 if (MaxSliceEnd - FirstUnvecStore < VF &&
23556 MaxSliceEnd - FirstUnvecStore >= MinVF)
23557 AnyProfitableGraph = true;
23558 FirstUnvecStore = std::distance(
23559 RangeSizes.begin(),
23560 find_if(RangeSizes.drop_front(MaxSliceEnd),
23561 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23562 }
23563 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23564 break;
23565 }
23566 // All values vectorized - exit.
23567 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23568 return P.first == 0 && P.second == 0;
23569 }))
23570 break;
23571 // Check if we have tried all attempts or there is no need for further attempts at all.
23572 if (Repeat >= MaxAttempts ||
23573 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23574 break;
23575 constexpr unsigned StoresLimit = 64;
23576 const unsigned MaxTotalNum = std::min<unsigned>(
23577 Operands.size(),
23578 static_cast<unsigned>(
23579 End -
23580 std::distance(
23581 RangeSizes.begin(),
23582 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23583 1));
23584 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23585 unsigned Limit =
23586 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23587 CandidateVFs.clear();
23588 if (bit_floor(Limit) == VF)
23589 CandidateVFs.push_back(Limit);
23590 if (VF > MaxTotalNum || VF >= StoresLimit)
23591 break;
23592 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23593 if (P.first != 0)
23594 P.first = std::max(P.second, P.first);
23595 }
23596 // Last attempt to vectorize max number of elements, if all previous
23597 // attempts were unsuccessful because of cost issues.
23598 CandidateVFs.push_back(VF);
23599 }
23600 }
23601 };
23602
23603 /// Groups of stores to vectorize
23604 SmallVector<RelatedStoreInsts> SortedStores;
23605
23606 // Inserts the specified store SI with the given index Idx into the set of
23607 // stores. If a store with the same distance is already present - stop the
23608 // insertion and try to vectorize the stores found so far. If some stores
23609 // from this sequence were not vectorized - try to vectorize them together
23610 // with the new store later. But this logic applies only to the stores that
23611 // come before the previous store with the same distance.
23612 // Example:
23613 // 1. store x, %p
23614 // 2. store y, %p+1
23615 // 3. store z, %p+2
23616 // 4. store a, %p
23617 // 5. store b, %p+3
23618 // - Scan this from the last to first store. The very first bunch of stores is
23619 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23620 // vector).
23621 // - The next store in the list - #1 - has the same distance from store #5 as
23622 // the store #4.
23623 // - Try to vectorize sequence of stores 4,2,3,5.
23624 // - If all these stores are vectorized - just drop them.
23625 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23626 // - Start new stores sequence.
23627 // The new bunch of stores is {1, {1, 0}}.
23628 // - Add the stores from previous sequence, that were not vectorized.
23629 // Here we consider the stores in reverse order, rather than the order in
23630 // which they appear in the IR (Stores are reversed already, see the vectorizeStoreChains() function).
23631 // Store #3 can be added -> comes after store #4 with the same distance as
23632 // store #1.
23633 // Store #5 cannot be added - comes before store #4.
23634 // This logic improves compile time: we assume that the stores coming after
23635 // the previous store with the same distance most likely have memory
23636 // dependencies, so there is no need to waste compile time trying to vectorize them.
23637 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23638 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23639 std::optional<int64_t> PtrDist;
23640 auto *RelatedStores = find_if(
23641 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23642 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23643 return PtrDist.has_value();
23644 });
23645
23646 // We did not find a comparable store, start a new group.
23647 if (RelatedStores == SortedStores.end()) {
23648 SortedStores.emplace_back(Idx, Stores);
23649 return;
23650 }
23651
23652 // If there is already a store in the group with the same PtrDiff, try to
23653 // vectorize the existing instructions before adding the current store.
23654 // Otherwise, insert this store and keep collecting.
23655 if (std::optional<unsigned> PrevInst =
23656 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23657 TryToVectorize(RelatedStores->getStores());
23658 RelatedStores->clearVectorizedStores(VectorizedStores);
23659 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23660 /*NewBaseInstIdx=*/Idx,
23661 /*DistFromCurBase=*/*PtrDist);
23662 }
23663 };
23664 Type *PrevValTy = nullptr;
23665 for (auto [I, SI] : enumerate(Stores)) {
23666 if (R.isDeleted(SI))
23667 continue;
23668 if (!PrevValTy)
23669 PrevValTy = SI->getValueOperand()->getType();
23670 // Check that we do not try to vectorize stores of different types.
23671 if (PrevValTy != SI->getValueOperand()->getType()) {
23672 for (RelatedStoreInsts &StoreSeq : SortedStores)
23673 TryToVectorize(StoreSeq.getStores());
23674 SortedStores.clear();
23675 PrevValTy = SI->getValueOperand()->getType();
23676 }
23677 FillStoresSet(I, SI);
23678 }
23679
23680 // Final vectorization attempt.
23681 for (RelatedStoreInsts &StoreSeq : SortedStores)
23682 TryToVectorize(StoreSeq.getStores());
23683
23684 return Changed;
23685}
23686
23687void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23688 // Initialize the collections. We will make a single pass over the block.
23689 Stores.clear();
23690 GEPs.clear();
23691
23692 // Visit the store and getelementptr instructions in BB and organize them in
23693 // Stores and GEPs according to the underlying objects of their pointer
23694 // operands.
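// Illustrative example (not from the original source; names hypothetical):
//   %g = getelementptr i32, ptr %base, i64 %i
//   store i32 %x, ptr %base
//   store i32 %y, ptr %g
// Both stores share getUnderlyingObject() == %base, so they land in the same
// Stores bucket, while %g is recorded under GEPs[%base].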
23695 for (Instruction &I : *BB) {
23696 // Ignore store instructions that are not simple (volatile or atomic) or
23697 // whose stored value is not of a valid element type.
23698 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23699 if (!SI->isSimple())
23700 continue;
23701 if (!isValidElementType(SI->getValueOperand()->getType()))
23702 continue;
23703 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23704 }
23705
23706 // Ignore getelementptr instructions that have more than one index, a
23707 // constant index, an index of invalid type, or that produce a vector of
23708 // pointers.
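// Illustrative example (hypothetical, not from the original source):
//   %g1 = getelementptr i32, ptr %p, i64 4  ; skipped: constant index
//   %g2 = getelementptr i32, ptr %p, i64 %i ; recorded under GEPs[%p]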
23709 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23710 if (GEP->getNumIndices() != 1)
23711 continue;
23712 Value *Idx = GEP->idx_begin()->get();
23713 if (isa<Constant>(Idx))
23714 continue;
23715 if (!isValidElementType(Idx->getType()))
23716 continue;
23717 if (GEP->getType()->isVectorTy())
23718 continue;
23719 GEPs[GEP->getPointerOperand()].push_back(GEP);
23720 }
23721 }
23722}
23723
23724bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23725 bool MaxVFOnly) {
23726 if (VL.size() < 2)
23727 return false;
23728
23729 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23730 << VL.size() << ".\n");
23731
23732 // Check that all of the parts are instructions of the same type;
23733 // we permit an alternate opcode via InstructionsState.
23734 InstructionsState S = getSameOpcode(VL, *TLI);
23735 if (!S)
23736 return false;
23737
23738 Instruction *I0 = S.getMainOp();
23739 // Make sure invalid types (including vector type) are rejected before
23740 // determining vectorization factor for scalar instructions.
23741 for (Value *V : VL) {
23742 Type *Ty = V->getType();
23743 if (!isValidElementType(Ty)) {
23744 // NOTE: the following will give the user an internal LLVM type name, which
23745 // may not be useful.
23746 R.getORE()->emit([&]() {
23747 std::string TypeStr;
23748 llvm::raw_string_ostream OS(TypeStr);
23749 Ty->print(OS);
23750 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23751 << "Cannot SLP vectorize list: type "
23752 << TypeStr + " is unsupported by vectorizer";
23753 });
23754 return false;
23755 }
23756 }
23757
23758 Type *ScalarTy = getValueType(VL[0]);
23759 unsigned Sz = R.getVectorElementSize(I0);
23760 unsigned MinVF = R.getMinVF(Sz);
23761 unsigned MaxVF = std::max<unsigned>(
23762 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23763 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23764 if (MaxVF < 2) {
23765 R.getORE()->emit([&]() {
23766 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23767 << "Cannot SLP vectorize list: vectorization factor "
23768 << "less than 2 is not supported";
23769 });
23770 return false;
23771 }
23772
23773 bool Changed = false;
23774 bool CandidateFound = false;
23775 InstructionCost MinCost = SLPCostThreshold.getValue();
23776
23777 unsigned NextInst = 0, MaxInst = VL.size();
23778 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23779 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23780 // No actual vectorization should happen if the number of parts is the same
23781 // as the provided vectorization factor (i.e. the scalar type is used for
23782 // vector code during codegen).
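// Illustrative example (hypothetical target, not from the original source):
// with 128-bit vector registers, <4 x i128> legalizes into 4 parts - one
// scalar per element - so getNumberOfParts() == VF and the VF is skipped.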
23783 auto *VecTy = getWidenedType(ScalarTy, VF);
23784 if (TTI->getNumberOfParts(VecTy) == VF)
23785 continue;
23786 for (unsigned I = NextInst; I < MaxInst; ++I) {
23787 unsigned ActualVF = std::min(MaxInst - I, VF);
23788
23789 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23790 continue;
23791
23792 if (MaxVFOnly && ActualVF < MaxVF)
23793 break;
23794 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23795 break;
23796
23797 SmallVector<Value *> Ops(ActualVF, nullptr);
23798 unsigned Idx = 0;
23799 for (Value *V : VL.drop_front(I)) {
23800 // Check that a previous iteration of this loop did not delete the
23801 // Value.
23802 if (auto *Inst = dyn_cast<Instruction>(V);
23803 !Inst || !R.isDeleted(Inst)) {
23804 Ops[Idx] = V;
23805 ++Idx;
23806 if (Idx == ActualVF)
23807 break;
23808 }
23809 }
23810 // Not enough vectorizable instructions - exit.
23811 if (Idx != ActualVF)
23812 break;
23813
23814 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23815 << "\n");
23816
23817 R.buildTree(Ops);
23818 if (R.isTreeTinyAndNotFullyVectorizable())
23819 continue;
23820 if (R.isProfitableToReorder()) {
23821 R.reorderTopToBottom();
23822 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23823 }
23824 R.transformNodes();
23825 R.buildExternalUses();
23826
23827 R.computeMinimumValueSizes();
23828 InstructionCost Cost = R.getTreeCost();
23829 CandidateFound = true;
23830 MinCost = std::min(MinCost, Cost);
23831
23832 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23833 << " for VF=" << ActualVF << "\n");
23834 if (Cost < -SLPCostThreshold) {
23835 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23836 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23837 cast<Instruction>(Ops[0]))
23838 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23839 << " and with tree size "
23840 << ore::NV("TreeSize", R.getTreeSize()));
23841
23842 R.vectorizeTree();
23843 // Move to the next bundle.
23844 I += VF - 1;
23845 NextInst = I + 1;
23846 Changed = true;
23847 }
23848 }
23849 }
23850
23851 if (!Changed && CandidateFound) {
23852 R.getORE()->emit([&]() {
23853 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23854 << "List vectorization was possible but not beneficial with cost "
23855 << ore::NV("Cost", MinCost) << " >= "
23856 << ore::NV("Threshold", -SLPCostThreshold);
23857 });
23858 } else if (!Changed) {
23859 R.getORE()->emit([&]() {
23860 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23861 << "Cannot SLP vectorize list: vectorization was impossible"
23862 << " with available vectorization factors";
23863 });
23864 }
23865 return Changed;
23866}
23867
23868namespace {
23869
23870/// Model horizontal reductions.
23871///
23872/// A horizontal reduction is a tree of reduction instructions that has values
23873/// that can be put into a vector as its leaves. For example:
23874///
23875/// mul mul mul mul
23876/// \ / \ /
23877/// + +
23878/// \ /
23879/// +
23880/// This tree has "mul" as its leaf values and "+" as its reduction
23881/// instructions. A reduction can feed into a store or a binary operation
23882/// feeding a phi.
23883/// ...
23884/// \ /
23885/// +
23886/// |
23887/// phi +=
23888///
23889/// Or:
23890/// ...
23891/// \ /
23892/// +
23893/// |
23894/// *p =
23895///
23896class HorizontalReduction {
23897 using ReductionOpsType = SmallVector<Value *, 16>;
23898 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23899 ReductionOpsListType ReductionOps;
23900 /// List of possibly reduced values.
23901 SmallVector<SmallVector<Value *>> ReducedVals;
23902 /// Maps reduced value to the corresponding reduction operation.
23903 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23904 WeakTrackingVH ReductionRoot;
23905 /// The type of reduction operation.
23906 RecurKind RdxKind;
23907 /// Checks if the optimization of original scalar identity operations on
23908 /// matched horizontal reductions is enabled and allowed.
23909 bool IsSupportedHorRdxIdentityOp = false;
23910 /// The minimum number of the reduced values.
23911 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23912 /// Contains vector values for reduction including their scale factor and
23913 /// signedness.
23914 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23915
23916 static bool isCmpSelMinMax(Instruction *I) {
23917 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23918 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
23919 }
23920
23921 // And/or are potentially poison-safe logical patterns like:
23922 // select x, y, false
23923 // select x, true, y
23924 static bool isBoolLogicOp(Instruction *I) {
23925 return isa<SelectInst>(I) &&
23926 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23927 }
23928
23929 /// Checks if instruction is associative and can be vectorized.
23930 static bool isVectorizable(RecurKind Kind, Instruction *I,
23931 bool TwoElementReduction = false) {
23932 if (Kind == RecurKind::None)
23933 return false;
23934
23935 // Integer ops that map to select instructions or intrinsics are fine.
23936 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23937 isBoolLogicOp(I))
23938 return true;
23939
23940 // No need to check for associativity if there are only 2 reduced values.
23941 if (TwoElementReduction)
23942 return true;
23943
23944 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23945 // FP min/max are associative except for NaN and -0.0. We do not
23946 // have to rule out -0.0 here because the intrinsic semantics do not
23947 // specify a fixed result for it.
23948 return I->getFastMathFlags().noNaNs();
23949 }
23950
23951 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23952 return true;
23953
23954 return I->isAssociative();
23955 }
23956
23957 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23958 // Poison-safe 'or' takes the form: select X, true, Y
23959 // To make that work with the normal operand processing, we skip the
23960 // true value operand.
23961 // TODO: Change the code and data structures to handle this without a hack.
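// Illustrative example (not from the original source): for the poison-safe
// pattern
//   %or = select i1 %x, i1 true, i1 %y
// requesting operand 1 returns %y (operand 2) instead of the constant true.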
23962 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23963 return I->getOperand(2);
23964 return I->getOperand(Index);
23965 }
23966
23967 /// Creates a reduction operation with the current opcode.
23968 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23969 Value *RHS, const Twine &Name, bool UseSelect) {
23970 Type *OpTy = LHS->getType();
23971 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23972 switch (Kind) {
23973 case RecurKind::Or: {
23974 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23975 return Builder.CreateSelectWithUnknownProfile(
23976 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23977 RHS, DEBUG_TYPE, Name);
23978 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23979 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23980 Name);
23981 }
23982 case RecurKind::And: {
23983 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23984 return Builder.CreateSelectWithUnknownProfile(
23985 LHS, RHS,
23986 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
23987 DEBUG_TYPE, Name);
23988 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23989 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23990 Name);
23991 }
23992 case RecurKind::Add:
23993 case RecurKind::Mul:
23994 case RecurKind::Xor:
23995 case RecurKind::FAdd:
23996 case RecurKind::FMul: {
23997 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23998 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23999 Name);
24000 }
24001 case RecurKind::SMax:
24002 case RecurKind::SMin:
24003 case RecurKind::UMax:
24004 case RecurKind::UMin:
24005 if (UseSelect) {
24006 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
24007 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
24008 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
24009 Name);
24010 }
24011 [[fallthrough]];
24012 case RecurKind::FMax:
24013 case RecurKind::FMin:
24014 case RecurKind::FMaximum:
24015 case RecurKind::FMinimum:
24016 case RecurKind::FMaximumNum:
24017 case RecurKind::FMinimumNum: {
24018 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
24019 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
24020 }
24021 default:
24022 llvm_unreachable("Unknown reduction operation.");
24023 }
24024 }
24025
24026 /// Creates a reduction operation with the current opcode, applying the IR
24027 /// flags from \p ReductionOps and dropping nuw/nsw flags.
24028 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
24029 Value *RHS, const Twine &Name,
24030 const ReductionOpsListType &ReductionOps) {
24031 bool UseSelect = ReductionOps.size() == 2 ||
24032 // Logical or/and.
24033 (ReductionOps.size() == 1 &&
24034 any_of(ReductionOps.front(), IsaPred<SelectInst>));
24035 assert((!UseSelect || ReductionOps.size() != 2 ||
24036 isa<SelectInst>(ReductionOps[1][0])) &&
24037 "Expected cmp + select pairs for reduction");
24038 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
24039 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
24040 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
24041 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
24042 /*IncludeWrapFlags=*/false);
24043 propagateIRFlags(Op, ReductionOps[1], nullptr,
24044 /*IncludeWrapFlags=*/false);
24045 return Op;
24046 }
24047 }
24048 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
24049 return Op;
24050 }
24051
24052public:
24053 static RecurKind getRdxKind(Value *V) {
24054 auto *I = dyn_cast<Instruction>(V);
24055 if (!I)
24056 return RecurKind::None;
24057 if (match(I, m_Add(m_Value(), m_Value())))
24058 return RecurKind::Add;
24059 if (match(I, m_Mul(m_Value(), m_Value())))
24060 return RecurKind::Mul;
24061 if (match(I, m_And(m_Value(), m_Value())) ||
24062 match(I, m_LogicalAnd(m_Value(), m_Value())))
24063 return RecurKind::And;
24064 if (match(I, m_Or(m_Value(), m_Value())) ||
24065 match(I, m_LogicalOr(m_Value(), m_Value())))
24066 return RecurKind::Or;
24067 if (match(I, m_Xor(m_Value(), m_Value())))
24068 return RecurKind::Xor;
24069 if (match(I, m_FAdd(m_Value(), m_Value())))
24070 return RecurKind::FAdd;
24071 if (match(I, m_FMul(m_Value(), m_Value())))
24072 return RecurKind::FMul;
24073
24074 if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
24075 return RecurKind::FMax;
24076 if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
24077 return RecurKind::FMin;
24078
24079 if (match(I, m_FMaximum(m_Value(), m_Value())))
24080 return RecurKind::FMaximum;
24081 if (match(I, m_FMinimum(m_Value(), m_Value())))
24082 return RecurKind::FMinimum;
24083 // This matches either cmp+select or intrinsics. SLP is expected to handle
24084 // either form.
24085 // TODO: If we are canonicalizing to intrinsics, we can remove several
24086 // special-case paths that deal with selects.
24087 if (match(I, m_SMax(m_Value(), m_Value())))
24088 return RecurKind::SMax;
24089 if (match(I, m_SMin(m_Value(), m_Value())))
24090 return RecurKind::SMin;
24091 if (match(I, m_UMax(m_Value(), m_Value())))
24092 return RecurKind::UMax;
24093 if (match(I, m_UMin(m_Value(), m_Value())))
24094 return RecurKind::UMin;
24095
24096 if (auto *Select = dyn_cast<SelectInst>(I)) {
24097 // Try harder: look for a min/max pattern based on instructions producing
24098 // the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
24099 // During the intermediate stages of SLP, it's very common to have
24100 // a pattern like this (since optimizeGatherSequence is run only once
24101 // at the end):
24102 // %1 = extractelement <2 x i32> %a, i32 0
24103 // %2 = extractelement <2 x i32> %a, i32 1
24104 // %cond = icmp sgt i32 %1, %2
24105 // %3 = extractelement <2 x i32> %a, i32 0
24106 // %4 = extractelement <2 x i32> %a, i32 1
24107 // %select = select i1 %cond, i32 %3, i32 %4
24108 CmpPredicate Pred;
24109 Instruction *L1;
24110 Instruction *L2;
24111
24112 Value *LHS = Select->getTrueValue();
24113 Value *RHS = Select->getFalseValue();
24114 Value *Cond = Select->getCondition();
24115
24116 // TODO: Support inverse predicates.
24117 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
24118 if (!isa<ExtractElementInst>(RHS) ||
24119 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24120 return RecurKind::None;
24121 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
24122 if (!isa<ExtractElementInst>(LHS) ||
24123 !L1->isIdenticalTo(cast<Instruction>(LHS)))
24124 return RecurKind::None;
24125 } else {
24126 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
24127 return RecurKind::None;
24128 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
24129 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
24130 !L2->isIdenticalTo(cast<Instruction>(RHS)))
24131 return RecurKind::None;
24132 }
24133
24134 switch (Pred) {
24135 default:
24136 return RecurKind::None;
24137 case CmpInst::ICMP_SGT:
24138 case CmpInst::ICMP_SGE:
24139 return RecurKind::SMax;
24140 case CmpInst::ICMP_SLT:
24141 case CmpInst::ICMP_SLE:
24142 return RecurKind::SMin;
24143 case CmpInst::ICMP_UGT:
24144 case CmpInst::ICMP_UGE:
24145 return RecurKind::UMax;
24146 case CmpInst::ICMP_ULT:
24147 case CmpInst::ICMP_ULE:
24148 return RecurKind::UMin;
24149 }
24150 }
24151 return RecurKind::None;
24152 }
24153
24154 /// Get the index of the first operand.
24155 static unsigned getFirstOperandIndex(Instruction *I) {
24156 return isCmpSelMinMax(I) ? 1 : 0;
24157 }
24158
24159private:
24160 /// Total number of operands in the reduction operation.
24161 static unsigned getNumberOfOperands(Instruction *I) {
24162 return isCmpSelMinMax(I) ? 3 : 2;
24163 }
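// Illustrative example (hypothetical values): for a cmp+select min/max such
// as
//   %s = select i1 %cond, i32 %a, i32 %b
// the operand range [1, 3) covers the reduced values %a and %b, while
// operand 0 (%cond) is handled separately as the compare condition.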
24164
24165 /// Checks if the instruction is in basic block \p BB.
24166 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
24167 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24168 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24169 auto *Sel = cast<SelectInst>(I);
24170 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
24171 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24172 }
24173 return I->getParent() == BB;
24174 }
24175
24176 /// Expected number of uses for reduction operations/reduced values.
24177 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24178 if (IsCmpSelMinMax) {
24179 // SelectInst must be used twice while the condition op must have a single
24180 // use only.
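// Illustrative example (hypothetical values): in a min/max reduction chain
//   %c2 = icmp sgt i32 %s1, %x
//   %s2 = select i1 %c2, i32 %s1, i32 %x
// the previous select %s1 has exactly two uses (%c2 and %s2), while %c2
// itself has the single use %s2.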
24181 if (auto *Sel = dyn_cast<SelectInst>(I))
24182 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24183 return I->hasNUses(2);
24184 }
24185
24186 // Arithmetic reduction operation must be used once only.
24187 return I->hasOneUse();
24188 }
24189
24190 /// Initializes the list of reduction operations.
24191 void initReductionOps(Instruction *I) {
24192 if (isCmpSelMinMax(I))
24193 ReductionOps.assign(2, ReductionOpsType());
24194 else
24195 ReductionOps.assign(1, ReductionOpsType());
24196 }
24197
24198 /// Add all reduction operations for the reduction instruction \p I.
24199 void addReductionOps(Instruction *I) {
24200 if (isCmpSelMinMax(I)) {
24201 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
24202 ReductionOps[1].emplace_back(I);
24203 } else {
24204 ReductionOps[0].emplace_back(I);
24205 }
24206 }
24207
24208 static bool isGoodForReduction(ArrayRef<Value *> Data) {
24209 int Sz = Data.size();
24210 auto *I = dyn_cast<Instruction>(Data.front());
24211 return Sz > 1 || isConstant(Data.front()) ||
24212 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
24213 }
24214
24215public:
24216 HorizontalReduction() = default;
24217 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
24218 : ReductionRoot(I), ReductionLimit(2) {
24219 RdxKind = HorizontalReduction::getRdxKind(I);
24220 ReductionOps.emplace_back().push_back(I);
24221 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
24222 for (Value *V : Ops)
24223 ReducedValsToOps[V].push_back(I);
24224 }
24225
24226 bool matchReductionForOperands() const {
24227 // Analyze "regular" integer/FP types for reductions - no target-specific
24228 // types or pointers.
24229 assert(ReductionRoot && "Reduction root is not set!");
24230 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
24231 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24232 return Ops.size() == 2;
24233 })))
24234 return false;
24235
24236 return true;
24237 }
24238
24239 /// Try to find a reduction tree.
24240 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24241 ScalarEvolution &SE, const DataLayout &DL,
24242 const TargetLibraryInfo &TLI) {
24243 RdxKind = HorizontalReduction::getRdxKind(Root);
24244 if (!isVectorizable(RdxKind, Root))
24245 return false;
24246
24247 // Analyze "regular" integer/FP types for reductions - no target-specific
24248 // types or pointers.
24249 Type *Ty = Root->getType();
24250 if (!isValidElementType(Ty) || Ty->isPointerTy())
24251 return false;
24252
24253 // Though the ultimate reduction may have multiple uses, its condition must
24254 // have only a single use.
24255 if (auto *Sel = dyn_cast<SelectInst>(Root))
24256 if (!Sel->getCondition()->hasOneUse())
24257 return false;
24258
24259 ReductionRoot = Root;
24260
24261 // Iterate through all the operands of the possible reduction tree and
24262 // gather all the reduced values, sorting them by their value id.
24263 BasicBlock *BB = Root->getParent();
24264 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24265 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24266 1, std::make_pair(Root, 0));
24267 // Checks if the operands of the \p TreeN instruction are also reduction
24268 // operations or should be treated as reduced values or an extra argument,
24269 // which is not part of the reduction.
24270 auto CheckOperands = [&](Instruction *TreeN,
24271 SmallVectorImpl<Value *> &PossibleReducedVals,
24272 SmallVectorImpl<Instruction *> &ReductionOps,
24273 unsigned Level) {
24274 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24275 getNumberOfOperands(TreeN)))) {
24276 Value *EdgeVal = getRdxOperand(TreeN, I);
24277 ReducedValsToOps[EdgeVal].push_back(TreeN);
24278 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24279 // If the edge is not an instruction, or it differs from the main
24280 // reduction opcode or has too many uses, it is a possible reduced value.
24281 // Also, do not try to reduce constant values if the operation is not
24282 // foldable.
24283 if (!EdgeInst || Level > RecursionMaxDepth ||
24284 getRdxKind(EdgeInst) != RdxKind ||
24285 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24286 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24287 !isVectorizable(RdxKind, EdgeInst) ||
24288 (R.isAnalyzedReductionRoot(EdgeInst) &&
24289 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24290 PossibleReducedVals.push_back(EdgeVal);
24291 continue;
24292 }
24293 ReductionOps.push_back(EdgeInst);
24294 }
24295 };
24296 // Try to regroup the reduced values so that reducing them becomes more
24297 // profitable. Values are grouped by their value ids, instructions by
24298 // instruction opcode id and/or alternate opcode id, plus extra analysis for
24299 // loads (grouping them by the distance between pointers) and cmp
24300 // instructions (grouping them by the predicate).
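// Illustrative example (hypothetical, simplified): for the reduced values
//   {add %a1, add %a2, load %p, load %p+4}
// the adds form one group by value id and the loads another, with the loads
// further sub-keyed by pointer distance so that consecutive loads can later
// form a single vector load.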
24301 SmallMapVector<
24302 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24303 8>
24304 PossibleReducedVals;
24305 initReductionOps(Root);
24306 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24307 SmallSet<size_t, 2> LoadKeyUsed;
24308
24309 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24310 Key = hash_combine(hash_value(LI->getParent()), Key);
24311 Value *Ptr =
24312 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
24313 if (!LoadKeyUsed.insert(Key).second) {
24314 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24315 if (LIt != LoadsMap.end()) {
24316 for (LoadInst *RLI : LIt->second) {
24317 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24318 LI->getType(), LI->getPointerOperand(), DL, SE,
24319 /*StrictCheck=*/true))
24320 return hash_value(RLI->getPointerOperand());
24321 }
24322 for (LoadInst *RLI : LIt->second) {
24323 if (arePointersCompatible(RLI->getPointerOperand(),
24324 LI->getPointerOperand(), TLI)) {
24325 hash_code SubKey = hash_value(RLI->getPointerOperand());
24326 return SubKey;
24327 }
24328 }
24329 if (LIt->second.size() > 2) {
24330 hash_code SubKey =
24331 hash_value(LIt->second.back()->getPointerOperand());
24332 return SubKey;
24333 }
24334 }
24335 }
24336 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24337 .first->second.push_back(LI);
24338 return hash_value(LI->getPointerOperand());
24339 };
24340
24341 while (!Worklist.empty()) {
24342 auto [TreeN, Level] = Worklist.pop_back_val();
24343 SmallVector<Value *> PossibleRedVals;
24344 SmallVector<Instruction *> PossibleReductionOps;
24345 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24346 addReductionOps(TreeN);
24347 // Add reduction values. The values are sorted for better vectorization
24348 // results.
24349 for (Value *V : PossibleRedVals) {
24350 size_t Key, Idx;
24351 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24352 /*AllowAlternate=*/false);
24353 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24354 }
24355 for (Instruction *I : reverse(PossibleReductionOps))
24356 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24357 }
24358 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24359 // Sort values by the total number of value kinds to start the reduction
24360 // from the longest possible sequences of reduced values.
24361 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24362 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24363 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24364 for (auto &Slice : PossibleRedVals) {
24365 PossibleRedValsVect.emplace_back();
24366 auto RedValsVect = Slice.second.takeVector();
24367 stable_sort(RedValsVect, llvm::less_second());
24368 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24369 PossibleRedValsVect.back().append(Data.second, Data.first);
24370 }
24371 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24372 return P1.size() > P2.size();
24373 });
24374 bool First = true;
24375 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24376 if (First) {
24377 First = false;
24378 ReducedVals.emplace_back();
24379 } else if (!isGoodForReduction(Data)) {
24380 auto *LI = dyn_cast<LoadInst>(Data.front());
24381 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24382 if (!LI || !LastLI ||
24383 getUnderlyingObject(LI->getPointerOperand()) !=
24384 getUnderlyingObject(LastLI->getPointerOperand()))
24385 ReducedVals.emplace_back();
24386 }
24387 ReducedVals.back().append(Data.rbegin(), Data.rend());
24388 }
24389 }
24390 // Sort the reduced values by the number of same/alternate opcodes and/or by
24391 // pointer operand.
24392 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24393 return P1.size() > P2.size();
24394 });
24395 return true;
24396 }
24397
24398 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24399 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24400 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24401 DominatorTree &DT) {
24402 constexpr unsigned RegMaxNumber = 4;
24403 constexpr unsigned RedValsMaxNumber = 128;
24404 // If there are a sufficient number of reduction values, reduce
24405 // to a nearby power-of-2. We can safely generate oversized
24406 // vectors and rely on the backend to split them to legal sizes.
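// Illustrative example (hypothetical target): 8 reduced i32 values can be
// emitted as a single <8 x i32> reduction even with 128-bit registers; the
// backend later splits it into two <4 x i32> halves.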
24407 if (unsigned NumReducedVals = std::accumulate(
24408 ReducedVals.begin(), ReducedVals.end(), 0,
24409 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24410 if (!isGoodForReduction(Vals))
24411 return Num;
24412 return Num + Vals.size();
24413 });
24414 NumReducedVals < ReductionLimit &&
24415 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24416 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24417 })) {
24418 for (ReductionOpsType &RdxOps : ReductionOps)
24419 for (Value *RdxOp : RdxOps)
24420 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24421 return nullptr;
24422 }
24423
24424 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24425 TargetFolder(DL));
24426 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24427
24428 // Track the reduced values in case they are replaced by extractelement
24429 // because of the vectorization.
24430 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24431 ReducedVals.front().size());
24432
24433 // The compare instruction of a min/max is the insertion point for new
24434 // instructions and may be replaced with a new compare instruction.
24435 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24436 assert(isa<SelectInst>(RdxRootInst) &&
24437 "Expected min/max reduction to have select root instruction");
24438 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24439 assert(isa<Instruction>(ScalarCond) &&
24440 "Expected min/max reduction to have compare condition");
24441 return cast<Instruction>(ScalarCond);
24442 };
24443
24444 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24445 return isBoolLogicOp(cast<Instruction>(V));
24446 });
24447 // Return new VectorizedTree, based on previous value.
24448 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24449 if (VectorizedTree) {
24450 // Update the final value in the reduction.
24451 Builder.SetCurrentDebugLocation(
24452 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24453 if (AnyBoolLogicOp) {
24454 auto It = ReducedValsToOps.find(VectorizedTree);
24455 auto It1 = ReducedValsToOps.find(Res);
24456 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24457 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24458 (It != ReducedValsToOps.end() &&
24459 any_of(It->getSecond(), [&](Instruction *I) {
24460 return isBoolLogicOp(I) &&
24461 getRdxOperand(I, 0) == VectorizedTree;
24462 }))) {
24463 ;
24464 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24465 (It1 != ReducedValsToOps.end() &&
24466 any_of(It1->getSecond(), [&](Instruction *I) {
24467 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24468 }))) {
24469 std::swap(VectorizedTree, Res);
24470 } else {
24471 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24472 }
24473 }
24474
24475 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24476 ReductionOps);
24477 }
24478 // Initialize the final value in the reduction.
24479 return Res;
24480 };
24481 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24482 ReductionOps.front().size());
24483 for (ReductionOpsType &RdxOps : ReductionOps)
24484 for (Value *RdxOp : RdxOps) {
24485 if (!RdxOp)
24486 continue;
24487 IgnoreList.insert(RdxOp);
24488 }
24489 // Intersect the fast-math-flags from all reduction operations.
24490 FastMathFlags RdxFMF;
24491 RdxFMF.set();
24492 for (Value *U : IgnoreList)
24493 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24494 RdxFMF &= FPMO->getFastMathFlags();
24495 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24496
24497 // Need to track the reduced values; they may be changed during the
24498 // vectorization of subvectors.
24499 for (ArrayRef<Value *> Candidates : ReducedVals)
24500 for (Value *V : Candidates)
24501 TrackedVals.try_emplace(V, V);
24502
24503 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24504 Value *V) -> unsigned & {
24505 auto *It = MV.find(V);
24506 assert(It != MV.end() && "Unable to find given key.");
24507 return It->second;
24508 };
24509
24510 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24511 // List of the values that were reduced in other trees as part of gather
24512 // nodes and thus require an extract if fully vectorized in other trees.
24513 SmallPtrSet<Value *, 4> RequiredExtract;
24514 WeakTrackingVH VectorizedTree = nullptr;
24515 bool CheckForReusedReductionOps = false;
24516 // Try to vectorize elements based on their type.
24517 SmallVector<InstructionsState> States;
24518 for (ArrayRef<Value *> RV : ReducedVals)
24519 States.push_back(getSameOpcode(RV, TLI));
24520 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24521 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24522 InstructionsState S = States[I];
24523 SmallVector<Value *> Candidates;
24524 Candidates.reserve(2 * OrigReducedVals.size());
24525 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24526 for (Value *ReducedVal : OrigReducedVals) {