1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115
116DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
117 "Controls which SLP graphs should be vectorized.");
118
119static cl::opt<bool>
120 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
121 cl::desc("Run the SLP vectorization passes"));
122
123static cl::opt<bool>
124 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
125 cl::desc("Enable vectorization for wider vector utilization"));
126
127static cl::opt<int>
129 cl::desc("Only vectorize if you gain more than this "
130 "number "));
131
133 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
134 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
136
137static cl::opt<bool>
138ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
139 cl::desc("Attempt to vectorize horizontal reductions"));
140
142 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
143 cl::desc(
144 "Attempt to vectorize horizontal reductions feeding into a store"));
145
147 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
148 cl::desc("Improve the code quality by splitting alternate instructions"));
149
150static cl::opt<int>
152 cl::desc("Attempt to vectorize for this register size in bits"));
153
156 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
157
158/// Limits the size of scheduling regions in a block.
159/// It avoids long compile times for _very_ large blocks where vector
160/// instructions are spread over a wide range.
161/// This limit is way higher than needed by real-world functions.
162static cl::opt<int>
163ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
164 cl::desc("Limit the size of the SLP scheduling region per block"));
165
167 "slp-min-reg-size", cl::init(128), cl::Hidden,
168 cl::desc("Attempt to vectorize for this register size in bits"));
169
171 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
172 cl::desc("Limit the recursion depth when building a vectorizable tree"));
173
175 "slp-min-tree-size", cl::init(3), cl::Hidden,
176 cl::desc("Only vectorize small trees if they are fully vectorizable"));
177
178// The maximum depth that the look-ahead score heuristic will explore.
179// The higher this value, the higher the compilation time overhead.
181 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
182 cl::desc("The maximum look-ahead depth for operand reordering scores"));
183
184// The maximum depth that the look-ahead score heuristic will explore
185// when it is probing among candidates for vectorization tree roots.
186// The higher this value, the higher the compilation time overhead, but unlike
187// the similar limit for operand reordering this is used less frequently, so the
188// impact of a higher value is less noticeable.
190 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
191 cl::desc("The maximum look-ahead depth for searching best rooting option"));
192
194 "slp-min-strided-loads", cl::init(2), cl::Hidden,
195 cl::desc("The minimum number of loads, which should be considered strided, "
196 "if the stride is > 1 or is runtime value"));
197
199 "slp-max-stride", cl::init(8), cl::Hidden,
200 cl::desc("The maximum stride, considered to be profitable."));
201
202static cl::opt<bool>
203 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
204 cl::desc("Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
206
207static cl::opt<bool>
208 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
209 cl::desc("Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
211
212static cl::opt<bool>
213 ViewSLPTree("view-slp-tree", cl::Hidden,
214 cl::desc("Display the SLP trees with Graphviz"));
215
217 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
218 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
219
220/// Enables vectorization of copyable elements.
222 "slp-copyable-elements", cl::init(true), cl::Hidden,
223 cl::desc("Try to replace values with the idempotent instructions for "
224 "better vectorization."));
225
226// Limit the number of alias checks. The limit is chosen so that
227// it has no negative effect on the llvm benchmarks.
228static const unsigned AliasedCheckLimit = 10;
229
230// Limit of the number of uses for potentially transformed instructions/values,
231// used in checks to avoid compile-time explosion.
232static constexpr int UsesLimit = 64;
233
234// Another limit for the alias checks: The maximum distance between load/store
235// instructions where alias checks are done.
236// This limit is useful for very large basic blocks.
237static const unsigned MaxMemDepDistance = 160;
238
239/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
240/// regions to be handled.
241static const int MinScheduleRegionSize = 16;
242
243/// Maximum allowed number of operands in the PHI nodes.
244static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important things to filter here are types which are invalid in LLVM
249/// vectors. We also filter target-specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
260
261/// Returns the type of the given value/instruction \p V. If it is a store,
262/// returns the type of its value operand; for Cmp - the type of the compare
263/// operands; and for insertelement - the type of the inserted operand.
264/// Otherwise, just the type of the value is returned.
266 if (auto *SI = dyn_cast<StoreInst>(V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(V))
269 return CI->getOperand(0)->getType();
270 if (auto *IE = dyn_cast<InsertElementInst>(V))
271 return IE->getOperand(1)->getType();
272 return V->getType();
273}
274
275/// \returns the number of elements for Ty.
276static unsigned getNumElements(Type *Ty) {
278 "ScalableVectorType is not supported.");
279 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
280 return VecTy->getNumElements();
281 return 1;
282}
283
284/// \returns the vector type of ScalarTy based on vectorization factor.
285static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
286 return FixedVectorType::get(ScalarTy->getScalarType(),
287 VF * getNumElements(ScalarTy));
288}
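// For illustration:
//   getWidenedType(i32, 8)       --> <8 x i32>
//   getWidenedType(<2 x i32>, 8) --> <16 x i32>   (REVEC: the "scalar" element
//                                                  is itself a fixed vector)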
289
290/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
291/// which forms a type that \p TTI splits into whole vector types during
292/// legalization.
294 Type *Ty, unsigned Sz) {
295 if (!isValidElementType(Ty))
296 return bit_ceil(Sz);
297 // Find the number of elements, which forms full vectors.
298 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
299 if (NumParts == 0 || NumParts >= Sz)
300 return bit_ceil(Sz);
301 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
302}
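// For illustration, assuming a target where TTI splits <6 x i32> into 2 parts
// (e.g. 128-bit vector registers): for Ty = i32 and Sz = 6 this returns
// bit_ceil(divideCeil(6, 2)) * 2 = 4 * 2 = 8, i.e. two full <4 x i32> registers.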
303
304/// Returns the number of elements of the given type \p Ty, not greater than \p
305/// Sz, which forms a type that \p TTI splits into whole vector types during
306/// legalization.
307static unsigned
309 unsigned Sz) {
310 if (!isValidElementType(Ty))
311 return bit_floor(Sz);
312 // Find the number of elements, which forms full vectors.
313 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
314 if (NumParts == 0 || NumParts >= Sz)
315 return bit_floor(Sz);
316 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
317 if (RegVF > Sz)
318 return bit_floor(Sz);
319 return (Sz / RegVF) * RegVF;
320}
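// For illustration, under the same assumption (<6 x i32> splits into 2 parts):
// for Ty = i32 and Sz = 6, RegVF = bit_ceil(divideCeil(6, 2)) = 4, so the
// result is (6 / 4) * 4 = 4, i.e. one full <4 x i32> register.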
321
322static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
323 SmallVectorImpl<int> &Mask) {
324 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
325 // But the element has a different meaning for SLP (scalar) and REVEC
326 // (vector). We need to expand Mask into masks which shufflevector can use
327 // directly.
328 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
329 for (unsigned I : seq<unsigned>(Mask.size()))
330 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
331 I * VecTyNumElements, VecTyNumElements)))
332 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
333 : Mask[I] * VecTyNumElements + J;
334 Mask.swap(NewMask);
335}
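// For illustration: with VecTyNumElements = 2, the scalar mask <1, 0> expands
// to the per-lane mask <2, 3, 0, 1>, and <1, poison> expands to
// <2, 3, poison, poison>.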
336
337/// \returns the number of groups of shufflevectors.
338/// A group has the following features:
339/// 1. All values in a group are shufflevectors.
340/// 2. The mask of every shufflevector is an extract-subvector mask.
341/// 3. Together, the masks in a group use all of the elements of the source.
342/// e.g., this is 1 group (%0)
343/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
344/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
345/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
346/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
347/// this is 2 groups (%3 and %4)
348/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
351/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
352/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
353/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
354/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
355/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
356/// this is 0 groups
357/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
358/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
359/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
360/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
362 if (VL.empty())
363 return 0;
365 return 0;
366 auto *SV = cast<ShuffleVectorInst>(VL.front());
367 unsigned SVNumElements =
368 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
369 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
370 if (SVNumElements % ShuffleMaskSize != 0)
371 return 0;
372 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
373 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
374 return 0;
375 unsigned NumGroup = 0;
376 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
377 auto *SV = cast<ShuffleVectorInst>(VL[I]);
378 Value *Src = SV->getOperand(0);
379 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
380 SmallBitVector ExpectedIndex(GroupSize);
381 if (!all_of(Group, [&](Value *V) {
382 auto *SV = cast<ShuffleVectorInst>(V);
383 // From the same source.
384 if (SV->getOperand(0) != Src)
385 return false;
386 int Index;
387 if (!SV->isExtractSubvectorMask(Index))
388 return false;
389 ExpectedIndex.set(Index / ShuffleMaskSize);
390 return true;
391 }))
392 return 0;
393 if (!ExpectedIndex.all())
394 return 0;
395 ++NumGroup;
396 }
397 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
398 return NumGroup;
399}
400
401/// \returns a shufflevector mask which is used to vectorize shufflevectors
402/// e.g.,
403/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
408/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
409/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
410/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
411/// the result is
412/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
414 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
415 auto *SV = cast<ShuffleVectorInst>(VL.front());
416 unsigned SVNumElements =
417 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
418 SmallVector<int> Mask;
419 unsigned AccumulateLength = 0;
420 for (Value *V : VL) {
421 auto *SV = cast<ShuffleVectorInst>(V);
422 for (int M : SV->getShuffleMask())
423 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
424 : AccumulateLength + M);
425 AccumulateLength += SVNumElements;
426 }
427 return Mask;
428}
429
430/// \returns True if the value is a constant (but not globals/constant
431/// expressions).
432static bool isConstant(Value *V) {
434}
435
436/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
437/// insertelement/extractelement with constant indices for a fixed vector type,
438/// or an extractvalue instruction.
442 return false;
443 auto *I = dyn_cast<Instruction>(V);
444 if (!I || isa<ExtractValueInst>(I))
445 return true;
446 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
447 return false;
449 return isConstant(I->getOperand(1));
450 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
451 return isConstant(I->getOperand(2));
452}
453
454/// Returns power-of-2 number of elements in a single register (part), given the
455/// total number of elements \p Size and number of registers (parts) \p
456/// NumParts.
457static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
458 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
459}
460
461/// Returns the correct remaining number of elements, considering the total
462/// amount \p Size, the (power-of-2) number of elements in a single register
463/// \p PartNumElems and the current register (part) \p Part.
464static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
465 unsigned Part) {
466 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
467}
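// For illustration: with Size = 6 and NumParts = 2, getPartNumElems returns
// min(6, bit_ceil(divideCeil(6, 2))) = 4; getNumElems then yields 4 elements
// for part 0 and the remaining 6 - 4 = 2 elements for part 1.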
468
469#if !defined(NDEBUG)
470/// Print a short descriptor of the instruction bundle suitable for debug output.
471static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
472 std::string Result;
473 raw_string_ostream OS(Result);
474 if (Idx >= 0)
475 OS << "Idx: " << Idx << ", ";
476 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
477 return Result;
478}
479#endif
480
481/// \returns true if all of the instructions in \p VL are in the same block or
482/// false otherwise.
484 auto *It = find_if(VL, IsaPred<Instruction>);
485 if (It == VL.end())
486 return false;
489 return true;
490
491 BasicBlock *BB = I0->getParent();
492 for (Value *V : iterator_range(It, VL.end())) {
493 if (isa<PoisonValue>(V))
494 continue;
495 auto *II = dyn_cast<Instruction>(V);
496 if (!II)
497 return false;
498
499 if (BB != II->getParent())
500 return false;
501 }
502 return true;
503}
504
505/// \returns True if all of the values in \p VL are constants (but not
506/// globals/constant expressions).
508 // Constant expressions and globals can't be vectorized like normal integer/FP
509 // constants.
510 return all_of(VL, isConstant);
511}
512
513/// \returns True if all of the values in \p VL are identical or some of them
514/// are UndefValue.
515static bool isSplat(ArrayRef<Value *> VL) {
516 Value *FirstNonUndef = nullptr;
517 for (Value *V : VL) {
518 if (isa<UndefValue>(V))
519 continue;
520 if (!FirstNonUndef) {
521 FirstNonUndef = V;
522 continue;
523 }
524 if (V != FirstNonUndef)
525 return false;
526 }
527 return FirstNonUndef != nullptr;
528}
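// For illustration: {%x, undef, %x, %x} is a splat of %x; {%x, %y} is not a
// splat; {undef, undef} is not considered a splat because there is no
// non-undef value to splat.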
529
530/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
531/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
532/// patterns that make it effectively commutative (like equality comparisons
533/// with zero).
534/// In most cases, users should not call this function directly (since \p I and
535/// \p ValWithUses are the same). However, when analyzing interchangeable
536/// instructions, we need to use the converted opcode along with the original
537/// uses.
538/// \param I The instruction to check for commutativity
539/// \param ValWithUses The value whose uses are analyzed for special
540/// patterns
541static bool isCommutative(Instruction *I, Value *ValWithUses,
542 bool IsCopyable = false) {
543 if (auto *Cmp = dyn_cast<CmpInst>(I))
544 return Cmp->isCommutative();
545 if (auto *BO = dyn_cast<BinaryOperator>(I))
546 return BO->isCommutative() ||
547 (BO->getOpcode() == Instruction::Sub &&
548 ValWithUses->hasUseList() &&
549 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
550 all_of(
551 ValWithUses->uses(),
552 [&](const Use &U) {
553 // Commutative, if icmp eq/ne sub, 0
554 CmpPredicate Pred;
555 if (match(U.getUser(),
556 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
557 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
558 return true;
559 // Commutative, if abs(sub nsw, true) or abs(sub, false).
560 ConstantInt *Flag;
561 auto *I = dyn_cast<BinaryOperator>(U.get());
562 return match(U.getUser(),
563 m_Intrinsic<Intrinsic::abs>(
564 m_Specific(U.get()), m_ConstantInt(Flag))) &&
565 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
566 Flag->isOne());
567 })) ||
568 (BO->getOpcode() == Instruction::FSub &&
569 ValWithUses->hasUseList() &&
570 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
571 all_of(ValWithUses->uses(), [](const Use &U) {
572 return match(U.getUser(),
573 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
574 }));
575 return I->isCommutative();
576}
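// For illustration, a sub whose only use is an equality compare with zero is
// treated as commutative, because (a - b == 0) iff (b - a == 0):
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// Similarly, an fsub whose only use is a call to @llvm.fabs is treated as
// commutative.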
577
578/// Checks if the operand is commutative. In commutative operations, not all
579/// operands might be commutable, e.g. for fmuladd only the first 2 operands are
580/// commutable.
581static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
582 bool IsCopyable = false) {
583 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
584 "The instruction is not commutative.");
585 if (isa<CmpInst>(I))
586 return true;
587 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
588 switch (BO->getOpcode()) {
589 case Instruction::Sub:
590 case Instruction::FSub:
591 return true;
592 default:
593 break;
594 }
595 }
596 return I->isCommutableOperand(Op);
597}
598
599/// This is a helper function to check whether \p I is commutative.
600/// This is a convenience wrapper that calls the two-parameter version of
601/// isCommutative with the same instruction for both parameters. This is
602/// the common case where the instruction being checked for commutativity
603/// is the same as the instruction whose uses are analyzed for special
604/// patterns (see the two-parameter version above for details).
605/// \param I The instruction to check for commutativity
606/// \returns true if the instruction is commutative, false otherwise
607static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
608
609/// \returns number of operands of \p I, considering commutativity. Returns 2
610/// for commutative intrinsics.
611/// \param I The instruction to check for commutativity
614 // IntrinsicInst::isCommutative returns true if swapping the first "two"
615 // arguments to the intrinsic produces the same result.
616 constexpr unsigned IntrinsicNumOperands = 2;
617 return IntrinsicNumOperands;
618 }
619 return I->getNumOperands();
620}
621
622template <typename T>
623static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
624 unsigned Offset) {
625 static_assert(std::is_same_v<T, InsertElementInst> ||
626 std::is_same_v<T, ExtractElementInst>,
627 "unsupported T");
628 int Index = Offset;
629 if (const auto *IE = dyn_cast<T>(Inst)) {
630 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
631 if (!VT)
632 return std::nullopt;
633 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
634 if (!CI)
635 return std::nullopt;
636 if (CI->getValue().uge(VT->getNumElements()))
637 return std::nullopt;
638 Index *= VT->getNumElements();
639 Index += CI->getZExtValue();
640 return Index;
641 }
642 return std::nullopt;
643}
644
645/// \returns inserting or extracting index of InsertElement, ExtractElement or
646/// InsertValue instruction, using Offset as base offset for index.
647/// \returns std::nullopt if the index is not an immediate.
648static std::optional<unsigned> getElementIndex(const Value *Inst,
649 unsigned Offset = 0) {
650 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
651 return Index;
653 return Index;
654
655 int Index = Offset;
656
657 const auto *IV = dyn_cast<InsertValueInst>(Inst);
658 if (!IV)
659 return std::nullopt;
660
661 Type *CurrentType = IV->getType();
662 for (unsigned I : IV->indices()) {
663 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
664 Index *= ST->getNumElements();
665 CurrentType = ST->getElementType(I);
666 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
667 Index *= AT->getNumElements();
668 CurrentType = AT->getElementType();
669 } else {
670 return std::nullopt;
671 }
672 Index += I;
673 }
674 return Index;
675}
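// For illustration:
//   insertelement <4 x float> %v, float %f, i32 2    --> index 2
//   insertvalue [2 x [3 x i32]] %agg, i32 %x, 1, 2   --> index
//     (0 * 2 + 1) * 3 + 2 = 5 after flattening the nested aggregate
//   insertelement with a non-constant index          --> std::nullopt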
676
677/// \returns true if all of the values in \p VL use the same opcode.
678/// For comparison instructions, also checks if predicates match.
679/// PoisonValues are considered matching.
680/// Interchangeable instructions are not considered.
682 auto *It = find_if(VL, IsaPred<Instruction>);
683 if (It == VL.end())
684 return true;
685 Instruction *MainOp = cast<Instruction>(*It);
686 unsigned Opcode = MainOp->getOpcode();
687 bool IsCmpOp = isa<CmpInst>(MainOp);
688 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
690 return std::all_of(It, VL.end(), [&](Value *V) {
691 if (auto *CI = dyn_cast<CmpInst>(V))
692 return BasePred == CI->getPredicate();
693 if (auto *I = dyn_cast<Instruction>(V))
694 return I->getOpcode() == Opcode;
695 return isa<PoisonValue>(V);
696 });
697}
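// For illustration: {add, add, poison, add} passes this check; {add, sub} does
// not; two compares pass only if they use exactly the same predicate.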
698
699namespace {
700/// Specifies the way the mask should be analyzed for undefs/poisonous elements
701/// in the shuffle mask.
702enum class UseMask {
703 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
704 ///< check for the mask elements for the first argument (mask
705 ///< indices are in range [0:VF)).
706 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
707 ///< for the mask elements for the second argument (mask indices
708 ///< are in range [VF:2*VF))
709 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
710 ///< future shuffle elements and mark them as ones as being used
711 ///< in future. Non-undef elements are considered as unused since
712 ///< they're already marked as used in the mask.
713};
714} // namespace
715
716/// Prepares a use bitset for the given mask either for the first argument or
717/// for the second.
719 UseMask MaskArg) {
720 SmallBitVector UseMask(VF, true);
721 for (auto [Idx, Value] : enumerate(Mask)) {
722 if (Value == PoisonMaskElem) {
723 if (MaskArg == UseMask::UndefsAsMask)
724 UseMask.reset(Idx);
725 continue;
726 }
727 if (MaskArg == UseMask::FirstArg && Value < VF)
728 UseMask.reset(Value);
729 else if (MaskArg == UseMask::SecondArg && Value >= VF)
730 UseMask.reset(Value - VF);
731 }
732 return UseMask;
733}
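// For illustration: with VF = 4 and Mask = <0, 5, poison, 3>:
//   FirstArg     clears bits 0 and 3 (lanes of the first vector that are used),
//   SecondArg    clears bit 1 (5 - VF), and
//   UndefsAsMask clears bit 2 (the poison mask position).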
734
735/// Checks if the given value is actually an undefined constant vector.
736/// Also, if the \p UseMask is not empty, tries to check if the non-masked
737/// elements actually mask the insertelement buildvector, if any.
738template <bool IsPoisonOnly = false>
740 const SmallBitVector &UseMask = {}) {
741 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
742 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
743 if (isa<T>(V))
744 return Res;
745 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
746 if (!VecTy)
747 return Res.reset();
748 auto *C = dyn_cast<Constant>(V);
749 if (!C) {
750 if (!UseMask.empty()) {
751 const Value *Base = V;
752 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
753 Base = II->getOperand(0);
754 if (isa<T>(II->getOperand(1)))
755 continue;
756 std::optional<unsigned> Idx = getElementIndex(II);
757 if (!Idx) {
758 Res.reset();
759 return Res;
760 }
761 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
762 Res.reset(*Idx);
763 }
764 // TODO: Add analysis for shuffles here too.
765 if (V == Base) {
766 Res.reset();
767 } else {
768 SmallBitVector SubMask(UseMask.size(), false);
769 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
770 }
771 } else {
772 Res.reset();
773 }
774 return Res;
775 }
776 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
777 if (Constant *Elem = C->getAggregateElement(I))
778 if (!isa<T>(Elem) &&
779 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
780 Res.reset(I);
781 }
782 return Res;
783}
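// For illustration: with an all-false 4-bit UseMask (every lane relevant), the
// constant vector <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison> yields
// a result with bit 1 cleared (the defined element) and bits 0, 2 and 3 set.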
784
785/// Checks if the vector of instructions can be represented as a shuffle, like:
786/// %x0 = extractelement <4 x i8> %x, i32 0
787/// %x3 = extractelement <4 x i8> %x, i32 3
788/// %y1 = extractelement <4 x i8> %y, i32 1
789/// %y2 = extractelement <4 x i8> %y, i32 2
790/// %x0x0 = mul i8 %x0, %x0
791/// %x3x3 = mul i8 %x3, %x3
792/// %y1y1 = mul i8 %y1, %y1
793/// %y2y2 = mul i8 %y2, %y2
794/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
795/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
796/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
797/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
798/// ret <4 x i8> %ins4
799/// can be transformed into:
800/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
801/// i32 6>
802/// %2 = mul <4 x i8> %1, %1
803/// ret <4 x i8> %2
804/// Mask will return the Shuffle Mask equivalent to the extracted elements.
805/// TODO: Can we split off and reuse the shuffle mask detection from
806/// ShuffleVectorInst/getShuffleCost?
807static std::optional<TargetTransformInfo::ShuffleKind>
809 AssumptionCache *AC) {
810 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
811 if (It == VL.end())
812 return std::nullopt;
813 unsigned Size =
814 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
815 auto *EI = dyn_cast<ExtractElementInst>(V);
816 if (!EI)
817 return S;
818 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
819 if (!VTy)
820 return S;
821 return std::max(S, VTy->getNumElements());
822 });
823
824 Value *Vec1 = nullptr;
825 Value *Vec2 = nullptr;
826 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
827 auto *EE = dyn_cast<ExtractElementInst>(V);
828 if (!EE)
829 return false;
830 Value *Vec = EE->getVectorOperand();
831 if (isa<UndefValue>(Vec))
832 return false;
833 return isGuaranteedNotToBePoison(Vec, AC);
834 });
835 enum ShuffleMode { Unknown, Select, Permute };
836 ShuffleMode CommonShuffleMode = Unknown;
837 Mask.assign(VL.size(), PoisonMaskElem);
838 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
839 // Undef can be represented as an undef element in a vector.
840 if (isa<UndefValue>(VL[I]))
841 continue;
842 auto *EI = cast<ExtractElementInst>(VL[I]);
843 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
844 return std::nullopt;
845 auto *Vec = EI->getVectorOperand();
846 // We can extractelement from undef or poison vector.
848 continue;
849 // All vector operands must have the same number of vector elements.
850 if (isa<UndefValue>(Vec)) {
851 Mask[I] = I;
852 } else {
853 if (isa<UndefValue>(EI->getIndexOperand()))
854 continue;
855 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
856 if (!Idx)
857 return std::nullopt;
858 // Undefined behavior if Idx is negative or >= Size.
859 if (Idx->getValue().uge(Size))
860 continue;
861 unsigned IntIdx = Idx->getValue().getZExtValue();
862 Mask[I] = IntIdx;
863 }
864 if (isUndefVector(Vec).all() && HasNonUndefVec)
865 continue;
866 // For correct shuffling we have to have at most 2 different vector operands
867 // in all extractelement instructions.
868 if (!Vec1 || Vec1 == Vec) {
869 Vec1 = Vec;
870 } else if (!Vec2 || Vec2 == Vec) {
871 Vec2 = Vec;
872 Mask[I] += Size;
873 } else {
874 return std::nullopt;
875 }
876 if (CommonShuffleMode == Permute)
877 continue;
878 // If the extract index is not the same as the operation number, it is a
879 // permutation.
880 if (Mask[I] % Size != I) {
881 CommonShuffleMode = Permute;
882 continue;
883 }
884 CommonShuffleMode = Select;
885 }
886 // If we're not crossing lanes in different vectors, consider it as blending.
887 if (CommonShuffleMode == Select && Vec2)
889 // If Vec2 was never used, we have a permutation of a single vector, otherwise
890 // we have permutation of 2 vectors.
893}
894
895/// \returns True if Extract{Value,Element} instruction extracts element Idx.
896static std::optional<unsigned> getExtractIndex(const Instruction *E) {
897 unsigned Opcode = E->getOpcode();
898 assert((Opcode == Instruction::ExtractElement ||
899 Opcode == Instruction::ExtractValue) &&
900 "Expected extractelement or extractvalue instruction.");
901 if (Opcode == Instruction::ExtractElement) {
902 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
903 if (!CI)
904 return std::nullopt;
905 // Check if the index is out of bound - we can get the source vector from
906 // operand 0
907 unsigned Idx = CI->getZExtValue();
908 auto *EE = cast<ExtractElementInst>(E);
909 const unsigned VF = ::getNumElements(EE->getVectorOperandType());
910 if (Idx >= VF)
911 return std::nullopt;
912 return Idx;
913 }
914 auto *EI = cast<ExtractValueInst>(E);
915 if (EI->getNumIndices() != 1)
916 return std::nullopt;
917 return *EI->idx_begin();
918}
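// For illustration:
//   extractelement <4 x i32> %v, i32 2   --> 2
//   extractelement <4 x i32> %v, i32 7   --> std::nullopt (out of bounds)
//   extractvalue {i32, i64} %s, 1        --> 1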
919
920/// Checks if the provided value does not require scheduling. It does not
921/// require scheduling if this is not an instruction or it is an instruction
922/// that does not read/write memory and all operands are either not instructions
923/// or phi nodes or instructions from different blocks.
924static bool areAllOperandsNonInsts(Value *V);
925/// Checks if the provided value does not require scheduling. It does not
926/// require scheduling if this is not an instruction or it is an instruction
927/// that does not read/write memory and all users are phi nodes or instructions
928/// from the different blocks.
929static bool isUsedOutsideBlock(Value *V);
930/// Checks if the specified value does not require scheduling. It does not
931/// require scheduling if all operands and all users do not need to be scheduled
932/// in the current basic block.
933static bool doesNotNeedToBeScheduled(Value *V);
934
935/// \returns true if \p Opcode is allowed as part of the main/alternate
936/// instruction for SLP vectorization.
937///
938/// Example of unsupported opcode is SDIV that can potentially cause UB if the
939/// "shuffled out" lane would result in division by zero.
940static bool isValidForAlternation(unsigned Opcode) {
941 return !Instruction::isIntDivRem(Opcode);
942}
943
944namespace {
945
946/// Helper class that determines whether VL can use the same opcode.
947/// Alternate instructions are supported. In addition, it supports
948/// interchangeable instructions. An interchangeable instruction is an
949/// instruction that can be converted to another instruction with the same
950/// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
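/// For illustration, the bundle {x << 1, y * 4} can be brought to a single
/// opcode by rewriting x << 1 as x * 2 (or, equivalently, y * 4 as y << 2).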
951class BinOpSameOpcodeHelper {
952 using MaskType = std::uint_fast16_t;
953 /// Keep SupportedOp sorted because it is used by binary_search.
954 constexpr static std::initializer_list<unsigned> SupportedOp = {
955 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
956 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
957 enum : MaskType {
958 ShlBIT = 0b1,
959 AShrBIT = 0b10,
960 MulBIT = 0b100,
961 AddBIT = 0b1000,
962 SubBIT = 0b10000,
963 AndBIT = 0b100000,
964 OrBIT = 0b1000000,
965 XorBIT = 0b10000000,
966 MainOpBIT = 0b100000000,
968 };
969 /// Return a non-nullptr if either operand of I is a ConstantInt.
970 /// The second return value represents the operand position. We check the
971 /// right-hand side first (1). If the right hand side is not a ConstantInt and
972 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
973 /// side (0).
974 static std::pair<ConstantInt *, unsigned>
975 isBinOpWithConstantInt(const Instruction *I) {
976 unsigned Opcode = I->getOpcode();
977 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
978 (void)SupportedOp;
979 auto *BinOp = cast<BinaryOperator>(I);
980 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
981 return {CI, 1};
982 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
983 Opcode == Instruction::AShr)
984 return {nullptr, 0};
985 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
986 return {CI, 0};
987 return {nullptr, 0};
988 }
989 struct InterchangeableInfo {
990 const Instruction *I = nullptr;
991 /// Each set bit represents an opcode that MainOp can be converted to.
992 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
993 MulBIT | AShrBIT | ShlBIT;
994 /// We cannot create an interchangeable instruction that does not exist in
995 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
996 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
997 /// 1]. SeenBefore is used to know what operations have been seen before.
998 MaskType SeenBefore = 0;
999 InterchangeableInfo(const Instruction *I) : I(I) {}
1000 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
1001 /// instruction. Directly setting the mask will destroy the mask state,
1002 /// preventing us from determining which instruction it should convert to.
1003 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
1004 if (Mask & InterchangeableMask) {
1005 SeenBefore |= OpcodeInMaskForm;
1006 Mask &= InterchangeableMask;
1007 return true;
1008 }
1009 return false;
1010 }
1011 bool equal(unsigned Opcode) {
1012 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1013 }
1014 unsigned getOpcode() const {
1015 MaskType Candidate = Mask & SeenBefore;
1016 if (Candidate & MainOpBIT)
1017 return I->getOpcode();
1018 if (Candidate & ShlBIT)
1019 return Instruction::Shl;
1020 if (Candidate & AShrBIT)
1021 return Instruction::AShr;
1022 if (Candidate & MulBIT)
1023 return Instruction::Mul;
1024 if (Candidate & AddBIT)
1025 return Instruction::Add;
1026 if (Candidate & SubBIT)
1027 return Instruction::Sub;
1028 if (Candidate & AndBIT)
1029 return Instruction::And;
1030 if (Candidate & OrBIT)
1031 return Instruction::Or;
1032 if (Candidate & XorBIT)
1033 return Instruction::Xor;
1034 llvm_unreachable("Cannot find interchangeable instruction.");
1035 }
1036
1037 /// Return true if the instruction can be converted to \p Opcode.
1038 bool hasCandidateOpcode(unsigned Opcode) const {
1039 MaskType Candidate = Mask & SeenBefore;
1040 switch (Opcode) {
1041 case Instruction::Shl:
1042 return Candidate & ShlBIT;
1043 case Instruction::AShr:
1044 return Candidate & AShrBIT;
1045 case Instruction::Mul:
1046 return Candidate & MulBIT;
1047 case Instruction::Add:
1048 return Candidate & AddBIT;
1049 case Instruction::Sub:
1050 return Candidate & SubBIT;
1051 case Instruction::And:
1052 return Candidate & AndBIT;
1053 case Instruction::Or:
1054 return Candidate & OrBIT;
1055 case Instruction::Xor:
1056 return Candidate & XorBIT;
1057 case Instruction::LShr:
1058 case Instruction::FAdd:
1059 case Instruction::FSub:
1060 case Instruction::FMul:
1061 case Instruction::SDiv:
1062 case Instruction::UDiv:
1063 case Instruction::FDiv:
1064 case Instruction::SRem:
1065 case Instruction::URem:
1066 case Instruction::FRem:
1067 return false;
1068 default:
1069 break;
1070 }
1071 llvm_unreachable("Cannot find interchangeable instruction.");
1072 }
1073
1074 SmallVector<Value *> getOperand(const Instruction *To) const {
1075 unsigned ToOpcode = To->getOpcode();
1076 unsigned FromOpcode = I->getOpcode();
1077 if (FromOpcode == ToOpcode)
1078 return SmallVector<Value *>(I->operands());
1079 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1080 auto [CI, Pos] = isBinOpWithConstantInt(I);
1081 const APInt &FromCIValue = CI->getValue();
1082 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1083 APInt ToCIValue;
1084 switch (FromOpcode) {
1085 case Instruction::Shl:
1086 if (ToOpcode == Instruction::Mul) {
1087 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1088 FromCIValue.getZExtValue());
1089 } else {
1090 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1091 ToCIValue = ToOpcode == Instruction::And
1092 ? APInt::getAllOnes(FromCIValueBitWidth)
1093 : APInt::getZero(FromCIValueBitWidth);
1094 }
1095 break;
1096 case Instruction::Mul:
1097 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1098 if (ToOpcode == Instruction::Shl) {
1099 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1100 } else {
1101 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1102 ToCIValue = ToOpcode == Instruction::And
1103 ? APInt::getAllOnes(FromCIValueBitWidth)
1104 : APInt::getZero(FromCIValueBitWidth);
1105 }
1106 break;
1107 case Instruction::Add:
1108 case Instruction::Sub:
1109 if (FromCIValue.isZero()) {
1110 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1111 } else {
1112 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1113 "Cannot convert the instruction.");
1114 ToCIValue = FromCIValue;
1115 ToCIValue.negate();
1116 }
1117 break;
1118 case Instruction::And:
1119 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1120 ToCIValue = ToOpcode == Instruction::Mul
1121 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1122 : APInt::getZero(FromCIValueBitWidth);
1123 break;
1124 default:
1125 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1126 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1127 break;
1128 }
1129 Value *LHS = I->getOperand(1 - Pos);
1130 Constant *RHS =
1131 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1132 // constant + x cannot be -constant - x
1133 // instead, it should be x - -constant
1134 if (Pos == 1 ||
1135 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1136 FromOpcode == Instruction::Xor) &&
1137 ToOpcode == Instruction::Sub))
1138 return SmallVector<Value *>({LHS, RHS});
1139 return SmallVector<Value *>({RHS, LHS});
1140 }
1141 };
1142 InterchangeableInfo MainOp;
1143 InterchangeableInfo AltOp;
1144 bool isValidForAlternation(const Instruction *I) const {
1145 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1146 ::isValidForAlternation(I->getOpcode());
1147 }
1148 bool initializeAltOp(const Instruction *I) {
1149 if (AltOp.I)
1150 return true;
1152 return false;
1153 AltOp.I = I;
1154 return true;
1155 }
1156
1157public:
1158 BinOpSameOpcodeHelper(const Instruction *MainOp,
1159 const Instruction *AltOp = nullptr)
1160 : MainOp(MainOp), AltOp(AltOp) {
1161 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1162 }
1163 bool add(const Instruction *I) {
1165 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1166 unsigned Opcode = I->getOpcode();
1167 MaskType OpcodeInMaskForm;
1168 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1169 switch (Opcode) {
1170 case Instruction::Shl:
1171 OpcodeInMaskForm = ShlBIT;
1172 break;
1173 case Instruction::AShr:
1174 OpcodeInMaskForm = AShrBIT;
1175 break;
1176 case Instruction::Mul:
1177 OpcodeInMaskForm = MulBIT;
1178 break;
1179 case Instruction::Add:
1180 OpcodeInMaskForm = AddBIT;
1181 break;
1182 case Instruction::Sub:
1183 OpcodeInMaskForm = SubBIT;
1184 break;
1185 case Instruction::And:
1186 OpcodeInMaskForm = AndBIT;
1187 break;
1188 case Instruction::Or:
1189 OpcodeInMaskForm = OrBIT;
1190 break;
1191 case Instruction::Xor:
1192 OpcodeInMaskForm = XorBIT;
1193 break;
1194 default:
1195 return MainOp.equal(Opcode) ||
1196 (initializeAltOp(I) && AltOp.equal(Opcode));
1197 }
1198 MaskType InterchangeableMask = OpcodeInMaskForm;
1199 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1200 if (CI) {
1201 constexpr MaskType CanBeAll =
1202 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1203 const APInt &CIValue = CI->getValue();
1204 switch (Opcode) {
1205 case Instruction::Shl:
1206 if (CIValue.ult(CIValue.getBitWidth()))
1207 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1208 break;
1209 case Instruction::Mul:
1210 if (CIValue.isOne()) {
1211 InterchangeableMask = CanBeAll;
1212 break;
1213 }
1214 if (CIValue.isPowerOf2())
1215 InterchangeableMask = MulBIT | ShlBIT;
1216 break;
1217 case Instruction::Add:
1218 case Instruction::Sub:
1219 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1220 break;
1221 case Instruction::And:
1222 if (CIValue.isAllOnes())
1223 InterchangeableMask = CanBeAll;
1224 break;
1225 case Instruction::Xor:
1226 if (CIValue.isZero())
1227 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1228 break;
1229 default:
1230 if (CIValue.isZero())
1231 InterchangeableMask = CanBeAll;
1232 break;
1233 }
1234 }
1235 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1236 (initializeAltOp(I) &&
1237 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1238 }
1239 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1240 /// Checks if the list of potential opcodes includes \p Opcode.
1241 bool hasCandidateOpcode(unsigned Opcode) const {
1242 return MainOp.hasCandidateOpcode(Opcode);
1243 }
1244 bool hasAltOp() const { return AltOp.I; }
1245 unsigned getAltOpcode() const {
1246 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1247 }
1248 SmallVector<Value *> getOperand(const Instruction *I) const {
1249 return MainOp.getOperand(I);
1250 }
1251};
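// A typical usage sketch (the Bundle name below is illustrative only; every
// instruction added must be a BinaryOperator):
//   BinOpSameOpcodeHelper Helper(MainOp);
//   for (Instruction *I : Bundle)
//     if (!Helper.add(I))
//       return;                      // the bundle cannot share an opcode
//   unsigned MainOpcode = Helper.getMainOpcode();
//   unsigned AltOpcode = Helper.getAltOpcode(); // == MainOpcode without an alt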
1252
1253/// Main data required for vectorization of instructions.
1254class InstructionsState {
1255 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1256 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1257 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1258 /// isAltShuffle).
1259 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1260 /// from getMainAltOpsNoStateVL.
1261 /// For those InstructionsState that use alternate instructions, the resulting
1262 /// vectorized output ultimately comes from a shufflevector. For example,
1263 /// given a vector list (VL):
1264 /// VL[0] = add i32 a, e
1265 /// VL[1] = sub i32 b, f
1266 /// VL[2] = add i32 c, g
1267 /// VL[3] = sub i32 d, h
1268 /// The vectorized result would be:
1269 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1270 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1271 /// result = shufflevector <4 x i32> intermediated_0,
1272 /// <4 x i32> intermediated_1,
1273 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1274 /// Since shufflevector is used in the final result, when calculating the cost
1275 /// (getEntryCost), we must account for the usage of shufflevector in
1276 /// GetVectorCost.
1277 Instruction *MainOp = nullptr;
1278 Instruction *AltOp = nullptr;
1279 /// Whether the instruction state represents copyable instructions.
1280 bool HasCopyables = false;
1281
1282public:
1283 Instruction *getMainOp() const {
1284 assert(valid() && "InstructionsState is invalid.");
1285 return MainOp;
1286 }
1287
1288 Instruction *getAltOp() const {
1289 assert(valid() && "InstructionsState is invalid.");
1290 return AltOp;
1291 }
1292
1293 /// The main/alternate opcodes for the list of instructions.
1294 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1295
1296 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1297
1298 /// Some of the instructions in the list have alternate opcodes.
1299 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1300
1301 /// Checks if the instruction matches either the main or alternate opcode.
1302 /// \returns
1303 /// - MainOp if \p I matches MainOp's opcode directly or can be converted
1304 /// to it
1305 /// - AltOp if \p I matches AltOp's opcode directly or can be converted to
1306 /// it
1307 /// - nullptr if \p I cannot be matched or converted to either opcode
1308 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1309 assert(MainOp && "MainOp cannot be nullptr.");
1310 if (I->getOpcode() == MainOp->getOpcode())
1311 return MainOp;
1312 // Prefer AltOp instead of interchangeable instruction of MainOp.
1313 assert(AltOp && "AltOp cannot be nullptr.");
1314 if (I->getOpcode() == AltOp->getOpcode())
1315 return AltOp;
1316 if (!I->isBinaryOp())
1317 return nullptr;
1318 BinOpSameOpcodeHelper Converter(MainOp);
1319 if (!Converter.add(I) || !Converter.add(MainOp))
1320 return nullptr;
1321 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1322 BinOpSameOpcodeHelper AltConverter(AltOp);
1323 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1324 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1325 return AltOp;
1326 }
1327 if (Converter.hasAltOp() && !isAltShuffle())
1328 return nullptr;
1329 return Converter.hasAltOp() ? AltOp : MainOp;
1330 }
1331
1332 /// Checks if main/alt instructions are shift operations.
1333 bool isShiftOp() const {
1334 return getMainOp()->isShift() && getAltOp()->isShift();
1335 }
1336
1337 /// Checks if main/alt instructions are bitwise logic operations.
1338 bool isBitwiseLogicOp() const {
1339 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1340 }
1341
1342 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1343 bool isMulDivLikeOp() const {
1344 constexpr std::array<unsigned, 8> MulDiv = {
1345 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1346 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1347 Instruction::URem, Instruction::FRem};
1348 return is_contained(MulDiv, getOpcode()) &&
1349 is_contained(MulDiv, getAltOpcode());
1350 }
1351
1352 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1353 bool isAddSubLikeOp() const {
1354 constexpr std::array<unsigned, 4> AddSub = {
1355 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1356 Instruction::FSub};
1357 return is_contained(AddSub, getOpcode()) &&
1358 is_contained(AddSub, getAltOpcode());
1359 }
1360
1361 /// Checks if main/alt instructions are cmp operations.
1362 bool isCmpOp() const {
1363 return (getOpcode() == Instruction::ICmp ||
1364 getOpcode() == Instruction::FCmp) &&
1365 getAltOpcode() == getOpcode();
1366 }
1367
1368 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1369 bool valid() const { return MainOp && AltOp; }
1370
1371 explicit operator bool() const { return valid(); }
1372
1373 InstructionsState() = delete;
1374 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1375 bool HasCopyables = false)
1376 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1377 static InstructionsState invalid() { return {nullptr, nullptr}; }
1378
1379 /// Checks if the value is a copyable element.
1380 bool isCopyableElement(Value *V) const {
1381 assert(valid() && "InstructionsState is invalid.");
1382 if (!HasCopyables)
1383 return false;
1384 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1385 return false;
1386 auto *I = dyn_cast<Instruction>(V);
1387 if (!I)
1388 return !isa<PoisonValue>(V);
1389 if (I->getParent() != MainOp->getParent() &&
1392 return true;
1393 if (I->getOpcode() == MainOp->getOpcode())
1394 return false;
1395 if (!I->isBinaryOp())
1396 return true;
1397 BinOpSameOpcodeHelper Converter(MainOp);
1398 return !Converter.add(I) || !Converter.add(MainOp) ||
1399 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1400 }
1401
1402 /// Checks if the value is non-schedulable.
1403 bool isNonSchedulable(Value *V) const {
1404 assert(valid() && "InstructionsState is invalid.");
1405 auto *I = dyn_cast<Instruction>(V);
1406 if (!HasCopyables)
1407 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1409 // MainOp for copyables is always schedulable, to correctly identify
1410 // non-schedulable copyables.
1411 if (getMainOp() == V)
1412 return false;
1413 if (isCopyableElement(V)) {
1414 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1415 auto *I = dyn_cast<Instruction>(V);
1416 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1418 // If the copyable instruction comes after MainOp
1419 // (non-schedulable, but used in the block), we cannot vectorize
1420 // it, as it may generate a use before def.
1421 !MainOp->comesBefore(I));
1422 };
1423
1424 return IsNonSchedulableCopyableElement(V);
1425 }
1426 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1428 }
1429
1430 /// Checks if the state represents copyable instructions.
1431 bool areInstructionsWithCopyableElements() const {
1432 assert(valid() && "InstructionsState is invalid.");
1433 return HasCopyables;
1434 }
1435};
1436
1437std::pair<Instruction *, SmallVector<Value *>>
1438convertTo(Instruction *I, const InstructionsState &S) {
1439 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1440 assert(SelectedOp && "Cannot convert the instruction.");
1441 if (I->isBinaryOp()) {
1442 BinOpSameOpcodeHelper Converter(I);
1443 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1444 }
1445 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1446}
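// For illustration: if S's main opcode is Or, then convertTo applied to
//   %a = add i32 %x, 0
// yields the Or main instruction as the selected opcode with operands {%x, 0},
// i.e. %a is effectively treated as "%x | 0".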
1447
1448} // end anonymous namespace
1449
1450static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1451 const TargetLibraryInfo &TLI);
1452
1453/// Find an instruction with a specific opcode in VL.
1454/// \param VL Array of values to search through. Must contain only Instructions
1455/// and PoisonValues.
1456/// \param Opcode The instruction opcode to search for
1457/// \returns
1458/// - The first instruction found with matching opcode
1459/// - nullptr if no matching instruction is found
1461 unsigned Opcode) {
1462 for (Value *V : VL) {
1463 if (isa<PoisonValue>(V))
1464 continue;
1465 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1466 auto *Inst = cast<Instruction>(V);
1467 if (Inst->getOpcode() == Opcode)
1468 return Inst;
1469 }
1470 return nullptr;
1471}
1472
1473/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1474/// compatible instructions or constants, or just some other regular values.
1475static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1476 Value *Op1, const TargetLibraryInfo &TLI) {
1477 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1478 (isConstant(BaseOp1) && isConstant(Op1)) ||
1479 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1480 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1481 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1482 getSameOpcode({BaseOp0, Op0}, TLI) ||
1483 getSameOpcode({BaseOp1, Op1}, TLI);
1484}
1485
1486/// \returns true if a compare instruction \p CI has similar "look" and
1487/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1488/// swapped, false otherwise.
1489static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1490 const TargetLibraryInfo &TLI) {
1491 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1492 "Assessing comparisons of different types?");
1493 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1494 CmpInst::Predicate Pred = CI->getPredicate();
1496
1497 Value *BaseOp0 = BaseCI->getOperand(0);
1498 Value *BaseOp1 = BaseCI->getOperand(1);
1499 Value *Op0 = CI->getOperand(0);
1500 Value *Op1 = CI->getOperand(1);
1501
1502 return (BasePred == Pred &&
1503 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1504 (BasePred == SwappedPred &&
1505 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1506}
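// For illustration: "icmp sgt i32 %a, %b" is considered the same as
// "icmp slt i32 %b, %a", since swapping both the predicate and the operands
// gives back the base compare.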
1507
1508/// \returns analysis of the Instructions in \p VL described in
1509/// InstructionsState, i.e. the Opcode with which we suppose the whole list
1510/// could be vectorized, even if its structure is diverse.
1511static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1512 const TargetLibraryInfo &TLI) {
1513 // Make sure these are all Instructions.
1515 return InstructionsState::invalid();
1516
1517 auto *It = find_if(VL, IsaPred<Instruction>);
1518 if (It == VL.end())
1519 return InstructionsState::invalid();
1520
1521 Instruction *MainOp = cast<Instruction>(*It);
1522 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1523 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1524 (VL.size() == 2 && InstCnt < 2))
1525 return InstructionsState::invalid();
1526
1527 bool IsCastOp = isa<CastInst>(MainOp);
1528 bool IsBinOp = isa<BinaryOperator>(MainOp);
1529 bool IsCmpOp = isa<CmpInst>(MainOp);
1530 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1532 Instruction *AltOp = MainOp;
1533 unsigned Opcode = MainOp->getOpcode();
1534 unsigned AltOpcode = Opcode;
1535
1536 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1537 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1538 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1539 UniquePreds.insert(BasePred);
1540 UniqueNonSwappedPreds.insert(BasePred);
1541 for (Value *V : VL) {
1542 auto *I = dyn_cast<CmpInst>(V);
1543 if (!I)
1544 return false;
1545 CmpInst::Predicate CurrentPred = I->getPredicate();
1546 CmpInst::Predicate SwappedCurrentPred =
1547 CmpInst::getSwappedPredicate(CurrentPred);
1548 UniqueNonSwappedPreds.insert(CurrentPred);
1549 if (!UniquePreds.contains(CurrentPred) &&
1550 !UniquePreds.contains(SwappedCurrentPred))
1551 UniquePreds.insert(CurrentPred);
1552 }
1553 // If the total number of predicates is > 2, but only 2 remain once swapped
1554 // predicates are considered compatible, treat swappable predicates as
1555 // compatible opcodes, not alternates.
1556 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1557 }();
1558 // Check for one alternate opcode from another BinaryOperator.
1559 // TODO - generalize to support all operators (types, calls etc.).
1560 Intrinsic::ID BaseID = 0;
1561 SmallVector<VFInfo> BaseMappings;
1562 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1563 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1564 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1565 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1566 return InstructionsState::invalid();
1567 }
1568 bool AnyPoison = InstCnt != VL.size();
1569 // Check MainOp too to be sure that it matches the requirements for the
1570 // instructions.
1571 for (Value *V : iterator_range(It, VL.end())) {
1572 auto *I = dyn_cast<Instruction>(V);
1573 if (!I)
1574 continue;
1575
1576 // Cannot combine poison and divisions.
1577 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1578 // intrinsics/functions only.
1579 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1580 return InstructionsState::invalid();
1581 unsigned InstOpcode = I->getOpcode();
1582 if (IsBinOp && isa<BinaryOperator>(I)) {
1583 if (BinOpHelper.add(I))
1584 continue;
1585 } else if (IsCastOp && isa<CastInst>(I)) {
1586 Value *Op0 = MainOp->getOperand(0);
1587 Type *Ty0 = Op0->getType();
1588 Value *Op1 = I->getOperand(0);
1589 Type *Ty1 = Op1->getType();
1590 if (Ty0 == Ty1) {
1591 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1592 continue;
1593 if (Opcode == AltOpcode) {
1594 assert(isValidForAlternation(Opcode) &&
1595 isValidForAlternation(InstOpcode) &&
1596 "Cast isn't safe for alternation, logic needs to be updated!");
1597 AltOpcode = InstOpcode;
1598 AltOp = I;
1599 continue;
1600 }
1601 }
1602 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1603 auto *BaseInst = cast<CmpInst>(MainOp);
1604 Type *Ty0 = BaseInst->getOperand(0)->getType();
1605 Type *Ty1 = Inst->getOperand(0)->getType();
1606 if (Ty0 == Ty1) {
1607 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1608 assert(InstOpcode == AltOpcode &&
1609 "Alternate instructions are only supported by BinaryOperator "
1610 "and CastInst.");
1611 // Check for compatible operands. If the corresponding operands are not
1612 // compatible - need to perform alternate vectorization.
1613 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1614 CmpInst::Predicate SwappedCurrentPred =
1615 CmpInst::getSwappedPredicate(CurrentPred);
1616
1617 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1618 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1619 continue;
1620
1621 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1622 continue;
1623 auto *AltInst = cast<CmpInst>(AltOp);
1624 if (MainOp != AltOp) {
1625 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1626 continue;
1627 } else if (BasePred != CurrentPred) {
1628 assert(
1629 isValidForAlternation(InstOpcode) &&
1630 "CmpInst isn't safe for alternation, logic needs to be updated!");
1631 AltOp = I;
1632 continue;
1633 }
1634 CmpInst::Predicate AltPred = AltInst->getPredicate();
1635 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1636 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1637 continue;
1638 }
1639 } else if (InstOpcode == Opcode) {
1640 assert(InstOpcode == AltOpcode &&
1641 "Alternate instructions are only supported by BinaryOperator and "
1642 "CastInst.");
1643 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1644 if (Gep->getNumOperands() != 2 ||
1645 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1646 return InstructionsState::invalid();
1647 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1648 if (!isVectorLikeInstWithConstOps(EI))
1649 return InstructionsState::invalid();
1650 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1651 auto *BaseLI = cast<LoadInst>(MainOp);
1652 if (!LI->isSimple() || !BaseLI->isSimple())
1653 return InstructionsState::invalid();
1654 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1655 auto *CallBase = cast<CallInst>(MainOp);
1656 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1657 return InstructionsState::invalid();
1658 if (Call->hasOperandBundles() &&
1659 (!CallBase->hasOperandBundles() ||
1660 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1661 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1662 CallBase->op_begin() +
1663 CallBase->getBundleOperandsStartIndex())))
1664 return InstructionsState::invalid();
1665 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1666 if (ID != BaseID)
1667 return InstructionsState::invalid();
1668 if (!ID) {
1669 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1670 if (Mappings.size() != BaseMappings.size() ||
1671 Mappings.front().ISA != BaseMappings.front().ISA ||
1672 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1673 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1674 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1675 Mappings.front().Shape.Parameters !=
1676 BaseMappings.front().Shape.Parameters)
1677 return InstructionsState::invalid();
1678 }
1679 }
1680 continue;
1681 }
1682 return InstructionsState::invalid();
1683 }
1684
1685 if (IsBinOp) {
1686 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1687 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1688 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1689 assert(MainOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1690 }
1691 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1692 "Incorrect implementation of allSameOpcode.");
1693 InstructionsState S(MainOp, AltOp);
1694 assert(all_of(VL,
1695 [&](Value *V) {
1696 return isa<PoisonValue>(V) ||
1697 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1698 }) &&
1699 "Invalid InstructionsState.");
1700 return S;
1701}
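// For illustration: a bundle such as {add, sub, add, sub} yields an
// InstructionsState whose MainOp is one of the adds and whose AltOp is one of
// the subs (an alternate-opcode shuffle), while {add, add, add, add} yields
// MainOp == AltOp. Mixing unrelated opcodes, e.g. a load among the adds,
// makes the bundle non-vectorizable and InstructionsState::invalid() is
// returned.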
1702
1703/// \returns true if all of the values in \p VL have the same type or false
1704/// otherwise.
1705 static bool allSameType(ArrayRef<Value *> VL) {
1706 Type *Ty = VL.consume_front()->getType();
1707 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1708}
1709
1710/// \returns True if in-tree use also needs extract. This refers to
1711/// possible scalar operand in vectorized instruction.
1712static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1713 TargetLibraryInfo *TLI,
1714 const TargetTransformInfo *TTI) {
1715 if (!UserInst)
1716 return false;
1717 unsigned Opcode = UserInst->getOpcode();
1718 switch (Opcode) {
1719 case Instruction::Load: {
1720 LoadInst *LI = cast<LoadInst>(UserInst);
1721 return (LI->getPointerOperand() == Scalar);
1722 }
1723 case Instruction::Store: {
1724 StoreInst *SI = cast<StoreInst>(UserInst);
1725 return (SI->getPointerOperand() == Scalar);
1726 }
1727 case Instruction::Call: {
1728 CallInst *CI = cast<CallInst>(UserInst);
1729 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1730 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1731 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1732 Arg.value().get() == Scalar;
1733 });
1734 }
1735 default:
1736 return false;
1737 }
1738}
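// For example, if the vectorized user is "store i32 %v, ptr %p" and the
// in-tree scalar is %p, the pointer operand stays scalar, so an extract is
// needed (returns true); if the in-tree scalar is the stored value %v, it is
// fed from the vector itself and no extract is required here (returns false).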
1739
1740/// \returns the AA location that is being accessed by the instruction.
1741 static MemoryLocation getLocation(Instruction *I) {
1742 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1743 return MemoryLocation::get(SI);
1744 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1745 return MemoryLocation::get(LI);
1746 return MemoryLocation();
1747}
1748
1749/// \returns True if the instruction is not a volatile or atomic load/store.
1750static bool isSimple(Instruction *I) {
1751 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1752 return LI->isSimple();
1753 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1754 return SI->isSimple();
1755 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1756 return !MI->isVolatile();
1757 return true;
1758}
1759
1760/// Shuffles \p Mask in accordance with the given \p SubMask.
1761/// \param ExtendingManyInputs If true, the mask may be reshuffled with two
1762/// input vectors rather than only one.
1763static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1764 bool ExtendingManyInputs = false) {
1765 if (SubMask.empty())
1766 return;
1767 assert(
1768 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1769 // Check if input scalars were extended to match the size of other node.
1770 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1771 "SubMask with many inputs support must be larger than the mask.");
1772 if (Mask.empty()) {
1773 Mask.append(SubMask.begin(), SubMask.end());
1774 return;
1775 }
1776 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1777 int TermValue = std::min(Mask.size(), SubMask.size());
1778 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1779 if (SubMask[I] == PoisonMaskElem ||
1780 (!ExtendingManyInputs &&
1781 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1782 continue;
1783 NewMask[I] = Mask[SubMask[I]];
1784 }
1785 Mask.swap(NewMask);
1786}
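// Worked example: with Mask = {3, 2, 1, 0} and SubMask = {1, 3, poison, 0}
// (and ExtendingManyInputs == false), the composed result is
// NewMask = {Mask[1], Mask[3], poison, Mask[0]} = {2, 0, poison, 3}.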
1787
1788/// Order may contain elements assigned the special value (size), which is out
1789/// of bounds. Such indices appear only at positions that correspond to undef
1790/// values (see canReuseExtract for details) and are used so that undef values
1791/// do not affect the operand ordering.
1792/// The first loop below simply finds all unused indices and then the next loop
1793/// nest assigns these indices for undef values positions.
1794/// As an example below Order has two undef positions and they have assigned
1795/// values 3 and 7 respectively:
1796/// before: 6 9 5 4 9 2 1 0
1797/// after: 6 3 5 4 7 2 1 0
1798 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1799 const size_t Sz = Order.size();
1800 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1801 SmallBitVector MaskedIndices(Sz);
1802 for (unsigned I = 0; I < Sz; ++I) {
1803 if (Order[I] < Sz)
1804 UnusedIndices.reset(Order[I]);
1805 else
1806 MaskedIndices.set(I);
1807 }
1808 if (MaskedIndices.none())
1809 return;
1810 assert(UnusedIndices.count() == MaskedIndices.count() &&
1811 "Non-synced masked/available indices.");
1812 int Idx = UnusedIndices.find_first();
1813 int MIdx = MaskedIndices.find_first();
1814 while (MIdx >= 0) {
1815 assert(Idx >= 0 && "Indices must be synced.");
1816 Order[MIdx] = Idx;
1817 Idx = UnusedIndices.find_next(Idx);
1818 MIdx = MaskedIndices.find_next(MIdx);
1819 }
1820}
1821
1822/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1823/// Opcode1.
1824 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1825 unsigned Opcode0, unsigned Opcode1) {
1826 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1827 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1828 for (unsigned Lane : seq<unsigned>(VL.size())) {
1829 if (isa<PoisonValue>(VL[Lane]))
1830 continue;
1831 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1832 OpcodeMask.set(Lane * ScalarTyNumElements,
1833 Lane * ScalarTyNumElements + ScalarTyNumElements);
1834 }
1835 return OpcodeMask;
1836}
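// For example, for VL = {add, sub, add, sub}, ScalarTy = i32 (one element per
// scalar), Opcode0 = Instruction::Add and Opcode1 = Instruction::Sub, the
// resulting bitset is {0, 1, 0, 1}: bit I is set iff VL[I] uses Opcode1.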
1837
1838/// Replicates the given \p Val \p VF times.
1840 unsigned VF) {
1841 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1842 "Expected scalar constants.");
1843 SmallVector<Constant *> NewVal(Val.size() * VF);
1844 for (auto [I, V] : enumerate(Val))
1845 std::fill_n(NewVal.begin() + I * VF, VF, V);
1846 return NewVal;
1847}
1848
1849 static void inversePermutation(ArrayRef<unsigned> Indices,
1850 SmallVectorImpl<int> &Mask) {
1851 Mask.clear();
1852 const unsigned E = Indices.size();
1853 Mask.resize(E, PoisonMaskElem);
1854 for (unsigned I = 0; I < E; ++I)
1855 Mask[Indices[I]] = I;
1856}
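// For example, Indices = {2, 0, 1} produces Mask = {1, 2, 0}: element I of the
// ordered sequence comes from position Indices[I], so Mask[Indices[I]] == I.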
1857
1858/// Reorders the list of scalars in accordance with the given \p Mask.
1859 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1860 ArrayRef<int> Mask) {
1861 assert(!Mask.empty() && "Expected non-empty mask.");
1862 SmallVector<Value *> Prev(Scalars.size(),
1863 PoisonValue::get(Scalars.front()->getType()));
1864 Prev.swap(Scalars);
1865 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1866 if (Mask[I] != PoisonMaskElem)
1867 Scalars[Mask[I]] = Prev[I];
1868}
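// Worked example: Scalars = {a, b, c, d} with Mask = {2, 0, poison, 1} becomes
// {b, d, a, poison}: each old element Prev[I] is scattered to position
// Mask[I], and positions without a source stay poison.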
1869
1870/// Checks if the provided value does not require scheduling. It does not
1871/// require scheduling if this is not an instruction or it is an instruction
1872/// that does not read/write memory and all operands are either not instructions
1873/// or phi nodes or instructions from different blocks.
1874 static bool areAllOperandsNonInsts(Value *V) {
1875 auto *I = dyn_cast<Instruction>(V);
1876 if (!I)
1877 return true;
1878 return !mayHaveNonDefUseDependency(*I) &&
1879 all_of(I->operands(), [I](Value *V) {
1880 auto *IO = dyn_cast<Instruction>(V);
1881 if (!IO)
1882 return true;
1883 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1884 });
1885}
1886
1887/// Checks if the provided value does not require scheduling. It does not
1888/// require scheduling if this is not an instruction or it is an instruction
1889/// that does not read/write memory and all users are phi nodes or instructions
1890/// from different blocks.
1891static bool isUsedOutsideBlock(Value *V) {
1892 auto *I = dyn_cast<Instruction>(V);
1893 if (!I)
1894 return true;
1895 // Limits the number of uses to save compile time.
1896 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1897 all_of(I->users(), [I](User *U) {
1898 auto *IU = dyn_cast<Instruction>(U);
1899 if (!IU)
1900 return true;
1901 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1902 });
1903}
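// For illustration: an "add i32 %x, %y" whose operands are function arguments
// and whose only user is a PHI node satisfies both predicates above, so it
// needs no scheduling window; a load never does, because it may read memory
// and may carry non-def-use dependencies.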
1904
1905/// Checks if the specified value does not require scheduling. It does not
1906/// require scheduling if all operands and all users do not need to be scheduled
1907/// in the current basic block.
1910}
1911
1912/// Checks if the specified array of instructions does not require scheduling.
1913/// This is the case if every instruction either has operands that do not
1914/// require scheduling, or has users that do not require scheduling because
1915/// they are phis or live in other basic blocks.
1916 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1917 return !VL.empty() &&
1918 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1919 }
1920
1921/// Returns true if widened type of \p Ty elements with size \p Sz represents
1922/// full vector type, i.e. adding extra element results in extra parts upon type
1923/// legalization.
1924 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1925 unsigned Sz) {
1926 if (Sz <= 1)
1927 return false;
1928 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1929 return false;
1930 if (has_single_bit(Sz))
1931 return true;
1932 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1933 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1934 Sz % NumParts == 0;
1935}
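// For example, Sz = 16 is accepted immediately (power of two). Sz = 24 with
// i32 elements is also accepted on a target with 128-bit vector registers (an
// assumption for this sketch): <24 x i32> legalizes to 6 parts, 24 is
// divisible by 6, and 24 / 6 == 4 is a power of two. Sz = 1 is always rejected.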
1936
1937/// Returns the number of parts the type \p VecTy will be split into at the
1938/// codegen phase. If the type is going to be scalarized or does not use whole
1939/// registers, returns 1.
1940static unsigned
1941 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1942 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1943 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1944 if (NumParts == 0 || NumParts >= Limit)
1945 return 1;
1946 unsigned Sz = getNumElements(VecTy);
1947 if (NumParts >= Sz || Sz % NumParts != 0 ||
1948 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1949 return 1;
1950 return NumParts;
1951}
1952
1953/// Bottom Up SLP Vectorizer.
1954 class BoUpSLP {
1955 class TreeEntry;
1956 class ScheduleEntity;
1957 class ScheduleData;
1958 class ScheduleCopyableData;
1959 class ScheduleBundle;
1962
1963 /// If we decide to generate strided load / store, this struct contains all
1964 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1965 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1966 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1967 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1968 /// the size of an element of FixedVectorType.
1969 struct StridedPtrInfo {
1970 Value *StrideVal = nullptr;
1971 const SCEV *StrideSCEV = nullptr;
1972 FixedVectorType *Ty = nullptr;
1973 };
1974 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1975
1976public:
1977 /// Tracks the state we can represent the loads in the given sequence.
1985
1992
1993 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1994 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1995 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1996 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1997 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1998 AC(AC), DB(DB), DL(DL), ORE(ORE),
1999 Builder(Se->getContext(), TargetFolder(*DL)) {
2000 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
2001 // Use the vector register size specified by the target unless overridden
2002 // by a command-line option.
2003 // TODO: It would be better to limit the vectorization factor based on
2004 // data type rather than just register size. For example, x86 AVX has
2005 // 256-bit registers, but it does not support integer operations
2006 // at that width (that requires AVX2).
2007 if (MaxVectorRegSizeOption.getNumOccurrences())
2008 MaxVecRegSize = MaxVectorRegSizeOption;
2009 else
2010 MaxVecRegSize =
2011 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
2012 .getFixedValue();
2013
2014 if (MinVectorRegSizeOption.getNumOccurrences())
2015 MinVecRegSize = MinVectorRegSizeOption;
2016 else
2017 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2018 }
2019
2020 /// Vectorize the tree that starts with the elements in \p VL.
2021 /// Returns the vectorized root.
2022 Value *vectorizeTree();
2023
2024 /// Vectorize the tree but with the list of externally used values \p
2025 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
2026 /// generated extractvalue instructions.
2027 Value *vectorizeTree(
2028 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2029 Instruction *ReductionRoot = nullptr,
2030 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2031
2032 /// \returns the cost incurred by unwanted spills and fills, caused by
2033 /// holding live values over call sites.
2035
2036 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2037 /// final cost.
2040
2041 /// \returns the vectorization cost of the subtree that starts at \p VL.
2042 /// A negative number means that this is profitable.
2043 InstructionCost getTreeCost(
2044 ArrayRef<Value *> VectorizedVals = {},
2045 InstructionCost ReductionCost = TTI::TCC_Free);
2046
2047 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2048 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2049 void buildTree(ArrayRef<Value *> Roots,
2050 const SmallDenseSet<Value *> &UserIgnoreLst);
2051
2052 /// Construct a vectorizable tree that starts at \p Roots.
2053 void buildTree(ArrayRef<Value *> Roots);
2054
2055 /// Return the scalars of the root node.
2056 ArrayRef<Value *> getRootNodeScalars() const {
2057 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2058 return VectorizableTree.front()->Scalars;
2059 }
2060
2061 /// Returns the type/is-signed info for the root node in the graph without
2062 /// casting.
2063 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2064 const TreeEntry &Root = *VectorizableTree.front();
2065 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2066 !Root.Scalars.front()->getType()->isIntegerTy())
2067 return std::nullopt;
2068 auto It = MinBWs.find(&Root);
2069 if (It != MinBWs.end())
2070 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2071 It->second.first),
2072 It->second.second);
2073 if (Root.getOpcode() == Instruction::ZExt ||
2074 Root.getOpcode() == Instruction::SExt)
2075 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2076 Root.getOpcode() == Instruction::SExt);
2077 return std::nullopt;
2078 }
2079
2080 /// Checks if the root graph node can be emitted with narrower bitwidth at
2081 /// codegen and returns its signedness, if so.
2082 bool isSignedMinBitwidthRootNode() const {
2083 return MinBWs.at(VectorizableTree.front().get()).second;
2084 }
2085
2086 /// Returns the reduction type after minbitwidth analysis.
2087 FixedVectorType *getReductionType() const {
2088 if (ReductionBitWidth == 0 ||
2089 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2090 ReductionBitWidth >=
2091 DL->getTypeSizeInBits(
2092 VectorizableTree.front()->Scalars.front()->getType()))
2093 return getWidenedType(
2094 VectorizableTree.front()->Scalars.front()->getType(),
2095 VectorizableTree.front()->getVectorFactor());
2096 return getWidenedType(
2097 IntegerType::get(
2098 VectorizableTree.front()->Scalars.front()->getContext(),
2099 ReductionBitWidth),
2100 VectorizableTree.front()->getVectorFactor());
2101 }
2102
2103 /// Returns the opcode of the root node, or 0, if the root node is gather.
2105 return VectorizableTree.front()->hasState() &&
2106 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2107 VectorizableTree.front()->CombinedOp ==
2108 TreeEntry::ReducedBitcastBSwap) &&
2109 VectorizableTree.front()->State == TreeEntry::Vectorize;
2110 }
2111
2112 /// Builds external uses of the vectorized scalars, i.e. the list of
2113 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2114 /// ExternallyUsedValues contains additional list of external uses to handle
2115 /// vectorization of reductions.
2116 void
2117 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2118
2119 /// Transforms graph nodes to target specific representations, if profitable.
2120 void transformNodes();
2121
2122 /// Clear the internal data structures that are created by 'buildTree'.
2123 void deleteTree() {
2124 VectorizableTree.clear();
2125 ScalarToTreeEntries.clear();
2126 DeletedNodes.clear();
2127 TransformedToGatherNodes.clear();
2128 OperandsToTreeEntry.clear();
2129 ScalarsInSplitNodes.clear();
2130 MustGather.clear();
2131 NonScheduledFirst.clear();
2132 EntryToLastInstruction.clear();
2133 LastInstructionToPos.clear();
2134 LoadEntriesToVectorize.clear();
2135 IsGraphTransformMode = false;
2136 GatheredLoadsEntriesFirst.reset();
2137 CompressEntryToData.clear();
2138 ExternalUses.clear();
2139 ExternalUsesAsOriginalScalar.clear();
2140 ExternalUsesWithNonUsers.clear();
2141 for (auto &Iter : BlocksSchedules) {
2142 BlockScheduling *BS = Iter.second.get();
2143 BS->clear();
2144 }
2145 MinBWs.clear();
2146 ReductionBitWidth = 0;
2147 BaseGraphSize = 1;
2148 CastMaxMinBWSizes.reset();
2149 ExtraBitWidthNodes.clear();
2150 InstrElementSize.clear();
2151 UserIgnoreList = nullptr;
2152 PostponedGathers.clear();
2153 ValueToGatherNodes.clear();
2154 TreeEntryToStridedPtrInfoMap.clear();
2155 }
2156
2157 unsigned getTreeSize() const { return VectorizableTree.size(); }
2158
2159 /// Returns the base graph size, before any transformations.
2160 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2161
2162 /// Perform LICM and CSE on the newly generated gather sequences.
2163 void optimizeGatherSequence();
2164
2165 /// Does this non-empty order represent an identity order? Identity
2166 /// should be represented as an empty order, so this is used to
2167 /// decide if we can canonicalize a computed order. Undef elements
2168 /// (represented as size) are ignored.
2169 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2170 assert(!Order.empty() && "expected non-empty order");
2171 const unsigned Sz = Order.size();
2172 return all_of(enumerate(Order), [&](const auto &P) {
2173 return P.value() == P.index() || P.value() == Sz;
2174 });
2175 }
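// For example, with Sz == 4, Order = {0, 1, 4, 3} is still an identity order
// (the value 4 == Sz marks an undef position and is ignored), while
// Order = {1, 0, 2, 3} is not.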
2176
2177 /// Checks if the specified gather tree entry \p TE can be represented as a
2178 /// shuffled vector entry + (possibly) permutation with other gathers. It
2179 /// implements the checks only for possibly ordered scalars (Loads,
2180 /// ExtractElement, ExtractValue), which can be part of the graph.
2181 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2182 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2183 /// node might be ignored.
2184 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2185 bool TopToBottom,
2186 bool IgnoreReorder);
2187
2188 /// Sort loads into increasing pointer offsets to allow greater clustering.
2189 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2190
2191 /// Gets reordering data for the given tree entry. If the entry is vectorized
2192 /// - just return ReorderIndices, otherwise check if the scalars can be
2193 /// reordered and return the most optimal order.
2194 /// \return std::nullopt if ordering is not important, empty order, if
2195 /// identity order is important, or the actual order.
2196 /// \param TopToBottom If true, include the order of vectorized stores and
2197 /// insertelement nodes, otherwise skip them.
2198 /// \param IgnoreReorder true, if the root node order can be ignored.
2199 std::optional<OrdersType>
2200 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2201
2202 /// Checks if it is profitable to reorder the current tree.
2203 /// If the tree does not contain many profitable reordable nodes, better to
2204 /// skip it to save compile time.
2205 bool isProfitableToReorder() const;
2206
2207 /// Reorders the current graph to the most profitable order starting from the
2208 /// root node to the leaf nodes. The best order is chosen only from the nodes
2209 /// of the same size (vectorization factor). Smaller nodes are considered
2210 /// parts of subgraph with smaller VF and they are reordered independently. We
2211 /// can make it because we still need to extend smaller nodes to the wider VF
2212 /// and we can merge reordering shuffles with the widening shuffles.
2213 void reorderTopToBottom();
2214
2215 /// Reorders the current graph to the most profitable order starting from
2216 /// leaves to the root. It allows rotating small subgraphs and reducing the
2217 /// number of reshuffles if the leaf nodes use the same order. In this case we
2218 /// can merge the orders and just shuffle user node instead of shuffling its
2219 /// operands. Plus, even if the leaf nodes have different orders, it allows
2220 /// sinking reordering in the graph closer to the root node and merging it later
2221 /// during analysis.
2222 void reorderBottomToTop(bool IgnoreReorder = false);
2223
2224 /// \return The vector element size in bits to use when vectorizing the
2225 /// expression tree ending at \p V. If V is a store, the size is the width of
2226 /// the stored value. Otherwise, the size is the width of the largest loaded
2227 /// value reaching V. This method is used by the vectorizer to calculate
2228 /// vectorization factors.
2229 unsigned getVectorElementSize(Value *V);
2230
2231 /// Compute the minimum type sizes required to represent the entries in a
2232 /// vectorizable tree.
2233 void computeMinimumValueSizes();
2234
2235 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2236 unsigned getMaxVecRegSize() const {
2237 return MaxVecRegSize;
2238 }
2239
2240 // \returns minimum vector register size as set by cl::opt.
2241 unsigned getMinVecRegSize() const {
2242 return MinVecRegSize;
2243 }
2244
2245 unsigned getMinVF(unsigned Sz) const {
2246 return std::max(2U, getMinVecRegSize() / Sz);
2247 }
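// For example, with a 128-bit minimum vector register size and 32-bit scalars
// (Sz == 32), getMinVF returns max(2, 128 / 32) == 4; for Sz == 128 it still
// returns the floor value of 2.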
2248
2249 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2250 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2251 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2252 return MaxVF ? MaxVF : UINT_MAX;
2253 }
2254
2255 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2256 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2257 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2258 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2259 ///
2260 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2261 unsigned canMapToVector(Type *T) const;
2262
2263 /// \returns True if the VectorizableTree is both tiny and not fully
2264 /// vectorizable. We do not vectorize such trees.
2265 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2266
2267 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2268 /// It may happen, if all gather nodes are loads and they cannot be
2269 /// "clusterized". In this case even subgraphs cannot be vectorized more
2270 /// effectively than the base graph.
2271 bool isTreeNotExtendable() const;
2272
2273 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2274 /// can be load combined in the backend. Load combining may not be allowed in
2275 /// the IR optimizer, so we do not want to alter the pattern. For example,
2276 /// partially transforming a scalar bswap() pattern into vector code is
2277 /// effectively impossible for the backend to undo.
2278 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2279 /// may not be necessary.
2280 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2281
2282 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2283 /// can be load combined in the backend. Load combining may not be allowed in
2284 /// the IR optimizer, so we do not want to alter the pattern. For example,
2285 /// partially transforming a scalar bswap() pattern into vector code is
2286 /// effectively impossible for the backend to undo.
2287 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2288 /// may not be necessary.
2289 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2290 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2291 Align Alignment, const int64_t Diff,
2292 const size_t Sz) const;
2293
2294 /// Return true if an array of scalar loads can be replaced with a strided
2295 /// load (with constant stride).
2296 ///
2297 /// It is possible that the load gets "widened". Suppose that originally each
2298 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2299 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2300 /// ...
2301 /// %b + 0 * %s + (w - 1)
2302 ///
2303 /// %b + 1 * %s + 0
2304 /// %b + 1 * %s + 1
2305 /// %b + 1 * %s + 2
2306 /// ...
2307 /// %b + 1 * %s + (w - 1)
2308 /// ...
2309 ///
2310 /// %b + (n - 1) * %s + 0
2311 /// %b + (n - 1) * %s + 1
2312 /// %b + (n - 1) * %s + 2
2313 /// ...
2314 /// %b + (n - 1) * %s + (w - 1)
2315 ///
2316 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2317 ///
2318 /// \param PointerOps list of pointer arguments of loads.
2319 /// \param ElemTy original scalar type of loads.
2320 /// \param Alignment alignment of the first load.
2321 /// \param SortedIndices is the order of PointerOps as returned by
2322 /// `sortPtrAccesses`
2323 /// \param Diff Pointer difference between the lowest and the highest pointer
2324 /// in `PointerOps` as returned by `getPointersDiff`.
2325 /// \param Ptr0 first pointer in `PointersOps`.
2326 /// \param PtrN last pointer in `PointersOps`.
2327 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2328 /// of `SPtrInfo` necessary to generate the strided load later.
2329 bool analyzeConstantStrideCandidate(
2330 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2331 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2332 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2333
2334 /// Return true if an array of scalar loads can be replaced with a strided
2335 /// load (with run-time stride).
2336 /// \param PointerOps list of pointer arguments of loads.
2337 /// \param ScalarTy type of loads.
2338 /// \param CommonAlignment common alignment of loads as computed by
2339 /// `computeCommonAlignment<LoadInst>`.
2340 /// \param SortedIndices is a list of indices computed by this function such
2341 /// that the sequence `PointerOps[SortedIndices[0]],
2342 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2343 /// ordered by the coefficient of the stride. For example, if PointerOps is
2344 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2345 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2346 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2347 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2348 /// of `SPtrInfo` necessary to generate the strided load later.
2349 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2350 Align CommonAlignment,
2351 SmallVectorImpl<unsigned> &SortedIndices,
2352 StridedPtrInfo &SPtrInfo) const;
2353
2354 /// Checks if the given array of loads can be represented as a vectorized,
2355 /// scatter or just simple gather.
2356 /// \param VL list of loads.
2357 /// \param VL0 main load value.
2358 /// \param Order returned order of load instructions.
2359 /// \param PointerOps returned list of pointer operands.
2360 /// \param BestVF return best vector factor, if recursive check found better
2361 /// vectorization sequences rather than masked gather.
2362 /// \param TryRecursiveCheck used to check if long masked gather can be
2363 /// represented as a series of loads/insert-subvector, if profitable.
2366 SmallVectorImpl<Value *> &PointerOps,
2367 StridedPtrInfo &SPtrInfo,
2368 unsigned *BestVF = nullptr,
2369 bool TryRecursiveCheck = true) const;
2370
2371 /// Registers non-vectorizable sequence of loads
2372 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2373 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2374 }
2375
2376 /// Checks if the given loads sequence is known as not vectorizable
2377 template <typename T>
2378 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2379 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2380 }
2381
2383
2384 /// This structure holds any data we need about the edges being traversed
2385 /// during buildTreeRec(). We keep track of:
2386 /// (i) the user TreeEntry index, and
2387 /// (ii) the index of the edge.
2388 struct EdgeInfo {
2389 EdgeInfo() = default;
2390 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2391 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2392 /// The user TreeEntry.
2393 TreeEntry *UserTE = nullptr;
2394 /// The operand index of the use.
2395 unsigned EdgeIdx = UINT_MAX;
2396#ifndef NDEBUG
2397 friend inline raw_ostream &operator<<(raw_ostream &OS,
2398 const BoUpSLP::EdgeInfo &EI) {
2399 EI.dump(OS);
2400 return OS;
2401 }
2402 /// Debug print.
2403 void dump(raw_ostream &OS) const {
2404 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2405 << " EdgeIdx:" << EdgeIdx << "}";
2406 }
2407 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2408#endif
2409 bool operator == (const EdgeInfo &Other) const {
2410 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2411 }
2412
2413 operator bool() const { return UserTE != nullptr; }
2414 };
2415 friend struct DenseMapInfo<EdgeInfo>;
2416
2417 /// A helper class used for scoring candidates for two consecutive lanes.
2418 class LookAheadHeuristics {
2419 const TargetLibraryInfo &TLI;
2420 const DataLayout &DL;
2421 ScalarEvolution &SE;
2422 const BoUpSLP &R;
2423 int NumLanes; // Total number of lanes (aka vectorization factor).
2424 int MaxLevel; // The maximum recursion depth for accumulating score.
2425
2426 public:
2427 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2428 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2429 int MaxLevel)
2430 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2431 MaxLevel(MaxLevel) {}
2432
2433 // The hard-coded scores listed here are not very important, though it shall
2434 // be higher for better matches to improve the resulting cost. When
2435 // computing the scores of matching one sub-tree with another, we are
2436 // basically counting the number of values that are matching. So even if all
2437 // scores are set to 1, we would still get a decent matching result.
2438 // However, sometimes we have to break ties. For example we may have to
2439 // choose between matching loads vs matching opcodes. This is what these
2440 // scores are helping us with: they provide the order of preference. Also,
2441 // this is important if the scalar is externally used or used in another
2442 // tree entry node in the different lane.
2443
2444 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2445 static const int ScoreConsecutiveLoads = 4;
2446 /// The same load multiple times. This should have a better score than
2447 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2448 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
2449 /// a vector load plus 1.0 for a broadcast.
2450 static const int ScoreSplatLoads = 3;
2451 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2452 static const int ScoreReversedLoads = 3;
2453 /// A load candidate for masked gather.
2454 static const int ScoreMaskedGatherCandidate = 1;
2455 /// ExtractElementInst from same vector and consecutive indexes.
2456 static const int ScoreConsecutiveExtracts = 4;
2457 /// ExtractElementInst from same vector and reversed indices.
2458 static const int ScoreReversedExtracts = 3;
2459 /// Constants.
2460 static const int ScoreConstants = 2;
2461 /// Instructions with the same opcode.
2462 static const int ScoreSameOpcode = 2;
2463 /// Instructions with alt opcodes (e.g, add + sub).
2464 static const int ScoreAltOpcodes = 1;
2465 /// Identical instructions (a.k.a. splat or broadcast).
2466 static const int ScoreSplat = 1;
2467 /// Matching with an undef is preferable to failing.
2468 static const int ScoreUndef = 1;
2469 /// Score for failing to find a decent match.
2470 static const int ScoreFail = 0;
2471 /// Score if all users are vectorized.
2472 static const int ScoreAllUserVectorized = 1;
2473
2474 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2475 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2476 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2477 /// MainAltOps.
2478 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2479 ArrayRef<Value *> MainAltOps) const {
2480 if (!isValidElementType(V1->getType()) ||
2481 !isValidElementType(V2->getType()))
2482 return LookAheadHeuristics::ScoreFail;
2483
2484 if (V1 == V2) {
2485 if (isa<LoadInst>(V1)) {
2486 // Returns true if the users of V1 and V2 won't need to be extracted.
2487 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2488 // Bail out if we have too many uses to save compilation time.
2489 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2490 return false;
2491
2492 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2493 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2494 return U == U1 || U == U2 || R.isVectorized(U);
2495 });
2496 };
2497 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2498 };
2499 // A broadcast of a load can be cheaper on some targets.
2500 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2501 ElementCount::getFixed(NumLanes)) &&
2502 ((int)V1->getNumUses() == NumLanes ||
2503 AllUsersAreInternal(V1, V2)))
2504 return LookAheadHeuristics::ScoreSplatLoads;
2505 }
2506 return LookAheadHeuristics::ScoreSplat;
2507 }
2508
2509 auto CheckSameEntryOrFail = [&]() {
2510 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2511 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2512 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2513 !TEs2.empty() &&
2514 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2515 return LookAheadHeuristics::ScoreSplatLoads;
2516 }
2517 return LookAheadHeuristics::ScoreFail;
2518 };
2519
2520 auto *LI1 = dyn_cast<LoadInst>(V1);
2521 auto *LI2 = dyn_cast<LoadInst>(V2);
2522 if (LI1 && LI2) {
2523 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2524 !LI2->isSimple())
2525 return CheckSameEntryOrFail();
2526
2527 std::optional<int64_t> Dist = getPointersDiff(
2528 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2529 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2530 if (!Dist || *Dist == 0) {
2531 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2532 getUnderlyingObject(LI2->getPointerOperand()) &&
2533 R.TTI->isLegalMaskedGather(
2534 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2535 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2536 return CheckSameEntryOrFail();
2537 }
2538 // The distance is too large - still may be profitable to use masked
2539 // loads/gathers.
2540 if (std::abs(*Dist) > NumLanes / 2)
2541 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2542 // This still will detect consecutive loads, but we might have "holes"
2543 // in some cases. It is ok for non-power-2 vectorization and may produce
2544 // better results. It should not affect current vectorization.
2545 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2546 : LookAheadHeuristics::ScoreReversedLoads;
2547 }
2548
2549 auto *C1 = dyn_cast<Constant>(V1);
2550 auto *C2 = dyn_cast<Constant>(V2);
2551 if (C1 && C2)
2552 return LookAheadHeuristics::ScoreConstants;
2553
2554 // Consider constants and buildvector compatible.
2555 if ((C1 && isa<InsertElementInst>(V2)) ||
2556 (C2 && isa<InsertElementInst>(V1)))
2557 return LookAheadHeuristics::ScoreConstants;
2558
2559 // Extracts from consecutive indexes of the same vector better score as
2560 // the extracts could be optimized away.
2561 Value *EV1;
2562 ConstantInt *Ex1Idx;
2563 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2564 // Undefs are always profitable for extractelements.
2565 // Compiler can easily combine poison and extractelement <non-poison> or
2566 // undef and extractelement <poison>. But combining undef +
2567 // extractelement <non-poison-but-may-produce-poison> requires some
2568 // extra operations.
2569 if (isa<UndefValue>(V2))
2570 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2571 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2572 : LookAheadHeuristics::ScoreSameOpcode;
2573 Value *EV2 = nullptr;
2574 ConstantInt *Ex2Idx = nullptr;
2575 if (match(V2,
2576 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2577 m_Undef())))) {
2578 // Undefs are always profitable for extractelements.
2579 if (!Ex2Idx)
2580 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2581 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2582 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2583 if (EV2 == EV1) {
2584 int Idx1 = Ex1Idx->getZExtValue();
2585 int Idx2 = Ex2Idx->getZExtValue();
2586 int Dist = Idx2 - Idx1;
2587 // The distance is too large - still may be profitable to use
2588 // shuffles.
2589 if (std::abs(Dist) == 0)
2590 return LookAheadHeuristics::ScoreSplat;
2591 if (std::abs(Dist) > NumLanes / 2)
2592 return LookAheadHeuristics::ScoreSameOpcode;
2593 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2594 : LookAheadHeuristics::ScoreReversedExtracts;
2595 }
2596 return LookAheadHeuristics::ScoreAltOpcodes;
2597 }
2598 return CheckSameEntryOrFail();
2599 }
2600
2601 auto *I1 = dyn_cast<Instruction>(V1);
2602 auto *I2 = dyn_cast<Instruction>(V2);
2603 if (I1 && I2) {
2604 if (I1->getParent() != I2->getParent())
2605 return CheckSameEntryOrFail();
2606 SmallVector<Value *, 4> Ops(MainAltOps);
2607 Ops.push_back(I1);
2608 Ops.push_back(I2);
2609 InstructionsState S = getSameOpcode(Ops, TLI);
2610 // Note: Only consider instructions with <= 2 operands to avoid
2611 // complexity explosion.
2612 if (S &&
2613 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2614 !S.isAltShuffle()) &&
2615 all_of(Ops, [&S](Value *V) {
2616 return isa<PoisonValue>(V) ||
2617 cast<Instruction>(V)->getNumOperands() ==
2618 S.getMainOp()->getNumOperands();
2619 }))
2620 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2621 : LookAheadHeuristics::ScoreSameOpcode;
2622 }
2623
2624 if (I1 && isa<PoisonValue>(V2))
2625 return LookAheadHeuristics::ScoreSameOpcode;
2626
2627 if (isa<UndefValue>(V2))
2628 return LookAheadHeuristics::ScoreUndef;
2629
2630 return CheckSameEntryOrFail();
2631 }
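// Rough intuition for the scores above (values as documented for the score
// constants): two simple loads of A[i] and A[i+1] in the same block score
// ScoreConsecutiveLoads (4), extracts from consecutive indexes of one vector
// score ScoreConsecutiveExtracts (4), two constants score ScoreConstants (2),
// same-opcode instructions score ScoreSameOpcode (2), an add/sub pair scores
// ScoreAltOpcodes (1), and incompatible values fall back to
// CheckSameEntryOrFail().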
2632
2633 /// Go through the operands of \p LHS and \p RHS recursively until
2634 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2635 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2636 /// of \p U1 and \p U2), except at the beginning of the recursion where
2637 /// these are set to nullptr.
2638 ///
2639 /// For example:
2640 /// \verbatim
2641 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2642 /// \ / \ / \ / \ /
2643 /// + + + +
2644 /// G1 G2 G3 G4
2645 /// \endverbatim
2646 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2647 /// each level recursively, accumulating the score. It starts from matching
2648 /// the additions at level 0, then moves on to the loads (level 1). The
2649 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2650 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2651 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2652 /// Please note that the order of the operands does not matter, as we
2653 /// evaluate the score of all profitable combinations of operands. In
2654 /// other words the score of G1 and G4 is the same as G1 and G2. This
2655 /// heuristic is based on ideas described in:
2656 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2657 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2658 /// Luís F. W. Góes
2659 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2660 Instruction *U2, int CurrLevel,
2661 ArrayRef<Value *> MainAltOps) const {
2662
2663 // Get the shallow score of V1 and V2.
2664 int ShallowScoreAtThisLevel =
2665 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2666
2667 // If reached MaxLevel,
2668 // or if V1 and V2 are not instructions,
2669 // or if they are SPLAT,
2670 // or if they are not consecutive,
2671 // or if profitable to vectorize loads or extractelements, early return
2672 // the current cost.
2673 auto *I1 = dyn_cast<Instruction>(LHS);
2674 auto *I2 = dyn_cast<Instruction>(RHS);
2675 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2676 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2677 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2678 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2679 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2680 ShallowScoreAtThisLevel))
2681 return ShallowScoreAtThisLevel;
2682 assert(I1 && I2 && "Should have early exited.");
2683
2684 // Contains the I2 operand indexes that got matched with I1 operands.
2685 SmallSet<unsigned, 4> Op2Used;
2686
2687 // Recursion towards the operands of I1 and I2. We are trying all possible
2688 // operand pairs, and keeping track of the best score.
2689 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2690 OpIdx1 != NumOperands1; ++OpIdx1) {
2691 // Try to pair op1I with the best operand of I2.
2692 int MaxTmpScore = 0;
2693 unsigned MaxOpIdx2 = 0;
2694 bool FoundBest = false;
2695 // If I2 is commutative try all combinations.
2696 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2697 unsigned ToIdx = isCommutative(I2)
2698 ? I2->getNumOperands()
2699 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2700 assert(FromIdx <= ToIdx && "Bad index");
2701 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2702 // Skip operands already paired with OpIdx1.
2703 if (Op2Used.count(OpIdx2))
2704 continue;
2705 // Recursively calculate the cost at each level
2706 int TmpScore =
2707 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2708 I1, I2, CurrLevel + 1, {});
2709 // Look for the best score.
2710 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2711 TmpScore > MaxTmpScore) {
2712 MaxTmpScore = TmpScore;
2713 MaxOpIdx2 = OpIdx2;
2714 FoundBest = true;
2715 }
2716 }
2717 if (FoundBest) {
2718 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2719 Op2Used.insert(MaxOpIdx2);
2720 ShallowScoreAtThisLevel += MaxTmpScore;
2721 }
2722 }
2723 return ShallowScoreAtThisLevel;
2724 }
2725 };
2726 /// A helper data structure to hold the operands of a vector of instructions.
2727 /// This supports a fixed vector length for all operand vectors.
2728 class VLOperands {
2729 /// For each operand we need (i) the value, and (ii) the opcode that it
2730 /// would be attached to if the expression was in a left-linearized form.
2731 /// This is required to avoid illegal operand reordering.
2732 /// For example:
2733 /// \verbatim
2734 /// 0 Op1
2735 /// |/
2736 /// Op1 Op2 Linearized + Op2
2737 /// \ / ----------> |/
2738 /// - -
2739 ///
2740 /// Op1 - Op2 (0 + Op1) - Op2
2741 /// \endverbatim
2742 ///
2743 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2744 ///
2745 /// Another way to think of this is to track all the operations across the
2746 /// path from the operand all the way to the root of the tree and to
2747 /// calculate the operation that corresponds to this path. For example, the
2748 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2749 /// corresponding operation is a '-' (which matches the one in the
2750 /// linearized tree, as shown above).
2751 ///
2752 /// For lack of a better term, we refer to this operation as Accumulated
2753 /// Path Operation (APO).
2754 struct OperandData {
2755 OperandData() = default;
2756 OperandData(Value *V, bool APO, bool IsUsed)
2757 : V(V), APO(APO), IsUsed(IsUsed) {}
2758 /// The operand value.
2759 Value *V = nullptr;
2760 /// TreeEntries only allow a single opcode, or an alternate sequence of
2761 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2762 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2763 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2764 /// (e.g., Add/Mul)
2765 bool APO = false;
2766 /// Helper data for the reordering function.
2767 bool IsUsed = false;
2768 };
2769
2770 /// During operand reordering, we are trying to select the operand at lane
2771 /// that matches best with the operand at the neighboring lane. Our
2772 /// selection is based on the type of value we are looking for. For example,
2773 /// if the neighboring lane has a load, we need to look for a load that is
2774 /// accessing a consecutive address. These strategies are summarized in the
2775 /// 'ReorderingMode' enumerator.
2776 enum class ReorderingMode {
2777 Load, ///< Matching loads to consecutive memory addresses
2778 Opcode, ///< Matching instructions based on opcode (same or alternate)
2779 Constant, ///< Matching constants
2780 Splat, ///< Matching the same instruction multiple times (broadcast)
2781 Failed, ///< We failed to create a vectorizable group
2782 };
2783
2784 using OperandDataVec = SmallVector<OperandData, 2>;
2785
2786 /// A vector of operand vectors.
2788 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2789 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2790 unsigned ArgSize = 0;
2791
2792 const TargetLibraryInfo &TLI;
2793 const DataLayout &DL;
2794 ScalarEvolution &SE;
2795 const BoUpSLP &R;
2796 const Loop *L = nullptr;
2797
2798 /// \returns the operand data at \p OpIdx and \p Lane.
2799 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2800 return OpsVec[OpIdx][Lane];
2801 }
2802
2803 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2804 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2805 return OpsVec[OpIdx][Lane];
2806 }
2807
2808 /// Clears the used flag for all entries.
2809 void clearUsed() {
2810 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2811 OpIdx != NumOperands; ++OpIdx)
2812 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2813 ++Lane)
2814 OpsVec[OpIdx][Lane].IsUsed = false;
2815 }
2816
2817 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2818 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2819 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2820 }
2821
2822 /// \param Lane lane of the operands under analysis.
2823 /// \param OpIdx operand index in lane \p Lane for which we're looking for the
2824 /// best candidate.
2825 /// \param Idx operand index of the current candidate value.
2826 /// \returns The additional score due to possible broadcasting of the
2827 /// elements in the lane. It is more profitable to have a power-of-2 number of
2828 /// unique elements in the lane, as they will be vectorized with higher probability
2829 /// after removing duplicates. Currently the SLP vectorizer supports only
2830 /// vectorization of the power-of-2 number of unique scalars.
2831 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2832 const SmallBitVector &UsedLanes) const {
2833 Value *IdxLaneV = getData(Idx, Lane).V;
2834 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2835 isa<ExtractElementInst>(IdxLaneV))
2836 return 0;
2837 SmallDenseMap<Value *, unsigned, 4> Uniques;
2838 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2839 if (Ln == Lane)
2840 continue;
2841 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2842 if (!isa<Instruction>(OpIdxLnV))
2843 return 0;
2844 Uniques.try_emplace(OpIdxLnV, Ln);
2845 }
2846 unsigned UniquesCount = Uniques.size();
2847 auto IdxIt = Uniques.find(IdxLaneV);
2848 unsigned UniquesCntWithIdxLaneV =
2849 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2850 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2851 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2852 unsigned UniquesCntWithOpIdxLaneV =
2853 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2854 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2855 return 0;
2856 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2857 UniquesCntWithOpIdxLaneV,
2858 UniquesCntWithOpIdxLaneV -
2859 bit_floor(UniquesCntWithOpIdxLaneV)) -
2860 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2861 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2862 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2863 }
2864
2865 /// \param Lane lane of the operands under analysis.
2866 /// \param OpIdx operand index in lane \p Lane for which we're looking for the
2867 /// best candidate.
2868 /// \param Idx operand index of the current candidate value.
2869 /// \returns The additional score for the scalar which users are all
2870 /// vectorized.
2871 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2872 Value *IdxLaneV = getData(Idx, Lane).V;
2873 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2874 // Do not care about number of uses for vector-like instructions
2875 // (extractelement/extractvalue with constant indices), they are extracts
2876 // themselves and already externally used. Vectorization of such
2877 // instructions does not add extra extractelement instruction, just may
2878 // remove it.
2879 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2880 isVectorLikeInstWithConstOps(OpIdxLaneV))
2881 return LookAheadHeuristics::ScoreAllUserVectorized;
2882 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2883 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2884 return 0;
2885 return R.areAllUsersVectorized(IdxLaneI)
2886 ? LookAheadHeuristics::ScoreAllUserVectorized
2887 : 0;
2888 }
2889
2890 /// Score scaling factor for fully compatible instructions but with
2891 /// different number of external uses. Allows better selection of the
2892 /// instructions with less external uses.
2893 static const int ScoreScaleFactor = 10;
2894
2895 /// \Returns the look-ahead score, which tells us how much the sub-trees
2896 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2897 /// score. This helps break ties in an informed way when we cannot decide on
2898 /// the order of the operands by just considering the immediate
2899 /// predecessors.
2900 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2901 int Lane, unsigned OpIdx, unsigned Idx,
2902 bool &IsUsed, const SmallBitVector &UsedLanes) {
2903 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2904 LookAheadMaxDepth);
2905 // Keep track of the instruction stack as we recurse into the operands
2906 // during the look-ahead score exploration.
2907 int Score =
2908 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2909 /*CurrLevel=*/1, MainAltOps);
2910 if (Score) {
2911 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2912 if (Score <= -SplatScore) {
2913 // Failed score.
2914 Score = 0;
2915 } else {
2916 Score += SplatScore;
2917 // Scale score to see the difference between different operands
2918 // and similar operands but all vectorized/not all vectorized
2919 // uses. It does not affect actual selection of the best
2920 // compatible operand in general, just allows to select the
2921 // operand with all vectorized uses.
2922 Score *= ScoreScaleFactor;
2923 Score += getExternalUseScore(Lane, OpIdx, Idx);
2924 IsUsed = true;
2925 }
2926 }
2927 return Score;
2928 }
2929
2930 /// Best defined scores per lanes between the passes. Used to choose the
2931 /// best operand (with the highest score) between the passes.
2932 /// The key - {Operand Index, Lane}.
2933 /// The value - the best score between the passes for the lane and the
2934 /// operand.
2935 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2936 BestScoresPerLanes;
2937
2938 // Search all operands in Ops[*][Lane] for the one that matches best
2939 // Ops[OpIdx][LastLane] and return its operand index.
2940 // If no good match can be found, return std::nullopt.
2941 std::optional<unsigned>
2942 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2943 ArrayRef<ReorderingMode> ReorderingModes,
2944 ArrayRef<Value *> MainAltOps,
2945 const SmallBitVector &UsedLanes) {
2946 unsigned NumOperands = getNumOperands();
2947
2948 // The operand of the previous lane at OpIdx.
2949 Value *OpLastLane = getData(OpIdx, LastLane).V;
2950
2951 // Our strategy mode for OpIdx.
2952 ReorderingMode RMode = ReorderingModes[OpIdx];
2953 if (RMode == ReorderingMode::Failed)
2954 return std::nullopt;
2955
2956 // The linearized opcode of the operand at OpIdx, Lane.
2957 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2958
2959 // The best operand index and its score.
2960 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2961 // are using the score to differentiate between the two.
2962 struct BestOpData {
2963 std::optional<unsigned> Idx;
2964 unsigned Score = 0;
2965 } BestOp;
2966 BestOp.Score =
2967 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2968 .first->second;
2969
2970 // Track if the operand must be marked as used. If the operand is set to
2971 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2972 // want to reestimate the operands again on the following iterations).
2973 bool IsUsed = RMode == ReorderingMode::Splat ||
2974 RMode == ReorderingMode::Constant ||
2975 RMode == ReorderingMode::Load;
2976 // Iterate through all unused operands and look for the best.
2977 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2978 // Get the operand at Idx and Lane.
2979 OperandData &OpData = getData(Idx, Lane);
2980 Value *Op = OpData.V;
2981 bool OpAPO = OpData.APO;
2982
2983 // Skip already selected operands.
2984 if (OpData.IsUsed)
2985 continue;
2986
2987 // Skip if we are trying to move the operand to a position with a
2988 // different opcode in the linearized tree form. This would break the
2989 // semantics.
2990 if (OpAPO != OpIdxAPO)
2991 continue;
2992
2993 // Look for an operand that matches the current mode.
2994 switch (RMode) {
2995 case ReorderingMode::Load:
2996 case ReorderingMode::Opcode: {
2997 bool LeftToRight = Lane > LastLane;
2998 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2999 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
3000 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3001 OpIdx, Idx, IsUsed, UsedLanes);
3002 if (Score > static_cast<int>(BestOp.Score) ||
3003 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
3004 Idx == OpIdx)) {
3005 BestOp.Idx = Idx;
3006 BestOp.Score = Score;
3007 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
3008 }
3009 break;
3010 }
3011 case ReorderingMode::Constant:
3012 if (isa<Constant>(Op) ||
3013 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
3014 BestOp.Idx = Idx;
3015 if (isa<Constant>(Op)) {
3016 BestOp.Score = LookAheadHeuristics::ScoreConstants;
3017 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3019 }
3021 IsUsed = false;
3022 }
3023 break;
3024 case ReorderingMode::Splat:
3025 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
3026 IsUsed = Op == OpLastLane;
3027 if (Op == OpLastLane) {
3028 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3029 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3030 LookAheadHeuristics::ScoreSplat;
3031 }
3032 BestOp.Idx = Idx;
3033 }
3034 break;
3035 case ReorderingMode::Failed:
3036 llvm_unreachable("Not expected Failed reordering mode.");
3037 }
3038 }
3039
3040 if (BestOp.Idx) {
3041 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3042 return BestOp.Idx;
3043 }
3044 // If we could not find a good match return std::nullopt.
3045 return std::nullopt;
3046 }
3047
3048 /// Helper for reorderOperandVecs.
3049 /// \returns the lane that we should start reordering from. This is the one
3050 /// which has the least number of operands that can freely move about, or is
3051 /// less profitable because it already has the most optimal set of operands.
3052 unsigned getBestLaneToStartReordering() const {
3053 unsigned Min = UINT_MAX;
3054 unsigned SameOpNumber = 0;
3055 // std::pair<unsigned, unsigned> is used to implement a simple voting
3056 // algorithm and choose the lane with the least number of operands that
3057 // can freely move about, or is less profitable because it already has the
3058 // most optimal set of operands. The first unsigned is a counter for
3059 // voting, the second unsigned is the counter of lanes with instructions
3060 // with same/alternate opcodes and same parent basic block.
3061 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
3062 // Try to be closer to the original results, if we have multiple lanes
3063 // with same cost. If 2 lanes have the same cost, use the one with the
3064 // highest index.
3065 for (int I = getNumLanes(); I > 0; --I) {
3066 unsigned Lane = I - 1;
3067 OperandsOrderData NumFreeOpsHash =
3068 getMaxNumOperandsThatCanBeReordered(Lane);
3069 // Compare the number of operands that can move and choose the one with
3070 // the least number.
3071 if (NumFreeOpsHash.NumOfAPOs < Min) {
3072 Min = NumFreeOpsHash.NumOfAPOs;
3073 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3074 HashMap.clear();
3075 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3076 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3077 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3078 // Select the most optimal lane in terms of number of operands that
3079 // should be moved around.
3080 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3081 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3082 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3083 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3084 auto [It, Inserted] =
3085 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3086 if (!Inserted)
3087 ++It->second.first;
3088 }
3089 }
3090 // Select the lane with the minimum counter.
3091 unsigned BestLane = 0;
3092 unsigned CntMin = UINT_MAX;
3093 for (const auto &Data : reverse(HashMap)) {
3094 if (Data.second.first < CntMin) {
3095 CntMin = Data.second.first;
3096 BestLane = Data.second.second;
3097 }
3098 }
3099 return BestLane;
3100 }
3101
3102 /// Data structure that helps to reorder operands.
3103 struct OperandsOrderData {
3104 /// The best number of operands with the same APOs, which can be
3105 /// reordered.
3106 unsigned NumOfAPOs = UINT_MAX;
3107 /// Number of operands with the same/alternate instruction opcode and
3108 /// parent.
3109 unsigned NumOpsWithSameOpcodeParent = 0;
3110 /// Hash for the actual operands ordering.
3111 /// Used to count operands, actually their position id and opcode
3112 /// value. It is used in the voting mechanism to find the lane with the
3113 /// least number of operands that can freely move about or is less profitable
3114 /// because it already has the most optimal set of operands. Can be
3115 /// replaced with SmallVector<unsigned> instead but hash code is faster
3116 /// and requires less memory.
3117 unsigned Hash = 0;
3118 };
3119 /// \returns the maximum number of operands that are allowed to be reordered
3120 /// for \p Lane and the number of compatible instructions (with the same
3121 /// parent/opcode). This is used as a heuristic for selecting the first lane
3122 /// to start operand reordering.
3123 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3124 unsigned CntTrue = 0;
3125 unsigned NumOperands = getNumOperands();
3126 // Operands with the same APO can be reordered. We therefore need to count
3127 // how many of them we have for each APO, like this: Cnt[APO] = x.
3128 // Since we only have two APOs, namely true and false, we can avoid using
3129 // a map. Instead we can simply count the number of operands that
3130 // correspond to one of them (in this case the 'true' APO), and calculate
3131 // the other by subtracting it from the total number of operands.
3132 // Operands with the same instruction opcode and parent are more
3133 // profitable since we don't need to move them in many cases, with a high
3134 // probability such lane already can be vectorized effectively.
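      // Illustrative example (not part of the original source): with 2
      // operands per lane and APOs {true, false}, CntTrue == 1 and the
      // resulting NumOfAPOs == max(1, 2 - 1) == 1.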
3135 bool AllUndefs = true;
3136 unsigned NumOpsWithSameOpcodeParent = 0;
3137 Instruction *OpcodeI = nullptr;
3138 BasicBlock *Parent = nullptr;
3139 unsigned Hash = 0;
3140 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3141 const OperandData &OpData = getData(OpIdx, Lane);
3142 if (OpData.APO)
3143 ++CntTrue;
3144 // Use Boyer-Moore majority voting for finding the majority opcode and
3145 // the number of times it occurs.
3146 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3147 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3148 I->getParent() != Parent) {
3149 if (NumOpsWithSameOpcodeParent == 0) {
3150 NumOpsWithSameOpcodeParent = 1;
3151 OpcodeI = I;
3152 Parent = I->getParent();
3153 } else {
3154 --NumOpsWithSameOpcodeParent;
3155 }
3156 } else {
3157 ++NumOpsWithSameOpcodeParent;
3158 }
3159 }
3160 Hash = hash_combine(
3161 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3162 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3163 }
3164 if (AllUndefs)
3165 return {};
3166 OperandsOrderData Data;
3167 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3168 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3169 Data.Hash = Hash;
3170 return Data;
3171 }
3172
3173 /// Go through the instructions in VL and append their operands.
3174 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3175 const InstructionsState &S) {
3176 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3177 assert((empty() || all_of(Operands,
3178 [this](const ValueList &VL) {
3179 return VL.size() == getNumLanes();
3180 })) &&
3181 "Expected same number of lanes");
3182 assert(S.valid() && "InstructionsState is invalid.");
3183 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3184 // arguments to the intrinsic produces the same result.
3185 Instruction *MainOp = S.getMainOp();
3186 unsigned NumOperands = MainOp->getNumOperands();
3187 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3188 OpsVec.resize(ArgSize);
3189 unsigned NumLanes = VL.size();
3190 for (OperandDataVec &Ops : OpsVec)
3191 Ops.resize(NumLanes);
3192 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3193 // Our tree has just 3 nodes: the root and two operands.
3194 // It is therefore trivial to get the APO. We only need to check the
3195 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3196 // operand. The LHS operand of both add and sub is never attached to an
3197 // inverse operation in the linearized form, therefore its APO is
3198 // false. The RHS is true only if V is an inverse operation.
3199
3200 // Since operand reordering is performed on groups of commutative
3201 // operations or alternating sequences (e.g., +, -), we can safely tell
3202 // the inverse operations by checking commutativity.
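      // Illustrative example (not part of the original source): for
      //   Lane 0: A[0] = B[0] + C[0]
      //   Lane 1: A[1] = B[1] - C[1]
      // the LHS operands B[0] and B[1] get APO == false in both lanes, while
      // C[1], the RHS of the non-commutative subtraction, gets APO == true and
      // may only trade places with operands that have the same APO.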
3203 auto *I = dyn_cast<Instruction>(VL[Lane]);
3204 if (!I && isa<PoisonValue>(VL[Lane])) {
3205 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3206 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3207 continue;
3208 }
3209 bool IsInverseOperation = false;
3210 if (S.isCopyableElement(VL[Lane])) {
3211 // The value is a copyable element.
3212 IsInverseOperation =
3213 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3214 } else {
3215 assert(I && "Expected instruction");
3216 auto [SelectedOp, Ops] = convertTo(I, S);
3217 // We cannot check commutativity by the converted instruction
3218 // (SelectedOp) because isCommutative also examines def-use
3219 // relationships.
3220 IsInverseOperation = !isCommutative(SelectedOp, I);
3221 }
3222 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3223 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3224 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3225 }
3226 }
3227 }
3228
3229 /// \returns the number of operands.
3230 unsigned getNumOperands() const { return ArgSize; }
3231
3232 /// \returns the number of lanes.
3233 unsigned getNumLanes() const { return OpsVec[0].size(); }
3234
3235 /// \returns the operand value at \p OpIdx and \p Lane.
3236 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3237 return getData(OpIdx, Lane).V;
3238 }
3239
3240 /// \returns true if the data structure is empty.
3241 bool empty() const { return OpsVec.empty(); }
3242
3243 /// Clears the data.
3244 void clear() { OpsVec.clear(); }
3245
3246 /// \Returns true if there are enough operands identical to \p Op to fill
3247 /// the whole vector (possibly mixed with constants or loop invariant values).
3248 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
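    /// For example (illustrative, not from the original source): with 4 lanes
    /// whose operands at some index are {x, x, x, C} for a constant C and a
    /// non-constant \p Op == x, the value x can fill the whole vector as a
    /// broadcast mixed with a constant permutation, so this returns true.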
3249 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3250 assert(Op == getValue(OpIdx, Lane) &&
3251 "Op is expected to be getValue(OpIdx, Lane).");
3252 // Small number of loads - try load matching.
3253 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3254 return false;
3255 bool OpAPO = getData(OpIdx, Lane).APO;
3256 bool IsInvariant = L && L->isLoopInvariant(Op);
3257 unsigned Cnt = 0;
3258 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3259 if (Ln == Lane)
3260 continue;
3261 // This is set to true if we found a candidate for broadcast at Lane.
3262 bool FoundCandidate = false;
3263 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3264 OperandData &Data = getData(OpI, Ln);
3265 if (Data.APO != OpAPO || Data.IsUsed)
3266 continue;
3267 Value *OpILane = getValue(OpI, Lane);
3268 bool IsConstantOp = isa<Constant>(OpILane);
3269 // Consider the broadcast candidate if:
3270 // 1. Same value is found in one of the operands.
3271 if (Data.V == Op ||
3272 // 2. The operand in the given lane is not constant but there is a
3273 // constant operand in another lane (which can be moved to the
3274 // given lane). In this case we can represent it as a simple
3275 // permutation of constant and broadcast.
3276 (!IsConstantOp &&
3277 ((Lns > 2 && isa<Constant>(Data.V)) ||
3278 // 2.1. If we have only 2 lanes, we need to check that the value in
3279 // the next lane does not build the same opcode sequence.
3280 (Lns == 2 &&
3281 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3282 isa<Constant>(Data.V)))) ||
3283 // 3. The operand in the current lane is loop invariant (can be
3284 // hoisted out) and another operand is also a loop invariant
3285 // (though not a constant). In this case the whole vector can be
3286 // hoisted out.
3287 // FIXME: need to teach the cost model about this case for better
3288 // estimation.
3289 (IsInvariant && !isa<Constant>(Data.V) &&
3290 !getSameOpcode({Op, Data.V}, TLI) &&
3291 L->isLoopInvariant(Data.V))) {
3292 FoundCandidate = true;
3293 Data.IsUsed = Data.V == Op;
3294 if (Data.V == Op)
3295 ++Cnt;
3296 break;
3297 }
3298 }
3299 if (!FoundCandidate)
3300 return false;
3301 }
3302 return getNumLanes() == 2 || Cnt > 1;
3303 }
3304
3305 /// Checks if there is at least a single operand in lanes other than
3306 /// \p Lane that is compatible with the operand \p Op.
3307 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3308 assert(Op == getValue(OpIdx, Lane) &&
3309 "Op is expected to be getValue(OpIdx, Lane).");
3310 bool OpAPO = getData(OpIdx, Lane).APO;
3311 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3312 if (Ln == Lane)
3313 continue;
3314 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3315 const OperandData &Data = getData(OpI, Ln);
3316 if (Data.APO != OpAPO || Data.IsUsed)
3317 return true;
3318 Value *OpILn = getValue(OpI, Ln);
3319 return (L && L->isLoopInvariant(OpILn)) ||
3320 (getSameOpcode({Op, OpILn}, TLI) &&
3321 allSameBlock({Op, OpILn}));
3322 }))
3323 return true;
3324 }
3325 return false;
3326 }
3327
3328 public:
3329 /// Initialize with all the operands of the instruction vector \p RootVL.
3330 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3331 const InstructionsState &S, const BoUpSLP &R)
3332 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3333 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3334 // Append all the operands of RootVL.
3335 appendOperands(RootVL, Operands, S);
3336 }
3337
3338 /// \Returns a value vector with the operands across all lanes for the
3339 /// operand at \p OpIdx.
3340 ValueList getVL(unsigned OpIdx) const {
3341 ValueList OpVL(OpsVec[OpIdx].size());
3342 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3343 "Expected same num of lanes across all operands");
3344 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3345 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3346 return OpVL;
3347 }
3348
3349 // Performs operand reordering for 2 or more operands.
3350 // The original operands are in OrigOps[OpIdx][Lane].
3351 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3352 void reorder() {
3353 unsigned NumOperands = getNumOperands();
3354 unsigned NumLanes = getNumLanes();
3355 // Each operand has its own mode. We are using this mode to help us select
3356 // the instructions for each lane, so that they match best with the ones
3357 // we have selected so far.
3358 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3359
3360 // This is a greedy single-pass algorithm. We are going over each lane
3361 // once and deciding on the best order right away with no back-tracking.
3362 // However, in order to increase its effectiveness, we start with the lane
3363 // that has operands that can move the least. For example, given the
3364 // following lanes:
3365 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3366 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3367 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3368 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3369 // we will start at Lane 1, since the operands of the subtraction cannot
3370 // be reordered. Then we will visit the rest of the lanes in a circular
3371 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
3372
3373 // Find the first lane that we will start our search from.
3374 unsigned FirstLane = getBestLaneToStartReordering();
3375
3376 // Initialize the modes.
3377 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3378 Value *OpLane0 = getValue(OpIdx, FirstLane);
3379 // Keep track if we have instructions with all the same opcode on one
3380 // side.
3381 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3382 // Check if OpLane0 should be broadcast.
3383 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3384 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3385 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3386 else if (isa<LoadInst>(OpILane0))
3387 ReorderingModes[OpIdx] = ReorderingMode::Load;
3388 else
3389 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3390 } else if (isa<Constant>(OpLane0)) {
3391 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3392 } else if (isa<Argument>(OpLane0)) {
3393 // Our best hope is a Splat. It may save some cost in some cases.
3394 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3395 } else {
3396 llvm_unreachable("Unexpected value kind.");
3397 }
3398 }
3399
3400 // Check that we don't have the same operands. There is no need to reorder
3401 // if the operands are just a perfect or a shuffled diamond match. Do not
3402 // skip reordering for possible broadcasts or a non-power-of-2 number of
3403 // scalars (just for now).
3404 auto &&SkipReordering = [this]() {
3405 SmallPtrSet<Value *, 4> UniqueValues;
3406 ArrayRef<OperandData> Op0 = OpsVec.front();
3407 for (const OperandData &Data : Op0)
3408 UniqueValues.insert(Data.V);
3409 for (ArrayRef<OperandData> Op :
3410 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3411 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3412 return !UniqueValues.contains(Data.V);
3413 }))
3414 return false;
3415 }
3416 // TODO: Check if we can remove a check for non-power-2 number of
3417 // scalars after full support of non-power-2 vectorization.
3418 return UniqueValues.size() != 2 &&
3419 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3420 UniqueValues.size());
3421 };
3422
3423 // If the initial strategy fails for any of the operand indexes, then we
3424 // perform reordering again in a second pass. This helps avoid assigning
3425 // high priority to the failed strategy, and should improve reordering for
3426 // the non-failed operand indexes.
3427 for (int Pass = 0; Pass != 2; ++Pass) {
3428 // Check if there is no need to reorder the operands since they are a
3429 // perfect or a shuffled diamond match.
3430 // Need to do it to avoid extra external use cost counting for
3431 // shuffled matches, which may cause regressions.
3432 if (SkipReordering())
3433 break;
3434 // Skip the second pass if the first pass did not fail.
3435 bool StrategyFailed = false;
3436 // Mark all operand data as free to use.
3437 clearUsed();
3438 // We keep the original operand order for the FirstLane, so reorder the
3439 // rest of the lanes. We are visiting the nodes in a circular fashion,
3440 // using FirstLane as the center point and increasing the radius
3441 // distance.
3442 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3443 for (unsigned I = 0; I < NumOperands; ++I)
3444 MainAltOps[I].push_back(getData(I, FirstLane).V);
3445
3446 SmallBitVector UsedLanes(NumLanes);
3447 UsedLanes.set(FirstLane);
3448 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3449 // Visit the lane on the right and then the lane on the left.
3450 for (int Direction : {+1, -1}) {
3451 int Lane = FirstLane + Direction * Distance;
3452 if (Lane < 0 || Lane >= (int)NumLanes)
3453 continue;
3454 UsedLanes.set(Lane);
3455 int LastLane = Lane - Direction;
3456 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3457 "Out of bounds");
3458 // Look for a good match for each operand.
3459 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3460 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3461 std::optional<unsigned> BestIdx =
3462 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3463 MainAltOps[OpIdx], UsedLanes);
3464 // By not selecting a value, we allow the operands that follow to
3465 // select a better matching value. We will get a non-null value in
3466 // the next run of getBestOperand().
3467 if (BestIdx) {
3468 // Swap the current operand with the one returned by
3469 // getBestOperand().
3470 swap(OpIdx, *BestIdx, Lane);
3471 } else {
3472 // Enable the second pass.
3473 StrategyFailed = true;
3474 }
3475 // Try to get the alternate opcode and follow it during analysis.
3476 if (MainAltOps[OpIdx].size() != 2) {
3477 OperandData &AltOp = getData(OpIdx, Lane);
3478 InstructionsState OpS =
3479 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3480 if (OpS && OpS.isAltShuffle())
3481 MainAltOps[OpIdx].push_back(AltOp.V);
3482 }
3483 }
3484 }
3485 }
3486 // Skip second pass if the strategy did not fail.
3487 if (!StrategyFailed)
3488 break;
3489 }
3490 }
3491
3492#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3493 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3494 switch (RMode) {
3495 case ReorderingMode::Load:
3496 return "Load";
3497 case ReorderingMode::Opcode:
3498 return "Opcode";
3499 case ReorderingMode::Constant:
3500 return "Constant";
3501 case ReorderingMode::Splat:
3502 return "Splat";
3503 case ReorderingMode::Failed:
3504 return "Failed";
3505 }
3506 llvm_unreachable("Unimplemented Reordering Type");
3507 }
3508
3509 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3510 raw_ostream &OS) {
3511 return OS << getModeStr(RMode);
3512 }
3513
3514 /// Debug print.
3515 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3516 printMode(RMode, dbgs());
3517 }
3518
3519 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3520 return printMode(RMode, OS);
3521 }
3522
3523 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3524 const unsigned Indent = 2;
3525 unsigned Cnt = 0;
3526 for (const OperandDataVec &OpDataVec : OpsVec) {
3527 OS << "Operand " << Cnt++ << "\n";
3528 for (const OperandData &OpData : OpDataVec) {
3529 OS.indent(Indent) << "{";
3530 if (Value *V = OpData.V)
3531 OS << *V;
3532 else
3533 OS << "null";
3534 OS << ", APO:" << OpData.APO << "}\n";
3535 }
3536 OS << "\n";
3537 }
3538 return OS;
3539 }
3540
3541 /// Debug print.
3542 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3543#endif
3544 };
3545
3546 /// Evaluate each pair in \p Candidates and return the index into
3547 /// \p Candidates of the pair with the highest score, deemed to have the best
3548 /// chance to form the root of a profitable tree to vectorize. Return
3549 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
3550 /// \param Limit Lower limit of the cost, considered to be good enough score.
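  /// For example (illustrative, not from the original source): given the
  /// candidate pairs {(load A[0], load A[1]), (x, y)} with unrelated x and y,
  /// the pair of consecutive loads typically gets the higher look-ahead score
  /// and index 0 is returned.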
3551 std::optional<int>
3552 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3553 int Limit = LookAheadHeuristics::ScoreFail) const {
3554 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3555 RootLookAheadMaxDepth);
3556 int BestScore = Limit;
3557 std::optional<int> Index;
3558 for (int I : seq<int>(0, Candidates.size())) {
3559 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3560 Candidates[I].second,
3561 /*U1=*/nullptr, /*U2=*/nullptr,
3562 /*CurrLevel=*/1, {});
3563 if (Score > BestScore) {
3564 BestScore = Score;
3565 Index = I;
3566 }
3567 }
3568 return Index;
3569 }
3570
3571 /// Checks if the instruction is marked for deletion.
3572 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3573
3574 /// Removes an instruction from its block and eventually deletes it.
3575 /// It's like Instruction::eraseFromParent() except that the actual deletion
3576 /// is delayed until BoUpSLP is destructed.
3577 void eraseInstruction(Instruction *I) {
3578 DeletedInstructions.insert(I);
3579 }
3580
3581 /// Remove instructions from the parent function and clear the operands of \p
3582 /// DeadVals instructions, marking for deletion trivially dead operands.
3583 template <typename T>
3584 void removeInstructionsAndOperands(
3585 ArrayRef<T *> DeadVals,
3586 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3587 SmallVector<WeakTrackingVH> DeadInsts;
3588 for (T *V : DeadVals) {
3589 auto *I = cast<Instruction>(V);
3590 DeletedInstructions.insert(I);
3591 }
3592 DenseSet<Value *> Processed;
3593 for (T *V : DeadVals) {
3594 if (!V || !Processed.insert(V).second)
3595 continue;
3596 auto *I = cast<Instruction>(V);
3598 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3599 for (Use &U : I->operands()) {
3600 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3601 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3602 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3603 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3604 return Entry->VectorizedValue == OpI;
3605 })))
3606 DeadInsts.push_back(OpI);
3607 }
3608 I->dropAllReferences();
3609 }
3610 for (T *V : DeadVals) {
3611 auto *I = cast<Instruction>(V);
3612 if (!I->getParent())
3613 continue;
3614 assert((I->use_empty() || all_of(I->uses(),
3615 [&](Use &U) {
3616 return isDeleted(
3617 cast<Instruction>(U.getUser()));
3618 })) &&
3619 "trying to erase instruction with users.");
3620 I->removeFromParent();
3621 SE->forgetValue(I);
3622 }
3623 // Process the dead instruction list until empty.
3624 while (!DeadInsts.empty()) {
3625 Value *V = DeadInsts.pop_back_val();
3626 auto *VI = dyn_cast_or_null<Instruction>(V);
3627 if (!VI || !VI->getParent())
3628 continue;
3629 assert(isInstructionTriviallyDead(VI, TLI) &&
3630 "Live instruction found in dead worklist!");
3631 assert(VI->use_empty() && "Instructions with uses are not dead.");
3632
3633 // Don't lose the debug info while deleting the instructions.
3634 salvageDebugInfo(*VI);
3635
3636 // Null out all of the instruction's operands to see if any operand
3637 // becomes dead as we go.
3638 for (Use &OpU : VI->operands()) {
3639 Value *OpV = OpU.get();
3640 if (!OpV)
3641 continue;
3642 OpU.set(nullptr);
3643
3644 if (!OpV->use_empty())
3645 continue;
3646
3647 // If the operand is an instruction that became dead as we nulled out
3648 // the operand, and if it is 'trivially' dead, delete it in a future
3649 // loop iteration.
3650 if (auto *OpI = dyn_cast<Instruction>(OpV))
3651 if (!DeletedInstructions.contains(OpI) &&
3652 (!OpI->getType()->isVectorTy() ||
3653 none_of(VectorValuesAndScales,
3654 [&](const std::tuple<Value *, unsigned, bool> &V) {
3655 return std::get<0>(V) == OpI;
3656 })) &&
3657 isInstructionTriviallyDead(OpI, TLI))
3658 DeadInsts.push_back(OpI);
3659 }
3660
3661 VI->removeFromParent();
3662 eraseInstruction(VI);
3663 SE->forgetValue(VI);
3664 }
3665 }
3666
3667 /// Checks if the instruction was already analyzed for being possible
3668 /// reduction root.
3669 bool isAnalyzedReductionRoot(Instruction *I) const {
3670 return AnalyzedReductionsRoots.count(I);
3671 }
3672 /// Register given instruction as already analyzed for being possible
3673 /// reduction root.
3674 void analyzedReductionRoot(Instruction *I) {
3675 AnalyzedReductionsRoots.insert(I);
3676 }
3677 /// Checks if the provided list of reduced values was checked already for
3678 /// vectorization.
3679 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3680 return AnalyzedReductionVals.contains(hash_value(VL));
3681 }
3682 /// Adds the list of reduced values to list of already checked values for the
3683 /// vectorization.
3684 void analyzedReductionVals(ArrayRef<Value *> VL) {
3685 AnalyzedReductionVals.insert(hash_value(VL));
3686 }
3687 /// Clear the list of the analyzed reduction root instructions.
3688 void clearReductionData() {
3689 AnalyzedReductionsRoots.clear();
3690 AnalyzedReductionVals.clear();
3691 AnalyzedMinBWVals.clear();
3692 }
3693 /// Checks if the given value is gathered in one of the nodes.
3694 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3695 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3696 }
3697 /// Checks if the given value is gathered in one of the nodes.
3698 bool isGathered(const Value *V) const {
3699 return MustGather.contains(V);
3700 }
3701 /// Checks if the specified value was not scheduled.
3702 bool isNotScheduled(const Value *V) const {
3703 return NonScheduledFirst.contains(V);
3704 }
3705
3706 /// Check if the value is vectorized in the tree.
3707 bool isVectorized(const Value *V) const {
3708 assert(V && "V cannot be nullptr.");
3709 return ScalarToTreeEntries.contains(V);
3710 }
3711
3712 ~BoUpSLP();
3713
3714private:
3715 /// Determine if a node \p E can be demoted to a smaller type with a
3716 /// truncation. We collect the entries that will be demoted in ToDemote.
3717 /// \param E Node for analysis
3718 /// \param ToDemote indices of the nodes to be demoted.
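  /// For example (illustrative, not from the original source): a tree of i32
  /// additions whose results are only consumed through a truncation to i16 can
  /// be demoted and later vectorized on <N x i16>, halving the required vector
  /// register width.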
3719 bool collectValuesToDemote(
3720 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3721 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3722 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3723 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3724
3725 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3726 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3727 /// they have only one user and are reorderable).
3728 /// \param ReorderableGathers List of all gather nodes that require reordering
3729 /// (e.g., gather of extractelements or partially vectorizable loads).
3730 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3731 /// reordering, a subset of \p ReorderableGathers.
3732 void buildReorderableOperands(
3733 TreeEntry *UserTE,
3734 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3735 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3736 SmallVectorImpl<TreeEntry *> &GatherOps);
3737
3738 /// Checks if the given \p TE is a gather node with clustered reused scalars
3739 /// and reorders it per given \p Mask.
3740 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3741
3742 /// Checks if all users of \p I are the part of the vectorization tree.
3743 bool areAllUsersVectorized(
3744 Instruction *I,
3745 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3746
3747 /// Return information about the vector formed for the specified index
3748 /// of a vector of (the same) instruction.
3749 TargetTransformInfo::OperandValueInfo
3750 getOperandInfo(ArrayRef<Value *> Ops);
3751
3752 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3753 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3754 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3755 return const_cast<TreeEntry *>(
3756 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3757 }
3758
3759 /// Gets the root instruction for the given node. If the node is a strided
3760 /// load/store node with the reverse order, the root instruction is the last
3761 /// one.
3762 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3763
3764 /// \returns Cast context for the given graph node.
3765 TargetTransformInfo::CastContextHint
3766 getCastContextHint(const TreeEntry &TE) const;
3767
3768 /// \returns the cost of the vectorizable entry.
3769 InstructionCost getEntryCost(const TreeEntry *E,
3770 ArrayRef<Value *> VectorizedVals,
3771 SmallPtrSetImpl<Value *> &CheckedExtracts);
3772
3773 /// Checks if it is legal and profitable to build SplitVectorize node for the
3774 /// given \p VL.
3775 /// \param Op1 first homogeneous scalars.
3776 /// \param Op2 second homogeneous scalars.
3777 /// \param ReorderIndices indices to reorder the scalars.
3778 /// \returns true if the node was successfully built.
3779 bool canBuildSplitNode(ArrayRef<Value *> VL,
3780 const InstructionsState &LocalState,
3781 SmallVectorImpl<Value *> &Op1,
3782 SmallVectorImpl<Value *> &Op2,
3783 OrdersType &ReorderIndices) const;
3784
3785 /// This is the recursive part of buildTree.
3786 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3787 unsigned InterleaveFactor = 0);
3788
3789 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3790 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3791 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3792 /// returns false, setting \p CurrentOrder to either an empty vector or a
3793 /// non-identity permutation that allows reusing extract instructions.
3794 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3795 /// extract order.
3796 bool canReuseExtract(ArrayRef<Value *> VL,
3797 SmallVectorImpl<unsigned> &CurrentOrder,
3798 bool ResizeAllowed = false) const;
3799
3800 /// Vectorize a single entry in the tree.
3801 Value *vectorizeTree(TreeEntry *E);
3802
3803 /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the entry
3804 /// \p E.
3805 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3806
3807 /// Create a new vector from a list of scalar values. Produces a sequence
3808 /// which exploits values reused across lanes, and arranges the inserts
3809 /// for ease of later optimization.
3810 template <typename BVTy, typename ResTy, typename... Args>
3811 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3812
3813 /// Create a new vector from a list of scalar values. Produces a sequence
3814 /// which exploits values reused across lanes, and arranges the inserts
3815 /// for ease of later optimization.
3816 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3817
3818 /// Returns the instruction in the bundle, which can be used as a base point
3819 /// for scheduling. Usually it is the last instruction in the bundle, except
3820 /// for the case when all operands are external (in this case, it is the first
3821 /// instruction in the list).
3822 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3823
3824 /// Tries to find extractelement instructions with constant indices from fixed
3825 /// vector type and gather such instructions into a bunch that can very
3826 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3827 /// was successful, the matched scalars are replaced by poison values in \p VL
3828 /// for future analysis.
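  /// For example (illustrative, not from the original source): the scalars
  ///   { extractelement <4 x i32> %v, i32 0, extractelement <4 x i32> %v, i32 2 }
  /// can be covered by a single shuffle of %v, so both are gathered into one
  /// bunch and their slots in \p VL are replaced by poison.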
3829 std::optional<TargetTransformInfo::ShuffleKind>
3830 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3831 SmallVectorImpl<int> &Mask) const;
3832
3833 /// Tries to find extractelement instructions with constant indices from fixed
3834 /// vector type and gather such instructions into a bunch that can very
3835 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3836 /// was successful, the matched scalars are replaced by poison values in \p VL
3837 /// for future analysis.
3838 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3839 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3840 SmallVectorImpl<int> &Mask,
3841 unsigned NumParts) const;
3842
3843 /// Checks if the gathered \p VL can be represented as a single register
3844 /// shuffle(s) of previous tree entries.
3845 /// \param TE Tree entry checked for permutation.
3846 /// \param VL List of scalars (a subset of the TE scalar), checked for
3847 /// permutations. Must form single-register vector.
3848 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3849 /// commands to build the mask using the original vector value, without
3850 /// relying on the potential reordering.
3851 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3852 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3853 std::optional<TargetTransformInfo::ShuffleKind>
3854 isGatherShuffledSingleRegisterEntry(
3855 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3856 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3857 bool ForOrder);
3858
3859 /// Checks if the gathered \p VL can be represented as multi-register
3860 /// shuffle(s) of previous tree entries.
3861 /// \param TE Tree entry checked for permutation.
3862 /// \param VL List of scalars (a subset of the TE scalar), checked for
3863 /// permutations.
3864 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3865 /// commands to build the mask using the original vector value, without
3866 /// relying on the potential reordering.
3867 /// \returns per-register series of ShuffleKind, if gathered values can be
3868 /// represented as shuffles of previous tree entries. \p Mask is filled with
3869 /// the shuffle mask (also on per-register base).
3870 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3871 isGatherShuffledEntry(
3872 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3873 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3874 unsigned NumParts, bool ForOrder = false);
3875
3876 /// \returns the cost of gathering (inserting) the values in \p VL into a
3877 /// vector.
3878 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3879 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3880 Type *ScalarTy) const;
3881
3882 /// Set the Builder insert point to one after the last instruction in
3883 /// the bundle
3884 void setInsertPointAfterBundle(const TreeEntry *E);
3885
3886 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3887 /// specified, the starting vector value is poison.
3888 Value *
3889 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3890 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3891
3892 /// \returns whether the VectorizableTree is fully vectorizable and will
3893 /// be beneficial even the tree height is tiny.
3894 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3895
3896 /// Run through the list of all gathered loads in the graph and try to find
3897 /// vector loads/masked gathers instead of regular gathers. Later these loads
3898 /// are reshuffled to build the final gathered nodes.
3899 void tryToVectorizeGatheredLoads(
3900 const SmallMapVector<
3901 std::tuple<BasicBlock *, Value *, Type *>,
3902 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3903 &GatheredLoads);
3904
3905 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3906 /// users of \p TE and collects the stores. It returns the map from the store
3907 /// pointers to the collected stores.
3908 DenseMap<Value *, SmallVector<StoreInst *>>
3909 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3910
3911 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3912 /// stores in \p StoresVec can form a vector instruction. If so it returns
3913 /// true and populates \p ReorderIndices with the shuffle indices of the
3914 /// stores when compared to the sorted vector.
3915 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3916 OrdersType &ReorderIndices) const;
3917
3918 /// Iterates through the users of \p TE, looking for scalar stores that can be
3919 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3920 /// their order and builds an order index vector for each store bundle. It
3921 /// returns all these order vectors found.
3922 /// We run this after the tree has formed, otherwise we may come across user
3923 /// instructions that are not yet in the tree.
3924 SmallVector<OrdersType, 1>
3925 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3926
3927 /// Tries to reorder the gathering node for better vectorization
3928 /// opportunities.
3929 void reorderGatherNode(TreeEntry &TE);
3930
3931 /// Checks if the tree represents a disjoint-or reduction of an
3932 /// shl(zext, (0, 8, .., 56))-like pattern.
3933 /// If the integer shifts are unique and strided, but not ordered, sets \p Order.
3934 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
3935 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
3936 bool &IsBSwap) const;
3937
3938 class TreeEntry {
3939 public:
3940 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3941 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3942
3943 /// \returns Common mask for reorder indices and reused scalars.
3944 SmallVector<int> getCommonMask() const {
3945 if (State == TreeEntry::SplitVectorize)
3946 return {};
3947 SmallVector<int> Mask;
3948 inversePermutation(ReorderIndices, Mask);
3949 ::addMask(Mask, ReuseShuffleIndices);
3950 return Mask;
3951 }
3952
3953 /// \returns The mask for split nodes.
3954 SmallVector<int> getSplitMask() const {
3955 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3956 "Expected only split vectorize node.");
3957 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3958 unsigned CommonVF = std::max<unsigned>(
3959 CombinedEntriesWithIndices.back().second,
3960 Scalars.size() - CombinedEntriesWithIndices.back().second);
3961 for (auto [Idx, I] : enumerate(ReorderIndices))
3962 Mask[I] =
3963 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3964 ? CommonVF - CombinedEntriesWithIndices.back().second
3965 : 0);
3966 return Mask;
3967 }
3968
3969 /// Updates (reorders) SplitVectorize node according to the given mask \p
3970 /// Mask and order \p MaskOrder.
3971 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3972 ArrayRef<int> MaskOrder);
3973
3974 /// \returns true if the scalars in VL are equal to this entry.
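    /// For example (illustrative, not from the original source): an entry with
    /// Scalars == {a, b} and ReuseShuffleIndices == {0, 1, 0, 1} reports
    /// isSame({a, b, a, b}) == true, since every element of VL maps onto
    /// Scalars through the reuse mask.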
3975 bool isSame(ArrayRef<Value *> VL) const {
3976 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3977 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3978 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3979 return VL.size() == Mask.size() &&
3980 std::equal(VL.begin(), VL.end(), Mask.begin(),
3981 [Scalars](Value *V, int Idx) {
3982 return (isa<UndefValue>(V) &&
3983 Idx == PoisonMaskElem) ||
3984 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3985 });
3986 };
3987 if (!ReorderIndices.empty()) {
3988 // TODO: implement matching if the nodes are just reordered, still can
3989 // treat the vector as the same if the list of scalars matches VL
3990 // directly, without reordering.
3991 SmallVector<int> Mask;
3992 inversePermutation(ReorderIndices, Mask);
3993 if (VL.size() == Scalars.size())
3994 return IsSame(Scalars, Mask);
3995 if (VL.size() == ReuseShuffleIndices.size()) {
3996 ::addMask(Mask, ReuseShuffleIndices);
3997 return IsSame(Scalars, Mask);
3998 }
3999 return false;
4000 }
4001 return IsSame(Scalars, ReuseShuffleIndices);
4002 }
4003
4004 /// \returns true if current entry has same operands as \p TE.
4005 bool hasEqualOperands(const TreeEntry &TE) const {
4006 if (TE.getNumOperands() != getNumOperands())
4007 return false;
4008 SmallBitVector Used(getNumOperands());
4009 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
4010 unsigned PrevCount = Used.count();
4011 for (unsigned K = 0; K < E; ++K) {
4012 if (Used.test(K))
4013 continue;
4014 if (getOperand(K) == TE.getOperand(I)) {
4015 Used.set(K);
4016 break;
4017 }
4018 }
4019 // Check if we actually found the matching operand.
4020 if (PrevCount == Used.count())
4021 return false;
4022 }
4023 return true;
4024 }
4025
4026 /// \return Final vectorization factor for the node. Defined by the total
4027 /// number of vectorized scalars, including those, used several times in the
4028 /// entry and counted in the \a ReuseShuffleIndices, if any.
4029 unsigned getVectorFactor() const {
4030 if (!ReuseShuffleIndices.empty())
4031 return ReuseShuffleIndices.size();
4032 return Scalars.size();
4033 };
4034
4035 /// Checks if the current node is a gather node.
4036 bool isGather() const { return State == NeedToGather; }
4037
4038 /// A vector of scalars.
4039 ValueList Scalars;
4040
4041 /// The Scalars are vectorized into this value. It is initialized to Null.
4042 WeakTrackingVH VectorizedValue = nullptr;
4043
4044 /// Do we need to gather this sequence or vectorize it
4045 /// (either with vector instruction or with scatter/gather
4046 /// intrinsics for store/load)?
4047 enum EntryState {
4048 Vectorize, ///< The node is regularly vectorized.
4049 ScatterVectorize, ///< Masked scatter/gather node.
4050 StridedVectorize, ///< Strided loads (and stores)
4051 CompressVectorize, ///< (Masked) load with compress.
4052 NeedToGather, ///< Gather/buildvector node.
4053 CombinedVectorize, ///< Vectorized node, combined with its user into more
4054 ///< complex node like select/cmp to minmax, mul/add to
4055 ///< fma, etc. Must be used for the following nodes in
4056 ///< the pattern, not the very first one.
4057 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4058 ///< independently and then combines back.
4059 };
4060 EntryState State;
4061
4062 /// List of combined opcodes supported by the vectorizer.
4063 enum CombinedOpcode {
4064 NotCombinedOp = -1,
4065 MinMax = Instruction::OtherOpsEnd + 1,
4066 FMulAdd,
4067 ReducedBitcast,
4068 ReducedBitcastBSwap,
4069 };
4070 CombinedOpcode CombinedOp = NotCombinedOp;
4071
4072 /// Does this sequence require some shuffling?
4073 SmallVector<int, 4> ReuseShuffleIndices;
4074
4075 /// Does this entry require reordering?
4076 SmallVector<unsigned, 4> ReorderIndices;
4077
4078 /// Points back to the VectorizableTree.
4079 ///
4080 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4081 /// to be a pointer and needs to be able to initialize the child iterator.
4082 /// Thus we need a reference back to the container to translate the indices
4083 /// to entries.
4084 VecTreeTy &Container;
4085
4086 /// The TreeEntry index containing the user of this entry.
4087 EdgeInfo UserTreeIndex;
4088
4089 /// The index of this treeEntry in VectorizableTree.
4090 unsigned Idx = 0;
4091
4092 /// For gather/buildvector/alt opcode nodes, which are combined from
4093 /// other nodes as a series of insertvector instructions.
4094 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4095
4096 private:
4097 /// The operands of each instruction in each lane Operands[op_index][lane].
4098 /// Note: This helps avoid the replication of the code that performs the
4099 /// reordering of operands during buildTreeRec() and vectorizeTree().
4100 SmallVector<ValueList, 2> Operands;
4101
4102 /// Copyable elements of the entry node.
4103 SmallPtrSet<const Value *, 4> CopyableElements;
4104
4105 /// MainOp and AltOp are recorded inside. S should be obtained from
4106 /// newTreeEntry.
4107 InstructionsState S = InstructionsState::invalid();
4108
4109 /// Interleaving factor for interleaved loads Vectorize nodes.
4110 unsigned InterleaveFactor = 0;
4111
4112 /// True if the node does not require scheduling.
4113 bool DoesNotNeedToSchedule = false;
4114
4115 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4116 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4117 if (Operands.size() < OpIdx + 1)
4118 Operands.resize(OpIdx + 1);
4119 assert(Operands[OpIdx].empty() && "Already resized?");
4120 assert(OpVL.size() <= Scalars.size() &&
4121 "Number of operands is greater than the number of scalars.");
4122 Operands[OpIdx].resize(OpVL.size());
4123 copy(OpVL, Operands[OpIdx].begin());
4124 }
4125
4126 public:
4127 /// Returns interleave factor for interleave nodes.
4128 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4129 /// Sets interleaving factor for the interleaving nodes.
4130 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4131
4132 /// Marks the node as one that does not require scheduling.
4133 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4134 /// Returns true if the node is marked as one that does not require
4135 /// scheduling.
4136 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4137
4138 /// Set this bundle's operands from \p Operands.
4139 void setOperands(ArrayRef<ValueList> Operands) {
4140 for (unsigned I : seq<unsigned>(Operands.size()))
4141 setOperand(I, Operands[I]);
4142 }
4143
4144 /// Reorders operands of the node to the given mask \p Mask.
4145 void reorderOperands(ArrayRef<int> Mask) {
4146 for (ValueList &Operand : Operands)
4147 reorderScalars(Operand, Mask);
4148 }
4149
4150 /// \returns the \p OpIdx operand of this TreeEntry.
4151 ValueList &getOperand(unsigned OpIdx) {
4152 assert(OpIdx < Operands.size() && "Off bounds");
4153 return Operands[OpIdx];
4154 }
4155
4156 /// \returns the \p OpIdx operand of this TreeEntry.
4157 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4158 assert(OpIdx < Operands.size() && "Off bounds");
4159 return Operands[OpIdx];
4160 }
4161
4162 /// \returns the number of operands.
4163 unsigned getNumOperands() const { return Operands.size(); }
4164
4165 /// \return the single \p OpIdx operand.
4166 Value *getSingleOperand(unsigned OpIdx) const {
4167 assert(OpIdx < Operands.size() && "Off bounds");
4168 assert(!Operands[OpIdx].empty() && "No operand available");
4169 return Operands[OpIdx][0];
4170 }
4171
4172 /// Some of the instructions in the list have alternate opcodes.
4173 bool isAltShuffle() const { return S.isAltShuffle(); }
4174
4175 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4176 return S.getMatchingMainOpOrAltOp(I);
4177 }
4178
4179 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4180 /// alternate) opcode as the main operation of this entry, the key is \p Op.
4181 /// Otherwise the key is the main operation.
4182 Value *isOneOf(Value *Op) const {
4183 auto *I = dyn_cast<Instruction>(Op);
4184 if (I && getMatchingMainOpOrAltOp(I))
4185 return Op;
4186 return S.getMainOp();
4187 }
4188
4189 void setOperations(const InstructionsState &S) {
4190 assert(S && "InstructionsState is invalid.");
4191 this->S = S;
4192 }
4193
4194 Instruction *getMainOp() const { return S.getMainOp(); }
4195
4196 Instruction *getAltOp() const { return S.getAltOp(); }
4197
4198 /// The main/alternate opcodes for the list of instructions.
4199 unsigned getOpcode() const { return S.getOpcode(); }
4200
4201 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4202
4203 bool hasState() const { return S.valid(); }
4204
4205 /// Add \p V to the list of copyable elements.
4206 void addCopyableElement(Value *V) {
4207 assert(S.isCopyableElement(V) && "Not a copyable element.");
4208 CopyableElements.insert(V);
4209 }
4210
4211 /// Returns true if \p V is a copyable element.
4212 bool isCopyableElement(Value *V) const {
4213 return CopyableElements.contains(V);
4214 }
4215
4216 /// Returns true if any scalar in the list is a copyable element.
4217 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4218
4219 /// Returns the state of the operations.
4220 const InstructionsState &getOperations() const { return S; }
4221
4222 /// When ReuseShuffleIndices is empty, this just returns the position of \p V
4223 /// within the vector of Scalars. Otherwise, it remaps it via its reuse index.
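    /// For example (illustrative, not from the original source): with
    /// Scalars == {a, b} and ReuseShuffleIndices == {1, 0, 1, 0},
    /// findLaneForValue(b) returns 0, the first reused position that refers to
    /// lane 1 of Scalars.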
4224 unsigned findLaneForValue(Value *V) const {
4225 unsigned FoundLane = getVectorFactor();
4226 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4227 std::advance(It, 1)) {
4228 if (*It != V)
4229 continue;
4230 FoundLane = std::distance(Scalars.begin(), It);
4231 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4232 if (!ReorderIndices.empty())
4233 FoundLane = ReorderIndices[FoundLane];
4234 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4235 if (ReuseShuffleIndices.empty())
4236 break;
4237 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4238 RIt != ReuseShuffleIndices.end()) {
4239 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4240 break;
4241 }
4242 }
4243 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4244 return FoundLane;
4245 }
4246
4247 /// Build a shuffle mask for graph entry which represents a merge of main
4248 /// and alternate operations.
4249 void
4250 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4251 SmallVectorImpl<int> &Mask,
4252 SmallVectorImpl<Value *> *OpScalars = nullptr,
4253 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4254
4255 /// Return true if this is a non-power-of-2 node.
4256 bool isNonPowOf2Vec() const {
4257 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4258 return IsNonPowerOf2;
4259 }
4260
4261 /// Return true if this is a node, which tries to vectorize number of
4262 /// elements, forming whole vectors.
4263 bool
4264 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4265 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4266 TTI, getValueType(Scalars.front()), Scalars.size());
4267 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4268 "Reshuffling not supported with non-power-of-2 vectors yet.");
4269 return IsNonPowerOf2;
4270 }
4271
4272 Value *getOrdered(unsigned Idx) const {
4273 if (ReorderIndices.empty())
4274 return Scalars[Idx];
4275 SmallVector<int> Mask;
4276 inversePermutation(ReorderIndices, Mask);
4277 return Scalars[Mask[Idx]];
4278 }
4279
4280#ifndef NDEBUG
4281 /// Debug printer.
4282 LLVM_DUMP_METHOD void dump() const {
4283 dbgs() << Idx << ".\n";
4284 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4285 dbgs() << "Operand " << OpI << ":\n";
4286 for (const Value *V : Operands[OpI])
4287 dbgs().indent(2) << *V << "\n";
4288 }
4289 dbgs() << "Scalars: \n";
4290 for (Value *V : Scalars)
4291 dbgs().indent(2) << *V << "\n";
4292 dbgs() << "State: ";
4293 if (S && hasCopyableElements())
4294 dbgs() << "[[Copyable]] ";
4295 switch (State) {
4296 case Vectorize:
4297 if (InterleaveFactor > 0) {
4298 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4299 << "\n";
4300 } else {
4301 dbgs() << "Vectorize\n";
4302 }
4303 break;
4304 case ScatterVectorize:
4305 dbgs() << "ScatterVectorize\n";
4306 break;
4307 case StridedVectorize:
4308 dbgs() << "StridedVectorize\n";
4309 break;
4310 case CompressVectorize:
4311 dbgs() << "CompressVectorize\n";
4312 break;
4313 case NeedToGather:
4314 dbgs() << "NeedToGather\n";
4315 break;
4316 case CombinedVectorize:
4317 dbgs() << "CombinedVectorize\n";
4318 break;
4319 case SplitVectorize:
4320 dbgs() << "SplitVectorize\n";
4321 break;
4322 }
4323 if (S) {
4324 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4325 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4326 } else {
4327 dbgs() << "MainOp: NULL\n";
4328 dbgs() << "AltOp: NULL\n";
4329 }
4330 dbgs() << "VectorizedValue: ";
4331 if (VectorizedValue)
4332 dbgs() << *VectorizedValue << "\n";
4333 else
4334 dbgs() << "NULL\n";
4335 dbgs() << "ReuseShuffleIndices: ";
4336 if (ReuseShuffleIndices.empty())
4337 dbgs() << "Empty";
4338 else
4339 for (int ReuseIdx : ReuseShuffleIndices)
4340 dbgs() << ReuseIdx << ", ";
4341 dbgs() << "\n";
4342 dbgs() << "ReorderIndices: ";
4343 for (unsigned ReorderIdx : ReorderIndices)
4344 dbgs() << ReorderIdx << ", ";
4345 dbgs() << "\n";
4346 dbgs() << "UserTreeIndex: ";
4347 if (UserTreeIndex)
4348 dbgs() << UserTreeIndex;
4349 else
4350 dbgs() << "<invalid>";
4351 dbgs() << "\n";
4352 if (!CombinedEntriesWithIndices.empty()) {
4353 dbgs() << "Combined entries: ";
4354 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4355 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4356 });
4357 dbgs() << "\n";
4358 }
4359 }
4360#endif
4361 };
4362
4363#ifndef NDEBUG
4364 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4365 InstructionCost VecCost, InstructionCost ScalarCost,
4366 StringRef Banner) const {
4367 dbgs() << "SLP: " << Banner << ":\n";
4368 E->dump();
4369 dbgs() << "SLP: Costs:\n";
4370 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4371 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4372 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4373 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4374 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4375 }
4376#endif
4377
4378 /// Create a new gather TreeEntry
4379 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4380 const InstructionsState &S,
4381 const EdgeInfo &UserTreeIdx,
4382 ArrayRef<int> ReuseShuffleIndices = {}) {
4383 auto Invalid = ScheduleBundle::invalid();
4384 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4385 }
4386
4387 /// Create a new VectorizableTree entry.
4388 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4389 const InstructionsState &S,
4390 const EdgeInfo &UserTreeIdx,
4391 ArrayRef<int> ReuseShuffleIndices = {},
4392 ArrayRef<unsigned> ReorderIndices = {},
4393 unsigned InterleaveFactor = 0) {
4394 TreeEntry::EntryState EntryState =
4395 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4396 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4397 ReuseShuffleIndices, ReorderIndices);
4398 if (E && InterleaveFactor > 0)
4399 E->setInterleave(InterleaveFactor);
4400 return E;
4401 }
4402
4403 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4404 TreeEntry::EntryState EntryState,
4405 ScheduleBundle &Bundle, const InstructionsState &S,
4406 const EdgeInfo &UserTreeIdx,
4407 ArrayRef<int> ReuseShuffleIndices = {},
4408 ArrayRef<unsigned> ReorderIndices = {}) {
4409 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4410 EntryState == TreeEntry::SplitVectorize)) ||
4411 (Bundle && EntryState != TreeEntry::NeedToGather &&
4412 EntryState != TreeEntry::SplitVectorize)) &&
4413 "Need to vectorize gather entry?");
4414 // Gathered loads still gathered? Do not create entry, use the original one.
4415 if (GatheredLoadsEntriesFirst.has_value() &&
4416 EntryState == TreeEntry::NeedToGather && S &&
4417 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4418 !UserTreeIdx.UserTE)
4419 return nullptr;
4420 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4421 TreeEntry *Last = VectorizableTree.back().get();
4422 Last->Idx = VectorizableTree.size() - 1;
4423 Last->State = EntryState;
4424 if (UserTreeIdx.UserTE)
4425 OperandsToTreeEntry.try_emplace(
4426 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4427 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4428 // for non-power-of-two vectors.
4429 assert(
4430 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4431 ReuseShuffleIndices.empty()) &&
4432 "Reshuffling scalars not yet supported for nodes with padding");
4433 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4434 ReuseShuffleIndices.end());
4435 if (ReorderIndices.empty()) {
4436 Last->Scalars.assign(VL.begin(), VL.end());
4437 if (S)
4438 Last->setOperations(S);
4439 } else {
4440 // Reorder scalars and build final mask.
4441 Last->Scalars.assign(VL.size(), nullptr);
4442 transform(ReorderIndices, Last->Scalars.begin(),
4443 [VL](unsigned Idx) -> Value * {
4444 if (Idx >= VL.size())
4445 return UndefValue::get(VL.front()->getType());
4446 return VL[Idx];
4447 });
4448 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4449 if (S)
4450 Last->setOperations(S);
4451 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4452 }
4453 if (EntryState == TreeEntry::SplitVectorize) {
4454 assert(S && "Split nodes must have operations.");
4455 Last->setOperations(S);
4456 SmallPtrSet<Value *, 4> Processed;
4457 for (Value *V : VL) {
4458 auto *I = dyn_cast<Instruction>(V);
4459 if (!I)
4460 continue;
4461 auto It = ScalarsInSplitNodes.find(V);
4462 if (It == ScalarsInSplitNodes.end()) {
4463 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4464 (void)Processed.insert(V);
4465 } else if (Processed.insert(V).second) {
4466 assert(!is_contained(It->getSecond(), Last) &&
4467 "Value already associated with the node.");
4468 It->getSecond().push_back(Last);
4469 }
4470 }
4471 } else if (!Last->isGather()) {
4472 if (isa<PHINode>(S.getMainOp()) ||
4473 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4474 (!S.areInstructionsWithCopyableElements() &&
4475 doesNotNeedToSchedule(VL)) ||
4476 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4477 Last->setDoesNotNeedToSchedule();
4478 SmallPtrSet<Value *, 4> Processed;
4479 for (Value *V : VL) {
4480 if (isa<PoisonValue>(V))
4481 continue;
4482 if (S.isCopyableElement(V)) {
4483 Last->addCopyableElement(V);
4484 continue;
4485 }
4486 auto It = ScalarToTreeEntries.find(V);
4487 if (It == ScalarToTreeEntries.end()) {
4488 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4489 (void)Processed.insert(V);
4490 } else if (Processed.insert(V).second) {
4491 assert(!is_contained(It->getSecond(), Last) &&
4492 "Value already associated with the node.");
4493 It->getSecond().push_back(Last);
4494 }
4495 }
4496 // Update the scheduler bundle to point to this TreeEntry.
4497 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4498 "Bundle and VL out of sync");
4499 if (!Bundle.getBundle().empty()) {
4500#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4501 auto *BundleMember = Bundle.getBundle().begin();
4502 SmallPtrSet<Value *, 4> Processed;
4503 for (Value *V : VL) {
4504 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4505 continue;
4506 ++BundleMember;
4507 }
4508 assert(BundleMember == Bundle.getBundle().end() &&
4509 "Bundle and VL out of sync");
4510#endif
4511 Bundle.setTreeEntry(Last);
4512 }
4513 } else {
4514 // Build a map for gathered scalars to the nodes where they are used.
4515 bool AllConstsOrCasts = true;
4516 for (Value *V : VL) {
4517 if (S && S.areInstructionsWithCopyableElements() &&
4518 S.isCopyableElement(V))
4519 Last->addCopyableElement(V);
4520 if (!isConstant(V)) {
4521 auto *I = dyn_cast<CastInst>(V);
4522 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4523 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4524 !UserTreeIdx.UserTE->isGather())
4525 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4526 }
4527 }
4528 if (AllConstsOrCasts)
4529 CastMaxMinBWSizes =
4530 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4531 MustGather.insert_range(VL);
4532 }
4533
4534 if (UserTreeIdx.UserTE)
4535 Last->UserTreeIndex = UserTreeIdx;
4536 return Last;
4537 }
4538
4539 /// -- Vectorization State --
4540 /// Holds all of the tree entries.
4541 TreeEntry::VecTreeTy VectorizableTree;
4542
4543#ifndef NDEBUG
4544 /// Debug printer.
4545 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4546 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4547 VectorizableTree[Id]->dump();
4548 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4549 dbgs() << "[[TRANSFORMED TO GATHER]]";
4550 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4551 dbgs() << "[[DELETED NODE]]";
4552 dbgs() << "\n";
4553 }
4554 }
4555#endif
4556
4557 /// Get list of vector entries, associated with the value \p V.
4558 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4559 assert(V && "V cannot be nullptr.");
4560 auto It = ScalarToTreeEntries.find(V);
4561 if (It == ScalarToTreeEntries.end())
4562 return {};
4563 return It->getSecond();
4564 }
4565
4566 /// Get list of split vector entries, associated with the value \p V.
4567 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4568 assert(V && "V cannot be nullptr.");
4569 auto It = ScalarsInSplitNodes.find(V);
4570 if (It == ScalarsInSplitNodes.end())
4571 return {};
4572 return It->getSecond();
4573 }
4574
4575 /// Returns first vector node for value \p V, matching values \p VL.
4576 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4577 bool SameVF = false) const {
4578 assert(V && "V cannot be nullptr.");
4579 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4580 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4581 return TE;
4582 return nullptr;
4583 }
4584
 4585 /// Check that the operand node of the alternate node does not generate a
 4586 /// buildvector sequence. If it does, it is likely not worth building an
 4587 /// alternate shuffle, when the number of buildvector operands plus the
 4588 /// alternate instruction exceeds the number of buildvector instructions.
4589 /// \param S the instructions state of the analyzed values.
4590 /// \param VL list of the instructions with alternate opcodes.
4591 bool areAltOperandsProfitable(const InstructionsState &S,
4592 ArrayRef<Value *> VL) const;
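  // A worked instance of the heuristic above (illustrative numbers only): if
  // an alternate node would require 2 operand buildvectors plus 1 alternate
  // shuffle, while gathering the scalars directly needs only 2 buildvector
  // sequences, then 2 + 1 > 2 and the alternate vectorization is rejected.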
4593
4594 /// Contains all the outputs of legality analysis for a list of values to
4595 /// vectorize.
4596 class ScalarsVectorizationLegality {
4597 InstructionsState S;
4598 bool IsLegal;
4599 bool TryToFindDuplicates;
4600 bool TrySplitVectorize;
4601
4602 public:
4603 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4604 bool TryToFindDuplicates = true,
4605 bool TrySplitVectorize = false)
4606 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4607 TrySplitVectorize(TrySplitVectorize) {
4608 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4609 "Inconsistent state");
4610 }
4611 const InstructionsState &getInstructionsState() const { return S; };
4612 bool isLegal() const { return IsLegal; }
4613 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4614 bool trySplitVectorize() const { return TrySplitVectorize; }
4615 };
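  // Minimal usage sketch (hypothetical control flow, for illustration only):
  //   ScalarsVectorizationLegality SVL = getScalarsVectorizationLegality(
  //       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  //   if (!SVL.isLegal()) {
  //     if (SVL.trySplitVectorize())
  //       ; // try to build a SplitVectorize node instead
  //     else if (SVL.tryToFindDuplicates())
  //       ; // try de-duplicating the scalars before gathering
  //   }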
4616
4617 /// Checks if the specified list of the instructions/values can be vectorized
4618 /// in general.
4619 ScalarsVectorizationLegality
4620 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4621 const EdgeInfo &UserTreeIdx,
4622 bool TryCopyableElementsVectorization) const;
4623
4624 /// Checks if the specified list of the instructions/values can be vectorized
4625 /// and fills required data before actual scheduling of the instructions.
4626 TreeEntry::EntryState getScalarsVectorizationState(
4627 const InstructionsState &S, ArrayRef<Value *> VL,
4628 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4629 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4630
4631 /// Maps a specific scalar to its tree entry(ies).
4632 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4633
4634 /// List of deleted non-profitable nodes.
4635 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4636
 4637 /// List of nodes transformed into gather nodes, with their conservative
4638 /// gather/buildvector cost estimation.
4639 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4640
4641 /// Maps the operand index and entry to the corresponding tree entry.
4642 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4643 OperandsToTreeEntry;
4644
4645 /// Scalars, used in split vectorize nodes.
4646 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4647
4648 /// Maps a value to the proposed vectorizable size.
4649 SmallDenseMap<Value *, unsigned> InstrElementSize;
4650
4651 /// A list of scalars that we found that we need to keep as scalars.
4652 ValueSet MustGather;
4653
4654 /// A set of first non-schedulable values.
4655 ValueSet NonScheduledFirst;
4656
4657 /// A map between the vectorized entries and the last instructions in the
4658 /// bundles. The bundles are built in use order, not in the def order of the
 4659 /// instructions, so we cannot rely directly on the last instruction in the
 4660 /// bundle being the last instruction in program order during the
 4661 /// vectorization process, since the basic blocks are modified; these
 4662 /// instructions need to be pre-gathered beforehand.
4663 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4664
4665 /// Keeps the mapping between the last instructions and their insertion
 4666 /// points, i.e. the instruction immediately after the last instruction.
4667 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4668
 4669 /// List of gather nodes that depend on other gather/vector nodes and should
 4670 /// be emitted after the vector instruction emission process to correctly
 4671 /// handle the order of the vector instructions and shuffles.
4672 SetVector<const TreeEntry *> PostponedGathers;
4673
4674 using ValueToGatherNodesMap =
4675 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4676 ValueToGatherNodesMap ValueToGatherNodes;
4677
4678 /// A list of the load entries (node indices), which can be vectorized using
 4679 /// a strided or masked gather approach, but which we attempt to represent as
4680 /// contiguous loads.
4681 SetVector<unsigned> LoadEntriesToVectorize;
4682
 4683 /// True if the graph-node transformation mode is on.
4684 bool IsGraphTransformMode = false;
4685
4686 /// The index of the first gathered load entry in the VectorizeTree.
4687 std::optional<unsigned> GatheredLoadsEntriesFirst;
4688
4689 /// Maps compress entries to their mask data for the final codegen.
4690 SmallDenseMap<const TreeEntry *,
4691 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4692 CompressEntryToData;
4693
4694 /// This POD struct describes one external user in the vectorized tree.
4695 struct ExternalUser {
4696 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4697 : Scalar(S), User(U), E(E), Lane(L) {}
4698
4699 /// Which scalar in our function.
4700 Value *Scalar = nullptr;
4701
 4702 /// The user that uses the scalar.
4703 llvm::User *User = nullptr;
4704
 4705 /// The vector node the value is part of.
4706 const TreeEntry &E;
4707
 4708 /// The lane the scalar belongs to.
4709 unsigned Lane;
4710 };
4711 using UserList = SmallVector<ExternalUser, 16>;
4712
4713 /// Checks if two instructions may access the same memory.
4714 ///
4715 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4716 /// is invariant in the calling loop.
4717 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4718 Instruction *Inst2) {
4719 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4720 // First check if the result is already in the cache.
4721 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4722 auto Res = AliasCache.try_emplace(Key);
4723 if (!Res.second)
4724 return Res.first->second;
4725 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4726 // Store the result in the cache.
4727 Res.first->getSecond() = Aliased;
4728 return Aliased;
4729 }
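  // Usage sketch (hypothetical instructions): \p Loc1 must describe \p Inst1,
  // which lets a caller hoist the MemoryLocation computation out of its loop:
  //   MemoryLocation Loc1 = MemoryLocation::get(Store0);
  //   for (Instruction *Other : MemInstsInRegion) // hypothetical container
  //     if (isAliased(Loc1, Store0, Other))
  //       ...; // record a memory dependency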
4730
4731 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4732
4733 /// Cache for alias results.
4734 /// TODO: consider moving this to the AliasAnalysis itself.
4735 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4736
4737 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4738 // globally through SLP because we don't perform any action which
4739 // invalidates capture results.
4740 BatchAAResults BatchAA;
4741
4742 /// Temporary store for deleted instructions. Instructions will be deleted
4743 /// eventually when the BoUpSLP is destructed. The deferral is required to
4744 /// ensure that there are no incorrect collisions in the AliasCache, which
4745 /// can happen if a new instruction is allocated at the same address as a
4746 /// previously deleted instruction.
4747 DenseSet<Instruction *> DeletedInstructions;
4748
 4749 /// Set of the instructions already analyzed for reductions.
4750 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4751
4752 /// Set of hashes for the list of reduction values already being analyzed.
4753 DenseSet<size_t> AnalyzedReductionVals;
4754
 4755 /// Values already analyzed for minimal bitwidth and found to be
4756 /// non-profitable.
4757 DenseSet<Value *> AnalyzedMinBWVals;
4758
 4759 /// A list of values that need to be extracted out of the tree.
4760 /// This list holds pairs of (Internal Scalar : External User). External User
4761 /// can be nullptr, it means that this Internal Scalar will be used later,
4762 /// after vectorization.
4763 UserList ExternalUses;
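  // For illustration (hypothetical values): an external use is recorded when
  // a vectorized scalar is still needed outside of the tree, e.g.
  //   ExternalUses.emplace_back(Scalar, UserInst, *Entry, Lane);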
4764
 4765 /// A list of GEPs which can be replaced by scalar GEPs instead of
4766 /// extractelement instructions.
4767 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4768
 4769 /// A list of scalars to be extracted without a specific user because of too many
4770 /// uses.
4771 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4772
4773 /// Values used only by @llvm.assume calls.
4774 SmallPtrSet<const Value *, 32> EphValues;
4775
4776 /// Holds all of the instructions that we gathered, shuffle instructions and
4777 /// extractelements.
4778 SetVector<Instruction *> GatherShuffleExtractSeq;
4779
4780 /// A list of blocks that we are going to CSE.
4781 DenseSet<BasicBlock *> CSEBlocks;
4782
 4783 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4784 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4785
 4786 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
 4787 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a single
 4788 /// instruction, while ScheduleBundle represents a batch of instructions that
 4789 /// are going to be grouped together. ScheduleCopyableData models an extra user
 4790 /// for "copyable" instructions.
4791 class ScheduleEntity {
4792 friend class ScheduleBundle;
4793 friend class ScheduleData;
4794 friend class ScheduleCopyableData;
4795
4796 protected:
4797 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4798 Kind getKind() const { return K; }
4799 ScheduleEntity(Kind K) : K(K) {}
4800
4801 private:
4802 /// Used for getting a "good" final ordering of instructions.
4803 int SchedulingPriority = 0;
4804 /// True if this instruction (or bundle) is scheduled (or considered as
4805 /// scheduled in the dry-run).
4806 bool IsScheduled = false;
4807 /// The kind of the ScheduleEntity.
4808 const Kind K = Kind::ScheduleData;
4809
4810 public:
4811 ScheduleEntity() = delete;
4812 /// Gets/sets the scheduling priority.
4813 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4814 int getSchedulingPriority() const { return SchedulingPriority; }
4815 bool isReady() const {
4816 if (const auto *SD = dyn_cast<ScheduleData>(this))
4817 return SD->isReady();
4818 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4819 return CD->isReady();
4820 return cast<ScheduleBundle>(this)->isReady();
4821 }
4822 /// Returns true if the dependency information has been calculated.
 4823 /// Note that dependency validity can vary between instructions within
4824 /// a single bundle.
4825 bool hasValidDependencies() const {
4826 if (const auto *SD = dyn_cast<ScheduleData>(this))
4827 return SD->hasValidDependencies();
4828 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4829 return CD->hasValidDependencies();
4830 return cast<ScheduleBundle>(this)->hasValidDependencies();
4831 }
4832 /// Gets the number of unscheduled dependencies.
4833 int getUnscheduledDeps() const {
4834 if (const auto *SD = dyn_cast<ScheduleData>(this))
4835 return SD->getUnscheduledDeps();
4836 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4837 return CD->getUnscheduledDeps();
4838 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4839 }
4840 /// Increments the number of unscheduled dependencies.
4841 int incrementUnscheduledDeps(int Incr) {
4842 if (auto *SD = dyn_cast<ScheduleData>(this))
4843 return SD->incrementUnscheduledDeps(Incr);
4844 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4845 }
4846 /// Gets the number of dependencies.
4847 int getDependencies() const {
4848 if (const auto *SD = dyn_cast<ScheduleData>(this))
4849 return SD->getDependencies();
4850 return cast<ScheduleCopyableData>(this)->getDependencies();
4851 }
4852 /// Gets the instruction.
4853 Instruction *getInst() const {
4854 if (const auto *SD = dyn_cast<ScheduleData>(this))
4855 return SD->getInst();
4856 return cast<ScheduleCopyableData>(this)->getInst();
4857 }
4858
4859 /// Gets/sets if the bundle is scheduled.
4860 bool isScheduled() const { return IsScheduled; }
4861 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4862
4863 static bool classof(const ScheduleEntity *) { return true; }
4864
4865#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4866 void dump(raw_ostream &OS) const {
4867 if (const auto *SD = dyn_cast<ScheduleData>(this))
4868 return SD->dump(OS);
4869 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4870 return CD->dump(OS);
4871 return cast<ScheduleBundle>(this)->dump(OS);
4872 }
4873
4874 LLVM_DUMP_METHOD void dump() const {
4875 dump(dbgs());
4876 dbgs() << '\n';
4877 }
4878#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4879 };
4880
4881#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 4882 friend inline raw_ostream &operator<<(raw_ostream &OS,
 4883 const BoUpSLP::ScheduleEntity &SE) {
4884 SE.dump(OS);
4885 return OS;
4886 }
4887#endif
4888
4889 /// Contains all scheduling relevant data for an instruction.
4890 /// A ScheduleData either represents a single instruction or a member of an
4891 /// instruction bundle (= a group of instructions which is combined into a
4892 /// vector instruction).
4893 class ScheduleData final : public ScheduleEntity {
4894 public:
4895 // The initial value for the dependency counters. It means that the
4896 // dependencies are not calculated yet.
4897 enum { InvalidDeps = -1 };
4898
4899 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4900 static bool classof(const ScheduleEntity *Entity) {
4901 return Entity->getKind() == Kind::ScheduleData;
4902 }
4903
4904 void init(int BlockSchedulingRegionID, Instruction *I) {
4905 NextLoadStore = nullptr;
4906 IsScheduled = false;
4907 SchedulingRegionID = BlockSchedulingRegionID;
4908 clearDependencies();
4909 Inst = I;
4910 }
4911
4912 /// Verify basic self consistency properties
4913 void verify() {
4914 if (hasValidDependencies()) {
4915 assert(UnscheduledDeps <= Dependencies && "invariant");
4916 } else {
4917 assert(UnscheduledDeps == Dependencies && "invariant");
4918 }
4919
4920 if (IsScheduled) {
4921 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4922 "unexpected scheduled state");
4923 }
4924 }
4925
4926 /// Returns true if the dependency information has been calculated.
 4927 /// Note that dependency validity can vary between instructions within
4928 /// a single bundle.
4929 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4930
4931 /// Returns true if it is ready for scheduling, i.e. it has no more
4932 /// unscheduled depending instructions/bundles.
4933 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4934
4935 /// Modifies the number of unscheduled dependencies for this instruction,
4936 /// and returns the number of remaining dependencies for the containing
4937 /// bundle.
4938 int incrementUnscheduledDeps(int Incr) {
4939 assert(hasValidDependencies() &&
4940 "increment of unscheduled deps would be meaningless");
4941 UnscheduledDeps += Incr;
4942 assert(UnscheduledDeps >= 0 &&
4943 "Expected valid number of unscheduled deps");
4944 return UnscheduledDeps;
4945 }
4946
4947 /// Sets the number of unscheduled dependencies to the number of
4948 /// dependencies.
4949 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4950
4951 /// Clears all dependency information.
4952 void clearDependencies() {
4953 clearDirectDependencies();
4954 MemoryDependencies.clear();
4955 ControlDependencies.clear();
4956 }
4957
4958 /// Clears all direct dependencies only, except for control and memory
4959 /// dependencies.
4960 /// Required for copyable elements to correctly handle control/memory deps
 4961 /// and avoid extra recalculation of such deps.
4962 void clearDirectDependencies() {
4963 Dependencies = InvalidDeps;
4964 resetUnscheduledDeps();
4965 IsScheduled = false;
4966 }
4967
4968 /// Gets the number of unscheduled dependencies.
4969 int getUnscheduledDeps() const { return UnscheduledDeps; }
4970 /// Gets the number of dependencies.
4971 int getDependencies() const { return Dependencies; }
4972 /// Initializes the number of dependencies.
4973 void initDependencies() { Dependencies = 0; }
4974 /// Increments the number of dependencies.
4975 void incDependencies() { Dependencies++; }
4976
4977 /// Gets scheduling region ID.
4978 int getSchedulingRegionID() const { return SchedulingRegionID; }
4979
4980 /// Gets the instruction.
4981 Instruction *getInst() const { return Inst; }
4982
4983 /// Gets the list of memory dependencies.
4984 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4985 return MemoryDependencies;
4986 }
4987 /// Adds a memory dependency.
4988 void addMemoryDependency(ScheduleData *Dep) {
4989 MemoryDependencies.push_back(Dep);
4990 }
4991 /// Gets the list of control dependencies.
4992 ArrayRef<ScheduleData *> getControlDependencies() const {
4993 return ControlDependencies;
4994 }
4995 /// Adds a control dependency.
4996 void addControlDependency(ScheduleData *Dep) {
4997 ControlDependencies.push_back(Dep);
4998 }
4999 /// Gets/sets the next load/store instruction in the block.
5000 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
5001 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
5002
5003 void dump(raw_ostream &OS) const { OS << *Inst; }
5004
5005 LLVM_DUMP_METHOD void dump() const {
5006 dump(dbgs());
5007 dbgs() << '\n';
5008 }
5009
5010 private:
5011 Instruction *Inst = nullptr;
5012
5013 /// Single linked list of all memory instructions (e.g. load, store, call)
5014 /// in the block - until the end of the scheduling region.
5015 ScheduleData *NextLoadStore = nullptr;
5016
5017 /// The dependent memory instructions.
5018 /// This list is derived on demand in calculateDependencies().
5019 SmallVector<ScheduleData *> MemoryDependencies;
5020
5021 /// List of instructions which this instruction could be control dependent
5022 /// on. Allowing such nodes to be scheduled below this one could introduce
5023 /// a runtime fault which didn't exist in the original program.
5024 /// ex: this is a load or udiv following a readonly call which inf loops
5025 SmallVector<ScheduleData *> ControlDependencies;
5026
5027 /// This ScheduleData is in the current scheduling region if this matches
5028 /// the current SchedulingRegionID of BlockScheduling.
5029 int SchedulingRegionID = 0;
5030
 5031 /// The number of dependencies. Consists of the number of users of the
5032 /// instruction plus the number of dependent memory instructions (if any).
5033 /// This value is calculated on demand.
5034 /// If InvalidDeps, the number of dependencies is not calculated yet.
5035 int Dependencies = InvalidDeps;
5036
5037 /// The number of dependencies minus the number of dependencies of scheduled
5038 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5039 /// for scheduling.
5040 /// Note that this is negative as long as Dependencies is not calculated.
5041 int UnscheduledDeps = InvalidDeps;
5042 };
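  // Dependency-counter lifecycle of a ScheduleData, as a sketch (illustrative
  // only, not a verbatim excerpt of the scheduler):
  //   SD->initDependencies();            // Dependencies = 0, now "valid"
  //   SD->incDependencies();             // once per discovered def-use/mem dep
  //   SD->resetUnscheduledDeps();        // UnscheduledDeps = Dependencies
  //   SD->incrementUnscheduledDeps(-1);  // as each dependency gets scheduled
  //   if (SD->isReady())
  //     ...; // no unscheduled deps left and not yet scheduled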
5043
5044#ifndef NDEBUG
 5045 friend inline raw_ostream &operator<<(raw_ostream &OS,
 5046 const BoUpSLP::ScheduleData &SD) {
5047 SD.dump(OS);
5048 return OS;
5049 }
5050#endif
5051
5052 class ScheduleBundle final : public ScheduleEntity {
5053 /// The schedule data for the instructions in the bundle.
 5054 SmallVector<ScheduleEntity *> Bundle;
 5055 /// True if this bundle is valid.
5056 bool IsValid = true;
5057 /// The TreeEntry that this instruction corresponds to.
5058 TreeEntry *TE = nullptr;
5059 ScheduleBundle(bool IsValid)
5060 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5061
5062 public:
5063 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5064 static bool classof(const ScheduleEntity *Entity) {
5065 return Entity->getKind() == Kind::ScheduleBundle;
5066 }
5067
5068 /// Verify basic self consistency properties
5069 void verify() const {
5070 for (const ScheduleEntity *SD : Bundle) {
5071 if (SD->hasValidDependencies()) {
5072 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5073 "invariant");
5074 } else {
5075 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5076 "invariant");
5077 }
5078
5079 if (isScheduled()) {
5080 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5081 "unexpected scheduled state");
5082 }
5083 }
5084 }
5085
5086 /// Returns the number of unscheduled dependencies in the bundle.
5087 int unscheduledDepsInBundle() const {
5088 assert(*this && "bundle must not be empty");
5089 int Sum = 0;
5090 for (const ScheduleEntity *BundleMember : Bundle) {
5091 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5092 return ScheduleData::InvalidDeps;
5093 Sum += BundleMember->getUnscheduledDeps();
5094 }
5095 return Sum;
5096 }
5097
5098 /// Returns true if the dependency information has been calculated.
 5099 /// Note that dependency validity can vary between instructions within
5100 /// a single bundle.
5101 bool hasValidDependencies() const {
5102 return all_of(Bundle, [](const ScheduleEntity *SD) {
5103 return SD->hasValidDependencies();
5104 });
5105 }
5106
5107 /// Returns true if it is ready for scheduling, i.e. it has no more
5108 /// unscheduled depending instructions/bundles.
5109 bool isReady() const {
5110 assert(*this && "bundle must not be empty");
5111 return unscheduledDepsInBundle() == 0 && !isScheduled();
5112 }
5113
5114 /// Returns the bundle of scheduling data, associated with the current
5115 /// instruction.
5116 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5117 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5118 /// Adds an instruction to the bundle.
5119 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5120
5121 /// Gets/sets the associated tree entry.
5122 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5123 TreeEntry *getTreeEntry() const { return TE; }
5124
5125 static ScheduleBundle invalid() { return {false}; }
5126
5127 operator bool() const { return IsValid; }
5128
5129#ifndef NDEBUG
5130 void dump(raw_ostream &OS) const {
5131 if (!*this) {
5132 OS << "[]";
5133 return;
5134 }
5135 OS << '[';
5136 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5138 OS << "<Copyable>";
5139 OS << *SD->getInst();
5140 });
5141 OS << ']';
5142 }
5143
5144 LLVM_DUMP_METHOD void dump() const {
5145 dump(dbgs());
5146 dbgs() << '\n';
5147 }
5148#endif // NDEBUG
5149 };
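  // Bundle readiness, as a sketch (illustrative only): a bundle becomes ready
  // once the sum of its members' unscheduled dependencies drops to zero and it
  // has not been scheduled yet, e.g.
  //   if (Bundle->hasValidDependencies() && Bundle->isReady())
  //     ReadyList.insert(Bundle); // hypothetical ready list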
5150
5151#ifndef NDEBUG
 5152 friend inline raw_ostream &operator<<(raw_ostream &OS,
 5153 const BoUpSLP::ScheduleBundle &Bundle) {
5154 Bundle.dump(OS);
5155 return OS;
5156 }
5157#endif
5158
5159 /// Contains all scheduling relevant data for the copyable instruction.
5160 /// It models the virtual instructions, supposed to replace the original
5161 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5162 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5163 /// instruction %virt = add %0, 0.
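 /// For illustration, the same example written as IR (types are arbitrary):
 /// \code
 ///   %0 = load i32, ptr %p
 ///   %1 = add i32 %x, %y
 ///   ; bundle [%0, %1]: %0 is modeled as the virtual copy %virt = add i32 %0, 0
 /// \endcode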
5164 class ScheduleCopyableData final : public ScheduleEntity {
5165 /// The source schedule data for the instruction.
5166 Instruction *Inst = nullptr;
5167 /// The edge information for the instruction.
5168 const EdgeInfo EI;
5169 /// This ScheduleData is in the current scheduling region if this matches
5170 /// the current SchedulingRegionID of BlockScheduling.
5171 int SchedulingRegionID = 0;
5172 /// Bundle, this data is part of.
5173 ScheduleBundle &Bundle;
5174
5175 public:
5176 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5177 const EdgeInfo &EI, ScheduleBundle &Bundle)
5178 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5179 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5180 static bool classof(const ScheduleEntity *Entity) {
5181 return Entity->getKind() == Kind::ScheduleCopyableData;
5182 }
5183
5184 /// Verify basic self consistency properties
5185 void verify() {
5186 if (hasValidDependencies()) {
5187 assert(UnscheduledDeps <= Dependencies && "invariant");
5188 } else {
5189 assert(UnscheduledDeps == Dependencies && "invariant");
5190 }
5191
5192 if (IsScheduled) {
5193 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5194 "unexpected scheduled state");
5195 }
5196 }
5197
5198 /// Returns true if the dependency information has been calculated.
 5199 /// Note that dependency validity can vary between instructions within
5200 /// a single bundle.
5201 bool hasValidDependencies() const {
5202 return Dependencies != ScheduleData::InvalidDeps;
5203 }
5204
5205 /// Returns true if it is ready for scheduling, i.e. it has no more
5206 /// unscheduled depending instructions/bundles.
5207 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5208
5209 /// Modifies the number of unscheduled dependencies for this instruction,
5210 /// and returns the number of remaining dependencies for the containing
5211 /// bundle.
5212 int incrementUnscheduledDeps(int Incr) {
5213 assert(hasValidDependencies() &&
5214 "increment of unscheduled deps would be meaningless");
5215 UnscheduledDeps += Incr;
5216 assert(UnscheduledDeps >= 0 && "invariant");
5217 return UnscheduledDeps;
5218 }
5219
5220 /// Sets the number of unscheduled dependencies to the number of
5221 /// dependencies.
5222 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5223
5224 /// Gets the number of unscheduled dependencies.
5225 int getUnscheduledDeps() const { return UnscheduledDeps; }
5226 /// Gets the number of dependencies.
5227 int getDependencies() const { return Dependencies; }
5228 /// Initializes the number of dependencies.
5229 void initDependencies() { Dependencies = 0; }
5230 /// Increments the number of dependencies.
5231 void incDependencies() { Dependencies++; }
5232
5233 /// Gets scheduling region ID.
5234 int getSchedulingRegionID() const { return SchedulingRegionID; }
5235
5236 /// Gets the instruction.
5237 Instruction *getInst() const { return Inst; }
5238
5239 /// Clears all dependency information.
5240 void clearDependencies() {
5241 Dependencies = ScheduleData::InvalidDeps;
5242 UnscheduledDeps = ScheduleData::InvalidDeps;
5243 IsScheduled = false;
5244 }
5245
5246 /// Gets the edge information.
5247 const EdgeInfo &getEdgeInfo() const { return EI; }
5248
5249 /// Gets the bundle.
5250 ScheduleBundle &getBundle() { return Bundle; }
5251 const ScheduleBundle &getBundle() const { return Bundle; }
5252
5253#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5254 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5255
5256 LLVM_DUMP_METHOD void dump() const {
5257 dump(dbgs());
5258 dbgs() << '\n';
5259 }
5260#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5261
5262 private:
 5263 /// The number of dependencies; InvalidDeps until calculated. These nodes
 5264 /// always have only a single dependency.
5265 int Dependencies = ScheduleData::InvalidDeps;
5266
5267 /// The number of dependencies minus the number of dependencies of scheduled
5268 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5269 /// for scheduling.
5270 /// Note that this is negative as long as Dependencies is not calculated.
5271 int UnscheduledDeps = ScheduleData::InvalidDeps;
5272 };
5273
5274#ifndef NDEBUG
5275 friend inline raw_ostream &
5276 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5277 SD.dump(OS);
5278 return OS;
5279 }
5280#endif
5281
5282 friend struct GraphTraits<BoUpSLP *>;
5283 friend struct DOTGraphTraits<BoUpSLP *>;
5284
5285 /// Contains all scheduling data for a basic block.
 5286 /// It does not schedule instructions which are not memory read/write
 5287 /// instructions and whose operands are either constants, arguments, phis, or
 5288 /// instructions from other blocks, or whose users are phis or live in other
 5289 /// blocks. The resulting vector instructions can be placed at the
 5290 /// beginning of the basic block without scheduling (if the operands do not
 5291 /// need to be scheduled) or at the end of the block (if the users are outside
 5292 /// of the block). This allows saving some compile time and memory used by the
 5293 /// compiler.
 5294 /// ScheduleData is assigned for each instruction between the boundaries of
 5295 /// the tree entry, even for those which are not part of the graph. It is
 5296 /// required to correctly follow the dependencies between the instructions
 5297 /// and to schedule them correctly. ScheduleData is not allocated for
 5298 /// instructions which do not require scheduling, like phis, nodes with
 5299 /// extractelements/insertelements only, or nodes whose instructions have
 5300 /// uses/operands outside of the block.
5301 struct BlockScheduling {
5302 BlockScheduling(BasicBlock *BB)
5303 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5304
5305 void clear() {
5306 ScheduledBundles.clear();
5307 ScheduledBundlesList.clear();
5308 ScheduleCopyableDataMap.clear();
5309 ScheduleCopyableDataMapByInst.clear();
5310 ScheduleCopyableDataMapByInstUser.clear();
5311 ScheduleCopyableDataMapByUsers.clear();
5312 ReadyInsts.clear();
5313 ScheduleStart = nullptr;
5314 ScheduleEnd = nullptr;
5315 FirstLoadStoreInRegion = nullptr;
5316 LastLoadStoreInRegion = nullptr;
5317 RegionHasStackSave = false;
5318
5319 // Reduce the maximum schedule region size by the size of the
5320 // previous scheduling run.
5321 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5322 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5323 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5324 ScheduleRegionSize = 0;
5325
5326 // Make a new scheduling region, i.e. all existing ScheduleData is not
5327 // in the new region yet.
5328 ++SchedulingRegionID;
5329 }
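    // Region-invalidation sketch (illustrative only): after clear(), stale
    // ScheduleData is filtered lazily by comparing region IDs instead of being
    // erased from the maps, e.g.
    //   ScheduleData *SD = ScheduleDataMap.lookup(I);
    //   if (SD && SD->getSchedulingRegionID() == SchedulingRegionID)
    //     ...; // still valid in the new region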
5330
5331 ScheduleData *getScheduleData(Instruction *I) {
5332 if (!I)
5333 return nullptr;
5334 if (BB != I->getParent())
5335 // Avoid lookup if can't possibly be in map.
5336 return nullptr;
5337 ScheduleData *SD = ScheduleDataMap.lookup(I);
5338 if (SD && isInSchedulingRegion(*SD))
5339 return SD;
5340 return nullptr;
5341 }
5342
5343 ScheduleData *getScheduleData(Value *V) {
5344 return getScheduleData(dyn_cast<Instruction>(V));
5345 }
5346
5347 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5348 /// operand number) and value.
5349 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5350 const Value *V) const {
5351 if (ScheduleCopyableDataMap.empty())
5352 return nullptr;
5353 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5354 if (It == ScheduleCopyableDataMap.end())
5355 return nullptr;
5356 ScheduleCopyableData *SD = It->getSecond().get();
5357 if (!isInSchedulingRegion(*SD))
5358 return nullptr;
5359 return SD;
5360 }
5361
5362 /// Returns the ScheduleCopyableData for the given user \p User, operand
5363 /// number and operand \p V.
 5364 SmallVector<ScheduleCopyableData *>
 5365 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5366 const Value *V) {
5367 if (ScheduleCopyableDataMapByInstUser.empty())
5368 return {};
5369 const auto It = ScheduleCopyableDataMapByInstUser.find(
5370 std::make_pair(std::make_pair(User, OperandIdx), V));
5371 if (It == ScheduleCopyableDataMapByInstUser.end())
5372 return {};
 5373 SmallVector<ScheduleCopyableData *> Res;
 5374 for (ScheduleCopyableData *SD : It->getSecond()) {
5375 if (isInSchedulingRegion(*SD))
5376 Res.push_back(SD);
5377 }
5378 return Res;
5379 }
5380
5381 /// Returns true if all operands of the given instruction \p User are
5382 /// replaced by copyable data.
5383 /// \param User The user instruction.
5384 /// \param Op The operand, which might be replaced by the copyable data.
5385 /// \param SLP The SLP tree.
5386 /// \param NumOps The number of operands used. If the instruction uses the
5387 /// same operand several times, check for the first use, then the second,
5388 /// etc.
5389 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5390 Instruction *Op, BoUpSLP &SLP,
5391 unsigned NumOps) const {
5392 assert(NumOps > 0 && "No operands");
5393 if (ScheduleCopyableDataMap.empty())
5394 return false;
5395 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5396 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5397 if (Entries.empty())
5398 return false;
5399 unsigned CurNumOps = 0;
5400 for (const Use &U : User->operands()) {
5401 if (U.get() != Op)
5402 continue;
5403 ++CurNumOps;
5404 // Check all tree entries, if they have operands replaced by copyable
5405 // data.
5406 for (TreeEntry *TE : Entries) {
5407 unsigned Inc = 0;
5408 bool IsNonSchedulableWithParentPhiNode =
5409 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5410 TE->UserTreeIndex.UserTE->hasState() &&
5411 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5412 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
 5413 // Count the number of unique phi nodes, which form the parent
 5414 // entry, and exit if all the unique phis are processed.
5415 if (IsNonSchedulableWithParentPhiNode) {
5416 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5417 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5418 for (Value *V : ParentTE->Scalars) {
5419 auto *PHI = dyn_cast<PHINode>(V);
5420 if (!PHI)
5421 continue;
5422 if (ParentsUniqueUsers.insert(PHI).second &&
5423 is_contained(PHI->incoming_values(), User))
5424 ++Inc;
5425 }
5426 } else {
5427 Inc = count(TE->Scalars, User);
5428 }
5429
5430 // Check if the user is commutative.
5431 // The commutatives are handled later, as their operands can be
5432 // reordered.
5433 // Same applies even for non-commutative cmps, because we can invert
5434 // their predicate potentially and, thus, reorder the operands.
5435 bool IsCommutativeUser =
5436 ::isCommutative(User) &&
5437 ::isCommutableOperand(User, User, U.getOperandNo());
5438 if (!IsCommutativeUser) {
5439 Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
5440 IsCommutativeUser =
5441 ::isCommutative(MainOp, User) &&
5442 ::isCommutableOperand(MainOp, User, U.getOperandNo());
5443 }
5444 // The commutative user with the same operands can be safely
 5445 // considered non-commutative: reordering the operands does not change
5446 // the semantics.
5447 assert(
5448 (!IsCommutativeUser ||
5449 (((::isCommutative(User) &&
5450 ::isCommutableOperand(User, User, 0) &&
5451 ::isCommutableOperand(User, User, 1)) ||
5452 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
5453 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5454 User, 0) &&
5455 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5456 User, 1))))) &&
5457 "Expected commutative user with 2 first commutable operands");
5458 bool IsCommutativeWithSameOps =
5459 IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
5460 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5461 !isa<CmpInst>(User)) {
5462 EdgeInfo EI(TE, U.getOperandNo());
5463 if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
5464 continue;
5465 return false;
5466 }
5467 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5468 .first->getSecond() += Inc;
5469 }
5470 }
5471 if (PotentiallyReorderedEntriesCount.empty())
5472 return true;
5473 // Check the commutative/cmp entries.
5474 for (auto &P : PotentiallyReorderedEntriesCount) {
5475 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5476 bool IsNonSchedulableWithParentPhiNode =
5477 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5478 P.first->UserTreeIndex.UserTE->hasState() &&
5479 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5480 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5481 auto *It = find(P.first->Scalars, User);
5482 do {
5483 assert(It != P.first->Scalars.end() &&
5484 "User is not in the tree entry");
5485 int Lane = std::distance(P.first->Scalars.begin(), It);
5486 assert(Lane >= 0 && "Lane is not found");
5487 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5488 Lane = P.first->ReorderIndices[Lane];
5489 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5490 "Couldn't find extract lane");
 5491 // Count the number of unique phi nodes, which form the parent
 5492 // entry, and exit if all the unique phis are processed.
5493 if (IsNonSchedulableWithParentPhiNode) {
5494 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5495 Value *User = ParentTE->Scalars[Lane];
5496 if (!ParentsUniqueUsers.insert(User).second) {
5497 It =
5498 find(make_range(std::next(It), P.first->Scalars.end()), User);
5499 continue;
5500 }
5501 }
5502 for (unsigned OpIdx :
5504 P.first->getMainOp()))) {
5505 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5506 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5507 --P.getSecond();
5508 }
5509 // If parent node is schedulable, it will be handled correctly.
5510 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5511 } while (It != P.first->Scalars.end());
5512 }
5513 return all_of(PotentiallyReorderedEntriesCount,
5514 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5515 return P.second == NumOps - 1;
5516 });
5517 }
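    // Usage sketch (hypothetical instructions AddInst/LoadInst and a BoUpSLP
    // reference R): if AddInst uses LoadInst twice and both uses are modeled
    // by copyable data in every containing tree entry, the scheduler can skip
    // the direct def-use dependency on the real LoadInst:
    //   if (areAllOperandsReplacedByCopyableData(AddInst, LoadInst, R,
    //                                            /*NumOps=*/2))
    //     ...; // no ScheduleData dependency needed for this use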
5518
 5519 SmallVector<ScheduleCopyableData *>
 5520 getScheduleCopyableData(const Instruction *I) const {
5521 if (ScheduleCopyableDataMapByInst.empty())
5522 return {};
5523 const auto It = ScheduleCopyableDataMapByInst.find(I);
5524 if (It == ScheduleCopyableDataMapByInst.end())
5525 return {};
 5526 SmallVector<ScheduleCopyableData *> Res;
 5527 for (ScheduleCopyableData *SD : It->getSecond()) {
5528 if (isInSchedulingRegion(*SD))
5529 Res.push_back(SD);
5530 }
5531 return Res;
5532 }
5533
 5534 SmallVector<ScheduleCopyableData *>
 5535 getScheduleCopyableDataUsers(const Instruction *User) const {
5536 if (ScheduleCopyableDataMapByUsers.empty())
5537 return {};
5538 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5539 if (It == ScheduleCopyableDataMapByUsers.end())
5540 return {};
 5541 SmallVector<ScheduleCopyableData *> Res;
 5542 for (ScheduleCopyableData *SD : It->getSecond()) {
5543 if (isInSchedulingRegion(*SD))
5544 Res.push_back(SD);
5545 }
5546 return Res;
5547 }
5548
5549 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5550 Instruction *I,
5551 int SchedulingRegionID,
5552 ScheduleBundle &Bundle) {
5553 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5554 ScheduleCopyableData *CD =
5555 ScheduleCopyableDataMap
5556 .try_emplace(std::make_pair(EI, I),
5557 std::make_unique<ScheduleCopyableData>(
5558 SchedulingRegionID, I, EI, Bundle))
5559 .first->getSecond()
5560 .get();
5561 ScheduleCopyableDataMapByInst[I].push_back(CD);
5562 if (EI.UserTE) {
5563 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5564 const auto *It = find(Op, I);
5565 assert(It != Op.end() && "Lane not set");
5566 SmallPtrSet<Instruction *, 4> Visited;
5567 do {
5568 int Lane = std::distance(Op.begin(), It);
5569 assert(Lane >= 0 && "Lane not set");
5570 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5571 !EI.UserTE->ReorderIndices.empty())
5572 Lane = EI.UserTE->ReorderIndices[Lane];
5573 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5574 "Couldn't find extract lane");
5575 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5576 if (!Visited.insert(In).second) {
5577 It = find(make_range(std::next(It), Op.end()), I);
5578 continue;
5579 }
5580 ScheduleCopyableDataMapByInstUser
5581 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5582 .first->getSecond()
5583 .push_back(CD);
5584 ScheduleCopyableDataMapByUsers.try_emplace(I)
5585 .first->getSecond()
5586 .insert(CD);
 5587 // Remove extra deps for users which become non-immediate users of the
 5588 // instruction. This may happen if a chain of the same copyable elements
 5589 // appears in the tree.
5590 if (In == I) {
5591 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5592 if (ScheduleCopyableData *UserCD =
5593 getScheduleCopyableData(UserEI, In))
5594 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5595 }
5596 It = find(make_range(std::next(It), Op.end()), I);
5597 } while (It != Op.end());
5598 } else {
5599 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5600 CD);
5601 }
5602 return *CD;
5603 }
5604
5605 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5606 auto *I = dyn_cast<Instruction>(V);
5607 if (!I)
5608 return {};
5609 auto It = ScheduledBundles.find(I);
5610 if (It == ScheduledBundles.end())
5611 return {};
5612 return It->getSecond();
5613 }
5614
5615 /// Returns true if the entity is in the scheduling region.
5616 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5617 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5618 return Data->getSchedulingRegionID() == SchedulingRegionID;
5619 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5620 return CD->getSchedulingRegionID() == SchedulingRegionID;
5621 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5622 [&](const ScheduleEntity *BundleMember) {
5623 return isInSchedulingRegion(*BundleMember);
5624 });
5625 }
5626
5627 /// Marks an instruction as scheduled and puts all dependent ready
5628 /// instructions into the ready-list.
5629 template <typename ReadyListType>
5630 void schedule(const BoUpSLP &R, const InstructionsState &S,
5631 const EdgeInfo &EI, ScheduleEntity *Data,
5632 ReadyListType &ReadyList) {
5633 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
 5634 ArrayRef<ScheduleBundle *> Bundles) {
 5635 // Handle the def-use chain dependencies.
5636
5637 // Decrement the unscheduled counter and insert to ready list if ready.
5638 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5639 if ((IsControl || Data->hasValidDependencies()) &&
5640 Data->incrementUnscheduledDeps(-1) == 0) {
5641 // There are no more unscheduled dependencies after
5642 // decrementing, so we can put the dependent instruction
5643 // into the ready list.
5644 SmallVector<ScheduleBundle *, 1> CopyableBundle;
 5645 ArrayRef<ScheduleBundle *> Bundles;
 5646 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5647 CopyableBundle.push_back(&CD->getBundle());
5648 Bundles = CopyableBundle;
5649 } else {
5650 Bundles = getScheduleBundles(Data->getInst());
5651 }
5652 if (!Bundles.empty()) {
5653 for (ScheduleBundle *Bundle : Bundles) {
5654 if (Bundle->unscheduledDepsInBundle() == 0) {
5655 assert(!Bundle->isScheduled() &&
5656 "already scheduled bundle gets ready");
5657 ReadyList.insert(Bundle);
5659 << "SLP: gets ready: " << *Bundle << "\n");
5660 }
5661 }
5662 return;
5663 }
5664 assert(!Data->isScheduled() &&
5665 "already scheduled bundle gets ready");
5667 "Expected non-copyable data");
5668 ReadyList.insert(Data);
5669 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5670 }
5671 };
5672
5673 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5674 Instruction *I) {
5675 if (!ScheduleCopyableDataMap.empty()) {
 5676 SmallVector<ScheduleCopyableData *> CopyableData =
 5677 getScheduleCopyableData(User, OpIdx, I);
5678 for (ScheduleCopyableData *CD : CopyableData)
5679 DecrUnsched(CD, /*IsControl=*/false);
5680 if (!CopyableData.empty())
5681 return;
5682 }
5683 if (ScheduleData *OpSD = getScheduleData(I))
5684 DecrUnsched(OpSD, /*IsControl=*/false);
5685 };
5686
5687 // If BundleMember is a vector bundle, its operands may have been
5688 // reordered during buildTree(). We therefore need to get its operands
5689 // through the TreeEntry.
5690 if (!Bundles.empty()) {
5691 auto *In = BundleMember->getInst();
5692 // Count uses of each instruction operand.
5693 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5694 unsigned TotalOpCount = 0;
5695 if (isa<ScheduleCopyableData>(BundleMember)) {
5696 // Copyable data is used only once (uses itself).
5697 TotalOpCount = OperandsUses[In] = 1;
5698 } else {
5699 for (const Use &U : In->operands()) {
5700 if (auto *I = dyn_cast<Instruction>(U.get())) {
5701 auto Res = OperandsUses.try_emplace(I, 0);
5702 ++Res.first->getSecond();
5703 ++TotalOpCount;
5704 }
5705 }
5706 }
5707 // Decrement the unscheduled counter and insert to ready list if
5708 // ready.
5709 auto DecrUnschedForInst =
5710 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5712 &Checked) {
5713 if (!ScheduleCopyableDataMap.empty()) {
5714 const EdgeInfo EI = {UserTE, OpIdx};
5715 if (ScheduleCopyableData *CD =
5716 getScheduleCopyableData(EI, I)) {
5717 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5718 return;
5719 DecrUnsched(CD, /*IsControl=*/false);
5720 return;
5721 }
5722 }
5723 auto It = OperandsUses.find(I);
5724 assert(It != OperandsUses.end() && "Operand not found");
5725 if (It->second > 0) {
5726 if (ScheduleData *OpSD = getScheduleData(I)) {
5727 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5728 return;
5729 --It->getSecond();
5730 assert(TotalOpCount > 0 && "No more operands to decrement");
5731 --TotalOpCount;
5732 DecrUnsched(OpSD, /*IsControl=*/false);
5733 } else {
5734 --It->getSecond();
5735 assert(TotalOpCount > 0 && "No more operands to decrement");
5736 --TotalOpCount;
5737 }
5738 }
5739 };
5740
5741 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5742 for (ScheduleBundle *Bundle : Bundles) {
5743 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5744 break;
5745 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5746 // Need to search for the lane since the tree entry can be
5747 // reordered.
5748 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5749 bool IsNonSchedulableWithParentPhiNode =
5750 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5751 Bundle->getTreeEntry()->UserTreeIndex &&
5752 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5753 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5754 TreeEntry::SplitVectorize &&
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5756 Instruction::PHI;
5757 do {
5758 int Lane =
5759 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5760 assert(Lane >= 0 && "Lane not set");
5761 if (isa<StoreInst>(In) &&
5762 !Bundle->getTreeEntry()->ReorderIndices.empty())
5763 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5764 assert(Lane < static_cast<int>(
5765 Bundle->getTreeEntry()->Scalars.size()) &&
5766 "Couldn't find extract lane");
5767
5768 // Since vectorization tree is being built recursively this
5769 // assertion ensures that the tree entry has all operands set
5770 // before reaching this code. Couple of exceptions known at the
5771 // moment are extracts where their second (immediate) operand is
5772 // not added. Since immediates do not affect scheduler behavior
5773 // this is considered okay.
5774 assert(In &&
 5775 (isa<ExtractValueInst, ExtractElementInst>(In) ||
 5776 In->getNumOperands() ==
5777 Bundle->getTreeEntry()->getNumOperands() ||
5778 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5779 "Missed TreeEntry operands?");
5780
 5781 // Count the number of unique phi nodes, which form the parent
 5782 // entry, and exit if all the unique phis are processed.
5783 if (IsNonSchedulableWithParentPhiNode) {
5784 const TreeEntry *ParentTE =
5785 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5786 Value *User = ParentTE->Scalars[Lane];
5787 if (!ParentsUniqueUsers.insert(User).second) {
5788 It = std::find(std::next(It),
5789 Bundle->getTreeEntry()->Scalars.end(), In);
5790 continue;
5791 }
5792 }
5793
5794 for (unsigned OpIdx :
5795 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5796 if (auto *I = dyn_cast<Instruction>(
5797 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5798 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5799 << *I << "\n");
5800 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5801 }
5802 // If parent node is schedulable, it will be handled correctly.
5803 if (Bundle->getTreeEntry()->isCopyableElement(In))
5804 break;
5805 It = std::find(std::next(It),
5806 Bundle->getTreeEntry()->Scalars.end(), In);
5807 } while (It != Bundle->getTreeEntry()->Scalars.end());
5808 }
5809 } else {
5810 // If BundleMember is a stand-alone instruction, no operand reordering
5811 // has taken place, so we directly access its operands.
5812 for (Use &U : BundleMember->getInst()->operands()) {
5813 if (auto *I = dyn_cast<Instruction>(U.get())) {
5815 << "SLP: check for readiness (def): " << *I << "\n");
5816 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5817 }
5818 }
5819 }
5820 // Handle the memory dependencies.
5821 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5822 if (!SD)
5823 return;
5824 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5825 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5826 if (!VisitedMemory.insert(MemoryDep).second)
5827 continue;
5828 // There are no more unscheduled dependencies after decrementing,
5829 // so we can put the dependent instruction into the ready list.
5830 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5831 << *MemoryDep << "\n");
5832 DecrUnsched(MemoryDep);
5833 }
5834 // Handle the control dependencies.
5835 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5836 for (ScheduleData *Dep : SD->getControlDependencies()) {
5837 if (!VisitedControl.insert(Dep).second)
5838 continue;
5839 // There are no more unscheduled dependencies after decrementing,
5840 // so we can put the dependent instruction into the ready list.
5842 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5843 DecrUnsched(Dep, /*IsControl=*/true);
5844 }
5845 };
5846 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5847 SD->setScheduled(/*Scheduled=*/true);
5848 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
 5849 SmallVector<ScheduleBundle *> Bundles;
 5850 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
 5851 Instruction *In = SD->getInst();
5852 if (R.isVectorized(In)) {
5853 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5854 for (TreeEntry *TE : Entries) {
5856 In->getNumOperands() != TE->getNumOperands())
5857 continue;
5858 auto &BundlePtr =
5859 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5860 BundlePtr->setTreeEntry(TE);
5861 BundlePtr->add(SD);
5862 Bundles.push_back(BundlePtr.get());
5863 }
5864 }
5865 ProcessBundleMember(SD, Bundles);
5866 } else {
5867 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5868 Bundle.setScheduled(/*Scheduled=*/true);
5869 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5870 auto AreAllBundlesScheduled =
5871 [&](const ScheduleEntity *SD,
5872 ArrayRef<ScheduleBundle *> SDBundles) {
 5873 if (isa<ScheduleCopyableData>(SD))
 5874 return true;
5875 return !SDBundles.empty() &&
5876 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5877 return SDBundle->isScheduled();
5878 });
5879 };
5880 for (ScheduleEntity *SD : Bundle.getBundle()) {
 5881 ArrayRef<ScheduleBundle *> SDBundles;
 5882 if (!isa<ScheduleCopyableData>(SD))
 5883 SDBundles = getScheduleBundles(SD->getInst());
5884 if (AreAllBundlesScheduled(SD, SDBundles)) {
5885 SD->setScheduled(/*Scheduled=*/true);
5886 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5887 : SDBundles);
5888 }
5889 }
5890 }
5891 }
5892
5893 /// Verify basic self consistency properties of the data structure.
5894 void verify() {
5895 if (!ScheduleStart)
5896 return;
5897
5898 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5899 ScheduleStart->comesBefore(ScheduleEnd) &&
5900 "Not a valid scheduling region?");
5901
5902 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5903 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5904 if (!Bundles.empty()) {
5905 for (ScheduleBundle *Bundle : Bundles) {
5906 assert(isInSchedulingRegion(*Bundle) &&
5907 "primary schedule data not in window?");
5908 Bundle->verify();
5909 }
5910 continue;
5911 }
5912 auto *SD = getScheduleData(I);
5913 if (!SD)
5914 continue;
5915 assert(isInSchedulingRegion(*SD) &&
5916 "primary schedule data not in window?");
5917 SD->verify();
5918 }
5919
5920 assert(all_of(ReadyInsts,
5921 [](const ScheduleEntity *Bundle) {
5922 return Bundle->isReady();
5923 }) &&
5924 "item in ready list not ready?");
5925 }
5926
5927 /// Put all instructions into the ReadyList which are ready for scheduling.
5928 template <typename ReadyListType>
5929 void initialFillReadyList(ReadyListType &ReadyList) {
5930 SmallPtrSet<ScheduleBundle *, 16> Visited;
5931 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5932 ScheduleData *SD = getScheduleData(I);
5933 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5934 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5935 !Bundles.empty()) {
5936 for (ScheduleBundle *Bundle : Bundles) {
5937 if (!Visited.insert(Bundle).second)
5938 continue;
5939 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5940 ReadyList.insert(Bundle);
5941 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5942 << *Bundle << "\n");
5943 }
5944 }
5945 continue;
5946 }
5947 ReadyList.insert(SD);
5949 << "SLP: initially in ready list: " << *SD << "\n");
5950 }
5951 }
5952 }
5953
5954 /// Build a bundle from the ScheduleData nodes corresponding to the
5955 /// scalar instruction for each lane.
5956 /// \param VL The list of scalar instructions.
5957 /// \param S The state of the instructions.
5958 /// \param EI The edge in the SLP graph or the user node/operand number.
5959 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5960 const InstructionsState &S, const EdgeInfo &EI);
5961
5962 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5963 /// cyclic dependencies. This is only a dry-run, no instructions are
5964 /// actually moved at this stage.
5965 /// \returns the scheduling bundle. The returned Optional value is not
5966 /// std::nullopt if \p VL is allowed to be scheduled.
5967 std::optional<ScheduleBundle *>
5968 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5969 const InstructionsState &S, const EdgeInfo &EI);
5970
5971 /// Allocates schedule data chunk.
5972 ScheduleData *allocateScheduleDataChunks();
5973
5974 /// Extends the scheduling region so that V is inside the region.
5975 /// \returns true if the region size is within the limit.
5976 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5977
5978 /// Initialize the ScheduleData structures for new instructions in the
5979 /// scheduling region.
5980 void initScheduleData(Instruction *FromI, Instruction *ToI,
5981 ScheduleData *PrevLoadStore,
5982 ScheduleData *NextLoadStore);
5983
5984 /// Updates the dependency information of a bundle and of all instructions/
5985 /// bundles which depend on the original bundle.
5986 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5987 BoUpSLP *SLP,
5988 ArrayRef<ScheduleData *> ControlDeps = {});
5989
5990 /// Sets all instruction in the scheduling region to un-scheduled.
5991 void resetSchedule();
5992
5993 BasicBlock *BB;
5994
 5995 /// Simple memory allocation for ScheduleData.
 5996 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5997
5998 /// The size of a ScheduleData array in ScheduleDataChunks.
5999 int ChunkSize;
6000
6001 /// The allocator position in the current chunk, which is the last entry
6002 /// of ScheduleDataChunks.
6003 int ChunkPos;
6004
6005 /// Attaches ScheduleData to Instruction.
6006 /// Note that the mapping survives during all vectorization iterations, i.e.
6007 /// ScheduleData structures are recycled.
6008 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6009
6010 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
6011 /// number) and the operand instruction, represented as copyable element.
6012 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6013 std::unique_ptr<ScheduleCopyableData>>
6014 ScheduleCopyableDataMap;
6015
6016 /// Represents mapping between instruction and all related
 6017 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
6018 /// element). The SLP tree may contain several representations of the same
6019 /// instruction.
6020 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6021 ScheduleCopyableDataMapByInst;
6022
6023 /// Represents mapping between user value and operand number, the operand
6024 /// value and all related ScheduleCopyableData. The relation is 1:n, because
 6025 /// the same user may reference the same operand in different tree entries
 6026 /// and the operand may be modeled by different copyable data elements.
6027 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
6029 ScheduleCopyableDataMapByInstUser;
6030
6031 /// Represents mapping between instruction and all related
6032 /// ScheduleCopyableData. It represents the mapping between the actual
6033 /// instruction and the last copyable data element in the chain. E.g., if
6034 /// the graph models the following instructions:
6035 /// %0 = non-add instruction ...
6036 /// ...
6037 /// %4 = add %3, 1
6038 /// %5 = add %4, 1
6039 /// %6 = insertelement poison, %0, 0
6040 /// %7 = insertelement %6, %5, 1
6041 /// And the graph is modeled as:
6042 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6043 /// -> [1, 0] -> [%1, 0]
6044 ///
 6045 /// this map maps %0 only to the copyable element <1>, which is the last
 6046 /// user (the direct user of the actual instruction). <0> uses <1>, so the
 6047 /// mapping to <0> is kept by <1>, not by %0.
6048 SmallDenseMap<const Instruction *,
6049 SmallSetVector<ScheduleCopyableData *, 4>>
6050 ScheduleCopyableDataMapByUsers;
6051
6052 /// Attaches ScheduleBundle to Instruction.
6053 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6054 ScheduledBundles;
6055 /// The list of ScheduleBundles.
6056 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6057
6058 /// The ready-list for scheduling (only used for the dry-run).
6059 SetVector<ScheduleEntity *> ReadyInsts;
6060
6061 /// The first instruction of the scheduling region.
6062 Instruction *ScheduleStart = nullptr;
6063
6064 /// The first instruction _after_ the scheduling region.
6065 Instruction *ScheduleEnd = nullptr;
6066
6067 /// The first memory accessing instruction in the scheduling region
6068 /// (can be null).
6069 ScheduleData *FirstLoadStoreInRegion = nullptr;
6070
6071 /// The last memory accessing instruction in the scheduling region
6072 /// (can be null).
6073 ScheduleData *LastLoadStoreInRegion = nullptr;
6074
6075 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6076 /// region? Used to optimize the dependence calculation for the
6077 /// common case where there isn't.
6078 bool RegionHasStackSave = false;
6079
6080 /// The current size of the scheduling region.
6081 int ScheduleRegionSize = 0;
6082
6083 /// The maximum size allowed for the scheduling region.
6084 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6085
6086 /// The ID of the scheduling region. For a new vectorization iteration this
 6087 /// is incremented, which "removes" all ScheduleData from the region.
6088 /// Make sure that the initial SchedulingRegionID is greater than the
6089 /// initial SchedulingRegionID in ScheduleData (which is 0).
6090 int SchedulingRegionID = 1;
6091 };
6092
6093 /// Attaches the BlockScheduling structures to basic blocks.
6094 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6095
6096 /// Performs the "real" scheduling. Done before vectorization is actually
6097 /// performed in a basic block.
6098 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6099
6100 /// List of users to ignore during scheduling and that don't need extracting.
6101 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6102
6103 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6104 /// sorted SmallVectors of unsigned.
6105 struct OrdersTypeDenseMapInfo {
6106 static OrdersType getEmptyKey() {
6107 OrdersType V;
6108 V.push_back(~1U);
6109 return V;
6110 }
6111
6112 static OrdersType getTombstoneKey() {
6113 OrdersType V;
6114 V.push_back(~2U);
6115 return V;
6116 }
6117
6118 static unsigned getHashValue(const OrdersType &V) {
6119 return static_cast<unsigned>(hash_combine_range(V));
6120 }
6121
6122 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6123 return LHS == RHS;
6124 }
6125 };
6126
6127 // Analysis and block reference.
6128 Function *F;
6129 ScalarEvolution *SE;
6130 TargetTransformInfo *TTI;
6131 TargetLibraryInfo *TLI;
6132 LoopInfo *LI;
6133 DominatorTree *DT;
6134 AssumptionCache *AC;
6135 DemandedBits *DB;
6136 const DataLayout *DL;
6137 OptimizationRemarkEmitter *ORE;
6138
6139 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6140 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6141
6142 /// Instruction builder to construct the vectorized tree.
6143 IRBuilder<TargetFolder> Builder;
6144
6145 /// A map of scalar integer values to the smallest bit width with which they
6146 /// can legally be represented. The values map to (width, signed) pairs,
6147 /// where "width" indicates the minimum bit width and "signed" is True if the
6148 /// value must be signed-extended, rather than zero-extended, back to its
6149 /// original width.
6150 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6151
6152 /// Final size of the reduced vector, if the current graph represents the
6153 /// input for the reduction and it was possible to narrow the size of the
6154 /// reduction.
6155 unsigned ReductionBitWidth = 0;
6156
6157 /// Canonical graph size before the transformations.
6158 unsigned BaseGraphSize = 1;
6159
6160 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6161 /// type sizes, used in the tree.
6162 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6163
 6164 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
6165 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6166 DenseSet<unsigned> ExtraBitWidthNodes;
6167};
6168
 6169 template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
 6170 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
 6171 using SecondInfo = DenseMapInfo<unsigned>;
 6172 static BoUpSLP::EdgeInfo getEmptyKey() {
 6173 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
 6174 SecondInfo::getEmptyKey());
 6175 }
6176
 6177 static BoUpSLP::EdgeInfo getTombstoneKey() {
 6178 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6179 SecondInfo::getTombstoneKey());
6180 }
6181
6182 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6183 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6184 SecondInfo::getHashValue(Val.EdgeIdx));
6185 }
6186
6187 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6188 const BoUpSLP::EdgeInfo &RHS) {
6189 return LHS == RHS;
6190 }
6191};
6192
6193template <> struct llvm::GraphTraits<BoUpSLP *> {
6194 using TreeEntry = BoUpSLP::TreeEntry;
6195
6196 /// NodeRef has to be a pointer per the GraphWriter.
 6197 using NodeRef = TreeEntry *;
 6198
6199 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6200
6201 /// Add the VectorizableTree to the index iterator to be able to return
6202 /// TreeEntry pointers.
 6203 struct ChildIteratorType
 6204 : public iterator_adaptor_base<
 6205 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
 6206 ContainerTy &VectorizableTree;
 6207
 6208 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
 6209 ContainerTy &VT)
 6210 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
 6211
6212 NodeRef operator*() { return I->UserTE; }
6213 };
6214
 6215 static NodeRef getEntryNode(BoUpSLP &R) {
 6216 return R.VectorizableTree[0].get();
6217 }
6218
 6219 static ChildIteratorType child_begin(NodeRef N) {
 6220 return {&N->UserTreeIndex, N->Container};
6221 }
6222
 6223 static ChildIteratorType child_end(NodeRef N) {
 6224 return {&N->UserTreeIndex + 1, N->Container};
6225 }
6226
6227 /// For the node iterator we just need to turn the TreeEntry iterator into a
6228 /// TreeEntry* iterator so that it dereferences to NodeRef.
 6229 class nodes_iterator {
 6230 using ItTy = ContainerTy::iterator;
6231 ItTy It;
6232
6233 public:
6234 nodes_iterator(const ItTy &It2) : It(It2) {}
6235 NodeRef operator*() { return It->get(); }
 6236 nodes_iterator operator++() {
 6237 ++It;
6238 return *this;
6239 }
6240 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6241 };
6242
 6243 static nodes_iterator nodes_begin(BoUpSLP *R) {
 6244 return nodes_iterator(R->VectorizableTree.begin());
6245 }
6246
 6247 static nodes_iterator nodes_end(BoUpSLP *R) {
 6248 return nodes_iterator(R->VectorizableTree.end());
6249 }
6250
6251 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6252};
6253
6254template <>
 6255 struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
 6256 using TreeEntry = BoUpSLP::TreeEntry;
6257
6258 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6259
6260 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6261 std::string Str;
6262 raw_string_ostream OS(Str);
6263 OS << Entry->Idx << ".\n";
6264 if (isSplat(Entry->Scalars))
6265 OS << "<splat> ";
6266 for (auto *V : Entry->Scalars) {
6267 OS << *V;
6268 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6269 return EU.Scalar == V;
6270 }))
6271 OS << " <extract>";
6272 OS << "\n";
6273 }
6274 return Str;
6275 }
6276
6277 static std::string getNodeAttributes(const TreeEntry *Entry,
6278 const BoUpSLP *) {
6279 if (Entry->isGather())
6280 return "color=red";
6281 if (Entry->State == TreeEntry::ScatterVectorize ||
6282 Entry->State == TreeEntry::StridedVectorize ||
6283 Entry->State == TreeEntry::CompressVectorize)
6284 return "color=blue";
6285 return "";
6286 }
6287};
6288
 6289 BoUpSLP::~BoUpSLP() {
 6290 SmallVector<WeakTrackingVH> DeadInsts;
 6291 for (auto *I : DeletedInstructions) {
6292 if (!I->getParent()) {
 6293 // Temporarily insert instructions back to erase them from the parent and
 6294 // from memory later.
6295 if (isa<PHINode>(I))
6296 // Phi nodes must be the very first instructions in the block.
6297 I->insertBefore(F->getEntryBlock(),
6298 F->getEntryBlock().getFirstNonPHIIt());
6299 else
6300 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6301 continue;
6302 }
6303 for (Use &U : I->operands()) {
6304 auto *Op = dyn_cast<Instruction>(U.get());
 6305 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
 6306 wouldInstructionBeTriviallyDead(Op, TLI))
 6307 DeadInsts.emplace_back(Op);
6308 }
6309 I->dropAllReferences();
6310 }
6311 for (auto *I : DeletedInstructions) {
6312 assert(I->use_empty() &&
6313 "trying to erase instruction with users.");
6314 I->eraseFromParent();
6315 }
6316
 6317 // Cleanup any dead scalar code feeding the vectorized instructions.
 6318 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
 6319
6320#ifdef EXPENSIVE_CHECKS
6321 // If we could guarantee that this call is not extremely slow, we could
6322 // remove the ifdef limitation (see PR47712).
6323 assert(!verifyFunction(*F, &dbgs()));
6324#endif
6325}
6326
6327/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
 6328 /// contains the original mask for the scalars reused in the node. The
 6329 /// procedure transforms this mask in accordance with the given \p Mask.
 6330 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
 6331 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6332 "Expected non-empty mask.");
6333 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6334 Prev.swap(Reuses);
6335 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6336 if (Mask[I] != PoisonMaskElem)
6337 Reuses[Mask[I]] = Prev[I];
6338}
6339
6340/// Reorders the given \p Order according to the given \p Mask. \p Order - is
6341/// the original order of the scalars. Procedure transforms the provided order
6342/// in accordance with the given \p Mask. If the resulting \p Order is just an
6343/// identity order, \p Order is cleared.
 6344 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
 6345 bool BottomOrder = false) {
6346 assert(!Mask.empty() && "Expected non-empty mask.");
6347 unsigned Sz = Mask.size();
6348 if (BottomOrder) {
6349 SmallVector<unsigned> PrevOrder;
6350 if (Order.empty()) {
6351 PrevOrder.resize(Sz);
6352 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6353 } else {
6354 PrevOrder.swap(Order);
6355 }
6356 Order.assign(Sz, Sz);
6357 for (unsigned I = 0; I < Sz; ++I)
6358 if (Mask[I] != PoisonMaskElem)
6359 Order[I] = PrevOrder[Mask[I]];
6360 if (all_of(enumerate(Order), [&](const auto &Data) {
6361 return Data.value() == Sz || Data.index() == Data.value();
6362 })) {
6363 Order.clear();
6364 return;
6365 }
6366 fixupOrderingIndices(Order);
6367 return;
6368 }
6369 SmallVector<int> MaskOrder;
6370 if (Order.empty()) {
6371 MaskOrder.resize(Sz);
6372 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6373 } else {
6374 inversePermutation(Order, MaskOrder);
6375 }
6376 reorderReuses(MaskOrder, Mask);
6377 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6378 Order.clear();
6379 return;
6380 }
6381 Order.assign(Sz, Sz);
6382 for (unsigned I = 0; I < Sz; ++I)
6383 if (MaskOrder[I] != PoisonMaskElem)
6384 Order[MaskOrder[I]] = I;
6385 fixupOrderingIndices(Order);
6386}
6387
6388std::optional<BoUpSLP::OrdersType>
6389BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6390 bool TopToBottom, bool IgnoreReorder) {
6391 assert(TE.isGather() && "Expected gather node only.");
6392 // Try to find subvector extract/insert patterns and reorder only such
6393 // patterns.
6394 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6395 Type *ScalarTy = GatheredScalars.front()->getType();
6396 size_t NumScalars = GatheredScalars.size();
6397 if (!isValidElementType(ScalarTy))
6398 return std::nullopt;
6399 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6400 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6401 SmallVector<int> ExtractMask;
6402 SmallVector<int> Mask;
 6403 SmallVector<SmallVector<const TreeEntry *>> Entries;
 6404 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
 6405 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
 6406 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
 6407 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6408 /*ForOrder=*/true);
6409 // No shuffled operands - ignore.
6410 if (GatherShuffles.empty() && ExtractShuffles.empty())
6411 return std::nullopt;
6412 OrdersType CurrentOrder(NumScalars, NumScalars);
6413 if (GatherShuffles.size() == 1 &&
6414 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6415 Entries.front().front()->isSame(TE.Scalars)) {
 6416 // If the fully matched node is used in a whole-tree rotation, there is no
 6417 // need to consider the matching order; the whole tree is rotated instead.
6418 if (TopToBottom)
6419 return std::nullopt;
6420 // No need to keep the order for the same user node.
6421 if (Entries.front().front()->UserTreeIndex.UserTE ==
6422 TE.UserTreeIndex.UserTE)
6423 return std::nullopt;
6424 // No need to keep the order for the matched root node, if it can be freely
6425 // reordered.
6426 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6427 return std::nullopt;
6428 // If shuffling 2 elements only and the matching node has reverse reuses -
6429 // no need to count order, both work fine.
6430 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6431 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6432 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6433 [](const auto &P) {
6434 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6435 }))
6436 return std::nullopt;
6437
6438 // Perfect match in the graph, will reuse the previously vectorized
6439 // node. Cost is 0.
6440 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6441 return CurrentOrder;
6442 }
6443 auto IsSplatMask = [](ArrayRef<int> Mask) {
6444 int SingleElt = PoisonMaskElem;
6445 return all_of(Mask, [&](int I) {
6446 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6447 SingleElt = I;
6448 return I == PoisonMaskElem || I == SingleElt;
6449 });
6450 };
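 // E.g. (illustrative): {2, PoisonMaskElem, 2, 2} is a splat mask, while
 // {0, 1, 0, 0} is not.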
6451 // Exclusive broadcast mask - ignore.
6452 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6453 (Entries.size() != 1 ||
6454 Entries.front().front()->ReorderIndices.empty())) ||
6455 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6456 return std::nullopt;
6457 SmallBitVector ShuffledSubMasks(NumParts);
6458 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6459 ArrayRef<int> Mask, int PartSz, int NumParts,
6460 function_ref<unsigned(unsigned)> GetVF) {
6461 for (int I : seq<int>(0, NumParts)) {
6462 if (ShuffledSubMasks.test(I))
6463 continue;
6464 const int VF = GetVF(I);
6465 if (VF == 0)
6466 continue;
6467 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6468 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6469 // Shuffle of at least 2 vectors - ignore.
6470 if (any_of(Slice, not_equal_to(NumScalars))) {
6471 llvm::fill(Slice, NumScalars);
6472 ShuffledSubMasks.set(I);
6473 continue;
6474 }
 6475 // Try to include as many elements from the mask as possible.
 6476 int FirstMin = INT_MAX;
 6477 bool SecondVecFound = false;
6478 for (int K : seq<int>(Limit)) {
6479 int Idx = Mask[I * PartSz + K];
6480 if (Idx == PoisonMaskElem) {
6481 Value *V = GatheredScalars[I * PartSz + K];
6482 if (isConstant(V) && !isa<PoisonValue>(V)) {
6483 SecondVecFound = true;
6484 break;
6485 }
6486 continue;
6487 }
6488 if (Idx < VF) {
6489 if (FirstMin > Idx)
6490 FirstMin = Idx;
6491 } else {
6492 SecondVecFound = true;
6493 break;
6494 }
6495 }
6496 FirstMin = (FirstMin / PartSz) * PartSz;
6497 // Shuffle of at least 2 vectors - ignore.
6498 if (SecondVecFound) {
6499 llvm::fill(Slice, NumScalars);
6500 ShuffledSubMasks.set(I);
6501 continue;
6502 }
6503 for (int K : seq<int>(Limit)) {
6504 int Idx = Mask[I * PartSz + K];
6505 if (Idx == PoisonMaskElem)
6506 continue;
6507 Idx -= FirstMin;
6508 if (Idx >= PartSz) {
6509 SecondVecFound = true;
6510 break;
6511 }
6512 if (CurrentOrder[I * PartSz + Idx] >
6513 static_cast<unsigned>(I * PartSz + K) &&
6514 CurrentOrder[I * PartSz + Idx] !=
6515 static_cast<unsigned>(I * PartSz + Idx))
6516 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6517 }
6518 // Shuffle of at least 2 vectors - ignore.
6519 if (SecondVecFound) {
6520 llvm::fill(Slice, NumScalars);
6521 ShuffledSubMasks.set(I);
6522 continue;
6523 }
6524 }
6525 };
6526 int PartSz = getPartNumElems(NumScalars, NumParts);
6527 if (!ExtractShuffles.empty())
6528 TransformMaskToOrder(
6529 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6530 if (!ExtractShuffles[I])
6531 return 0U;
6532 unsigned VF = 0;
6533 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6534 for (unsigned Idx : seq<unsigned>(Sz)) {
6535 int K = I * PartSz + Idx;
6536 if (ExtractMask[K] == PoisonMaskElem)
6537 continue;
6538 if (!TE.ReuseShuffleIndices.empty())
6539 K = TE.ReuseShuffleIndices[K];
6540 if (K == PoisonMaskElem)
6541 continue;
6542 if (!TE.ReorderIndices.empty())
6543 K = std::distance(TE.ReorderIndices.begin(),
6544 find(TE.ReorderIndices, K));
6545 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6546 if (!EI)
6547 continue;
6548 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6549 ->getElementCount()
6550 .getKnownMinValue());
6551 }
6552 return VF;
6553 });
6554 // Check special corner case - single shuffle of the same entry.
6555 if (GatherShuffles.size() == 1 && NumParts != 1) {
6556 if (ShuffledSubMasks.any())
6557 return std::nullopt;
6558 PartSz = NumScalars;
6559 NumParts = 1;
6560 }
6561 if (!Entries.empty())
6562 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6563 if (!GatherShuffles[I])
6564 return 0U;
6565 return std::max(Entries[I].front()->getVectorFactor(),
6566 Entries[I].back()->getVectorFactor());
6567 });
6568 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6569 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6570 return std::nullopt;
6571 return std::move(CurrentOrder);
6572}
6573
6574static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6575 const TargetLibraryInfo &TLI,
6576 bool CompareOpcodes = true) {
 6577 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
 6578 getUnderlyingObject(Ptr2, RecursionMaxDepth))
 6579 return false;
6580 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6581 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6582 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6583 (!GEP2 || GEP2->getNumOperands() == 2) &&
6584 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6585 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6586 !CompareOpcodes ||
6587 (GEP1 && GEP2 &&
6588 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6589}
6590
6591/// Calculates minimal alignment as a common alignment.
6592template <typename T>
 6593 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
 6594 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6595 for (Value *V : VL)
6596 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6597 return CommonAlignment;
6598}
6599
6600/// Check if \p Order represents reverse order.
 6601 static bool isReverseOrder(ArrayRef<unsigned> Order) {
 6602 assert(!Order.empty() &&
6603 "Order is empty. Please check it before using isReverseOrder.");
6604 unsigned Sz = Order.size();
6605 return all_of(enumerate(Order), [&](const auto &Pair) {
6606 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6607 });
6608}
6609
6610/// Checks if the provided list of pointers \p Pointers represents the strided
6611/// pointers for type ElemTy. If they are not, nullptr is returned.
6612/// Otherwise, SCEV* of the stride value is returned.
 6613 /// If `PointerOps` can be rearranged into the following sequence:
6614/// ```
6615/// %x + c_0 * stride,
6616/// %x + c_1 * stride,
6617/// %x + c_2 * stride
6618/// ...
6619/// ```
 6620 /// where each `c_i` is a constant, then `Coeffs` will contain `c_0, c_1, c_2, ...`
 6621 /// and the SCEV of the `stride` will be returned.
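/// For example (illustrative): for i8 loads from `%x`, `%x + %s`, `%x + 2*%s`
/// the function returns the SCEV of `%s`, fills `Coeffs` with `0, 1, 2` and
/// leaves `SortedIndices` empty because the pointers are already consecutive.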
6622static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6623 const DataLayout &DL, ScalarEvolution &SE,
6624 SmallVectorImpl<unsigned> &SortedIndices,
6625 SmallVectorImpl<int64_t> &Coeffs) {
6626 assert(Coeffs.size() == PointerOps.size() &&
6627 "Coeffs vector needs to be of correct size");
 6628 SmallVector<const SCEV *> SCEVs;
 6629 const SCEV *PtrSCEVLowest = nullptr;
6630 const SCEV *PtrSCEVHighest = nullptr;
6631 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6632 // addresses).
6633 for (Value *Ptr : PointerOps) {
6634 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6635 if (!PtrSCEV)
6636 return nullptr;
6637 SCEVs.push_back(PtrSCEV);
6638 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6639 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6640 continue;
6641 }
6642 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6643 if (isa<SCEVCouldNotCompute>(Diff))
6644 return nullptr;
6645 if (Diff->isNonConstantNegative()) {
6646 PtrSCEVLowest = PtrSCEV;
6647 continue;
6648 }
6649 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6650 if (isa<SCEVCouldNotCompute>(Diff1))
6651 return nullptr;
6652 if (Diff1->isNonConstantNegative()) {
6653 PtrSCEVHighest = PtrSCEV;
6654 continue;
6655 }
6656 }
6657 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6658 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6659 if (isa<SCEVCouldNotCompute>(Dist))
6660 return nullptr;
6661 int Size = DL.getTypeStoreSize(ElemTy);
6662 auto TryGetStride = [&](const SCEV *Dist,
6663 const SCEV *Multiplier) -> const SCEV * {
6664 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6665 if (M->getOperand(0) == Multiplier)
6666 return M->getOperand(1);
6667 if (M->getOperand(1) == Multiplier)
6668 return M->getOperand(0);
6669 return nullptr;
6670 }
6671 if (Multiplier == Dist)
6672 return SE.getConstant(Dist->getType(), 1);
6673 return SE.getUDivExactExpr(Dist, Multiplier);
6674 };
 6675 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6676 const SCEV *Stride = nullptr;
6677 if (Size != 1 || SCEVs.size() > 2) {
6678 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6679 Stride = TryGetStride(Dist, Sz);
6680 if (!Stride)
6681 return nullptr;
6682 }
6683 if (!Stride || isa<SCEVConstant>(Stride))
6684 return nullptr;
6685 // Iterate through all pointers and check if all distances are
 6686 // unique multiples of Stride.
6687 using DistOrdPair = std::pair<int64_t, int>;
6688 auto Compare = llvm::less_first();
6689 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6690 int Cnt = 0;
6691 bool IsConsecutive = true;
6692 for (const auto [Idx, PtrSCEV] : enumerate(SCEVs)) {
6693 unsigned Dist = 0;
6694 if (PtrSCEV != PtrSCEVLowest) {
6695 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6696 const SCEV *Coeff = TryGetStride(Diff, Stride);
6697 if (!Coeff)
6698 return nullptr;
6699 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6700 if (!SC || isa<SCEVCouldNotCompute>(SC))
6701 return nullptr;
6702 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6703 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6704 SE.getMulExpr(Stride, SC)))
6705 ->isZero())
6706 return nullptr;
6707 Dist = SC->getAPInt().getZExtValue();
6708 } else {
6709 Coeffs[Idx] = 0;
6710 }
6711 // If the strides are not the same or repeated, we can't vectorize.
6712 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6713 return nullptr;
6714 auto Res = Offsets.emplace(Dist, Cnt);
6715 if (!Res.second)
6716 return nullptr;
6717 // Consecutive order if the inserted element is the last one.
6718 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6719 ++Cnt;
6720 }
6721 if (Offsets.size() != SCEVs.size())
6722 return nullptr;
6723 SortedIndices.clear();
6724 if (!IsConsecutive) {
6725 // Fill SortedIndices array only if it is non-consecutive.
6726 SortedIndices.resize(PointerOps.size());
6727 Cnt = 0;
6728 for (const std::pair<int64_t, int> &Pair : Offsets) {
6729 SortedIndices[Cnt] = Pair.second;
6730 ++Cnt;
6731 }
6732 }
6733 return Stride;
6734}
6735
6736static std::pair<InstructionCost, InstructionCost>
 6737 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 6738 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6739 Type *ScalarTy, VectorType *VecTy);
6740
6741/// Returns the cost of the shuffle instructions with the given \p Kind, vector
 6742 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6743/// subvector pattern.
6744static InstructionCost
 6745 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
 6746 VectorType *Tp, ArrayRef<int> Mask = {},
 6747 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
 6748 int Index = 0, VectorType *SubTp = nullptr,
 6749 ArrayRef<Value *> Args = {}) {
6750 VectorType *DstTy = Tp;
6751 if (!Mask.empty())
6752 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6753
6754 if (Kind != TTI::SK_PermuteTwoSrc)
6755 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6756 Args);
6757 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6758 int NumSubElts;
 6759 if (ShuffleVectorInst::isInsertSubvectorMask(
 6760 Mask, NumSrcElts, NumSubElts, Index)) {
6761 if (Index + NumSubElts > NumSrcElts &&
6762 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6763 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6764 TTI::TCK_RecipThroughput, Index, Tp);
6765 }
6766 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6767 Args);
6768}
6769
6770/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6771/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6772/// instead of a scalar.
6773static InstructionCost
6775 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6776 bool Extract, TTI::TargetCostKind CostKind,
6777 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6779 "ScalableVectorType is not supported.");
6780 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6781 getNumElements(Ty) &&
6782 "Incorrect usage.");
6783 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6784 assert(SLPReVec && "Only supported by REVEC.");
6785 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6786 // of CreateInsertElement.
6787 unsigned ScalarTyNumElements = VecTy->getNumElements();
6788 InstructionCost Cost = 0;
6789 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6790 if (!DemandedElts[I])
6791 continue;
 6792 if (Insert)
 6793 Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
 6794 I * ScalarTyNumElements, VecTy);
 6795 if (Extract)
 6796 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
 6797 I * ScalarTyNumElements, VecTy);
6798 }
6799 return Cost;
6800 }
6801 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6802 CostKind, ForPoisonSrc, VL);
6803}
6804
6805/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6806/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6808 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6809 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6810 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6811 if (Opcode == Instruction::ExtractElement) {
6812 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6813 assert(SLPReVec && "Only supported by REVEC.");
6814 assert(isa<VectorType>(Val) && "Val must be a vector type.");
 6815 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
 6816 cast<VectorType>(Val), {}, CostKind,
6817 Index * VecTy->getNumElements(), VecTy);
6818 }
6819 }
6820 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6821 ScalarUserAndIdx);
6822}
6823
6824/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6825/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6827 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6828 VectorType *VecTy, unsigned Index,
6830 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6831 assert(SLPReVec && "Only supported by REVEC.");
6832 auto *SubTp =
6833 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
 6834 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
 6835 Index * ScalarTy->getNumElements(), SubTp) +
6836 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6837 CostKind);
6838 }
6839 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6840}
6841
6842/// Creates subvector insert. Generates shuffle using \p Generator or
6843/// using default shuffle.
 6844 static Value *createInsertVector(
 6845 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6846 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6847 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6848 return Vec;
6849 const unsigned SubVecVF = getNumElements(V->getType());
6850 // Create shuffle, insertvector requires that index is multiple of
6851 // the subvector length.
6852 const unsigned VecVF = getNumElements(Vec->getType());
 6853 SmallVector<int> Mask(VecVF, PoisonMaskElem);
 6854 if (isa<PoisonValue>(Vec)) {
6855 auto *Begin = std::next(Mask.begin(), Index);
6856 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6857 Vec = Builder.CreateShuffleVector(V, Mask);
6858 return Vec;
6859 }
6860 std::iota(Mask.begin(), Mask.end(), 0);
6861 std::iota(std::next(Mask.begin(), Index),
6862 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6863 if (Generator)
6864 return Generator(Vec, V, Mask);
6865 // 1. Resize V to the size of Vec.
6866 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6867 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6868 V = Builder.CreateShuffleVector(V, ResizeMask);
6869 // 2. Insert V into Vec.
6870 return Builder.CreateShuffleVector(Vec, V, Mask);
6871}
6872
6873/// Generates subvector extract using \p Generator or using default shuffle.
 6874 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
 6875 unsigned SubVecVF, unsigned Index) {
6876 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6877 std::iota(Mask.begin(), Mask.end(), Index);
6878 return Builder.CreateShuffleVector(Vec, Mask);
6879}
6880
6881/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6882/// with \p Order.
6883/// \return true if the mask represents strided access, false - otherwise.
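/// E.g. (illustrative): element offsets {0, 2, 4, 6} produce
/// CompressMask = {0, 2, 4, 6} and return true (stride 2), while offsets
/// {0, 1, 3, 7} produce CompressMask = {0, 1, 3, 7} and return false.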
 6884 static bool buildCompressMask(ArrayRef<Value *> PointerOps,
 6885 ArrayRef<unsigned> Order, Type *ScalarTy,
6886 const DataLayout &DL, ScalarEvolution &SE,
6887 SmallVectorImpl<int> &CompressMask) {
6888 const unsigned Sz = PointerOps.size();
6889 CompressMask.assign(Sz, PoisonMaskElem);
 6890 // The first element is always set.
6891 CompressMask[0] = 0;
6892 // Check if the mask represents strided access.
6893 std::optional<unsigned> Stride = 0;
6894 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6895 for (unsigned I : seq<unsigned>(1, Sz)) {
6896 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6897 std::optional<int64_t> OptPos =
6898 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6899 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6900 return false;
6901 unsigned Pos = static_cast<unsigned>(*OptPos);
6902 CompressMask[I] = Pos;
6903 if (!Stride)
6904 continue;
6905 if (*Stride == 0) {
6906 *Stride = Pos;
6907 continue;
6908 }
6909 if (Pos != *Stride * I)
6910 Stride.reset();
6911 }
6912 return Stride.has_value();
6913}
6914
6915/// Checks if the \p VL can be transformed to a (masked)load + compress or
6916/// (masked) interleaved load.
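/// E.g. (illustrative): four i32 loads at element offsets {0, 2, 4, 6} can be
/// lowered as a single <7 x i32> load (a masked load when the wider access is
/// not provably safe) followed by a shuffle with mask {0, 2, 4, 6}.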
 6917 static bool isMaskedLoadCompress(
 6918 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 6919 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
 6920 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
 6921 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6922 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6923 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6924 VectorType *&LoadVecTy) {
6925 InterleaveFactor = 0;
6926 Type *ScalarTy = VL.front()->getType();
6927 const size_t Sz = VL.size();
6928 auto *VecTy = getWidenedType(ScalarTy, Sz);
 6929 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 6930 SmallVector<int> Mask;
6931 if (!Order.empty())
6932 inversePermutation(Order, Mask);
6933 // Check external uses.
6934 for (const auto [I, V] : enumerate(VL)) {
6935 if (AreAllUsersVectorized(V))
6936 continue;
6937 InstructionCost ExtractCost =
6938 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6939 Mask.empty() ? I : Mask[I]);
6940 InstructionCost ScalarCost =
6941 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6942 if (ExtractCost <= ScalarCost)
6943 return false;
6944 }
6945 Value *Ptr0;
6946 Value *PtrN;
6947 if (Order.empty()) {
6948 Ptr0 = PointerOps.front();
6949 PtrN = PointerOps.back();
6950 } else {
6951 Ptr0 = PointerOps[Order.front()];
6952 PtrN = PointerOps[Order.back()];
6953 }
6954 std::optional<int64_t> Diff =
6955 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6956 if (!Diff)
6957 return false;
6958 const size_t MaxRegSize =
 6959 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
 6960 .getFixedValue();
6961 // Check for very large distances between elements.
6962 if (*Diff / Sz >= MaxRegSize / 8)
6963 return false;
6964 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6965 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6966 Align CommonAlignment = LI->getAlign();
6967 IsMasked = !isSafeToLoadUnconditionally(
6968 Ptr0, LoadVecTy, CommonAlignment, DL,
6969 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6970 &TLI);
6971 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6972 LI->getPointerAddressSpace()))
6973 return false;
6974 // TODO: perform the analysis of each scalar load for better
6975 // safe-load-unconditionally analysis.
6976 bool IsStrided =
6977 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6978 assert(CompressMask.size() >= 2 && "At least two elements are required");
6979 SmallVector<Value *> OrderedPointerOps(PointerOps);
6980 if (!Order.empty())
6981 reorderScalars(OrderedPointerOps, Mask);
6982 auto [ScalarGEPCost, VectorGEPCost] =
6983 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6984 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6985 // The cost of scalar loads.
6986 InstructionCost ScalarLoadsCost =
6987 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6988 [&](InstructionCost C, Value *V) {
6989 return C + TTI.getInstructionCost(cast<Instruction>(V),
6990 CostKind);
6991 }) +
6992 ScalarGEPCost;
6993 APInt DemandedElts = APInt::getAllOnes(Sz);
6994 InstructionCost GatherCost =
6995 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6996 /*Insert=*/true,
6997 /*Extract=*/false, CostKind) +
6998 ScalarLoadsCost;
6999 InstructionCost LoadCost = 0;
7000 if (IsMasked) {
7001 LoadCost = TTI.getMemIntrinsicInstrCost(
7002 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
7003 CommonAlignment,
7004 LI->getPointerAddressSpace()),
7005 CostKind);
7006 } else {
7007 LoadCost =
7008 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7009 LI->getPointerAddressSpace(), CostKind);
7010 }
7011 if (IsStrided && !IsMasked && Order.empty()) {
7012 // Check for potential segmented(interleaved) loads.
7013 VectorType *AlignedLoadVecTy = getWidenedType(
7014 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
7015 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
7016 DL, cast<LoadInst>(VL.back()), &AC, &DT,
7017 &TLI))
7018 AlignedLoadVecTy = LoadVecTy;
7019 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7020 CommonAlignment,
7021 LI->getPointerAddressSpace())) {
7022 InstructionCost InterleavedCost =
7023 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
7024 Instruction::Load, AlignedLoadVecTy,
7025 CompressMask[1], {}, CommonAlignment,
7026 LI->getPointerAddressSpace(), CostKind, IsMasked);
7027 if (InterleavedCost < GatherCost) {
7028 InterleaveFactor = CompressMask[1];
7029 LoadVecTy = AlignedLoadVecTy;
7030 return true;
7031 }
7032 }
7033 }
7034 InstructionCost CompressCost = ::getShuffleCost(
7035 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
7036 if (!Order.empty()) {
7037 SmallVector<int> NewMask(Sz, PoisonMaskElem);
7038 for (unsigned I : seq<unsigned>(Sz)) {
7039 NewMask[I] = CompressMask[Mask[I]];
7040 }
7041 CompressMask.swap(NewMask);
7042 }
7043 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7044 return TotalVecCost < GatherCost;
7045}
7046
7047/// Checks if the \p VL can be transformed to a (masked)load + compress or
7048/// (masked) interleaved load.
7049static bool
 7050 isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 7051 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
 7052 const DataLayout &DL, ScalarEvolution &SE,
7053 AssumptionCache &AC, const DominatorTree &DT,
7054 const TargetLibraryInfo &TLI,
7055 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7056 bool IsMasked;
7057 unsigned InterleaveFactor;
7058 SmallVector<int> CompressMask;
7059 VectorType *LoadVecTy;
7060 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7061 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7062 CompressMask, LoadVecTy);
7063}
7064
7065/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7066/// PointerOps:
7067/// 1. Target with strided load support is detected.
7068/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7069/// potential stride <= MaxProfitableLoadStride and the potential stride is
7070/// power-of-2 (to avoid perf regressions for the very small number of loads)
7071/// and max distance > number of loads, or potential stride is -1.
7072/// 3. The loads are ordered, or number of unordered loads <=
7073/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7074/// to avoid extra costs for very expensive shuffles).
 7076 /// 4. Any pointer operand is an instruction with users outside of the
7076/// current graph (for masked gathers extra extractelement instructions
7077/// might be required).
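/// Illustration: Sz = 8 loads whose sorted span is Diff = -7 elements (reverse
/// order, stride -1) satisfy Diff % (Sz - 1) == 0 and the Diff == -(Sz - 1)
/// case, so only the isLegalStridedLoadStore query remains.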
 7078 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
 7079 Align Alignment, const int64_t Diff,
7080 const size_t Sz) const {
7081 if (Diff % (Sz - 1) != 0)
7082 return false;
7083
7084 // Try to generate strided load node.
7085 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
7086 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
7087 return !isVectorized(U) && !MustGather.contains(U);
7088 });
7089 });
7090
7091 const uint64_t AbsoluteDiff = std::abs(Diff);
7092 auto *VecTy = getWidenedType(ScalarTy, Sz);
7093 if (IsAnyPointerUsedOutGraph ||
7094 (AbsoluteDiff > Sz &&
 7095 (Sz > MinProfitableStridedLoads ||
 7096 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7097 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7098 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7099 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7100 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7101 return false;
7102 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7103 return false;
7104 return true;
7105 }
7106 return false;
7107}
7108
 7109 bool BoUpSLP::analyzeConstantStrideCandidate(
 7110 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7111 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7112 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7113 const size_t Sz = PointerOps.size();
7114 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7115 // Go through `PointerOps` in sorted order and record offsets from
7116 // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
7117 // sortPtrAccesses only validates getPointersDiff for pairs relative to
7118 // PointerOps[0]. This is safe since only offset differences are used below.
7119 for (unsigned I : seq<unsigned>(Sz)) {
7120 Value *Ptr =
7121 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7122 std::optional<int64_t> Offset =
7123 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr, *DL, *SE);
7124 assert(Offset && "sortPtrAccesses should have validated this pointer");
7125 SortedOffsetsFromBase[I] = *Offset;
7126 }
7127
7128 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7129 // ```
7130 // [
7131 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
 7132 // (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7133 // ...
7134 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7135 // GroupSize - 1}), // last group
7136 // ]
7137 // ```
 7138 // The distances between consecutive elements within each group should all be
 7139 // the same value, `StrideWithinGroup`. The distances between the first elements
 7140 // of consecutive groups should all be the same value, `StrideBetweenGroups`.
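 // E.g. (illustrative): SortedOffsetsFromBase = {0, 1, 8, 9, 16, 17} splits
 // into three groups of two, with StrideWithinGroup = 1 and
 // StrideBetweenGroups = 8.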
7141
7142 int64_t StrideWithinGroup =
7143 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7144 // Determine size of the first group. Later we will check that all other
7145 // groups have the same size.
7146 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7147 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7148 StrideWithinGroup;
7149 };
7150 auto Indices = seq<unsigned>(1, Sz);
7151 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7152 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7153
7154 unsigned VecSz = Sz;
7155 Type *NewScalarTy = ScalarTy;
7156
7157 // Quick detour: at this point we can say what the type of strided load would
7158 // be if all the checks pass. Check if this type is legal for the target.
7159 bool NeedsWidening = Sz != GroupSize;
7160 if (NeedsWidening) {
7161 if (Sz % GroupSize != 0)
7162 return false;
7163
7164 if (StrideWithinGroup != 1)
7165 return false;
7166 VecSz = Sz / GroupSize;
7167 NewScalarTy = Type::getIntNTy(
7168 SE->getContext(),
7169 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7170 }
7171
7172 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7173 return false;
7174
7175 int64_t StrideIntVal = StrideWithinGroup;
7176 if (NeedsWidening) {
7177 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7178 // Check that the strides between groups are all the same.
7179 unsigned CurrentGroupStartIdx = GroupSize;
7180 int64_t StrideBetweenGroups =
7181 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7182 StrideIntVal = StrideBetweenGroups;
7183 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7184 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7185 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7186 StrideBetweenGroups)
7187 return false;
7188 }
7189
7190 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7191 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7192 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7193 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7194 return GroupEndIdx - StartIdx == GroupSize;
7195 };
7196 for (unsigned I = 0; I < Sz; I += GroupSize) {
7197 if (!CheckGroup(I))
7198 return false;
7199 }
7200 }
7201
7202 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7203 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
7204 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7205 return true;
7206}
7207
 7208 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
 7209 Type *ScalarTy, Align CommonAlignment,
7210 SmallVectorImpl<unsigned> &SortedIndices,
7211 StridedPtrInfo &SPtrInfo) const {
7212 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
 7213 // is constant, we partition the `PointerOps` sequence into subsequences of
 7214 // pointers with the same offset. For each offset we record values from
 7215 // `PointerOps` and their indices in `PointerOps`.
7217 OffsetToPointerOpIdxMap;
7218 for (auto [Idx, Ptr] : enumerate(PointerOps)) {
7219 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7220 if (!PtrSCEV)
7221 return false;
7222
7223 const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
7224 int64_t Offset = 0;
7225 if (Add) {
7226 // `Offset` is non-zero.
7227 for (int I : seq<int>(Add->getNumOperands())) {
7228 const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
7229 if (!SC)
7230 continue;
7231 Offset = SC->getAPInt().getSExtValue();
7232 break;
7233 }
7234 }
7235 OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
7236 OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
7237 }
7238 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7239
7240 // Quick detour: at this point we can say what the type of strided load would
7241 // be if all the checks pass. Check if this type is legal for the target.
7242 const unsigned Sz = PointerOps.size();
7243 unsigned VecSz = Sz;
7244 Type *NewScalarTy = ScalarTy;
7245 if (NumOffsets > 1) {
7246 if (Sz % NumOffsets != 0)
7247 return false;
7248 VecSz = Sz / NumOffsets;
7249 NewScalarTy = Type::getIntNTy(
7250 SE->getContext(),
7251 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7252 }
7253 FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
7254 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7255 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7256 return false;
7257
7258 // Check if the offsets are contiguous and that each group has the required
7259 // size.
7260 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7261 for (auto [Idx, MapPair] : enumerate(OffsetToPointerOpIdxMap)) {
7262 if (MapPair.second.first.size() != VecSz)
7263 return false;
7264 SortedOffsetsV[Idx] = MapPair.first;
7265 }
7266 sort(SortedOffsetsV);
7267
7268 if (NumOffsets > 1) {
7269 for (int I : seq<int>(1, SortedOffsetsV.size())) {
7270 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
7271 return false;
7272 }
7273 }
7274
7275 // Introduce some notation for the explanations below. Let `PointerOps_j`
7276 // denote the subsequence of `PointerOps` with offsets equal to
 7277 // `SortedOffsetsV[j]`. Let `SortedIndices_j` be such that the sequence
7278 // ```
7279 // PointerOps_j[SortedIndices_j[0]],
7280 // PointerOps_j[SortedIndices_j[1]],
7281 // PointerOps_j[SortedIndices_j[2]],
7282 // ...
7283 // ```
7284 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7285 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7286 // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7287 // The entire sorted `PointerOps` looks like this:
7288 // ```
7289 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7290 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7291 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7292 // ...
7293 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7294 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7295 //
7296 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7297 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7298 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7299 // ...
7300 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7301 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7302 //
7303 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7304 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7305 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7306 // ...
7307 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7308 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7309 // ...
7310 // ...
7311 // ...
7312 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7313 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7314 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7315 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7316 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7317 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7318 // ...
7319 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7320 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7321 // ```
7322 // In order to be able to generate a strided load, we need the following
7323 // checks to pass:
7324 //
7325 // (1) for each `PointerOps_j` check that the distance
7326 // between adjacent pointers are all equal to the same value (stride).
7327 // (2) for each `PointerOps_j` check that coefficients calculated by
7328 // `calculateRtStride` are all the same.
7329 //
7330 // As we do that, also calculate SortedIndices. Since we should not modify
7331 // `SortedIndices` unless we know that all the checks succeed, record the
 7332 // indices into `SortedIndicesDraft`.
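 // Illustration: with NumOffsets = 2 and VecSz = 2, the final sorted order
 // interleaves the two offset groups as
 //   PointerOps_0[SortedIndices_0[0]], PointerOps_1[SortedIndices_1[0]],
 //   PointerOps_0[SortedIndices_0[1]], PointerOps_1[SortedIndices_1[1]],
 // because SortedIndicesDraft[Num * NumOffsets + OffsetNum] is written per
 // offset group.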
7333 SmallVector<unsigned> SortedIndicesDraft(Sz);
7334
7335 // Given sorted indices for a particular offset (as calculated by
7336 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7337 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7338 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7339 // \param `IndicesInAllPointerOps` vector of indices of the
7340 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7341 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7342 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7343 auto UpdateSortedIndices =
7344 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7345 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
7346 if (SortedIndicesForOffset.empty()) {
7347 SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
7348 std::iota(SortedIndicesForOffset.begin(),
7349 SortedIndicesForOffset.end(), 0);
7350 }
7351 for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
7352 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7353 IndicesInAllPointerOps[Idx];
7354 }
7355 };
7356
7357 int64_t LowestOffset = SortedOffsetsV[0];
7358 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7359
7360 SmallVector<int64_t> Coeffs0(VecSz);
7361 SmallVector<unsigned> SortedIndicesForOffset0;
7362 const SCEV *Stride0 = calculateRtStride(PointerOps0, ScalarTy, *DL, *SE,
7363 SortedIndicesForOffset0, Coeffs0);
7364 if (!Stride0)
7365 return false;
7366 unsigned NumCoeffs0 = Coeffs0.size();
7367 if (NumCoeffs0 * NumOffsets != Sz)
7368 return false;
7369 sort(Coeffs0);
7370
7371 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7372 OffsetToPointerOpIdxMap[LowestOffset].second;
7373 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7374
 7375 // Now that we know what the common stride and coefficients have to be, check
 7376 // the remaining `PointerOps_j`.
7377 SmallVector<int64_t> Coeffs;
7378 SmallVector<unsigned> SortedIndicesForOffset;
7379 for (int J : seq<int>(1, NumOffsets)) {
7380 Coeffs.clear();
7381 Coeffs.resize(VecSz);
7382 SortedIndicesForOffset.clear();
7383
7384 int64_t Offset = SortedOffsetsV[J];
7385 ArrayRef<Value *> PointerOpsForOffset =
7386 OffsetToPointerOpIdxMap[Offset].first;
7387 ArrayRef<unsigned> IndicesInAllPointerOps =
7388 OffsetToPointerOpIdxMap[Offset].second;
7389 const SCEV *StrideWithinGroup =
7390 calculateRtStride(PointerOpsForOffset, ScalarTy, *DL, *SE,
7391 SortedIndicesForOffset, Coeffs);
7392
7393 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7394 return false;
7395 if (Coeffs.size() != NumCoeffs0)
7396 return false;
7397 sort(Coeffs);
7398 if (Coeffs != Coeffs0)
7399 return false;
7400
7401 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7402 }
7403
7404 SortedIndices.clear();
7405 SortedIndices = std::move(SortedIndicesDraft);
7406 SPtrInfo.StrideSCEV = Stride0;
7407 SPtrInfo.Ty = StridedLoadTy;
7408 return true;
7409}
7410
 7411 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
 7412 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7413 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7414 unsigned *BestVF, bool TryRecursiveCheck) const {
7415 // Check that a vectorized load would load the same memory as a scalar
7416 // load. For example, we don't want to vectorize loads that are smaller
7417 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7418 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7419 // from such a struct, we read/write packed bits disagreeing with the
7420 // unvectorized version.
7421 if (BestVF)
7422 *BestVF = 0;
 7423 if (areKnownNonVectorizableLoads(VL))
 7424 return LoadsState::Gather;
7425 Type *ScalarTy = VL0->getType();
7426
7427 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7428 return LoadsState::Gather;
7429
7430 // Make sure all loads in the bundle are simple - we can't vectorize
7431 // atomic or volatile loads.
7432 PointerOps.clear();
7433 const size_t Sz = VL.size();
7434 PointerOps.resize(Sz);
7435 auto *POIter = PointerOps.begin();
7436 for (Value *V : VL) {
7437 auto *L = dyn_cast<LoadInst>(V);
7438 if (!L || !L->isSimple())
7439 return LoadsState::Gather;
7440 *POIter = L->getPointerOperand();
7441 ++POIter;
7442 }
7443
7444 Order.clear();
7445 // Check the order of pointer operands or that all pointers are the same.
7446 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7447
7448 auto *VecTy = getWidenedType(ScalarTy, Sz);
7449 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7450 if (!IsSorted) {
7451 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7452 SPtrInfo))
 7453 return LoadsState::StridedVectorize;
 7454
7455 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7456 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7457 return LoadsState::Gather;
7458
7459 if (!all_of(PointerOps, [&](Value *P) {
7460 return arePointersCompatible(P, PointerOps.front(), *TLI);
7461 }))
7462 return LoadsState::Gather;
7463
7464 } else {
7465 Value *Ptr0;
7466 Value *PtrN;
7467 if (Order.empty()) {
7468 Ptr0 = PointerOps.front();
7469 PtrN = PointerOps.back();
7470 } else {
7471 Ptr0 = PointerOps[Order.front()];
7472 PtrN = PointerOps[Order.back()];
7473 }
7474 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7475 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7476 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
7477 std::optional<int64_t> Diff0 =
7478 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr0, *DL, *SE);
7479 std::optional<int64_t> DiffN =
7480 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, PtrN, *DL, *SE);
7481 assert(Diff0 && DiffN &&
7482 "sortPtrAccesses should have validated these pointers");
7483 int64_t Diff = *DiffN - *Diff0;
7484 // Check that the sorted loads are consecutive.
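// Worked example: for pointers at element offsets {2, 0, 3, 1} from a common
// base, PointerOps[0] sits at offset 2, so Diff0 == -2, DiffN == 1 and
// Diff == 1 - (-2) == 3 == Sz - 1, i.e. the loads cover a consecutive range.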
7485 if (static_cast<uint64_t>(Diff) == Sz - 1)
7486 return LoadsState::Vectorize;
7487 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7488 *TLI, [&](Value *V) {
7489 return areAllUsersVectorized(
7490 cast<Instruction>(V), UserIgnoreList);
7491 }))
7492 return LoadsState::CompressVectorize;
7493 Align Alignment =
7494 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7495 ->getAlign();
7496 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7497 Diff, Ptr0, PtrN, SPtrInfo))
7498 return LoadsState::StridedVectorize;
7499 }
7500 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7501 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7502 return LoadsState::Gather;
7503 // Compare the cost of loads + shuffles with the cost of strided/masked
7504 // gather loads. Returns true if the vectorized + shuffles representation
7505 // is better than just gather.
7506 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7507 unsigned *BestVF,
7508 bool ProfitableGatherPointers) {
7509 if (BestVF)
7510 *BestVF = 0;
7511 // Compare masked gather cost and loads + insert subvector costs.
7512 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7513 auto [ScalarGEPCost, VectorGEPCost] =
7514 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7515 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7516 // Estimate the cost of masked gather GEP. If not a splat, roughly
7517 // estimate as a buildvector, otherwise estimate as splat.
7518 APInt DemandedElts = APInt::getAllOnes(Sz);
7519 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7520 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7521 if (static_cast<unsigned>(count_if(
7522 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7523 any_of(PointerOps, [&](Value *V) {
7524 return getUnderlyingObject(V) !=
7525 getUnderlyingObject(PointerOps.front());
7526 }))
7527 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7528 DemandedElts, /*Insert=*/true,
7529 /*Extract=*/false, CostKind);
7530 else
7531 VectorGEPCost +=
7532 getScalarizationOverhead(
7533 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7534 /*Insert=*/true, /*Extract=*/false, CostKind) +
7535 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7536 // The cost of scalar loads.
7537 InstructionCost ScalarLoadsCost =
7538 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7539 [&](InstructionCost C, Value *V) {
7540 return C + TTI.getInstructionCost(
7541 cast<Instruction>(V), CostKind);
7542 }) +
7543 ScalarGEPCost;
7544 // The cost of masked gather.
7545 InstructionCost MaskedGatherCost =
7546 TTI.getMemIntrinsicInstrCost(
7547 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7548 PointerOps.front(),
7549 /*VariableMask=*/false, CommonAlignment),
7550 CostKind) +
7551 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7552 InstructionCost GatherCost =
7553 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7554 /*Insert=*/true,
7555 /*Extract=*/false, CostKind) +
7556 ScalarLoadsCost;
7557 // The list of loads is small or we already performed the partial check -
7558 // directly compare the masked gather cost and the gather cost.
7559 constexpr unsigned ListLimit = 4;
7560 if (!TryRecursiveCheck || VL.size() < ListLimit)
7561 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
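// Illustrative note: with the default -slp-threshold of 0 this early exit
// returns true exactly when MaskedGatherCost >= GatherCost, i.e. when the
// masked gather is not strictly cheaper than gathering the scalar loads.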
7562
7563 // FIXME: The following code has not been updated for non-power-of-2
7564 // vectors (and not whole registers). The splitting logic here does not
7565 // cover the original vector if the vector factor is not a power of two.
7566 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7567 return false;
7568
7569 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7570 unsigned MinVF = getMinVF(2 * Sz);
7571 DemandedElts.clearAllBits();
7572 // Iterate through possible vectorization factors and check if vectorized +
7573 // shuffles is better than just gather.
7574 for (unsigned VF =
7575 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7576 VF >= MinVF;
7577 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7578 SmallVector<LoadsState> States;
7579 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7580 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7581 SmallVector<unsigned> Order;
7582 SmallVector<Value *> PointerOps;
7583 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7584 PointerOps, SPtrInfo, BestVF,
7585 /*TryRecursiveCheck=*/false);
7586 // Check that the sorted loads are consecutive.
7587 if (LS == LoadsState::Gather) {
7588 if (BestVF) {
7589 DemandedElts.setAllBits();
7590 break;
7591 }
7592 DemandedElts.setBits(Cnt, Cnt + VF);
7593 continue;
7594 }
7595 // If a reorder is needed - consider it as a high-cost masked gather for now.
7596 if ((LS == LoadsState::Vectorize ||
7597 LS == LoadsState::StridedVectorize ||
7598 LS == LoadsState::CompressVectorize) &&
7599 !Order.empty() && !isReverseOrder(Order))
7600 LS = LoadsState::ScatterVectorize;
7601 States.push_back(LS);
7602 }
7603 if (DemandedElts.isAllOnes())
7604 // All loads gathered - try smaller VF.
7605 continue;
7606 // Can be vectorized later as a series of loads/insertelements.
7607 InstructionCost VecLdCost = 0;
7608 if (!DemandedElts.isZero()) {
7609 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7610 /*Insert=*/true,
7611 /*Extract=*/false, CostKind) +
7612 ScalarGEPCost;
7613 for (unsigned Idx : seq<unsigned>(VL.size()))
7614 if (DemandedElts[Idx])
7615 VecLdCost +=
7616 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7617 }
7618 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7619 for (auto [I, LS] : enumerate(States)) {
7620 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7621 InstructionCost VectorGEPCost =
7622 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7623 ? 0
7624 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7625 LI0->getPointerOperand(),
7626 Instruction::GetElementPtr, CostKind, ScalarTy,
7627 SubVecTy)
7628 .second;
7629 if (LS == LoadsState::ScatterVectorize) {
7630 if (static_cast<unsigned>(
7631 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7632 PointerOps.size() - 1 ||
7633 any_of(PointerOps, [&](Value *V) {
7634 return getUnderlyingObject(V) !=
7635 getUnderlyingObject(PointerOps.front());
7636 }))
7637 VectorGEPCost += getScalarizationOverhead(
7638 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7639 /*Insert=*/true, /*Extract=*/false, CostKind);
7640 else
7641 VectorGEPCost +=
7643 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7644 /*Insert=*/true, /*Extract=*/false, CostKind) +
7645 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7646 CostKind);
7647 }
7648 switch (LS) {
7649 case LoadsState::Vectorize:
7650 VecLdCost +=
7651 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7652 LI0->getPointerAddressSpace(), CostKind,
7653 TTI::OperandValueInfo()) +
7654 VectorGEPCost;
7655 break;
7656 case LoadsState::StridedVectorize:
7657 VecLdCost += TTI.getMemIntrinsicInstrCost(
7658 MemIntrinsicCostAttributes(
7659 Intrinsic::experimental_vp_strided_load,
7660 SubVecTy, LI0->getPointerOperand(),
7661 /*VariableMask=*/false, CommonAlignment),
7662 CostKind) +
7663 VectorGEPCost;
7664 break;
7665 case LoadsState::CompressVectorize:
7666 VecLdCost += TTI.getMemIntrinsicInstrCost(
7667 MemIntrinsicCostAttributes(
7668 Intrinsic::masked_load, SubVecTy,
7669 CommonAlignment, LI0->getPointerAddressSpace()),
7670 CostKind) +
7671 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7672 {}, CostKind);
7673 break;
7674 case LoadsState::ScatterVectorize:
7675 VecLdCost += TTI.getMemIntrinsicInstrCost(
7676 MemIntrinsicCostAttributes(
7677 Intrinsic::masked_gather, SubVecTy,
7678 LI0->getPointerOperand(),
7679 /*VariableMask=*/false, CommonAlignment),
7680 CostKind) +
7681 VectorGEPCost;
7682 break;
7683 case LoadsState::Gather:
7684 // Gathers are already calculated - ignore.
7685 continue;
7686 }
7687 SmallVector<int> ShuffleMask(VL.size());
7688 for (int Idx : seq<int>(0, VL.size()))
7689 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
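// Mask example: for VL.size() == 8, VF == 4 and I == 1 the mask becomes
// {0, 1, 2, 3, 8, 9, 10, 11}, i.e. the elements of the inserted slice are
// referenced through the second shuffle operand while the rest are kept.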
7690 if (I > 0)
7691 VecLdCost +=
7692 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7693 CostKind, I * VF, SubVecTy);
7694 }
7695 // If masked gather cost is higher - better to vectorize, so
7696 // consider it as a gather node. It will be better estimated
7697 // later.
7698 if (MaskedGatherCost >= VecLdCost &&
7699 VecLdCost - GatherCost < -SLPCostThreshold) {
7700 if (BestVF)
7701 *BestVF = VF;
7702 return true;
7703 }
7704 }
7705 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7706 };
7707 // TODO: need to improve analysis of the pointers, if not all of them are
7708 // GEPs or have > 2 operands, we end up with a gather node, which just
7709 // increases the cost.
7710 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7711 bool ProfitableGatherPointers =
7712 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7713 return L->isLoopInvariant(V);
7714 })) <= Sz / 2;
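// E.g. with Sz == 8 pointers inside a loop, at most 4 of them may be
// loop-invariant for the pointers to be considered profitable to gather.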
7715 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7716 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7717 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7718 (GEP && GEP->getNumOperands() == 2 &&
7719 isa<Constant, Instruction>(GEP->getOperand(1)));
7720 })) {
7721 // Check if potential masked gather can be represented as series
7722 // of loads + insertsubvectors.
7723 // If masked gather cost is higher - better to vectorize, so
7724 // consider it as a gather node. It will be better estimated
7725 // later.
7726 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7727 ProfitableGatherPointers))
7728 return LoadsState::ScatterVectorize;
7729 }
7730
7731 return LoadsState::Gather;
7732}
7733
7734static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7735 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7736 const DataLayout &DL, ScalarEvolution &SE,
7737 SmallVectorImpl<unsigned> &SortedIndices) {
7738 assert(
7739 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7740 "Expected list of pointer operands.");
7741 // Map from bases to a vector of (Ptr, Offset, OrigIdx) tuples, into which we
7742 // insert each Ptr, sort, and then return the sorted indices so that values
7743 // end up next to one another.
7745 std::pair<BasicBlock *, Value *>,
7747 Bases;
7748 Bases
7749 .try_emplace(std::make_pair(
7751 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7752
7753 SortedIndices.clear();
7754 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7755 auto Key = std::make_pair(BBs[Cnt + 1],
7757 bool Found = any_of(Bases.try_emplace(Key).first->second,
7758 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7759 std::optional<int64_t> Diff =
7760 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7761 ElemTy, Ptr, DL, SE,
7762 /*StrictCheck=*/true);
7763 if (!Diff)
7764 return false;
7765
7766 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7767 return true;
7768 });
7769
7770 if (!Found) {
7771 // If we haven't found enough to usefully cluster, return early.
7772 if (Bases.size() > VL.size() / 2 - 1)
7773 return false;
7774
7775 // Not found already - add a new Base
7776 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7777 }
7778 }
7779
7780 if (Bases.size() == VL.size())
7781 return false;
7782
7783 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7784 Bases.front().second.size() == VL.size()))
7785 return false;
7786
7787 // For each of the bases, sort the pointers by Offset and check if any of the
7788 // bases become consecutively allocated.
7789 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7790 SmallPtrSet<Value *, 13> FirstPointers;
7791 SmallPtrSet<Value *, 13> SecondPointers;
7792 Value *P1 = Ptr1;
7793 Value *P2 = Ptr2;
7794 unsigned Depth = 0;
7795 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7796 if (P1 == P2 || Depth > RecursionMaxDepth)
7797 return false;
7798 FirstPointers.insert(P1);
7799 SecondPointers.insert(P2);
7800 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7801 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7802 ++Depth;
7803 }
7804 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7805 "Unable to find matching root.");
7806 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7807 };
7808 for (auto &Base : Bases) {
7809 for (auto &Vec : Base.second) {
7810 if (Vec.size() > 1) {
7812 int64_t InitialOffset = std::get<1>(Vec[0]);
7813 bool AnyConsecutive =
7814 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7815 return std::get<1>(P.value()) ==
7816 int64_t(P.index()) + InitialOffset;
7817 });
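// E.g. offsets {5, 6, 7, 8} with InitialOffset == 5 are consecutive, while
// {5, 7, 8, 9} are not (index 1 would need offset 6).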
7818 // Fill the SortedIndices array only if it looks worthwhile to sort the
7819 // pointers.
7820 if (!AnyConsecutive)
7821 return false;
7822 }
7823 }
7824 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7825 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7826 });
7827 }
7828
7829 for (auto &T : Bases)
7830 for (const auto &Vec : T.second)
7831 for (const auto &P : Vec)
7832 SortedIndices.push_back(std::get<2>(P));
7833
7834 assert(SortedIndices.size() == VL.size() &&
7835 "Expected SortedIndices to be the size of VL");
7836 return true;
7837}
7838
7839std::optional<BoUpSLP::OrdersType>
7840BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7841 assert(TE.isGather() && "Expected gather node only.");
7842 Type *ScalarTy = TE.Scalars[0]->getType();
7843
7844 SmallVector<Value *> Ptrs;
7845 Ptrs.reserve(TE.Scalars.size());
7846 SmallVector<BasicBlock *> BBs;
7847 BBs.reserve(TE.Scalars.size());
7848 for (Value *V : TE.Scalars) {
7849 auto *L = dyn_cast<LoadInst>(V);
7850 if (!L || !L->isSimple())
7851 return std::nullopt;
7852 Ptrs.push_back(L->getPointerOperand());
7853 BBs.push_back(L->getParent());
7854 }
7855
7856 BoUpSLP::OrdersType Order;
7857 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7858 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7859 return std::move(Order);
7860 return std::nullopt;
7861}
7862
7863/// Check if two insertelement instructions are from the same buildvector.
7864static bool areTwoInsertFromSameBuildVector(
7865 InsertElementInst *VU, InsertElementInst *V,
7866 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7867 // Instructions must be from the same basic blocks.
7868 if (VU->getParent() != V->getParent())
7869 return false;
7870 // Checks if 2 insertelements are from the same buildvector.
7871 if (VU->getType() != V->getType())
7872 return false;
7873 // Multiple used inserts are separate nodes.
7874 if (!VU->hasOneUse() && !V->hasOneUse())
7875 return false;
7876 auto *IE1 = VU;
7877 auto *IE2 = V;
7878 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7879 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7880 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7881 return false;
7882 // Go through the vector operand of insertelement instructions trying to find
7883 // either VU as the original vector for IE2 or V as the original vector for
7884 // IE1.
7885 SmallBitVector ReusedIdx(
7886 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7887 bool IsReusedIdx = false;
7888 do {
7889 if (IE2 == VU && !IE1)
7890 return VU->hasOneUse();
7891 if (IE1 == V && !IE2)
7892 return V->hasOneUse();
7893 if (IE1 && IE1 != V) {
7894 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7895 IsReusedIdx |= ReusedIdx.test(Idx1);
7896 ReusedIdx.set(Idx1);
7897 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7898 IE1 = nullptr;
7899 else
7900 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7901 }
7902 if (IE2 && IE2 != VU) {
7903 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7904 IsReusedIdx |= ReusedIdx.test(Idx2);
7905 ReusedIdx.set(Idx2);
7906 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7907 IE2 = nullptr;
7908 else
7909 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7910 }
7911 } while (!IsReusedIdx && (IE1 || IE2));
7912 return false;
7913}
7914
7915/// Checks if the specified instruction \p I is an alternate operation for
7916/// the given \p MainOp and \p AltOp instructions.
7917static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7918 Instruction *AltOp,
7919 const TargetLibraryInfo &TLI);
7920
7921std::optional<BoUpSLP::OrdersType>
7922BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7923 bool IgnoreReorder) {
7924 // No need to reorder if we need to shuffle reuses - we still need to shuffle
7925 // the node.
7926 if (!TE.ReuseShuffleIndices.empty()) {
7927 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7928 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7929 "Reshuffling scalars not yet supported for nodes with padding");
7930
7931 if (isSplat(TE.Scalars))
7932 return std::nullopt;
7933 // Check if reuse shuffle indices can be improved by reordering.
7934 // For this, check that the reuse mask is "clustered", i.e. each scalar value
7935 // is used once in each submask of size <number_of_scalars>.
7936 // Example: 4 scalar values.
7937 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7938 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7939 // element 3 is used twice in the second submask.
7940 unsigned Sz = TE.Scalars.size();
7941 if (TE.isGather()) {
7942 if (std::optional<OrdersType> CurrentOrder =
7943 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7944 SmallVector<int> Mask;
7945 fixupOrderingIndices(*CurrentOrder);
7946 inversePermutation(*CurrentOrder, Mask);
7947 ::addMask(Mask, TE.ReuseShuffleIndices);
7948 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7949 unsigned Sz = TE.Scalars.size();
7950 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7951 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7952 if (Idx != PoisonMaskElem)
7953 Res[Idx + K * Sz] = I + K * Sz;
7954 }
7955 return std::move(Res);
7956 }
7957 }
7958 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7959 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7960 2 * TE.getVectorFactor())) == 1)
7961 return std::nullopt;
7962 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7963 return std::nullopt;
7964 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7965 Sz)) {
7966 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7967 if (TE.ReorderIndices.empty())
7968 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7969 else
7970 inversePermutation(TE.ReorderIndices, ReorderMask);
7971 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7972 unsigned VF = ReorderMask.size();
7973 OrdersType ResOrder(VF, VF);
7974 unsigned NumParts = divideCeil(VF, Sz);
7975 SmallBitVector UsedVals(NumParts);
7976 for (unsigned I = 0; I < VF; I += Sz) {
7977 int Val = PoisonMaskElem;
7978 unsigned UndefCnt = 0;
7979 unsigned Limit = std::min(Sz, VF - I);
7980 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7981 [&](int Idx) {
7982 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7983 Val = Idx;
7984 if (Idx == PoisonMaskElem)
7985 ++UndefCnt;
7986 return Idx != PoisonMaskElem && Idx != Val;
7987 }) ||
7988 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7989 UndefCnt > Sz / 2)
7990 return std::nullopt;
7991 UsedVals.set(Val);
7992 for (unsigned K = 0; K < NumParts; ++K) {
7993 unsigned Idx = Val + Sz * K;
7994 if (Idx < VF && I + K < VF)
7995 ResOrder[Idx] = I + K;
7996 }
7997 }
7998 return std::move(ResOrder);
7999 }
8000 unsigned VF = TE.getVectorFactor();
8001 // Try build correct order for extractelement instructions.
8002 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
8003 TE.ReuseShuffleIndices.end());
8004 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8005 all_of(TE.Scalars, [Sz](Value *V) {
8006 if (isa<PoisonValue>(V))
8007 return true;
8008 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8009 return Idx && *Idx < Sz;
8010 })) {
8011 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
8012 "by BinaryOperator and CastInst.");
8013 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8014 if (TE.ReorderIndices.empty())
8015 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
8016 else
8017 inversePermutation(TE.ReorderIndices, ReorderMask);
8018 for (unsigned I = 0; I < VF; ++I) {
8019 int &Idx = ReusedMask[I];
8020 if (Idx == PoisonMaskElem)
8021 continue;
8022 Value *V = TE.Scalars[ReorderMask[Idx]];
8023 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
8024 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
8025 }
8026 }
8027 // Build the order of the VF size; we need to reorder the reuses shuffles,
8028 // which are always of VF size.
8029 OrdersType ResOrder(VF);
8030 std::iota(ResOrder.begin(), ResOrder.end(), 0);
8031 auto *It = ResOrder.begin();
8032 for (unsigned K = 0; K < VF; K += Sz) {
8033 OrdersType CurrentOrder(TE.ReorderIndices);
8034 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
8035 if (SubMask.front() == PoisonMaskElem)
8036 std::iota(SubMask.begin(), SubMask.end(), 0);
8037 reorderOrder(CurrentOrder, SubMask);
8038 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
8039 std::advance(It, Sz);
8040 }
8041 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
8042 return Data.index() == Data.value();
8043 }))
8044 return std::nullopt; // No need to reorder.
8045 return std::move(ResOrder);
8046 }
8047 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8048 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8049 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
8050 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
8051 return std::nullopt;
8052 if (TE.State == TreeEntry::SplitVectorize ||
8053 ((TE.State == TreeEntry::Vectorize ||
8054 TE.State == TreeEntry::StridedVectorize ||
8055 TE.State == TreeEntry::CompressVectorize) &&
8056 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
8057 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
8058 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8059 "Alternate instructions are only supported by "
8060 "BinaryOperator and CastInst.");
8061 return TE.ReorderIndices;
8062 }
8063 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8064 TE.isAltShuffle()) {
8065 assert(TE.ReuseShuffleIndices.empty() &&
8066 "ReuseShuffleIndices should be "
8067 "empty for alternate instructions.");
8068 SmallVector<int> Mask;
8069 TE.buildAltOpShuffleMask(
8070 [&](Instruction *I) {
8071 assert(TE.getMatchingMainOpOrAltOp(I) &&
8072 "Unexpected main/alternate opcode");
8073 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
8074 },
8075 Mask);
8076 const int VF = TE.getVectorFactor();
8077 OrdersType ResOrder(VF, VF);
8078 for (unsigned I : seq<unsigned>(VF)) {
8079 if (Mask[I] == PoisonMaskElem)
8080 continue;
8081 ResOrder[Mask[I] % VF] = I;
8082 }
8083 return std::move(ResOrder);
8084 }
8085 if (!TE.ReorderIndices.empty())
8086 return TE.ReorderIndices;
8087 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8088 if (!TE.ReorderIndices.empty())
8089 return TE.ReorderIndices;
8090
8091 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8092 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
8093 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
8094 continue;
8095 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
8096 if (!II)
8097 continue;
8098 Instruction *BVHead = nullptr;
8099 BasicBlock *BB = II->getParent();
8100 while (II && II->hasOneUse() && II->getParent() == BB) {
8101 BVHead = II;
8102 II = dyn_cast<InsertElementInst>(II->getOperand(0));
8103 }
8104 I = BVHead;
8105 }
8106
8107 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8108 assert(BB1 != BB2 && "Expected different basic blocks.");
8109 if (!DT->isReachableFromEntry(BB1))
8110 return false;
8111 if (!DT->isReachableFromEntry(BB2))
8112 return true;
8113 auto *NodeA = DT->getNode(BB1);
8114 auto *NodeB = DT->getNode(BB2);
8115 assert(NodeA && "Should only process reachable instructions");
8116 assert(NodeB && "Should only process reachable instructions");
8117 assert((NodeA == NodeB) ==
8118 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8119 "Different nodes should have different DFS numbers");
8120 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8121 };
8122 auto PHICompare = [&](unsigned I1, unsigned I2) {
8123 Value *V1 = TE.Scalars[I1];
8124 Value *V2 = TE.Scalars[I2];
8125 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8126 return false;
8127 if (isa<PoisonValue>(V1))
8128 return true;
8129 if (isa<PoisonValue>(V2))
8130 return false;
8131 if (V1->getNumUses() < V2->getNumUses())
8132 return true;
8133 if (V1->getNumUses() > V2->getNumUses())
8134 return false;
8135 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
8136 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
8137 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8138 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8139 FirstUserOfPhi2->getParent());
8140 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
8141 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
8142 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
8143 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
8144 if (IE1 && !IE2)
8145 return true;
8146 if (!IE1 && IE2)
8147 return false;
8148 if (IE1 && IE2) {
8149 if (UserBVHead[I1] && !UserBVHead[I2])
8150 return true;
8151 if (!UserBVHead[I1])
8152 return false;
8153 if (UserBVHead[I1] == UserBVHead[I2])
8154 return getElementIndex(IE1) < getElementIndex(IE2);
8155 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8156 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8157 UserBVHead[I2]->getParent());
8158 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8159 }
8160 if (EE1 && !EE2)
8161 return true;
8162 if (!EE1 && EE2)
8163 return false;
8164 if (EE1 && EE2) {
8165 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
8166 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
8167 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
8168 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
8169 if (!Inst2 && !P2)
8170 return Inst1 || P1;
8171 if (EE1->getOperand(0) == EE2->getOperand(0))
8172 return getElementIndex(EE1) < getElementIndex(EE2);
8173 if (!Inst1 && Inst2)
8174 return false;
8175 if (Inst1 && Inst2) {
8176 if (Inst1->getParent() != Inst2->getParent())
8177 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8178 return Inst1->comesBefore(Inst2);
8179 }
8180 if (!P1 && P2)
8181 return false;
8182 assert(P1 && P2 &&
8183 "Expected either instructions or arguments vector operands.");
8184 return P1->getArgNo() < P2->getArgNo();
8185 }
8186 return false;
8187 };
8188 OrdersType Phis(TE.Scalars.size());
8189 std::iota(Phis.begin(), Phis.end(), 0);
8190 stable_sort(Phis, PHICompare);
8191 if (isIdentityOrder(Phis))
8192 return std::nullopt; // No need to reorder.
8193 return std::move(Phis);
8194 }
8195 if (TE.isGather() &&
8196 (!TE.hasState() || !TE.isAltShuffle() ||
8197 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8198 allSameType(TE.Scalars)) {
8199 // TODO: add analysis of other gather nodes with extractelement
8200 // instructions and other values/instructions, not only undefs.
8201 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8202 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
8203 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
8204 all_of(TE.Scalars, [](Value *V) {
8205 auto *EE = dyn_cast<ExtractElementInst>(V);
8206 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8207 })) {
8208 // Check that gather of extractelements can be represented as
8209 // just a shuffle of a single vector.
8210 OrdersType CurrentOrder;
8211 bool Reuse =
8212 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8213 if (Reuse || !CurrentOrder.empty())
8214 return std::move(CurrentOrder);
8215 }
8216 // If the gather node is <undef, v, .., poison> and
8217 // insertelement poison, v, 0 [+ permute]
8218 // is cheaper than
8219 // insertelement poison, v, n - try to reorder.
8220 // If rotating the whole graph, exclude the permute cost, the whole graph
8221 // might be transformed.
8222 int Sz = TE.Scalars.size();
8223 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
8224 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
8225 const auto *It = find_if_not(TE.Scalars, isConstant);
8226 if (It == TE.Scalars.begin())
8227 return OrdersType();
8228 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
8229 if (It != TE.Scalars.end()) {
8230 OrdersType Order(Sz, Sz);
8231 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8232 Order[Idx] = 0;
8233 fixupOrderingIndices(Order);
8234 SmallVector<int> Mask;
8235 inversePermutation(Order, Mask);
8236 InstructionCost PermuteCost =
8237 TopToBottom
8238 ? 0
8239 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
8240 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8241 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
8242 PoisonValue::get(Ty), *It);
8243 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8244 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
8245 PoisonValue::get(Ty), *It);
8246 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8247 OrdersType Order(Sz, Sz);
8248 Order[Idx] = 0;
8249 return std::move(Order);
8250 }
8251 }
8252 }
8253 if (isSplat(TE.Scalars))
8254 return std::nullopt;
8255 if (TE.Scalars.size() >= 3)
8256 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8257 return Order;
8258 // Check if we can include the order of vectorized loads. For masked gathers
8259 // we do extra analysis later, so include such nodes into a special list.
8260 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8261 SmallVector<Value *> PointerOps;
8262 StridedPtrInfo SPtrInfo;
8263 OrdersType CurrentOrder;
8264 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
8265 CurrentOrder, PointerOps, SPtrInfo);
8266 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
8267 Res == LoadsState::CompressVectorize)
8268 return std::move(CurrentOrder);
8269 }
8270 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
8271 // has been audited for correctness with non-power-of-two vectors.
8272 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
8273 if (std::optional<OrdersType> CurrentOrder =
8274 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8275 return CurrentOrder;
8276 }
8277 return std::nullopt;
8278}
8279
8280/// Checks if the given mask is a "clustered" mask with the same clusters of
8281/// size \p Sz, which are not identity submasks.
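/// For example, with \p Sz == 4 the mask {1,0,3,2, 1,0,3,2} is a repeated
/// non-identity cluster, while {0,1,2,3, 0,1,2,3} is rejected because its
/// first cluster is an identity submask.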
8282static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
8283 unsigned Sz) {
8284 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
8285 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
8286 return false;
8287 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8288 ArrayRef<int> Cluster = Mask.slice(I, Sz);
8289 if (Cluster != FirstCluster)
8290 return false;
8291 }
8292 return true;
8293}
8294
8295void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8296 // Reorder reuses mask.
8297 reorderReuses(TE.ReuseShuffleIndices, Mask);
8298 const unsigned Sz = TE.Scalars.size();
8299 // For vectorized nodes and non-clustered reuses, no need to do anything else.
8300 if (!TE.isGather() ||
8301 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8302 Sz) ||
8303 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8304 return;
8305 SmallVector<int> NewMask;
8306 inversePermutation(TE.ReorderIndices, NewMask);
8307 addMask(NewMask, TE.ReuseShuffleIndices);
8308 // Clear reorder since it is going to be applied to the new mask.
8309 TE.ReorderIndices.clear();
8310 // Try to improve gathered nodes with clustered reuses, if possible.
8311 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8312 SmallVector<unsigned> NewOrder(Slice);
8313 inversePermutation(NewOrder, NewMask);
8314 reorderScalars(TE.Scalars, NewMask);
8315 // Fill the reuses mask with the identity submasks.
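// E.g. for Sz == 4 and 8 reuse indices the mask becomes {0,1,2,3, 0,1,2,3}.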
8316 for (auto *It = TE.ReuseShuffleIndices.begin(),
8317 *End = TE.ReuseShuffleIndices.end();
8318 It != End; std::advance(It, Sz))
8319 std::iota(It, std::next(It, Sz), 0);
8320}
8321
8322static void combineOrders(MutableArrayRef<unsigned> Order,
8323 ArrayRef<unsigned> SecondaryOrder) {
8324 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8325 "Expected same size of orders");
8326 size_t Sz = Order.size();
8327 SmallBitVector UsedIndices(Sz);
8328 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8329 if (Order[Idx] != Sz)
8330 UsedIndices.set(Order[Idx]);
8331 }
8332 if (SecondaryOrder.empty()) {
8333 for (unsigned Idx : seq<unsigned>(0, Sz))
8334 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8335 Order[Idx] = Idx;
8336 } else {
8337 for (unsigned Idx : seq<unsigned>(0, Sz))
8338 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8339 !UsedIndices.test(SecondaryOrder[Idx]))
8340 Order[Idx] = SecondaryOrder[Idx];
8341 }
8342}
8343
8346 return false;
8347
8348 constexpr unsigned TinyVF = 2;
8349 constexpr unsigned TinyTree = 10;
8350 constexpr unsigned PhiOpsLimit = 12;
8351 constexpr unsigned GatherLoadsLimit = 2;
8352 if (VectorizableTree.size() <= TinyTree)
8353 return true;
8354 if (VectorizableTree.front()->hasState() &&
8355 !VectorizableTree.front()->isGather() &&
8356 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8357 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8358 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8359 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8360 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8361 VectorizableTree.front()->ReorderIndices.empty()) {
8362 // Check if the tree has only a single store and a single (unordered) load
8363 // node, while the other nodes are phis or geps/binops combined with phis,
8364 // and/or a single gather load node.
8365 if (VectorizableTree.front()->hasState() &&
8366 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8367 VectorizableTree.front()->Scalars.size() == TinyVF &&
8368 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8369 return false;
8370 // A single node which requires reordering - skip.
8371 if (VectorizableTree.front()->hasState() &&
8372 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8373 VectorizableTree.front()->ReorderIndices.empty()) {
8374 const unsigned ReorderedSplitsCnt =
8375 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8376 return TE->State == TreeEntry::SplitVectorize &&
8377 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8378 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8379 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8380 });
8381 if (ReorderedSplitsCnt <= 1 &&
8382 static_cast<unsigned>(count_if(
8383 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8384 return ((!TE->isGather() &&
8385 (TE->ReorderIndices.empty() ||
8386 (TE->UserTreeIndex.UserTE &&
8387 TE->UserTreeIndex.UserTE->State ==
8388 TreeEntry::Vectorize &&
8389 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8390 .empty()))) ||
8391 (TE->isGather() && TE->ReorderIndices.empty() &&
8392 (!TE->hasState() || TE->isAltShuffle() ||
8393 TE->getOpcode() == Instruction::Load ||
8394 TE->getOpcode() == Instruction::ZExt ||
8395 TE->getOpcode() == Instruction::SExt))) &&
8396 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8397 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8398 return !isConstant(V) && isVectorized(V);
8399 }));
8400 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8401 return false;
8402 }
8403 bool HasPhis = false;
8404 bool HasLoad = true;
8405 unsigned GatherLoads = 0;
8406 for (const std::unique_ptr<TreeEntry> &TE :
8407 ArrayRef(VectorizableTree).drop_front()) {
8408 if (TE->State == TreeEntry::SplitVectorize)
8409 continue;
8410 if (!TE->hasState()) {
8411 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8413 continue;
8414 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8416 continue;
8417 return true;
8418 }
8419 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8420 if (!TE->isGather()) {
8421 HasLoad = false;
8422 continue;
8423 }
8424 if (HasLoad)
8425 return true;
8426 ++GatherLoads;
8427 if (GatherLoads >= GatherLoadsLimit)
8428 return true;
8429 }
8430 if (TE->getOpcode() == Instruction::GetElementPtr ||
8431 Instruction::isBinaryOp(TE->getOpcode()))
8432 continue;
8433 if (TE->getOpcode() != Instruction::PHI &&
8434 (!TE->hasCopyableElements() ||
8435 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8436 TE->Scalars.size() / 2))
8437 return true;
8438 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8439 TE->getNumOperands() > PhiOpsLimit)
8440 return false;
8441 HasPhis = true;
8442 }
8443 return !HasPhis;
8444 }
8445 return true;
8446}
8447
8448void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8449 ArrayRef<int> MaskOrder) {
8450 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8451 SmallVector<int> NewMask(getVectorFactor());
8452 SmallVector<int> NewMaskOrder(getVectorFactor());
8453 std::iota(NewMask.begin(), NewMask.end(), 0);
8454 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8455 if (Idx == 0) {
8456 copy(Mask, NewMask.begin());
8457 copy(MaskOrder, NewMaskOrder.begin());
8458 } else {
8459 assert(Idx == 1 && "Expected either 0 or 1 index.");
8460 unsigned Offset = CombinedEntriesWithIndices.back().second;
8461 for (unsigned I : seq<unsigned>(Mask.size())) {
8462 NewMask[I + Offset] = Mask[I] + Offset;
8463 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8464 }
8465 }
8466 reorderScalars(Scalars, NewMask);
8467 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8468 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8469 ReorderIndices.clear();
8470}
8471
8472void BoUpSLP::reorderTopToBottom() {
8473 // Maps VF to the graph nodes.
8474 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8475 // ExtractElement gather nodes which can be vectorized and need to handle
8476 // their ordering.
8477 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8478
8479 // Phi nodes can have preferred ordering based on their result users.
8480 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8481
8482 // AltShuffles can also have a preferred ordering that leads to fewer
8483 // instructions, e.g., the addsub instruction in x86.
8484 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8485
8486 // Maps a TreeEntry to the reorder indices of external users.
8487 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8488 ExternalUserReorderMap;
8489 // Find all reorderable nodes with the given VF.
8490 // Currently these are vectorized stores, loads, extracts + some gathering of
8491 // extracts.
8492 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8493 const std::unique_ptr<TreeEntry> &TE) {
8494 // Look for external users that will probably be vectorized.
8495 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8496 findExternalStoreUsersReorderIndices(TE.get());
8497 if (!ExternalUserReorderIndices.empty()) {
8498 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8499 ExternalUserReorderMap.try_emplace(TE.get(),
8500 std::move(ExternalUserReorderIndices));
8501 }
8502
8503 // Patterns like [fadd,fsub] can be combined into a single instruction in
8504 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8505 // to take into account their order when looking for the most used order.
8506 if (TE->hasState() && TE->isAltShuffle() &&
8507 TE->State != TreeEntry::SplitVectorize) {
8508 Type *ScalarTy = TE->Scalars[0]->getType();
8509 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8510 unsigned Opcode0 = TE->getOpcode();
8511 unsigned Opcode1 = TE->getAltOpcode();
8512 SmallBitVector OpcodeMask(
8513 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8514 // If this pattern is supported by the target then we consider the order.
8515 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8516 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8517 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8518 }
8519 // TODO: Check the reverse order too.
8520 }
8521
8522 bool IgnoreReorder =
8523 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8524 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8525 VectorizableTree.front()->getOpcode() == Instruction::Store);
8526 if (std::optional<OrdersType> CurrentOrder =
8527 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8528 // Do not include ordering for nodes used in the alt opcode vectorization,
8529 // better to reorder them during the bottom-to-top stage. If we follow the
8530 // order here, it causes reordering of the whole graph, though actually it is
8531 // profitable just to reorder the subgraph that starts from the alternate
8532 // opcode vectorization node. Such nodes already end up with the shuffle
8533 // instruction and it is enough to change this shuffle rather than
8534 // rotate the scalars for the whole graph.
8535 unsigned Cnt = 0;
8536 const TreeEntry *UserTE = TE.get();
8537 while (UserTE && Cnt < RecursionMaxDepth) {
8538 if (!UserTE->UserTreeIndex)
8539 break;
8540 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8541 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8542 UserTE->UserTreeIndex.UserTE->Idx != 0)
8543 return;
8544 UserTE = UserTE->UserTreeIndex.UserTE;
8545 ++Cnt;
8546 }
8547 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8548 if (!(TE->State == TreeEntry::Vectorize ||
8549 TE->State == TreeEntry::StridedVectorize ||
8550 TE->State == TreeEntry::SplitVectorize ||
8551 TE->State == TreeEntry::CompressVectorize) ||
8552 !TE->ReuseShuffleIndices.empty())
8553 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8554 if (TE->State == TreeEntry::Vectorize &&
8555 TE->getOpcode() == Instruction::PHI)
8556 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8557 }
8558 });
8559
8560 // Reorder the graph nodes according to their vectorization factor.
8561 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8562 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
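// Note: the VF decrement above subtracts 2 from even factors and 1 from odd
// ones, so the visited factors look like 8 -> 6 -> 4 -> 2 (or 7 -> 6 -> 4 -> 2).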
8563 auto It = VFToOrderedEntries.find(VF);
8564 if (It == VFToOrderedEntries.end())
8565 continue;
8566 // Try to find the most profitable order. We are just looking for the most
8567 // used order and reorder the scalar elements in the nodes according to this
8568 // most used order.
8569 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8570 // Delete VF entry upon exit.
8571 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(It); });
8572
8573 // All operands are reordered and used only in this node - propagate the
8574 // most used order to the user node.
8575 MapVector<OrdersType, unsigned,
8576 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8577 OrdersUses;
8578 for (const TreeEntry *OpTE : OrderedEntries) {
8579 // No need to reorder these nodes; we still need to extend and to use a
8580 // shuffle, just merge the reordering shuffle and the reuse shuffle.
8581 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8582 OpTE->State != TreeEntry::SplitVectorize)
8583 continue;
8584 // Count number of orders uses.
8585 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8586 &PhisToOrders]() -> const OrdersType & {
8587 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8588 auto It = GathersToOrders.find(OpTE);
8589 if (It != GathersToOrders.end())
8590 return It->second;
8591 }
8592 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8593 auto It = AltShufflesToOrders.find(OpTE);
8594 if (It != AltShufflesToOrders.end())
8595 return It->second;
8596 }
8597 if (OpTE->State == TreeEntry::Vectorize &&
8598 OpTE->getOpcode() == Instruction::PHI) {
8599 auto It = PhisToOrders.find(OpTE);
8600 if (It != PhisToOrders.end())
8601 return It->second;
8602 }
8603 return OpTE->ReorderIndices;
8604 }();
8605 // First consider the order of the external scalar users.
8606 auto It = ExternalUserReorderMap.find(OpTE);
8607 if (It != ExternalUserReorderMap.end()) {
8608 const auto &ExternalUserReorderIndices = It->second;
8609 // If the OpTE vector factor != number of scalars - use the natural order;
8610 // this is an attempt to reorder a node with reused scalars but with
8611 // external uses.
8612 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8613 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8614 ExternalUserReorderIndices.size();
8615 } else {
8616 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8617 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8618 }
8619 // No other useful reorder data in this entry.
8620 if (Order.empty())
8621 continue;
8622 }
8623 // Stores actually record the mask, not the order; we need to invert it.
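// E.g. if a store node records the mask {2, 0, 1}, inversePermutation turns it
// into the order {1, 2, 0} before it is counted below.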
8624 if (OpTE->State == TreeEntry::Vectorize &&
8625 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8626 assert(!OpTE->isAltShuffle() &&
8627 "Alternate instructions are only supported by BinaryOperator "
8628 "and CastInst.");
8629 SmallVector<int> Mask;
8630 inversePermutation(Order, Mask);
8631 unsigned E = Order.size();
8632 OrdersType CurrentOrder(E, E);
8633 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8634 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8635 });
8636 fixupOrderingIndices(CurrentOrder);
8637 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8638 } else {
8639 ++OrdersUses.try_emplace(Order, 0).first->second;
8640 }
8641 }
8642 if (OrdersUses.empty())
8643 continue;
8644 // Choose the most used order.
8645 unsigned IdentityCnt = 0;
8646 unsigned FilledIdentityCnt = 0;
8647 OrdersType IdentityOrder(VF, VF);
8648 for (auto &Pair : OrdersUses) {
8649 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8650 if (!Pair.first.empty())
8651 FilledIdentityCnt += Pair.second;
8652 IdentityCnt += Pair.second;
8653 combineOrders(IdentityOrder, Pair.first);
8654 }
8655 }
8656 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8657 unsigned Cnt = IdentityCnt;
8658 for (auto &Pair : OrdersUses) {
8659 // Prefer the identity order. But if a filled identity (non-empty order) was
8660 // found with the same number of uses as the new candidate order, we can
8661 // choose this candidate order.
8662 if (Cnt < Pair.second ||
8663 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8664 Cnt == Pair.second && !BestOrder.empty() &&
8665 isIdentityOrder(BestOrder))) {
8666 combineOrders(Pair.first, BestOrder);
8667 BestOrder = Pair.first;
8668 Cnt = Pair.second;
8669 } else {
8670 combineOrders(BestOrder, Pair.first);
8671 }
8672 }
8673 // Set order of the user node.
8674 if (isIdentityOrder(BestOrder))
8675 continue;
8676 fixupOrderingIndices(BestOrder);
8677 SmallVector<int> Mask;
8678 inversePermutation(BestOrder, Mask);
8679 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8680 unsigned E = BestOrder.size();
8681 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8682 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8683 });
8684 // Do an actual reordering, if profitable.
8685 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8686 // Just do the reordering for the nodes with the given VF.
8687 if (TE->Scalars.size() != VF) {
8688 if (TE->ReuseShuffleIndices.size() == VF) {
8689 assert(TE->State != TreeEntry::SplitVectorize &&
8690 "Split vectorized not expected.");
8691 // Need to reorder the reuses masks of the operands with smaller VF to
8692 // be able to find the match between the graph nodes and scalar
8693 // operands of the given node during vectorization/cost estimation.
8694 assert(
8695 (!TE->UserTreeIndex ||
8696 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8697 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8698 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8699 "All users must be of VF size.");
8700 if (SLPReVec) {
8701 assert(SLPReVec && "Only supported by REVEC.");
8702 // ShuffleVectorInst does not do reorderOperands (and it should not
8703 // because ShuffleVectorInst supports only a limited set of
8704 // patterns). Only do reorderNodeWithReuses if the user is not
8705 // ShuffleVectorInst.
8706 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8707 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8708 continue;
8709 }
8710 // Update ordering of the operands with the smaller VF than the given
8711 // one.
8712 reorderNodeWithReuses(*TE, Mask);
8713 // Update orders in user split vectorize nodes.
8714 if (TE->UserTreeIndex &&
8715 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8716 TE->UserTreeIndex.UserTE->reorderSplitNode(
8717 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8718 }
8719 continue;
8720 }
8721 if ((TE->State == TreeEntry::SplitVectorize &&
8722 TE->ReuseShuffleIndices.empty()) ||
8723 ((TE->State == TreeEntry::Vectorize ||
8724 TE->State == TreeEntry::StridedVectorize ||
8725 TE->State == TreeEntry::CompressVectorize) &&
8726 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8727 InsertElementInst>(TE->getMainOp()) ||
8728 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8729 assert(
8730 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8731 TE->ReuseShuffleIndices.empty())) &&
8732 "Alternate instructions are only supported by BinaryOperator "
8733 "and CastInst.");
8734 // Build correct orders for extract{element,value}, loads,
8735 // stores and alternate (split) nodes.
8736 reorderOrder(TE->ReorderIndices, Mask);
8737 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8738 TE->reorderOperands(Mask);
8739 } else {
8740 // Reorder the node and its operands.
8741 TE->reorderOperands(Mask);
8742 assert(TE->ReorderIndices.empty() &&
8743 "Expected empty reorder sequence.");
8744 reorderScalars(TE->Scalars, Mask);
8745 }
8746 if (!TE->ReuseShuffleIndices.empty()) {
8747 // Apply reversed order to keep the original ordering of the reused
8748 // elements to avoid extra reorder indices shuffling.
8749 OrdersType CurrentOrder;
8750 reorderOrder(CurrentOrder, MaskOrder);
8751 SmallVector<int> NewReuses;
8752 inversePermutation(CurrentOrder, NewReuses);
8753 addMask(NewReuses, TE->ReuseShuffleIndices);
8754 TE->ReuseShuffleIndices.swap(NewReuses);
8755 } else if (TE->UserTreeIndex &&
8756 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8757 // Update orders in user split vectorize nodes.
8758 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8759 Mask, MaskOrder);
8760 }
8761 }
8762}
8763
8764void BoUpSLP::buildReorderableOperands(
8765 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8766 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8767 SmallVectorImpl<TreeEntry *> &GatherOps) {
8768 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8769 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8770 return OpData.first == I &&
8771 (OpData.second->State == TreeEntry::Vectorize ||
8772 OpData.second->State == TreeEntry::StridedVectorize ||
8773 OpData.second->State == TreeEntry::CompressVectorize ||
8774 OpData.second->State == TreeEntry::SplitVectorize);
8775 }))
8776 continue;
8777 // Do not request operands, if they do not exist.
8778 if (UserTE->hasState()) {
8779 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8780 UserTE->getOpcode() == Instruction::ExtractValue)
8781 continue;
8782 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8783 continue;
8784 if (UserTE->getOpcode() == Instruction::Store &&
8785 UserTE->State == TreeEntry::Vectorize && I == 1)
8786 continue;
8787 if (UserTE->getOpcode() == Instruction::Load &&
8788 (UserTE->State == TreeEntry::Vectorize ||
8789 UserTE->State == TreeEntry::StridedVectorize ||
8790 UserTE->State == TreeEntry::CompressVectorize))
8791 continue;
8792 }
8793 TreeEntry *TE = getOperandEntry(UserTE, I);
8794 assert(TE && "Expected operand entry.");
8795 if (!TE->isGather()) {
8796 // Add the node to the list of the ordered nodes with the identity
8797 // order.
8798 Edges.emplace_back(I, TE);
8799 // Add ScatterVectorize nodes to the list of operands, where just
8800 // reordering of the scalars is required. Similar to the gathers, so
8801 // simply add to the list of gathered ops.
8802 // If there are reused scalars, process this node as a regular vectorize
8803 // node, just reorder reuses mask.
8804 if (TE->State == TreeEntry::ScatterVectorize &&
8805 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8806 GatherOps.push_back(TE);
8807 continue;
8808 }
8809 if (ReorderableGathers.contains(TE))
8810 GatherOps.push_back(TE);
8811 }
8812}
8813
8814void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8815 struct TreeEntryCompare {
8816 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8817 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8818 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8819 return LHS->Idx < RHS->Idx;
8820 }
8821 };
8822 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8823 DenseSet<const TreeEntry *> GathersToOrders;
8824 // Find all reorderable leaf nodes with the given VF.
8825 // Currently these are vectorized loads, extracts without alternate operands +
8826 // some gathering of extracts.
8827 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8828 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8829 if (TE->State != TreeEntry::Vectorize &&
8830 TE->State != TreeEntry::StridedVectorize &&
8831 TE->State != TreeEntry::CompressVectorize &&
8832 TE->State != TreeEntry::SplitVectorize)
8833 NonVectorized.insert(TE.get());
8834 if (std::optional<OrdersType> CurrentOrder =
8835 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8836 Queue.push(TE.get());
8837 if (!(TE->State == TreeEntry::Vectorize ||
8838 TE->State == TreeEntry::StridedVectorize ||
8839 TE->State == TreeEntry::CompressVectorize ||
8840 TE->State == TreeEntry::SplitVectorize) ||
8841 !TE->ReuseShuffleIndices.empty())
8842 GathersToOrders.insert(TE.get());
8843 }
8844 }
8845
8846 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8847 // I.e., if the node has operands that are reordered, try to keep at least
8848 // one operand in the natural order and reorder the others + reorder the
8849 // user node itself.
8850 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8851 while (!Queue.empty()) {
8852 // 1. Filter out only reordered nodes.
8853 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8854 TreeEntry *TE = Queue.top();
8855 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8856 Queue.pop();
8857 SmallVector<TreeEntry *> OrderedOps(1, TE);
8858 while (!Queue.empty()) {
8859 TE = Queue.top();
8860 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8861 break;
8862 Queue.pop();
8863 OrderedOps.push_back(TE);
8864 }
8865 for (TreeEntry *TE : OrderedOps) {
8866 if (!(TE->State == TreeEntry::Vectorize ||
8867 TE->State == TreeEntry::StridedVectorize ||
8868 TE->State == TreeEntry::CompressVectorize ||
8869 TE->State == TreeEntry::SplitVectorize ||
8870 (TE->isGather() && GathersToOrders.contains(TE))) ||
8871 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8872 !Visited.insert(TE).second)
8873 continue;
8874 // Build a map between user nodes and their operand order to speed up the
8875 // search. The graph currently does not provide this dependency directly.
8876 Users.first = TE->UserTreeIndex.UserTE;
8877 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8878 }
8879 if (Users.first) {
8880 auto &Data = Users;
8881 if (Data.first->State == TreeEntry::SplitVectorize) {
8882 assert(
8883 Data.second.size() <= 2 &&
8884 "Expected not greater than 2 operands for split vectorize node.");
8885 if (any_of(Data.second,
8886 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8887 continue;
8888 // Update orders in user split vectorize nodes.
8889 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8890 "Expected exactly 2 entries.");
8891 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8892 TreeEntry &OpTE = *VectorizableTree[P.first];
8893 OrdersType Order = OpTE.ReorderIndices;
8894 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8895 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8896 continue;
8897 const auto BestOrder =
8898 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8899 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8900 continue;
8901 Order = *BestOrder;
8902 }
8903 fixupOrderingIndices(Order);
8904 SmallVector<int> Mask;
8905 inversePermutation(Order, Mask);
8906 const unsigned E = Order.size();
8907 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8908 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8909 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8910 });
8911 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8912 // Clear ordering of the operand.
8913 if (!OpTE.ReorderIndices.empty()) {
8914 OpTE.ReorderIndices.clear();
8915 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8916 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8917 } else {
8918 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8919 reorderScalars(OpTE.Scalars, Mask);
8920 }
8921 }
8922 if (Data.first->ReuseShuffleIndices.empty() &&
8923 !Data.first->ReorderIndices.empty()) {
8924 // Insert user node to the list to try to sink reordering deeper in
8925 // the graph.
8926 Queue.push(Data.first);
8927 }
8928 continue;
8929 }
8930 // Check that operands are used only in the User node.
8931 SmallVector<TreeEntry *> GatherOps;
8932 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8933 GatherOps);
8934 // All operands are reordered and used only in this node - propagate the
8935 // most used order to the user node.
8938 OrdersUses;
8939 // Do the analysis for each tree entry only once, otherwise the order of
8940 // the same node may be considered several times, even though it might not
8941 // be profitable.
8944 for (const auto &Op : Data.second) {
8945 TreeEntry *OpTE = Op.second;
8946 if (!VisitedOps.insert(OpTE).second)
8947 continue;
8948 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8949 continue;
8950 const auto Order = [&]() -> const OrdersType {
8951 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8952 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8953 IgnoreReorder)
8954 .value_or(OrdersType(1));
8955 return OpTE->ReorderIndices;
8956 }();
8957 // The order is partially ordered, skip it in favor of fully non-ordered
8958 // orders.
8959 if (Order.size() == 1)
8960 continue;
8961
8962 // Check that the reordering does not increase the number of shuffles, i.e.
8963 // same-values nodes have the same parents, or their parents have the same parents.
8964 if (!Order.empty() && !isIdentityOrder(Order)) {
8965 Value *Root = OpTE->hasState()
8966 ? OpTE->getMainOp()
8967 : *find_if_not(OpTE->Scalars, isConstant);
8968 auto GetSameNodesUsers = [&](Value *Root) {
8970 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8971 if (TE != OpTE && TE->UserTreeIndex &&
8972 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8973 TE->Scalars.size() == OpTE->Scalars.size() &&
8974 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8975 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8976 Res.insert(TE->UserTreeIndex.UserTE);
8977 }
8978 for (const TreeEntry *TE : getTreeEntries(Root)) {
8979 if (TE != OpTE && TE->UserTreeIndex &&
8980 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8981 TE->Scalars.size() == OpTE->Scalars.size() &&
8982 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8983 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8984 Res.insert(TE->UserTreeIndex.UserTE);
8985 }
8986 return Res.takeVector();
8987 };
8988 auto GetNumOperands = [](const TreeEntry *TE) {
8989 if (TE->State == TreeEntry::SplitVectorize)
8990 return TE->getNumOperands();
8991 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8992 return CI->arg_size();
8993 return TE->getNumOperands();
8994 };
8995 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8996 const TreeEntry *TE) {
8998 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
9000 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
9003 continue;
9004 const TreeEntry *Op = getOperandEntry(TE, Idx);
9005 if (Op->isGather() && Op->hasState()) {
9006 const TreeEntry *VecOp =
9007 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
9008 if (VecOp)
9009 Op = VecOp;
9010 }
9011 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
9012 return false;
9013 }
9014 return true;
9015 };
9016 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
9017 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
9018 if (!RevisitedOps.insert(UTE).second)
9019 return false;
9020 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
9021 !UTE->ReuseShuffleIndices.empty() ||
9022 (UTE->UserTreeIndex &&
9023 UTE->UserTreeIndex.UserTE == Data.first) ||
9024 (Data.first->UserTreeIndex &&
9025 Data.first->UserTreeIndex.UserTE == UTE) ||
9026 (IgnoreReorder && UTE->UserTreeIndex &&
9027 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9028 NodeShouldBeReorderedWithOperands(UTE);
9029 }))
9030 continue;
9031 for (TreeEntry *UTE : Users) {
9033 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
9035 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
9038 continue;
9039 const TreeEntry *Op = getOperandEntry(UTE, Idx);
9040 Visited.erase(Op);
9041 Queue.push(const_cast<TreeEntry *>(Op));
9042 }
9043 }
9044 }
9045 unsigned NumOps = count_if(
9046 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9047 return P.second == OpTE;
9048 });
9049 // Stores actually keep the mask, not the order, so it needs to be inverted.
9050 if (OpTE->State == TreeEntry::Vectorize &&
9051 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9052 assert(!OpTE->isAltShuffle() &&
9053 "Alternate instructions are only supported by BinaryOperator "
9054 "and CastInst.");
9055 SmallVector<int> Mask;
9056 inversePermutation(Order, Mask);
9057 unsigned E = Order.size();
9058 OrdersType CurrentOrder(E, E);
9059 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9060 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9061 });
9062 fixupOrderingIndices(CurrentOrder);
9063 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9064 } else {
9065 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9066 }
9067 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9068 const auto AllowsReordering = [&](const TreeEntry *TE) {
9069 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9070 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9071 (IgnoreReorder && TE->Idx == 0))
9072 return true;
9073 if (TE->isGather()) {
9074 if (GathersToOrders.contains(TE))
9075 return !getReorderingData(*TE, /*TopToBottom=*/false,
9076 IgnoreReorder)
9077 .value_or(OrdersType(1))
9078 .empty();
9079 return true;
9080 }
9081 return false;
9082 };
9083 if (OpTE->UserTreeIndex) {
9084 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9085 if (!VisitedUsers.insert(UserTE).second)
9086 continue;
9087 // May reorder the user node if it requires reordering, has reused
9088 // scalars, is an alternate-op vectorize node, or its operand nodes
9089 // require reordering.
9090 if (AllowsReordering(UserTE))
9091 continue;
9092 // Check if the users allow reordering.
9093 // Currently look up just 1 level of operands to avoid an increase
9094 // in compile time.
9095 // It is profitable to reorder if definitely more operands allow
9096 // reordering than prefer the natural order.
9098 if (static_cast<unsigned>(count_if(
9099 Ops, [UserTE, &AllowsReordering](
9100 const std::pair<unsigned, TreeEntry *> &Op) {
9101 return AllowsReordering(Op.second) &&
9102 Op.second->UserTreeIndex.UserTE == UserTE;
9103 })) <= Ops.size() / 2)
9104 ++Res.first->second;
9105 }
9106 }
9107 if (OrdersUses.empty()) {
9108 Visited.insert_range(llvm::make_second_range(Data.second));
9109 continue;
9110 }
9111 // Choose the most used order.
9112 unsigned IdentityCnt = 0;
9113 unsigned VF = Data.second.front().second->getVectorFactor();
9114 OrdersType IdentityOrder(VF, VF);
9115 for (auto &Pair : OrdersUses) {
9116 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9117 IdentityCnt += Pair.second;
9118 combineOrders(IdentityOrder, Pair.first);
9119 }
9120 }
9121 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9122 unsigned Cnt = IdentityCnt;
9123 for (auto &Pair : OrdersUses) {
9124 // Prefer the identity order. But if a filled identity (non-empty
9125 // order) is found with the same number of uses as the new candidate
9126 // order, we can choose this candidate order.
9127 if (Cnt < Pair.second) {
9128 combineOrders(Pair.first, BestOrder);
9129 BestOrder = Pair.first;
9130 Cnt = Pair.second;
9131 } else {
9132 combineOrders(BestOrder, Pair.first);
9133 }
9134 }
9135 // Set order of the user node.
9136 if (isIdentityOrder(BestOrder)) {
9137 Visited.insert_range(llvm::make_second_range(Data.second));
9138 continue;
9139 }
9140 fixupOrderingIndices(BestOrder);
9141 // Erase operands from OrderedEntries list and adjust their orders.
9142 VisitedOps.clear();
9143 SmallVector<int> Mask;
9144 inversePermutation(BestOrder, Mask);
9145 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9146 unsigned E = BestOrder.size();
9147 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9148 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9149 });
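// Annotation (not in the original source): assuming the usual inversion
// Mask[BestOrder[I]] = I, a BestOrder of {2, 0, 1} yields Mask = {1, 2, 0},
// and MaskOrder computed above becomes {2, 0, 1} (any index >= E is mapped
// to PoisonMaskElem).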
9150 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9151 TreeEntry *TE = Op.second;
9152 if (!VisitedOps.insert(TE).second)
9153 continue;
9154 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9155 reorderNodeWithReuses(*TE, Mask);
9156 continue;
9157 }
9158 // Gathers are processed separately.
9159 if (TE->State != TreeEntry::Vectorize &&
9160 TE->State != TreeEntry::StridedVectorize &&
9161 TE->State != TreeEntry::CompressVectorize &&
9162 TE->State != TreeEntry::SplitVectorize &&
9163 (TE->State != TreeEntry::ScatterVectorize ||
9164 TE->ReorderIndices.empty()))
9165 continue;
9166 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9167 TE->ReorderIndices.empty()) &&
9168 "Non-matching sizes of user/operand entries.");
9169 reorderOrder(TE->ReorderIndices, Mask);
9170 if (IgnoreReorder && TE == VectorizableTree.front().get())
9171 IgnoreReorder = false;
9172 }
9173 // For gathers we just need to reorder their scalars.
9174 for (TreeEntry *Gather : GatherOps) {
9175 assert(Gather->ReorderIndices.empty() &&
9176 "Unexpected reordering of gathers.");
9177 if (!Gather->ReuseShuffleIndices.empty()) {
9178 // Just reorder reuses indices.
9179 reorderReuses(Gather->ReuseShuffleIndices, Mask);
9180 continue;
9181 }
9182 reorderScalars(Gather->Scalars, Mask);
9183 Visited.insert(Gather);
9184 }
9185 // Reorder operands of the user node and set the ordering for the user
9186 // node itself.
9187 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9188 return TE.isAltShuffle() &&
9189 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9190 TE.ReorderIndices.empty());
9191 };
9192 if (Data.first->State != TreeEntry::Vectorize ||
9194 Data.first->getMainOp()) ||
9195 IsNotProfitableAltCodeNode(*Data.first))
9196 Data.first->reorderOperands(Mask);
9197 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
9198 IsNotProfitableAltCodeNode(*Data.first) ||
9199 Data.first->State == TreeEntry::StridedVectorize ||
9200 Data.first->State == TreeEntry::CompressVectorize) {
9201 reorderScalars(Data.first->Scalars, Mask);
9202 reorderOrder(Data.first->ReorderIndices, MaskOrder,
9203 /*BottomOrder=*/true);
9204 if (Data.first->ReuseShuffleIndices.empty() &&
9205 !Data.first->ReorderIndices.empty() &&
9206 !IsNotProfitableAltCodeNode(*Data.first)) {
9207 // Insert user node to the list to try to sink reordering deeper in
9208 // the graph.
9209 Queue.push(Data.first);
9210 }
9211 } else {
9212 reorderOrder(Data.first->ReorderIndices, Mask);
9213 }
9214 }
9215 }
9216 // If the reordering is unnecessary, just remove the reorder.
9217 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9218 VectorizableTree.front()->ReuseShuffleIndices.empty())
9219 VectorizableTree.front()->ReorderIndices.clear();
9220}
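// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// reorderBottomToTop() above picks the "most used" order among a user node's
// operands: identity-like orders are pooled into one counter, and a
// non-identity candidate wins only when it is used strictly more often. A
// stand-alone model of that voting, with plain STL types and a hypothetical
// helper name (pickMostUsedOrder), under this simplified reading (order
// combining is omitted):
#include <map>
#include <vector>

using OrderVec = std::vector<unsigned>;

static bool isIdentityOrderVec(const OrderVec &O) {
  for (unsigned I = 0, E = O.size(); I != E; ++I)
    if (O[I] != I)
      return false;
  return true;
}

// An empty result means "keep the identity order".
static OrderVec pickMostUsedOrder(const std::map<OrderVec, unsigned> &OrderUses) {
  unsigned IdentityCnt = 0;
  for (const auto &[O, Cnt] : OrderUses)
    if (O.empty() || isIdentityOrderVec(O))
      IdentityCnt += Cnt; // identity and empty orders share one counter
  OrderVec Best;
  unsigned BestCnt = IdentityCnt;
  for (const auto &[O, Cnt] : OrderUses)
    if (!O.empty() && !isIdentityOrderVec(O) && Cnt > BestCnt) {
      Best = O; // strictly more uses than the identity: take the candidate
      BestCnt = Cnt;
    }
  return Best;
}
// ----------------------------------------------------------------------------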
9221
9222Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9223 if (Entry.hasState() &&
9224 (Entry.getOpcode() == Instruction::Store ||
9225 Entry.getOpcode() == Instruction::Load) &&
9226 Entry.State == TreeEntry::StridedVectorize &&
9227 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9228 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
9229 return dyn_cast<Instruction>(Entry.Scalars.front());
9230}
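// Annotation (not in the original source): for a strided load/store entry
// with a reversed order such as ReorderIndices = {3, 2, 1, 0}, the function
// above returns Scalars[ReorderIndices.front()], i.e. Scalars[3], instead of
// Scalars.front().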
9231
9232void BoUpSLP::buildExternalUses(
9233 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
9234 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9235 DenseMap<Value *, unsigned> ScalarToExtUses;
9236 // Collect the values that we need to extract from the tree.
9237 for (auto &TEPtr : VectorizableTree) {
9238 TreeEntry *Entry = TEPtr.get();
9239
9240 // No need to handle users of gathered values.
9241 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9242 DeletedNodes.contains(Entry) ||
9243 TransformedToGatherNodes.contains(Entry))
9244 continue;
9245
9246 // For each lane:
9247 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9248 Value *Scalar = Entry->Scalars[Lane];
9249 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
9250 continue;
9251
9252 // Were all uses already replaced? Then there is no need to do it again.
9253 auto It = ScalarToExtUses.find(Scalar);
9254 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9255 continue;
9256
9257 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9258 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9259 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9260 << " from " << *Scalar << " for many users.\n");
9261 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9262 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9263 ExternalUsesWithNonUsers.insert(Scalar);
9264 continue;
9265 }
9266
9267 // Check if the scalar is externally used as an extra arg.
9268 const auto ExtI = ExternallyUsedValues.find(Scalar);
9269 if (ExtI != ExternallyUsedValues.end()) {
9270 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9271 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9272 << FoundLane << " from " << *Scalar << ".\n");
9273 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9274 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9275 continue;
9276 }
9277 for (User *U : Scalar->users()) {
9278 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9279
9280 Instruction *UserInst = dyn_cast<Instruction>(U);
9281 if (!UserInst || isDeleted(UserInst))
9282 continue;
9283
9284 // Ignore users in the user ignore list.
9285 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9286 continue;
9287
9288 // Skip in-tree scalars that become vectors
9289 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
9290 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9291 return !DeletedNodes.contains(UseEntry) &&
9292 !TransformedToGatherNodes.contains(UseEntry);
9293 })) {
9294 // Some in-tree scalars will remain as scalar in vectorized
9295 // instructions. If that is the case, the one in FoundLane will
9296 // be used.
9297 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9298 isa<LoadInst, StoreInst>(UserInst)) ||
9299 isa<CallInst>(UserInst)) ||
9300 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9301 if (DeletedNodes.contains(UseEntry) ||
9302 TransformedToGatherNodes.contains(UseEntry))
9303 return true;
9304 return UseEntry->State == TreeEntry::ScatterVectorize ||
9305 !doesInTreeUserNeedToExtract(
9306 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9307 TTI);
9308 })) {
9309 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9310 << ".\n");
9311 assert(none_of(UseEntries,
9312 [](TreeEntry *UseEntry) {
9313 return UseEntry->isGather();
9314 }) &&
9315 "Bad state");
9316 continue;
9317 }
9318 U = nullptr;
9319 if (It != ScalarToExtUses.end()) {
9320 ExternalUses[It->second].User = nullptr;
9321 break;
9322 }
9323 }
9324
9325 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9326 U = nullptr;
9327 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9328 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9329 << " from lane " << FoundLane << " from " << *Scalar
9330 << ".\n");
9331 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9332 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9333 ExternalUsesWithNonUsers.insert(Scalar);
9334 if (!U)
9335 break;
9336 }
9337 }
9338 }
9339}
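// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// buildExternalUses() above records, for every vectorized scalar, the users
// that stay outside the vectorizable tree so that an extractelement can be
// emitted for them later. The bookkeeping pattern in isolation, with integer
// ids standing in for Value* and a hypothetical helper name:
#include <set>
#include <utility>
#include <vector>

struct ExternalUseModel {
  int Scalar;    // id of the vectorized scalar
  int User;      // id of the out-of-tree user
  unsigned Lane; // lane of the scalar inside its tree entry
};

static std::vector<ExternalUseModel>
collectExternalUses(const std::vector<int> &EntryScalars,
                    const std::set<int> &InTreeValues,
                    const std::vector<std::pair<int, int>> &UseEdges) {
  std::vector<ExternalUseModel> Uses;
  for (unsigned Lane = 0; Lane != EntryScalars.size(); ++Lane)
    for (const auto &[Def, User] : UseEdges)
      if (Def == EntryScalars[Lane] && !InTreeValues.count(User))
        Uses.push_back({Def, User, Lane}); // needs an extract for this lane
  return Uses;
}
// ----------------------------------------------------------------------------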
9340
9341SmallVector<SmallVector<StoreInst *>>
9342BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9345 PtrToStoresMap;
9346 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9347 Value *V = TE->Scalars[Lane];
9348 // Don't iterate over the users of constant data.
9349 if (!isa<Instruction>(V))
9350 continue;
9351 // To save compilation time we don't visit values that have too many users.
9352 if (V->hasNUsesOrMore(UsesLimit))
9353 break;
9354
9355 // Collect stores per pointer object.
9356 for (User *U : V->users()) {
9357 auto *SI = dyn_cast<StoreInst>(U);
9358 // Test whether we can handle the store. V might be a global, which could
9359 // be used in a different function.
9360 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9361 !isValidElementType(SI->getValueOperand()->getType()))
9362 continue;
9363 // Skip entry if already vectorized.
9364 if (isVectorized(U))
9365 continue;
9366
9367 Value *Ptr =
9368 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9369 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9370 SI->getValueOperand()->getType(), Ptr}];
9371 // For now just keep one store per pointer object per lane.
9372 // TODO: Extend this to support multiple stores per pointer per lane
9373 if (StoresVec.size() > Lane)
9374 continue;
9375 if (!StoresVec.empty()) {
9376 std::optional<int64_t> Diff = getPointersDiff(
9377 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9378 SI->getValueOperand()->getType(),
9379 StoresVec.front()->getPointerOperand(), *DL, *SE,
9380 /*StrictCheck=*/true);
9381 // We failed to compare the pointers so just abandon this store.
9382 if (!Diff)
9383 continue;
9384 }
9385 StoresVec.push_back(SI);
9386 }
9387 }
9388 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9389 unsigned I = 0;
9390 for (auto &P : PtrToStoresMap) {
9391 Res[I].swap(P.second);
9392 ++I;
9393 }
9394 return Res;
9395}
9396
9397bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9398 OrdersType &ReorderIndices) const {
9399 // We check whether the stores in StoresVec can form a vector by sorting them
9400 // and checking whether they are consecutive.
9401
9402 // To avoid calling getPointersDiff() while sorting we create a vector of
9403 // pairs {store, offset from first} and sort this instead.
9404 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9405 StoreInst *S0 = StoresVec[0];
9406 StoreOffsetVec.emplace_back(0, 0);
9407 Type *S0Ty = S0->getValueOperand()->getType();
9408 Value *S0Ptr = S0->getPointerOperand();
9409 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9410 StoreInst *SI = StoresVec[Idx];
9411 std::optional<int64_t> Diff =
9412 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9413 SI->getPointerOperand(), *DL, *SE,
9414 /*StrictCheck=*/true);
9415 StoreOffsetVec.emplace_back(*Diff, Idx);
9416 }
9417
9418 // Check if the stores are consecutive by checking if their difference is 1.
9419 if (StoreOffsetVec.size() != StoresVec.size())
9420 return false;
9421 sort(StoreOffsetVec, llvm::less_first());
9422 unsigned Idx = 0;
9423 int64_t PrevDist = 0;
9424 for (const auto &P : StoreOffsetVec) {
9425 if (Idx > 0 && P.first != PrevDist + 1)
9426 return false;
9427 PrevDist = P.first;
9428 ++Idx;
9429 }
9430
9431 // Calculate the shuffle indices according to their offset against the sorted
9432 // StoreOffsetVec.
9433 ReorderIndices.assign(StoresVec.size(), 0);
9434 bool IsIdentity = true;
9435 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9436 ReorderIndices[P.second] = I;
9437 IsIdentity &= P.second == I;
9438 }
9439 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9440 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9441 // same convention here.
9442 if (IsIdentity)
9443 ReorderIndices.clear();
9444
9445 return true;
9446}
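// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// canFormVector() above sorts the stores by their pointer offset from the
// first store and accepts them only if the sorted offsets are consecutive;
// the reorder index of store I is then its position in the sorted sequence.
// A self-contained version of that check over plain offsets (the helper name
// is hypothetical):
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Returns true and fills ReorderIndices if the offsets form a consecutive run.
// An identity order is returned as an empty vector, matching the convention
// used by the SLP reordering code.
static bool offsetsFormVector(const std::vector<int64_t> &Offsets,
                              std::vector<unsigned> &ReorderIndices) {
  std::vector<std::pair<int64_t, unsigned>> Sorted;
  for (unsigned I = 0; I != Offsets.size(); ++I)
    Sorted.emplace_back(Offsets[I], I);
  std::sort(Sorted.begin(), Sorted.end());
  for (unsigned I = 1; I < Sorted.size(); ++I)
    if (Sorted[I].first != Sorted[I - 1].first + 1)
      return false; // not consecutive
  ReorderIndices.assign(Offsets.size(), 0);
  bool IsIdentity = true;
  for (unsigned Pos = 0; Pos != Sorted.size(); ++Pos) {
    ReorderIndices[Sorted[Pos].second] = Pos;
    IsIdentity &= Sorted[Pos].second == Pos;
  }
  if (IsIdentity)
    ReorderIndices.clear();
  // Example: Offsets = {4, 6, 5, 7} -> ReorderIndices = {0, 2, 1, 3}.
  return true;
}
// ----------------------------------------------------------------------------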
9447
9448#ifndef NDEBUG
9450 for (unsigned Idx : Order)
9451 dbgs() << Idx << ", ";
9452 dbgs() << "\n";
9453}
9454#endif
9455
9456SmallVector<BoUpSLP::OrdersType, 1>
9457BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9458 unsigned NumLanes = TE->Scalars.size();
9459
9460 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9461
9462 // Holds the reorder indices for each candidate store vector that is a user of
9463 // the current TreeEntry.
9464 SmallVector<OrdersType, 1> ExternalReorderIndices;
9465
9466 // Now inspect the stores collected per pointer and look for vectorization
9467 // candidates. For each candidate calculate the reorder index vector and push
9468 // it into `ExternalReorderIndices`
9469 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9470 // If we have fewer than NumLanes stores, then we can't form a vector.
9471 if (StoresVec.size() != NumLanes)
9472 continue;
9473
9474 // If the stores are not consecutive then abandon this StoresVec.
9475 OrdersType ReorderIndices;
9476 if (!canFormVector(StoresVec, ReorderIndices))
9477 continue;
9478
9479 // We now know that the scalars in StoresVec can form a vector instruction,
9480 // so set the reorder indices.
9481 ExternalReorderIndices.push_back(ReorderIndices);
9482 }
9483 return ExternalReorderIndices;
9484}
9485
9486void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9487 const SmallDenseSet<Value *> &UserIgnoreLst) {
9488 deleteTree();
9489 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9490 "TreeEntryToStridedPtrInfoMap is not cleared");
9491 UserIgnoreList = &UserIgnoreLst;
9492 if (!allSameType(Roots))
9493 return;
9494 buildTreeRec(Roots, 0, EdgeInfo());
9495}
9496
9497void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9498 deleteTree();
9499 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9500 "TreeEntryToStridedPtrInfoMap is not cleared");
9501 if (!allSameType(Roots))
9502 return;
9503 buildTreeRec(Roots, 0, EdgeInfo());
9504}
9505
9506/// Tries to find a subvector of loads and builds a new vector of only loads
9507/// if it can be profitable.
9509 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9511 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9512 bool AddNew = true) {
9513 if (VL.empty())
9514 return;
9515 Type *ScalarTy = getValueType(VL.front());
9516 if (!isValidElementType(ScalarTy))
9517 return;
9519 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9520 for (Value *V : VL) {
9521 auto *LI = dyn_cast<LoadInst>(V);
9522 if (!LI)
9523 continue;
9524 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9525 continue;
9526 bool IsFound = false;
9527 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9528 assert(LI->getParent() == Data.front().first->getParent() &&
9529 LI->getType() == Data.front().first->getType() &&
9530 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9531 getUnderlyingObject(Data.front().first->getPointerOperand(),
9533 "Expected loads with the same type, same parent and same "
9534 "underlying pointer.");
9535 std::optional<int64_t> Dist = getPointersDiff(
9536 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9537 Data.front().first->getPointerOperand(), DL, SE,
9538 /*StrictCheck=*/true);
9539 if (!Dist)
9540 continue;
9541 auto It = Map.find(*Dist);
9542 if (It != Map.end() && It->second != LI)
9543 continue;
9544 if (It == Map.end()) {
9545 Data.emplace_back(LI, *Dist);
9546 Map.try_emplace(*Dist, LI);
9547 }
9548 IsFound = true;
9549 break;
9550 }
9551 if (!IsFound) {
9552 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9553 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9554 }
9555 }
9556 auto FindMatchingLoads =
9559 &GatheredLoads,
9560 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9561 int64_t &Offset, unsigned &Start) {
9562 if (Loads.empty())
9563 return GatheredLoads.end();
9564 LoadInst *LI = Loads.front().first;
9565 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9566 if (Idx < Start)
9567 continue;
9568 ToAdd.clear();
9569 if (LI->getParent() != Data.front().first->getParent() ||
9570 LI->getType() != Data.front().first->getType())
9571 continue;
9572 std::optional<int64_t> Dist =
9574 Data.front().first->getType(),
9575 Data.front().first->getPointerOperand(), DL, SE,
9576 /*StrictCheck=*/true);
9577 if (!Dist)
9578 continue;
9579 SmallSet<int64_t, 4> DataDists;
9581 for (std::pair<LoadInst *, int64_t> P : Data) {
9582 DataDists.insert(P.second);
9583 DataLoads.insert(P.first);
9584 }
9585 // Found matching gathered loads - check if all loads are unique or
9586 // can be effectively vectorized.
9587 unsigned NumUniques = 0;
9588 for (auto [Cnt, Pair] : enumerate(Loads)) {
9589 bool Used = DataLoads.contains(Pair.first);
9590 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9591 ++NumUniques;
9592 ToAdd.insert(Cnt);
9593 } else if (Used) {
9594 Repeated.insert(Cnt);
9595 }
9596 }
9597 if (NumUniques > 0 &&
9598 (Loads.size() == NumUniques ||
9599 (Loads.size() - NumUniques >= 2 &&
9600 Loads.size() - NumUniques >= Loads.size() / 2 &&
9601 (has_single_bit(Data.size() + NumUniques) ||
9602 bit_ceil(Data.size()) <
9603 bit_ceil(Data.size() + NumUniques))))) {
9604 Offset = *Dist;
9605 Start = Idx + 1;
9606 return std::next(GatheredLoads.begin(), Idx);
9607 }
9608 }
9609 ToAdd.clear();
9610 return GatheredLoads.end();
9611 };
9612 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9613 unsigned Start = 0;
9614 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9615 int64_t Offset = 0;
9616 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9617 Offset, Start);
9618 while (It != GatheredLoads.end()) {
9619 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9620 for (unsigned Idx : LocalToAdd)
9621 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9622 ToAdd.insert_range(LocalToAdd);
9623 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9624 Start);
9625 }
9626 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9627 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9628 })) {
9629 auto AddNewLoads =
9631 for (unsigned Idx : seq<unsigned>(Data.size())) {
9632 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9633 continue;
9634 Loads.push_back(Data[Idx]);
9635 }
9636 };
9637 if (!AddNew) {
9638 LoadInst *LI = Data.front().first;
9639 It = find_if(
9640 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9641 return PD.front().first->getParent() == LI->getParent() &&
9642 PD.front().first->getType() == LI->getType();
9643 });
9644 while (It != GatheredLoads.end()) {
9645 AddNewLoads(*It);
9646 It = std::find_if(
9647 std::next(It), GatheredLoads.end(),
9648 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9649 return PD.front().first->getParent() == LI->getParent() &&
9650 PD.front().first->getType() == LI->getType();
9651 });
9652 }
9653 }
9654 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9655 AddNewLoads(GatheredLoads.emplace_back());
9656 }
9657 }
9658}
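// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// gatherPossiblyVectorizableLoads() above clusters loads whose pointers are a
// constant number of elements away from a cluster's first load, keeping at
// most one load per distance (mirroring the ClusteredDistToLoad map). The
// clustering step modeled over plain integer "addresses" (helper name and
// types are hypothetical):
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

struct LoadModel {
  int Id;          // stand-in for the LoadInst*
  int64_t Address; // element index of the load within its base object
};

static std::vector<std::vector<std::pair<int, int64_t>>>
clusterByDistance(const std::vector<LoadModel> &Loads) {
  std::vector<std::vector<std::pair<int, int64_t>>> Clusters;
  std::vector<int64_t> FrontAddr;                 // address of each cluster's first load
  std::vector<std::map<int64_t, int>> DistToLoad; // one distance map per cluster
  for (const LoadModel &L : Loads) {
    bool Found = false;
    for (unsigned C = 0; C < Clusters.size() && !Found; ++C) {
      int64_t Dist = L.Address - FrontAddr[C];
      auto It = DistToLoad[C].find(Dist);
      if (It != DistToLoad[C].end() && It->second != L.Id)
        continue; // distance slot already taken by a different load
      if (It == DistToLoad[C].end()) {
        Clusters[C].emplace_back(L.Id, Dist);
        DistToLoad[C].emplace(Dist, L.Id);
      }
      Found = true;
    }
    if (!Found) { // start a new cluster rooted at this load (distance 0)
      Clusters.push_back({{L.Id, 0}});
      FrontAddr.push_back(L.Address);
      DistToLoad.push_back({{0, L.Id}});
    }
  }
  return Clusters;
}
// ----------------------------------------------------------------------------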
9659
9660void BoUpSLP::tryToVectorizeGatheredLoads(
9661 const SmallMapVector<
9662 std::tuple<BasicBlock *, Value *, Type *>,
9663 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9664 &GatheredLoads) {
9665 GatheredLoadsEntriesFirst = VectorizableTree.size();
9666
9667 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9668 LoadEntriesToVectorize.size());
9669 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9670 Set.insert_range(VectorizableTree[Idx]->Scalars);
9671
9672 // Sort loads by distance.
9673 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9674 const std::pair<LoadInst *, int64_t> &L2) {
9675 return L1.second > L2.second;
9676 };
9677
9678 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9679 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9680 Loads.size());
9681 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9682 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9683 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9684 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9685 };
9686
9687 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9688 BoUpSLP::ValueSet &VectorizedLoads,
9689 SmallVectorImpl<LoadInst *> &NonVectorized,
9690 bool Final, unsigned MaxVF) {
9692 unsigned StartIdx = 0;
9693 SmallVector<int> CandidateVFs;
9694 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9695 CandidateVFs.push_back(MaxVF);
9696 for (int NumElts = getFloorFullVectorNumberOfElements(
9697 *TTI, Loads.front()->getType(), MaxVF);
9698 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9699 *TTI, Loads.front()->getType(), NumElts - 1)) {
9700 CandidateVFs.push_back(NumElts);
9701 if (VectorizeNonPowerOf2 && NumElts > 2)
9702 CandidateVFs.push_back(NumElts - 1);
9703 }
9704
9705 if (Final && CandidateVFs.empty())
9706 return Results;
9707
9708 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9709 for (unsigned NumElts : CandidateVFs) {
9710 if (Final && NumElts > BestVF)
9711 continue;
9712 SmallVector<unsigned> MaskedGatherVectorized;
9713 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9714 ++Cnt) {
9715 ArrayRef<LoadInst *> Slice =
9716 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9717 if (VectorizedLoads.count(Slice.front()) ||
9718 VectorizedLoads.count(Slice.back()) ||
9720 continue;
9721 // Check if it is profitable to try vectorizing gathered loads. It is
9722 // profitable if we have more than 3 consecutive loads or if we have
9723 // fewer but all users are vectorized or deleted.
9724 bool AllowToVectorize = false;
9725 // Check if it is profitable to vectorize 2-elements loads.
9726 if (NumElts == 2) {
9727 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9728 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9729 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9730 for (LoadInst *LI : Slice) {
9731 // If there is a single use/user, allow vectorization.
9732 if (LI->hasOneUse())
9733 continue;
9734 // 1. Check if number of uses equals number of users.
9735 // 2. All users are deleted.
9736 // 3. The load broadcasts are not allowed or the load is not
9737 // broadcasted.
9738 if (static_cast<unsigned int>(std::distance(
9739 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9740 return false;
9741 if (!IsLegalBroadcastLoad)
9742 continue;
9743 if (LI->hasNUsesOrMore(UsesLimit))
9744 return false;
9745 for (User *U : LI->users()) {
9746 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9747 continue;
9748 for (const TreeEntry *UTE : getTreeEntries(U)) {
9749 for (int I : seq<int>(UTE->getNumOperands())) {
9750 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9751 return V == LI || isa<PoisonValue>(V);
9752 }))
9753 // Found legal broadcast - do not vectorize.
9754 return false;
9755 }
9756 }
9757 }
9758 }
9759 return true;
9760 };
9761 AllowToVectorize = CheckIfAllowed(Slice);
9762 } else {
9763 AllowToVectorize =
9764 (NumElts >= 3 ||
9765 any_of(ValueToGatherNodes.at(Slice.front()),
9766 [=](const TreeEntry *TE) {
9767 return TE->Scalars.size() == 2 &&
9768 ((TE->Scalars.front() == Slice.front() &&
9769 TE->Scalars.back() == Slice.back()) ||
9770 (TE->Scalars.front() == Slice.back() &&
9771 TE->Scalars.back() == Slice.front()));
9772 })) &&
9773 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9774 Slice.size());
9775 }
9776 if (AllowToVectorize) {
9777 SmallVector<Value *> PointerOps;
9778 OrdersType CurrentOrder;
9779 // Try to build vector load.
9780 ArrayRef<Value *> Values(
9781 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9782 StridedPtrInfo SPtrInfo;
9783 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9784 PointerOps, SPtrInfo, &BestVF);
9785 if (LS != LoadsState::Gather ||
9786 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9787 if (LS == LoadsState::ScatterVectorize) {
9788 if (MaskedGatherVectorized.empty() ||
9789 Cnt >= MaskedGatherVectorized.back() + NumElts)
9790 MaskedGatherVectorized.push_back(Cnt);
9791 continue;
9792 }
9793 if (LS != LoadsState::Gather) {
9794 Results.emplace_back(Values, LS);
9795 VectorizedLoads.insert_range(Slice);
9796 // If we vectorized the initial block, no need to try to vectorize it
9797 // again.
9798 if (Cnt == StartIdx)
9799 StartIdx += NumElts;
9800 }
9801 // Check if the whole array was vectorized already - exit.
9802 if (StartIdx >= Loads.size())
9803 break;
9804 // Erase last masked gather candidate, if another candidate within
9805 // the range is found to be better.
9806 if (!MaskedGatherVectorized.empty() &&
9807 Cnt < MaskedGatherVectorized.back() + NumElts)
9808 MaskedGatherVectorized.pop_back();
9809 Cnt += NumElts - 1;
9810 continue;
9811 }
9812 }
9813 if (!AllowToVectorize || BestVF == 0)
9815 }
9816 // Mark masked gather candidates as vectorized, if any.
9817 for (unsigned Cnt : MaskedGatherVectorized) {
9818 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9819 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9820 ArrayRef<Value *> Values(
9821 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9822 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9823 VectorizedLoads.insert_range(Slice);
9824 // If we vectorized the initial block, no need to try to vectorize it again.
9825 if (Cnt == StartIdx)
9826 StartIdx += NumElts;
9827 }
9828 }
9829 for (LoadInst *LI : Loads) {
9830 if (!VectorizedLoads.contains(LI))
9831 NonVectorized.push_back(LI);
9832 }
9833 return Results;
9834 };
9835 auto ProcessGatheredLoads =
9836 [&, &TTI = *TTI](
9838 bool Final = false) {
9839 SmallVector<LoadInst *> NonVectorized;
9840 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9841 GatheredLoads) {
9842 if (LoadsDists.size() <= 1) {
9843 NonVectorized.push_back(LoadsDists.back().first);
9844 continue;
9845 }
9847 LoadsDists);
9848 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9849 stable_sort(LocalLoadsDists, LoadSorter);
9851 unsigned MaxConsecutiveDistance = 0;
9852 unsigned CurrentConsecutiveDist = 1;
9853 int64_t LastDist = LocalLoadsDists.front().second;
9854 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9855 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9856 if (isVectorized(L.first))
9857 continue;
9858 assert(LastDist >= L.second &&
9859 "Expected first distance always not less than second");
9860 if (static_cast<uint64_t>(LastDist - L.second) ==
9861 CurrentConsecutiveDist) {
9862 ++CurrentConsecutiveDist;
9863 MaxConsecutiveDistance =
9864 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9865 Loads.push_back(L.first);
9866 continue;
9867 }
9868 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9869 !Loads.empty())
9870 Loads.pop_back();
9871 CurrentConsecutiveDist = 1;
9872 LastDist = L.second;
9873 Loads.push_back(L.first);
9874 }
9875 if (Loads.size() <= 1)
9876 continue;
9877 if (AllowMaskedGather)
9878 MaxConsecutiveDistance = Loads.size();
9879 else if (MaxConsecutiveDistance < 2)
9880 continue;
9881 BoUpSLP::ValueSet VectorizedLoads;
9882 SmallVector<LoadInst *> SortedNonVectorized;
9884 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9885 Final, MaxConsecutiveDistance);
9886 if (!Results.empty() && !SortedNonVectorized.empty() &&
9887 OriginalLoads.size() == Loads.size() &&
9888 MaxConsecutiveDistance == Loads.size() &&
9890 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9891 return P.second == LoadsState::ScatterVectorize;
9892 })) {
9893 VectorizedLoads.clear();
9894 SmallVector<LoadInst *> UnsortedNonVectorized;
9896 UnsortedResults =
9897 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9898 UnsortedNonVectorized, Final,
9899 OriginalLoads.size());
9900 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9901 SortedNonVectorized.swap(UnsortedNonVectorized);
9902 Results.swap(UnsortedResults);
9903 }
9904 }
9905 for (auto [Slice, _] : Results) {
9906 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9907 << Slice.size() << ")\n");
9908 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9909 for (Value *L : Slice)
9910 if (!isVectorized(L))
9911 SortedNonVectorized.push_back(cast<LoadInst>(L));
9912 continue;
9913 }
9914
9915 // Select the maximum VF as the maximum of the sizes of the user gathered
9916 // nodes and the distance between scalar loads in these nodes.
9917 unsigned MaxVF = Slice.size();
9918 unsigned UserMaxVF = 0;
9919 unsigned InterleaveFactor = 0;
9920 if (MaxVF == 2) {
9921 UserMaxVF = MaxVF;
9922 } else {
9923 // Find the distance between segments of the interleaved loads.
9924 std::optional<unsigned> InterleavedLoadsDistance = 0;
9925 unsigned Order = 0;
9926 std::optional<unsigned> CommonVF = 0;
9927 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9928 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9929 for (auto [Idx, V] : enumerate(Slice)) {
9930 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9931 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9932 unsigned Pos =
9933 EntryToPosition.try_emplace(E, Idx).first->second;
9934 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9935 if (CommonVF) {
9936 if (*CommonVF == 0) {
9937 CommonVF = E->Scalars.size();
9938 continue;
9939 }
9940 if (*CommonVF != E->Scalars.size())
9941 CommonVF.reset();
9942 }
9943 // Check if the load is part of an interleaved load.
9944 if (Pos != Idx && InterleavedLoadsDistance) {
9945 if (!DeinterleavedNodes.contains(E) &&
9946 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9947 if (isa<Constant>(V))
9948 return false;
9949 if (isVectorized(V))
9950 return true;
9951 const auto &Nodes = ValueToGatherNodes.at(V);
9952 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9953 !is_contained(Slice, V);
9954 })) {
9955 InterleavedLoadsDistance.reset();
9956 continue;
9957 }
9958 DeinterleavedNodes.insert(E);
9959 if (*InterleavedLoadsDistance == 0) {
9960 InterleavedLoadsDistance = Idx - Pos;
9961 continue;
9962 }
9963 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9964 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9965 InterleavedLoadsDistance.reset();
9966 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9967 }
9968 }
9969 }
9970 DeinterleavedNodes.clear();
9971 // Check if the large load represents an interleaved load operation.
9972 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9973 CommonVF.value_or(0) != 0) {
9974 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9975 unsigned VF = *CommonVF;
9976 OrdersType Order;
9977 SmallVector<Value *> PointerOps;
9978 StridedPtrInfo SPtrInfo;
9979 // Segmented load detected - vectorize at maximum vector factor.
9980 if (InterleaveFactor <= Slice.size() &&
9981 TTI.isLegalInterleavedAccessType(
9982 getWidenedType(Slice.front()->getType(), VF),
9983 InterleaveFactor,
9984 cast<LoadInst>(Slice.front())->getAlign(),
9985 cast<LoadInst>(Slice.front())
9987 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9988 SPtrInfo) == LoadsState::Vectorize) {
9989 UserMaxVF = InterleaveFactor * VF;
9990 } else {
9991 InterleaveFactor = 0;
9992 }
9993 }
9994 // Cannot represent the loads as consecutive vectorizable nodes -
9995 // just exit.
9996 unsigned ConsecutiveNodesSize = 0;
9997 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9998 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9999 [&, Slice = Slice](const auto &P) {
10000 const auto *It = find_if(Slice, [&](Value *V) {
10001 return std::get<1>(P).contains(V);
10002 });
10003 if (It == Slice.end())
10004 return false;
10005 const TreeEntry &TE =
10006 *VectorizableTree[std::get<0>(P)];
10007 ArrayRef<Value *> VL = TE.Scalars;
10008 OrdersType Order;
10009 SmallVector<Value *> PointerOps;
10010 StridedPtrInfo SPtrInfo;
10012 VL, VL.front(), Order, PointerOps, SPtrInfo);
10013 if (State == LoadsState::ScatterVectorize ||
10015 return false;
10016 ConsecutiveNodesSize += VL.size();
10017 size_t Start = std::distance(Slice.begin(), It);
10018 size_t Sz = Slice.size() - Start;
10019 return Sz < VL.size() ||
10020 Slice.slice(Start, VL.size()) != VL;
10021 }))
10022 continue;
10023 // Try to build long masked gather loads.
10024 UserMaxVF = bit_ceil(UserMaxVF);
10025 if (InterleaveFactor == 0 &&
10026 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
10027 [&, Slice = Slice](unsigned Idx) {
10028 OrdersType Order;
10029 SmallVector<Value *> PointerOps;
10030 StridedPtrInfo SPtrInfo;
10031 return canVectorizeLoads(
10032 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10033 Slice[Idx * UserMaxVF], Order, PointerOps,
10034 SPtrInfo) == LoadsState::ScatterVectorize;
10035 }))
10036 UserMaxVF = MaxVF;
10037 if (Slice.size() != ConsecutiveNodesSize)
10038 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10039 }
10040 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10041 bool IsVectorized = true;
10042 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10043 ArrayRef<Value *> SubSlice =
10044 Slice.slice(I, std::min(VF, E - I));
10045 if (isVectorized(SubSlice.front()))
10046 continue;
10047 // Check if the subslice is a to-be-vectorized entry that is not
10048 // equal to this entry.
10049 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10050 [&](const auto &P) {
10051 return !SubSlice.equals(
10052 VectorizableTree[std::get<0>(P)]
10053 ->Scalars) &&
10054 set_is_subset(SubSlice, std::get<1>(P));
10055 }))
10056 continue;
10057 unsigned Sz = VectorizableTree.size();
10058 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10059 if (Sz == VectorizableTree.size()) {
10060 IsVectorized = false;
10061 // Try non-interleaved vectorization with smaller vector
10062 // factor.
10063 if (InterleaveFactor > 0) {
10064 VF = 2 * (MaxVF / InterleaveFactor);
10065 InterleaveFactor = 0;
10066 }
10067 continue;
10068 }
10069 }
10070 if (IsVectorized)
10071 break;
10072 }
10073 }
10074 NonVectorized.append(SortedNonVectorized);
10075 }
10076 return NonVectorized;
10077 };
10078 for (const auto &GLs : GatheredLoads) {
10079 const auto &Ref = GLs.second;
10080 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10081 if (!Ref.empty() && !NonVectorized.empty() &&
10082 std::accumulate(
10083 Ref.begin(), Ref.end(), 0u,
10084 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10085 -> unsigned { return S + LoadsDists.size(); }) !=
10086 NonVectorized.size() &&
10087 IsMaskedGatherSupported(NonVectorized)) {
10089 FinalGatheredLoads;
10090 for (LoadInst *LI : NonVectorized) {
10091 // Reinsert non-vectorized loads into another list of loads with the
10092 // same base pointers.
10093 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
10094 FinalGatheredLoads,
10095 /*AddNew=*/false);
10096 }
10097 // Final attempt to vectorize non-vectorized loads.
10098 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10099 }
10100 }
10101 // Try to vectorize postponed load entries, previously marked as gathered.
10102 for (unsigned Idx : LoadEntriesToVectorize) {
10103 const TreeEntry &E = *VectorizableTree[Idx];
10104 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10105 // Avoid reordering, if possible.
10106 if (!E.ReorderIndices.empty()) {
10107 // Build a mask out of the reorder indices and reorder scalars per this
10108 // mask.
10109 SmallVector<int> ReorderMask;
10110 inversePermutation(E.ReorderIndices, ReorderMask);
10111 reorderScalars(GatheredScalars, ReorderMask);
10112 }
10113 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10114 }
10115 // If no new entries were created, consider that there are no gathered-load
10116 // entries to handle.
10117 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10118 VectorizableTree.size())
10119 GatheredLoadsEntriesFirst.reset();
10120}
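// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// ProcessGatheredLoads (the lambda above) sorts the (load, distance) pairs by
// decreasing distance and measures the longest run of consecutive distances;
// that run length bounds the vector factor tried for the group. A minimal,
// simplified model of that measurement (the real code also handles repeated
// distances and the masked-gather fallback):
#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

static unsigned longestConsecutiveRun(std::vector<int64_t> Dists) {
  if (Dists.empty())
    return 0;
  std::sort(Dists.begin(), Dists.end(), std::greater<int64_t>());
  unsigned Max = 1, Cur = 1;
  for (unsigned I = 1; I < Dists.size(); ++I) {
    if (Dists[I - 1] - Dists[I] == 1)
      Max = std::max(Max, ++Cur); // still consecutive, e.g. 7, 6, 5, ...
    else
      Cur = 1;                    // gap: restart the run
  }
  return Max;
}
// Example: distances {8, 3, 2, 1, 7} sort to {8, 7, 3, 2, 1}; the longest
// consecutive run is {3, 2, 1}, so the result is 3.
// ----------------------------------------------------------------------------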
10121
10122/// Generates a key/subkey pair for the given value to provide effective sorting
10123/// of the values and better detection of vectorizable value sequences. The
10124/// keys/subkeys can be used for better sorting of the values themselves (keys)
10125/// and within value subgroups (subkeys).
10126static std::pair<size_t, size_t> generateKeySubkey(
10127 Value *V, const TargetLibraryInfo *TLI,
10128 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10129 bool AllowAlternate) {
10130 hash_code Key = hash_value(V->getValueID() + 2);
10131 hash_code SubKey = hash_value(0);
10132 // Sort the loads by the distance between the pointers.
10133 if (auto *LI = dyn_cast<LoadInst>(V)) {
10134 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
10135 if (LI->isSimple())
10136 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
10137 else
10138 Key = SubKey = hash_value(LI);
10139 } else if (isVectorLikeInstWithConstOps(V)) {
10140 // Sort extracts by the vector operands.
10142 Key = hash_value(Value::UndefValueVal + 1);
10143 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
10144 if (!isUndefVector(EI->getVectorOperand()).all() &&
10145 !isa<UndefValue>(EI->getIndexOperand()))
10146 SubKey = hash_value(EI->getVectorOperand());
10147 }
10148 } else if (auto *I = dyn_cast<Instruction>(V)) {
10149 // Sort other instructions just by the opcodes except for CMPInst.
10150 // For CMP also sort by the predicate kind.
10152 isValidForAlternation(I->getOpcode())) {
10153 if (AllowAlternate)
10154 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
10155 else
10156 Key = hash_combine(hash_value(I->getOpcode()), Key);
10157 SubKey = hash_combine(
10158 hash_value(I->getOpcode()), hash_value(I->getType()),
10160 ? I->getType()
10161 : cast<CastInst>(I)->getOperand(0)->getType()));
10162 // For casts, look through the only operand to improve compile time.
10163 if (isa<CastInst>(I)) {
10164 std::pair<size_t, size_t> OpVals =
10165 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
10166 /*AllowAlternate=*/true);
10167 Key = hash_combine(OpVals.first, Key);
10168 SubKey = hash_combine(OpVals.first, SubKey);
10169 }
10170 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
10171 CmpInst::Predicate Pred = CI->getPredicate();
10172 if (CI->isCommutative())
10173 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
10175 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
10176 hash_value(SwapPred),
10177 hash_value(CI->getOperand(0)->getType()));
10178 } else if (auto *Call = dyn_cast<CallInst>(I)) {
10181 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
10182 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
10183 SubKey = hash_combine(hash_value(I->getOpcode()),
10184 hash_value(Call->getCalledFunction()));
10185 } else {
10187 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
10188 }
10189 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10190 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
10191 hash_value(Op.Tag), SubKey);
10192 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
10193 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
10194 SubKey = hash_value(Gep->getPointerOperand());
10195 else
10196 SubKey = hash_value(Gep);
10197 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
10198 !isa<ConstantInt>(I->getOperand(1))) {
10199 // Do not try to vectorize instructions with potentially high cost.
10200 SubKey = hash_value(I);
10201 } else {
10202 SubKey = hash_value(I->getOpcode());
10203 }
10204 Key = hash_combine(hash_value(I->getParent()), Key);
10205 }
10206 return std::make_pair(Key, SubKey);
10207}
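// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// generateKeySubkey() above produces a coarse key (value kind / opcode /
// parent) and a finer subkey (e.g. type, predicate, pointer base) so that
// candidate values can be bucketed twice: first by key, then by subkey inside
// each key bucket. The grouping pattern itself, over plain string tokens
// (names and types below are hypothetical):
#include <cstddef>
#include <map>
#include <string>
#include <vector>

struct ValueToken {
  std::string Kind;   // coarse classification, e.g. "load" or "add"
  std::string Detail; // finer classification, e.g. operand type or base pointer
};

static std::map<std::size_t, std::map<std::size_t, std::vector<ValueToken>>>
groupByKeySubkey(const std::vector<ValueToken> &Values) {
  std::hash<std::string> H;
  std::map<std::size_t, std::map<std::size_t, std::vector<ValueToken>>> Buckets;
  for (const ValueToken &T : Values) {
    std::size_t Key = H(T.Kind);      // key: how the values are sorted
    std::size_t SubKey = H(T.Detail); // subkey: grouping inside a key bucket
    Buckets[Key][SubKey].push_back(T);
  }
  return Buckets;
}
// ----------------------------------------------------------------------------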
10208
10209/// Checks if the specified instruction \p I is a main operation for the given
10210/// \p MainOp and \p AltOp instructions.
10211static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10212 Instruction *AltOp, const TargetLibraryInfo &TLI);
10213
10214bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
10215 ArrayRef<Value *> VL) const {
10216 Type *ScalarTy = S.getMainOp()->getType();
10217 unsigned Opcode0 = S.getOpcode();
10218 unsigned Opcode1 = S.getAltOpcode();
10219 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10220 // If this pattern is supported by the target then consider it profitable.
10221 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
10222 Opcode1, OpcodeMask))
10223 return true;
10224 SmallVector<ValueList> Operands;
10225 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
10226 Operands.emplace_back();
10227 // Prepare the operand vector.
10228 for (Value *V : VL) {
10229 if (isa<PoisonValue>(V)) {
10230 Operands.back().push_back(
10231 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
10232 continue;
10233 }
10234 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
10235 }
10236 }
10237 if (Operands.size() == 2) {
10238 // Try to find the best operand candidates.
10239 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
10241 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
10242 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
10243 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
10244 std::optional<int> Res = findBestRootPair(Candidates);
10245 switch (Res.value_or(0)) {
10246 case 0:
10247 break;
10248 case 1:
10249 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
10250 break;
10251 case 2:
10252 std::swap(Operands[0][I], Operands[1][I]);
10253 break;
10254 default:
10255 llvm_unreachable("Unexpected index.");
10256 }
10257 }
10258 }
10259 DenseSet<unsigned> UniqueOpcodes;
10260 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
10261 unsigned NonInstCnt = 0;
10262 // Estimate the number of instructions required for the vectorized node and
10263 // for the buildvector node.
10264 unsigned UndefCnt = 0;
10265 // Count the number of extra shuffles required for vector nodes.
10266 unsigned ExtraShuffleInsts = 0;
10267 // Check that the operands do not contain the same values and create either
10268 // a perfect diamond match or a shuffled match.
10269 if (Operands.size() == 2) {
10270 // Do not count same operands twice.
10271 if (Operands.front() == Operands.back()) {
10272 Operands.erase(Operands.begin());
10273 } else if (!allConstant(Operands.front()) &&
10274 all_of(Operands.front(), [&](Value *V) {
10275 return is_contained(Operands.back(), V);
10276 })) {
10277 Operands.erase(Operands.begin());
10278 ++ExtraShuffleInsts;
10279 }
10280 }
10281 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
10282 // Vectorize the node if:
10283 // 1. At least a single operand is constant or splat.
10284 // 2. Operands have many loop invariants (the instructions are not loop
10285 // invariants).
10286 // 3. At least a single unique operand is supposed to be vectorized.
10287 return none_of(Operands,
10288 [&](ArrayRef<Value *> Op) {
10289 if (allConstant(Op) ||
10290 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
10291 getSameOpcode(Op, *TLI)))
10292 return false;
10293 DenseMap<Value *, unsigned> Uniques;
10294 for (Value *V : Op) {
10296 isVectorized(V) || (L && L->isLoopInvariant(V))) {
10297 if (isa<UndefValue>(V))
10298 ++UndefCnt;
10299 continue;
10300 }
10301 auto Res = Uniques.try_emplace(V, 0);
10302 // Found first duplicate - need to add shuffle.
10303 if (!Res.second && Res.first->second == 1)
10304 ++ExtraShuffleInsts;
10305 ++Res.first->getSecond();
10306 if (auto *I = dyn_cast<Instruction>(V))
10307 UniqueOpcodes.insert(I->getOpcode());
10308 else if (Res.second)
10309 ++NonInstCnt;
10310 }
10311 return none_of(Uniques, [&](const auto &P) {
10312 return P.first->hasNUsesOrMore(P.second + 1) &&
10313 none_of(P.first->users(), [&](User *U) {
10314 return isVectorized(U) || Uniques.contains(U);
10315 });
10316 });
10317 }) ||
10318 // Do not vectorize the node if the estimated number of vector instructions
10319 // is greater than the estimated number of buildvector instructions. The
10320 // number of vector operands is the number of vector instructions + the
10321 // number of vector instructions for the operands (buildvectors). The number
10322 // of buildvector instructions is just number_of_operands * number_of_scalars.
10323 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10324 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10325 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10326}
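// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// The final check in areAltOperandsProfitable() compares an estimate of the
// vectorized form (unique operand opcodes + non-instruction operands + extra
// shuffles + main/alt/shuffle instructions) against the buildvector estimate
// (#operands * #scalars). A small numeric model of that comparison, with all
// counts supplied by the caller and the UndefCnt guard omitted for brevity
// (the helper name is hypothetical):

// Returns true if the alternate-opcode node still looks profitable, i.e. the
// vector-side estimate stays below the scalar buildvector estimate.
static bool altNodeLooksProfitable(unsigned UniqueOpcodes, unsigned NonInstCnt,
                                   unsigned ExtraShuffleInsts,
                                   unsigned NumOperands, unsigned NumScalars) {
  const unsigned NumAltInsts = 3; // main + alt + shuffle
  unsigned VectorSideEstimate =
      UniqueOpcodes + NonInstCnt + ExtraShuffleInsts + NumAltInsts;
  unsigned BuildVectorEstimate = NumOperands * NumScalars;
  return VectorSideEstimate < BuildVectorEstimate;
}
// Example: 2 unique opcodes, 0 non-instruction operands, 1 extra shuffle,
// 2 operands and 4 scalars give 2 + 0 + 1 + 3 = 6 < 8, so still profitable.
// ----------------------------------------------------------------------------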
10327
10328/// Builds the argument types vector for the given call instruction with the
10329/// given \p ID for the specified vector factor.
10332 const unsigned VF, unsigned MinBW,
10333 const TargetTransformInfo *TTI) {
10334 SmallVector<Type *> ArgTys;
10335 for (auto [Idx, Arg] : enumerate(CI->args())) {
10338 ArgTys.push_back(Arg->getType());
10339 continue;
10340 }
10341 if (MinBW > 0) {
10342 ArgTys.push_back(
10343 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10344 continue;
10345 }
10346 }
10347 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10348 }
10349 return ArgTys;
10350}
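// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// buildIntrinsicArgTypes() above keeps the arguments that the intrinsic
// requires to stay scalar and widens everything else to a <VF x Ty> vector
// type (using a narrower integer type when MinBW is known; that part is
// omitted here). A stand-alone model using strings for types and a
// caller-provided "must stay scalar" predicate (both hypothetical):
#include <functional>
#include <string>
#include <vector>

static std::vector<std::string>
widenArgTypes(const std::vector<std::string> &ScalarArgTys, unsigned VF,
              const std::function<bool(unsigned)> &MustStayScalar) {
  std::vector<std::string> Out;
  for (unsigned Idx = 0; Idx != ScalarArgTys.size(); ++Idx) {
    if (MustStayScalar(Idx)) {
      Out.push_back(ScalarArgTys[Idx]); // e.g. an immediate/scalar operand
      continue;
    }
    Out.push_back("<" + std::to_string(VF) + " x " + ScalarArgTys[Idx] + ">");
  }
  return Out;
}
// Example: {"float", "i32"} with VF = 4 and a predicate keeping argument 1
// scalar produces {"<4 x float>", "i32"}.
// ----------------------------------------------------------------------------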
10351
10352/// Calculates the costs of a vectorized intrinsic call (if possible) and a
10353/// vectorized function call (if possible). Returns an invalid cost for the
10354/// corresponding calls if they cannot be vectorized / will be scalarized.
10355static std::pair<InstructionCost, InstructionCost>
10358 ArrayRef<Type *> ArgTys) {
10359 auto Shape = VFShape::get(CI->getFunctionType(),
10361 false /*HasGlobalPred*/);
10362 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10363 auto LibCost = InstructionCost::getInvalid();
10364 if (!CI->isNoBuiltin() && VecFunc) {
10365 // Calculate the cost of the vector library call.
10366 // If the corresponding vector call is cheaper, return its cost.
10367 LibCost =
10368 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10369 }
10371
10372 // Calculate the cost of the vector intrinsic call.
10373 FastMathFlags FMF;
10374 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10375 FMF = FPCI->getFastMathFlags();
10376 const InstructionCost ScalarLimit = 10000;
10377 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10378 LibCost.isValid() ? LibCost : ScalarLimit);
10379 auto IntrinsicCost =
10380 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10381 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10382 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10384
10385 return {IntrinsicCost, LibCost};
10386}
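// --- Illustrative sketch (annotation, not part of SLPVectorizer.cpp) --------
// getVectorCallCosts() above returns a pair (intrinsic cost, library-call
// cost) and marks the intrinsic cost invalid when it exceeds the library cost
// (or a large scalarization limit when no library call exists). The decision
// logic in isolation, using -1 as the "invalid" marker (helper name and the
// integer encoding are hypothetical):
#include <cstdint>

struct VectorCallCostsModel {
  int64_t Intrinsic; // -1 means "invalid / do not use the intrinsic"
  int64_t Library;   // -1 means no vectorized library function is available
};

static VectorCallCostsModel decideVectorCallCosts(int64_t IntrinsicCost,
                                                  int64_t LibCost) {
  const int64_t ScalarLimit = 10000; // mirrors the cap used above
  bool LibValid = LibCost >= 0;
  if ((LibValid && IntrinsicCost > LibCost) ||
      (!LibValid && IntrinsicCost > ScalarLimit))
    IntrinsicCost = -1; // too expensive: fall back to the lib call or scalars
  return {IntrinsicCost, LibCost};
}
// ----------------------------------------------------------------------------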
10387
10388BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10389 const InstructionsState &S, ArrayRef<Value *> VL,
10390 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10391 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10392 assert(S.getMainOp() &&
10393 "Expected instructions with same/alternate opcodes only.");
10394
10395 unsigned ShuffleOrOp =
10396 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10397 Instruction *VL0 = S.getMainOp();
10398 switch (ShuffleOrOp) {
10399 case Instruction::PHI: {
10400 // Too many operands - gather, most probably won't be vectorized.
10401 if (VL0->getNumOperands() > MaxPHINumOperands)
10402 return TreeEntry::NeedToGather;
10403 // Check for terminator values (e.g. invoke).
10404 for (Value *V : VL) {
10405 auto *PHI = dyn_cast<PHINode>(V);
10406 if (!PHI)
10407 continue;
10408 for (Value *Incoming : PHI->incoming_values()) {
9409 Instruction *Term = dyn_cast<Instruction>(Incoming);
10410 if (Term && Term->isTerminator()) {
10412 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10413 return TreeEntry::NeedToGather;
10414 }
10415 }
10416 }
10417
10418 return TreeEntry::Vectorize;
10419 }
10420 case Instruction::ExtractElement:
10421 if (any_of(VL, [&](Value *V) {
10422 auto *EI = dyn_cast<ExtractElementInst>(V);
10423 if (!EI)
10424 return true;
10425 return isVectorized(EI->getOperand(0));
10426 }))
10427 return TreeEntry::NeedToGather;
10428 [[fallthrough]];
10429 case Instruction::ExtractValue: {
10430 bool Reuse = canReuseExtract(VL, CurrentOrder);
10431 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10432 // non-full registers).
10433 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10434 return TreeEntry::NeedToGather;
10435 if (Reuse || !CurrentOrder.empty())
10436 return TreeEntry::Vectorize;
10437 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10438 return TreeEntry::NeedToGather;
10439 }
10440 case Instruction::InsertElement: {
10441 // Check that we have a buildvector and not a shuffle of 2 or more
10442 // different vectors.
10443 ValueSet SourceVectors;
10444 for (Value *V : VL) {
10445 if (isa<PoisonValue>(V)) {
10446 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10447 return TreeEntry::NeedToGather;
10448 }
10449 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10450 assert(getElementIndex(V) != std::nullopt &&
10451 "Non-constant or undef index?");
10452 }
10453
10454 if (count_if(VL, [&SourceVectors](Value *V) {
10455 return !SourceVectors.contains(V);
10456 }) >= 2) {
10457 // Found 2nd source vector - cancel.
10458 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10459 "different source vectors.\n");
10460 return TreeEntry::NeedToGather;
10461 }
10462
10463 if (any_of(VL, [&SourceVectors](Value *V) {
10464 // The last InsertElement can have multiple uses.
10465 return SourceVectors.contains(V) && !V->hasOneUse();
10466 })) {
10467 assert(SLPReVec && "Only supported by REVEC.");
10468 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10469 "multiple uses.\n");
10470 return TreeEntry::NeedToGather;
10471 }
10472
10473 return TreeEntry::Vectorize;
10474 }
10475 case Instruction::Load: {
10476 // Check that a vectorized load would load the same memory as a scalar
10477 // load. For example, we don't want to vectorize loads that are smaller
10478 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10479 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10480 // from such a struct, we read/write packed bits disagreeing with the
10481 // unvectorized version.
10482 auto IsGatheredNode = [&]() {
10483 if (!GatheredLoadsEntriesFirst)
10484 return false;
10485 return all_of(VL, [&](Value *V) {
10486 if (isa<PoisonValue>(V))
10487 return true;
10488 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10489 return TE->Idx >= *GatheredLoadsEntriesFirst;
10490 });
10491 });
10492 };
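// Map the canVectorizeLoads() result onto a tree entry state. Loads that would
// need compressed, masked-gather or strided lowering are postponed (recorded in
// LoadEntriesToVectorize and gathered for now) once the tree already has other
// entries, so they can be retried later with more context; loads that only
// appear in gathered-load entries are gathered as well.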
10493 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10494 case LoadsState::Vectorize:
10495 return TreeEntry::Vectorize;
10496 case LoadsState::CompressVectorize:
10497 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10498 // Delay slow vectorized nodes for better vectorization attempts.
10499 LoadEntriesToVectorize.insert(VectorizableTree.size());
10500 return TreeEntry::NeedToGather;
10501 }
10502 return IsGatheredNode() ? TreeEntry::NeedToGather
10503 : TreeEntry::CompressVectorize;
10504 case LoadsState::ScatterVectorize:
10505 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10506 // Delay slow vectorized nodes for better vectorization attempts.
10507 LoadEntriesToVectorize.insert(VectorizableTree.size());
10508 return TreeEntry::NeedToGather;
10509 }
10510 return IsGatheredNode() ? TreeEntry::NeedToGather
10511 : TreeEntry::ScatterVectorize;
10512 case LoadsState::StridedVectorize:
10513 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10514 // Delay slow vectorized nodes for better vectorization attempts.
10515 LoadEntriesToVectorize.insert(VectorizableTree.size());
10516 return TreeEntry::NeedToGather;
10517 }
10518 return IsGatheredNode() ? TreeEntry::NeedToGather
10519 : TreeEntry::StridedVectorize;
10520 case LoadsState::Gather:
10521#ifndef NDEBUG
10522 Type *ScalarTy = VL0->getType();
10523 if (DL->getTypeSizeInBits(ScalarTy) !=
10524 DL->getTypeAllocSizeInBits(ScalarTy))
10525 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10526 else if (any_of(VL, [](Value *V) {
10527 auto *LI = dyn_cast<LoadInst>(V);
10528 return !LI || !LI->isSimple();
10529 }))
10530 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10531 else
10532 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10533#endif // NDEBUG
10534 registerNonVectorizableLoads(VL);
10535 return TreeEntry::NeedToGather;
10536 }
10537 llvm_unreachable("Unexpected state of loads");
10538 }
10539 case Instruction::ZExt:
10540 case Instruction::SExt:
10541 case Instruction::FPToUI:
10542 case Instruction::FPToSI:
10543 case Instruction::FPExt:
10544 case Instruction::PtrToInt:
10545 case Instruction::IntToPtr:
10546 case Instruction::SIToFP:
10547 case Instruction::UIToFP:
10548 case Instruction::Trunc:
10549 case Instruction::FPTrunc:
10550 case Instruction::BitCast: {
10551 Type *SrcTy = VL0->getOperand(0)->getType();
10552 for (Value *V : VL) {
10553 if (isa<PoisonValue>(V))
10554 continue;
10555 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10556 if (Ty != SrcTy || !isValidElementType(Ty)) {
10557 LLVM_DEBUG(
10558 dbgs() << "SLP: Gathering casts with different src types.\n");
10559 return TreeEntry::NeedToGather;
10560 }
10561 }
10562 return TreeEntry::Vectorize;
10563 }
10564 case Instruction::ICmp:
10565 case Instruction::FCmp: {
10566 // Check that all of the compares have the same predicate.
10567 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10568 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10569 Type *ComparedTy = VL0->getOperand(0)->getType();
10570 for (Value *V : VL) {
10571 if (isa<PoisonValue>(V))
10572 continue;
10573 auto *Cmp = cast<CmpInst>(V);
10574 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10575 Cmp->getOperand(0)->getType() != ComparedTy) {
10576 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10577 return TreeEntry::NeedToGather;
10578 }
10579 }
10580 return TreeEntry::Vectorize;
10581 }
10582 case Instruction::Select:
10583 case Instruction::FNeg:
10584 case Instruction::Add:
10585 case Instruction::FAdd:
10586 case Instruction::Sub:
10587 case Instruction::FSub:
10588 case Instruction::Mul:
10589 case Instruction::FMul:
10590 case Instruction::UDiv:
10591 case Instruction::SDiv:
10592 case Instruction::FDiv:
10593 case Instruction::URem:
10594 case Instruction::SRem:
10595 case Instruction::FRem:
10596 case Instruction::Shl:
10597 case Instruction::LShr:
10598 case Instruction::AShr:
10599 case Instruction::And:
10600 case Instruction::Or:
10601 case Instruction::Xor:
10602 case Instruction::Freeze:
10603 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10604 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10605 auto *I = dyn_cast<Instruction>(V);
10606 return I && I->isBinaryOp() && !I->isFast();
10607 }))
10608 return TreeEntry::NeedToGather;
10609 return TreeEntry::Vectorize;
10610 case Instruction::GetElementPtr: {
10611 // We don't combine GEPs with complicated (nested) indexing.
10612 for (Value *V : VL) {
10613 auto *I = dyn_cast<GetElementPtrInst>(V);
10614 if (!I)
10615 continue;
10616 if (I->getNumOperands() != 2) {
10617 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10618 return TreeEntry::NeedToGather;
10619 }
10620 }
10621
10622 // We can't combine several GEPs into one vector if they operate on
10623 // different types.
10624 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10625 for (Value *V : VL) {
10626 auto *GEP = dyn_cast<GEPOperator>(V);
10627 if (!GEP)
10628 continue;
10629 Type *CurTy = GEP->getSourceElementType();
10630 if (Ty0 != CurTy) {
10631 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10632 return TreeEntry::NeedToGather;
10633 }
10634 }
10635
10636 // We don't combine GEPs with non-constant indexes.
10637 Type *Ty1 = VL0->getOperand(1)->getType();
10638 for (Value *V : VL) {
10639 auto *I = dyn_cast<GetElementPtrInst>(V);
10640 if (!I)
10641 continue;
10642 auto *Op = I->getOperand(1);
10643 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10644 (Op->getType() != Ty1 &&
10645 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10646 Op->getType()->getScalarSizeInBits() >
10647 DL->getIndexSizeInBits(
10648 V->getType()->getPointerAddressSpace())))) {
10649 LLVM_DEBUG(
10650 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10651 return TreeEntry::NeedToGather;
10652 }
10653 }
10654
10655 return TreeEntry::Vectorize;
10656 }
10657 case Instruction::Store: {
10658 // Check if the stores are consecutive or if we need to swizzle them.
10659 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10660 // Avoid types that are padded when being allocated as scalars, while
10661 // being packed together in a vector (such as i1).
10662 if (DL->getTypeSizeInBits(ScalarTy) !=
10663 DL->getTypeAllocSizeInBits(ScalarTy)) {
10664 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10665 return TreeEntry::NeedToGather;
10666 }
10667 // Make sure all stores in the bundle are simple - we can't vectorize
10668 // atomic or volatile stores.
10669 for (Value *V : VL) {
10670 auto *SI = cast<StoreInst>(V);
10671 if (!SI->isSimple()) {
10672 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10673 return TreeEntry::NeedToGather;
10674 }
10675 PointerOps.push_back(SI->getPointerOperand());
10676 }
10677
10678 // Check the order of pointer operands.
10679 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10680 Value *Ptr0;
10681 Value *PtrN;
10682 if (CurrentOrder.empty()) {
10683 Ptr0 = PointerOps.front();
10684 PtrN = PointerOps.back();
10685 } else {
10686 Ptr0 = PointerOps[CurrentOrder.front()];
10687 PtrN = PointerOps[CurrentOrder.back()];
10688 }
10689 std::optional<int64_t> Dist =
10690 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10691 // Check that the sorted pointer operands are consecutive.
10692 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10693 return TreeEntry::Vectorize;
10694 }
10695
10696 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10697 return TreeEntry::NeedToGather;
10698 }
10699 case Instruction::Call: {
10700 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10701 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10702 auto *I = dyn_cast<Instruction>(V);
10703 return I && !I->isFast();
10704 }))
10705 return TreeEntry::NeedToGather;
10706 // Check if the calls are all to the same vectorizable intrinsic or
10707 // library function.
10708 CallInst *CI = cast<CallInst>(VL0);
10709 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10710
10711 VFShape Shape = VFShape::get(
10712 CI->getFunctionType(),
10713 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10714 false /*HasGlobalPred*/);
10715 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10716
10717 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10718 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10719 return TreeEntry::NeedToGather;
10720 }
10721 Function *F = CI->getCalledFunction();
10722 unsigned NumArgs = CI->arg_size();
10723 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10724 for (unsigned J = 0; J != NumArgs; ++J)
10725 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10726 ScalarArgs[J] = CI->getArgOperand(J);
10727 for (Value *V : VL) {
10728 CallInst *CI2 = dyn_cast<CallInst>(V);
10729 if (!CI2 || CI2->getCalledFunction() != F ||
10730 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10731 (VecFunc &&
10732 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10733 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10734 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10735 << "\n");
10736 return TreeEntry::NeedToGather;
10737 }
10738 // Some intrinsics have scalar arguments and should be same in order for
10739 // them to be vectorized.
10740 for (unsigned J = 0; J != NumArgs; ++J) {
10741 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10742 Value *A1J = CI2->getArgOperand(J);
10743 if (ScalarArgs[J] != A1J) {
10744 LLVM_DEBUG(dbgs()
10745 << "SLP: mismatched arguments in call:" << *CI
10746 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10747 return TreeEntry::NeedToGather;
10748 }
10749 }
10750 }
10751 // Verify that the bundle operands are identical between the two calls.
10752 if (CI->hasOperandBundles() &&
10753 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10754 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10755 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10756 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10757 << "!=" << *V << '\n');
10758 return TreeEntry::NeedToGather;
10759 }
10760 }
10761 SmallVector<Type *> ArgTys =
10762 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10763 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10764 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10765 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10766 return TreeEntry::NeedToGather;
10767
10768 return TreeEntry::Vectorize;
10769 }
10770 case Instruction::ShuffleVector: {
10771 if (!S.isAltShuffle()) {
10772 // REVEC can support non alternate shuffle.
10773 if (SLPReVec && getShufflevectorNumGroups(VL))
10774 return TreeEntry::Vectorize;
10775 // If this is not an alternate sequence of opcode like add-sub
10776 // then do not vectorize this instruction.
10777 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10778 return TreeEntry::NeedToGather;
10779 }
10780 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10781 LLVM_DEBUG(
10782 dbgs()
10783 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10784 "the whole alt sequence is not profitable.\n");
10785 return TreeEntry::NeedToGather;
10786 }
10787
10788 return TreeEntry::Vectorize;
10789 }
10790 default:
10791 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10792 return TreeEntry::NeedToGather;
10793 }
10794}
10795
10796namespace {
10797 /// Allows correct handling of the operands of PHI nodes, based on the \p Main
10798 /// PHINode's order of incoming basic blocks/values.
10799class PHIHandler {
10800 DominatorTree &DT;
10801 PHINode *Main = nullptr;
10802 SmallVector<Value *> Phis;
10803 SmallVector<SmallVector<Value *>> Operands;
10804
10805public:
10806 PHIHandler() = delete;
10807 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10808 : DT(DT), Main(Main), Phis(Phis),
10809 Operands(Main->getNumIncomingValues(),
10810 SmallVector<Value *>(Phis.size(), nullptr)) {}
10811 void buildOperands() {
10812 constexpr unsigned FastLimit = 4;
10813 if (Main->getNumIncomingValues() <= FastLimit) {
10814 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10815 BasicBlock *InBB = Main->getIncomingBlock(I);
10816 if (!DT.isReachableFromEntry(InBB)) {
10817 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10818 continue;
10819 }
10820 // Prepare the operand vector.
10821 for (auto [Idx, V] : enumerate(Phis)) {
10822 auto *P = dyn_cast<PHINode>(V);
10823 if (!P) {
10825 "Expected isa instruction or poison value.");
10826 Operands[I][Idx] = V;
10827 continue;
10828 }
10829 if (P->getIncomingBlock(I) == InBB)
10830 Operands[I][Idx] = P->getIncomingValue(I);
10831 else
10832 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10833 }
10834 }
10835 return;
10836 }
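// Slow path for PHIs with more than FastLimit incoming values: bucket the
// incoming blocks of the main PHI first, then fill each PHI's operands by
// matching incoming blocks (their order may differ from the main PHI), and
// finally replicate operand lists across duplicated incoming blocks.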
10837 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10838 Blocks;
10839 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10840 BasicBlock *InBB = Main->getIncomingBlock(I);
10841 if (!DT.isReachableFromEntry(InBB)) {
10842 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10843 continue;
10844 }
10845 Blocks.try_emplace(InBB).first->second.push_back(I);
10846 }
10847 for (auto [Idx, V] : enumerate(Phis)) {
10848 if (isa<PoisonValue>(V)) {
10849 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10850 Operands[I][Idx] = V;
10851 continue;
10852 }
10853 auto *P = cast<PHINode>(V);
10854 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10855 BasicBlock *InBB = P->getIncomingBlock(I);
10856 if (InBB == Main->getIncomingBlock(I)) {
10857 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10858 continue;
10859 Operands[I][Idx] = P->getIncomingValue(I);
10860 continue;
10861 }
10862 auto *It = Blocks.find(InBB);
10863 if (It == Blocks.end())
10864 continue;
10865 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10866 }
10867 }
10868 for (const auto &P : Blocks) {
10869 ArrayRef<unsigned> IncomingValues = P.second;
10870 if (IncomingValues.size() <= 1)
10871 continue;
10872 unsigned BasicI = IncomingValues.consume_front();
10873 for (unsigned I : IncomingValues) {
10874 assert(all_of(enumerate(Operands[I]),
10875 [&](const auto &Data) {
10876 return !Data.value() ||
10877 Data.value() == Operands[BasicI][Data.index()];
10878 }) &&
10879 "Expected empty operands list.");
10880 Operands[I] = Operands[BasicI];
10881 }
10882 }
10883 }
10884 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10885};
10886} // namespace
10887
10888/// Returns main/alternate instructions for the given \p VL. Unlike
10889 /// getSameOpcode, it supports non-compatible instructions for better
10890 /// SplitVectorize node support.
10891 /// \returns the first main/alt instructions if the list contains only poisons
10892 /// and instructions with just 2 opcodes; returns a pair of nullptrs otherwise.
10893static std::pair<Instruction *, Instruction *>
10894 getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10895 Instruction *MainOp = nullptr;
10896 Instruction *AltOp = nullptr;
10897 for (Value *V : VL) {
10898 if (isa<PoisonValue>(V))
10899 continue;
10900 auto *I = dyn_cast<Instruction>(V);
10901 if (!I)
10902 return {};
10903 if (!MainOp) {
10904 MainOp = I;
10905 continue;
10906 }
10907 if (MainOp->getOpcode() == I->getOpcode()) {
10908 if (I->getParent() != MainOp->getParent())
10909 return {};
10910 continue;
10911 }
10912 if (!AltOp) {
10913 AltOp = I;
10914 continue;
10915 }
10916 if (AltOp->getOpcode() == I->getOpcode()) {
10917 if (I->getParent() != AltOp->getParent())
10918 return {};
10919 continue;
10920 }
10921 return {};
10922 }
10923 if (!AltOp)
10924 return {};
10925 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10926 "Expected different main and alt instructions.");
10927 return std::make_pair(MainOp, AltOp);
10928}
10929
10930 /// Checks that every instruction appears only once in the list; if not, packs
10931 /// the duplicates, building the \p ReuseShuffleIndices mask and mutating \p VL.
10932 /// The list of unique scalars is extended by poison values to the whole register size.
10933///
10934/// \returns false if \p VL could not be uniquified, in which case \p VL is
10935/// unchanged and \p ReuseShuffleIndices is empty.
10936 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10937 SmallVectorImpl<int> &ReuseShuffleIndices,
10938 const TargetTransformInfo &TTI,
10939 const TargetLibraryInfo &TLI,
10940 const InstructionsState &S,
10941 const BoUpSLP::EdgeInfo &UserTreeIdx,
10942 bool TryPad = false) {
10943 // Check that every instruction appears once in this bundle.
10944 SmallVector<Value *> UniqueValues;
10945 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10946 for (Value *V : VL) {
10947 if (isConstant(V)) {
10948 // Constants are always considered distinct, even if the same constant
10949 // appears multiple times in VL.
10950 ReuseShuffleIndices.emplace_back(
10951 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10952 UniqueValues.emplace_back(V);
10953 continue;
10954 }
10955 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10956 ReuseShuffleIndices.emplace_back(Res.first->second);
10957 if (Res.second)
10958 UniqueValues.emplace_back(V);
10959 }
10960
10961 // Easy case: VL has unique values and a "natural" size
10962 size_t NumUniqueScalarValues = UniqueValues.size();
10963 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10964 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10965 if (NumUniqueScalarValues == VL.size() &&
10966 (VectorizeNonPowerOf2 || IsFullVectors)) {
10967 ReuseShuffleIndices.clear();
10968 return true;
10969 }
10970
10971 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10972 if ((UserTreeIdx.UserTE &&
10973 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10974 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10975 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10976 "for nodes with padding.\n");
10977 ReuseShuffleIndices.clear();
10978 return false;
10979 }
10980
10981 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10982 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10983 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10984 return isa<UndefValue>(V) || !isConstant(V);
10985 }))) {
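// Too few unique scalars, or they do not fill a whole register: either pad the
// unique values with poison up to a full register (when the caller allows
// padding and the opcode tolerates poison lanes) or give up and gather.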
10986 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10987 S.getMainOp()->isSafeToRemove() &&
10988 (S.areInstructionsWithCopyableElements() ||
10989 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10990 // Find the number of elements, which forms full vectors.
10991 unsigned PWSz = getFullVectorNumberOfElements(
10992 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10993 PWSz = std::min<unsigned>(PWSz, VL.size());
10994 if (PWSz == VL.size()) {
10995 // We ended up with the same size after removing duplicates and
10996 // upgrading the resulting vector size to a "nice size". Just keep
10997 // the initial VL then.
10998 ReuseShuffleIndices.clear();
10999 } else {
11000 // Pad unique values with poison to grow the vector to a "nice" size
11001 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
11002 UniqueValues.end());
11003 PaddedUniqueValues.append(
11004 PWSz - UniqueValues.size(),
11005 PoisonValue::get(UniqueValues.front()->getType()));
11006 // Check that extended with poisons/copyable operations are still valid
11007 // for vectorization (div/rem are not allowed).
11008 if ((!S.areInstructionsWithCopyableElements() &&
11009 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
11010 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
11011 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
11012 isa<CallInst>(S.getMainOp())))) {
11013 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11014 ReuseShuffleIndices.clear();
11015 return false;
11016 }
11017 VL = std::move(PaddedUniqueValues);
11018 }
11019 return true;
11020 }
11021 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11022 ReuseShuffleIndices.clear();
11023 return false;
11024 }
11025 VL = std::move(UniqueValues);
11026 return true;
11027}
11028
11029bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
11030 const InstructionsState &LocalState,
11031 SmallVectorImpl<Value *> &Op1,
11032 SmallVectorImpl<Value *> &Op2,
11033 OrdersType &ReorderIndices) const {
11034 constexpr unsigned SmallNodeSize = 4;
11035 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11036 !SplitAlternateInstructions)
11037 return false;
11038
11039 // Check if this is a duplicate of another split entry.
11040 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
11041 << ".\n");
11042 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11043 if (E->isSame(VL)) {
11044 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11045 << *LocalState.getMainOp() << ".\n");
11046 return false;
11047 }
11048 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11049 if (all_of(VL, [&](Value *V) {
11050 return isa<PoisonValue>(V) || Values.contains(V);
11051 })) {
11052 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11053 return false;
11054 }
11055 }
11056
11057 ReorderIndices.assign(VL.size(), VL.size());
11058 SmallBitVector Op1Indices(VL.size());
11059 for (auto [Idx, V] : enumerate(VL)) {
11060 auto *I = dyn_cast<Instruction>(V);
11061 if (!I) {
11062 Op1.push_back(V);
11063 Op1Indices.set(Idx);
11064 continue;
11065 }
11066 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11067 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
11068 *TLI)) ||
11069 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11070 !isAlternateInstruction(I, LocalState.getMainOp(),
11071 LocalState.getAltOp(), *TLI))) {
11072 Op1.push_back(V);
11073 Op1Indices.set(Idx);
11074 continue;
11075 }
11076 Op2.push_back(V);
11077 }
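// Op1 now holds the scalars matching the main opcode (plus non-instructions)
// and Op2 the alternate-opcode scalars, e.g. a bundle {add, sub, add, sub}
// splits into Op1 = {add, add} and Op2 = {sub, sub}. ReorderIndices, built
// below, maps the concatenation Op1 + Op2 back to the original lane order.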
11078 Type *ScalarTy = getValueType(VL.front());
11079 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
11080 unsigned Opcode0 = LocalState.getOpcode();
11081 unsigned Opcode1 = LocalState.getAltOpcode();
11082 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11083 // Enable split node, only if all nodes do not form legal alternate
11084 // instruction (like X86 addsub).
11085 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
11086 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
11087 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11088 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11089 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
11090 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
11091 return false;
11092 // Enable split node, only if all nodes are power-of-2/full registers.
11093 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11094 for (unsigned Idx : seq<unsigned>(VL.size())) {
11095 if (Op1Indices.test(Idx)) {
11096 ReorderIndices[Op1Cnt] = Idx;
11097 ++Op1Cnt;
11098 } else {
11099 ReorderIndices[Op2Cnt] = Idx;
11100 ++Op2Cnt;
11101 }
11102 }
11103 if (isIdentityOrder(ReorderIndices))
11104 ReorderIndices.clear();
11105 SmallVector<int> Mask;
11106 if (!ReorderIndices.empty())
11107 inversePermutation(ReorderIndices, Mask);
11108 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11109 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
11110 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
11111 // Check non-profitable single register ops, which better to be represented
11112 // as alternate ops.
11113 if (NumParts >= VL.size())
11114 return false;
11115 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11116 InstructionCost InsertCost = ::getShuffleCost(
11117 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
11118 FixedVectorType *SubVecTy =
11119 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
11120 InstructionCost NewShuffleCost =
11121 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
11122 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11123 (Mask.empty() || InsertCost >= NewShuffleCost))
11124 return false;
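// For alternate binary/cast/unary ops, compare the original lowering (two
// full-width vector ops blended by a two-source shuffle) against the split
// lowering (two narrower ops plus a subvector insert, and an extra shuffle
// when the tree root is a store); split only if the latter is cheaper.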
11125 if ((LocalState.getMainOp()->isBinaryOp() &&
11126 LocalState.getAltOp()->isBinaryOp() &&
11127 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11128 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11129 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11130 (LocalState.getMainOp()->isUnaryOp() &&
11131 LocalState.getAltOp()->isUnaryOp())) {
11132 InstructionCost OriginalVecOpsCost =
11133 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11134 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11135 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11136 for (unsigned Idx : seq<unsigned>(VL.size())) {
11137 if (isa<PoisonValue>(VL[Idx]))
11138 continue;
11139 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11140 }
11141 InstructionCost OriginalCost =
11142 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
11143 VecTy, OriginalMask, Kind);
11144 InstructionCost NewVecOpsCost =
11145 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11146 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11147 InstructionCost NewCost =
11148 NewVecOpsCost + InsertCost +
11149 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11150 VectorizableTree.front()->getOpcode() == Instruction::Store
11151 ? NewShuffleCost
11152 : 0);
11153 // If not profitable to split - exit.
11154 if (NewCost >= OriginalCost)
11155 return false;
11156 }
11157 return true;
11158}
11159
11160namespace {
11161 /// Class that accepts an incoming list of values, checks if it is able to model
11162 /// "copyable" values as compatible operations, and generates the list of values
11163 /// for scheduling and the lists of operands for the new nodes.
11164class InstructionsCompatibilityAnalysis {
11165 DominatorTree &DT;
11166 const DataLayout &DL;
11167 const TargetTransformInfo &TTI;
11168 const TargetLibraryInfo &TLI;
11169 unsigned MainOpcode = 0;
11170 Instruction *MainOp = nullptr;
11171
11172 /// Checks if the opcode is supported as the main opcode for copyable
11173 /// elements.
11174 static bool isSupportedOpcode(const unsigned Opcode) {
11175 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11176 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11177 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11178 Opcode == Instruction::And || Opcode == Instruction::Or ||
11179 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11180 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11181 Opcode == Instruction::FDiv;
11182 }
11183
11184 /// Identifies the best candidate value, which represents main opcode
11185 /// operation.
11186 /// Currently the best candidate is the Add instruction whose parent block
11187 /// has the highest DFS-in number (the block that dominates the others).
11188 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11189 BasicBlock *Parent = nullptr;
11190 // Checks if the instruction has supported opcode.
11191 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11192 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11193 return false;
11194 return I && isSupportedOpcode(I->getOpcode()) &&
11195 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
11196 };
11197 // Exclude operands instructions immediately to improve compile time, it
11198 // will be unable to schedule anyway.
11199 SmallDenseSet<Value *, 8> Operands;
11200 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11201 bool AnyUndef = false;
11202 for (Value *V : VL) {
11203 auto *I = dyn_cast<Instruction>(V);
11204 if (!I) {
11205 AnyUndef |= isa<UndefValue>(V);
11206 continue;
11207 }
11208 if (!DT.isReachableFromEntry(I->getParent()))
11209 continue;
11210 if (Candidates.empty()) {
11211 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11212 Parent = I->getParent();
11213 Operands.insert(I->op_begin(), I->op_end());
11214 continue;
11215 }
11216 if (Parent == I->getParent()) {
11217 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11218 Operands.insert(I->op_begin(), I->op_end());
11219 continue;
11220 }
11221 auto *NodeA = DT.getNode(Parent);
11222 auto *NodeB = DT.getNode(I->getParent());
11223 assert(NodeA && "Should only process reachable instructions");
11224 assert(NodeB && "Should only process reachable instructions");
11225 assert((NodeA == NodeB) ==
11226 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11227 "Different nodes should have different DFS numbers");
11228 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11229 Candidates.clear();
11230 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11231 Parent = I->getParent();
11232 Operands.clear();
11233 Operands.insert(I->op_begin(), I->op_end());
11234 }
11235 }
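// Select the main opcode among the collected candidates: prefer opcode groups
// whose instructions are all used outside the block, then the largest group,
// and skip groups that feed other candidates (those would not be schedulable
// anyway).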
11236 unsigned BestOpcodeNum = 0;
11237 MainOp = nullptr;
11238 bool UsedOutside = false;
11239 for (const auto &P : Candidates) {
11240 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
11241 if (UsedOutside && !PUsedOutside)
11242 continue;
11243 if (!UsedOutside && PUsedOutside)
11244 BestOpcodeNum = 0;
11245 if (P.second.size() < BestOpcodeNum)
11246 continue;
11247 // If have inner dependencies - skip.
11248 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11249 return Operands.contains(I);
11250 }))
11251 continue;
11252 UsedOutside = PUsedOutside;
11253 for (Instruction *I : P.second) {
11254 if (IsSupportedInstruction(I, AnyUndef)) {
11255 MainOp = I;
11256 BestOpcodeNum = P.second.size();
11257 break;
11258 }
11259 }
11260 }
11261 if (MainOp) {
11262 // Do not match, if any copyable is a terminator from the same block as
11263 // the main operation.
11264 if (any_of(VL, [&](Value *V) {
11265 auto *I = dyn_cast<Instruction>(V);
11266 return I && I->getParent() == MainOp->getParent() &&
11267 I->isTerminator();
11268 })) {
11269 MainOp = nullptr;
11270 return;
11271 }
11272 MainOpcode = MainOp->getOpcode();
11273 }
11274 }
11275
11276 /// Returns the idempotent value for the \p MainOp with the detected \p
11277 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11278 /// the operand itself, since V or V == V.
11279 Value *selectBestIdempotentValue() const {
11280 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11281 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
11282 !MainOp->isCommutative());
11283 }
11284
11285 /// Returns the value and operands for \p V: if it is an original instruction,
11286 /// its actual operands are returned; if it is a copyable element, it is
11287 /// represented as an idempotent instruction.
11288 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11289 if (isa<PoisonValue>(V))
11290 return {V, V};
11291 if (!S.isCopyableElement(V))
11292 return convertTo(cast<Instruction>(V), S).second;
11293 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11294 return {V, selectBestIdempotentValue()};
11295 }
11296
11297 /// Builds operands for the original instructions.
11298 void
11299 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11300 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11301
11302 unsigned ShuffleOrOp =
11303 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11304 Instruction *VL0 = S.getMainOp();
11305
11306 switch (ShuffleOrOp) {
11307 case Instruction::PHI: {
11308 auto *PH = cast<PHINode>(VL0);
11309
11310 // Keeps the reordered operands to avoid code duplication.
11311 PHIHandler Handler(DT, PH, VL);
11312 Handler.buildOperands();
11313 Operands.assign(PH->getNumOperands(), {});
11314 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11315 Operands[I].assign(Handler.getOperands(I).begin(),
11316 Handler.getOperands(I).end());
11317 return;
11318 }
11319 case Instruction::ExtractValue:
11320 case Instruction::ExtractElement:
11321 // This is a special case, as it does not gather, but at the same time
11322 // we are not extending buildTree_rec() towards the operands.
11323 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11324 return;
11325 case Instruction::InsertElement:
11326 Operands.assign(2, {VL.size(), nullptr});
11327 for (auto [Idx, V] : enumerate(VL)) {
11328 auto *IE = cast<InsertElementInst>(V);
11329 for (auto [OpIdx, Ops] : enumerate(Operands))
11330 Ops[Idx] = IE->getOperand(OpIdx);
11331 }
11332 return;
11333 case Instruction::Load:
11334 Operands.assign(
11335 1, {VL.size(),
11336 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11337 for (auto [V, Op] : zip(VL, Operands.back())) {
11338 auto *LI = dyn_cast<LoadInst>(V);
11339 if (!LI)
11340 continue;
11341 Op = LI->getPointerOperand();
11342 }
11343 return;
11344 case Instruction::ZExt:
11345 case Instruction::SExt:
11346 case Instruction::FPToUI:
11347 case Instruction::FPToSI:
11348 case Instruction::FPExt:
11349 case Instruction::PtrToInt:
11350 case Instruction::IntToPtr:
11351 case Instruction::SIToFP:
11352 case Instruction::UIToFP:
11353 case Instruction::Trunc:
11354 case Instruction::FPTrunc:
11355 case Instruction::BitCast:
11356 case Instruction::ICmp:
11357 case Instruction::FCmp:
11358 case Instruction::Select:
11359 case Instruction::FNeg:
11360 case Instruction::Add:
11361 case Instruction::FAdd:
11362 case Instruction::Sub:
11363 case Instruction::FSub:
11364 case Instruction::Mul:
11365 case Instruction::FMul:
11366 case Instruction::UDiv:
11367 case Instruction::SDiv:
11368 case Instruction::FDiv:
11369 case Instruction::URem:
11370 case Instruction::SRem:
11371 case Instruction::FRem:
11372 case Instruction::Shl:
11373 case Instruction::LShr:
11374 case Instruction::AShr:
11375 case Instruction::And:
11376 case Instruction::Or:
11377 case Instruction::Xor:
11378 case Instruction::Freeze:
11379 case Instruction::Store:
11380 case Instruction::ShuffleVector:
11381 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11382 for (auto [Idx, V] : enumerate(VL)) {
11383 auto *I = dyn_cast<Instruction>(V);
11384 if (!I) {
11385 for (auto [OpIdx, Ops] : enumerate(Operands))
11386 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11387 continue;
11388 }
11389 auto [Op, ConvertedOps] = convertTo(I, S);
11390 for (auto [OpIdx, Ops] : enumerate(Operands))
11391 Ops[Idx] = ConvertedOps[OpIdx];
11392 }
11393 return;
11394 case Instruction::GetElementPtr: {
11395 Operands.assign(2, {VL.size(), nullptr});
11396 // Need to cast all indices to the same type before vectorization to
11397 // avoid crash.
11398 // Required to be able to find correct matches between different gather
11399 // nodes and reuse the vectorized values rather than trying to gather them
11400 // again.
11401 const unsigned IndexIdx = 1;
11402 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11403 Type *Ty =
11404 all_of(VL,
11405 [&](Value *V) {
11406 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11407 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11408 })
11409 ? VL0Ty
11410 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11411 ->getPointerOperandType()
11412 ->getScalarType());
11413 for (auto [Idx, V] : enumerate(VL)) {
11414 auto *GEP = dyn_cast<GetElementPtrInst>(V);
11415 if (!GEP) {
11416 Operands[0][Idx] = V;
11417 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11418 continue;
11419 }
11420 Operands[0][Idx] = GEP->getPointerOperand();
11421 auto *Op = GEP->getOperand(IndexIdx);
11422 auto *CI = dyn_cast<ConstantInt>(Op);
11423 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11424 CI, Ty, CI->getValue().isSignBitSet(), DL)
11425 : Op;
11426 }
11427 return;
11428 }
11429 case Instruction::Call: {
11430 auto *CI = cast<CallInst>(VL0);
11431 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
11432 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11433 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
11434 continue;
11435 auto &Ops = Operands.emplace_back();
11436 for (Value *V : VL) {
11437 auto *I = dyn_cast<Instruction>(V);
11438 Ops.push_back(I ? I->getOperand(Idx)
11439 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11440 }
11441 }
11442 return;
11443 }
11444 default:
11445 break;
11446 }
11447 llvm_unreachable("Unexpected vectorization of the instructions.");
11448 }
11449
11450public:
11451 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11452 const TargetTransformInfo &TTI,
11453 const TargetLibraryInfo &TLI)
11454 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11455
11456 InstructionsState
11457 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11458 bool TryCopyableElementsVectorization,
11459 bool WithProfitabilityCheck = false,
11460 bool SkipSameCodeCheck = false) {
11461 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11462 ? InstructionsState::invalid()
11463 : getSameOpcode(VL, TLI);
11464 if (S)
11465 return S;
11466 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11467 return S;
11468 findAndSetMainInstruction(VL, R);
11469 if (!MainOp)
11470 return InstructionsState::invalid();
11471 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11472 if (!WithProfitabilityCheck)
11473 return S;
11474 // Check if it is profitable to vectorize the instruction.
11475 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11476 auto BuildCandidates =
11477 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11478 Value *V2) {
11479 if (V1 != V2 && isa<PHINode>(V1))
11480 return;
11481 auto *I1 = dyn_cast<Instruction>(V1);
11482 auto *I2 = dyn_cast<Instruction>(V2);
11483 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11484 I1->getParent() != I2->getParent())
11485 return;
11486 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11487 };
11488 if (VL.size() == 2) {
11489 // Check if the operands allow better vectorization.
11490 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11491 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11492 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11493 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11494 R.findBestRootPair(Candidates1) &&
11495 R.findBestRootPair(Candidates2);
11496 if (!Res && isCommutative(MainOp)) {
11497 Candidates1.clear();
11498 Candidates2.clear();
11499 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11500 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11501 Res = !Candidates1.empty() && !Candidates2.empty() &&
11502 R.findBestRootPair(Candidates1) &&
11503 R.findBestRootPair(Candidates2);
11504 }
11505 if (!Res)
11506 return InstructionsState::invalid();
11507 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11508 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11509 InstructionCost VectorCost;
11510 FixedVectorType *VecTy =
11511 getWidenedType(S.getMainOp()->getType(), VL.size());
11512 switch (MainOpcode) {
11513 case Instruction::Add:
11514 case Instruction::Sub:
11515 case Instruction::LShr:
11516 case Instruction::Shl:
11517 case Instruction::SDiv:
11518 case Instruction::UDiv:
11519 case Instruction::And:
11520 case Instruction::Or:
11521 case Instruction::Xor:
11522 case Instruction::FAdd:
11523 case Instruction::FMul:
11524 case Instruction::FSub:
11525 case Instruction::FDiv:
11526 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11527 break;
11528 default:
11529 llvm_unreachable("Unexpected instruction.");
11530 }
11531 if (VectorCost > ScalarCost)
11532 return InstructionsState::invalid();
11533 return S;
11534 }
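// For longer bundles, limit how many lanes may be modeled as copyable, require
// the second operand list to contain at most one instruction, and check that
// the first operand list is easy to vectorize on its own (all-constant, splat
// or near-splat, or forming a valid instruction state).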
11535 assert(Operands.size() == 2 && "Unexpected number of operands!");
11536 unsigned CopyableNum =
11537 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11538 if (CopyableNum < VL.size() / 2)
11539 return S;
11540 // Too many phi copyables - exit.
11541 const unsigned Limit = VL.size() / 24;
11542 if ((CopyableNum >= VL.size() - Limit ||
11543 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11544 CopyableNum >= MaxPHINumOperands) &&
11545 all_of(VL, [&](Value *V) {
11546 return isa<PHINode>(V) || !S.isCopyableElement(V);
11547 }))
11548 return InstructionsState::invalid();
11549 // Check profitability if number of copyables > VL.size() / 2.
11550 // 1. Reorder operands for better matching.
11551 if (isCommutative(MainOp)) {
11552 for (auto &Ops : Operands) {
11553 // Make instructions the first operands.
11554 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11555 std::swap(Ops.front(), Ops.back());
11556 continue;
11557 }
11558 // Make constants the second operands.
11559 if (isa<Constant>(Ops.front())) {
11560 std::swap(Ops.front(), Ops.back());
11561 continue;
11562 }
11563 }
11564 }
11565 // 2. Check, if operands can be vectorized.
11566 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11567 return InstructionsState::invalid();
11568 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11569 if (allConstant(Ops) || isSplat(Ops))
11570 return true;
11571 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11572 // single one is different.
11573 constexpr unsigned Limit = 4;
11574 if (Operands.front().size() >= Limit) {
11575 SmallDenseMap<const Value *, unsigned> Counters;
11576 for (Value *V : Ops) {
11577 if (isa<UndefValue>(V))
11578 continue;
11579 ++Counters[V];
11580 }
11581 if (Counters.size() == 2 &&
11582 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11583 return C.second == 1;
11584 }))
11585 return true;
11586 }
11587 // First operand not a constant or splat? Last attempt - check for
11588 // potential vectorization.
11589 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11590 InstructionsState OpS = Analysis.buildInstructionsState(
11591 Ops, R, /*TryCopyableElementsVectorization=*/true);
11592 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11593 return false;
11594 unsigned CopyableNum =
11595 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11596 return CopyableNum <= VL.size() / 2;
11597 };
11598 if (!CheckOperand(Operands.front()))
11599 return InstructionsState::invalid();
11600
11601 return S;
11602 }
11603
11604 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11605 ArrayRef<Value *> VL) {
11606 assert(S && "Invalid state!");
11607 SmallVector<BoUpSLP::ValueList> Operands;
11608 if (S.areInstructionsWithCopyableElements()) {
11609 MainOp = S.getMainOp();
11610 MainOpcode = S.getOpcode();
11611 Operands.assign(MainOp->getNumOperands(),
11612 BoUpSLP::ValueList(VL.size(), nullptr));
11613 for (auto [Idx, V] : enumerate(VL)) {
11614 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11615 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11616 Operands[OperandIdx][Idx] = Operand;
11617 }
11618 } else {
11619 buildOriginalOperands(S, VL, Operands);
11620 }
11621 return Operands;
11622 }
11623};
11624} // namespace
11625
11626BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11627 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11628 bool TryCopyableElementsVectorization) const {
11629 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11630
11631 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11632 InstructionsState S = Analysis.buildInstructionsState(
11633 VL, *this, TryCopyableElementsVectorization,
11634 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11635
11636 bool AreScatterAllGEPSameBlock = false;
11637 if (!S) {
11638 SmallVector<unsigned> SortedIndices;
11639 BasicBlock *BB = nullptr;
11640 bool IsScatterVectorizeUserTE =
11641 UserTreeIdx.UserTE &&
11642 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11643 AreScatterAllGEPSameBlock =
11644 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11645 VL.size() > 2 &&
11646 all_of(VL,
11647 [&BB](Value *V) {
11648 auto *I = dyn_cast<GetElementPtrInst>(V);
11649 if (!I)
11650 return doesNotNeedToBeScheduled(V);
11651 if (!BB)
11652 BB = I->getParent();
11653 return BB == I->getParent() && I->getNumOperands() == 2;
11654 }) &&
11655 BB &&
11656 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
11657 *SE, SortedIndices));
11658 if (!AreScatterAllGEPSameBlock) {
11659 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11660 "C,S,B,O, small shuffle. \n";
11661 dbgs() << "[";
11662 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11663 dbgs() << "]\n");
11664 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11665 /*TryToFindDuplicates=*/true,
11666 /*TrySplitVectorize=*/true);
11667 }
11668 // Reset S to make it GetElementPtr kind of node.
11669 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11670 assert(It != VL.end() && "Expected at least one GEP.");
11671 S = getSameOpcode(*It, *TLI);
11672 }
11673 assert(S && "Must be valid.");
11674
11675 // Don't handle vectors.
11676 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11677 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11678 // Do not try to pack to avoid extra instructions here.
11679 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11680 /*TryToFindDuplicates=*/false);
11681 }
11682
11683 // Check that all of the users of the scalars that we want to vectorize are
11684 // schedulable.
11685 BasicBlock *BB = S.getMainOp()->getParent();
11686
11688 !DT->isReachableFromEntry(BB)) {
11689 // Don't go into unreachable blocks. They may contain instructions with
11690 // dependency cycles which confuse the final scheduling.
11691 // Do not vectorize EH and non-returning blocks, not profitable in most
11692 // cases.
11693 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11694 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11695 }
11696
11697 // Don't go into catchswitch blocks, which can happen with PHIs.
11698 // Such blocks can only have PHIs and the catchswitch. There is no
11699 // place to insert a shuffle if we need to, so just avoid that issue.
11700 if (isa<CatchSwitchInst>(BB->getTerminator())) {
11701 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11702 // Do not try to pack to avoid extra instructions here.
11703 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11704 /*TryToFindDuplicates=*/false);
11705 }
11706
11707 // Don't handle scalable vectors
11708 if (S.getOpcode() == Instruction::ExtractElement &&
11709 isa<ScalableVectorType>(
11710 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11711 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11712 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11713 }
11714
11715 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11716 // a load), in which case peek through to include it in the tree, without
11717 // ballooning over-budget.
11718 if (Depth >= RecursionMaxDepth &&
11719 (S.isAltShuffle() || VL.size() < 4 ||
11720 !(match(S.getMainOp(), m_Load(m_Value())) ||
11721 all_of(VL, [&S](const Value *I) {
11722 return match(I,
11723 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11724 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11725 })))) {
11726 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11727 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11728 }
11729
11730 // Check if this is a duplicate of another entry.
11731 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11732 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11733 if (E->isSame(VL)) {
11734 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11735 << ".\n");
11736 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11737 }
11738 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11739 if (all_of(VL, [&](Value *V) {
11740 return isa<PoisonValue>(V) || Values.contains(V) ||
11741 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11742 LI->getLoopFor(S.getMainOp()->getParent()) &&
11743 isVectorized(V));
11744 })) {
11745 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11746 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11747 }
11748 }
11749
11750 // If all of the operands are identical or constant we have a simple solution.
11751 // If we deal with insert/extract instructions, they all must have constant
11752 // indices, otherwise we should gather them, not try to vectorize.
11753 // If alternate op node with 2 elements with gathered operands - do not
11754 // vectorize.
11755 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11756 if (!S || !S.isAltShuffle() || VL.size() > 2)
11757 return false;
11758 if (VectorizableTree.size() < MinTreeSize)
11759 return false;
11760 if (Depth >= RecursionMaxDepth - 1)
11761 return true;
11762 // Check if all operands are extracts, part of vector node or can build a
11763 // regular vectorize node.
11764 SmallVector<unsigned, 8> InstsCount;
11765 for (Value *V : VL) {
11766 auto *I = cast<Instruction>(V);
11767 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11768 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11769 }));
11770 }
11771 bool IsCommutative =
11772 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11773 if ((IsCommutative &&
11774 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11775 (!IsCommutative &&
11776 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11777 return true;
11778 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11779 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11780 auto *I1 = cast<Instruction>(VL.front());
11781 auto *I2 = cast<Instruction>(VL.back());
11782 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11783 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11784 I2->getOperand(Op));
11785 if (static_cast<unsigned>(count_if(
11786 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11787 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11788 })) >= S.getMainOp()->getNumOperands() / 2)
11789 return false;
11790 if (S.getMainOp()->getNumOperands() > 2)
11791 return true;
11792 if (IsCommutative) {
11793 // Check permuted operands.
11794 Candidates.clear();
11795 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11796 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11797 I2->getOperand((Op + 1) % E));
11798 if (any_of(
11799 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11800 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11801 }))
11802 return false;
11803 }
11804 return true;
11805 };
11806 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11807 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11808 if (!AreAllSameInsts || isSplat(VL) ||
11809 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11810 S.getMainOp()) &&
11811 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11812 NotProfitableForVectorization(VL)) {
11813 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11814 dbgs() << "[";
11815 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11816 dbgs() << "]\n");
11817 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11818 }
11819
11820 // Don't vectorize ephemeral values.
11821 if (!EphValues.empty()) {
11822 for (Value *V : VL) {
11823 if (EphValues.count(V)) {
11824 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11825 << ") is ephemeral.\n");
11826 // Do not try to pack to avoid extra instructions here.
11827 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11828 /*TryToFindDuplicates=*/false);
11829 }
11830 }
11831 }
11832
11833 // We now know that this is a vector of instructions of the same type from
11834 // the same block.
11835
11836 // Check that none of the instructions in the bundle are already in the tree
11837 // and that the node would not be unprofitable to vectorize as a small
11838 // alternate node.
11839 if (S.isAltShuffle()) {
11840 auto GetNumVectorizedExtracted = [&]() {
11841 APInt Extracted = APInt::getZero(VL.size());
11842 APInt Vectorized = APInt::getAllOnes(VL.size());
11843 for (auto [Idx, V] : enumerate(VL)) {
11844 auto *I = dyn_cast<Instruction>(V);
11845 if (!I || doesNotNeedToBeScheduled(I) ||
11846 all_of(I->operands(), [&](const Use &U) {
11847 return isa<ExtractElementInst>(U.get());
11848 }))
11849 continue;
11850 if (isVectorized(I))
11851 Vectorized.clearBit(Idx);
11852 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11853 Extracted.setBit(Idx);
11854 }
11855 return std::make_pair(Vectorized, Extracted);
11856 };
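// Vectorized keeps a set bit for each lane that is not already covered by
// another vectorized node (such lanes would have to be rebuilt when
// scalarizing); Extracted marks lanes whose extra users would require an
// extractelement if vectorized here.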
11857 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11858 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11859 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11860 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11861 // Rough cost estimation, if the vector code (+ potential extracts) is
11862 // more profitable than the scalar + buildvector.
11863 Type *ScalarTy = VL.front()->getType();
11864 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11865 InstructionCost VectorizeCostEstimate =
11866 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11867 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11868 /*Insert=*/false, /*Extract=*/true, Kind);
11869 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11870 *TTI, ScalarTy, VecTy, Vectorized,
11871 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11872 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11873 }
11874 if (PreferScalarize) {
11875 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11876 "node is not profitable.\n");
11877 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11878 }
11879 }
11880
11881 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11882 if (UserIgnoreList && !UserIgnoreList->empty()) {
11883 for (Value *V : VL) {
11884 if (UserIgnoreList->contains(V)) {
11885 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11886 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11887 }
11888 }
11889 }
11890
11891 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11892}
11893
11894void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11895 const EdgeInfo &UserTreeIdx,
11896 unsigned InterleaveFactor) {
11897 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11898
11899 SmallVector<int> ReuseShuffleIndices;
11900 SmallVector<Value *> VL(VLRef);
11901
11902 // Tries to build split node.
11903 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11904 SmallVector<Value *> Op1, Op2;
11905 OrdersType ReorderIndices;
11906 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11907 return false;
11908
11909 auto Invalid = ScheduleBundle::invalid();
11910 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11911 UserTreeIdx, {}, ReorderIndices);
11912 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11913 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11914 InstructionsState S = getSameOpcode(Op, *TLI);
11915 if (S && (isa<LoadInst>(S.getMainOp()) ||
11916 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11917 // Build gather node for loads, they will be gathered later.
11918 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11919 Idx == 0 ? 0 : Op1.size());
11920 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11921 } else {
11922 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11923 Idx == 0 ? 0 : Op1.size());
11924 buildTreeRec(Op, Depth, {TE, Idx});
11925 }
11926 };
11927 AddNode(Op1, 0);
11928 AddNode(Op2, 1);
11929 return true;
11930 };
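// Bundles consisting only of constants and PHI nodes are gathered outright.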
11931
11932 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11933 bool AreConsts = false;
11934 for (Value *V : VL) {
11935 if (isa<PoisonValue>(V))
11936 continue;
11937 if (isa<Constant>(V)) {
11938 AreConsts = true;
11939 continue;
11940 }
11941 if (!isa<PHINode>(V))
11942 return false;
11943 }
11944 return AreConsts;
11945 };
11946 if (AreOnlyConstsWithPHIs(VL)) {
11947 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11948 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11949 return;
11950 }
11951
11952 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11953 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11954 InstructionsState S = Legality.getInstructionsState();
11955 if (!Legality.isLegal()) {
11956 if (Legality.trySplitVectorize()) {
11957 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11958 // Last chance to try to vectorize alternate node.
11959 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11960 return;
11961 }
11962 if (!S)
11963 Legality = getScalarsVectorizationLegality(
11964 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11965 if (!Legality.isLegal()) {
11966 if (Legality.tryToFindDuplicates())
11967 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11968 UserTreeIdx);
11969
11970 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11971 return;
11972 }
11973 S = Legality.getInstructionsState();
11974 }
11975
11976 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11977 if (S.isAltShuffle() && TrySplitNode(S))
11978 return;
11979
11980 // Check that every instruction appears once in this bundle.
11981 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11982 /*TryPad=*/true)) {
11983 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11984 return;
11985 }
11986
11987 // Perform specific checks for each particular instruction kind.
11988 bool IsScatterVectorizeUserTE =
11989 UserTreeIdx.UserTE &&
11990 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11991 OrdersType CurrentOrder;
11992 SmallVector<Value *> PointerOps;
11993 StridedPtrInfo SPtrInfo;
11994 TreeEntry::EntryState State = getScalarsVectorizationState(
11995 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11996 if (State == TreeEntry::NeedToGather) {
11997 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11998 return;
11999 }
12000
12001 Instruction *VL0 = S.getMainOp();
12002 BasicBlock *BB = VL0->getParent();
12003 auto &BSRef = BlocksSchedules[BB];
12004 if (!BSRef)
12005 BSRef = std::make_unique<BlockScheduling>(BB);
12006
12007 BlockScheduling &BS = *BSRef;
12008
12009 SetVector<Value *> UniqueValues(llvm::from_range, VL);
12010 std::optional<ScheduleBundle *> BundlePtr =
12011 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
12012#ifdef EXPENSIVE_CHECKS
12013 // Make sure we didn't break any internal invariants
12014 BS.verify();
12015#endif
12016 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
12017 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
12018 // Last chance to try to vectorize alternate node.
12019 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
12020 return;
12021 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12022 NonScheduledFirst.insert(VL.front());
12023 if (S.getOpcode() == Instruction::Load &&
12024 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
12025 registerNonVectorizableLoads(ArrayRef(VL));
12026 return;
12027 }
12028 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
12029 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
12030 ScheduleBundle Empty;
12031 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
12032 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
12033
12034 unsigned ShuffleOrOp =
12035 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
12036 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
12037 // Postpone PHI nodes creation
12038 SmallVector<unsigned> PHIOps;
12039 for (unsigned I : seq<unsigned>(Operands.size())) {
12040 ArrayRef<Value *> Op = Operands[I];
12041 if (Op.empty())
12042 continue;
12043 InstructionsState S = getSameOpcode(Op, *TLI);
12044 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12045 buildTreeRec(Op, Depth + 1, {TE, I});
12046 else
12047 PHIOps.push_back(I);
12048 }
12049 for (unsigned I : PHIOps)
12050 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12051 };
12052 switch (ShuffleOrOp) {
12053 case Instruction::PHI: {
12054 TreeEntry *TE =
12055 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12056 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
12057 TE->dump());
12058
12059 TE->setOperands(Operands);
12060 CreateOperandNodes(TE, Operands);
12061 return;
12062 }
12063 case Instruction::ExtractValue:
12064 case Instruction::ExtractElement: {
12065 if (CurrentOrder.empty()) {
12066 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
12067 } else {
12068 LLVM_DEBUG({
12069 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
12070 "with order";
12071 for (unsigned Idx : CurrentOrder)
12072 dbgs() << " " << Idx;
12073 dbgs() << "\n";
12074 });
12075 fixupOrderingIndices(CurrentOrder);
12076 }
12077 // Insert new order with initial value 0, if it does not exist,
12078 // otherwise return the iterator to the existing one.
12079 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12080 ReuseShuffleIndices, CurrentOrder);
12081 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
12082 "(ExtractValueInst/ExtractElementInst).\n";
12083 TE->dump());
12084 // This is a special case, as it does not gather, but at the same time
12085 // we are not extending buildTreeRec() towards the operands.
12086 TE->setOperands(Operands);
12087 return;
12088 }
12089 case Instruction::InsertElement: {
12090 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
12091
12092 auto OrdCompare = [](const std::pair<int, int> &P1,
12093 const std::pair<int, int> &P2) {
12094 return P1.first > P2.first;
12095 };
12096 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
12097 decltype(OrdCompare)>
12098 Indices(OrdCompare);
12099 for (int I = 0, E = VL.size(); I < E; ++I) {
12100 unsigned Idx = *getElementIndex(VL[I]);
12101 Indices.emplace(Idx, I);
12102 }
12103 OrdersType CurrentOrder(VL.size(), VL.size());
12104 bool IsIdentity = true;
12105 for (int I = 0, E = VL.size(); I < E; ++I) {
12106 CurrentOrder[Indices.top().second] = I;
12107 IsIdentity &= Indices.top().second == I;
12108 Indices.pop();
12109 }
12110 if (IsIdentity)
12111 CurrentOrder.clear();
12112 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12113 {}, CurrentOrder);
12114 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
12115 TE->dump());
12116
12117 TE->setOperands(Operands);
12118 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
12119 return;
12120 }
12121 case Instruction::Load: {
12122 // Check that a vectorized load would load the same memory as a scalar
12123 // load. For example, we don't want to vectorize loads that are smaller
12124 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
12125 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
12126 // from such a struct, we read/write packed bits disagreeing with the
12127 // unvectorized version.
12128 TreeEntry *TE = nullptr;
12129 fixupOrderingIndices(CurrentOrder);
12130 switch (State) {
12131 case TreeEntry::Vectorize:
12132 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12133 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
12134 if (CurrentOrder.empty())
12135 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
12136 TE->dump());
12137 else
12138 LLVM_DEBUG(dbgs()
12139 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
12140 TE->dump());
12141 break;
12142 case TreeEntry::CompressVectorize:
12143 // Vectorizing non-consecutive loads with (masked)load + compress.
12144 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
12145 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12146 LLVM_DEBUG(
12147 dbgs()
12148 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12149 TE->dump());
12150 break;
12151 case TreeEntry::StridedVectorize:
12152 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12153 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
12154 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12155 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
12156 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
12157 TE->dump());
12158 break;
12159 case TreeEntry::ScatterVectorize:
12160 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12161 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
12162 UserTreeIdx, ReuseShuffleIndices);
12163 LLVM_DEBUG(
12164 dbgs()
12165 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12166 TE->dump());
12167 break;
12168 case TreeEntry::CombinedVectorize:
12169 case TreeEntry::SplitVectorize:
12170 case TreeEntry::NeedToGather:
12171 llvm_unreachable("Unexpected loads state.");
12172 }
12173 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12174 assert(Operands.size() == 1 && "Expected a single operand only");
12175 SmallVector<int> Mask;
12176 inversePermutation(CurrentOrder, Mask);
12177 reorderScalars(Operands.front(), Mask);
12178 }
12179 TE->setOperands(Operands);
12180 if (State == TreeEntry::ScatterVectorize)
12181 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
12182 return;
12183 }
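// Illustrative sketch of the load states handled above (simplified):
// consecutive loads such as
// \code
//   %l0 = load i32, ptr %p0
//   %l1 = load i32, ptr %p1   ; %p0 + 4 bytes
// \endcode
// form a plain Vectorize node; the same loads met in a shuffled order become
// a jumbled node with the permutation kept in CurrentOrder; strided or
// arbitrary addresses map to the StridedVectorize / ScatterVectorize states.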
12184 case Instruction::ZExt:
12185 case Instruction::SExt:
12186 case Instruction::FPToUI:
12187 case Instruction::FPToSI:
12188 case Instruction::FPExt:
12189 case Instruction::PtrToInt:
12190 case Instruction::IntToPtr:
12191 case Instruction::SIToFP:
12192 case Instruction::UIToFP:
12193 case Instruction::Trunc:
12194 case Instruction::FPTrunc:
12195 case Instruction::BitCast: {
12196 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12197 std::make_pair(std::numeric_limits<unsigned>::min(),
12198 std::numeric_limits<unsigned>::max()));
12199 if (ShuffleOrOp == Instruction::ZExt ||
12200 ShuffleOrOp == Instruction::SExt) {
12201 CastMaxMinBWSizes = std::make_pair(
12202 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12203 PrevMaxBW),
12204 std::min<unsigned>(
12205 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12206 PrevMinBW));
12207 } else if (ShuffleOrOp == Instruction::Trunc) {
12208 CastMaxMinBWSizes = std::make_pair(
12209 std::max<unsigned>(
12210 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12211 PrevMaxBW),
12212 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12213 PrevMinBW));
12214 }
12215 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12216 ReuseShuffleIndices);
12217 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
12218 TE->dump());
12219
12220 TE->setOperands(Operands);
12221 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12222 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12223 if (ShuffleOrOp == Instruction::Trunc) {
12224 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12225 } else if (ShuffleOrOp == Instruction::SIToFP ||
12226 ShuffleOrOp == Instruction::UIToFP) {
12227 unsigned NumSignBits =
12228 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12229 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
12230 APInt Mask = DB->getDemandedBits(OpI);
12231 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
12232 }
12233 if (NumSignBits * 2 >=
12234 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12235 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12236 }
12237 return;
12238 }
12239 case Instruction::ICmp:
12240 case Instruction::FCmp: {
12241 // Check that all of the compares have the same predicate.
12242 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12243 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12244 ReuseShuffleIndices);
12245 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
12246 TE->dump());
12247
12248 VLOperands Ops(VL, Operands, S, *this);
12249 if (cast<CmpInst>(VL0)->isCommutative()) {
12250 // Commutative predicate - collect + sort operands of the instructions
12251 // so that each side is more likely to have the same opcode.
12253 "Commutative Predicate mismatch");
12254 Ops.reorder();
12255 Operands.front() = Ops.getVL(0);
12256 Operands.back() = Ops.getVL(1);
12257 } else {
12258 // Collect operands - commute if it uses the swapped predicate.
12259 for (auto [Idx, V] : enumerate(VL)) {
12260 if (isa<PoisonValue>(V))
12261 continue;
12262 auto *Cmp = cast<CmpInst>(V);
12263 if (Cmp->getPredicate() != P0)
12264 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12265 }
12266 }
12267 TE->setOperands(Operands);
12268 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12269 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12270 if (ShuffleOrOp == Instruction::ICmp) {
12271 unsigned NumSignBits0 =
12272 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12273 if (NumSignBits0 * 2 >=
12274 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
12275 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12276 unsigned NumSignBits1 =
12277 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
12278 if (NumSignBits1 * 2 >=
12279 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
12280 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
12281 }
12282 return;
12283 }
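// For example (illustrative only): with P0 == sgt, the bundle
// \code
//   %c0 = icmp sgt i32 %a0, %b0
//   %c1 = icmp slt i32 %b1, %a1
// \endcode
// has %c1 written with the swapped predicate, so its operands are commuted
// above and both lanes are vectorized as a single sgt comparison.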
12284 case Instruction::Select:
12285 case Instruction::FNeg:
12286 case Instruction::Add:
12287 case Instruction::FAdd:
12288 case Instruction::Sub:
12289 case Instruction::FSub:
12290 case Instruction::Mul:
12291 case Instruction::FMul:
12292 case Instruction::UDiv:
12293 case Instruction::SDiv:
12294 case Instruction::FDiv:
12295 case Instruction::URem:
12296 case Instruction::SRem:
12297 case Instruction::FRem:
12298 case Instruction::Shl:
12299 case Instruction::LShr:
12300 case Instruction::AShr:
12301 case Instruction::And:
12302 case Instruction::Or:
12303 case Instruction::Xor:
12304 case Instruction::Freeze: {
12305 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12306 ReuseShuffleIndices);
12307 LLVM_DEBUG(
12308 dbgs() << "SLP: added a new TreeEntry "
12309 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12310 TE->dump());
12311
12312 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
12313 VLOperands Ops(VL, Operands, S, *this);
12314 Ops.reorder();
12315 Operands[0] = Ops.getVL(0);
12316 Operands[1] = Ops.getVL(1);
12317 }
12318 TE->setOperands(Operands);
12319 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12320 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12321 return;
12322 }
12323 case Instruction::GetElementPtr: {
12324 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12325 ReuseShuffleIndices);
12326 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12327 TE->dump());
12328 TE->setOperands(Operands);
12329
12330 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12331 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12332 return;
12333 }
12334 case Instruction::Store: {
12335 bool Consecutive = CurrentOrder.empty();
12336 if (!Consecutive)
12337 fixupOrderingIndices(CurrentOrder);
12338 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12339 ReuseShuffleIndices, CurrentOrder);
12340 if (Consecutive)
12341 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12342 TE->dump());
12343 else
12344 LLVM_DEBUG(
12345 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12346 TE->dump());
12347 TE->setOperands(Operands);
12348 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
12349 return;
12350 }
12351 case Instruction::Call: {
12352 // Check if the calls are all to the same vectorizable intrinsic or
12353 // library function.
12354 CallInst *CI = cast<CallInst>(VL0);
12355 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12356
12357 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12358 ReuseShuffleIndices);
12359 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12360 TE->dump());
12361 if (isCommutative(VL0)) {
12362 VLOperands Ops(VL, Operands, S, *this);
12363 Ops.reorder();
12364 Operands[0] = Ops.getVL(0);
12365 Operands[1] = Ops.getVL(1);
12366 }
12367 TE->setOperands(Operands);
12368 for (unsigned I : seq<unsigned>(CI->arg_size())) {
12369 // For scalar operands there is no need to create an entry, since there
12370 // is no need to vectorize them.
12371 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
12372 continue;
12373 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12374 }
12375 return;
12376 }
12377 case Instruction::ShuffleVector: {
12378 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12379 ReuseShuffleIndices);
12380 if (S.isAltShuffle()) {
12381 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12382 TE->dump());
12383 } else {
12384 assert(SLPReVec && "Only supported by REVEC.");
12385 LLVM_DEBUG(
12386 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12387 TE->dump());
12388 }
12389
12390 // Reorder operands if reordering would enable vectorization.
12391 auto *CI = dyn_cast<CmpInst>(VL0);
12392 if (CI && any_of(VL, [](Value *V) {
12393 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
12394 })) {
12395 auto *MainCI = cast<CmpInst>(S.getMainOp());
12396 auto *AltCI = cast<CmpInst>(S.getAltOp());
12397 CmpInst::Predicate MainP = MainCI->getPredicate();
12398 CmpInst::Predicate AltP = AltCI->getPredicate();
12399 assert(MainP != AltP &&
12400 "Expected different main/alternate predicates.");
12401 // Collect operands - commute if it uses the swapped predicate or
12402 // alternate operation.
12403 for (auto [Idx, V] : enumerate(VL)) {
12404 if (isa<PoisonValue>(V))
12405 continue;
12406 auto *Cmp = cast<CmpInst>(V);
12407
12408 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
12409 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12410 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12411 } else {
12412 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
12413 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
12414 }
12415 }
12416 TE->setOperands(Operands);
12417 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
12418 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
12419 return;
12420 }
12421
12422 if (isa<BinaryOperator>(VL0) || CI) {
12423 VLOperands Ops(VL, Operands, S, *this);
12424 Ops.reorder();
12425 Operands[0] = Ops.getVL(0);
12426 Operands[1] = Ops.getVL(1);
12427 }
12428 TE->setOperands(Operands);
12429 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12430 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
12431 return;
12432 }
12433 default:
12434 break;
12435 }
12436 llvm_unreachable("Unexpected vectorization of the instructions.");
12437}
12438
12439unsigned BoUpSLP::canMapToVector(Type *T) const {
12440 unsigned N = 1;
12441 Type *EltTy = T;
12442
12443 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12444 if (EltTy->isEmptyTy())
12445 return 0;
12446 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12447 // Check that struct is homogeneous.
12448 for (const auto *Ty : ST->elements())
12449 if (Ty != *ST->element_begin())
12450 return 0;
12451 N *= ST->getNumElements();
12452 EltTy = *ST->element_begin();
12453 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12454 N *= AT->getNumElements();
12455 EltTy = AT->getElementType();
12456 } else {
12457 auto *VT = cast<FixedVectorType>(EltTy);
12458 N *= VT->getNumElements();
12459 EltTy = VT->getElementType();
12460 }
12461 }
12462
12463 if (!isValidElementType(EltTy))
12464 return 0;
12465 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12466 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12467 VTSize != DL->getTypeStoreSizeInBits(T))
12468 return 0;
12469 return N;
12470}
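// A minimal example (assuming the register size limits permit a 128-bit
// vector): for the homogeneous aggregate
// \code
//   %struct.S = type { i32, i32, i32, i32 }
// \endcode
// canMapToVector returns 4, since the store size matches <4 x i32>; a
// non-homogeneous struct such as { i32, i64 } returns 0.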
12471
12472bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12473 SmallVectorImpl<unsigned> &CurrentOrder,
12474 bool ResizeAllowed) const {
12475 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12476 assert(It != VL.end() && "Expected at least one extract instruction.");
12477 auto *E0 = cast<Instruction>(*It);
12478 assert(
12480 "Invalid opcode");
12481 // Check if all of the extracts come from the same vector and from the
12482 // correct offset.
12483 Value *Vec = E0->getOperand(0);
12484
12485 CurrentOrder.clear();
12486
12487 // We have to extract from a vector/aggregate with the same number of elements.
12488 unsigned NElts;
12489 if (E0->getOpcode() == Instruction::ExtractValue) {
12490 NElts = canMapToVector(Vec->getType());
12491 if (!NElts)
12492 return false;
12493 // Check if load can be rewritten as load of vector.
12494 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12495 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12496 return false;
12497 } else {
12498 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12499 }
12500
12501 unsigned E = VL.size();
12502 if (!ResizeAllowed && NElts != E)
12503 return false;
12504 SmallVector<int> Indices(E, PoisonMaskElem);
12505 unsigned MinIdx = NElts, MaxIdx = 0;
12506 for (auto [I, V] : enumerate(VL)) {
12507 auto *Inst = dyn_cast<Instruction>(V);
12508 if (!Inst)
12509 continue;
12510 if (Inst->getOperand(0) != Vec)
12511 return false;
12512 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12513 if (isa<UndefValue>(EE->getIndexOperand()))
12514 continue;
12515 std::optional<unsigned> Idx = getExtractIndex(Inst);
12516 if (!Idx)
12517 return false;
12518 const unsigned ExtIdx = *Idx;
12519 if (ExtIdx >= NElts)
12520 continue;
12521 Indices[I] = ExtIdx;
12522 if (MinIdx > ExtIdx)
12523 MinIdx = ExtIdx;
12524 if (MaxIdx < ExtIdx)
12525 MaxIdx = ExtIdx;
12526 }
12527 if (MaxIdx - MinIdx + 1 > E)
12528 return false;
12529 if (MaxIdx + 1 <= E)
12530 MinIdx = 0;
12531
12532 // Check that all of the indices extract from the correct offset.
12533 bool ShouldKeepOrder = true;
12534 // Assign to all items the initial value E so we can check if the extract
12535 // instruction index was used already.
12536 // Also, later we can check that all the indices are used and we have a
12537 // consecutive access in the extract instructions, by checking that no
12538 // element of CurrentOrder still has value E.
12539 CurrentOrder.assign(E, E);
12540 for (unsigned I = 0; I < E; ++I) {
12541 if (Indices[I] == PoisonMaskElem)
12542 continue;
12543 const unsigned ExtIdx = Indices[I] - MinIdx;
12544 if (CurrentOrder[ExtIdx] != E) {
12545 CurrentOrder.clear();
12546 return false;
12547 }
12548 ShouldKeepOrder &= ExtIdx == I;
12549 CurrentOrder[ExtIdx] = I;
12550 }
12551 if (ShouldKeepOrder)
12552 CurrentOrder.clear();
12553
12554 return ShouldKeepOrder;
12555}
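// Illustrative example (simplified): for
// \code
//   %e0 = extractelement <4 x i32> %v, i32 0
//   %e1 = extractelement <4 x i32> %v, i32 1
//   %e2 = extractelement <4 x i32> %v, i32 2
//   %e3 = extractelement <4 x i32> %v, i32 3
// \endcode
// all extracts read the same vector at consecutive offsets, so CurrentOrder
// is cleared and the function returns true; with permuted indices,
// CurrentOrder instead records the order required to restore consecutiveness.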
12556
12557bool BoUpSLP::areAllUsersVectorized(
12558 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12559 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12560 all_of(I->users(), [this](User *U) {
12561 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12562 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12563 });
12564}
12565
12566void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12567 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12568 SmallVectorImpl<Value *> *OpScalars,
12569 SmallVectorImpl<Value *> *AltScalars) const {
12570 unsigned Sz = Scalars.size();
12571 Mask.assign(Sz, PoisonMaskElem);
12572 SmallVector<int> OrderMask;
12573 if (!ReorderIndices.empty())
12574 inversePermutation(ReorderIndices, OrderMask);
12575 for (unsigned I = 0; I < Sz; ++I) {
12576 unsigned Idx = I;
12577 if (!ReorderIndices.empty())
12578 Idx = OrderMask[I];
12579 if (isa<PoisonValue>(Scalars[Idx]))
12580 continue;
12581 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12582 if (IsAltOp(OpInst)) {
12583 Mask[I] = Sz + Idx;
12584 if (AltScalars)
12585 AltScalars->push_back(OpInst);
12586 } else {
12587 Mask[I] = Idx;
12588 if (OpScalars)
12589 OpScalars->push_back(OpInst);
12590 }
12591 }
12592 if (!ReuseShuffleIndices.empty()) {
12593 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12594 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12595 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12596 });
12597 Mask.swap(NewMask);
12598 }
12599}
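// Illustrative example (simplified): for an alternate add/sub entry with
// Scalars = {a+b, c-d, e+f, g-h} and IsAltOp matching the subtracts, the
// produced Mask is {0, Sz+1, 2, Sz+3} with Sz == 4, i.e. {0, 5, 2, 7}:
// lanes 0 and 2 come from the vectorized adds, lanes 1 and 3 from the
// vectorized subs.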
12600
12602 Instruction *AltOp,
12603 const TargetLibraryInfo &TLI) {
12604 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12605}
12606
12607 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12608 Instruction *AltOp,
12609 const TargetLibraryInfo &TLI) {
12610 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12611 auto *AltCI = cast<CmpInst>(AltOp);
12612 CmpInst::Predicate MainP = MainCI->getPredicate();
12613 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12614 assert(MainP != AltP && "Expected different main/alternate predicates.");
12615 auto *CI = cast<CmpInst>(I);
12616 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12617 return false;
12618 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12619 return true;
12620 CmpInst::Predicate P = CI->getPredicate();
12621 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12622
12623 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12624 "CmpInst expected to match either main or alternate predicate or "
12625 "their swap.");
12626 return MainP != P && MainP != SwappedP;
12627 }
12628 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12629}
12630
12631TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
12632 assert(!Ops.empty());
12633 const auto *Op0 = Ops.front();
12634
12635 const bool IsConstant = all_of(Ops, [](Value *V) {
12636 // TODO: We should allow undef elements here
12637 return isConstant(V) && !isa<UndefValue>(V);
12638 });
12639 const bool IsUniform = all_of(Ops, [=](Value *V) {
12640 // TODO: We should allow undef elements here
12641 return V == Op0;
12642 });
12643 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12644 // TODO: We should allow undef elements here
12645 if (auto *CI = dyn_cast<ConstantInt>(V))
12646 return CI->getValue().isPowerOf2();
12647 return false;
12648 });
12649 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12650 // TODO: We should allow undef elements here
12651 if (auto *CI = dyn_cast<ConstantInt>(V))
12652 return CI->getValue().isNegatedPowerOf2();
12653 return false;
12654 });
12655
12656 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12657 if (IsConstant && IsUniform)
12658 VK = TTI::OK_UniformConstantValue;
12659 else if (IsConstant)
12660 VK = TTI::OK_NonUniformConstantValue;
12661 else if (IsUniform)
12662 VK = TTI::OK_UniformValue;
12663
12664 TTI::OperandValueProperties VP = TTI::OP_None;
12665 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12666 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12667
12668 return {VK, VP};
12669}
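// For example (illustrative): if every operand is the constant 4, the result
// is {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}; for distinct
// non-constant operands it degrades to {TTI::OK_AnyValue, TTI::OP_None}.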
12670
12671namespace {
12672/// The base class for shuffle instruction emission and shuffle cost estimation.
12673class BaseShuffleAnalysis {
12674protected:
12675 Type *ScalarTy = nullptr;
12676
12677 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12678
12679 /// V is expected to be a vectorized value.
12680 /// When REVEC is disabled, there is no difference between VF and
12681 /// VNumElements.
12682 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12683 /// e.g., if ScalarTy is <4 x Ty> and V is <8 x Ty>, 2 is returned instead
12684 /// of 8.
12685 unsigned getVF(Value *V) const {
12686 assert(V && "V cannot be nullptr");
12687 assert(isa<FixedVectorType>(V->getType()) &&
12688 "V does not have FixedVectorType");
12689 assert(ScalarTy && "ScalarTy cannot be nullptr");
12690 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12691 unsigned VNumElements =
12692 cast<FixedVectorType>(V->getType())->getNumElements();
12693 assert(VNumElements > ScalarTyNumElements &&
12694 "the number of elements of V is not large enough");
12695 assert(VNumElements % ScalarTyNumElements == 0 &&
12696 "the number of elements of V is not a vectorized value");
12697 return VNumElements / ScalarTyNumElements;
12698 }
12699
12700 /// Checks if the mask is an identity mask.
12701 /// \param IsStrict if it is true, the function returns false if the mask
12702 /// size does not match the vector size.
12703 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12704 bool IsStrict) {
12705 int Limit = Mask.size();
12706 int VF = VecTy->getNumElements();
12707 int Index = -1;
12708 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12709 return true;
12710 if (!IsStrict) {
12711 // Consider extract subvector starting from index 0.
12712 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12713 Index == 0)
12714 return true;
12715 // All VF-size submasks are identity (e.g.
12716 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12717 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12718 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12719 return all_of(Slice, equal_to(PoisonMaskElem)) ||
12720 ShuffleVectorInst::isIdentityMask(Slice, VF);
12721 }))
12722 return true;
12723 }
12724 return false;
12725 }
12726
12727 /// Tries to combine 2 different masks into single one.
12728 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12729 /// change the size of the vector, \p LocalVF is the original size of the
12730 /// shuffled vector.
12731 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12732 ArrayRef<int> ExtMask) {
12733 unsigned VF = Mask.size();
12734 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12735 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12736 if (ExtMask[I] == PoisonMaskElem)
12737 continue;
12738 int MaskedIdx = Mask[ExtMask[I] % VF];
12739 NewMask[I] =
12740 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12741 }
12742 Mask.swap(NewMask);
12743 }
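// Worked example (illustrative): with LocalVF == 2, Mask == {1, 0} and
// ExtMask == {1, 0, 3, 2}, every ExtMask element indexes Mask modulo its
// size, giving NewMask == {0, 1, 0, 1} - the combined effect of both
// shuffles expressed directly on the original 2-element vector.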
12744
12745 /// Looks through shuffles trying to reduce final number of shuffles in the
12746 /// code. The function looks through the previously emitted shuffle
12747 /// instructions and properly marks indices in the mask as undef.
12748 /// For example, given the code
12749 /// \code
12750 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12751 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12752 /// \endcode
12753 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12754 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12755 /// <0, 1, 2, 3> for the shuffle.
12756 /// If 2 operands are of different size, the smallest one will be resized and
12757 /// the mask recalculated properly.
12758 /// For example, given the code
12759 /// \code
12760 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12761 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12762 /// \endcode
12763 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12764 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12765 /// <0, 1, 2, 3> for the shuffle.
12766 /// So, it tries to transform permutations to simple vector merge, if
12767 /// possible.
12768 /// \param V The input vector which must be shuffled using the given \p Mask.
12769 /// If the better candidate is found, \p V is set to this best candidate
12770 /// vector.
12771 /// \param Mask The input mask for the shuffle. If the best candidate is found
12772 /// during looking-through-shuffles attempt, it is updated accordingly.
12773 /// \param SinglePermute true if the shuffle operation is originally a
12774 /// single-value-permutation. In this case the look-through-shuffles procedure
12775 /// may look for resizing shuffles as the best candidates.
12776 /// \return true if the shuffle results in the non-resizing identity shuffle
12777 /// (and thus can be ignored), false - otherwise.
12778 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12779 bool SinglePermute) {
12780 Value *Op = V;
12781 ShuffleVectorInst *IdentityOp = nullptr;
12782 SmallVector<int> IdentityMask;
12783 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12784 // Exit if not a fixed vector type or changing size shuffle.
12785 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12786 if (!SVTy)
12787 break;
12788 // Remember the identity or broadcast mask, if it is not a resizing
12789 // shuffle. If no better candidates are found, this Op and Mask will be
12790 // used in the final shuffle.
12791 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12792 if (!IdentityOp || !SinglePermute ||
12793 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12794 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12795 IdentityMask.size()))) {
12796 IdentityOp = SV;
12797 // Store the current mask in IdentityMask so that we do not lose this
12798 // info later if IdentityOp is selected as the best candidate for the
12799 // permutation.
12800 IdentityMask.assign(Mask);
12801 }
12802 }
12803 // Remember the broadcast mask. If no better candidates are found, this Op
12804 // and Mask will be used in the final shuffle.
12805 // Zero splat can be used as identity too, since it might be used with
12806 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12807 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12808 // expensive, and the analysis finds out that the source vector is just a
12809 // broadcast, this original mask can be transformed to the identity mask <0,
12810 // 1, 2, 3>.
12811 // \code
12812 // %0 = shuffle %v, poison, zeroinitializer
12813 // %res = shuffle %0, poison, <3, 1, 2, 0>
12814 // \endcode
12815 // may be transformed to
12816 // \code
12817 // %0 = shuffle %v, poison, zeroinitializer
12818 // %res = shuffle %0, poison, <0, 1, 2, 3>
12819 // \endcode
12820 if (SV->isZeroEltSplat()) {
12821 IdentityOp = SV;
12822 IdentityMask.assign(Mask);
12823 }
12824 int LocalVF = Mask.size();
12825 if (auto *SVOpTy =
12826 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12827 LocalVF = SVOpTy->getNumElements();
12828 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12829 for (auto [Idx, I] : enumerate(Mask)) {
12830 if (I == PoisonMaskElem ||
12831 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12832 continue;
12833 ExtMask[Idx] = SV->getMaskValue(I);
12834 }
12835 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12836 SV->getOperand(0),
12837 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12838 .all();
12839 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12840 SV->getOperand(1),
12841 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12842 .all();
12843 if (!IsOp1Undef && !IsOp2Undef) {
12844 // Update mask and mark undef elems.
12845 for (int &I : Mask) {
12846 if (I == PoisonMaskElem)
12847 continue;
12848 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12849 PoisonMaskElem)
12850 I = PoisonMaskElem;
12851 }
12852 break;
12853 }
12854 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12855 combineMasks(LocalVF, ShuffleMask, Mask);
12856 Mask.swap(ShuffleMask);
12857 if (IsOp2Undef)
12858 Op = SV->getOperand(0);
12859 else
12860 Op = SV->getOperand(1);
12861 }
12862 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12863 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12864 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12865 if (IdentityOp) {
12866 V = IdentityOp;
12867 assert(Mask.size() == IdentityMask.size() &&
12868 "Expected masks of same sizes.");
12869 // Clear known poison elements.
12870 for (auto [I, Idx] : enumerate(Mask))
12871 if (Idx == PoisonMaskElem)
12872 IdentityMask[I] = PoisonMaskElem;
12873 Mask.swap(IdentityMask);
12874 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12875 return SinglePermute &&
12876 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12877 /*IsStrict=*/true) ||
12878 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12879 Shuffle->isZeroEltSplat() &&
12880 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12881 all_of(enumerate(Mask), [&](const auto &P) {
12882 return P.value() == PoisonMaskElem ||
12883 Shuffle->getShuffleMask()[P.index()] == 0;
12884 })));
12885 }
12886 V = Op;
12887 return false;
12888 }
12889 V = Op;
12890 return true;
12891 }
12892
12893 /// Smart shuffle instruction emission, walks through shuffles trees and
12894 /// tries to find the best matching vector for the actual shuffle
12895 /// instruction.
12896 template <typename T, typename ShuffleBuilderTy>
12897 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12898 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12899 assert(V1 && "Expected at least one vector value.");
12900 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12901 SmallVector<int> NewMask(Mask);
12902 if (ScalarTyNumElements != 1) {
12903 assert(SLPReVec && "FixedVectorType is not expected.");
12904 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12905 Mask = NewMask;
12906 }
12907 if (V2)
12908 Builder.resizeToMatch(V1, V2);
12909 int VF = Mask.size();
12910 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12911 VF = FTy->getNumElements();
12912 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12913 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12914 .all()) {
12915 // Peek through shuffles.
12916 Value *Op1 = V1;
12917 Value *Op2 = V2;
12918 int VF =
12919 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12920 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12921 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12922 for (int I = 0, E = Mask.size(); I < E; ++I) {
12923 if (Mask[I] < VF)
12924 CombinedMask1[I] = Mask[I];
12925 else
12926 CombinedMask2[I] = Mask[I] - VF;
12927 }
12928 Value *PrevOp1;
12929 Value *PrevOp2;
12930 do {
12931 PrevOp1 = Op1;
12932 PrevOp2 = Op2;
12933 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12934 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12935 // Check if we have 2 resizing shuffles - need to peek through operands
12936 // again.
12937 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12938 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12939 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12940 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12941 if (I == PoisonMaskElem)
12942 continue;
12943 ExtMask1[Idx] = SV1->getMaskValue(I);
12944 }
12945 SmallBitVector UseMask1 = buildUseMask(
12946 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12947 ->getNumElements(),
12948 ExtMask1, UseMask::SecondArg);
12949 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12950 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12951 if (I == PoisonMaskElem)
12952 continue;
12953 ExtMask2[Idx] = SV2->getMaskValue(I);
12954 }
12955 SmallBitVector UseMask2 = buildUseMask(
12956 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12957 ->getNumElements(),
12958 ExtMask2, UseMask::SecondArg);
12959 if (SV1->getOperand(0)->getType() ==
12960 SV2->getOperand(0)->getType() &&
12961 SV1->getOperand(0)->getType() != SV1->getType() &&
12962 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12963 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12964 Op1 = SV1->getOperand(0);
12965 Op2 = SV2->getOperand(0);
12966 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12967 int LocalVF = ShuffleMask1.size();
12968 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12969 LocalVF = FTy->getNumElements();
12970 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12971 CombinedMask1.swap(ShuffleMask1);
12972 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12973 LocalVF = ShuffleMask2.size();
12974 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12975 LocalVF = FTy->getNumElements();
12976 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12977 CombinedMask2.swap(ShuffleMask2);
12978 }
12979 }
12980 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12981 Builder.resizeToMatch(Op1, Op2);
12982 VF = std::max(cast<VectorType>(Op1->getType())
12983 ->getElementCount()
12984 .getKnownMinValue(),
12985 cast<VectorType>(Op2->getType())
12986 ->getElementCount()
12987 .getKnownMinValue());
12988 for (int I = 0, E = Mask.size(); I < E; ++I) {
12989 if (CombinedMask2[I] != PoisonMaskElem) {
12990 assert(CombinedMask1[I] == PoisonMaskElem &&
12991 "Expected undefined mask element");
12992 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12993 }
12994 }
12995 if (Op1 == Op2 &&
12996 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12997 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12998 isa<ShuffleVectorInst>(Op1) &&
12999 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
13000 ArrayRef(CombinedMask1))))
13001 return Builder.createIdentity(Op1);
13002 return Builder.createShuffleVector(
13003 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
13004 CombinedMask1);
13005 }
13006 if (isa<PoisonValue>(V1))
13007 return Builder.createPoison(
13008 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
13009 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
13010 assert(V1 && "Expected non-null value after looking through shuffles.");
13011
13012 if (!IsIdentity)
13013 return Builder.createShuffleVector(V1, NewMask);
13014 return Builder.createIdentity(V1);
13015 }
13016
13017 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
13018 /// shuffle emission.
13019 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
13020 ArrayRef<int> Mask) {
13021 for (unsigned I : seq<unsigned>(CommonMask.size()))
13022 if (Mask[I] != PoisonMaskElem)
13023 CommonMask[I] = I;
13024 }
13025};
13026} // namespace
13027
13028/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
13029static std::pair<InstructionCost, InstructionCost>
13030 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
13031 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
13032 Type *ScalarTy, VectorType *VecTy) {
13033 InstructionCost ScalarCost = 0;
13034 InstructionCost VecCost = 0;
13035 // Here we differentiate two cases: (1) when Ptrs represent a regular
13036 // vectorization tree node (as they are pointer arguments of scattered
13037 // loads) or (2) when Ptrs are the arguments of loads or stores being
13038 // vectorized as plain wide unit-stride load/store since all the
13039 // loads/stores are known to be from/to adjacent locations.
13040 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13041 // Case 2: estimate costs for pointer related costs when vectorizing to
13042 // a wide load/store.
13043 // Scalar cost is estimated as a set of pointers with known relationship
13044 // between them.
13045 // For vector code we will use BasePtr as argument for the wide load/store
13046 // but we also need to account all the instructions which are going to
13047 // stay in vectorized code due to uses outside of these scalar
13048 // loads/stores.
13049 ScalarCost = TTI.getPointersChainCost(
13050 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13051 CostKind);
13052
13053 SmallVector<const Value *> PtrsRetainedInVecCode;
13054 for (Value *V : Ptrs) {
13055 if (V == BasePtr) {
13056 PtrsRetainedInVecCode.push_back(V);
13057 continue;
13058 }
13059 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13060 // For simplicity assume Ptr to stay in vectorized code if it's not a
13061 // GEP instruction. We don't care since its cost is considered free.
13062 // TODO: We should check for any uses outside of vectorizable tree
13063 // rather than just single use.
13064 if (!Ptr || !Ptr->hasOneUse())
13065 PtrsRetainedInVecCode.push_back(V);
13066 }
13067
13068 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
13069 // If all pointers stay in vectorized code then we don't have
13070 // any savings on that.
13071 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
13072 }
13073 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
13074 TTI::PointersChainInfo::getKnownStride(),
13075 VecTy, CostKind);
13076 } else {
13077 // Case 1: Ptrs are the arguments of loads that we are going to transform
13078 // into masked gather load intrinsic.
13079 // All the scalar GEPs will be removed as a result of vectorization.
13080 // For any external uses of some lanes extract element instructions will
13081 // be generated (which cost is estimated separately).
13082 TTI::PointersChainInfo PtrsInfo =
13083 all_of(Ptrs,
13084 [](const Value *V) {
13085 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13086 return Ptr && !Ptr->hasAllConstantIndices();
13087 })
13088 ? TTI::PointersChainInfo::getUnknownStride()
13089 : TTI::PointersChainInfo::getKnownStride();
13090
13091 ScalarCost =
13092 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
13093 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
13094 if (!BaseGEP) {
13095 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
13096 if (It != Ptrs.end())
13097 BaseGEP = cast<GEPOperator>(*It);
13098 }
13099 if (BaseGEP) {
13100 SmallVector<const Value *> Indices(BaseGEP->indices());
13101 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
13102 BaseGEP->getPointerOperand(), Indices, VecTy,
13103 CostKind);
13104 }
13105 }
13106
13107 return std::make_pair(ScalarCost, VecCost);
13108}
13109
13110void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
13111 assert(TE.isGather() && TE.ReorderIndices.empty() &&
13112 "Expected gather node without reordering.");
13113 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
13114 SmallSet<size_t, 2> LoadKeyUsed;
13115
13116 // Do not reorder nodes if the node is small (just 2 elements), all-constant,
13117 // or all instructions already have the same opcode.
13118 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
13119 all_of(TE.Scalars, isConstant))
13120 return;
13121
13122 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
13123 return VectorizableTree[Idx]->isSame(TE.Scalars);
13124 }))
13125 return;
13126
13127 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
13128 Key = hash_combine(hash_value(LI->getParent()), Key);
13129 Value *Ptr =
13130 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
13131 if (LoadKeyUsed.contains(Key)) {
13132 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
13133 if (LIt != LoadsMap.end()) {
13134 for (LoadInst *RLI : LIt->second) {
13135 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
13136 LI->getType(), LI->getPointerOperand(), *DL, *SE,
13137 /*StrictCheck=*/true))
13138 return hash_value(RLI->getPointerOperand());
13139 }
13140 for (LoadInst *RLI : LIt->second) {
13141 if (arePointersCompatible(RLI->getPointerOperand(),
13142 LI->getPointerOperand(), *TLI)) {
13143 hash_code SubKey = hash_value(RLI->getPointerOperand());
13144 return SubKey;
13145 }
13146 }
13147 if (LIt->second.size() > 2) {
13148 hash_code SubKey =
13149 hash_value(LIt->second.back()->getPointerOperand());
13150 return SubKey;
13151 }
13152 }
13153 }
13154 LoadKeyUsed.insert(Key);
13155 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
13156 return hash_value(LI->getPointerOperand());
13157 };
13158 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
13159 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
13160 bool IsOrdered = true;
13161 unsigned NumInstructions = 0;
13162 // Try to "cluster" scalar instructions, to be able to build extra vectorized
13163 // nodes.
13164 for (auto [I, V] : enumerate(TE.Scalars)) {
13165 size_t Key = 1, Idx = 1;
13166 if (auto *Inst = dyn_cast<Instruction>(V);
13168 !isDeleted(Inst) && !isVectorized(V)) {
13169 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
13170 /*AllowAlternate=*/false);
13171 ++NumInstructions;
13172 }
13173 auto &Container = SortedValues[Key];
13174 if (IsOrdered && !KeyToIndex.contains(V) &&
13177 ((Container.contains(Idx) &&
13178 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
13179 (!Container.empty() && !Container.contains(Idx) &&
13180 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
13181 IsOrdered = false;
13182 auto &KTI = KeyToIndex[V];
13183 if (KTI.empty())
13184 Container[Idx].push_back(V);
13185 KTI.push_back(I);
13186 }
13187 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
13188 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13189 if (!IsOrdered && NumInstructions > 1) {
13190 unsigned Cnt = 0;
13191 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
13192 for (const auto &D : SortedValues) {
13193 for (const auto &P : D.second) {
13194 unsigned Sz = 0;
13195 for (Value *V : P.second) {
13196 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
13197 for (auto [K, Idx] : enumerate(Indices)) {
13198 TE.ReorderIndices[Cnt + K] = Idx;
13199 TE.Scalars[Cnt + K] = V;
13200 }
13201 Sz += Indices.size();
13202 Cnt += Indices.size();
13203 }
13204 if (Sz > 1 && isa<Instruction>(P.second.front())) {
13205 const unsigned SubVF = getFloorFullVectorNumberOfElements(
13206 *TTI, TE.Scalars.front()->getType(), Sz);
13207 SubVectors.emplace_back(Cnt - Sz, SubVF);
13208 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
13209 DemandedElts.clearBit(I);
13210 } else if (!P.second.empty() && isConstant(P.second.front())) {
13211 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
13212 DemandedElts.clearBit(I);
13213 }
13214 }
13215 }
13216 }
13217 // Reuses always require shuffles, so consider it as profitable.
13218 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
13219 return;
13220 // Do simple cost estimation.
13221 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13222 InstructionCost Cost = TTI::TCC_Free;
13223 auto *ScalarTy = TE.Scalars.front()->getType();
13224 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
13225 for (auto [Idx, Sz] : SubVectors) {
13226 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
13227 Idx, getWidenedType(ScalarTy, Sz));
13228 }
13229 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13230 /*Insert=*/true,
13231 /*Extract=*/false, CostKind);
13232 int Sz = TE.Scalars.size();
13233 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
13234 TE.ReorderIndices.end());
13235 for (unsigned I : seq<unsigned>(Sz)) {
13236 Value *V = TE.getOrdered(I);
13237 if (isa<PoisonValue>(V)) {
13238 ReorderMask[I] = PoisonMaskElem;
13239 } else if (isConstant(V) || DemandedElts[I]) {
13240 ReorderMask[I] = I + TE.ReorderIndices.size();
13241 }
13242 }
13243 Cost += ::getShuffleCost(*TTI,
13244 any_of(ReorderMask, [&](int I) { return I >= Sz; })
13245 ? TTI::SK_PermuteTwoSrc
13246 : TTI::SK_PermuteSingleSrc,
13247 VecTy, ReorderMask);
13248 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13249 ReorderMask.assign(Sz, PoisonMaskElem);
13250 for (unsigned I : seq<unsigned>(Sz)) {
13251 Value *V = TE.getOrdered(I);
13252 if (isConstant(V)) {
13253 DemandedElts.clearBit(I);
13254 if (!isa<PoisonValue>(V))
13255 ReorderMask[I] = I;
13256 } else {
13257 ReorderMask[I] = I + Sz;
13258 }
13259 }
13260 InstructionCost BVCost =
13261 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13262 /*Insert=*/true, /*Extract=*/false, CostKind);
13263 if (!DemandedElts.isAllOnes())
13264 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
13265 if (Cost >= BVCost) {
13266 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
13267 reorderScalars(TE.Scalars, Mask);
13268 TE.ReorderIndices.clear();
13269 }
13270}
13271
13272/// Check if we can convert fadd/fsub sequence to FMAD.
13273/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
13274 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
13275 const InstructionsState &S,
13276 DominatorTree &DT, const DataLayout &DL,
13277 TargetTransformInfo &TTI,
13278 const TargetLibraryInfo &TLI) {
13279 assert(all_of(VL,
13280 [](Value *V) {
13281 return V->getType()->getScalarType()->isFloatingPointTy();
13282 }) &&
13283 "Can only convert to FMA for floating point types");
13284 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
13285
13286 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
13287 FastMathFlags FMF;
13288 FMF.set();
13289 for (Value *V : VL) {
13290 auto *I = dyn_cast<Instruction>(V);
13291 if (!I)
13292 continue;
13293 if (S.isCopyableElement(I))
13294 continue;
13295 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
13296 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
13297 continue;
13298 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13299 FMF &= FPCI->getFastMathFlags();
13300 }
13301 return FMF.allowContract();
13302 };
13303 if (!CheckForContractable(VL))
13304 return InstructionCost::getInvalid();
13305 // fmul should also be contractable.
13306 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
13307 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
13308
13309 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
13310 if (!OpS.valid())
13311 return InstructionCost::getInvalid();
13312
13313 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13314 return InstructionCost::getInvalid();
13315 if (!CheckForContractable(Operands.front()))
13316 return InstructionCost::getInvalid();
13317 // Compare the costs.
13318 InstructionCost FMulPlusFAddCost = 0;
13319 InstructionCost FMACost = 0;
13320 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13321 FastMathFlags FMF;
13322 FMF.set();
13323 for (Value *V : VL) {
13324 auto *I = dyn_cast<Instruction>(V);
13325 if (!I)
13326 continue;
13327 if (!S.isCopyableElement(I))
13328 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13329 FMF &= FPCI->getFastMathFlags();
13330 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13331 }
13332 unsigned NumOps = 0;
13333 for (auto [V, Op] : zip(VL, Operands.front())) {
13334 if (S.isCopyableElement(V))
13335 continue;
13336 auto *I = dyn_cast<Instruction>(Op);
13337 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13338 if (auto *OpI = dyn_cast<Instruction>(V))
13339 FMACost += TTI.getInstructionCost(OpI, CostKind);
13340 if (I)
13341 FMACost += TTI.getInstructionCost(I, CostKind);
13342 continue;
13343 }
13344 ++NumOps;
13345 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13346 FMF &= FPCI->getFastMathFlags();
13347 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13348 }
13349 Type *Ty = VL.front()->getType();
13350 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13351 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13352 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13353}
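// Illustrative sketch (simplified): a contractable sequence such as
// \code
//   %m = fmul contract float %a, %b
//   %s = fadd contract float %m, %c
// \endcode
// where the fmul has a single use is a candidate for @llvm.fmuladd; the
// conversion is reported only when the accumulated fmuladd cost stays below
// the cost of the separate fmul and fadd instructions.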
13354
13355bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
13356 bool &IsBSwap) const {
13357 assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
13358 "Expected Shl node.");
13359 IsBSwap = false;
13360 if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
13361 !TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
13362 any_of(TE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
13363 return false;
13364 Type *ScalarTy = TE.getMainOp()->getType();
13365 // TODO: Check if same can be done for the vector types.
13366 if (!ScalarTy->isIntegerTy())
13367 return false;
13368 if (ScalarTy->isVectorTy())
13369 return false;
13370 const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
13371 if (!isPowerOf2_64(Sz))
13372 return false;
13373 const TreeEntry *LhsTE = getOperandEntry(&TE, /*Idx=*/0);
13374 const TreeEntry *RhsTE = getOperandEntry(&TE, /*Idx=*/1);
13375 // Lhs should be zext i<stride> to I<sz>.
13376 if (!(LhsTE->State == TreeEntry::Vectorize &&
13377 LhsTE->getOpcode() == Instruction::ZExt &&
13378 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
13379 !MinBWs.contains(LhsTE) &&
13380 all_of(LhsTE->Scalars, [](Value *V) { return V->hasOneUse(); })))
13381 return false;
13382 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
13383 unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
13384 if (!isPowerOf2_64(Stride) || Stride >= Sz)
13385 return false;
13386 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
13387 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
13388 return false;
13389 Order.clear();
13390 unsigned CurrentValue = 0;
13391 // Rhs should be (0, Stride, 2 * Stride, ..., Sz-Stride).
13392 if (all_of(RhsTE->Scalars,
13393 [&](Value *V) {
13394 CurrentValue += Stride;
13395 if (isa<UndefValue>(V))
13396 return true;
13397 auto *C = dyn_cast<Constant>(V);
13398 if (!C)
13399 return false;
13400 return C->getUniqueInteger() == CurrentValue - Stride;
13401 }) &&
13402 CurrentValue == Sz) {
13403 Order.clear();
13404 } else {
13405 const unsigned VF = RhsTE->getVectorFactor();
13406 Order.assign(VF, VF);
13407 // Check if we need to reorder Rhs to put it in the form (0, Stride,
13408 // 2 * Stride, ..., Sz-Stride).
13409 if (VF * Stride != Sz)
13410 return false;
13411 for (const auto [Idx, V] : enumerate(RhsTE->Scalars)) {
13412 if (isa<UndefValue>(V))
13413 continue;
13414 auto *C = dyn_cast<Constant>(V);
13415 if (!C)
13416 return false;
13417 const APInt &Val = C->getUniqueInteger();
13418 if (Val.isNegative() || Val.uge(Sz) || Val.getZExtValue() % Stride != 0)
13419 return false;
13420 unsigned Pos = Val.getZExtValue() / Stride;
13421 // TODO: Support Pos >= VF, in this case need to shift the final value.
13422 if (Order[Idx] != VF || Pos >= VF)
13423 return false;
13424 Order[Idx] = Pos;
13425 }
13426 // One of the indices is not set - exit.
13427 if (is_contained(Order, VF))
13428 return false;
13429 }
13430 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13431 FastMathFlags FMF;
13432 SmallPtrSet<Value *, 4> CheckedExtracts;
13433 auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
13434 auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
13435 TTI::CastContextHint CastCtx =
13436 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
13437 InstructionCost VecCost =
13438 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind) +
13439 TTI->getArithmeticInstrCost(Instruction::Shl, VecTy, CostKind,
13440 getOperandInfo(LhsTE->Scalars)) +
13441 TTI->getCastInstrCost(
13442 Instruction::ZExt, VecTy,
13443 getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()), CastCtx,
13444 CostKind);
13445 InstructionCost BitcastCost = TTI->getCastInstrCost(
13446 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx, CostKind);
13447 if (!Order.empty()) {
13448 fixupOrderingIndices(Order);
13449 SmallVector<int> Mask;
13450 inversePermutation(Order, Mask);
13451 BitcastCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, SrcVecTy,
13452 Mask, CostKind);
13453 }
13454 // Check if the combination can be modeled as a bitcast+byteswap operation.
13455 constexpr unsigned ByteSize = 8;
13456 if (!Order.empty() && isReverseOrder(Order) &&
13457 DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
13458 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, ScalarTy, {ScalarTy});
13459 InstructionCost BSwapCost =
13460 TTI->getCastInstrCost(Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx,
13461 CostKind) +
13462 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
13463 if (BSwapCost <= BitcastCost) {
13464 BitcastCost = BSwapCost;
13465 IsBSwap = true;
13466 }
13467 }
13468 return BitcastCost < VecCost;
13469}
13470
13473 BaseGraphSize = VectorizableTree.size();
13474 // Turn graph-transforming mode on, and turn it off again when done.
13475 class GraphTransformModeRAAI {
13476 bool &SavedIsGraphTransformMode;
13477
13478 public:
13479 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13480 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13481 IsGraphTransformMode = true;
13482 }
13483 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13484 } TransformContext(IsGraphTransformMode);
13485 // Operands are profitable if they are:
13486 // 1. At least one of them is a constant
13487 // or
13488 // 2. Splats
13489 // or
13490 // 3. Result in a good vectorization opportunity, i.e. may generate vector
13491 // nodes and reduce the cost of the graph.
13492 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13493 const InstructionsState &S) {
13494 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13495 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13496 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13497 I2->getOperand(Op));
13498 return all_of(
13499 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13500 return all_of(Cand,
13501 [](const std::pair<Value *, Value *> &P) {
13502 return isa<Constant>(P.first) ||
13503 isa<Constant>(P.second) || P.first == P.second;
13504 }) ||
13505 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
13506 });
13507 };
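// A pair of instructions is considered to have profitable operands either
// when every operand pair is trivially vectorizable (a constant or identical
// values) or when the remaining pairs are still expected to form a good
// vectorization opportunity (the final disjunct above).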
13508
13509 // Try to reorder gather nodes for better vectorization opportunities.
13510 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13511 TreeEntry &E = *VectorizableTree[Idx];
13512 if (E.isGather())
13513 reorderGatherNode(E);
13514 }
13515
13516 // Better to use the full gathered-loads analysis, if there are only 2
13517 // gathered load nodes, each having fewer than 16 elements.
13518 constexpr unsigned VFLimit = 16;
13519 bool ForceLoadGather =
13520 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13521 return TE->isGather() && TE->hasState() &&
13522 TE->getOpcode() == Instruction::Load &&
13523 TE->getVectorFactor() < VFLimit;
13524 }) == 2;
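// With exactly two small gathered load nodes it is usually better to let the
// dedicated gathered-loads analysis at the end of this function handle them as
// a whole, so the per-node transformation below is skipped for such load nodes
// when ForceLoadGather is set.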
13525
13526 // Checks if the scalars are used in other node.
13527 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13528 function_ref<bool(Value *)> CheckContainer) {
13529 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13530 if (isa<PoisonValue>(V))
13531 return true;
13532 auto *I = dyn_cast<Instruction>(V);
13533 if (!I)
13534 return false;
13535 return is_contained(TE->Scalars, I) || CheckContainer(I);
13536 });
13537 };
13538 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13539 if (E.hasState()) {
13540 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13541 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13542 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13543 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13544 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13545 return is_contained(TEs, TE);
13546 });
13547 });
13548 }))
13549 return true;
13551 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13552 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13553 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13554 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13555 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13556 return is_contained(TEs, TE);
13557 });
13558 });
13559 }))
13560 return true;
13561 } else {
13562 // Check if the gather node is a full copy of a split node.
13563 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13564 if (It != E.Scalars.end()) {
13565 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13566 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13567 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13568 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13569 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13570 return is_contained(TEs, TE);
13571 });
13572 });
13573 }))
13574 return true;
13575 }
13576 }
13577 return false;
13578 };
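// A gather node whose scalars are already fully covered by an existing
// vectorized or split node is not worth re-vectorizing on its own; it is
// expected to be materialized as a shuffle of that node, so such copies are
// skipped in the loop below.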
13579 // The tree may grow here, so iterate only over the nodes built before this point.
13580 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13581 TreeEntry &E = *VectorizableTree[Idx];
13582 if (E.isGather()) {
13583 ArrayRef<Value *> VL = E.Scalars;
13584 const unsigned Sz = getVectorElementSize(VL.front());
13585 unsigned MinVF = getMinVF(2 * Sz);
13586 // Do not try partial vectorization for small nodes (<= 2 elements), for nodes
13587 // with the same opcode in the same parent block, or for all-constant nodes.
13588 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13589 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13590 // We use allSameOpcode instead of isAltShuffle because we don't
13591 // want to use interchangeable instructions here.
13592 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13593 allConstant(VL) || isSplat(VL))
13594 continue;
13595 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13596 continue;
13597 // Check if the node is a copy of other vector nodes.
13598 if (CheckForSameVectorNodes(E))
13599 continue;
13600 // Try to find vectorizable sequences and transform them into a series of
13601 // insertvector instructions.
13602 unsigned StartIdx = 0;
13603 unsigned End = VL.size();
13604 SmallBitVector Processed(End);
13605 for (unsigned VF = getFloorFullVectorNumberOfElements(
13606 *TTI, VL.front()->getType(), VL.size() - 1);
13607 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13608 *TTI, VL.front()->getType(), VF - 1)) {
13609 if (StartIdx + VF > End)
13610 continue;
13611 SmallVector<std::pair<unsigned, unsigned>> Slices;
13612 bool AllStrided = true;
13613 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13614 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13615 // If any instruction is vectorized already - do not try again.
13616 // Reuse the existing node, if it fully matches the slice.
13617 if ((Processed.test(Cnt) || isVectorized(Slice.front())) &&
13618 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13619 continue;
13620 // Constant already handled effectively - skip.
13621 if (allConstant(Slice))
13622 continue;
13623 // Do not try to vectorize small splats (smaller than a vector register and
13624 // with only a single non-undef element).
13625 bool IsSplat = isSplat(Slice);
13626 bool IsTwoRegisterSplat = true;
13627 if (IsSplat && VF == 2) {
13628 unsigned NumRegs2VF = ::getNumberOfParts(
13629 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13630 IsTwoRegisterSplat = NumRegs2VF == 2;
13631 }
13632 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13633 count(Slice, Slice.front()) ==
13634 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13635 : 1)) {
13636 if (IsSplat)
13637 continue;
13638 InstructionsState S = getSameOpcode(Slice, *TLI);
13639 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13640 (S.getOpcode() == Instruction::Load &&
13642 (S.getOpcode() != Instruction::Load &&
13643 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13644 continue;
13645 if (VF == 2) {
13646 // Try to vectorize reduced values or if all users are vectorized.
13647 // For expensive instructions extra extracts might be profitable.
13648 if ((!UserIgnoreList || E.Idx != 0) &&
13649 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13651 !all_of(Slice, [&](Value *V) {
13652 if (isa<PoisonValue>(V))
13653 return true;
13654 return areAllUsersVectorized(cast<Instruction>(V),
13655 UserIgnoreList);
13656 }))
13657 continue;
13658 if (S.getOpcode() == Instruction::Load) {
13659 OrdersType Order;
13660 SmallVector<Value *> PointerOps;
13661 StridedPtrInfo SPtrInfo;
13662 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13663 PointerOps, SPtrInfo);
13664 AllStrided &= Res == LoadsState::StridedVectorize ||
13666 Res == LoadsState::Gather;
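// AllStrided tracks whether every load slice in this pass ends up strided or
// gathered rather than a plain consecutive load; for VF == 2 with more than 2
// such slices the collected slices are discarded below in favor of the
// gathered-loads analysis.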
13667 // Do not vectorize gathers.
13668 if (Res == LoadsState::ScatterVectorize ||
13669 Res == LoadsState::Gather) {
13670 if (Res == LoadsState::Gather) {
13672 // If this is a reduction and the scalars from the root node are
13673 // being analyzed - mark them as a non-vectorizable reduction.
13674 if (UserIgnoreList && E.Idx == 0)
13675 analyzedReductionVals(Slice);
13676 }
13677 continue;
13678 }
13679 } else if (S.getOpcode() == Instruction::ExtractElement ||
13680 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13682 !CheckOperandsProfitability(
13683 S.getMainOp(),
13686 S))) {
13687 // Do not vectorize extractelements (handled effectively
13688 // already). Do not vectorize non-profitable instructions (with
13689 // low cost and non-vectorizable operands).
13690 continue;
13691 }
13692 }
13693 }
13694 Slices.emplace_back(Cnt, Slice.size());
13695 }
13696 // Do not try to vectorize if all slices are strided or gathered with
13697 // vector factor 2 and there are more than 2 slices. It is better to handle
13698 // them in the gathered-loads analysis, which may yield better vectorization.
13699 if (VF == 2 && AllStrided && Slices.size() > 2)
13700 continue;
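// AddCombinedNode (below) records a successfully vectorized slice as a
// combined sub-entry of the gather node E at offset Cnt, marks its lanes as
// processed, and shrinks the [StartIdx, End) window from whichever side the
// slice touches.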
13701 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13702 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13703 Processed.set(Cnt, Cnt + Sz);
13704 if (StartIdx == Cnt)
13705 StartIdx = Cnt + Sz;
13706 if (End == Cnt + Sz)
13707 End = Cnt;
13708 };
13709 for (auto [Cnt, Sz] : Slices) {
13710 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13711 const TreeEntry *SameTE = nullptr;
13712 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13713 It != Slice.end()) {
13714 // If any instruction is vectorized already - do not try again.
13715 SameTE = getSameValuesTreeEntry(*It, Slice);
13716 }
13717 unsigned PrevSize = VectorizableTree.size();
13718 [[maybe_unused]] unsigned PrevEntriesSize =
13719 LoadEntriesToVectorize.size();
13720 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13721 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13722 VectorizableTree[PrevSize]->isGather() &&
13723 VectorizableTree[PrevSize]->hasState() &&
13724 VectorizableTree[PrevSize]->getOpcode() !=
13725 Instruction::ExtractElement &&
13726 !isSplat(Slice)) {
13727 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13728 analyzedReductionVals(Slice);
13729 VectorizableTree.pop_back();
13730 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13731 "LoadEntriesToVectorize expected to remain the same");
13732 continue;
13733 }
13734 AddCombinedNode(PrevSize, Cnt, Sz);
13735 }
13736 }
13737 // Restore ordering, if no extra vectorization happened.
13738 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13739 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13740 reorderScalars(E.Scalars, Mask);
13741 E.ReorderIndices.clear();
13742 }
13743 }
13744 if (!E.hasState())
13745 continue;
13746 switch (E.getOpcode()) {
13747 case Instruction::Load: {
13748 // No need to reorder masked gather loads, just reorder the scalar
13749 // operands.
13750 if (E.State != TreeEntry::Vectorize)
13751 break;
13752 Type *ScalarTy = E.getMainOp()->getType();
13753 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13754 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13755 // Check if profitable to represent consecutive load + reverse as strided
13756 // load with stride -1.
13757 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13758 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13759 SmallVector<int> Mask;
13760 inversePermutation(E.ReorderIndices, Mask);
13761 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13762 InstructionCost OriginalVecCost =
13763 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13764 BaseLI->getPointerAddressSpace(), CostKind,
13766 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13767 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13768 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13769 VecTy, BaseLI->getPointerOperand(),
13770 /*VariableMask=*/false, CommonAlignment,
13771 BaseLI),
13772 CostKind);
13773 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13774 // Strided load is more profitable than consecutive load + reverse -
13775 // transform the node to strided load.
13776 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13777 ->getPointerOperand()
13778 ->getType());
13779 StridedPtrInfo SPtrInfo;
13780 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13781 SPtrInfo.Ty = VecTy;
13782 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13783 E.State = TreeEntry::StridedVectorize;
13784 }
13785 }
13786 break;
13787 }
13788 case Instruction::Store: {
13789 Type *ScalarTy =
13790 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13791 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13792 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13793 // Check if profitable to represent consecutive store + reverse as strided
13794 // store with stride -1.
13795 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13796 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13797 SmallVector<int> Mask;
13798 inversePermutation(E.ReorderIndices, Mask);
13799 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13800 InstructionCost OriginalVecCost =
13801 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13802 BaseSI->getPointerAddressSpace(), CostKind,
13804 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13805 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13806 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13807 VecTy, BaseSI->getPointerOperand(),
13808 /*VariableMask=*/false, CommonAlignment,
13809 BaseSI),
13810 CostKind);
13811 if (StridedCost < OriginalVecCost)
13812 // Strided store is more profitable than reverse + consecutive store -
13813 // transform the node to strided store.
13814 E.State = TreeEntry::StridedVectorize;
13815 } else if (!E.ReorderIndices.empty()) {
13816 // Check for interleaved stores.
13817 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13818 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13819 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13820 if (Mask.size() < 4)
13821 return 0u;
13822 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13824 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13825 TTI.isLegalInterleavedAccessType(
13826 VecTy, Factor, BaseSI->getAlign(),
13827 BaseSI->getPointerAddressSpace()))
13828 return Factor;
13829 }
13830
13831 return 0u;
13832 };
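// For example, with Factor == 2 an 8-element reorder mask of
// <0, 4, 1, 5, 2, 6, 3, 7> interleaves two logical 4-element sub-vectors; if
// the target reports such an interleaved access as legal, the node is marked
// interleaved below and later emitted as an interleaved store.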
13833 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13834 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13835 if (InterleaveFactor != 0)
13836 E.setInterleave(InterleaveFactor);
13837 }
13838 break;
13839 }
13840 case Instruction::Select: {
13841 if (E.State != TreeEntry::Vectorize)
13842 break;
13843 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13844 if (MinMaxID == Intrinsic::not_intrinsic)
13845 break;
13846 // This node is a minmax node.
13847 E.CombinedOp = TreeEntry::MinMax;
13848 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13849 if (SelectOnly && CondEntry->UserTreeIndex &&
13850 CondEntry->State == TreeEntry::Vectorize) {
13851 // The condition node is part of the combined minmax node.
13852 CondEntry->State = TreeEntry::CombinedVectorize;
13853 }
13854 break;
13855 }
13856 case Instruction::FSub:
13857 case Instruction::FAdd: {
13858 // Check if possible to convert (a*b)+c to fma.
13859 if (E.State != TreeEntry::Vectorize ||
13860 !E.getOperations().isAddSubLikeOp())
13861 break;
13862 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13863 .isValid())
13864 break;
13865 // This node is a fmuladd node.
13866 E.CombinedOp = TreeEntry::FMulAdd;
13867 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13868 if (FMulEntry->UserTreeIndex &&
13869 FMulEntry->State == TreeEntry::Vectorize) {
13870 // The FMul node is part of the combined fmuladd node.
13871 FMulEntry->State = TreeEntry::CombinedVectorize;
13872 }
13873 break;
13874 }
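// The Shl case below recognizes the "pack narrow values into a wide integer"
// idiom: a shl root fed by zexts whose results are combined via disjoint 'or'
// reductions. matchesShlZExt (defined earlier) performs the structural match
// and the cost comparison that decides whether to lower it as a plain bitcast
// of the narrow source vector, optionally combined with a bswap.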
13875 case Instruction::Shl: {
13876 if (E.Idx != 0)
13877 break;
13878 if (!UserIgnoreList)
13879 break;
13880 // Check that all reduction operands are disjoint-or instructions.
13881 if (any_of(*UserIgnoreList, [](Value *V) {
13882 return !match(V, m_DisjointOr(m_Value(), m_Value()));
13883 }))
13884 break;
13885 OrdersType Order;
13886 bool IsBSwap;
13887 if (!matchesShlZExt(E, Order, IsBSwap))
13888 break;
13889 // This node is a (reduced disjoint or) bitcast node.
13890 TreeEntry::CombinedOpcode Code =
13891 IsBSwap ? TreeEntry::ReducedBitcastBSwap : TreeEntry::ReducedBitcast;
13892 E.CombinedOp = Code;
13893 if (!IsBSwap)
13894 E.ReorderIndices = std::move(Order);
13895 TreeEntry *ZExtEntry = getOperandEntry(&E, 0);
13896 assert(ZExtEntry->UserTreeIndex &&
13897 ZExtEntry->State == TreeEntry::Vectorize &&
13898 ZExtEntry->getOpcode() == Instruction::ZExt &&
13899 "Expected ZExt node.");
13900 // The ZExt node is part of the combined node.
13901 ZExtEntry->State = TreeEntry::CombinedVectorize;
13902 ZExtEntry->CombinedOp = Code;
13903 TreeEntry *ConstEntry = getOperandEntry(&E, 1);
13904 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
13905 "Expected ZExt node.");
13906 // The ConstNode node is part of the combined node.
13907 ConstEntry->State = TreeEntry::CombinedVectorize;
13908 ConstEntry->CombinedOp = Code;
13909 break;
13910 }
13911 default:
13912 break;
13913 }
13914 }
13915
13916 if (LoadEntriesToVectorize.empty()) {
13917 // Single load node - exit.
13918 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13919 VectorizableTree.front()->getOpcode() == Instruction::Load)
13920 return;
13921 // Small graph with small VF - exit.
13922 constexpr unsigned SmallTree = 3;
13923 constexpr unsigned SmallVF = 2;
13924 if ((VectorizableTree.size() <= SmallTree &&
13925 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13926 (VectorizableTree.size() <= 2 && UserIgnoreList))
13927 return;
13928
13929 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13930 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13931 getCanonicalGraphSize() <= SmallTree &&
13932 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13933 [](const std::unique_ptr<TreeEntry> &TE) {
13934 return TE->isGather() && TE->hasState() &&
13935 TE->getOpcode() == Instruction::Load &&
13936 !allSameBlock(TE->Scalars);
13937 }) == 1)
13938 return;
13939 }
13940
13941 // A list of loads to be gathered during the vectorization process. We can
13942 // try to vectorize them at the end, if profitable.
13943 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13945 GatheredLoads;
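// Candidate loads are grouped by (parent block, underlying pointer object,
// load type) so that only loads which could plausibly form a consecutive or
// strided sequence are analyzed together.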
13946
13947 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13948 TreeEntry &E = *TE;
13949 if (E.isGather() &&
13950 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13951 (!E.hasState() && any_of(E.Scalars,
13952 [&](Value *V) {
13953 return isa<LoadInst>(V) &&
13954 !isVectorized(V) &&
13955 !isDeleted(cast<Instruction>(V));
13956 }))) &&
13957 !isSplat(E.Scalars)) {
13958 for (Value *V : E.Scalars) {
13959 auto *LI = dyn_cast<LoadInst>(V);
13960 if (!LI)
13961 continue;
13962 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13963 continue;
13965 *this, V, *DL, *SE, *TTI,
13966 GatheredLoads[std::make_tuple(
13967 LI->getParent(),
13968 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13969 LI->getType())]);
13970 }
13971 }
13972 }
13973 // Try to vectorize gathered loads if this is not just a gather of loads.
13974 if (!GatheredLoads.empty())
13975 tryToVectorizeGatheredLoads(GatheredLoads);
13976}
13977
13978 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13979 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
13980 /// emission: the actual shuffle instruction is generated only if it is really
13981 /// required. Otherwise, emission is delayed till the end of the process, to
13982 /// reduce the number of emitted instructions and to enable further
13983 /// analysis/transformations.
13984class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13985 bool IsFinalized = false;
13986 SmallVector<int> CommonMask;
13988 const TargetTransformInfo &TTI;
13989 InstructionCost Cost = 0;
13990 SmallDenseSet<Value *> VectorizedVals;
13991 BoUpSLP &R;
13992 SmallPtrSetImpl<Value *> &CheckedExtracts;
13993 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13994 /// While set, we are still trying to estimate the cost for the same nodes and
13995 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13996 /// This may help to better estimate the cost if the same nodes must be permuted
13997 /// and allows moving most of the long shuffle cost estimation to TTI.
13998 bool SameNodesEstimated = true;
13999
14000 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
14001 if (Ty->getScalarType()->isPointerTy()) {
14004 IntegerType::get(Ty->getContext(),
14005 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
14006 Ty->getScalarType());
14007 if (auto *VTy = dyn_cast<VectorType>(Ty))
14008 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
14009 return Res;
14010 }
14011 return Constant::getAllOnesValue(Ty);
14012 }
14013
14014 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
14015 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
14016 return TTI::TCC_Free;
14017 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14018 InstructionCost GatherCost = 0;
14019 SmallVector<Value *> Gathers(VL);
14020 if (!Root && isSplat(VL)) {
14021 // Found the broadcasting of the single scalar, calculate the cost as
14022 // the broadcast.
14023 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
14024 assert(It != VL.end() && "Expected at least one non-undef value.");
14025 // Add broadcast for non-identity shuffle only.
14026 bool NeedShuffle =
14027 count(VL, *It) > 1 &&
14028 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
14029 if (!NeedShuffle) {
14030 if (isa<FixedVectorType>(ScalarTy)) {
14031 assert(SLPReVec && "FixedVectorType is not expected.");
14032 return TTI.getShuffleCost(
14033 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
14034 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
14035 cast<FixedVectorType>(ScalarTy));
14036 }
14037 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
14038 CostKind, std::distance(VL.begin(), It),
14039 PoisonValue::get(VecTy), *It);
14040 }
14041
14042 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
14043 transform(VL, ShuffleMask.begin(), [](Value *V) {
14044 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
14045 });
14046 InstructionCost InsertCost =
14047 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
14048 PoisonValue::get(VecTy), *It);
14049 return InsertCost + ::getShuffleCost(TTI,
14051 VecTy, ShuffleMask, CostKind,
14052 /*Index=*/0, /*SubTp=*/nullptr,
14053 /*Args=*/*It);
14054 }
14055 return GatherCost +
14056 (all_of(Gathers, IsaPred<UndefValue>)
14058 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
14059 ScalarTy));
14060 };
14061
14062 /// Compute the cost of creating a vector containing the extracted values from
14063 /// \p VL.
14065 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
14066 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14067 unsigned NumParts) {
14068 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
14069 unsigned NumElts =
14070 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
14071 auto *EE = dyn_cast<ExtractElementInst>(V);
14072 if (!EE)
14073 return Sz;
14074 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
14075 if (!VecTy)
14076 return Sz;
14077 return std::max(Sz, VecTy->getNumElements());
14078 });
14079 // FIXME: this must be moved to TTI for better estimation.
14080 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
14081 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
14083 SmallVectorImpl<unsigned> &SubVecSizes)
14084 -> std::optional<TTI::ShuffleKind> {
14085 if (NumElts <= EltsPerVector)
14086 return std::nullopt;
14087 int OffsetReg0 =
14088 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
14089 [](int S, int I) {
14090 if (I == PoisonMaskElem)
14091 return S;
14092 return std::min(S, I);
14093 }),
14094 EltsPerVector);
14095 int OffsetReg1 = OffsetReg0;
14096 DenseSet<int> RegIndices;
14097 // Check if we are trying to permute the same single/2 input vectors.
14099 int FirstRegId = -1;
14100 Indices.assign(1, OffsetReg0);
14101 for (auto [Pos, I] : enumerate(Mask)) {
14102 if (I == PoisonMaskElem)
14103 continue;
14104 int Idx = I - OffsetReg0;
14105 int RegId =
14106 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
14107 if (FirstRegId < 0)
14108 FirstRegId = RegId;
14109 RegIndices.insert(RegId);
14110 if (RegIndices.size() > 2)
14111 return std::nullopt;
14112 if (RegIndices.size() == 2) {
14113 ShuffleKind = TTI::SK_PermuteTwoSrc;
14114 if (Indices.size() == 1) {
14115 OffsetReg1 = alignDown(
14116 std::accumulate(
14117 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
14118 [&](int S, int I) {
14119 if (I == PoisonMaskElem)
14120 return S;
14121 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
14122 ((I - OffsetReg0) % NumElts) / EltsPerVector;
14123 if (RegId == FirstRegId)
14124 return S;
14125 return std::min(S, I);
14126 }),
14127 EltsPerVector);
14128 unsigned Index = OffsetReg1 % NumElts;
14129 Indices.push_back(Index);
14130 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
14131 }
14132 Idx = I - OffsetReg1;
14133 }
14134 I = (Idx % NumElts) % EltsPerVector +
14135 (RegId == FirstRegId ? 0 : EltsPerVector);
14136 }
14137 return ShuffleKind;
14138 };
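// CheckPerRegistersShuffle (above) rewrites the per-register sub-mask into
// register-local indices and reports whether the slice reads from one or two
// source registers, recording subvector offsets/sizes that are costed as
// extract-subvector operations further below.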
14139 InstructionCost Cost = 0;
14140
14141 // Process extracts in blocks of EltsPerVector to check if the source vector
14142 // operand can be re-used directly. If not, add the cost of creating a
14143 // shuffle to extract the values into a vector register.
14144 for (unsigned Part : seq<unsigned>(NumParts)) {
14145 if (!ShuffleKinds[Part])
14146 continue;
14147 ArrayRef<int> MaskSlice = Mask.slice(
14148 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
14149 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
14150 copy(MaskSlice, SubMask.begin());
14152 SmallVector<unsigned, 2> SubVecSizes;
14153 std::optional<TTI::ShuffleKind> RegShuffleKind =
14154 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
14155 if (!RegShuffleKind) {
14156 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
14158 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
14159 Cost +=
14160 ::getShuffleCost(TTI, *ShuffleKinds[Part],
14161 getWidenedType(ScalarTy, NumElts), MaskSlice);
14162 continue;
14163 }
14164 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
14165 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
14166 Cost +=
14167 ::getShuffleCost(TTI, *RegShuffleKind,
14168 getWidenedType(ScalarTy, EltsPerVector), SubMask);
14169 }
14170 const unsigned BaseVF = getFullVectorNumberOfElements(
14171 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
14172 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
14173 assert((Idx + SubVecSize) <= BaseVF &&
14174 "SK_ExtractSubvector index out of range");
14176 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
14177 Idx, getWidenedType(ScalarTy, SubVecSize));
14178 }
14179 // Second attempt to check if just a permute has a lower estimated cost than
14180 // the subvector extract.
14181 SubMask.assign(NumElts, PoisonMaskElem);
14182 copy(MaskSlice, SubMask.begin());
14183 InstructionCost OriginalCost = ::getShuffleCost(
14184 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
14185 if (OriginalCost < Cost)
14186 Cost = OriginalCost;
14187 }
14188 return Cost;
14189 }
14190 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
14191 /// mask \p Mask and register number \p Part, which includes \p SliceSize
14192 /// elements.
14193 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
14194 ArrayRef<int> Mask, unsigned Part,
14195 unsigned SliceSize) {
14196 if (SameNodesEstimated) {
14197 // Delay the cost estimation if the same nodes are being reshuffled.
14198 // If we already requested the cost of reshuffling of E1 and E2 before, no
14199 // need to estimate another cost with the sub-Mask, instead include this
14200 // sub-Mask into the CommonMask to estimate it later and avoid double cost
14201 // estimation.
14202 if ((InVectors.size() == 2 &&
14203 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
14204 cast<const TreeEntry *>(InVectors.back()) == E2) ||
14205 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
14206 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
14207 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
14208 [](int Idx) { return Idx == PoisonMaskElem; }) &&
14209 "Expected all poisoned elements.");
14210 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
14211 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
14212 return;
14213 }
14214 // Found non-matching nodes - need to estimate the cost for the matched
14215 // nodes and transform the mask.
14216 Cost += createShuffle(InVectors.front(),
14217 InVectors.size() == 1 ? nullptr : InVectors.back(),
14218 CommonMask);
14219 transformMaskAfterShuffle(CommonMask, CommonMask);
14220 } else if (InVectors.size() == 2) {
14221 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14222 transformMaskAfterShuffle(CommonMask, CommonMask);
14223 }
14224 SameNodesEstimated = false;
14225 if (!E2 && InVectors.size() == 1) {
14226 unsigned VF = E1.getVectorFactor();
14227 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
14228 VF = std::max(VF, getVF(V1));
14229 } else {
14230 const auto *E = cast<const TreeEntry *>(InVectors.front());
14231 VF = std::max(VF, E->getVectorFactor());
14232 }
14233 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14234 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14235 CommonMask[Idx] = Mask[Idx] + VF;
14236 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
14237 transformMaskAfterShuffle(CommonMask, CommonMask);
14238 } else {
14239 auto P = InVectors.front();
14240 Cost += createShuffle(&E1, E2, Mask);
14241 unsigned VF = Mask.size();
14242 if (Value *V1 = dyn_cast<Value *>(P)) {
14243 VF = std::max(VF,
14244 getNumElements(V1->getType()));
14245 } else {
14246 const auto *E = cast<const TreeEntry *>(P);
14247 VF = std::max(VF, E->getVectorFactor());
14248 }
14249 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14250 if (Mask[Idx] != PoisonMaskElem)
14251 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
14252 Cost += createShuffle(P, InVectors.front(), CommonMask);
14253 transformMaskAfterShuffle(CommonMask, CommonMask);
14254 }
14255 }
14256
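// ShuffleCostBuilder plugs the cost model into the shared
// BaseShuffleAnalysis::createShuffle machinery: instead of emitting IR it
// prices the requested shuffles, treating empty/identity masks and
// identity/poison vector creation as free.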
14257 class ShuffleCostBuilder {
14258 const TargetTransformInfo &TTI;
14259
14260 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14261 int Index = -1;
14262 return Mask.empty() ||
14263 (VF == Mask.size() &&
14266 Index == 0);
14267 }
14268
14269 public:
14270 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14271 ~ShuffleCostBuilder() = default;
14272 InstructionCost createShuffleVector(Value *V1, Value *,
14273 ArrayRef<int> Mask) const {
14274 // Empty mask or identity mask are free.
14275 unsigned VF =
14276 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14277 if (isEmptyOrIdentity(Mask, VF))
14278 return TTI::TCC_Free;
14279 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14280 cast<VectorType>(V1->getType()), Mask);
14281 }
14282 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14283 // Empty mask or identity mask are free.
14284 unsigned VF =
14285 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14286 if (isEmptyOrIdentity(Mask, VF))
14287 return TTI::TCC_Free;
14288 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
14289 cast<VectorType>(V1->getType()), Mask);
14290 }
14291 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14292 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14293 return TTI::TCC_Free;
14294 }
14295 void resizeToMatch(Value *&, Value *&) const {}
14296 };
14297
14298 /// Smart shuffle instruction emission, walks through the shuffle trees and
14299 /// tries to find the best matching vector for the actual shuffle
14300 /// instruction.
14302 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14304 ArrayRef<int> Mask) {
14305 ShuffleCostBuilder Builder(TTI);
14306 SmallVector<int> CommonMask(Mask);
14307 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14308 unsigned CommonVF = Mask.size();
14309 InstructionCost ExtraCost = 0;
14310 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14311 unsigned VF) -> InstructionCost {
14312 if (E.isGather() && allConstant(E.Scalars))
14313 return TTI::TCC_Free;
14314 Type *EScalarTy = E.Scalars.front()->getType();
14315 bool IsSigned = true;
14316 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
14317 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
14318 IsSigned = It->second.second;
14319 }
14320 if (EScalarTy != ScalarTy) {
14321 unsigned CastOpcode = Instruction::Trunc;
14322 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14323 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14324 if (DstSz > SrcSz)
14325 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14326 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
14327 getWidenedType(EScalarTy, VF),
14328 TTI::CastContextHint::None, CostKind);
14329 }
14330 return TTI::TCC_Free;
14331 };
14332 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14333 if (isa<Constant>(V))
14334 return TTI::TCC_Free;
14335 auto *VecTy = cast<VectorType>(V->getType());
14336 Type *EScalarTy = VecTy->getElementType();
14337 if (EScalarTy != ScalarTy) {
14338 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
14339 unsigned CastOpcode = Instruction::Trunc;
14340 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14341 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14342 if (DstSz > SrcSz)
14343 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14344 return TTI.getCastInstrCost(
14345 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
14346 VecTy, TTI::CastContextHint::None, CostKind);
14347 }
14348 return TTI::TCC_Free;
14349 };
14350 if (!V1 && !V2 && !P2.isNull()) {
14351 // Shuffle 2 entry nodes.
14352 const TreeEntry *E = cast<const TreeEntry *>(P1);
14353 unsigned VF = E->getVectorFactor();
14354 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14355 CommonVF = std::max(VF, E2->getVectorFactor());
14356 assert(all_of(Mask,
14357 [=](int Idx) {
14358 return Idx < 2 * static_cast<int>(CommonVF);
14359 }) &&
14360 "All elements in mask must be less than 2 * CommonVF.");
14361 if (E->Scalars.size() == E2->Scalars.size()) {
14362 SmallVector<int> EMask = E->getCommonMask();
14363 SmallVector<int> E2Mask = E2->getCommonMask();
14364 if (!EMask.empty() || !E2Mask.empty()) {
14365 for (int &Idx : CommonMask) {
14366 if (Idx == PoisonMaskElem)
14367 continue;
14368 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14369 Idx = EMask[Idx];
14370 else if (Idx >= static_cast<int>(CommonVF))
14371 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14372 E->Scalars.size();
14373 }
14374 }
14375 CommonVF = E->Scalars.size();
14376 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14377 GetNodeMinBWAffectedCost(*E2, CommonVF);
14378 } else {
14379 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14380 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14381 }
14382 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14383 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14384 } else if (!V1 && P2.isNull()) {
14385 // Shuffle single entry node.
14386 const TreeEntry *E = cast<const TreeEntry *>(P1);
14387 unsigned VF = E->getVectorFactor();
14388 CommonVF = VF;
14389 assert(
14390 all_of(Mask,
14391 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14392 "All elements in mask must be less than CommonVF.");
14393 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14394 SmallVector<int> EMask = E->getCommonMask();
14395 assert(!EMask.empty() && "Expected non-empty common mask.");
14396 for (int &Idx : CommonMask) {
14397 if (Idx != PoisonMaskElem)
14398 Idx = EMask[Idx];
14399 }
14400 CommonVF = E->Scalars.size();
14401 } else if (unsigned Factor = E->getInterleaveFactor();
14402 Factor > 0 && E->Scalars.size() != Mask.size() &&
14404 Factor)) {
14405 // Deinterleaved nodes are free.
14406 std::iota(CommonMask.begin(), CommonMask.end(), 0);
14407 }
14408 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14409 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14410 // Not identity/broadcast? Try to see if the original vector is better.
14411 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14412 CommonVF == CommonMask.size() &&
14413 any_of(enumerate(CommonMask),
14414 [](const auto &&P) {
14415 return P.value() != PoisonMaskElem &&
14416 static_cast<unsigned>(P.value()) != P.index();
14417 }) &&
14418 any_of(CommonMask,
14419 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14420 SmallVector<int> ReorderMask;
14421 inversePermutation(E->ReorderIndices, ReorderMask);
14422 ::addMask(CommonMask, ReorderMask);
14423 }
14424 } else if (V1 && P2.isNull()) {
14425 // Shuffle single vector.
14426 ExtraCost += GetValueMinBWAffectedCost(V1);
14427 CommonVF = getVF(V1);
14428 assert(
14429 all_of(Mask,
14430 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14431 "All elements in mask must be less than CommonVF.");
14432 } else if (V1 && !V2) {
14433 // Shuffle vector and tree node.
14434 unsigned VF = getVF(V1);
14435 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14436 CommonVF = std::max(VF, E2->getVectorFactor());
14437 assert(all_of(Mask,
14438 [=](int Idx) {
14439 return Idx < 2 * static_cast<int>(CommonVF);
14440 }) &&
14441 "All elements in mask must be less than 2 * CommonVF.");
14442 if (E2->Scalars.size() == VF && VF != CommonVF) {
14443 SmallVector<int> E2Mask = E2->getCommonMask();
14444 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14445 for (int &Idx : CommonMask) {
14446 if (Idx == PoisonMaskElem)
14447 continue;
14448 if (Idx >= static_cast<int>(CommonVF))
14449 Idx = E2Mask[Idx - CommonVF] + VF;
14450 }
14451 CommonVF = VF;
14452 }
14453 ExtraCost += GetValueMinBWAffectedCost(V1);
14454 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14455 ExtraCost += GetNodeMinBWAffectedCost(
14456 *E2, std::min(CommonVF, E2->getVectorFactor()));
14457 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14458 } else if (!V1 && V2) {
14459 // Shuffle vector and tree node.
14460 unsigned VF = getVF(V2);
14461 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
14462 CommonVF = std::max(VF, E1->getVectorFactor());
14463 assert(all_of(Mask,
14464 [=](int Idx) {
14465 return Idx < 2 * static_cast<int>(CommonVF);
14466 }) &&
14467 "All elements in mask must be less than 2 * CommonVF.");
14468 if (E1->Scalars.size() == VF && VF != CommonVF) {
14469 SmallVector<int> E1Mask = E1->getCommonMask();
14470 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14471 for (int &Idx : CommonMask) {
14472 if (Idx == PoisonMaskElem)
14473 continue;
14474 if (Idx >= static_cast<int>(CommonVF))
14475 Idx = E1Mask[Idx - CommonVF] + VF;
14476 else
14477 Idx = E1Mask[Idx];
14478 }
14479 CommonVF = VF;
14480 }
14481 ExtraCost += GetNodeMinBWAffectedCost(
14482 *E1, std::min(CommonVF, E1->getVectorFactor()));
14483 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14484 ExtraCost += GetValueMinBWAffectedCost(V2);
14485 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14486 } else {
14487 assert(V1 && V2 && "Expected both vectors.");
14488 unsigned VF = getVF(V1);
14489 CommonVF = std::max(VF, getVF(V2));
14490 assert(all_of(Mask,
14491 [=](int Idx) {
14492 return Idx < 2 * static_cast<int>(CommonVF);
14493 }) &&
14494 "All elements in mask must be less than 2 * CommonVF.");
14495 ExtraCost +=
14496 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14497 if (V1->getType() != V2->getType()) {
14498 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14499 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14500 } else {
14501 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14502 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14503 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14504 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14505 }
14506 }
14507 InVectors.front() =
14508 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14509 if (InVectors.size() == 2)
14510 InVectors.pop_back();
14511 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14512 V1, V2, CommonMask, Builder, ScalarTy);
14513 }
14514
14515public:
14517 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14518 SmallPtrSetImpl<Value *> &CheckedExtracts)
14519 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14520 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14521 CheckedExtracts(CheckedExtracts) {}
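/// adjustExtracts (below) takes credit for extractelement instructions that
/// become dead once the gather is vectorized (including extract+ext pairs
/// folded via getExtractWithExtendCost) and prices the remaining lanes as
/// shuffles of their source vectors, see computeExtractCost.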
14522 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14523 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14524 unsigned NumParts, bool &UseVecBaseAsInput) {
14525 UseVecBaseAsInput = false;
14526 if (Mask.empty())
14527 return nullptr;
14528 Value *VecBase = nullptr;
14529 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14530 if (!E->ReorderIndices.empty()) {
14531 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14532 E->ReorderIndices.end());
14533 reorderScalars(VL, ReorderMask);
14534 }
14535 // Check if it can be considered reused if same extractelements were
14536 // vectorized already.
14537 bool PrevNodeFound = any_of(
14538 ArrayRef(R.VectorizableTree).take_front(E->Idx),
14539 [&](const std::unique_ptr<TreeEntry> &TE) {
14540 return ((TE->hasState() && !TE->isAltShuffle() &&
14541 TE->getOpcode() == Instruction::ExtractElement) ||
14542 TE->isGather()) &&
14543 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
14544 return VL.size() > Data.index() &&
14545 (Mask[Data.index()] == PoisonMaskElem ||
14546 isa<UndefValue>(VL[Data.index()]) ||
14547 Data.value() == VL[Data.index()]);
14548 });
14549 });
14550 SmallPtrSet<Value *, 4> UniqueBases;
14551 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14552 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14553 for (unsigned Part : seq<unsigned>(NumParts)) {
14554 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14555 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14556 for (auto [I, V] :
14557 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
14558 // Ignore non-extractelement scalars.
14559 if (isa<UndefValue>(V) ||
14560 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14561 continue;
14562 // If all users of the instruction are going to be vectorized and this
14563 // instruction itself is not going to be vectorized, consider this
14564 // instruction as dead and remove its cost from the final cost of the
14565 // vectorized tree.
14566 // Also, avoid adjusting the cost for extractelements with multiple uses
14567 // in different graph entries.
14568 auto *EE = cast<ExtractElementInst>(V);
14569 VecBase = EE->getVectorOperand();
14570 UniqueBases.insert(VecBase);
14571 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14572 if (!CheckedExtracts.insert(V).second ||
14573 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
14574 any_of(VEs,
14575 [&](const TreeEntry *TE) {
14576 return R.DeletedNodes.contains(TE) ||
14577 R.TransformedToGatherNodes.contains(TE);
14578 }) ||
14579 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
14580 !R.isVectorized(EE) &&
14581 count_if(E->Scalars, [&](Value *V) { return V == EE; }) !=
14582 count_if(E->UserTreeIndex.UserTE->Scalars,
14583 [&](Value *V) { return V == EE; })) ||
14584 any_of(EE->users(),
14585 [&](User *U) {
14586 return isa<GetElementPtrInst>(U) &&
14587 !R.areAllUsersVectorized(cast<Instruction>(U),
14588 &VectorizedVals);
14589 }) ||
14590 (!VEs.empty() && !is_contained(VEs, E)))
14591 continue;
14592 std::optional<unsigned> EEIdx = getExtractIndex(EE);
14593 if (!EEIdx)
14594 continue;
14595 unsigned Idx = *EEIdx;
14596 // Take credit for the instruction that will become dead.
14597 if (EE->hasOneUse() || !PrevNodeFound) {
14598 Instruction *Ext = EE->user_back();
14599 if (isa<SExtInst, ZExtInst>(Ext) &&
14601 // Use getExtractWithExtendCost() to calculate the cost of
14602 // extractelement/ext pair.
14603 Cost -= TTI.getExtractWithExtendCost(
14604 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
14605 Idx, CostKind);
14606 // Add back the cost of s|zext which is subtracted separately.
14607 Cost += TTI.getCastInstrCost(
14608 Ext->getOpcode(), Ext->getType(), EE->getType(),
14610 continue;
14611 }
14612 }
14613 APInt &DemandedElts =
14614 VectorOpsToExtracts
14615 .try_emplace(VecBase,
14616 APInt::getZero(getNumElements(VecBase->getType())))
14617 .first->getSecond();
14618 DemandedElts.setBit(Idx);
14619 }
14620 }
14621 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14623 DemandedElts, /*Insert=*/false,
14624 /*Extract=*/true, CostKind);
14625 // Check that the gather of extractelements can be represented as just a
14626 // shuffle of the single/two vectors the scalars are extracted from.
14627 // We found the bunch of extractelement instructions that must be gathered
14628 // into a vector and can be represented as a permutation of elements in a
14629 // single input vector or of 2 input vectors.
14630 // This is skipped if the same extractelements were already vectorized (reused).
14631 if (!PrevNodeFound)
14632 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14633 InVectors.assign(1, E);
14634 CommonMask.assign(Mask.begin(), Mask.end());
14635 transformMaskAfterShuffle(CommonMask, CommonMask);
14636 SameNodesEstimated = false;
14637 if (NumParts != 1 && UniqueBases.size() != 1) {
14638 UseVecBaseAsInput = true;
14639 VecBase =
14640 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14641 }
14642 return VecBase;
14643 }
14644 /// Checks if the specified entry \p E needs to be delayed because of its
14645 /// dependency nodes.
14646 std::optional<InstructionCost>
14647 needToDelay(const TreeEntry *,
14649 // No need to delay the cost estimation during analysis.
14650 return std::nullopt;
14651 }
14652 /// Reset the builder to handle perfect diamond match.
14654 IsFinalized = false;
14655 CommonMask.clear();
14656 InVectors.clear();
14657 Cost = 0;
14658 VectorizedVals.clear();
14659 SameNodesEstimated = true;
14660 }
14661 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14662 if (&E1 == &E2) {
14663 assert(all_of(Mask,
14664 [&](int Idx) {
14665 return Idx < static_cast<int>(E1.getVectorFactor());
14666 }) &&
14667 "Expected single vector shuffle mask.");
14668 add(E1, Mask);
14669 return;
14670 }
14671 if (InVectors.empty()) {
14672 CommonMask.assign(Mask.begin(), Mask.end());
14673 InVectors.assign({&E1, &E2});
14674 return;
14675 }
14676 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14677 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14678 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14679 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14680 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14681 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14682 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14683 }
14684 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14685 if (InVectors.empty()) {
14686 CommonMask.assign(Mask.begin(), Mask.end());
14687 InVectors.assign(1, &E1);
14688 return;
14689 }
14690 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14691 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14692 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14693 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14694 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
14695 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14696 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14697 if (!SameNodesEstimated && InVectors.size() == 1)
14698 InVectors.emplace_back(&E1);
14699 }
14700 /// Adds 2 input vectors and the mask for their shuffling.
14701 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14702 // May come only for shuffling of 2 vectors with extractelements, already
14703 // handled in adjustExtracts.
14704 assert(InVectors.size() == 1 &&
14705 all_of(enumerate(CommonMask),
14706 [&](auto P) {
14707 if (P.value() == PoisonMaskElem)
14708 return Mask[P.index()] == PoisonMaskElem;
14709 auto *EI = cast<ExtractElementInst>(
14710 cast<const TreeEntry *>(InVectors.front())
14711 ->getOrdered(P.index()));
14712 return EI->getVectorOperand() == V1 ||
14713 EI->getVectorOperand() == V2;
14714 }) &&
14715 "Expected extractelement vectors.");
14716 }
14717 /// Adds one more input vector and the mask for the shuffling.
14718 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14719 if (InVectors.empty()) {
14720 assert(CommonMask.empty() && !ForExtracts &&
14721 "Expected empty input mask/vectors.");
14722 CommonMask.assign(Mask.begin(), Mask.end());
14723 InVectors.assign(1, V1);
14724 return;
14725 }
14726 if (ForExtracts) {
14727 // No need to add vectors here, already handled them in adjustExtracts.
14728 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14729 !CommonMask.empty() &&
14730 all_of(enumerate(CommonMask),
14731 [&](auto P) {
14732 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14733 ->getOrdered(P.index());
14734 if (P.value() == PoisonMaskElem)
14735 return P.value() == Mask[P.index()] ||
14736 isa<UndefValue>(Scalar);
14737 if (isa<Constant>(V1))
14738 return true;
14739 auto *EI = cast<ExtractElementInst>(Scalar);
14740 return EI->getVectorOperand() == V1;
14741 }) &&
14742 "Expected only tree entry for extractelement vectors.");
14743 return;
14744 }
14745 assert(!InVectors.empty() && !CommonMask.empty() &&
14746 "Expected only tree entries from extracts/reused buildvectors.");
14747 unsigned VF = getVF(V1);
14748 if (InVectors.size() == 2) {
14749 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14750 transformMaskAfterShuffle(CommonMask, CommonMask);
14751 VF = std::max<unsigned>(VF, CommonMask.size());
14752 } else if (const auto *InTE =
14753 InVectors.front().dyn_cast<const TreeEntry *>()) {
14754 VF = std::max(VF, InTE->getVectorFactor());
14755 } else {
14756 VF = std::max(
14757 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14758 ->getNumElements());
14759 }
14760 InVectors.push_back(V1);
14761 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14762 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14763 CommonMask[Idx] = Mask[Idx] + VF;
14764 }
14765 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14766 Value *Root = nullptr) {
14767 Cost += getBuildVectorCost(VL, Root);
14768 if (!Root) {
14769 // FIXME: Need to find a way to avoid use of getNullValue here.
14771 unsigned VF = VL.size();
14772 if (MaskVF != 0)
14773 VF = std::min(VF, MaskVF);
14774 Type *VLScalarTy = VL.front()->getType();
14775 for (Value *V : VL.take_front(VF)) {
14776 Type *ScalarTy = VLScalarTy->getScalarType();
14777 if (isa<PoisonValue>(V)) {
14778 Vals.push_back(PoisonValue::get(ScalarTy));
14779 continue;
14780 }
14781 if (isa<UndefValue>(V)) {
14782 Vals.push_back(UndefValue::get(ScalarTy));
14783 continue;
14784 }
14785 Vals.push_back(Constant::getNullValue(ScalarTy));
14786 }
14787 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14788 assert(SLPReVec && "FixedVectorType is not expected.");
14789 // When REVEC is enabled, we need to expand vector types into scalar
14790 // types.
14791 Vals = replicateMask(Vals, VecTy->getNumElements());
14792 }
14793 return ConstantVector::get(Vals);
14794 }
14797 cast<FixedVectorType>(Root->getType())->getNumElements()),
14798 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14799 }
14801 /// Finalize emission of the shuffles.
14803 ArrayRef<int> ExtMask,
14804 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14805 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14808 Action = {}) {
14809 IsFinalized = true;
14810 if (Action) {
14811 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14812 if (InVectors.size() == 2)
14813 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14814 else
14815 Cost += createShuffle(Vec, nullptr, CommonMask);
14816 transformMaskAfterShuffle(CommonMask, CommonMask);
14817 assert(VF > 0 &&
14818 "Expected vector length for the final value before action.");
14819 Value *V = cast<Value *>(Vec);
14820 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14821 Cost += createShuffle(V1, V2, Mask);
14822 return V1;
14823 });
14824 InVectors.front() = V;
14825 }
14826 if (!SubVectors.empty()) {
14827 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14828 if (InVectors.size() == 2)
14829 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14830 else
14831 Cost += createShuffle(Vec, nullptr, CommonMask);
14832 transformMaskAfterShuffle(CommonMask, CommonMask);
14833 // Add subvectors permutation cost.
14834 if (!SubVectorsMask.empty()) {
14835 assert(SubVectorsMask.size() <= CommonMask.size() &&
14836 "Expected same size of masks for subvectors and common mask.");
14837 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14838 copy(SubVectorsMask, SVMask.begin());
14839 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14840 if (I2 != PoisonMaskElem) {
14841 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14842 I1 = I2 + CommonMask.size();
14843 }
14844 }
14846 getWidenedType(ScalarTy, CommonMask.size()),
14847 SVMask, CostKind);
14848 }
14849 for (auto [E, Idx] : SubVectors) {
14850 Type *EScalarTy = E->Scalars.front()->getType();
14851 bool IsSigned = true;
14852 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14853 EScalarTy =
14854 IntegerType::get(EScalarTy->getContext(), It->second.first);
14855 IsSigned = It->second.second;
14856 }
14857 if (ScalarTy != EScalarTy) {
14858 unsigned CastOpcode = Instruction::Trunc;
14859 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14860 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14861 if (DstSz > SrcSz)
14862 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14863 Cost += TTI.getCastInstrCost(
14864 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14865 getWidenedType(EScalarTy, E->getVectorFactor()),
14867 }
14870 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14871 getWidenedType(ScalarTy, E->getVectorFactor()));
14872 if (!CommonMask.empty()) {
14873 std::iota(std::next(CommonMask.begin(), Idx),
14874 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14875 Idx);
14876 }
14877 }
14878 }
14879
14880 if (!ExtMask.empty()) {
14881 if (CommonMask.empty()) {
14882 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14883 } else {
14884 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14885 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14886 if (ExtMask[I] == PoisonMaskElem)
14887 continue;
14888 NewMask[I] = CommonMask[ExtMask[I]];
14889 }
14890 CommonMask.swap(NewMask);
14891 }
14892 }
14893 if (CommonMask.empty()) {
14894 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14895 return Cost;
14896 }
14897 return Cost +
14898 createShuffle(InVectors.front(),
14899 InVectors.size() == 2 ? InVectors.back() : nullptr,
14900 CommonMask);
14901 }
14902
14904 assert((IsFinalized || CommonMask.empty()) &&
14905 "Shuffle construction must be finalized.");
14906 }
14907};
14908
14909const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14910 unsigned Idx) const {
14911 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14912 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14913 return Op;
14914}
14915
14916TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14917 if (TE.State == TreeEntry::ScatterVectorize ||
14918 TE.State == TreeEntry::StridedVectorize)
14919 return TTI::CastContextHint::GatherScatter;
14920 if (TE.State == TreeEntry::CompressVectorize)
14921 return TTI::CastContextHint::Masked;
14922 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14923 !TE.isAltShuffle()) {
14924 if (TE.ReorderIndices.empty())
14925 return TTI::CastContextHint::Normal;
14926 SmallVector<int> Mask;
14927 inversePermutation(TE.ReorderIndices, Mask);
14928 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14929 return TTI::CastContextHint::Reversed;
14930 }
14931 return TTI::CastContextHint::None;
14932}
14933
14934InstructionCost
14935BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14936 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14937 ArrayRef<Value *> VL = E->Scalars;
14938
14939 Type *ScalarTy = getValueType(VL[0]);
14940 if (!isValidElementType(ScalarTy))
14941 return InstructionCost::getInvalid();
14942 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14943
14944 // If we have computed a smaller type for the expression, update VecTy so
14945 // that the costs will be accurate.
14946 auto It = MinBWs.find(E);
14947 Type *OrigScalarTy = ScalarTy;
14948 if (It != MinBWs.end()) {
14949 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14950 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14951 if (VecTy)
14952 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14953 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
14954 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
14955 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
14956 }
14957 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14958 unsigned EntryVF = E->getVectorFactor();
14959 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14960
14961 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
14962 if (allConstant(VL))
14963 return 0;
14964 if (isa<InsertElementInst>(VL[0]))
14965 return InstructionCost::getInvalid();
14966 if (isa<CmpInst>(VL.front()))
14967 ScalarTy = VL.front()->getType();
14968 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14969 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14970 }
14971 if (E->State == TreeEntry::SplitVectorize) {
14972 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14973 "Expected exactly 2 combined entries.");
14974 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14975 InstructionCost VectorCost = 0;
14976 if (E->ReorderIndices.empty()) {
14977 VectorCost = ::getShuffleCost(
14978 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14979 E->CombinedEntriesWithIndices.back().second,
14980 getWidenedType(
14981 ScalarTy,
14982 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14983 ->getVectorFactor()));
14984 } else {
14985 unsigned CommonVF =
14986 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14987 ->getVectorFactor(),
14988 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14989 ->getVectorFactor());
14990 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14991 getWidenedType(ScalarTy, CommonVF),
14992 E->getSplitMask(), CostKind);
14993 }
14994 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14995 return VectorCost;
14996 }
14997 InstructionCost CommonCost = 0;
14998 SmallVector<int> Mask;
14999 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
15000 (E->State != TreeEntry::StridedVectorize ||
15001 !isReverseOrder(E->ReorderIndices))) {
15002 SmallVector<int> NewMask;
15003 if (E->getOpcode() == Instruction::Store) {
15004 // For stores the order is actually a mask.
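// (In effect, the ReorderIndices of a store node can be used directly as a
// shuffle mask, while for all other nodes the permutation has to be
// inverted first via inversePermutation below.)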
15005 NewMask.resize(E->ReorderIndices.size());
15006 copy(E->ReorderIndices, NewMask.begin());
15007 } else {
15008 inversePermutation(E->ReorderIndices, NewMask);
15009 }
15010 ::addMask(Mask, NewMask);
15011 }
15012 if (!E->ReuseShuffleIndices.empty())
15013 ::addMask(Mask, E->ReuseShuffleIndices);
15014 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
15015 CommonCost =
15016 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
15017 assert((E->State == TreeEntry::Vectorize ||
15018 E->State == TreeEntry::ScatterVectorize ||
15019 E->State == TreeEntry::StridedVectorize ||
15020 E->State == TreeEntry::CompressVectorize) &&
15021 "Unhandled state");
15022 assert(E->getOpcode() &&
15023 ((allSameType(VL) && allSameBlock(VL)) ||
15024 (E->getOpcode() == Instruction::GetElementPtr &&
15025 E->getMainOp()->getType()->isPointerTy()) ||
15026 E->hasCopyableElements()) &&
15027 "Invalid VL");
15028 Instruction *VL0 = E->getMainOp();
15029 unsigned ShuffleOrOp =
15030 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15031 if (E->CombinedOp != TreeEntry::NotCombinedOp)
15032 ShuffleOrOp = E->CombinedOp;
15033 SmallSetVector<Value *, 16> UniqueValues;
15034 SmallVector<unsigned, 16> UniqueIndexes;
15035 for (auto [Idx, V] : enumerate(VL))
15036 if (UniqueValues.insert(V))
15037 UniqueIndexes.push_back(Idx);
15038 const unsigned Sz = UniqueValues.size();
15039 SmallBitVector UsedScalars(Sz, false);
15040 for (unsigned I = 0; I < Sz; ++I) {
15041 if (isa<Instruction>(UniqueValues[I]) &&
15042 !E->isCopyableElement(UniqueValues[I]) &&
15043 getTreeEntries(UniqueValues[I]).front() == E)
15044 continue;
15045 UsedScalars.set(I);
15046 }
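// Note: lanes marked in UsedScalars (non-instruction values, copyable
// elements, or scalars owned by another tree entry) are skipped when the
// scalar cost is summed below. GetCostDiff then returns the vector cost
// minus the scalar cost, so a negative result means vectorizing this node
// is expected to be profitable.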
15047 auto GetCastContextHint = [&](Value *V) {
15048 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
15049 return getCastContextHint(*OpTEs.front());
15050 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
15051 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15052 !SrcState.isAltShuffle())
15053 return TTI::CastContextHint::GatherScatter;
15054 return TTI::CastContextHint::None;
15055 };
15056 auto GetCostDiff =
15057 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
15058 function_ref<InstructionCost(InstructionCost)> VectorCost) {
15059 // Calculate the cost of this instruction.
15060 InstructionCost ScalarCost = 0;
15061 if (isa<CastInst, CallInst>(VL0)) {
15062 // For some of the instructions there is no need to calculate the cost
15063 // for each particular instance; we can use the cost of a single
15064 // instruction multiplied by the total number of scalar instructions.
15065 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15066 } else {
15067 for (unsigned I = 0; I < Sz; ++I) {
15068 if (UsedScalars.test(I))
15069 continue;
15070 ScalarCost += ScalarEltCost(I);
15071 }
15072 }
15073
15074 InstructionCost VecCost = VectorCost(CommonCost);
15075 // Check if the current node must be resized, if the parent node is not
15076 // resized.
15077 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
15078 E->Idx != 0 &&
15079 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
15080 const EdgeInfo &EI = E->UserTreeIndex;
15081 if (!EI.UserTE->hasState() ||
15082 EI.UserTE->getOpcode() != Instruction::Select ||
15083 EI.EdgeIdx != 0) {
15084 auto UserBWIt = MinBWs.find(EI.UserTE);
15085 Type *UserScalarTy =
15086 (EI.UserTE->isGather() ||
15087 EI.UserTE->State == TreeEntry::SplitVectorize)
15088 ? EI.UserTE->Scalars.front()->getType()
15089 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
15090 if (UserBWIt != MinBWs.end())
15091 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
15092 UserBWIt->second.first);
15093 if (ScalarTy != UserScalarTy) {
15094 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15095 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
15096 unsigned VecOpcode;
15097 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
15098 if (BWSz > SrcBWSz)
15099 VecOpcode = Instruction::Trunc;
15100 else
15101 VecOpcode =
15102 It->second.second ? Instruction::SExt : Instruction::ZExt;
15103 TTI::CastContextHint CCH = GetCastContextHint(VL0);
15104 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
15105 CostKind);
15106 }
15107 }
15108 }
15109 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
15110 ScalarCost, "Calculated costs for Tree"));
15111 return VecCost - ScalarCost;
15112 };
15113 // Calculate cost difference from vectorizing set of GEPs.
15114 // Negative value means vectorizing is profitable.
15115 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
15116 assert((E->State == TreeEntry::Vectorize ||
15117 E->State == TreeEntry::StridedVectorize ||
15118 E->State == TreeEntry::CompressVectorize) &&
15119 "Entry state expected to be Vectorize, StridedVectorize or "
15120 "MaskedLoadCompressVectorize here.");
15121 InstructionCost ScalarCost = 0;
15122 InstructionCost VecCost = 0;
15123 std::tie(ScalarCost, VecCost) = getGEPCosts(
15124 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
15125 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
15126 "Calculated GEPs cost for Tree"));
15127
15128 return VecCost - ScalarCost;
15129 };
15130
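// GetMinMaxCost estimates the cost of expressing a compare+select pair as a
// single min/max intrinsic; it returns an invalid cost when the pattern
// cannot be converted (see canConvertToMinOrMaxIntrinsic).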
15131 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
15132 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
15133 if (MinMaxID == Intrinsic::not_intrinsic)
15134 return InstructionCost::getInvalid();
15135 Type *CanonicalType = Ty;
15136 if (CanonicalType->isPtrOrPtrVectorTy())
15137 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
15138 CanonicalType->getContext(),
15139 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
15140
15141 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15142 {CanonicalType, CanonicalType});
15143 InstructionCost IntrinsicCost =
15144 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15145 // If the selects are the only uses of the compares, they will be
15146 // dead and we can adjust the cost by removing their cost.
15147 if (VI && SelectOnly) {
15148 assert((!Ty->isVectorTy() || SLPReVec) &&
15149 "Expected only for scalar type.");
15150 auto *CI = cast<CmpInst>(VI->getOperand(0));
15151 IntrinsicCost -= TTI->getCmpSelInstrCost(
15152 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
15153 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
15154 {TTI::OK_AnyValue, TTI::OP_None}, CI);
15155 }
15156 return IntrinsicCost;
15157 };
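// GetFMulAddCost models folding an fmul feeding an fadd/fsub into a single
// fused multiply-add (llvm.fmuladd); an invalid cost means the pair cannot
// be fused.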
15158 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
15159 Instruction *VI) {
15160 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
15161 return Cost;
15162 };
15163 switch (ShuffleOrOp) {
15164 case Instruction::PHI: {
15165 // Count reused scalars.
15166 InstructionCost ScalarCost = 0;
15167 SmallPtrSet<const TreeEntry *, 4> CountedOps;
15168 for (Value *V : UniqueValues) {
15169 auto *PHI = dyn_cast<PHINode>(V);
15170 if (!PHI)
15171 continue;
15172
15173 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
15174 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
15175 Value *Op = PHI->getIncomingValue(I);
15176 Operands[I] = Op;
15177 }
15178 if (const TreeEntry *OpTE =
15179 getSameValuesTreeEntry(Operands.front(), Operands))
15180 if (CountedOps.insert(OpTE).second &&
15181 !OpTE->ReuseShuffleIndices.empty())
15182 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
15183 OpTE->Scalars.size());
15184 }
15185
15186 return CommonCost - ScalarCost;
15187 }
15188 case Instruction::ExtractValue:
15189 case Instruction::ExtractElement: {
15190 APInt DemandedElts;
15191 VectorType *SrcVecTy = nullptr;
15192 auto GetScalarCost = [&](unsigned Idx) {
15193 if (isa<PoisonValue>(UniqueValues[Idx]))
15194 return InstructionCost(TTI::TCC_Free);
15195
15196 auto *I = cast<Instruction>(UniqueValues[Idx]);
15197 if (!SrcVecTy) {
15198 if (ShuffleOrOp == Instruction::ExtractElement) {
15199 auto *EE = cast<ExtractElementInst>(I);
15200 SrcVecTy = EE->getVectorOperandType();
15201 } else {
15202 auto *EV = cast<ExtractValueInst>(I);
15203 Type *AggregateTy = EV->getAggregateOperand()->getType();
15204 unsigned NumElts;
15205 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
15206 NumElts = ATy->getNumElements();
15207 else
15208 NumElts = AggregateTy->getStructNumElements();
15209 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
15210 }
15211 }
15212 if (I->hasOneUse()) {
15213 Instruction *Ext = I->user_back();
15214 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
15215 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
15216 // Use getExtractWithExtendCost() to calculate the cost of
15217 // extractelement/ext pair.
15218 InstructionCost Cost = TTI->getExtractWithExtendCost(
15219 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
15220 CostKind);
15221 // Subtract the cost of s|zext which is subtracted separately.
15222 Cost -= TTI->getCastInstrCost(
15223 Ext->getOpcode(), Ext->getType(), I->getType(),
15224 TTI::CastContextHint::None, CostKind);
15225 return Cost;
15226 }
15227 }
15228 if (DemandedElts.isZero())
15229 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
15230 DemandedElts.setBit(*getExtractIndex(I));
15231 return InstructionCost(TTI::TCC_Free);
15232 };
15233 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15234 return CommonCost - (DemandedElts.isZero()
15235 ? TTI::TCC_Free
15236 : TTI.getScalarizationOverhead(
15237 SrcVecTy, DemandedElts, /*Insert=*/false,
15238 /*Extract=*/true, CostKind));
15239 };
15240 return GetCostDiff(GetScalarCost, GetVectorCost);
15241 }
15242 case Instruction::InsertElement: {
15243 assert(E->ReuseShuffleIndices.empty() &&
15244 "Unique insertelements only are expected.");
15245 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
15246 unsigned const NumElts = SrcVecTy->getNumElements();
15247 unsigned const NumScalars = VL.size();
15248
15249 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
15250
15251 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15252 unsigned OffsetBeg = *getElementIndex(VL.front());
15253 unsigned OffsetEnd = OffsetBeg;
15254 InsertMask[OffsetBeg] = 0;
15255 for (auto [I, V] : enumerate(VL.drop_front())) {
15256 unsigned Idx = *getElementIndex(V);
15257 if (OffsetBeg > Idx)
15258 OffsetBeg = Idx;
15259 else if (OffsetEnd < Idx)
15260 OffsetEnd = Idx;
15261 InsertMask[Idx] = I + 1;
15262 }
15263 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
15264 if (NumOfParts > 0 && NumOfParts < NumElts)
15265 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
15266 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15267 VecScalarsSz;
15268 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15269 unsigned InsertVecSz = std::min<unsigned>(
15270 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
15271 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15272 bool IsWholeSubvector =
15273 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
15274 // Check if we can safely insert a subvector. If it is not possible, just
15275 // generate a whole-sized vector and shuffle the source vector and the new
15276 // subvector.
15277 if (OffsetBeg + InsertVecSz > VecSz) {
15278 // Align OffsetBeg to generate correct mask.
15279 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
15280 InsertVecSz = VecSz;
15281 }
15282
15283 APInt DemandedElts = APInt::getZero(NumElts);
15284 // TODO: Add support for Instruction::InsertValue.
15285 SmallVector<int> Mask;
15286 if (!E->ReorderIndices.empty()) {
15287 inversePermutation(E->ReorderIndices, Mask);
15288 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
15289 } else {
15290 Mask.assign(VecSz, PoisonMaskElem);
15291 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
15292 }
15293 bool IsIdentity = true;
15294 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
15295 Mask.swap(PrevMask);
15296 for (unsigned I = 0; I < NumScalars; ++I) {
15297 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
15298 DemandedElts.setBit(InsertIdx);
15299 IsIdentity &= InsertIdx - OffsetBeg == I;
15300 Mask[InsertIdx - OffsetBeg] = I;
15301 }
15302 assert(Offset < NumElts && "Failed to find vector index offset");
15303
15304 InstructionCost Cost = 0;
15305 Cost -=
15306 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
15307 /*Insert*/ true, /*Extract*/ false, CostKind);
15308
15309 // First cost - resize to actual vector size if not identity shuffle or
15310 // need to shift the vector.
15311 // Do not calculate the cost if the actual size is the register size and
15312 // we can merge this shuffle with the following SK_Select.
15313 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
15314 if (!IsIdentity)
15315 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15316 InsertVecTy, Mask);
15317 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15318 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15319 }));
15320 // Second cost - permutation with subvector, if some elements are from the
15321 // initial vector or inserting a subvector.
15322 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
15323 // subvector of ActualVecTy.
15324 SmallBitVector InMask =
15325 isUndefVector(FirstInsert->getOperand(0),
15326 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
15327 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
15328 if (InsertVecSz != VecSz) {
15329 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
15330 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
15331 CostKind, OffsetBeg - Offset, InsertVecTy);
15332 } else {
15333 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
15334 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
15335 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
15336 I <= End; ++I)
15337 if (Mask[I] != PoisonMaskElem)
15338 Mask[I] = I + VecSz;
15339 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
15340 Mask[I] =
15341 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
15342 Cost +=
15343 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
15344 }
15345 }
15346 return Cost;
15347 }
15348 case Instruction::ZExt:
15349 case Instruction::SExt:
15350 case Instruction::FPToUI:
15351 case Instruction::FPToSI:
15352 case Instruction::FPExt:
15353 case Instruction::PtrToInt:
15354 case Instruction::IntToPtr:
15355 case Instruction::SIToFP:
15356 case Instruction::UIToFP:
15357 case Instruction::Trunc:
15358 case Instruction::FPTrunc:
15359 case Instruction::BitCast: {
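// For casts, minimum-bitwidth analysis may have narrowed the source and/or
// destination node, so the vector opcode can differ from the scalar one:
// it may become a trunc, a sext/zext (depending on signedness), or a free
// bitcast when both sides end up with the same width.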
15360 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15361 Type *SrcScalarTy = VL0->getOperand(0)->getType();
15362 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
15363 unsigned Opcode = ShuffleOrOp;
15364 unsigned VecOpcode = Opcode;
15365 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15366 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15367 // Check if the values are candidates to demote.
15368 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
15369 if (SrcIt != MinBWs.end()) {
15370 SrcBWSz = SrcIt->second.first;
15371 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
15372 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
15373 SrcVecTy =
15374 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
15375 }
15376 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15377 if (BWSz == SrcBWSz) {
15378 VecOpcode = Instruction::BitCast;
15379 } else if (BWSz < SrcBWSz) {
15380 VecOpcode = Instruction::Trunc;
15381 } else if (It != MinBWs.end()) {
15382 assert(BWSz > SrcBWSz && "Invalid cast!");
15383 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15384 } else if (SrcIt != MinBWs.end()) {
15385 assert(BWSz > SrcBWSz && "Invalid cast!");
15386 VecOpcode =
15387 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15388 }
15389 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15390 !SrcIt->second.second) {
15391 VecOpcode = Instruction::UIToFP;
15392 }
15393 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
15394 assert(Idx == 0 && "Expected 0 index only");
15395 return TTI->getCastInstrCost(Opcode, VL0->getType(),
15396 VL0->getOperand(0)->getType(),
15397 TTI::CastContextHint::None, CostKind);
15398 };
15399 auto GetVectorCost = [=](InstructionCost CommonCost) {
15400 // Do not count cost here if minimum bitwidth is in effect and it is just
15401 // a bitcast (here it is just a noop).
15402 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15403 return CommonCost;
15404 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
15405 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
15406
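// An extension whose users form an arithmetic reduction at the tree root is
// treated as free here, on the assumption that it will be folded into an
// extended reduction when the root is emitted.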
15407 bool IsArithmeticExtendedReduction =
15408 E->Idx == 0 && UserIgnoreList &&
15409 all_of(*UserIgnoreList, [](Value *V) {
15410 auto *I = cast<Instruction>(V);
15411 return is_contained({Instruction::Add, Instruction::FAdd,
15412 Instruction::Mul, Instruction::FMul,
15413 Instruction::And, Instruction::Or,
15414 Instruction::Xor},
15415 I->getOpcode());
15416 });
15417 if (IsArithmeticExtendedReduction &&
15418 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15419 return CommonCost;
15420 return CommonCost +
15421 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
15422 VecOpcode == Opcode ? VI : nullptr);
15423 };
15424 return GetCostDiff(GetScalarCost, GetVectorCost);
15425 }
15426 case Instruction::FCmp:
15427 case Instruction::ICmp:
15428 case Instruction::Select: {
15429 CmpPredicate VecPred, SwappedVecPred;
15430 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
15431 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
15432 match(VL0, MatchCmp))
15433 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
15434 else
15435 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
15436 ? CmpInst::BAD_FCMP_PREDICATE
15437 : CmpInst::BAD_ICMP_PREDICATE;
15438 auto GetScalarCost = [&](unsigned Idx) {
15439 if (isa<PoisonValue>(UniqueValues[Idx]))
15440 return InstructionCost(TTI::TCC_Free);
15441
15442 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15443 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
15444 ? CmpInst::BAD_FCMP_PREDICATE
15445 : CmpInst::BAD_ICMP_PREDICATE;
15446 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
15447 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
15448 !match(VI, MatchCmp)) ||
15449 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
15450 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
15451 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
15452 ? CmpInst::BAD_FCMP_PREDICATE
15453 : CmpInst::BAD_ICMP_PREDICATE;
15454
15455 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
15456 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
15457 CostKind, getOperandInfo(VI->getOperand(0)),
15458 getOperandInfo(VI->getOperand(1)), VI);
15459 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
15460 if (IntrinsicCost.isValid())
15461 ScalarCost = IntrinsicCost;
15462
15463 return ScalarCost;
15464 };
15465 auto GetVectorCost = [&](InstructionCost CommonCost) {
15466 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15467
15468 InstructionCost VecCost =
15469 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
15470 CostKind, getOperandInfo(E->getOperand(0)),
15471 getOperandInfo(E->getOperand(1)), VL0);
15472 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
15473 auto *CondType =
15474 getWidenedType(SI->getCondition()->getType(), VL.size());
15475 unsigned CondNumElements = CondType->getNumElements();
15476 unsigned VecTyNumElements = getNumElements(VecTy);
15477 assert(VecTyNumElements >= CondNumElements &&
15478 VecTyNumElements % CondNumElements == 0 &&
15479 "Cannot vectorize Instruction::Select");
15480 if (CondNumElements != VecTyNumElements) {
15481 // When the return type is i1 but the source is fixed vector type, we
15482 // need to duplicate the condition value.
15483 VecCost += ::getShuffleCost(
15484 *TTI, TTI::SK_PermuteSingleSrc, CondType,
15485 createReplicatedMask(VecTyNumElements / CondNumElements,
15486 CondNumElements));
15487 }
15488 }
15489 return VecCost + CommonCost;
15490 };
15491 return GetCostDiff(GetScalarCost, GetVectorCost);
15492 }
15493 case TreeEntry::MinMax: {
15494 auto GetScalarCost = [&](unsigned Idx) {
15495 return GetMinMaxCost(OrigScalarTy);
15496 };
15497 auto GetVectorCost = [&](InstructionCost CommonCost) {
15498 InstructionCost VecCost = GetMinMaxCost(VecTy);
15499 return VecCost + CommonCost;
15500 };
15501 return GetCostDiff(GetScalarCost, GetVectorCost);
15502 }
15503 case TreeEntry::FMulAdd: {
15504 auto GetScalarCost = [&](unsigned Idx) {
15505 if (isa<PoisonValue>(UniqueValues[Idx]))
15506 return InstructionCost(TTI::TCC_Free);
15507 return GetFMulAddCost(E->getOperations(),
15508 cast<Instruction>(UniqueValues[Idx]));
15509 };
15510 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15511 FastMathFlags FMF;
15512 FMF.set();
15513 for (Value *V : E->Scalars) {
15514 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
15515 FMF &= FPCI->getFastMathFlags();
15516 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
15517 FMF &= FPCIOp->getFastMathFlags();
15518 }
15519 }
15520 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15521 {VecTy, VecTy, VecTy}, FMF);
15522 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15523 return VecCost + CommonCost;
15524 };
15525 return GetCostDiff(GetScalarCost, GetVectorCost);
15526 }
15527 case TreeEntry::ReducedBitcast:
15528 case TreeEntry::ReducedBitcastBSwap: {
15529 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
15530 if (isa<PoisonValue>(UniqueValues[Idx]))
15531 return InstructionCost(TTI::TCC_Free);
15532 auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
15533 if (!Shl)
15534 return InstructionCost(TTI::TCC_Free);
15535 InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
15536 auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
15537 if (!ZExt)
15538 return ScalarCost;
15539 ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
15540 return ScalarCost;
15541 };
15542 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15543 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
15544 TTI::CastContextHint CastCtx =
15545 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
15546 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
15547 auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
15548 InstructionCost BitcastCost = TTI.getCastInstrCost(
15549 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx, CostKind);
15550 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
15551 auto *OrigScalarTy = E->getMainOp()->getType();
15552 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, OrigScalarTy,
15553 {OrigScalarTy});
15554 InstructionCost IntrinsicCost =
15555 TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
15556 BitcastCost += IntrinsicCost;
15557 }
15558 return BitcastCost + CommonCost;
15559 };
15560 return GetCostDiff(GetScalarCost, GetVectorCost);
15561 }
15562 case Instruction::FNeg:
15563 case Instruction::Add:
15564 case Instruction::FAdd:
15565 case Instruction::Sub:
15566 case Instruction::FSub:
15567 case Instruction::Mul:
15568 case Instruction::FMul:
15569 case Instruction::UDiv:
15570 case Instruction::SDiv:
15571 case Instruction::FDiv:
15572 case Instruction::URem:
15573 case Instruction::SRem:
15574 case Instruction::FRem:
15575 case Instruction::Shl:
15576 case Instruction::LShr:
15577 case Instruction::AShr:
15578 case Instruction::And:
15579 case Instruction::Or:
15580 case Instruction::Xor: {
15581 auto GetScalarCost = [&](unsigned Idx) {
15582 if (isa<PoisonValue>(UniqueValues[Idx]))
15583 return InstructionCost(TTI::TCC_Free);
15584
15585 // We cannot retrieve the operand from UniqueValues[Idx] because an
15586 // interchangeable instruction may be used. The order and the actual
15587 // operand might differ from what is retrieved from UniqueValues[Idx].
15588 unsigned Lane = UniqueIndexes[Idx];
15589 Value *Op1 = E->getOperand(0)[Lane];
15590 Value *Op2;
15591 SmallVector<const Value *, 2> Operands(1, Op1);
15592 if (isa<UnaryOperator>(UniqueValues[Idx])) {
15593 Op2 = Op1;
15594 } else {
15595 Op2 = E->getOperand(1)[Lane];
15596 Operands.push_back(Op2);
15597 }
15598 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
15599 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
15600 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15601 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
15602 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
15603 I && (ShuffleOrOp == Instruction::FAdd ||
15604 ShuffleOrOp == Instruction::FSub)) {
15605 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15606 if (IntrinsicCost.isValid())
15607 ScalarCost = IntrinsicCost;
15608 }
15609 return ScalarCost;
15610 };
15611 auto GetVectorCost = [=](InstructionCost CommonCost) {
15612 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15613 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15614 ArrayRef<Value *> Ops = E->getOperand(I);
15615 if (all_of(Ops, [&](Value *Op) {
15616 auto *CI = dyn_cast<ConstantInt>(Op);
15617 return CI && CI->getValue().countr_one() >= It->second.first;
15618 }))
15619 return CommonCost;
15620 }
15621 }
15622 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
15623 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
15624 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
15625 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
15626 Op2Info, {}, nullptr, TLI) +
15627 CommonCost;
15628 };
15629 return GetCostDiff(GetScalarCost, GetVectorCost);
15630 }
15631 case Instruction::GetElementPtr: {
15632 return CommonCost + GetGEPCostDiff(VL, VL0);
15633 }
15634 case Instruction::Load: {
15635 auto GetScalarCost = [&](unsigned Idx) {
15636 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
15637 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15638 VI->getAlign(), VI->getPointerAddressSpace(),
15639 CostKind, TTI::OperandValueInfo(), VI);
15640 };
15641 auto *LI0 = cast<LoadInst>(VL0);
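// The vector load cost depends on how this node will be emitted: a plain
// (possibly interleaved) wide load, a strided vp load, a compressed
// (masked) load plus a decompressing shuffle, or a masked gather.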
15642 auto GetVectorCost = [&](InstructionCost CommonCost) {
15643 InstructionCost VecLdCost;
15644 switch (E->State) {
15645 case TreeEntry::Vectorize:
15646 if (unsigned Factor = E->getInterleaveFactor()) {
15647 VecLdCost = TTI->getInterleavedMemoryOpCost(
15648 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15649 LI0->getPointerAddressSpace(), CostKind);
15650
15651 } else {
15652 VecLdCost = TTI->getMemoryOpCost(
15653 Instruction::Load, VecTy, LI0->getAlign(),
15654 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15655 }
15656 break;
15657 case TreeEntry::StridedVectorize: {
15658 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
15659 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15660 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15661 Align CommonAlignment =
15662 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15663 VecLdCost = TTI->getMemIntrinsicInstrCost(
15664 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15665 StridedLoadTy, LI0->getPointerOperand(),
15666 /*VariableMask=*/false, CommonAlignment),
15667 CostKind);
15668 if (StridedLoadTy != VecTy)
15669 VecLdCost +=
15670 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15671 getCastContextHint(*E), CostKind);
15672
15673 break;
15674 }
15675 case TreeEntry::CompressVectorize: {
15676 bool IsMasked;
15677 unsigned InterleaveFactor;
15678 SmallVector<int> CompressMask;
15679 VectorType *LoadVecTy;
15680 SmallVector<Value *> Scalars(VL);
15681 if (!E->ReorderIndices.empty()) {
15682 SmallVector<int> Mask(E->ReorderIndices.begin(),
15683 E->ReorderIndices.end());
15684 reorderScalars(Scalars, Mask);
15685 }
15686 SmallVector<Value *> PointerOps(Scalars.size());
15687 for (auto [I, V] : enumerate(Scalars))
15688 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15689 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15690 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15691 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15692 CompressMask, LoadVecTy);
15693 assert(IsVectorized && "Failed to vectorize load");
15694 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15695 InterleaveFactor, IsMasked);
15696 Align CommonAlignment = LI0->getAlign();
15697 if (InterleaveFactor) {
15698 VecLdCost = TTI->getInterleavedMemoryOpCost(
15699 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15700 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15701 } else if (IsMasked) {
15702 VecLdCost = TTI->getMemIntrinsicInstrCost(
15703 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15704 CommonAlignment,
15705 LI0->getPointerAddressSpace()),
15706 CostKind);
15707 // TODO: include this cost into CommonCost.
15708 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15709 LoadVecTy, CompressMask, CostKind);
15710 } else {
15711 VecLdCost = TTI->getMemoryOpCost(
15712 Instruction::Load, LoadVecTy, CommonAlignment,
15713 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15714 // TODO: include this cost into CommonCost.
15715 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15716 LoadVecTy, CompressMask, CostKind);
15717 }
15718 break;
15719 }
15720 case TreeEntry::ScatterVectorize: {
15721 Align CommonAlignment =
15722 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15723 VecLdCost = TTI->getMemIntrinsicInstrCost(
15724 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15725 LI0->getPointerOperand(),
15726 /*VariableMask=*/false, CommonAlignment),
15727 CostKind);
15728 break;
15729 }
15730 case TreeEntry::CombinedVectorize:
15731 case TreeEntry::SplitVectorize:
15732 case TreeEntry::NeedToGather:
15733 llvm_unreachable("Unexpected vectorization state.");
15734 }
15735 return VecLdCost + CommonCost;
15736 };
15737
15738 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15739 // If this node generates a masked gather load, then it is not a terminal
15740 // node. Hence, the address operand cost is estimated separately.
15741 if (E->State == TreeEntry::ScatterVectorize)
15742 return Cost;
15743
15744 // Estimate cost of GEPs since this tree node is a terminator.
15745 SmallVector<Value *> PointerOps(VL.size());
15746 for (auto [I, V] : enumerate(VL))
15747 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15748 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15749 }
15750 case Instruction::Store: {
15751 bool IsReorder = !E->ReorderIndices.empty();
15752 auto GetScalarCost = [=](unsigned Idx) {
15753 auto *VI = cast<StoreInst>(VL[Idx]);
15754 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15755 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15756 VI->getAlign(), VI->getPointerAddressSpace(),
15757 CostKind, OpInfo, VI);
15758 };
15759 auto *BaseSI =
15760 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15761 auto GetVectorCost = [=](InstructionCost CommonCost) {
15762 // We know that we can merge the stores. Calculate the cost.
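// Strided stores are costed as llvm.experimental.vp.strided.store calls;
// consecutive stores as a single wide (possibly interleaved) store.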
15763 InstructionCost VecStCost;
15764 if (E->State == TreeEntry::StridedVectorize) {
15765 Align CommonAlignment =
15766 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15767 VecStCost = TTI->getMemIntrinsicInstrCost(
15768 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15769 VecTy, BaseSI->getPointerOperand(),
15770 /*VariableMask=*/false, CommonAlignment),
15771 CostKind);
15772 } else {
15773 assert(E->State == TreeEntry::Vectorize &&
15774 "Expected either strided or consecutive stores.");
15775 if (unsigned Factor = E->getInterleaveFactor()) {
15776 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15777 "No reused shuffles expected");
15778 CommonCost = 0;
15779 VecStCost = TTI->getInterleavedMemoryOpCost(
15780 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15781 BaseSI->getPointerAddressSpace(), CostKind);
15782 } else {
15783 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15784 VecStCost = TTI->getMemoryOpCost(
15785 Instruction::Store, VecTy, BaseSI->getAlign(),
15786 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15787 }
15788 }
15789 return VecStCost + CommonCost;
15790 };
15791 SmallVector<Value *> PointerOps(VL.size());
15792 for (auto [I, V] : enumerate(VL)) {
15793 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15794 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15795 }
15796
15797 return GetCostDiff(GetScalarCost, GetVectorCost) +
15798 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15799 }
15800 case Instruction::Call: {
15801 auto GetScalarCost = [&](unsigned Idx) {
15802 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15803 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15804 if (ID != Intrinsic::not_intrinsic) {
15805 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15806 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15807 }
15808 return TTI->getCallInstrCost(CI->getCalledFunction(),
15809 CI->getFunctionType()->getReturnType(),
15810 CI->getFunctionType()->params(), CostKind);
15811 };
15812 auto GetVectorCost = [=](InstructionCost CommonCost) {
15813 auto *CI = cast<CallInst>(VL0);
15814 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15815 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15816 CI, ID, VecTy->getNumElements(),
15817 It != MinBWs.end() ? It->second.first : 0, TTI);
15818 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15819 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15820 };
15821 return GetCostDiff(GetScalarCost, GetVectorCost);
15822 }
15823 case Instruction::ShuffleVector: {
15824 if (!SLPReVec || E->isAltShuffle())
15825 assert(E->isAltShuffle() &&
15826 ((Instruction::isBinaryOp(E->getOpcode()) &&
15827 Instruction::isBinaryOp(E->getAltOpcode())) ||
15828 (Instruction::isCast(E->getOpcode()) &&
15829 Instruction::isCast(E->getAltOpcode())) ||
15830 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15831 "Invalid Shuffle Vector Operand");
15832 // Try to find the previous shuffle node with the same operands and same
15833 // main/alternate ops.
15834 auto TryFindNodeWithEqualOperands = [=]() {
15835 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15836 if (TE.get() == E)
15837 break;
15838 if (TE->hasState() && TE->isAltShuffle() &&
15839 ((TE->getOpcode() == E->getOpcode() &&
15840 TE->getAltOpcode() == E->getAltOpcode()) ||
15841 (TE->getOpcode() == E->getAltOpcode() &&
15842 TE->getAltOpcode() == E->getOpcode())) &&
15843 TE->hasEqualOperands(*E))
15844 return true;
15845 }
15846 return false;
15847 };
15848 auto GetScalarCost = [&](unsigned Idx) {
15849 if (isa<PoisonValue>(UniqueValues[Idx]))
15850 return InstructionCost(TTI::TCC_Free);
15851
15852 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15853 assert(E->getMatchingMainOpOrAltOp(VI) &&
15854 "Unexpected main/alternate opcode");
15855 (void)E;
15856 return TTI->getInstructionCost(VI, CostKind);
15857 };
15858 // Need to clear CommonCost since the final shuffle cost is included into
15859 // vector cost.
15860 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15861 // VecCost is equal to sum of the cost of creating 2 vectors
15862 // and the cost of creating shuffle.
15863 InstructionCost VecCost = 0;
15864 if (TryFindNodeWithEqualOperands()) {
15865 LLVM_DEBUG({
15866 dbgs() << "SLP: diamond match for alternate node found.\n";
15867 E->dump();
15868 });
15869 // No need to add new vector costs here since we're going to reuse
15870 // same main/alternate vector ops, just do different shuffling.
15871 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15872 VecCost =
15873 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15874 VecCost +=
15875 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15876 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15877 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15878 VecCost = TTIRef.getCmpSelInstrCost(
15879 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15880 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15881 VL0);
15882 VecCost += TTIRef.getCmpSelInstrCost(
15883 E->getOpcode(), VecTy, MaskTy,
15884 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15885 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15886 E->getAltOp());
15887 } else {
15888 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15889 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15890 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15891 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15892 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15893 unsigned SrcBWSz =
15894 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15895 if (SrcIt != MinBWs.end()) {
15896 SrcBWSz = SrcIt->second.first;
15897 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15898 SrcTy = getWidenedType(SrcSclTy, VL.size());
15899 }
15900 if (BWSz <= SrcBWSz) {
15901 if (BWSz < SrcBWSz)
15902 VecCost =
15903 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15904 TTI::CastContextHint::None, CostKind);
15905 LLVM_DEBUG({
15906 dbgs()
15907 << "SLP: alternate extension, which should be truncated.\n";
15908 E->dump();
15909 });
15910 return VecCost;
15911 }
15912 }
15913 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15914 TTI::CastContextHint::None, CostKind);
15915 VecCost +=
15916 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15917 TTI::CastContextHint::None, CostKind);
15918 }
15919 SmallVector<int> Mask;
15920 E->buildAltOpShuffleMask(
15921 [&](Instruction *I) {
15922 assert(E->getMatchingMainOpOrAltOp(I) &&
15923 "Unexpected main/alternate opcode");
15924 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15925 *TLI);
15926 },
15927 Mask);
15928 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15929 FinalVecTy, Mask, CostKind);
15930 // Patterns like [fadd,fsub] can be combined into a single instruction
15931 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15932 // need to take into account their order when looking for the most used
15933 // order.
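// E.g. on X86 an alternating <fadd, fsub, ...> sequence can be lowered to an
// ADDSUB-style instruction; if isLegalAltInstr reports support, the cost of
// keeping the original order is compared and the cheaper variant is used.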
15934 unsigned Opcode0 = E->getOpcode();
15935 unsigned Opcode1 = E->getAltOpcode();
15936 SmallBitVector OpcodeMask(
15937 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15938 // If this pattern is supported by the target then we consider the
15939 // order.
15940 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15941 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15942 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15943 return AltVecCost < VecCost ? AltVecCost : VecCost;
15944 }
15945 // TODO: Check the reverse order too.
15946 return VecCost;
15947 };
15948 if (SLPReVec && !E->isAltShuffle())
15949 return GetCostDiff(
15950 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15951 // If a group uses mask in order, the shufflevector can be
15952 // eliminated by instcombine. Then the cost is 0.
15954 "Not supported shufflevector usage.");
15955 auto *SV = cast<ShuffleVectorInst>(VL.front());
15956 unsigned SVNumElements =
15957 cast<FixedVectorType>(SV->getOperand(0)->getType())
15958 ->getNumElements();
15959 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15960 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15961 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15962 int NextIndex = 0;
15963 if (!all_of(Group, [&](Value *V) {
15965 "Not supported shufflevector usage.");
15966 auto *SV = cast<ShuffleVectorInst>(V);
15967 int Index;
15968 [[maybe_unused]] bool IsExtractSubvectorMask =
15969 SV->isExtractSubvectorMask(Index);
15970 assert(IsExtractSubvectorMask &&
15971 "Not supported shufflevector usage.");
15972 if (NextIndex != Index)
15973 return false;
15974 NextIndex += SV->getShuffleMask().size();
15975 return true;
15976 }))
15977 return ::getShuffleCost(
15978 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15979 calculateShufflevectorMask(E->Scalars));
15980 }
15981 return TTI::TCC_Free;
15982 });
15983 return GetCostDiff(GetScalarCost, GetVectorCost);
15984 }
15985 case Instruction::Freeze:
15986 return CommonCost;
15987 default:
15988 llvm_unreachable("Unknown instruction");
15989 }
15990}
15991
15992bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15993 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15994 << VectorizableTree.size() << " is fully vectorizable.\n");
15995
15996 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15997 SmallVector<int> Mask;
15998 return TE->isGather() &&
15999 !any_of(TE->Scalars,
16000 [this](Value *V) { return EphValues.contains(V); }) &&
16001 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
16002 TE->Scalars.size() < Limit ||
16003 (((TE->hasState() &&
16004 TE->getOpcode() == Instruction::ExtractElement) ||
16006 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
16007 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
16008 !TE->isAltShuffle()) ||
16009 any_of(TE->Scalars, IsaPred<LoadInst>));
16010 };
16011
16012 // We only handle trees of heights 1 and 2.
16013 if (VectorizableTree.size() == 1 &&
16014 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
16015 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
16016 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
16017 (ForReduction &&
16018 AreVectorizableGathers(VectorizableTree[0].get(),
16019 VectorizableTree[0]->Scalars.size()) &&
16020 VectorizableTree[0]->getVectorFactor() > 2)))
16021 return true;
16022
16023 if (VectorizableTree.size() != 2)
16024 return false;
16025
16026 // Handle splat and all-constant stores. Also try to vectorize tiny trees
16027 // whose second node is a gather with fewer scalar operands than the initial
16028 // tree element (it may be profitable to shuffle the second gather), or whose
16029 // scalars are extractelements that form a shuffle.
16030 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
16031 AreVectorizableGathers(VectorizableTree[1].get(),
16032 VectorizableTree[0]->Scalars.size()))
16033 return true;
16034
16035 // Gathering cost would be too much for tiny trees.
16036 if (VectorizableTree[0]->isGather() ||
16037 (VectorizableTree[1]->isGather() &&
16038 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
16039 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
16040 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
16041 return false;
16042
16043 return true;
16044}
16045
16046static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
16047 TargetTransformInfo *TTI,
16048 bool MustMatchOrInst) {
16049 // Look past the root to find a source value. Arbitrarily follow the
16050 // path through operand 0 of any 'or'. Also, peek through optional
16051 // shift-left-by-multiple-of-8-bits.
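// An illustrative candidate (assuming %b0 and %b1 are i8 loads):
//   %z0 = zext i8 %b0 to i32
//   %z1 = zext i8 %b1 to i32
//   %s1 = shl i32 %z1, 8
//   %or = or i32 %s1, %z0
// Following operand 0 from %or through %s1 reaches zext(load), which is
// what the checks below look for.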
16052 Value *ZextLoad = Root;
16053 const APInt *ShAmtC;
16054 bool FoundOr = false;
16055 while (!isa<ConstantExpr>(ZextLoad) &&
16056 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
16057 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
16058 ShAmtC->urem(8) == 0))) {
16059 auto *BinOp = cast<BinaryOperator>(ZextLoad);
16060 ZextLoad = BinOp->getOperand(0);
16061 if (BinOp->getOpcode() == Instruction::Or)
16062 FoundOr = true;
16063 }
16064 // Check if the input is an extended load of the required or/shift expression.
16065 Value *Load;
16066 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
16067 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
16068 return false;
16069
16070 // Require that the total load bit width is a legal integer type.
16071 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
16072 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
16073 Type *SrcTy = Load->getType();
16074 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
16075 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
16076 return false;
16077
16078 // Everything matched - assume that we can fold the whole sequence using
16079 // load combining.
16080 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
16081 << *(cast<Instruction>(Root)) << "\n");
16082
16083 return true;
16084}
16085
16086bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
16087 if (RdxKind != RecurKind::Or)
16088 return false;
16089
16090 unsigned NumElts = VectorizableTree[0]->Scalars.size();
16091 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
16092 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
16093 /* MatchOr */ false);
16094}
16095
16096bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
16097 // Peek through a final sequence of stores and check if all operations are
16098 // likely to be load-combined.
16099 unsigned NumElts = Stores.size();
16100 for (Value *Scalar : Stores) {
16101 Value *X;
16102 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
16103 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
16104 return false;
16105 }
16106 return true;
16107}
16108
16109bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
16110 if (!DebugCounter::shouldExecute(VectorizedGraphs))
16111 return true;
16112
16113 // Graph is empty - do nothing.
16114 if (VectorizableTree.empty()) {
16115 assert(ExternalUses.empty() && "We shouldn't have any external users");
16116
16117 return true;
16118 }
16119
16120 // No need to vectorize inserts of gathered values.
16121 if (VectorizableTree.size() == 2 &&
16122 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
16123 VectorizableTree[1]->isGather() &&
16124 (VectorizableTree[1]->getVectorFactor() <= 2 ||
16125 !(isSplat(VectorizableTree[1]->Scalars) ||
16126 allConstant(VectorizableTree[1]->Scalars))))
16127 return true;
16128
16129 // If the graph includes only PHI nodes and gathers, it is definitely not
16130 // profitable for vectorization, so we can skip it if the cost threshold is
16131 // the default. The cost of vectorized PHI nodes is almost always 0, plus the
16132 // cost of the gathers/buildvectors.
16133 constexpr int Limit = 4;
16134 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
16135 !VectorizableTree.empty() &&
16136 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16137 return (TE->isGather() &&
16138 (!TE->hasState() ||
16139 TE->getOpcode() != Instruction::ExtractElement) &&
16140 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
16141 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
16142 }))
16143 return true;
16144
16145 // Do not vectorize small tree of phis only, if all vector phis are also
16146 // gathered.
16147 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16148 VectorizableTree.size() <= Limit &&
16149 all_of(VectorizableTree,
16150 [&](const std::unique_ptr<TreeEntry> &TE) {
16151 return (TE->isGather() &&
16152 (!TE->hasState() ||
16153 TE->getOpcode() != Instruction::ExtractElement) &&
16154 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
16155 Limit) ||
16156 (TE->hasState() &&
16157 (TE->getOpcode() == Instruction::InsertElement ||
16158 (TE->getOpcode() == Instruction::PHI &&
16159 all_of(TE->Scalars, [&](Value *V) {
16160 return isa<PoisonValue>(V) || MustGather.contains(V);
16161 }))));
16162 }) &&
16163 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16164 return TE->State == TreeEntry::Vectorize &&
16165 TE->getOpcode() == Instruction::PHI;
16166 }))
16167 return true;
16168
16169 // If the tree contains only phis, buildvectors, split nodes and
16170 // small nodes with reuses, we can skip it.
16171 SmallVector<const TreeEntry *> StoreLoadNodes;
16172 unsigned NumGathers = 0;
16173 constexpr int LimitTreeSize = 36;
16174 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
16175 all_of(VectorizableTree,
16176 [&](const std::unique_ptr<TreeEntry> &TE) {
16177 if (!TE->isGather() && TE->hasState() &&
16178 (TE->getOpcode() == Instruction::Load ||
16179 TE->getOpcode() == Instruction::Store)) {
16180 StoreLoadNodes.push_back(TE.get());
16181 return true;
16182 }
16183 if (TE->isGather())
16184 ++NumGathers;
16185 return TE->State == TreeEntry::SplitVectorize ||
16186 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
16187 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
16188 VectorizableTree.size() > LimitTreeSize) ||
16189 (TE->isGather() &&
16190 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
16191 (TE->hasState() &&
16192 (TE->getOpcode() == Instruction::PHI ||
16193 (TE->hasCopyableElements() &&
16194 static_cast<unsigned>(count_if(
16195 TE->Scalars, IsaPred<PHINode, Constant>)) >=
16196 TE->Scalars.size() / 2) ||
16197 ((!TE->ReuseShuffleIndices.empty() ||
16198 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
16199 TE->Scalars.size() == 2)));
16200 }) &&
16201 (StoreLoadNodes.empty() ||
16202 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
16203 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
16204 return TE->getOpcode() == Instruction::Store ||
16205 all_of(TE->Scalars, [&](Value *V) {
16206 return !isa<LoadInst>(V) ||
16207 areAllUsersVectorized(cast<Instruction>(V));
16208 });
16209 })))))
16210 return true;
16211
16212 // If the tree contains only buildvectors plus 2 non-buildvector nodes
16213 // (whose user is the root tree node), we can skip it.
16214 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16215 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
16216 VectorizableTree.size() >= Limit &&
16217 count_if(ArrayRef(VectorizableTree).drop_front(),
16218 [&](const std::unique_ptr<TreeEntry> &TE) {
16219 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
16220 TE->UserTreeIndex.UserTE->Idx == 0;
16221 }) == 2)
16222 return true;
16223
16224 // If the tree only vectorizes an insertelement buildvector fed by a phi
16225 // node whose operands are all gathered, skip it.
16226 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16227 VectorizableTree.size() > 2 &&
16228 VectorizableTree.front()->State == TreeEntry::Vectorize &&
16229 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16230 VectorizableTree[1]->State == TreeEntry::Vectorize &&
16231 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
16232 all_of(
16233 ArrayRef(VectorizableTree).drop_front(2),
16234 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
16235 return true;
16236
16237 // We can vectorize the tree if its size is greater than or equal to the
16238 // minimum size specified by the MinTreeSize command line option.
16239 if (VectorizableTree.size() >= MinTreeSize)
16240 return false;
16241
16242 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
16243 // can vectorize it if we can prove it fully vectorizable.
16244 if (isFullyVectorizableTinyTree(ForReduction))
16245 return false;
16246
16247 // Check if any of the gather nodes forms an insertelement buildvector
16248 // somewhere.
16249 bool IsAllowedSingleBVNode =
16250 VectorizableTree.size() > 1 ||
16251 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16252 !VectorizableTree.front()->isAltShuffle() &&
16253 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16254 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16255 allSameBlock(VectorizableTree.front()->Scalars));
16256 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16257 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
16258 return isa<ExtractElementInst, Constant>(V) ||
16259 (IsAllowedSingleBVNode &&
16260 !V->hasNUsesOrMore(UsesLimit) &&
16261 any_of(V->users(), IsaPred<InsertElementInst>));
16262 });
16263 }))
16264 return false;
16265
16266 if (VectorizableTree.back()->isGather() &&
16267 VectorizableTree.back()->hasState() &&
16268 VectorizableTree.back()->isAltShuffle() &&
16269 VectorizableTree.back()->getVectorFactor() > 2 &&
16270 allSameBlock(VectorizableTree.back()->Scalars) &&
16271 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16272 TTI->getScalarizationOverhead(
16273 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
16274 VectorizableTree.back()->getVectorFactor()),
16275 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
16276 /*Insert=*/true, /*Extract=*/false,
16278 return false;
16279
16280 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
16281 // vectorizable.
16282 return true;
16283}
16284
16285bool BoUpSLP::isTreeNotExtendable() const {
16286 if (getCanonicalGraphSize() != getTreeSize()) {
16287 constexpr unsigned SmallTree = 3;
16288 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16289 getCanonicalGraphSize() <= SmallTree &&
16290 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
16291 [](const std::unique_ptr<TreeEntry> &TE) {
16292 return TE->isGather() && TE->hasState() &&
16293 TE->getOpcode() == Instruction::Load &&
16294 !allSameBlock(TE->Scalars);
16295 }) == 1)
16296 return true;
16297 return false;
16298 }
16299 bool Res = false;
16300 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
16301 TreeEntry &E = *VectorizableTree[Idx];
16302 if (E.State == TreeEntry::SplitVectorize)
16303 return false;
16304 if (!E.isGather())
16305 continue;
16306 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16307 (!E.hasState() &&
16309 (isa<ExtractElementInst>(E.Scalars.front()) &&
16310 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
16311 return false;
16312 if (isSplat(E.Scalars) || allConstant(E.Scalars))
16313 continue;
16314 Res = true;
16315 }
16316 return Res;
16317}
16318
16319InstructionCost BoUpSLP::getSpillCost() {
16320 // Walk from the bottom of the tree to the top, tracking which values are
16321 // live. When we see a call instruction that is not part of our tree,
16322 // query TTI to see if there is a cost to keeping values live over it
16323 // (for example, if spills and fills are required).
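// Roughly: record the last instruction of every vectorized bundle, then
// scan the regions between a bundle and its operand bundles for such calls,
// bounded by a per-node budget derived from ScheduleRegionSizeBudget.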
16324
16325 const TreeEntry *Root = VectorizableTree.front().get();
16326 if (Root->isGather())
16327 return 0;
16328
16329 InstructionCost Cost = 0;
16330 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
16331 EntriesToOperands;
16332 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
16333 SmallPtrSet<const Instruction *, 8> LastInstructions;
16334 SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
16335 for (const auto &TEPtr : VectorizableTree) {
16336 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
16337 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
16338 ScalarOrPseudoEntries.insert(TEPtr.get());
16339 continue;
16340 }
16341 if (!TEPtr->isGather()) {
16342 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
16343 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
16344 LastInstructions.insert(LastInst);
16345 }
16346 if (TEPtr->UserTreeIndex)
16347 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
16348 }
16349
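  // NoCallIntrinsic returns true if the given instruction is an intrinsic call
  // that is known not to be lowered to a real call (assume-like intrinsics, or
  // intrinsics whose intrinsic cost is cheaper than the corresponding call
  // cost), so it does not force values to be spilled.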
16350 auto NoCallIntrinsic = [this](const Instruction *I) {
16351 const auto *II = dyn_cast<IntrinsicInst>(I);
16352 if (!II)
16353 return false;
16354 if (II->isAssumeLikeIntrinsic())
16355 return true;
16356 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
16357 InstructionCost IntrCost =
16358 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
16359 InstructionCost CallCost = TTI->getCallInstrCost(
16360 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
16361 return IntrCost < CallCost;
16362 };
16363
16364 // Maps the last instruction of an entry to the last instruction of one of its
16365 // operand entries plus a flag. If the flag is true, there are no calls in
16366 // between these instructions.
16367 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
16368 CheckedInstructions;
16369 unsigned Budget = 0;
16370 const unsigned BudgetLimit =
16371 ScheduleRegionSizeBudget / VectorizableTree.size();
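  // Scans the instructions between First and Last (which must be in the same
  // block) in reverse order and returns true if no non-vectorized call is
  // found and the scan stays within the budget. Intermediate results are
  // memoized in CheckedInstructions to avoid rescanning the same range.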
16372 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
16373 const Instruction *Last) {
16374 assert(First->getParent() == Last->getParent() &&
16375 "Expected instructions in same block.");
16376 if (auto It = CheckedInstructions.find(Last);
16377 It != CheckedInstructions.end()) {
16378 const Instruction *Checked = It->second.getPointer();
16379 if (Checked == First || Checked->comesBefore(First))
16380 return It->second.getInt() != 0;
16381 Last = Checked;
16382 } else if (Last == First || Last->comesBefore(First)) {
16383 return true;
16384 }
16385 BasicBlock::reverse_iterator InstIt =
16386 ++First->getIterator().getReverse(),
16387 PrevInstIt =
16388 Last->getIterator().getReverse();
16389 SmallVector<const Instruction *> LastInstsInRange;
16390 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
16391 // Debug information does not impact spill cost.
16392 // Vectorized calls, represented as vector intrinsics, do not impact spill
16393 // cost.
16394 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
16395 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
16396 for (const Instruction *LastInst : LastInstsInRange)
16397 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
16398 return false;
16399 }
16400 if (LastInstructions.contains(&*PrevInstIt))
16401 LastInstsInRange.push_back(&*PrevInstIt);
16402
16403 ++PrevInstIt;
16404 ++Budget;
16405 }
16406 for (const Instruction *LastInst : LastInstsInRange)
16407 CheckedInstructions.try_emplace(
16408 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
16409 Budget <= BudgetLimit ? 1 : 0);
16410 return Budget <= BudgetLimit;
16411 };
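  // Charges the cost of keeping the operand entry's vector value live across a
  // call, using the minimized bitwidth type if the node was narrowed (MinBWs)
  // and, for REVEC, subtracting the cost of the now-dead scalar vectors.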
16412 auto AddCosts = [&](const TreeEntry *Op) {
16413 if (ScalarOrPseudoEntries.contains(Op))
16414 return;
16415 Type *ScalarTy = Op->Scalars.front()->getType();
16416 auto It = MinBWs.find(Op);
16417 if (It != MinBWs.end())
16418 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
16419 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
16420 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
16421 if (ScalarTy->isVectorTy()) {
16422 // Handle revec dead vector instructions.
16423 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
16424 }
16425 };
16426 // Memoize the relationship between blocks, i.e. whether there is (at least
16427 // one) non-vectorized call between the blocks. This allows skipping the
16428 // analysis of the same block paths multiple times.
16429 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
16430 ParentOpParentToPreds;
16431 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
16432 BasicBlock *OpParent) {
16433 auto Key = std::make_pair(Root, OpParent);
16434 if (auto It = ParentOpParentToPreds.find(Key);
16435 It != ParentOpParentToPreds.end())
16436 return It->second;
16437 SmallVector<BasicBlock *> Worklist;
16438 if (Pred)
16439 Worklist.push_back(Pred);
16440 else
16441 Worklist.append(pred_begin(Root), pred_end(Root));
16442 SmallPtrSet<const BasicBlock *, 8> Visited;
16443 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 8>
16444 ParentsPairsToAdd;
16445 bool Res = false;
16446 auto Cleanup = make_scope_exit([&]() {
16447 for (const auto &KeyPair : ParentsPairsToAdd) {
16448 assert(!ParentOpParentToPreds.contains(KeyPair) &&
16449 "Should not have been added before.");
16450 ParentOpParentToPreds.try_emplace(KeyPair, Res);
16451 }
16452 });
16453 while (!Worklist.empty()) {
16454 BasicBlock *BB = Worklist.pop_back_val();
16455 if (BB == OpParent || !Visited.insert(BB).second)
16456 continue;
16457 auto Pair = std::make_pair(BB, OpParent);
16458 if (auto It = ParentOpParentToPreds.find(Pair);
16459 It != ParentOpParentToPreds.end()) {
16460 Res = It->second;
16461 return Res;
16462 }
16463 ParentsPairsToAdd.insert(Pair);
16464 unsigned BlockSize = BB->size();
16465 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
16466 return Res;
16467 Budget += BlockSize;
16468 if (Budget > BudgetLimit)
16469 return Res;
16470 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
16471 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
16472 BB->getTerminator()))
16473 return Res;
16474 Worklist.append(pred_begin(BB), pred_end(BB));
16475 }
16476 Res = true;
16477 return Res;
16478 };
16479 SmallVector<const TreeEntry *> LiveEntries(1, Root);
16480 auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
16481 assert(ScalarOrPseudoEntries.contains(E) &&
16482 "Expected scalar or pseudo entry.");
16483 const TreeEntry *Entry = E;
16484 while (Entry->UserTreeIndex) {
16485 Entry = Entry->UserTreeIndex.UserTE;
16486 if (!ScalarOrPseudoEntries.contains(Entry))
16487 return Entry;
16488 }
16489 return nullptr;
16490 };
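  // Walk the tree from the root, keeping the set of entries whose values are
  // still live, and for every (entry, operand) pair add spill costs whenever a
  // non-vectorized call may be executed between the operand's last instruction
  // and the entry's last instruction (within one block or across blocks).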
16491 while (!LiveEntries.empty()) {
16492 const TreeEntry *Entry = LiveEntries.pop_back_val();
16493 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
16494 if (Operands.empty())
16495 continue;
16496 if (ScalarOrPseudoEntries.contains(Entry)) {
16497 Entry = FindNonScalarParentEntry(Entry);
16498 if (!Entry) {
16499 for (const TreeEntry *Op : Operands) {
16500 if (!Op->isGather())
16501 LiveEntries.push_back(Op);
16502 }
16503 continue;
16504 }
16505 }
16506 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
16507 BasicBlock *Parent = LastInst->getParent();
16508 for (const TreeEntry *Op : Operands) {
16509 if (!Op->isGather())
16510 LiveEntries.push_back(Op);
16511 if (ScalarOrPseudoEntries.contains(Op))
16512 continue;
16513 if (Entry->State == TreeEntry::SplitVectorize ||
16514 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
16515 (Op->isGather() && allConstant(Op->Scalars)))
16516 continue;
16517 Budget = 0;
16518 BasicBlock *Pred = nullptr;
16519 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
16520 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16521 BasicBlock *OpParent;
16522 Instruction *OpLastInst;
16523 if (Op->isGather()) {
16524 assert(Entry->getOpcode() == Instruction::PHI &&
16525 "Expected phi node only.");
16526 OpParent = cast<PHINode>(Entry->getMainOp())
16527 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
16528 OpLastInst = OpParent->getTerminator();
16529 for (Value *V : Op->Scalars) {
16530 auto *Inst = dyn_cast<Instruction>(V);
16531 if (!Inst)
16532 continue;
16533 if (isVectorized(V)) {
16534 OpParent = Inst->getParent();
16535 OpLastInst = Inst;
16536 break;
16537 }
16538 }
16539 } else {
16540 OpLastInst = EntriesToLastInstruction.at(Op);
16541 OpParent = OpLastInst->getParent();
16542 }
16543 // Check for call instructions within the same basic block.
16544 if (OpParent == Parent) {
16545 if (Entry->getOpcode() == Instruction::PHI) {
16546 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16547 AddCosts(Op);
16548 continue;
16549 }
16550 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16551 AddCosts(Op);
16552 continue;
16553 }
16554 // Check for call instructions in between blocks.
16555 // 1. Check the entry's block from its head to the entry's last instruction.
16556 if (Entry->getOpcode() != Instruction::PHI &&
16557 !CheckForNonVecCallsInSameBlock(
16558 &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
16559 AddCosts(Op);
16560 continue;
16561 }
16562 // 2. Check op's block from the end.
16563 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16564 OpParent->getTerminator())) {
16565 AddCosts(Op);
16566 continue;
16567 }
16568 // 3. Check the predecessors of entry's block till op's block.
16569 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16570 AddCosts(Op);
16571 continue;
16572 }
16573 }
16574 }
16575
16576 return Cost;
16577}
16578
16579/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
16580/// the buildvector sequence.
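/// For example (a hypothetical IR snippet), for the buildvector chain
///   %b0 = insertelement <4 x i32> poison, i32 %x, i32 0
///   %b1 = insertelement <4 x i32> %b0, i32 %y, i32 1
/// isFirstInsertElement(%b0, %b1) returns true, because %b0 is reached by
/// walking the vector-operand chain starting from %b1.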
16581static bool isFirstInsertElement(const InsertElementInst *IE1,
16582 const InsertElementInst *IE2) {
16583 if (IE1 == IE2)
16584 return false;
16585 const auto *I1 = IE1;
16586 const auto *I2 = IE2;
16587 const InsertElementInst *PrevI1;
16588 const InsertElementInst *PrevI2;
16589 unsigned Idx1 = *getElementIndex(IE1);
16590 unsigned Idx2 = *getElementIndex(IE2);
16591 do {
16592 if (I2 == IE1)
16593 return true;
16594 if (I1 == IE2)
16595 return false;
16596 PrevI1 = I1;
16597 PrevI2 = I2;
16598 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16599 getElementIndex(I1).value_or(Idx2) != Idx2)
16600 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
16601 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16602 getElementIndex(I2).value_or(Idx1) != Idx1)
16603 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
16604 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16605 llvm_unreachable("Two different buildvectors not expected.");
16606}
16607
16608namespace {
16609/// Returns the incoming Value * if the requested type is Value * too, or a
16610/// default-constructed value otherwise.
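/// For example, ValueSelect::get<Value *>(V) returns V itself, while
/// ValueSelect::get<InstructionCost>(V) returns a default-constructed
/// InstructionCost.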
16611struct ValueSelect {
16612 template <typename U>
16613 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16614 return V;
16615 }
16616 template <typename U>
16617 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16618 return U();
16619 }
16620};
16621} // namespace
16622
16623/// Does the analysis of the provided shuffle masks and performs the requested
16624/// actions on the vectors with the given shuffle masks. It tries to do it in
16625/// several steps.
16626/// 1. If the Base vector is not an undef vector, resize the very first mask to
16627/// have a common VF and perform the action for 2 input vectors (including the
16628/// non-undef Base). Other shuffle masks are combined with the result of the
16629/// first stage and processed as a shuffle of 2 elements.
16630/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
16631/// the action only for 1 vector with the given mask, if it is not the identity
16632/// mask.
16633/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
16634/// vectors, combining the masks properly between the steps.
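/// For example (a rough sketch): with an undef Base and two masks
/// <0,1,u,u> over a VF=2 input V1 and <u,u,0,1> over a VF=2 input V2, the
/// masks are merged into the two-source mask <0,1,2,3> (indices >= 2 select
/// from V2) and the Action callback is invoked once for {V1, V2}.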
16635template <typename T>
16636static T *performExtractsShuffleAction(
16637 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16638 function_ref<unsigned(T *)> GetVF,
16639 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16640 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16641 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16642 SmallVector<int> Mask(ShuffleMask.begin()->second);
16643 auto VMIt = std::next(ShuffleMask.begin());
16644 T *Prev = nullptr;
16645 SmallBitVector UseMask =
16646 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16647 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
16648 if (!IsBaseUndef.all()) {
16649 // Base is not undef, need to combine it with the next subvectors.
16650 std::pair<T *, bool> Res =
16651 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16652 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
16653 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16654 if (Mask[Idx] == PoisonMaskElem)
16655 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16656 else
16657 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16658 }
16659 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16660 assert((!V || GetVF(V) == Mask.size()) &&
16661 "Expected base vector of VF number of elements.");
16662 Prev = Action(Mask, {nullptr, Res.first});
16663 } else if (ShuffleMask.size() == 1) {
16664 // Base is undef and only 1 vector is shuffled - perform the action only for
16665 // single vector, if the mask is not the identity mask.
16666 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16667 /*ForSingleMask=*/true);
16668 if (Res.second)
16669 // Identity mask is found.
16670 Prev = Res.first;
16671 else
16672 Prev = Action(Mask, {ShuffleMask.begin()->first});
16673 } else {
16674 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
16675 // shuffles step by step, combining shuffle between the steps.
16676 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16677 unsigned Vec2VF = GetVF(VMIt->first);
16678 if (Vec1VF == Vec2VF) {
16679 // No need to resize the input vectors since they are of the same size, we
16680 // can shuffle them directly.
16681 ArrayRef<int> SecMask = VMIt->second;
16682 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16683 if (SecMask[I] != PoisonMaskElem) {
16684 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16685 Mask[I] = SecMask[I] + Vec1VF;
16686 }
16687 }
16688 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16689 } else {
16690 // Vectors of different sizes - resize and reshuffle.
16691 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16692 /*ForSingleMask=*/false);
16693 std::pair<T *, bool> Res2 =
16694 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16695 ArrayRef<int> SecMask = VMIt->second;
16696 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16697 if (Mask[I] != PoisonMaskElem) {
16698 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16699 if (Res1.second)
16700 Mask[I] = I;
16701 } else if (SecMask[I] != PoisonMaskElem) {
16702 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16703 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16704 }
16705 }
16706 Prev = Action(Mask, {Res1.first, Res2.first});
16707 }
16708 VMIt = std::next(VMIt);
16709 }
16710 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16711 // Perform requested actions for the remaining masks/vectors.
16712 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16713 // Shuffle other input vectors, if any.
16714 std::pair<T *, bool> Res =
16715 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16716 ArrayRef<int> SecMask = VMIt->second;
16717 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16718 if (SecMask[I] != PoisonMaskElem) {
16719 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16720 "Multiple uses of scalars.");
16721 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16722 } else if (Mask[I] != PoisonMaskElem) {
16723 Mask[I] = I;
16724 }
16725 }
16726 Prev = Action(Mask, {Prev, Res.first});
16727 }
16728 return Prev;
16729}
16730
16732 ArrayRef<Value *> VectorizedVals) {
16734 SmallPtrSet<Value *, 4> CheckedExtracts;
16735 SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
16736 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16737 << VectorizableTree.size() << ".\n");
16738 InstructionCost Cost = 0;
16739 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16740 TreeEntry &TE = *Ptr;
16741 // No need to count the cost for combined entries; they are combined into
16742 // other nodes, so just skip their cost.
16743 if (TE.State == TreeEntry::CombinedVectorize) {
16744 LLVM_DEBUG(
16745 dbgs() << "SLP: Skipping cost for combined node that starts with "
16746 << *TE.Scalars[0] << ".\n";
16747 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16748 NodesCosts.try_emplace(&TE);
16749 continue;
16750 }
16751 if (TE.hasState() &&
16752 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16753 if (const TreeEntry *E =
16754 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16755 E && E->getVectorFactor() == TE.getVectorFactor()) {
16756 // Some gather nodes might be exactly the same as some vectorizable
16757 // nodes after reordering; need to handle it.
16758 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16759 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16760 << "SLP: Current total cost = " << Cost << "\n");
16761 NodesCosts.try_emplace(&TE);
16762 continue;
16763 }
16764 }
16765
16766 // Exclude cost of gather loads nodes which are not used. These nodes were
16767 // built as part of the final attempt to vectorize gathered loads.
16768 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16769 "Expected gather nodes with users only.");
16770
16771 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16772 Cost += C;
16773 NodesCosts.try_emplace(&TE, C);
16774 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16775 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16776 << "SLP: Current total cost = " << Cost << "\n");
16777 // Add gathered loads nodes to the set for later processing.
16778 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
16779 TE.getOpcode() == Instruction::Load)
16780 GatheredLoadsNodes.insert(&TE);
16781 }
16782 // Bail out if the cost threshold is negative and the cost is already below it.
16783 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
16784 Cost < -SLPCostThreshold)
16785 return Cost;
16786 // A narrow, non-profitable tree in a loop? Skip it, it may cause regressions.
16787 constexpr unsigned PartLimit = 2;
16788 const unsigned Sz =
16789 getVectorElementSize(VectorizableTree.front()->Scalars.front());
16790 const unsigned MinVF = getMinVF(Sz);
16791 if (Cost >= -SLPCostThreshold &&
16792 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
16793 (!VectorizableTree.front()->hasState() ||
16794 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
16795 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
16796 return Cost;
16797 SmallVector<std::pair<InstructionCost, SmallVector<unsigned>>> SubtreeCosts(
16798 VectorizableTree.size());
16799 auto UpdateParentNodes =
16800 [&](const TreeEntry *UserTE, const TreeEntry *TE, InstructionCost C,
16801 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>>
16802 &VisitedUser,
16803 bool AddToList = true) {
16804 while (UserTE &&
16805 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
16806 SubtreeCosts[UserTE->Idx].first += C;
16807 if (AddToList)
16808 SubtreeCosts[UserTE->Idx].second.push_back(TE->Idx);
16809 UserTE = UserTE->UserTreeIndex.UserTE;
16810 }
16811 };
16812 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16813 TreeEntry &TE = *Ptr;
16814 InstructionCost C = NodesCosts.at(&TE);
16815 SubtreeCosts[TE.Idx].first += C;
16816 if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
16817 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>>
16818 VisitedUser;
16819 UpdateParentNodes(UserTE, &TE, C, VisitedUser);
16820 }
16821 }
16822 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>> Visited;
16823 for (TreeEntry *TE : GatheredLoadsNodes) {
16824 InstructionCost C = SubtreeCosts[TE->Idx].first;
16825 for (Value *V : TE->Scalars) {
16826 for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
16827 UpdateParentNodes(BVTE, TE, C, Visited, /*AddToList=*/false);
16828 }
16829 }
16830 Visited.clear();
16831 using CostIndicesTy =
16832 std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
16833 struct FirstGreater {
16834 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
16835 return LHS.second.first < RHS.second.first ||
16836 (LHS.second.first == RHS.second.first &&
16837 LHS.first->Idx < RHS.first->Idx);
16838 }
16839 };
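  // Process subtrees in decreasing order of accumulated cost (FirstGreater
  // makes the priority queue a max-heap on the subtree cost), so the most
  // expensive candidates are considered for trimming first.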
16840 std::priority_queue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
16841 Worklist;
16842 for (const auto [Idx, P] : enumerate(SubtreeCosts))
16843 Worklist.emplace(VectorizableTree[Idx].get(), P);
16844
16845 // Narrow store trees with non-profitable immediate values - exit.
16846 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
16847 VectorizableTree.front()->hasState() &&
16848 VectorizableTree.front()->getOpcode() == Instruction::Store &&
16849 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
16850 return Cost;
16851
16853 bool Changed = false;
16854 while (!Worklist.empty() && Worklist.top().second.first > 0) {
16855 TreeEntry *TE = Worklist.top().first;
16856 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
16857 // Exit early if the parent node is a split node and any of the scalars
16858 // is used in other split nodes.
16859 (TE->UserTreeIndex &&
16860 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
16861 any_of(TE->Scalars, [&](Value *V) {
16862 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
16863 return Entries.size() > 1;
16864 }))) {
16865 Worklist.pop();
16866 continue;
16867 }
16868
16869 // Calculate the gather cost of the root node.
16870 InstructionCost SubtreeCost = Worklist.top().second.first;
16871 if (SubtreeCost < TE->Scalars.size()) {
16872 Worklist.pop();
16873 continue;
16874 }
16875 if (!TransformedToGatherNodes.empty()) {
16876 for (unsigned Idx : Worklist.top().second.second) {
16877 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
16878 if (It != TransformedToGatherNodes.end()) {
16879 SubtreeCost -= SubtreeCosts[Idx].first;
16880 SubtreeCost += It->second;
16881 }
16882 }
16883 }
16884 if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
16885 Worklist.pop();
16886 continue;
16887 }
16888 const unsigned Sz = TE->Scalars.size();
16889 APInt DemandedElts = APInt::getAllOnes(Sz);
16890 for (auto [Idx, V] : enumerate(TE->Scalars)) {
16891 if (isConstant(V))
16892 DemandedElts.clearBit(Idx);
16893 }
16894
16895 Type *ScalarTy = getValueType(TE->Scalars.front());
16896 auto *VecTy = getWidenedType(ScalarTy, Sz);
16897 const unsigned EntryVF = TE->getVectorFactor();
16898 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
16899 InstructionCost GatherCost = ::getScalarizationOverhead(
16900 *TTI, ScalarTy, VecTy, DemandedElts,
16901 /*Insert=*/true, /*Extract=*/false, CostKind);
16902 SmallVector<int> Mask;
16903 if (!TE->ReorderIndices.empty() &&
16904 TE->State != TreeEntry::CompressVectorize &&
16905 (TE->State != TreeEntry::StridedVectorize ||
16906 !isReverseOrder(TE->ReorderIndices))) {
16907 SmallVector<int> NewMask;
16908 if (TE->getOpcode() == Instruction::Store) {
16909 // For stores the order is actually a mask.
16910 NewMask.resize(TE->ReorderIndices.size());
16911 copy(TE->ReorderIndices, NewMask.begin());
16912 } else {
16913 inversePermutation(TE->ReorderIndices, NewMask);
16914 }
16915 ::addMask(Mask, NewMask);
16916 }
16917 if (!TE->ReuseShuffleIndices.empty())
16918 ::addMask(Mask, TE->ReuseShuffleIndices);
16919 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
16920 GatherCost +=
16921 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
16922 // If all scalars are reused in gather node(s) or other vector nodes, there
16923 // might be extra cost for inserting them.
16924 if (all_of(TE->Scalars, [&](Value *V) {
16925 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
16926 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
16927 }))
16928 GatherCost *= 2;
16929 // Erase subtree if it is non-profitable.
16930 if (SubtreeCost > GatherCost) {
16931 // If the remaining tree is just a buildvector - exit, it will cause
16932 // endless attempts to vectorize.
16933 if (VectorizableTree.front()->hasState() &&
16934 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16935 TE->Idx == 1)
16937
16938 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
16939 << TE->Idx << " with cost "
16940 << Worklist.top().second.first << " and gather cost "
16941 << GatherCost << ".\n");
16942 if (TE->UserTreeIndex) {
16943 TransformedToGatherNodes.try_emplace(TE, GatherCost);
16944 NodesCosts.erase(TE);
16945 } else {
16946 DeletedNodes.insert(TE);
16947 TransformedToGatherNodes.erase(TE);
16948 NodesCosts.erase(TE);
16949 }
16950 for (unsigned Idx : Worklist.top().second.second) {
16951 TreeEntry &ChildTE = *VectorizableTree[Idx];
16952 DeletedNodes.insert(&ChildTE);
16953 TransformedToGatherNodes.erase(&ChildTE);
16954 NodesCosts.erase(&ChildTE);
16955 }
16956 Changed = true;
16957 }
16958 Worklist.pop();
16959 }
16960 if (!Changed)
16961 return SubtreeCosts.front().first;
16962
16963 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
16964 InstructionCost LoadsExtractsCost = 0;
16965 // Check if all loads of gathered loads nodes are marked for deletion. In this
16966 // case the whole gathered loads subtree must be deleted.
16967 // Also, try to account for extracts, which might be required if only part of
16968 // the gathered loads must be vectorized. Keep partially vectorized nodes if
16969 // extracts are cheaper than gathers.
16970 for (TreeEntry *TE : GatheredLoadsNodes) {
16971 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
16972 continue;
16973 GatheredLoadsToDelete.insert(TE);
16974 APInt DemandedElts = APInt::getZero(TE->getVectorFactor());
16975 // If all loads were removed from the gather nodes, the subtree must be deleted.
16976 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
16977 for (Value *V : TE->Scalars) {
16978 unsigned Pos = TE->findLaneForValue(V);
16979 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16980 if (DeletedNodes.contains(BVE))
16981 continue;
16982 DemandedElts.setBit(Pos);
16983 ValuesToInsert.try_emplace(BVE).first->second.push_back(V);
16984 }
16985 }
16986 if (!DemandedElts.isZero()) {
16987 Type *ScalarTy = TE->Scalars.front()->getType();
16988 auto *VecTy = getWidenedType(ScalarTy, TE->getVectorFactor());
16989 InstructionCost ExtractsCost = ::getScalarizationOverhead(
16990 *TTI, ScalarTy, VecTy, DemandedElts,
16991 /*Insert=*/false, /*Extract=*/true, CostKind);
16992 InstructionCost BVCost = 0;
16993 for (const auto &[BVE, Values] : ValuesToInsert) {
16994 APInt BVDemandedElts = APInt::getZero(BVE->getVectorFactor());
16995 SmallVector<Value *> BVValues(BVE->getVectorFactor(),
16996 PoisonValue::get(ScalarTy));
16997 for (Value *V : Values) {
16998 unsigned Pos = BVE->findLaneForValue(V);
16999 BVValues[Pos] = V;
17000 BVDemandedElts.setBit(Pos);
17001 }
17002 auto *BVVecTy = getWidenedType(ScalarTy, BVE->getVectorFactor());
17003 BVCost += ::getScalarizationOverhead(
17004 *TTI, ScalarTy, BVVecTy, BVDemandedElts,
17005 /*Insert=*/true, /*Extract=*/false, CostKind,
17006 BVDemandedElts.isAllOnes(), BVValues);
17007 }
17008 if (ExtractsCost < BVCost) {
17009 LoadsExtractsCost += ExtractsCost;
17010 GatheredLoadsToDelete.erase(TE);
17011 continue;
17012 }
17013 LoadsExtractsCost += BVCost;
17014 }
17015 NodesCosts.erase(TE);
17016 }
17017
17018 // Delete all subtrees rooted at gathered loads nodes.
17019 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17020 if (TE->UserTreeIndex &&
17021 GatheredLoadsToDelete.contains(TE->UserTreeIndex.UserTE)) {
17022 DeletedNodes.insert(TE.get());
17023 NodesCosts.erase(TE.get());
17024 GatheredLoadsToDelete.insert(TE.get());
17025 }
17026 }
17027
17028 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17029 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
17030 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
17031 continue;
17032 }
17033 if (DeletedNodes.contains(TE.get()))
17034 continue;
17035 if (!NodesCosts.contains(TE.get())) {
17036 InstructionCost C =
17037 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
17038 NodesCosts.try_emplace(TE.get(), C);
17039 }
17040 }
17041
17042 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
17043 InstructionCost NewCost = 0;
17044 for (const auto &P : NodesCosts) {
17045 NewCost += P.second;
17046 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
17047 << shortBundleName(P.first->Scalars, P.first->Idx)
17048 << ".\n"
17049 << "SLP: Current total cost = " << Cost << "\n");
17050 }
17051 if (NewCost + LoadsExtractsCost >= Cost) {
17052 DeletedNodes.clear();
17053 TransformedToGatherNodes.clear();
17054 NewCost = Cost;
17055 }
17056 return NewCost;
17057}
17058
17059namespace {
17060/// Data type for handling buildvector sequences with the reused scalars from
17061/// other tree entries.
17062template <typename T> struct ShuffledInsertData {
17063 /// List of insertelements to be replaced by shuffles.
17064 SmallVector<InsertElementInst *> InsertElements;
17065 /// The parent vectors and shuffle mask for the given list of inserts.
17066 SmallMapVector<T, SmallVector<int>, 4> ValueMasks;
17067};
17068} // namespace
17069
17071 ArrayRef<Value *> VectorizedVals,
17072 InstructionCost ReductionCost) {
17073 InstructionCost Cost = TreeCost + ReductionCost;
17074
17075 if (Cost >= -SLPCostThreshold &&
17076 none_of(ExternalUses, [](const ExternalUser &EU) {
17077 return isa_and_nonnull<InsertElementInst>(EU.User);
17078 }))
17079 return Cost;
17080
17081 SmallPtrSet<Value *, 16> ExtractCostCalculated;
17082 InstructionCost ExtractCost = 0;
17083 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
17084 SmallVector<APInt> DemandedElts;
17085 SmallDenseSet<Value *, 4> UsedInserts;
17086 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
17087 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
17088 DenseMap<const TreeEntry *, DenseSet<Instruction *>> ExtractsCount;
17089 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
17090 // Keep track {Scalar, Index, User} tuple.
17091 // On AArch64, this helps in fusing a mov instruction, associated with
17092 // extractelement, with fmul in the backend so that extractelement is free.
17093 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
17094 for (ExternalUser &EU : ExternalUses) {
17095 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
17096 }
17097 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
17098 for (ExternalUser &EU : ExternalUses) {
17099 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
17100 << EU.E.Idx << " in lane " << EU.Lane << "\n");
17101 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
17102 else dbgs() << " User: nullptr\n");
17103 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
17104
17105 // Uses by ephemeral values are free (because the ephemeral value will be
17106 // removed prior to code generation, and so the extraction will be
17107 // removed as well).
17108 if (EphValues.count(EU.User))
17109 continue;
17110
17111 // Check if the scalar for the given user or all users is accounted already.
17112 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
17113 (EU.User &&
17114 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
17115 continue;
17116
17117 // Skip users in unreachable blocks, in EH pads (rarely executed), or in
17118 // blocks terminated with an unreachable instruction.
17119 if (BasicBlock *UserParent =
17120 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
17121 UserParent &&
17122 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
17123 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
17124 continue;
17125
17126 // We only add extract cost once for the same scalar.
17127 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
17128 !ExtractCostCalculated.insert(EU.Scalar).second)
17129 continue;
17130
17131 // No extract cost for vector "scalar" if REVEC is disabled
17132 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
17133 continue;
17134
17135 // If the found user is an insertelement, do not calculate extract cost but
17136 // try to detect it as a final shuffled/identity match.
17137 // TODO: what if a user is insertvalue when REVEC is enabled?
17138 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
17139 VU && VU->getOperand(1) == EU.Scalar) {
17140 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
17141 if (!UsedInserts.insert(VU).second)
17142 continue;
17143 std::optional<unsigned> InsertIdx = getElementIndex(VU);
17144 if (InsertIdx) {
17145 const TreeEntry *ScalarTE = &EU.E;
17146 auto *It = find_if(
17147 ShuffledInserts,
17148 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
17149 // Checks if 2 insertelements are from the same buildvector.
17150 InsertElementInst *VecInsert = Data.InsertElements.front();
17152 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
17153 Value *Op0 = II->getOperand(0);
17154 if (isVectorized(II) && !isVectorized(Op0))
17155 return nullptr;
17156 return Op0;
17157 });
17158 });
17159 int VecId = -1;
17160 if (It == ShuffledInserts.end()) {
17161 auto &Data = ShuffledInserts.emplace_back();
17162 Data.InsertElements.emplace_back(VU);
17163 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
17164 VecId = ShuffledInserts.size() - 1;
17165 auto It = MinBWs.find(ScalarTE);
17166 if (It != MinBWs.end() &&
17167 VectorCasts
17168 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
17169 .second) {
17170 unsigned BWSz = It->second.first;
17171 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
17172 unsigned VecOpcode;
17173 if (DstBWSz < BWSz)
17174 VecOpcode = Instruction::Trunc;
17175 else
17176 VecOpcode =
17177 It->second.second ? Instruction::SExt : Instruction::ZExt;
17179 InstructionCost C = TTI->getCastInstrCost(
17180 VecOpcode, FTy,
17181 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
17182 FTy->getNumElements()),
17184 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17185 << " for extending externally used vector with "
17186 "non-equal minimum bitwidth.\n");
17187 Cost += C;
17188 }
17189 } else {
17190 if (isFirstInsertElement(VU, It->InsertElements.front()))
17191 It->InsertElements.front() = VU;
17192 VecId = std::distance(ShuffledInserts.begin(), It);
17193 }
17194 int InIdx = *InsertIdx;
17195 SmallVectorImpl<int> &Mask =
17196 ShuffledInserts[VecId].ValueMasks[ScalarTE];
17197 if (Mask.empty())
17198 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
17199 Mask[InIdx] = EU.Lane;
17200 DemandedElts[VecId].setBit(InIdx);
17201 continue;
17202 }
17203 }
17204 }
17205
17207 // If we plan to rewrite the tree in a smaller type, we will need to sign
17208 // extend the extracted value back to the original type. Here, we account
17209 // for the extract and the added cost of the sign extend if needed.
17210 InstructionCost ExtraCost = TTI::TCC_Free;
17211 auto *ScalarTy = EU.Scalar->getType();
17212 const unsigned BundleWidth = EU.E.getVectorFactor();
17213 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
17214 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
17215 const TreeEntry *Entry = &EU.E;
17216 auto It = MinBWs.find(Entry);
17217 if (It != MinBWs.end()) {
17218 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
17219 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
17220 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
17221 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
17222 ? Instruction::ZExt
17223 : Instruction::SExt;
17224 VecTy = getWidenedType(MinTy, BundleWidth);
17225 ExtraCost =
17226 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
17227 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
17228 << ExtraCost << "\n");
17229 } else {
17230 ExtraCost =
17231 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
17232 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
17233 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
17234 << *VecTy << ": " << ExtraCost << "\n");
17235 }
17236 // Leave the scalar instructions as is if they are cheaper than extracts.
17237 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
17238 Entry->getOpcode() == Instruction::Load) {
17239 // Checks if the user of the external scalar is phi in loop body.
17240 auto IsPhiInLoop = [&](const ExternalUser &U) {
17241 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
17242 auto *I = cast<Instruction>(U.Scalar);
17243 const Loop *L = LI->getLoopFor(Phi->getParent());
17244 return L && (Phi->getParent() == I->getParent() ||
17245 L == LI->getLoopFor(I->getParent()));
17246 }
17247 return false;
17248 };
17249 if (!ValueToExtUses) {
17250 ValueToExtUses.emplace();
17251 for (const auto &P : enumerate(ExternalUses)) {
17252 // Ignore phis in loops.
17253 if (IsPhiInLoop(P.value()))
17254 continue;
17255
17256 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
17257 }
17258 }
17259 // The original instruction can be used if none of its operands are
17260 // vectorized or they are already marked as externally used.
17261 auto *Inst = cast<Instruction>(EU.Scalar);
17262 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
17263 auto OperandIsScalar = [&](Value *V) {
17264 if (!isVectorized(V)) {
17265 // Some extractelements might not be vectorized but instead
17266 // transformed into a shuffle and removed from the function;
17267 // consider that here.
17268 if (auto *EE = dyn_cast<ExtractElementInst>(V))
17269 return !EE->hasOneUse() || !MustGather.contains(EE);
17270 return true;
17271 }
17272 return ValueToExtUses->contains(V);
17273 };
17274 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
17275 bool CanBeUsedAsScalarCast = false;
17276 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
17277 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
17278 Op && all_of(Op->operands(), OperandIsScalar)) {
17279 InstructionCost OpCost =
17280 (isVectorized(Op) && !ValueToExtUses->contains(Op))
17281 ? TTI->getInstructionCost(Op, CostKind)
17282 : 0;
17283 if (ScalarCost + OpCost <= ExtraCost) {
17284 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
17285 ScalarCost += OpCost;
17286 }
17287 }
17288 }
17289 if (CanBeUsedAsScalar) {
17290 bool KeepScalar = ScalarCost <= ExtraCost;
17291 // Try to keep the original scalar if the user is a phi node from the same
17292 // block as the root phis currently being vectorized. This preserves better
17293 // ordering info for the PHIs being vectorized.
17294 bool IsProfitablePHIUser =
17295 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
17296 VectorizableTree.front()->Scalars.size() > 2)) &&
17297 VectorizableTree.front()->hasState() &&
17298 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
17299 !Inst->hasNUsesOrMore(UsesLimit) &&
17300 none_of(Inst->users(),
17301 [&](User *U) {
17302 auto *PHIUser = dyn_cast<PHINode>(U);
17303 return (!PHIUser ||
17304 PHIUser->getParent() !=
17305 cast<Instruction>(
17306 VectorizableTree.front()->getMainOp())
17307 ->getParent()) &&
17308 !isVectorized(U);
17309 }) &&
17310 count_if(Entry->Scalars, [&](Value *V) {
17311 return ValueToExtUses->contains(V);
17312 }) <= 2;
17313 if (IsProfitablePHIUser) {
17314 KeepScalar = true;
17315 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
17316 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
17317 (!GatheredLoadsEntriesFirst.has_value() ||
17318 Entry->Idx < *GatheredLoadsEntriesFirst)) {
17319 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
17320 return ValueToExtUses->contains(V);
17321 });
17322 auto It = ExtractsCount.find(Entry);
17323 if (It != ExtractsCount.end()) {
17324 assert(ScalarUsesCount >= It->getSecond().size() &&
17325 "Expected total number of external uses not less than "
17326 "number of scalar uses.");
17327 ScalarUsesCount -= It->getSecond().size();
17328 }
17329 // Keep the original scalar if the number of externally used instructions
17330 // in the same entry is not a power of 2. It may enable some extra
17331 // vectorization for now.
17332 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
17333 }
17334 if (KeepScalar) {
17335 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
17336 for (Value *V : Inst->operands()) {
17337 auto It = ValueToExtUses->find(V);
17338 if (It != ValueToExtUses->end()) {
17339 // Replace all uses to avoid compiler crash.
17340 ExternalUses[It->second].User = nullptr;
17341 }
17342 }
17343 ExtraCost = ScalarCost;
17344 if (!IsPhiInLoop(EU))
17345 ExtractsCount[Entry].insert(Inst);
17346 if (CanBeUsedAsScalarCast) {
17347 ScalarOpsFromCasts.insert(Inst->getOperand(0));
17348 // Update the users of the operands of the cast operand to avoid
17349 // compiler crash.
17350 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
17351 for (Value *V : IOp->operands()) {
17352 auto It = ValueToExtUses->find(V);
17353 if (It != ValueToExtUses->end()) {
17354 // Replace all uses to avoid compiler crash.
17355 ExternalUses[It->second].User = nullptr;
17356 }
17357 }
17358 }
17359 }
17360 }
17361 }
17362 }
17363
17364 ExtractCost += ExtraCost;
17365 }
17366 // Insert external uses for the operands of casts that are to be emitted as
17367 // scalars instead of extractelement.
17368 for (Value *V : ScalarOpsFromCasts) {
17369 ExternalUsesAsOriginalScalar.insert(V);
17370 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
17371 const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
17372 return TransformedToGatherNodes.contains(TE) ||
17373 DeletedNodes.contains(TE);
17374 });
17375 if (It != TEs.end()) {
17376 const TreeEntry *UserTE = *It;
17377 ExternalUses.emplace_back(V, nullptr, *UserTE,
17378 UserTE->findLaneForValue(V));
17379 }
17380 }
17381 }
17382 // Add reduced value cost, if resized.
17383 if (!VectorizedVals.empty()) {
17384 const TreeEntry &Root = *VectorizableTree.front();
17385 auto BWIt = MinBWs.find(&Root);
17386 if (BWIt != MinBWs.end()) {
17387 Type *DstTy = Root.Scalars.front()->getType();
17388 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
17389 unsigned SrcSz =
17390 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17391 if (OriginalSz != SrcSz) {
17392 unsigned Opcode = Instruction::Trunc;
17393 if (OriginalSz > SrcSz)
17394 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17395 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
17396 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
17397 assert(SLPReVec && "Only supported by REVEC.");
17398 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
17399 }
17400 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
17401 TTI::CastContextHint::None,
17402 TTI::TCK_RecipThroughput);
17403
17404 }
17405 }
17406
17407 // A buildvector with externally used scalars, which should remain as scalars,
17408 // should not be vectorized; otherwise the compiler may hang.
17409 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
17410 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
17411 VectorizableTree[1]->hasState() &&
17412 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17413 all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
17414 return ExternalUsesAsOriginalScalar.contains(V);
17415 }))
17416 return InstructionCost::getInvalid();
17417
17418 Cost += ExtractCost;
17419 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
17420 bool ForSingleMask) {
17421 InstructionCost C = 0;
17422 unsigned VF = Mask.size();
17423 unsigned VecVF = TE->getVectorFactor();
17424 bool HasLargeIndex =
17425 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
17426 if ((VF != VecVF && HasLargeIndex) ||
17428
17429 if (HasLargeIndex) {
17430 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
17431 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
17432 OrigMask.begin());
17433 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
17434 getWidenedType(TE->getMainOp()->getType(), VecVF),
17435 OrigMask);
17436 LLVM_DEBUG(
17437 dbgs() << "SLP: Adding cost " << C
17438 << " for final shuffle of insertelement external users.\n";
17439 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17440 Cost += C;
17441 return std::make_pair(TE, true);
17442 }
17443
17444 if (!ForSingleMask) {
17445 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
17446 for (unsigned I = 0; I < VF; ++I) {
17447 if (Mask[I] != PoisonMaskElem)
17448 ResizeMask[Mask[I]] = Mask[I];
17449 }
17450 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
17451 C = ::getShuffleCost(
17452 *TTI, TTI::SK_PermuteSingleSrc,
17453 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
17454 LLVM_DEBUG(
17455 dbgs() << "SLP: Adding cost " << C
17456 << " for final shuffle of insertelement external users.\n";
17457 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17458
17459 Cost += C;
17460 }
17461 }
17462 return std::make_pair(TE, false);
17463 };
17464 // Calculate the cost of the reshuffled vectors, if any.
17465 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
17466 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
17467 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
17468 unsigned VF = 0;
17469 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
17471 assert((TEs.size() == 1 || TEs.size() == 2) &&
17472 "Expected exactly 1 or 2 tree entries.");
17473 if (TEs.size() == 1) {
17474 if (VF == 0)
17475 VF = TEs.front()->getVectorFactor();
17476 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17477 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
17478 !all_of(enumerate(Mask), [=](const auto &Data) {
17479 return Data.value() == PoisonMaskElem ||
17480 (Data.index() < VF &&
17481 static_cast<int>(Data.index()) == Data.value());
17482 })) {
17483 InstructionCost C =
17484 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
17485 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17486 << " for final shuffle of insertelement "
17487 "external users.\n";
17488 TEs.front()->dump();
17489 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17490 Cost += C;
17491 }
17492 } else {
17493 if (VF == 0) {
17494 if (TEs.front() &&
17495 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17496 VF = TEs.front()->getVectorFactor();
17497 else
17498 VF = Mask.size();
17499 }
17500 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17501 InstructionCost C =
17502 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
17503 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17504 << " for final shuffle of vector node and external "
17505 "insertelement users.\n";
17506 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17507 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17508 Cost += C;
17509 }
17510 VF = Mask.size();
17511 return TEs.back();
17512 };
17513 performExtractsShuffleAction<const TreeEntry>(
17514 MutableArrayRef(Vector.data(), Vector.size()), Base,
17515 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
17516 EstimateShufflesCost);
17517 InstructionCost InsertCost = TTI->getScalarizationOverhead(
17518 cast<FixedVectorType>(
17519 ShuffledInserts[I].InsertElements.front()->getType()),
17520 DemandedElts[I],
17521 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
17522 Cost -= InsertCost;
17523 }
17524
17525 // Add the cost for reduced value resize (if required).
17526 if (ReductionBitWidth != 0) {
17527 assert(UserIgnoreList && "Expected reduction tree.");
17528 const TreeEntry &E = *VectorizableTree.front();
17529 auto It = MinBWs.find(&E);
17530 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
17531 unsigned SrcSize = It->second.first;
17532 unsigned DstSize = ReductionBitWidth;
17533 unsigned Opcode = Instruction::Trunc;
17534 if (SrcSize < DstSize) {
17535 bool IsArithmeticExtendedReduction =
17536 all_of(*UserIgnoreList, [](Value *V) {
17537 auto *I = cast<Instruction>(V);
17538 return is_contained({Instruction::Add, Instruction::FAdd,
17539 Instruction::Mul, Instruction::FMul,
17540 Instruction::And, Instruction::Or,
17541 Instruction::Xor},
17542 I->getOpcode());
17543 });
17544 if (IsArithmeticExtendedReduction)
17545 Opcode =
17546 Instruction::BitCast; // Handle it by getExtendedReductionCost
17547 else
17548 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17549 }
17550 if (Opcode != Instruction::BitCast) {
17551 auto *SrcVecTy =
17552 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
17553 auto *DstVecTy =
17554 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
17555 TTI::CastContextHint CCH = getCastContextHint(E);
17556 InstructionCost CastCost;
17557 switch (E.getOpcode()) {
17558 case Instruction::SExt:
17559 case Instruction::ZExt:
17560 case Instruction::Trunc: {
17561 const TreeEntry *OpTE = getOperandEntry(&E, 0);
17562 CCH = getCastContextHint(*OpTE);
17563 break;
17564 }
17565 default:
17566 break;
17567 }
17568 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
17569 CostKind);
17570 Cost += CastCost;
17571 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
17572 << " for final resize for reduction from " << SrcVecTy
17573 << " to " << DstVecTy << "\n";
17574 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17575 }
17576 }
17577 }
17578
17579 std::optional<InstructionCost> SpillCost;
17580 if (Cost < -SLPCostThreshold) {
17581 SpillCost = getSpillCost();
17582 Cost += *SpillCost;
17583 }
17584#ifndef NDEBUG
17585 SmallString<256> Str;
17586 {
17587 raw_svector_ostream OS(Str);
17588 OS << "SLP: Spill Cost = ";
17589 if (SpillCost)
17590 OS << *SpillCost;
17591 else
17592 OS << "<skipped>";
17593 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
17594 << "SLP: Total Cost = " << Cost << ".\n";
17595 }
17596 LLVM_DEBUG(dbgs() << Str);
17597 if (ViewSLPTree)
17598 ViewGraph(this, "SLP" + F->getName(), false, Str);
17599#endif
17600
17601 return Cost;
17602}
17603
17604/// Tries to find extractelement instructions with constant indices from a fixed
17605/// vector type and gather such instructions into a bunch, which is highly likely
17606/// to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17607/// successful, the matched scalars are replaced by poison values in \p VL for
17608/// future analysis.
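/// For example, if \p VL is {extractelement %v, i32 0, ..., extractelement %v,
/// i32 3} with all elements taken from the same fixed vector %v, the whole
/// gather can be modeled as a single-source shuffle of %v and every matched
/// scalar in \p VL is replaced by poison.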
17609std::optional<TTI::ShuffleKind>
17610BoUpSLP::tryToGatherSingleRegisterExtractElements(
17612 // Scan list of gathered scalars for extractelements that can be represented
17613 // as shuffles.
17614 SmallMapVector<Value *, SmallVector<int>, 8> VectorOpToIdx;
17615 SmallVector<int> UndefVectorExtracts;
17616 for (int I = 0, E = VL.size(); I < E; ++I) {
17617 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17618 if (!EI) {
17619 if (isa<UndefValue>(VL[I]))
17620 UndefVectorExtracts.push_back(I);
17621 continue;
17622 }
17623 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
17624 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
17625 continue;
17626 std::optional<unsigned> Idx = getExtractIndex(EI);
17627 // Undefined index.
17628 if (!Idx) {
17629 UndefVectorExtracts.push_back(I);
17630 continue;
17631 }
17632 if (Idx >= VecTy->getNumElements()) {
17633 UndefVectorExtracts.push_back(I);
17634 continue;
17635 }
17636 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
17637 ExtractMask.reset(*Idx);
17638 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
17639 UndefVectorExtracts.push_back(I);
17640 continue;
17641 }
17642 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
17643 }
17644 // Sort the vector operands by the maximum number of uses in extractelements.
17645 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
17646 VectorOpToIdx.takeVector();
17647 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
17648 return P1.second.size() > P2.second.size();
17649 });
17650 // Find the best pair of the vectors or a single vector.
17651 const int UndefSz = UndefVectorExtracts.size();
17652 unsigned SingleMax = 0;
17653 unsigned PairMax = 0;
17654 if (!Vectors.empty()) {
17655 SingleMax = Vectors.front().second.size() + UndefSz;
17656 if (Vectors.size() > 1) {
17657 auto *ItNext = std::next(Vectors.begin());
17658 PairMax = SingleMax + ItNext->second.size();
17659 }
17660 }
17661 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
17662 return std::nullopt;
17663 // Check if better to perform a shuffle of 2 vectors or just of a single
17664 // vector.
17665 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
17666 SmallVector<Value *> GatheredExtracts(
17667 VL.size(), PoisonValue::get(VL.front()->getType()));
17668 if (SingleMax >= PairMax && SingleMax) {
17669 for (int Idx : Vectors.front().second)
17670 std::swap(GatheredExtracts[Idx], VL[Idx]);
17671 } else if (!Vectors.empty()) {
17672 for (unsigned Idx : {0, 1})
17673 for (int Idx : Vectors[Idx].second)
17674 std::swap(GatheredExtracts[Idx], VL[Idx]);
17675 }
17676 // Add extracts from undefs too.
17677 for (int Idx : UndefVectorExtracts)
17678 std::swap(GatheredExtracts[Idx], VL[Idx]);
17679 // Check that the gather of extractelements can be represented as just a
17680 // shuffle of the one or two vectors the scalars are extracted from.
17681 std::optional<TTI::ShuffleKind> Res =
17682 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
17683 if (!Res || all_of(Mask, equal_to(PoisonMaskElem))) {
17684 // TODO: try to check other subsets if possible.
17685 // Restore the original VL if attempt was not successful.
17686 copy(SavedVL, VL.begin());
17687 return std::nullopt;
17688 }
17689 // Restore unused scalars from mask, if some of the extractelements were not
17690 // selected for shuffle.
17691 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
17692 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
17693 isa<UndefValue>(GatheredExtracts[I])) {
17694 std::swap(VL[I], GatheredExtracts[I]);
17695 continue;
17696 }
17697 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
17698 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
17699 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
17700 is_contained(UndefVectorExtracts, I))
17701 continue;
17702 }
17703 return Res;
17704}
17705
17706/// Tries to find extractelement instructions with constant indices from a fixed
17707/// vector type and gather such instructions into a bunch, which is highly likely
17708/// to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17709/// successful, the matched scalars are replaced by poison values in \p VL for
17710/// future analysis.
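/// Unlike the single-register variant above, this overload splits \p VL into
/// \p NumParts register-sized slices and records a shuffle kind (or
/// std::nullopt) per slice.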
17711SmallVector<std::optional<TTI::ShuffleKind>>
17712 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
17713 SmallVectorImpl<int> &Mask,
17714 unsigned NumParts) const {
17715 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
17716 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
17717 Mask.assign(VL.size(), PoisonMaskElem);
17718 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17719 for (unsigned Part : seq<unsigned>(NumParts)) {
17720 // Scan list of gathered scalars for extractelements that can be represented
17721 // as shuffles.
17722 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
17723 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17724 SmallVector<int> SubMask;
17725 std::optional<TTI::ShuffleKind> Res =
17726 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
17727 ShufflesRes[Part] = Res;
17728 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
17729 }
17730 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
17731 return Res.has_value();
17732 }))
17733 ShufflesRes.clear();
17734 return ShufflesRes;
17735}
17736
17737std::optional<TargetTransformInfo::ShuffleKind>
17738BoUpSLP::isGatherShuffledSingleRegisterEntry(
17739 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
17740 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
17741 Entries.clear();
17742 if (TE->Idx == 0)
17743 return std::nullopt;
17744 // TODO: currently checking only for Scalars in the tree entry, need to count
17745 // reused elements too for better cost estimation.
17746 auto GetUserEntry = [&](const TreeEntry *TE) {
17747 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17748 TE = TE->UserTreeIndex.UserTE;
17749 if (TE == VectorizableTree.front().get())
17750 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
17751 return TE->UserTreeIndex;
17752 };
17753 auto HasGatherUser = [&](const TreeEntry *TE) {
17754 while (TE->Idx != 0 && TE->UserTreeIndex) {
17755 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17756 return true;
17757 TE = TE->UserTreeIndex.UserTE;
17758 }
17759 return false;
17760 };
17761 const EdgeInfo TEUseEI = GetUserEntry(TE);
17762 if (!TEUseEI)
17763 return std::nullopt;
17764 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
17765 const BasicBlock *TEInsertBlock = nullptr;
17766 // Main node of PHI entries keeps the correct order of operands/incoming
17767 // blocks.
17768 if (auto *PHI = dyn_cast_or_null<PHINode>(
17769 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
17770 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
17771 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
17772 TEInsertPt = TEInsertBlock->getTerminator();
17773 } else {
17774 TEInsertBlock = TEInsertPt->getParent();
17775 }
17776 if (!DT->isReachableFromEntry(TEInsertBlock))
17777 return std::nullopt;
17778 auto *NodeUI = DT->getNode(TEInsertBlock);
17779 assert(NodeUI && "Should only process reachable instructions");
17780 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
17781 auto CheckOrdering = [&](const Instruction *InsertPt) {
17782 // Argument InsertPt is an instruction where vector code for some other
17783 // tree entry (one that shares one or more scalars with TE) is going to be
17784 // generated. This lambda returns true if insertion point of vector code
17785 // for the TE dominates that point (otherwise dependency is the other way
17786 // around). The other node is not limited to be of a gather kind. Gather
17787 // nodes are not scheduled and their vector code is inserted before their
17788 // first user. If user is PHI, that is supposed to be at the end of a
17789 // predecessor block. Otherwise it is the last instruction among scalars of
17790 // the user node. So, instead of checking dependency between instructions
17791 // themselves, we check dependency between their insertion points for vector
17792 // code (since each scalar instruction ends up as a lane of a vector
17793 // instruction).
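// Worked example (hypothetical CFG): if the TE insertion point lives in %bb1,
// InsertPt lives in %bb2 and %bb1 dominates %bb2, the lambda returns false:
// the other entry's vector code would be emitted after the point where this
// gather needs it. It returns true only when InsertPt is known to come first.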
17794 const BasicBlock *InsertBlock = InsertPt->getParent();
17795 auto *NodeEUI = DT->getNode(InsertBlock);
17796 if (!NodeEUI)
17797 return false;
17798 assert((NodeUI == NodeEUI) ==
17799 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
17800 "Different nodes should have different DFS numbers");
17801 // Check the order of the gather nodes users.
17802 if (TEInsertPt->getParent() != InsertBlock &&
17803 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
17804 return false;
17805 if (TEInsertPt->getParent() == InsertBlock &&
17806 TEInsertPt->comesBefore(InsertPt))
17807 return false;
17808 return true;
17809 };
17810 // Find all tree entries used by the gathered values. If no common entries
17811 // found - not a shuffle.
17812 // Here we build a set of tree nodes for each gathered value and try to
17813 // find the intersection between these sets. If we have at least one common
17814 // tree node for each gathered value - we have just a permutation of the
17815 // single vector. If we have 2 different sets, we're in a situation where we
17816 // have a permutation of 2 input vectors.
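// Worked example (hypothetical entries): for VL = {a, b, c, d} with a, b
// used in tree entries {E1} and c, d used in {E1, E2}, the intersection is
// {E1}, so VL is just a permutation of E1's vector. If instead a, b map only
// to E1 and c, d only to E2, two source vectors (E1, E2) are recorded.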
17817 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
17818 SmallDenseMap<Value *, int> UsedValuesEntry;
17819 SmallPtrSet<const Value *, 16> VisitedValue;
17820 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
17821 // The node is reused - exit.
17822 if ((TEPtr->getVectorFactor() != VL.size() &&
17823 TEPtr->Scalars.size() != VL.size()) ||
17824 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
17825 return false;
17826 UsedTEs.clear();
17827 UsedTEs.emplace_back().insert(TEPtr);
17828 for (Value *V : VL) {
17829 if (isConstant(V))
17830 continue;
17831 UsedValuesEntry.try_emplace(V, 0);
17832 }
17833 return true;
17834 };
17835 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
17836 unsigned EdgeIdx) {
17837 const TreeEntry *Ptr1 = User1;
17838 const TreeEntry *Ptr2 = User2;
17839 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
17840 while (Ptr2) {
17841 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
17842 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
17843 Ptr2 = Ptr2->UserTreeIndex.UserTE;
17844 }
17845 while (Ptr1) {
17846 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
17847 Ptr1 = Ptr1->UserTreeIndex.UserTE;
17848 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
17849 return Idx < It->second;
17850 }
17851 return false;
17852 };
17853 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
17854 Instruction *InsertPt) {
17855 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
17856 !TEUseEI.UserTE->isCopyableElement(
17857 const_cast<Instruction *>(TEInsertPt)) &&
17858 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17859 InsertPt->getNextNode() == TEInsertPt &&
17860 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
17861 !isUsedOutsideBlock(InsertPt));
17862 };
17863 for (Value *V : VL) {
17864 if (isConstant(V) || !VisitedValue.insert(V).second)
17865 continue;
17866 // Build a list of tree entries where V is used.
17867 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17868 SmallVector<const TreeEntry *> GatherNodes(
17869 ValueToGatherNodes.lookup(V).takeVector());
17870 if (TransformedToGatherNodes.contains(TE)) {
17871 for (TreeEntry *E : getSplitTreeEntries(V)) {
17872 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17873 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17874 continue;
17875 GatherNodes.push_back(E);
17876 }
17877 for (TreeEntry *E : getTreeEntries(V)) {
17878 if (TE == E || !TransformedToGatherNodes.contains(E) ||
17879 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17880 continue;
17881 GatherNodes.push_back(E);
17882 }
17883 }
17884 for (const TreeEntry *TEPtr : GatherNodes) {
17885 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
17886 continue;
17887 assert(any_of(TEPtr->Scalars,
17888 [&](Value *V) { return GatheredScalars.contains(V); }) &&
17889 "Must contain at least single gathered value.");
17890 assert(TEPtr->UserTreeIndex &&
17891 "Expected only single user of a gather node.");
17892 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17893
17894 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17895 UseEI.UserTE->hasState())
17896 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
17897 : nullptr;
17898 Instruction *InsertPt =
17899 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
17900 : &getLastInstructionInBundle(UseEI.UserTE);
17901 if (TEInsertPt == InsertPt) {
17902 // Check nodes, which might be emitted first.
17903 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17904 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17905 TEUseEI.UserTE->isAltShuffle()) &&
17906 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
17907 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17908 (UseEI.UserTE->hasState() &&
17909 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17910 !UseEI.UserTE->isAltShuffle()) ||
17911 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
17912 continue;
17913 }
17914
17915 // If the schedulable insertion point is used in multiple entries - just
17916 // exit, no known ordering at this point, available only after real
17917 // scheduling.
17918 if (!doesNotNeedToBeScheduled(InsertPt) &&
17919 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17920 continue;
17921 // If the users are the PHI nodes with the same incoming blocks - skip.
17922 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17923 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17924 UseEI.UserTE->State == TreeEntry::Vectorize &&
17925 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17926 TEUseEI.UserTE != UseEI.UserTE)
17927 continue;
17928 // If 2 gathers are operands of the same entry (regardless of whether
17929 // user is PHI or else), compare operands indices, use the earlier one
17930 // as the base.
17931 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17932 continue;
17933 // If the user instruction is used for some reason in different
17934 // vectorized nodes - make it depend on index.
17935 if (TEUseEI.UserTE != UseEI.UserTE &&
17936 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17937 HasGatherUser(TEUseEI.UserTE)))
17938 continue;
17939 // If the user node is the operand of the other user node - skip.
17940 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17941 continue;
17942 }
17943
17944 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17945 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17946 UseEI.UserTE->doesNotNeedToSchedule() &&
17947 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
17948 continue;
17949 // Check if the user node of the TE comes after user node of TEPtr,
17950 // otherwise TEPtr depends on TE.
17951 if ((TEInsertBlock != InsertPt->getParent() ||
17952 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17953 (!CheckOrdering(InsertPt) ||
17954 (UseEI.UserTE->hasCopyableElements() &&
17955 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
17956 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
17957 continue;
17958 // The node is reused - exit.
17959 if (CheckAndUseSameNode(TEPtr))
17960 break;
17961 // If the parent node is copyable with its last instruction used outside
17962 // the block, and that instruction is the next instruction after the last
17963 // instruction of TEPtr, exit to preserve the def-use chain.
17964 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17965 continue;
17966 VToTEs.insert(TEPtr);
17967 }
17968 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17969 const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
17970 return MTE != TE && MTE != TEUseEI.UserTE &&
17971 !DeletedNodes.contains(MTE) &&
17972 !TransformedToGatherNodes.contains(MTE);
17973 });
17974 if (It != VTEs.end()) {
17975 const TreeEntry *VTE = *It;
17976 if (none_of(TE->CombinedEntriesWithIndices,
17977 [&](const auto &P) { return P.first == VTE->Idx; })) {
17978 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17979 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17980 continue;
17981 }
17982 // The node is reused - exit.
17983 if (CheckAndUseSameNode(VTE))
17984 break;
17985 VToTEs.insert(VTE);
17986 }
17987 }
17988 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17989 const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
17990 return TE != MainTE && !DeletedNodes.contains(TE) &&
17991 !TransformedToGatherNodes.contains(TE);
17992 });
17993 if (It != VTEs.end()) {
17994 const TreeEntry *VTE = *It;
17995 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17996 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17997 VTEs = VTEs.drop_front();
17998 // Iterate through all vectorized nodes.
17999 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
18000 return MTE->State == TreeEntry::Vectorize;
18001 });
18002 if (MIt == VTEs.end())
18003 continue;
18004 VTE = *MIt;
18005 }
18006 if (none_of(TE->CombinedEntriesWithIndices,
18007 [&](const auto &P) { return P.first == VTE->Idx; })) {
18008 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
18009 if (&LastBundleInst == TEInsertPt ||
18010 !CheckOrdering(&LastBundleInst) ||
18011 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
18012 continue;
18013 }
18014 // The node is reused - exit.
18015 if (CheckAndUseSameNode(VTE))
18016 break;
18017 VToTEs.insert(VTE);
18018 }
18019 }
18020 if (VToTEs.empty())
18021 continue;
18022 if (UsedTEs.empty()) {
18023 // The first iteration, just insert the list of nodes to vector.
18024 UsedTEs.push_back(VToTEs);
18025 UsedValuesEntry.try_emplace(V, 0);
18026 } else {
18027 // Need to check if there are any previously used tree nodes which use V.
18028 // If there are no such nodes, consider that we have one more input
18029 // vector.
18030 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
18031 unsigned Idx = 0;
18032 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
18033 // Do we have a non-empty intersection of previously listed tree entries
18034 // and tree entries using current V?
18035 set_intersect(VToTEs, Set);
18036 if (!VToTEs.empty()) {
18037 // Yes, write the new subset and continue analysis for the next
18038 // scalar.
18039 Set.swap(VToTEs);
18040 break;
18041 }
18042 VToTEs = SavedVToTEs;
18043 ++Idx;
18044 }
18045 // No non-empty intersection found - need to add a second set of possible
18046 // source vectors.
18047 if (Idx == UsedTEs.size()) {
18048 // If the number of input vectors is greater than 2 - not a permutation,
18049 // fallback to the regular gather.
18050 // TODO: support multiple reshuffled nodes.
18051 if (UsedTEs.size() == 2)
18052 continue;
18053 UsedTEs.push_back(SavedVToTEs);
18054 Idx = UsedTEs.size() - 1;
18055 }
18056 UsedValuesEntry.try_emplace(V, Idx);
18057 }
18058 }
18059
18060 if (UsedTEs.empty()) {
18061 Entries.clear();
18062 return std::nullopt;
18063 }
18064
18065 unsigned VF = 0;
18066 if (UsedTEs.size() == 1) {
18067 // Keep the order to avoid non-determinism.
18068 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
18069 UsedTEs.front().end());
18070 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
18071 return TE1->Idx < TE2->Idx;
18072 });
18073 // Try to find the perfect match in another gather node at first.
18074 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
18075 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
18076 });
18077 if (It != FirstEntries.end() &&
18078 ((*It)->getVectorFactor() == VL.size() ||
18079 ((*It)->getVectorFactor() == TE->Scalars.size() &&
18080 TE->ReuseShuffleIndices.size() == VL.size() &&
18081 (*It)->isSame(TE->Scalars)))) {
18082 Entries.push_back(*It);
18083 if ((*It)->getVectorFactor() == VL.size()) {
18084 std::iota(std::next(Mask.begin(), Part * VL.size()),
18085 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
18086 } else {
18087 SmallVector<int> CommonMask = TE->getCommonMask();
18088 copy(CommonMask, Mask.begin());
18089 }
18090 // Clear undef scalars.
18091 for (unsigned I : seq<unsigned>(VL.size()))
18092 if (isa<PoisonValue>(VL[I]))
18093 Mask[Part * VL.size() + I] = PoisonMaskElem;
18094 return TargetTransformInfo::SK_PermuteSingleSrc;
18095 }
18096 // No perfect match, just shuffle, so choose the first tree node from the
18097 // tree.
18098 Entries.push_back(FirstEntries.front());
18099 // Update mapping between values and corresponding tree entries.
18100 for (auto &P : UsedValuesEntry)
18101 P.second = 0;
18102 VF = FirstEntries.front()->getVectorFactor();
18103 } else {
18104 // Try to find nodes with the same vector factor.
18105 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
18106 // Keep the order of tree nodes to avoid non-determinism.
18107 DenseMap<int, const TreeEntry *> VFToTE;
18108 for (const TreeEntry *TE : UsedTEs.front()) {
18109 unsigned VF = TE->getVectorFactor();
18110 auto It = VFToTE.find(VF);
18111 if (It != VFToTE.end()) {
18112 if (It->second->Idx > TE->Idx)
18113 It->getSecond() = TE;
18114 continue;
18115 }
18116 VFToTE.try_emplace(VF, TE);
18117 }
18118 // Same, keep the order to avoid non-determinism.
18119 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
18120 UsedTEs.back().end());
18121 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
18122 return TE1->Idx < TE2->Idx;
18123 });
18124 for (const TreeEntry *TE : SecondEntries) {
18125 auto It = VFToTE.find(TE->getVectorFactor());
18126 if (It != VFToTE.end()) {
18127 VF = It->first;
18128 Entries.push_back(It->second);
18129 Entries.push_back(TE);
18130 break;
18131 }
18132 }
18133 // No 2 source vectors with the same vector factor - just choose 2 with max
18134 // index.
18135 if (Entries.empty()) {
18136 Entries.push_back(*llvm::max_element(
18137 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
18138 return TE1->Idx < TE2->Idx;
18139 }));
18140 Entries.push_back(SecondEntries.front());
18141 VF = std::max(Entries.front()->getVectorFactor(),
18142 Entries.back()->getVectorFactor());
18143 } else {
18144 VF = Entries.front()->getVectorFactor();
18145 }
18146 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
18147 for (const TreeEntry *E : Entries)
18148 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
18149 E->Scalars.end());
18150 // Update mapping between values and corresponding tree entries.
18151 for (auto &P : UsedValuesEntry) {
18152 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
18153 if (ValuesToEntries[Idx].contains(P.first)) {
18154 P.second = Idx;
18155 break;
18156 }
18157 }
18158 }
18159
18160 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
18161 // Checks if the 2 PHIs are compatible, i.e. have a high chance of being
18162 // vectorized together.
18163 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
18164 auto *PHI = cast<PHINode>(V);
18165 auto *PHI1 = cast<PHINode>(V1);
18166 // Check that all incoming values are compatible/from same parent (if they
18167 // are instructions).
18168 // The incoming values are compatible if they all are constants, or
18169 // instructions with the same/alternate opcodes from the same basic block.
18170 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
18171 Value *In = PHI->getIncomingValue(I);
18172 Value *In1 = PHI1->getIncomingValue(I);
18173 if (isConstant(In) && isConstant(In1))
18174 continue;
18175 if (!getSameOpcode({In, In1}, *TLI))
18176 return false;
18177 if (cast<Instruction>(In)->getParent() !=
18178 cast<Instruction>(In1)->getParent())
18179 return false;
18180 }
18181 return true;
18182 };
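// Illustration (hypothetical IR): two phis whose incoming values at each
// position are either both constants or instructions with the same (or
// alternate) opcode defined in the same block, e.g. two 'add's from %bb0 at
// position 0 and two constants at position 1, are treated as compatible.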
18183 // Check if the value can be ignored during analysis for shuffled gathers.
18184 // We suppose it is better to ignore instructions which do not form splats,
18185 // are not vectorized/not extractelements (these instructions will be handled
18186 // by extractelements processing) or may form a vector node in the future.
18187 auto MightBeIgnored = [=](Value *V) {
18188 auto *I = dyn_cast<Instruction>(V);
18189 return I && !IsSplatOrUndefs && !isVectorized(I) &&
18190 !isVectorLikeInstWithConstOps(I) &&
18191 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
18192 };
18193 // Check that the neighbor instruction may form a full vector node with the
18194 // current instruction V. It is possible, if they have same/alternate opcode
18195 // and same parent basic block.
18196 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
18197 Value *V1 = VL[Idx];
18198 bool UsedInSameVTE = false;
18199 auto It = UsedValuesEntry.find(V1);
18200 if (It != UsedValuesEntry.end())
18201 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
18202 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
18203 getSameOpcode({V, V1}, *TLI) &&
18204 cast<Instruction>(V)->getParent() ==
18205 cast<Instruction>(V1)->getParent() &&
18206 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
18207 };
18208 // Build a shuffle mask for better cost estimation and vector emission.
18209 SmallBitVector UsedIdxs(Entries.size());
18210 SmallVector<std::pair<unsigned, int>> EntryLanes;
18211 for (int I = 0, E = VL.size(); I < E; ++I) {
18212 Value *V = VL[I];
18213 auto It = UsedValuesEntry.find(V);
18214 if (It == UsedValuesEntry.end())
18215 continue;
18216 // Do not try to shuffle scalars, if they are constants, or instructions
18217 // that can be vectorized as a result of the following vector build
18218 // vectorization.
18219 if (isConstant(V) || (MightBeIgnored(V) &&
18220 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
18221 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
18222 continue;
18223 unsigned Idx = It->second;
18224 EntryLanes.emplace_back(Idx, I);
18225 UsedIdxs.set(Idx);
18226 }
18227 // Iterate through all shuffled scalars and select entries, which can be used
18228 // for final shuffle.
18229 SmallVector<const TreeEntry *> TempEntries;
18230 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
18231 if (!UsedIdxs.test(I))
18232 continue;
18233 // Fix the entry number for the given scalar. If it is the first entry, set
18234 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
18235 // These indices are used when calculating final shuffle mask as the vector
18236 // offset.
18237 for (std::pair<unsigned, int> &Pair : EntryLanes)
18238 if (Pair.first == I)
18239 Pair.first = TempEntries.size();
18240 TempEntries.push_back(Entries[I]);
18241 }
18242 Entries.swap(TempEntries);
18243 if (EntryLanes.size() == Entries.size() &&
18244 !VL.equals(ArrayRef(TE->Scalars)
18245 .slice(Part * VL.size(),
18246 std::min<int>(VL.size(), TE->Scalars.size())))) {
18247 // We may have here 1 or 2 entries only. If the number of scalars is equal
18248 // to the number of entries, no need to do the analysis, it is not very
18249 // profitable. Since VL is not the same as TE->Scalars, it means we already
18250 // have some shuffles before. Cut off this unprofitable case.
18251 Entries.clear();
18252 return std::nullopt;
18253 }
18254 // Build the final mask, check for the identity shuffle, if possible.
18255 bool IsIdentity = Entries.size() == 1;
18256 // Pair.first is the offset to the vector, while Pair.second is the index of
18257 // scalar in the list.
18258 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
18259 unsigned Idx = Part * VL.size() + Pair.second;
18260 Mask[Idx] =
18261 Pair.first * VF +
18262 (ForOrder ? std::distance(
18263 Entries[Pair.first]->Scalars.begin(),
18264 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
18265 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
18266 IsIdentity &= Mask[Idx] == Pair.second;
18267 }
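// Worked example (hypothetical lanes): with VF == 4 and two selected entries,
// a scalar taken from lane 2 of the second entry (Pair.first == 1) gets the
// mask value 1 * 4 + 2 == 6, addressing the concatenation of both source
// vectors.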
18268 if (ForOrder || IsIdentity || Entries.empty()) {
18269 switch (Entries.size()) {
18270 case 1:
18271 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
18272 return TargetTransformInfo::SK_PermuteSingleSrc;
18273 break;
18274 case 2:
18275 if (EntryLanes.size() > 2 || VL.size() <= 2)
18276 return TargetTransformInfo::SK_PermuteTwoSrc;
18277 break;
18278 default:
18279 break;
18280 }
18281 } else if (!isa<VectorType>(VL.front()->getType()) &&
18282 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
18283 // Do the cost estimation to check whether the shuffle beats a buildvector.
18284 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
18285 std::next(Mask.begin(), (Part + 1) * VL.size()));
18286 int MinElement = SubMask.front(), MaxElement = SubMask.front();
18287 for (int Idx : SubMask) {
18288 if (Idx == PoisonMaskElem)
18289 continue;
18290 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
18291 MinElement = Idx;
18292 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
18293 MaxElement = Idx;
18294 }
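// Worked example (hypothetical mask): with VF == 8 and used indices
// {9, 10, 12}, the indices modulo VF are {1, 2, 4}, so the accessed range
// spans 4 lanes; NewVF becomes the larger of VL.size() and the full-vector
// width covering that range, and the submask is remapped only when NewVF
// ends up smaller than VF.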
18295 assert(MaxElement >= 0 && MinElement >= 0 &&
18296 MaxElement % VF >= MinElement % VF &&
18297 "Expected at least single element.");
18298 unsigned NewVF = std::max<unsigned>(
18299 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
18300 (MaxElement % VF) -
18301 (MinElement % VF) + 1));
18302 if (NewVF < VF) {
18303 for (int &Idx : SubMask) {
18304 if (Idx == PoisonMaskElem)
18305 continue;
18306 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
18307 (Idx >= static_cast<int>(VF) ? NewVF : 0);
18308 }
18309 } else {
18310 NewVF = VF;
18311 }
18312
18313 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18314 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
18315 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
18316 auto GetShuffleCost = [&,
18317 &TTI = *TTI](ArrayRef<int> Mask,
18318 ArrayRef<const TreeEntry *> Entries,
18319 VectorType *VecTy) -> InstructionCost {
18320 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
18321 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
18322 Mask, Entries.front()->getInterleaveFactor()))
18323 return TTI::TCC_Free;
18324 return ::getShuffleCost(TTI,
18325 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
18326 : TTI::SK_PermuteSingleSrc,
18327 VecTy, Mask, CostKind);
18328 };
18329 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
18330 InstructionCost FirstShuffleCost = 0;
18331 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18332 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18333 FirstShuffleCost = ShuffleCost;
18334 } else {
18335 // Transform mask to include only first entry.
18336 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18337 bool IsIdentity = true;
18338 for (auto [I, Idx] : enumerate(FirstMask)) {
18339 if (Idx >= static_cast<int>(NewVF)) {
18340 Idx = PoisonMaskElem;
18341 } else {
18342 DemandedElts.clearBit(I);
18343 if (Idx != PoisonMaskElem)
18344 IsIdentity &= static_cast<int>(I) == Idx;
18345 }
18346 }
18347 if (!IsIdentity)
18348 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18349 FirstShuffleCost += getScalarizationOverhead(
18350 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18351 /*Extract=*/false, CostKind);
18352 }
18353 InstructionCost SecondShuffleCost = 0;
18354 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18355 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18356 SecondShuffleCost = ShuffleCost;
18357 } else {
18358 // Transform the mask to include only the second entry.
18359 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18360 bool IsIdentity = true;
18361 for (auto [I, Idx] : enumerate(SecondMask)) {
18362 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18363 Idx = PoisonMaskElem;
18364 } else {
18365 DemandedElts.clearBit(I);
18366 if (Idx != PoisonMaskElem) {
18367 Idx -= NewVF;
18368 IsIdentity &= static_cast<int>(I) == Idx;
18369 }
18370 }
18371 }
18372 if (!IsIdentity)
18373 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18374 SecondShuffleCost += getScalarizationOverhead(
18375 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18376 /*Extract=*/false, CostKind);
18377 }
18378 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
18379 for (auto [I, Idx] : enumerate(SubMask))
18380 if (Idx == PoisonMaskElem)
18381 DemandedElts.clearBit(I);
18382 InstructionCost BuildVectorCost = getScalarizationOverhead(
18383 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
18384 /*Extract=*/false, CostKind);
18385 const TreeEntry *BestEntry = nullptr;
18386 if (FirstShuffleCost < ShuffleCost) {
18387 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18388 std::next(Mask.begin(), (Part + 1) * VL.size()),
18389 [&](int &Idx) {
18390 if (Idx >= static_cast<int>(VF))
18391 Idx = PoisonMaskElem;
18392 });
18393 BestEntry = Entries.front();
18394 ShuffleCost = FirstShuffleCost;
18395 }
18396 if (SecondShuffleCost < ShuffleCost) {
18397 std::for_each(std::next(Mask.begin(), Part * VL.size()),
18398 std::next(Mask.begin(), (Part + 1) * VL.size()),
18399 [&](int &Idx) {
18400 if (Idx < static_cast<int>(VF))
18401 Idx = PoisonMaskElem;
18402 else
18403 Idx -= VF;
18404 });
18405 BestEntry = Entries[1];
18406 ShuffleCost = SecondShuffleCost;
18407 }
18408 if (BuildVectorCost >= ShuffleCost) {
18409 if (BestEntry) {
18410 Entries.clear();
18411 Entries.push_back(BestEntry);
18412 }
18413 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
18414 : TargetTransformInfo::SK_PermuteSingleSrc;
18415 }
18416 }
18417 Entries.clear();
18418 // Clear the corresponding mask elements.
18419 std::fill(std::next(Mask.begin(), Part * VL.size()),
18420 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
18421 return std::nullopt;
18422}
18423
18424SmallVector<std::optional<TTI::ShuffleKind>>
18425BoUpSLP::isGatherShuffledEntry(
18426 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
18427 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
18428 bool ForOrder) {
18429 assert(NumParts > 0 && NumParts < VL.size() &&
18430 "Expected positive number of registers.");
18431 Entries.clear();
18432 // No need to check for the topmost gather node.
18433 if (TE == VectorizableTree.front().get() &&
18434 (!GatheredLoadsEntriesFirst.has_value() ||
18435 none_of(ArrayRef(VectorizableTree).drop_front(),
18436 [](const std::unique_ptr<TreeEntry> &TE) {
18437 return !TE->isGather();
18438 })))
18439 return {};
18440 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
18441 // implemented yet.
18442 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
18443 return {};
18444 Mask.assign(VL.size(), PoisonMaskElem);
18445 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
18446 "Expected only single user of the gather node.");
18447 assert(VL.size() % NumParts == 0 &&
18448 "Number of scalars must be divisible by NumParts.");
18449 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
18450 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18451 (TE->Idx == 0 ||
18452 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
18453 isSplat(TE->Scalars) ||
18454 (TE->hasState() &&
18455 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
18456 return {};
18457 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18458 SmallVector<std::optional<TTI::ShuffleKind>> Res;
18459 for (unsigned Part : seq<unsigned>(NumParts)) {
18460 ArrayRef<Value *> SubVL =
18461 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
18462 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18463 std::optional<TTI::ShuffleKind> SubRes =
18464 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
18465 ForOrder);
18466 if (!SubRes)
18467 SubEntries.clear();
18468 Res.push_back(SubRes);
18469 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
18470 SubEntries.front()->getVectorFactor() == VL.size() &&
18471 (SubEntries.front()->isSame(TE->Scalars) ||
18472 SubEntries.front()->isSame(VL))) {
18473 SmallVector<const TreeEntry *> LocalSubEntries;
18474 LocalSubEntries.swap(SubEntries);
18475 Entries.clear();
18476 Res.clear();
18477 std::iota(Mask.begin(), Mask.end(), 0);
18478 // Clear undef scalars.
18479 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
18480 if (isa<PoisonValue>(VL[I]))
18481 Mask[I] = PoisonMaskElem;
18482 Entries.emplace_back(1, LocalSubEntries.front());
18483 Res.push_back(TTI::SK_PermuteSingleSrc);
18484 return Res;
18485 }
18486 }
18487 if (all_of(Res,
18488 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
18489 Entries.clear();
18490 return {};
18491 }
18492 return Res;
18493}
18494
18495InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
18496 Type *ScalarTy) const {
18497 const unsigned VF = VL.size();
18498 auto *VecTy = getWidenedType(ScalarTy, VF);
18499 // Find the cost of inserting/extracting values from the vector.
18500 // Check if the same elements are inserted several times and count them as
18501 // shuffle candidates.
18502 APInt DemandedElements = APInt::getZero(VF);
18503 InstructionCost Cost;
18504 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18505 auto EstimateInsertCost = [&](unsigned I, Value *V) {
18506 DemandedElements.setBit(I);
18507 if (V->getType() != ScalarTy)
18508 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
18509 TTI::CastContextHint::None, CostKind);
18510 };
18511 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
18512 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
18513 for (auto [I, V] : enumerate(VL)) {
18514 // No need to shuffle duplicates for constants.
18515 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
18516 continue;
18517
18518 if (isConstant(V)) {
18519 ConstantShuffleMask[I] = I + VF;
18520 continue;
18521 }
18522 EstimateInsertCost(I, V);
18523 }
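// Illustration (hypothetical VL, ForPoisonSrc == false): for
// VL = {5, %x, 7, %y} with VF == 4, ConstantShuffleMask becomes
// {0 + 4, 1, 2 + 4, 3} == {4, 1, 6, 3}, i.e. the constants are taken from a
// separate constant vector by the shuffle, while only lanes 1 and 3 are
// counted as real insertelement positions via DemandedElements.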
18524 // FIXME: add a cost for constant vector materialization.
18525 bool IsAnyNonUndefConst =
18526 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
18527 // 1. Shuffle input source vector and constant vector.
18528 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18529 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
18530 ConstantShuffleMask);
18531 }
18532
18533 // 2. Insert unique non-constants.
18534 if (!DemandedElements.isZero())
18535 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
18536 /*Insert=*/true,
18537 /*Extract=*/false, CostKind,
18538 ForPoisonSrc && !IsAnyNonUndefConst, VL);
18539 return Cost;
18540}
18541
18542Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18543 auto It = EntryToLastInstruction.find(E);
18544 if (It != EntryToLastInstruction.end())
18545 return *cast<Instruction>(It->second);
18546 Instruction *Res = nullptr;
18547 // Get the basic block this bundle is in. All instructions in the bundle
18548 // should be in this block (except for extractelement-like instructions with
18549 // constant indices or gathered loads or copyables).
18550 Instruction *Front;
18551 unsigned Opcode;
18552 if (E->hasState()) {
18553 Front = E->getMainOp();
18554 Opcode = E->getOpcode();
18555 } else {
18556 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
18557 Opcode = Front->getOpcode();
18558 }
18559 auto *BB = Front->getParent();
18560 assert(
18561 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18562 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18563 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18564 all_of(E->Scalars,
18565 [=](Value *V) -> bool {
18566 if (Opcode == Instruction::GetElementPtr &&
18567 !isa<GetElementPtrInst>(V))
18568 return true;
18569 auto *I = dyn_cast<Instruction>(V);
18570 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18571 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18572 })) &&
18573 "Expected gathered loads or GEPs or instructions from same basic "
18574 "block.");
18575
18576 auto FindLastInst = [&]() {
18577 Instruction *LastInst = Front;
18578 for (Value *V : E->Scalars) {
18579 auto *I = dyn_cast<Instruction>(V);
18580 if (!I)
18581 continue;
18582 if (E->isCopyableElement(I))
18583 continue;
18584 if (LastInst->getParent() == I->getParent()) {
18585 if (LastInst->comesBefore(I))
18586 LastInst = I;
18587 continue;
18588 }
18589 assert(((Opcode == Instruction::GetElementPtr &&
18590 !isa<GetElementPtrInst>(I)) ||
18591 E->State == TreeEntry::SplitVectorize ||
18592 (isVectorLikeInstWithConstOps(LastInst) &&
18593 isVectorLikeInstWithConstOps(I)) ||
18594 (GatheredLoadsEntriesFirst.has_value() &&
18595 Opcode == Instruction::Load && E->isGather() &&
18596 E->Idx < *GatheredLoadsEntriesFirst)) &&
18597 "Expected vector-like or non-GEP in GEP node insts only.");
18598 if (!DT->isReachableFromEntry(LastInst->getParent())) {
18599 LastInst = I;
18600 continue;
18601 }
18602 if (!DT->isReachableFromEntry(I->getParent()))
18603 continue;
18604 auto *NodeA = DT->getNode(LastInst->getParent());
18605 auto *NodeB = DT->getNode(I->getParent());
18606 assert(NodeA && "Should only process reachable instructions");
18607 assert(NodeB && "Should only process reachable instructions");
18608 assert((NodeA == NodeB) ==
18609 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18610 "Different nodes should have different DFS numbers");
18611 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18612 LastInst = I;
18613 }
18614 BB = LastInst->getParent();
18615 return LastInst;
18616 };
18617
18618 auto FindFirstInst = [&]() {
18619 Instruction *FirstInst = Front;
18620 for (Value *V : E->Scalars) {
18621 auto *I = dyn_cast<Instruction>(V);
18622 if (!I)
18623 continue;
18624 if (E->isCopyableElement(I))
18625 continue;
18626 if (FirstInst->getParent() == I->getParent()) {
18627 if (I->comesBefore(FirstInst))
18628 FirstInst = I;
18629 continue;
18630 }
18631 assert(((Opcode == Instruction::GetElementPtr &&
18632 !isa<GetElementPtrInst>(I)) ||
18633 (isVectorLikeInstWithConstOps(FirstInst) &&
18635 "Expected vector-like or non-GEP in GEP node insts only.");
18636 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
18637 FirstInst = I;
18638 continue;
18639 }
18640 if (!DT->isReachableFromEntry(I->getParent()))
18641 continue;
18642 auto *NodeA = DT->getNode(FirstInst->getParent());
18643 auto *NodeB = DT->getNode(I->getParent());
18644 assert(NodeA && "Should only process reachable instructions");
18645 assert(NodeB && "Should only process reachable instructions");
18646 assert((NodeA == NodeB) ==
18647 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18648 "Different nodes should have different DFS numbers");
18649 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18650 FirstInst = I;
18651 }
18652 return FirstInst;
18653 };
18654
18655 if (E->State == TreeEntry::SplitVectorize) {
18656 Res = FindLastInst();
18657 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
18658 for (auto *E : Entries) {
18659 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
18660 if (!I)
18661 I = &getLastInstructionInBundle(E);
18662 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
18663 Res = I;
18664 }
18665 }
18666 EntryToLastInstruction.try_emplace(E, Res);
18667 return *Res;
18668 }
18669
18670 // Set the insert point for gathered loads to the very first load.
18671 if (GatheredLoadsEntriesFirst.has_value() &&
18672 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18673 Opcode == Instruction::Load) {
18674 Res = FindFirstInst();
18675 EntryToLastInstruction.try_emplace(E, Res);
18676 return *Res;
18677 }
18678
18679 // Set the insert point to the beginning of the basic block if the entry
18680 // should not be scheduled.
18681 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18682 if (E->isGather())
18683 return nullptr;
18684 // Found previously that the instruction does not need to be scheduled.
18685 const auto *It = BlocksSchedules.find(BB);
18686 if (It == BlocksSchedules.end())
18687 return nullptr;
18688 for (Value *V : E->Scalars) {
18689 auto *I = dyn_cast<Instruction>(V);
18690 if (!I || isa<PHINode>(I) ||
18691 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
18692 continue;
18693 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
18694 if (Bundles.empty())
18695 continue;
18696 const auto *It = find_if(
18697 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18698 if (It != Bundles.end())
18699 return *It;
18700 }
18701 return nullptr;
18702 };
18703 const ScheduleBundle *Bundle = FindScheduleBundle(E);
18704 if (!E->isGather() && !Bundle) {
18705 if ((Opcode == Instruction::GetElementPtr &&
18706 any_of(E->Scalars,
18707 [](Value *V) {
18708 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
18709 })) ||
18710 (all_of(E->Scalars,
18711 [&](Value *V) {
18712 return isa<PoisonValue>(V) ||
18713 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
18714 E->isCopyableElement(V) ||
18715 (!isVectorLikeInstWithConstOps(V) &&
18716 isUsedOutsideBlock(V));
18717 }) &&
18718 (!E->doesNotNeedToSchedule() ||
18719 any_of(E->Scalars,
18720 [&](Value *V) {
18721 if (!isa<Instruction>(V) ||
18722 (E->hasCopyableElements() && E->isCopyableElement(V)))
18723 return false;
18724 return !areAllOperandsNonInsts(V);
18725 }) ||
18726 none_of(E->Scalars, [&](Value *V) {
18727 if (!isa<Instruction>(V) ||
18728 (E->hasCopyableElements() && E->isCopyableElement(V)))
18729 return false;
18730 return MustGather.contains(V);
18731 }))))
18732 Res = FindLastInst();
18733 else
18734 Res = FindFirstInst();
18735 EntryToLastInstruction.try_emplace(E, Res);
18736 return *Res;
18737 }
18738
18739 // Find the last instruction. The common case should be that BB has been
18740 // scheduled, and the last instruction is VL.back(). So we start with
18741 // VL.back() and iterate over schedule data until we reach the end of the
18742 // bundle. The end of the bundle is marked by null ScheduleData.
18743 if (Bundle) {
18744 assert(!E->isGather() && "Gathered instructions should not be scheduled");
18745 Res = Bundle->getBundle().back()->getInst();
18746 EntryToLastInstruction.try_emplace(E, Res);
18747 return *Res;
18748 }
18749
18750 // LastInst can still be null at this point if there's either not an entry
18751 // for BB in BlocksSchedules or there's no ScheduleData available for
18752 // VL.back(). This can be the case if buildTreeRec aborts for various
18753 // reasons (e.g., the maximum recursion depth is reached, the maximum region
18754 // size is reached, etc.). ScheduleData is initialized in the scheduling
18755 // "dry-run".
18756 //
18757 // If this happens, we can still find the last instruction by brute force. We
18758 // iterate forwards from Front (inclusive) until we either see all
18759 // instructions in the bundle or reach the end of the block. If Front is the
18760 // last instruction in program order, LastInst will be set to Front, and we
18761 // will visit all the remaining instructions in the block.
18762 //
18763 // One of the reasons we exit early from buildTreeRec is to place an upper
18764 // bound on compile-time. Thus, taking an additional compile-time hit here is
18765 // not ideal. However, this should be exceedingly rare since it requires that
18766 // we both exit early from buildTreeRec and that the bundle be out-of-order
18767 // (causing us to iterate all the way to the end of the block).
18768 if (!Res)
18769 Res = FindLastInst();
18770 assert(Res && "Failed to find last instruction in bundle");
18771 EntryToLastInstruction.try_emplace(E, Res);
18772 return *Res;
18773}
18774
18775void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
18776 auto *Front = E->getMainOp();
18777 Instruction *LastInst = &getLastInstructionInBundle(E);
18778 assert(LastInst && "Failed to find last instruction in bundle");
18779 BasicBlock::iterator LastInstIt = LastInst->getIterator();
18780 // If the instruction is PHI, set the insert point after all the PHIs.
18781 bool IsPHI = isa<PHINode>(LastInst);
18782 if (IsPHI) {
18783 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
18784 if (LastInstIt != LastInst->getParent()->end() &&
18785 LastInstIt->getParent()->isLandingPad())
18786 LastInstIt = std::next(LastInstIt);
18787 }
18788 if (IsPHI ||
18789 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
18790 (E->doesNotNeedToSchedule() ||
18791 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
18792 isUsedOutsideBlock(LastInst)))) ||
18793 (GatheredLoadsEntriesFirst.has_value() &&
18794 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18795 E->getOpcode() == Instruction::Load)) {
18796 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
18797 } else {
18798 // Set the insertion point after the last instruction in the bundle. Set the
18799 // debug location to Front.
18800 Builder.SetInsertPoint(
18801 LastInst->getParent(),
18802 LastInst->getNextNode()->getIterator());
18803 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
18804 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18805 } else {
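// Likely intent (descriptive note): a throwaway load is created right after
// LastInst purely to capture a stable iterator for the insertion position;
// it is marked erased immediately and cached in LastInstructionToPos so that
// later bundles anchored at the same LastInst reuse the same position.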
18806 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
18807 PoisonValue::get(Builder.getPtrTy()),
18808 MaybeAlign());
18809 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
18810 eraseInstruction(Res);
18811 LastInstructionToPos.try_emplace(LastInst, Res);
18812 }
18813 }
18814 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
18815}
18816
18817Value *BoUpSLP::gather(
18818 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
18819 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
18820 // List of instructions/lanes from current block and/or the blocks which are
18821 // part of the current loop. These instructions will be inserted at the end to
18822 // make it possible to optimize loops and hoist invariant instructions out of
18823 // the loop's body with better chances for success.
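// Illustration (hypothetical input): when gathering {%inv, %ld} where %ld is
// a load defined inside the current loop and %inv is loop-invariant, the
// insertelement for %ld is postponed to the end of the emitted sequence so
// that hoisting of the loop-invariant part still has a chance to succeed.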
18824 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
18825 SmallSet<int, 4> PostponedIndices;
18826 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
18827 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
18828 SmallPtrSet<BasicBlock *, 4> Visited;
18829 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
18830 InsertBB = InsertBB->getSinglePredecessor();
18831 return InsertBB && InsertBB == InstBB;
18832 };
18833 for (int I = 0, E = VL.size(); I < E; ++I) {
18834 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
18835 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
18836 isVectorized(Inst) ||
18837 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
18838 PostponedIndices.insert(I).second)
18839 PostponedInsts.emplace_back(Inst, I);
18840 }
18841
18842 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
18843 Type *Ty) {
18844 Value *Scalar = V;
18845 if (Scalar->getType() != Ty) {
18846 assert(Scalar->getType()->isIntOrIntVectorTy() &&
18847 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
18848 Value *V = Scalar;
18849 if (auto *CI = dyn_cast<CastInst>(Scalar);
18850 isa_and_present<SExtInst, ZExtInst>(CI)) {
18851 Value *Op = CI->getOperand(0);
18852 if (auto *IOp = dyn_cast<Instruction>(Op);
18853 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
18854 V = Op;
18855 }
18856 Scalar = Builder.CreateIntCast(
18857 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
18858 }
18859
18860 Instruction *InsElt;
18861 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
18862 assert(SLPReVec && "FixedVectorType is not expected.");
18863 Vec =
18864 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
18865 auto *II = dyn_cast<Instruction>(Vec);
18866 if (!II)
18867 return Vec;
18868 InsElt = II;
18869 } else {
18870 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
18871 InsElt = dyn_cast<InsertElementInst>(Vec);
18872 if (!InsElt)
18873 return Vec;
18874 }
18875 GatherShuffleExtractSeq.insert(InsElt);
18876 CSEBlocks.insert(InsElt->getParent());
18877 // Add to our 'need-to-extract' list.
18878 if (isa<Instruction>(V)) {
18879 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
18880 const auto *It = find_if(Entries, [&](const TreeEntry *E) {
18881 return !TransformedToGatherNodes.contains(E) &&
18882 !DeletedNodes.contains(E);
18883 });
18884 if (It != Entries.end()) {
18885 // Find which lane we need to extract.
18886 User *UserOp = nullptr;
18887 if (Scalar != V) {
18888 if (auto *SI = dyn_cast<Instruction>(Scalar))
18889 UserOp = SI;
18890 } else {
18891 if (V->getType()->isVectorTy()) {
18892 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
18893 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18894 // Find shufflevector, caused by resize.
18895 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18896 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
18897 if (SV->getOperand(0) == V)
18898 return SV;
18899 if (SV->getOperand(1) == V)
18900 return SV;
18901 }
18902 return nullptr;
18903 };
18904 InsElt = nullptr;
18905 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18906 InsElt = User;
18907 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18908 InsElt = User;
18909 assert(InsElt &&
18910 "Failed to find shufflevector, caused by resize.");
18911 }
18912 }
18913 UserOp = InsElt;
18914 }
18915 if (UserOp) {
18916 unsigned FoundLane = (*It)->findLaneForValue(V);
18917 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
18918 }
18919 }
18920 }
18921 return Vec;
18922 };
18923 auto *VecTy = getWidenedType(ScalarTy, VL.size());
18924 Value *Vec = PoisonValue::get(VecTy);
18925 SmallVector<int> NonConsts;
18926 SmallVector<int> Mask(VL.size());
18927 std::iota(Mask.begin(), Mask.end(), 0);
18928 Value *OriginalRoot = Root;
18929 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
18930 SV && isa<PoisonValue>(SV->getOperand(1)) &&
18931 SV->getOperand(0)->getType() == VecTy) {
18932 Root = SV->getOperand(0);
18933 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18934 }
18935 // Insert constant values at first.
18936 for (int I = 0, E = VL.size(); I < E; ++I) {
18937 if (PostponedIndices.contains(I))
18938 continue;
18939 if (!isConstant(VL[I])) {
18940 NonConsts.push_back(I);
18941 continue;
18942 }
18943 if (isa<PoisonValue>(VL[I]))
18944 continue;
18945 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18946 Mask[I] = I + E;
18947 }
18948 if (Root) {
18949 if (isa<PoisonValue>(Vec)) {
18950 Vec = OriginalRoot;
18951 } else {
18952 Vec = CreateShuffle(Root, Vec, Mask);
18953 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
18954 OI && OI->use_empty() &&
18955 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18956 return TE->VectorizedValue == OI;
18957 }))
18958 eraseInstruction(OI);
18959 }
18960 }
18961 // Insert non-constant values.
18962 for (int I : NonConsts)
18963 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18964 // Append instructions, which are/may be part of the loop, at the end to make
18965 // it possible to hoist non-loop-based instructions.
18966 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18967 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18968
18969 return Vec;
18970}
18971
18972/// Merges shuffle masks and emits final shuffle instruction, if required. It
18973/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
18974/// when the actual shuffle instruction is generated only if this is actually
18975/// required. Otherwise, the shuffle instruction emission is delayed till the
18976/// end of the process, to reduce the number of emitted instructions and further
18977/// analysis/transformations.
18978/// The class also will look through the previously emitted shuffle instructions
18979/// and properly mark indices in mask as undef.
18980/// For example, given the code
18981/// \code
18982/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
18983/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
18984/// \endcode
18985 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
18986/// look through %s1 and %s2 and emit
18987/// \code
18988/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
18989/// \endcode
18990/// instead.
18991/// If 2 operands are of different size, the smallest one will be resized and
18992/// the mask recalculated properly.
18993/// For example, given the code
18994/// \code
18995/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
18996/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
18997/// \endcode
18998 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
18999/// look through %s1 and %s2 and emit
19000/// \code
19001/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
19002/// \endcode
19003/// instead.
19004class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
19005 bool IsFinalized = false;
19006 /// Combined mask for all applied operands and masks. It is built during
19007 /// analysis and actual emission of shuffle vector instructions.
19008 SmallVector<int> CommonMask;
19009 /// List of operands for the shuffle vector instruction. It holds at most 2
19010 /// operands; if a 3rd one is going to be added, the first 2 are combined into
19011 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
19012 /// resulting shuffle and the second operand is set to be the newly added
19013 /// operand. The \p CommonMask is transformed in the proper way after that.
19014 SmallVector<Value *, 2> InVectors;
19015 IRBuilderBase &Builder;
19016 BoUpSLP &R;
19017
19018 class ShuffleIRBuilder {
19019 IRBuilderBase &Builder;
19020 /// Holds all of the instructions that we gathered.
19021 SetVector<Instruction *> &GatherShuffleExtractSeq;
19022 /// A list of blocks that we are going to CSE.
19023 DenseSet<BasicBlock *> &CSEBlocks;
19024 /// Data layout.
19025 const DataLayout &DL;
19026
19027 public:
19028 ShuffleIRBuilder(IRBuilderBase &Builder,
19029 SetVector<Instruction *> &GatherShuffleExtractSeq,
19030 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
19031 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
19032 CSEBlocks(CSEBlocks), DL(DL) {}
19033 ~ShuffleIRBuilder() = default;
19034 /// Creates shufflevector for the 2 operands with the given mask.
19035 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
19036 if (V1->getType() != V2->getType()) {
19037 assert(V2->getType()->isIntOrIntVectorTy() &&
19038 V1->getType()->isIntOrIntVectorTy() &&
19039 "Expected integer vector types only.");
19040 if (V1->getType() != V2->getType()) {
19041 if (cast<VectorType>(V2->getType())
19042 ->getElementType()
19043 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
19044 ->getElementType()
19045 ->getIntegerBitWidth())
19046 V2 = Builder.CreateIntCast(
19047 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
19048 else
19049 V1 = Builder.CreateIntCast(
19050 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
19051 }
19052 }
19053 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
19054 if (auto *I = dyn_cast<Instruction>(Vec)) {
19055 GatherShuffleExtractSeq.insert(I);
19056 CSEBlocks.insert(I->getParent());
19057 }
19058 return Vec;
19059 }
19060 /// Creates permutation of the single vector operand with the given mask, if
19061 /// it is not identity mask.
19062 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
19063 if (Mask.empty())
19064 return V1;
19065 unsigned VF = Mask.size();
19066 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
19067 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
19068 return V1;
19069 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
19070 if (auto *I = dyn_cast<Instruction>(Vec)) {
19071 GatherShuffleExtractSeq.insert(I);
19072 CSEBlocks.insert(I->getParent());
19073 }
19074 return Vec;
19075 }
19076 Value *createIdentity(Value *V) { return V; }
19077 Value *createPoison(Type *Ty, unsigned VF) {
19078 return PoisonValue::get(getWidenedType(Ty, VF));
19079 }
19080 /// Resizes the 2 input vectors to match in size, if they are not equal
19081 /// yet. The smaller vector is resized to the size of the larger vector.
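/// For illustration (hypothetical types): given V1 of type <2 x i32> and V2
/// of type <4 x i32>, V1 is widened with an identity-plus-poison mask
/// <0, 1, poison, poison> so that both operands become <4 x i32>.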
19082 void resizeToMatch(Value *&V1, Value *&V2) {
19083 if (V1->getType() == V2->getType())
19084 return;
19085 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
19086 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
19087 int VF = std::max(V1VF, V2VF);
19088 int MinVF = std::min(V1VF, V2VF);
19089 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
19090 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
19091 0);
19092 Value *&Op = MinVF == V1VF ? V1 : V2;
19093 Op = Builder.CreateShuffleVector(Op, IdentityMask);
19094 if (auto *I = dyn_cast<Instruction>(Op)) {
19095 GatherShuffleExtractSeq.insert(I);
19096 CSEBlocks.insert(I->getParent());
19097 }
19098 if (MinVF == V1VF)
19099 V1 = Op;
19100 else
19101 V2 = Op;
19102 }
19103 };
19104
19105 /// Smart shuffle instruction emission, walks through shuffles trees and
19106 /// tries to find the best matching vector for the actual shuffle
19107 /// instruction.
19108 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
19109 assert(V1 && "Expected at least one vector value.");
19110 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
19111 R.CSEBlocks, *R.DL);
19112 return BaseShuffleAnalysis::createShuffle<Value *>(
19113 V1, V2, Mask, ShuffleBuilder, ScalarTy);
19114 }
19115
19116 /// Cast value \p V to the vector type with the same number of elements, but
19117 /// the base type \p ScalarTy.
19118 Value *castToScalarTyElem(Value *V,
19119 std::optional<bool> IsSigned = std::nullopt) {
19120 auto *VecTy = cast<VectorType>(V->getType());
19121 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
19122 if (VecTy->getElementType() == ScalarTy->getScalarType())
19123 return V;
19124 return Builder.CreateIntCast(
19125 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
19126 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
19127 }
19128
19129 Value *getVectorizedValue(const TreeEntry &E) {
19130 Value *Vec = E.VectorizedValue;
19131 if (!Vec->getType()->isIntOrIntVectorTy())
19132 return Vec;
19133 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
19134 return !isa<PoisonValue>(V) &&
19135 !isKnownNonNegative(
19136 V, SimplifyQuery(*R.DL));
19137 }));
19138 }
19139
19140public:
19141 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
19142 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
19143
19144 /// Adjusts extractelements after reusing them.
19145 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
19146 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
19147 unsigned NumParts, bool &UseVecBaseAsInput) {
19148 UseVecBaseAsInput = false;
19149 SmallPtrSet<Value *, 4> UniqueBases;
19150 Value *VecBase = nullptr;
19151 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
19152 if (!E->ReorderIndices.empty()) {
19153 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
19154 E->ReorderIndices.end());
19155 reorderScalars(VL, ReorderMask);
19156 }
19157 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19158 int Idx = Mask[I];
19159 if (Idx == PoisonMaskElem)
19160 continue;
19161 auto *EI = cast<ExtractElementInst>(VL[I]);
19162 VecBase = EI->getVectorOperand();
19163 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
19164 VecBase = TEs.front()->VectorizedValue;
19165 assert(VecBase && "Expected vectorized value.");
19166 UniqueBases.insert(VecBase);
19167 // If the only one use is vectorized - can delete the extractelement
19168 // itself.
19169 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
19170 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
19171 !R.isVectorized(EI) &&
19172 count_if(E->Scalars, [&](Value *V) { return V == EI; }) !=
19173 count_if(E->UserTreeIndex.UserTE->Scalars,
19174 [&](Value *V) { return V == EI; })) ||
19175 (NumParts != 1 && count(VL, EI) > 1) ||
19176 any_of(EI->users(), [&](User *U) {
19177 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
19178 return UTEs.empty() || UTEs.size() > 1 ||
19179 any_of(UTEs,
19180 [&](const TreeEntry *TE) {
19181 return R.DeletedNodes.contains(TE) ||
19182 R.TransformedToGatherNodes.contains(TE);
19183 }) ||
19184 (isa<GetElementPtrInst>(U) &&
19185 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
19186 (!UTEs.empty() &&
19187 count_if(R.VectorizableTree,
19188 [&](const std::unique_ptr<TreeEntry> &TE) {
19189 return TE->UserTreeIndex.UserTE ==
19190 UTEs.front() &&
19191 is_contained(VL, EI);
19192 }) != 1);
19193 }))
19194 continue;
19195 R.eraseInstruction(EI);
19196 }
19197 if (NumParts == 1 || UniqueBases.size() == 1) {
19198 assert(VecBase && "Expected vectorized value.");
19199 return castToScalarTyElem(VecBase);
19200 }
19201 UseVecBaseAsInput = true;
19202 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
19203 for (auto [I, Idx] : enumerate(Mask))
19204 if (Idx != PoisonMaskElem)
19205 Idx = I;
19206 };
19207 // Perform a multi-register vector shuffle, joining the parts into a single
19208 // virtual long vector.
19209 // Each part needs to be shuffled independently and then all these parts are
19210 // inserted into a long virtual vector register, forming the original vector.
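// Rough sketch of the intent (assuming 2 parts of 4 elements each; the actual
// masks depend on the extracts): every part is shuffled from its own bases and
// the results are then joined, e.g.
//   %part0 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
//   %part1 = shufflevector <4 x i32> %c, <4 x i32> %d, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
//   %vec   = shufflevector <4 x i32> %part0, <4 x i32> %part1,
//                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>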
19211 Value *Vec = nullptr;
19212 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19213 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
19214 for (unsigned Part : seq<unsigned>(NumParts)) {
19215 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
19216 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
19217 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
19218 constexpr int MaxBases = 2;
19219 SmallVector<Value *, MaxBases> Bases(MaxBases);
19220 auto VLMask = zip(SubVL, SubMask);
19221 const unsigned VF = std::accumulate(
19222 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
19223 if (std::get<1>(D) == PoisonMaskElem)
19224 return S;
19225 Value *VecOp =
19226 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
19227 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
19228 !TEs.empty())
19229 VecOp = TEs.front()->VectorizedValue;
19230 assert(VecOp && "Expected vectorized value.");
19231 const unsigned Size =
19232 cast<FixedVectorType>(VecOp->getType())->getNumElements();
19233 return std::max(S, Size);
19234 });
19235 for (const auto [V, I] : VLMask) {
19236 if (I == PoisonMaskElem)
19237 continue;
19238 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
19239 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
19240 VecOp = TEs.front()->VectorizedValue;
19241 assert(VecOp && "Expected vectorized value.");
19242 VecOp = castToScalarTyElem(VecOp);
19243 Bases[I / VF] = VecOp;
19244 }
19245 if (!Bases.front())
19246 continue;
19247 Value *SubVec;
19248 if (Bases.back()) {
19249 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
19250 TransformToIdentity(SubMask);
19251 } else {
19252 SubVec = Bases.front();
19253 }
19254 if (!Vec) {
19255 Vec = SubVec;
19256 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
19257 [&](unsigned P) {
19258 ArrayRef<int> SubMask =
19259 Mask.slice(P * SliceSize,
19260 getNumElems(Mask.size(),
19261 SliceSize, P));
19262 return all_of(SubMask, [](int Idx) {
19263 return Idx == PoisonMaskElem;
19264 });
19265 })) &&
19266 "Expected first part or all previous parts masked.");
19267 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
19268 } else {
19269 unsigned NewVF =
19270 cast<FixedVectorType>(Vec->getType())->getNumElements();
19271 if (Vec->getType() != SubVec->getType()) {
19272 unsigned SubVecVF =
19273 cast<FixedVectorType>(SubVec->getType())->getNumElements();
19274 NewVF = std::max(NewVF, SubVecVF);
19275 }
19276 // Adjust SubMask.
19277 for (int &Idx : SubMask)
19278 if (Idx != PoisonMaskElem)
19279 Idx += NewVF;
19280 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
19281 Vec = createShuffle(Vec, SubVec, VecMask);
19282 TransformToIdentity(VecMask);
19283 }
19284 }
19285 copy(VecMask, Mask.begin());
19286 return Vec;
19287 }
19288 /// Checks if the specified entry \p E needs to be delayed because of its
19289 /// dependency nodes.
19290 std::optional<Value *>
19291 needToDelay(const TreeEntry *E,
19292 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
19293 // No need to delay emission if all deps are ready.
19294 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
19295 return all_of(
19296 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
19297 }))
19298 return std::nullopt;
19299 // Postpone gather emission, will be emitted after the end of the
19300 // process to keep correct order.
19301 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
19302 return Builder.CreateAlignedLoad(
19303 ResVecTy,
19304 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
19305 MaybeAlign());
19306 }
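// What the placeholder returned above looks like in IR (illustration only,
// assuming a postponed 4 x i32 gather):
//   %placeholder = load <4 x i32>, ptr poison
// It merely marks the spot for the delayed gather and is expected to be
// replaced (and erased) once the dependent entries have been vectorized.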
19307 /// Reset the builder to handle perfect diamond match.
19308 void resetForSameNode() {
19309 IsFinalized = false;
19310 CommonMask.clear();
19311 InVectors.clear();
19312 }
19313 /// Adds 2 input vectors (in form of tree entries) and the mask for their
19314 /// shuffling.
19315 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
19316 Value *V1 = getVectorizedValue(E1);
19317 Value *V2 = getVectorizedValue(E2);
19318 add(V1, V2, Mask);
19319 }
19320 /// Adds single input vector (in form of tree entry) and the mask for its
19321 /// shuffling.
19322 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
19323 Value *V1 = getVectorizedValue(E1);
19324 add(V1, Mask);
19325 }
19326 /// Adds 2 input vectors and the mask for their shuffling.
19327 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
19328 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
19329 assert(isa<FixedVectorType>(V1->getType()) &&
19330 isa<FixedVectorType>(V2->getType()) &&
19331 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
19332 V1 = castToScalarTyElem(V1);
19333 V2 = castToScalarTyElem(V2);
19334 if (InVectors.empty()) {
19335 InVectors.push_back(V1);
19336 InVectors.push_back(V2);
19337 CommonMask.assign(Mask.begin(), Mask.end());
19338 return;
19339 }
19340 Value *Vec = InVectors.front();
19341 if (InVectors.size() == 2) {
19342 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19343 transformMaskAfterShuffle(CommonMask, CommonMask);
19344 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
19345 Mask.size()) {
19346 Vec = createShuffle(Vec, nullptr, CommonMask);
19347 transformMaskAfterShuffle(CommonMask, CommonMask);
19348 }
19349 V1 = createShuffle(V1, V2, Mask);
19350 unsigned VF = std::max(getVF(V1), getVF(Vec));
19351 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19352 if (Mask[Idx] != PoisonMaskElem)
19353 CommonMask[Idx] = Idx + VF;
19354 InVectors.front() = Vec;
19355 if (InVectors.size() == 2)
19356 InVectors.back() = V1;
19357 else
19358 InVectors.push_back(V1);
19359 }
19360 /// Adds one more input vector and the mask for the shuffling.
19361 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
19362 assert(isa<FixedVectorType>(V1->getType()) &&
19363 "castToScalarTyElem expects V1 to be FixedVectorType");
19364 V1 = castToScalarTyElem(V1);
19365 if (InVectors.empty()) {
19366 InVectors.push_back(V1);
19367 CommonMask.assign(Mask.begin(), Mask.end());
19368 return;
19369 }
19370 const auto *It = find(InVectors, V1);
19371 if (It == InVectors.end()) {
19372 if (InVectors.size() == 2 ||
19373 InVectors.front()->getType() != V1->getType()) {
19374 Value *V = InVectors.front();
19375 if (InVectors.size() == 2) {
19376 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19377 transformMaskAfterShuffle(CommonMask, CommonMask);
19378 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
19379 CommonMask.size()) {
19380 V = createShuffle(InVectors.front(), nullptr, CommonMask);
19381 transformMaskAfterShuffle(CommonMask, CommonMask);
19382 }
19383 unsigned VF = std::max(CommonMask.size(), Mask.size());
19384 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19385 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
19386 CommonMask[Idx] = V->getType() != V1->getType()
19387 ? Idx + VF
19388 : Mask[Idx] + getVF(V1);
19389 if (V->getType() != V1->getType())
19390 V1 = createShuffle(V1, nullptr, Mask);
19391 InVectors.front() = V;
19392 if (InVectors.size() == 2)
19393 InVectors.back() = V1;
19394 else
19395 InVectors.push_back(V1);
19396 return;
19397 }
19398 // Check if the second vector is required at all, i.e. whether the used
19399 // elements are not already covered by the first one.
19400 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19401 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
19402 InVectors.push_back(V1);
19403 break;
19404 }
19405 }
19406 unsigned VF = 0;
19407 for (Value *V : InVectors)
19408 VF = std::max(VF, getVF(V));
19409 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19410 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
19411 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19412 }
19413 /// Adds one more input vector and the mask for the shuffling.
19414 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
19415 SmallVector<int> NewMask;
19416 inversePermutation(Order, NewMask);
19417 add(V1, NewMask);
19418 }
19419 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
19420 Value *Root = nullptr) {
19421 return R.gather(VL, Root, ScalarTy,
19422 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
19423 return createShuffle(V1, V2, Mask);
19424 });
19425 }
19426 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
19427 /// Finalize emission of the shuffles.
19428 /// \param Action the action (if any) to be performed before final applying of
19429 /// the \p ExtMask mask.
19430 Value *finalize(
19431 ArrayRef<int> ExtMask,
19432 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19433 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
19434 function_ref<void(Value *&, SmallVectorImpl<int> &,
19435 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
19436 Action = {}) {
19437 IsFinalized = true;
19438 if (Action) {
19439 Value *Vec = InVectors.front();
19440 if (InVectors.size() == 2) {
19441 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19442 InVectors.pop_back();
19443 } else {
19444 Vec = createShuffle(Vec, nullptr, CommonMask);
19445 }
19446 transformMaskAfterShuffle(CommonMask, CommonMask);
19447 assert(VF > 0 &&
19448 "Expected vector length for the final value before action.");
19449 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
19450 if (VecVF < VF) {
19451 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
19452 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
19453 Vec = createShuffle(Vec, nullptr, ResizeMask);
19454 }
19455 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
19456 return createShuffle(V1, V2, Mask);
19457 });
19458 InVectors.front() = Vec;
19459 }
19460 if (!SubVectors.empty()) {
19461 Value *Vec = InVectors.front();
19462 if (InVectors.size() == 2) {
19463 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19464 InVectors.pop_back();
19465 } else {
19466 Vec = createShuffle(Vec, nullptr, CommonMask);
19467 }
19468 transformMaskAfterShuffle(CommonMask, CommonMask);
19469 auto CreateSubVectors = [&](Value *Vec,
19470 SmallVectorImpl<int> &CommonMask) {
19471 for (auto [E, Idx] : SubVectors) {
19472 Value *V = getVectorizedValue(*E);
19473 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
19474 // Use the scalar version of ScalarTy to correctly handle shuffles for
19475 // revectorization. The revectorization mode operates on whole
19476 // vectors, but here we need to operate on the scalars, because the
19477 // masks were already transformed for the vector elements and we don't
19478 // need to do this transformation again.
19479 Type *OrigScalarTy = ScalarTy;
19480 ScalarTy = ScalarTy->getScalarType();
19481 Vec = createInsertVector(
19482 Builder, Vec, V, InsertionIndex,
19483 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
19484 _3));
19485 ScalarTy = OrigScalarTy;
19486 if (!CommonMask.empty()) {
19487 std::iota(std::next(CommonMask.begin(), Idx),
19488 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
19489 Idx);
19490 }
19491 }
19492 return Vec;
19493 };
19494 if (SubVectorsMask.empty()) {
19495 Vec = CreateSubVectors(Vec, CommonMask);
19496 } else {
19497 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
19498 copy(SubVectorsMask, SVMask.begin());
19499 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
19500 if (I2 != PoisonMaskElem) {
19501 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
19502 I1 = I2 + CommonMask.size();
19503 }
19504 }
19505 Value *InsertVec =
19506 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
19507 Vec = createShuffle(InsertVec, Vec, SVMask);
19508 transformMaskAfterShuffle(CommonMask, SVMask);
19509 }
19510 InVectors.front() = Vec;
19511 }
19512
19513 if (!ExtMask.empty()) {
19514 if (CommonMask.empty()) {
19515 CommonMask.assign(ExtMask.begin(), ExtMask.end());
19516 } else {
19517 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
19518 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
19519 if (ExtMask[I] == PoisonMaskElem)
19520 continue;
19521 NewMask[I] = CommonMask[ExtMask[I]];
19522 }
19523 CommonMask.swap(NewMask);
19524 }
19525 }
19526 if (CommonMask.empty()) {
19527 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
19528 return InVectors.front();
19529 }
19530 if (InVectors.size() == 2)
19531 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19532 return createShuffle(InVectors.front(), nullptr, CommonMask);
19533 }
19534
19535 ~ShuffleInstructionBuilder() {
19536 assert((IsFinalized || CommonMask.empty()) &&
19537 "Shuffle construction must be finalized.");
19538 }
19539};
19540
19541Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
19542 return vectorizeTree(getOperandEntry(E, NodeIdx));
19543}
19544
19545template <typename BVTy, typename ResTy, typename... Args>
19546ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
19547 Args &...Params) {
19548 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
19549 "Expected gather node.");
19550 unsigned VF = E->getVectorFactor();
19551
19552 bool NeedFreeze = false;
19553 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
19554 // Do not process split vectorize node, marked to be gathers/buildvectors.
19555 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19556 E->CombinedEntriesWithIndices.size());
19557 if (E->State == TreeEntry::SplitVectorize &&
19558 TransformedToGatherNodes.contains(E)) {
19559 SubVectors.clear();
19560 } else {
19561 // Clear values, to be replaced by insertvector instructions.
19562 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19563 for_each(MutableArrayRef(GatheredScalars)
19564 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
19565 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
19566 transform(
19567 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19568 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19569 });
19570 }
19571 // Build a mask out of the reorder indices and reorder scalars per this
19572 // mask.
19573 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
19574 E->ReorderIndices.end());
19575 if (!ReorderMask.empty())
19576 reorderScalars(GatheredScalars, ReorderMask);
19577 SmallVector<int> SubVectorsMask;
19578 inversePermutation(E->ReorderIndices, SubVectorsMask);
19579 // Transform non-clustered elements in the mask to poison (-1).
19580 // "Clustered" operations will be reordered using this mask later.
19581 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
19582 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
19583 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
19584 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
19585 } else {
19586 SubVectorsMask.clear();
19587 }
19588 SmallVector<Value *> StoredGS(GatheredScalars);
19589 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
19590 unsigned I, unsigned SliceSize,
19591 bool IsNotPoisonous) {
19592 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
19593 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19594 }))
19595 return false;
19596 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19597 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19598 if (UserTE->getNumOperands() != 2)
19599 return false;
19600 if (!IsNotPoisonous) {
19601 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
19602 [=](const std::unique_ptr<TreeEntry> &TE) {
19603 return TE->UserTreeIndex.UserTE == UserTE &&
19604 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19605 });
19606 if (It == VectorizableTree.end())
19607 return false;
19608 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
19609 if (!(*It)->ReorderIndices.empty()) {
19610 inversePermutation((*It)->ReorderIndices, ReorderMask);
19611 reorderScalars(GS, ReorderMask);
19612 }
19613 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
19614 Value *V0 = std::get<0>(P);
19615 Value *V1 = std::get<1>(P);
19616 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
19617 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
19618 is_contained(E->Scalars, V1));
19619 }))
19620 return false;
19621 }
19622 int Idx;
19623 if ((Mask.size() < InputVF &&
19624 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
19625 Idx == 0) ||
19626 (Mask.size() == InputVF &&
19627 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
19628 std::iota(
19629 std::next(Mask.begin(), I * SliceSize),
19630 std::next(Mask.begin(),
19631 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19632 0);
19633 } else {
19634 unsigned IVal =
19635 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
19636 std::fill(
19637 std::next(Mask.begin(), I * SliceSize),
19638 std::next(Mask.begin(),
19639 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
19640 IVal);
19641 }
19642 return true;
19643 };
19644 BVTy ShuffleBuilder(ScalarTy, Params...);
19645 ResTy Res = ResTy();
19646 SmallVector<int> Mask;
19647 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
19648 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
19649 Value *ExtractVecBase = nullptr;
19650 bool UseVecBaseAsInput = false;
19651 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
19652 SmallVector<SmallVector<const TreeEntry *>> Entries;
19653 Type *OrigScalarTy = GatheredScalars.front()->getType();
19654 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
19655 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
19656 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
19657 // Check for gathered extracts.
19658 bool Resized = false;
19659 ExtractShuffles =
19660 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
19661 if (!ExtractShuffles.empty()) {
19662 SmallVector<const TreeEntry *> ExtractEntries;
19663 for (auto [Idx, I] : enumerate(ExtractMask)) {
19664 if (I == PoisonMaskElem)
19665 continue;
19666 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
19667 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
19668 !TEs.empty())
19669 ExtractEntries.append(TEs.begin(), TEs.end());
19670 }
19671 if (std::optional<ResTy> Delayed =
19672 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19673 // Delay emission of gathers which are not ready yet.
19674 PostponedGathers.insert(E);
19675 // Postpone gather emission, will be emitted after the end of the
19676 // process to keep correct order.
19677 return *Delayed;
19678 }
19679 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
19680 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19681 ExtractVecBase = VecBase;
19682 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
19683 if (VF == VecBaseTy->getNumElements() &&
19684 GatheredScalars.size() != VF) {
19685 Resized = true;
19686 GatheredScalars.append(VF - GatheredScalars.size(),
19687 PoisonValue::get(OrigScalarTy));
19688 NumParts =
19689 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
19690 }
19691 }
19692 }
19693 // Gather extracts after we check for full matched gathers only.
19694 if (!ExtractShuffles.empty() || !E->hasState() ||
19695 E->getOpcode() != Instruction::Load ||
19696 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19697 any_of(E->Scalars, IsaPred<LoadInst>)) &&
19698 any_of(E->Scalars,
19699 [this](Value *V) {
19700 return isa<LoadInst>(V) && isVectorized(V);
19701 })) ||
19702 (E->hasState() && E->isAltShuffle()) ||
19703 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
19704 isSplat(E->Scalars) ||
19705 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
19706 GatherShuffles =
19707 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19708 }
19709 if (!GatherShuffles.empty()) {
19710 if (std::optional<ResTy> Delayed =
19711 ShuffleBuilder.needToDelay(E, Entries)) {
19712 // Delay emission of gathers which are not ready yet.
19713 PostponedGathers.insert(E);
19714 // Postpone gather emission, will be emitted after the end of the
19715 // process to keep correct order.
19716 return *Delayed;
19717 }
19718 if (GatherShuffles.size() == 1 &&
19719 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
19720 Entries.front().front()->isSame(E->Scalars)) {
19721 // Perfect match in the graph, will reuse the previously vectorized
19722 // node. Cost is 0.
19723 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
19724 << shortBundleName(E->Scalars, E->Idx) << ".\n");
19725 // Restore the mask for previous partially matched values.
19726 Mask.resize(E->Scalars.size());
19727 const TreeEntry *FrontTE = Entries.front().front();
19728 if (FrontTE->ReorderIndices.empty() &&
19729 ((FrontTE->ReuseShuffleIndices.empty() &&
19730 E->Scalars.size() == FrontTE->Scalars.size()) ||
19731 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19732 std::iota(Mask.begin(), Mask.end(), 0);
19733 } else {
19734 for (auto [I, V] : enumerate(E->Scalars)) {
19735 if (isa<PoisonValue>(V)) {
19736 Mask[I] = PoisonMaskElem;
19737 continue;
19738 }
19739 Mask[I] = FrontTE->findLaneForValue(V);
19740 }
19741 }
19742 // Reset the builder(s) to correctly handle perfect diamond matched
19743 // nodes.
19744 ShuffleBuilder.resetForSameNode();
19745 ShuffleBuilder.add(*FrontTE, Mask);
19746 // Full matched entry found, no need to insert subvectors.
19747 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19748 return Res;
19749 }
19750 if (!Resized) {
19751 if (GatheredScalars.size() != VF &&
19752 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
19753 return any_of(TEs, [&](const TreeEntry *TE) {
19754 return TE->getVectorFactor() == VF;
19755 });
19756 }))
19757 GatheredScalars.append(VF - GatheredScalars.size(),
19758 PoisonValue::get(OrigScalarTy));
19759 }
19760 // Remove shuffled elements from list of gathers.
19761 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19762 if (Mask[I] != PoisonMaskElem)
19763 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19764 }
19765 }
19766 }
19767 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
19768 SmallVectorImpl<int> &ReuseMask,
19769 bool IsRootPoison) {
19770 // For splats we can emit broadcasts instead of gathers, so try to find
19771 // such sequences (see the worked example after this lambda).
19772 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
19773 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
19774 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
19775 SmallVector<int> UndefPos;
19776 DenseMap<Value *, unsigned> UniquePositions;
19777 // Gather unique non-const values and all constant values.
19778 // For repeated values, just shuffle them.
19779 int NumNonConsts = 0;
19780 int SinglePos = 0;
19781 for (auto [I, V] : enumerate(Scalars)) {
19782 if (isa<UndefValue>(V)) {
19783 if (!isa<PoisonValue>(V)) {
19784 ReuseMask[I] = I;
19785 UndefPos.push_back(I);
19786 }
19787 continue;
19788 }
19789 if (isConstant(V)) {
19790 ReuseMask[I] = I;
19791 continue;
19792 }
19793 ++NumNonConsts;
19794 SinglePos = I;
19795 Value *OrigV = V;
19796 Scalars[I] = PoisonValue::get(OrigScalarTy);
19797 if (IsSplat) {
19798 Scalars.front() = OrigV;
19799 ReuseMask[I] = 0;
19800 } else {
19801 const auto Res = UniquePositions.try_emplace(OrigV, I);
19802 Scalars[Res.first->second] = OrigV;
19803 ReuseMask[I] = Res.first->second;
19804 }
19805 }
19806 if (NumNonConsts == 1) {
19807 // Restore single insert element.
19808 if (IsSplat) {
19809 ReuseMask.assign(VF, PoisonMaskElem);
19810 std::swap(Scalars.front(), Scalars[SinglePos]);
19811 if (!UndefPos.empty() && UndefPos.front() == 0)
19812 Scalars.front() = UndefValue::get(OrigScalarTy);
19813 }
19814 ReuseMask[SinglePos] = SinglePos;
19815 } else if (!UndefPos.empty() && IsSplat) {
19816 // For undef values, try to replace them with the simple broadcast.
19817 // We can do it if the broadcasted value is guaranteed to be
19818 // non-poisonous, or by freezing the incoming scalar value first.
19819 auto *It = find_if(Scalars, [this, E](Value *V) {
19820 return !isa<UndefValue>(V) &&
19821 (isGuaranteedNotToBePoison(V, AC) ||
19822 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
19823 // Check if the value is already used in the same operation in
19824 // one of the other nodes.
19825 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19826 is_contained(E->UserTreeIndex.UserTE->Scalars,
19827 U.getUser());
19828 })));
19829 });
19830 if (It != Scalars.end()) {
19831 // Replace undefs by the non-poisoned scalars and emit broadcast.
19832 int Pos = std::distance(Scalars.begin(), It);
19833 for (int I : UndefPos) {
19834 // Set the undef position to the non-poisoned scalar.
19835 ReuseMask[I] = Pos;
19836 // Replace the undef with poison; in the mask it has already been
19837 // replaced by the non-poisoned scalar.
19838 if (I != Pos)
19839 Scalars[I] = PoisonValue::get(OrigScalarTy);
19840 }
19841 } else {
19842 // Replace undefs by the poisons, emit broadcast and then emit
19843 // freeze.
19844 for (int I : UndefPos) {
19845 ReuseMask[I] = PoisonMaskElem;
19846 if (isa<UndefValue>(Scalars[I]))
19847 Scalars[I] = PoisonValue::get(OrigScalarTy);
19848 }
19849 NeedFreeze = true;
19850 }
19851 }
19852 };
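// A small worked example of the packing above (hypothetical scalars): for
// Scalars = {%x, %x, undef, %x} with IsRootPoison == true, the lambda keeps a
// single copy of %x in lane 0, turns the remaining lanes into poison, and
// produces ReuseMask = {0, 0, 0, 0}, i.e. the gather becomes a broadcast of %x,
// assuming %x can be shown non-poisonous (or is already used in the user node);
// otherwise the undef lane stays poison and the result is frozen afterwards.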
19853 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
19854 bool IsNonPoisoned = true;
19855 bool IsUsedInExpr = true;
19856 Value *Vec1 = nullptr;
19857 if (!ExtractShuffles.empty()) {
19858 // A gather of extractelements can be represented as just a shuffle of
19859 // the single/two vectors the scalars are extracted from.
19860 // Find the input vectors.
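// For instance (hypothetical IR), if the gathered scalars are
//   %e0 = extractelement <4 x float> %v, i64 0
//   %e1 = extractelement <4 x float> %w, i64 1
// the gather can be emitted as a single two-source shuffle of %v and %w, e.g.
//   %g = shufflevector <4 x float> %v, <4 x float> %w, <2 x i32> <i32 0, i32 5>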
19861 Value *Vec2 = nullptr;
19862 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19863 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
19864 ExtractMask[I] = PoisonMaskElem;
19865 }
19866 if (UseVecBaseAsInput) {
19867 Vec1 = ExtractVecBase;
19868 } else {
19869 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19870 if (ExtractMask[I] == PoisonMaskElem)
19871 continue;
19872 if (isa<UndefValue>(StoredGS[I]))
19873 continue;
19874 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
19875 Value *VecOp = EI->getVectorOperand();
19876 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
19877 !TEs.empty() && TEs.front()->VectorizedValue)
19878 VecOp = TEs.front()->VectorizedValue;
19879 if (!Vec1) {
19880 Vec1 = VecOp;
19881 } else if (Vec1 != VecOp) {
19882 assert((!Vec2 || Vec2 == VecOp) &&
19883 "Expected only 1 or 2 vectors shuffle.");
19884 Vec2 = VecOp;
19885 }
19886 }
19887 }
19888 if (Vec2) {
19889 IsUsedInExpr = false;
19890 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
19891 isGuaranteedNotToBePoison(Vec2, AC);
19892 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19893 } else if (Vec1) {
19894 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
19895 IsUsedInExpr &= FindReusedSplat(
19896 ExtractMask,
19897 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
19898 ExtractMask.size(), IsNotPoisonedVec);
19899 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
19900 IsNonPoisoned &= IsNotPoisonedVec;
19901 } else {
19902 IsUsedInExpr = false;
19903 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
19904 /*ForExtracts=*/true);
19905 }
19906 }
19907 if (!GatherShuffles.empty()) {
19908 unsigned SliceSize =
19909 getPartNumElems(E->Scalars.size(),
19910 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
19911 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19912 for (const auto [I, TEs] : enumerate(Entries)) {
19913 if (TEs.empty()) {
19914 assert(!GatherShuffles[I] &&
19915 "No shuffles with empty entries list expected.");
19916 continue;
19917 }
19918 assert((TEs.size() == 1 || TEs.size() == 2) &&
19919 "Expected shuffle of 1 or 2 entries.");
19920 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
19921 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
19922 VecMask.assign(VecMask.size(), PoisonMaskElem);
19923 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19924 if (TEs.size() == 1) {
19925 bool IsNotPoisonedVec =
19926 TEs.front()->VectorizedValue
19927 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
19928 : true;
19929 IsUsedInExpr &=
19930 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19931 SliceSize, IsNotPoisonedVec);
19932 ShuffleBuilder.add(*TEs.front(), VecMask);
19933 IsNonPoisoned &= IsNotPoisonedVec;
19934 } else {
19935 IsUsedInExpr = false;
19936 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19937 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19938 IsNonPoisoned &=
19939 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
19940 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
19941 }
19942 }
19943 }
19944 // Try to figure out the best way to combine values: build a shuffle and insert
19945 // elements or just build several shuffles.
19946 // Insert non-constant scalars.
19947 SmallVector<Value *> NonConstants(GatheredScalars);
19948 int EMSz = ExtractMask.size();
19949 int MSz = Mask.size();
19950 // Try to build a constant vector and shuffle with it only if currently we
19951 // have a single permutation and more than 1 scalar constant.
19952 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19953 bool IsIdentityShuffle =
19954 ((UseVecBaseAsInput ||
19955 all_of(ExtractShuffles,
19956 [](const std::optional<TTI::ShuffleKind> &SK) {
19957 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19958 TTI::SK_PermuteSingleSrc;
19959 })) &&
19960 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19961 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
19962 (!GatherShuffles.empty() &&
19963 all_of(GatherShuffles,
19964 [](const std::optional<TTI::ShuffleKind> &SK) {
19965 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
19966 TTI::SK_PermuteSingleSrc;
19967 }) &&
19968 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19969 ShuffleVectorInst::isIdentityMask(Mask, MSz));
19970 bool EnoughConstsForShuffle =
19971 IsSingleShuffle &&
19972 (none_of(GatheredScalars,
19973 [](Value *V) {
19974 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19975 }) ||
19976 any_of(GatheredScalars,
19977 [](Value *V) {
19978 return isa<Constant>(V) && !isa<UndefValue>(V);
19979 })) &&
19980 (!IsIdentityShuffle ||
19981 (GatheredScalars.size() == 2 &&
19982 any_of(GatheredScalars,
19983 [](Value *V) { return !isa<UndefValue>(V); })) ||
19984 count_if(GatheredScalars, [](Value *V) {
19985 return isa<Constant>(V) && !isa<PoisonValue>(V);
19986 }) > 1);
19987 // The NonConstants array contains just the non-constant values, GatheredScalars
19988 // contains only constants used to build the final vector and then shuffle.
19989 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19990 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19991 NonConstants[I] = PoisonValue::get(OrigScalarTy);
19992 else
19993 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
19994 }
19995 // Generate constants for final shuffle and build a mask for them.
19996 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
19997 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19998 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19999 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
20000 ShuffleBuilder.add(BV, BVMask);
20001 }
20002 if (all_of(NonConstants, [=](Value *V) {
20003 return isa<PoisonValue>(V) ||
20004 (IsSingleShuffle && ((IsIdentityShuffle &&
20005 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
20006 }))
20007 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20008 SubVectorsMask);
20009 else
20010 Res = ShuffleBuilder.finalize(
20011 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
20012 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
20013 bool IsSplat = isSplat(NonConstants);
20014 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
20015 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
20016 auto CheckIfSplatIsProfitable = [&]() {
20017 // Estimate the cost of splatting + shuffle and compare with
20018 // insert + shuffle.
20019 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20020 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
20021 if (isa<ExtractElementInst>(V) || isVectorized(V))
20022 return false;
20023 InstructionCost SplatCost = TTI->getVectorInstrCost(
20024 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
20025 PoisonValue::get(VecTy), V);
20026 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20027 for (auto [Idx, I] : enumerate(BVMask))
20028 if (I != PoisonMaskElem)
20029 NewMask[Idx] = Mask.size();
20030 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
20031 NewMask, CostKind);
20032 InstructionCost BVCost = TTI->getVectorInstrCost(
20033 Instruction::InsertElement, VecTy, CostKind,
20034 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
20035 // Shuffle required?
20036 if (count(BVMask, PoisonMaskElem) <
20037 static_cast<int>(BVMask.size() - 1)) {
20038 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20039 for (auto [Idx, I] : enumerate(BVMask))
20040 if (I != PoisonMaskElem)
20041 NewMask[Idx] = I;
20042 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
20043 VecTy, NewMask, CostKind);
20044 }
20045 return SplatCost <= BVCost;
20046 };
20047 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
20048 for (auto [Idx, I] : enumerate(BVMask))
20049 if (I != PoisonMaskElem)
20050 Mask[Idx] = I;
20051 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
20052 } else {
20053 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
20054 SmallVector<Value *> Values(NonConstants.size(),
20055 PoisonValue::get(ScalarTy));
20056 Values[0] = V;
20057 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
20058 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
20059 transform(BVMask, SplatMask.begin(), [](int I) {
20060 return I == PoisonMaskElem ? PoisonMaskElem : 0;
20061 });
20062 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
20063 BV = CreateShuffle(BV, nullptr, SplatMask);
20064 for (auto [Idx, I] : enumerate(BVMask))
20065 if (I != PoisonMaskElem)
20066 Mask[Idx] = BVMask.size() + Idx;
20067 Vec = CreateShuffle(Vec, BV, Mask);
20068 for (auto [Idx, I] : enumerate(Mask))
20069 if (I != PoisonMaskElem)
20070 Mask[Idx] = Idx;
20071 }
20072 });
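// On the trade-off evaluated in CheckIfSplatIsProfitable above (a sketch of the
// two candidate sequences, hypothetical costs):
//   splat : insertelement into lane 0 of poison  + one two-source shuffle
//   insert: insertelement into the used lane of Vec + (maybe) one single-source shuffle
// The splat form is only taken when its modeled cost does not exceed the insert
// form, and never for extractelement-based or already-vectorized values.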
20073 } else if (!allConstant(GatheredScalars)) {
20074 // Gather unique scalars and all constants.
20075 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
20076 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
20077 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
20078 ShuffleBuilder.add(BV, ReuseMask);
20079 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20080 SubVectorsMask);
20081 } else {
20082 // Gather all constants.
20083 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
20084 for (auto [I, V] : enumerate(GatheredScalars)) {
20085 if (!isa<PoisonValue>(V))
20086 Mask[I] = I;
20087 }
20088 Value *BV = ShuffleBuilder.gather(GatheredScalars);
20089 ShuffleBuilder.add(BV, Mask);
20090 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20091 SubVectorsMask);
20092 }
20093
20094 if (NeedFreeze)
20095 Res = ShuffleBuilder.createFreeze(Res);
20096 return Res;
20097}
20098
20099Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
20100 // Do not do this for split vectorize node, marked to be gathers/buildvectors.
20101 if (E->State != TreeEntry::SplitVectorize ||
20102 !TransformedToGatherNodes.contains(E)) {
20103 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
20104 (void)vectorizeTree(VectorizableTree[EIdx].get());
20105 }
20106 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
20107 Builder, *this);
20108}
20109
20110/// \returns \p I after propagating metadata from \p VL only for instructions in
20111/// \p VL.
20112 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
20113 SmallVector<Value *> Insts;
20114 for (Value *V : VL)
20115 if (isa<Instruction>(V))
20116 Insts.push_back(V);
20117 return llvm::propagateMetadata(Inst, Insts);
20118}
20119
20120 static DebugLoc getDebugLocFromPHI(PHINode &PN) {
20121 if (DebugLoc DL = PN.getDebugLoc())
20122 return DL;
20123 return DebugLoc::getUnknown();
20124}
20125
20126Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
20127 IRBuilderBase::InsertPointGuard Guard(Builder);
20128
20129 Value *V = E->Scalars.front();
20130 Type *ScalarTy = V->getType();
20131 if (!isa<CmpInst>(V))
20132 ScalarTy = getValueType(V);
20133 auto It = MinBWs.find(E);
20134 if (It != MinBWs.end()) {
20135 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
20136 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
20137 if (VecTy)
20138 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
20139 }
20140 if (E->VectorizedValue)
20141 return E->VectorizedValue;
20142 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
20143 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
20144 // Set insert point for non-reduction initial nodes.
20145 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
20146 setInsertPointAfterBundle(E);
20147 Value *Vec = createBuildVector(E, ScalarTy);
20148 E->VectorizedValue = Vec;
20149 return Vec;
20150 }
20151 if (E->State == TreeEntry::SplitVectorize) {
20152 assert(E->CombinedEntriesWithIndices.size() == 2 &&
20153 "Expected exactly 2 combined entries.");
20154 setInsertPointAfterBundle(E);
20155 TreeEntry &OpTE1 =
20156 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
20157 assert(OpTE1.isSame(
20158 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
20159 "Expected same first part of scalars.");
20160 Value *Op1 = vectorizeTree(&OpTE1);
20161 TreeEntry &OpTE2 =
20162 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
20163 assert(
20164 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
20165 "Expected same second part of scalars.");
20166 Value *Op2 = vectorizeTree(&OpTE2);
20167 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
20168 bool IsSigned = false;
20169 auto It = MinBWs.find(OpE);
20170 if (It != MinBWs.end())
20171 IsSigned = It->second.second;
20172 else
20173 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
20174 if (isa<PoisonValue>(V))
20175 return false;
20176 return !isKnownNonNegative(R, SimplifyQuery(*DL));
20177 });
20178 return IsSigned;
20179 };
20180 if (cast<VectorType>(Op1->getType())->getElementType() !=
20181 ScalarTy->getScalarType()) {
20182 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20183 Op1 = Builder.CreateIntCast(
20184 Op1,
20185 getWidenedType(
20186 ScalarTy,
20187 cast<FixedVectorType>(Op1->getType())->getNumElements()),
20188 GetOperandSignedness(&OpTE1));
20189 }
20190 if (cast<VectorType>(Op2->getType())->getElementType() !=
20191 ScalarTy->getScalarType()) {
20192 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20193 Op2 = Builder.CreateIntCast(
20194 Op2,
20195 getWidenedType(
20196 ScalarTy,
20197 cast<FixedVectorType>(Op2->getType())->getNumElements()),
20198 GetOperandSignedness(&OpTE2));
20199 }
20200 if (E->ReorderIndices.empty()) {
20201 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
20202 std::iota(
20203 Mask.begin(),
20204 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
20205 0);
20206 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20207 if (ScalarTyNumElements != 1) {
20208 assert(SLPReVec && "Only supported by REVEC.");
20209 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
20210 }
20211 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
20212 Vec = createInsertVector(Builder, Vec, Op2,
20213 E->CombinedEntriesWithIndices.back().second *
20214 ScalarTyNumElements);
20215 E->VectorizedValue = Vec;
20216 return Vec;
20217 }
20218 unsigned CommonVF =
20219 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
20220 if (getNumElements(Op1->getType()) != CommonVF) {
20221 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20222 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
20223 0);
20224 Op1 = Builder.CreateShuffleVector(Op1, Mask);
20225 }
20226 if (getNumElements(Op2->getType()) != CommonVF) {
20227 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20228 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
20229 0);
20230 Op2 = Builder.CreateShuffleVector(Op2, Mask);
20231 }
20232 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
20233 E->VectorizedValue = Vec;
20234 return Vec;
20235 }
20236
20237 bool IsReverseOrder =
20238 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
20239 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
20240 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
20241 if (E->getOpcode() == Instruction::Store &&
20242 E->State == TreeEntry::Vectorize) {
20243 ArrayRef<int> Mask =
20244 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
20245 E->ReorderIndices.size());
20246 ShuffleBuilder.add(V, Mask);
20247 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
20248 E->State == TreeEntry::CompressVectorize) {
20249 ShuffleBuilder.addOrdered(V, {});
20250 } else {
20251 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
20252 }
20253 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
20254 E->CombinedEntriesWithIndices.size());
20255 transform(
20256 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
20257 return std::make_pair(VectorizableTree[P.first].get(), P.second);
20258 });
20259 assert(
20260 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
20261 "Expected either combined subnodes or reordering");
20262 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
20263 };
20264
20265 assert(!E->isGather() && "Unhandled state");
20266 unsigned ShuffleOrOp =
20267 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
20268 if (!E->isAltShuffle()) {
20269 switch (E->CombinedOp) {
20270 case TreeEntry::ReducedBitcast:
20271 case TreeEntry::ReducedBitcastBSwap:
20272 ShuffleOrOp = E->CombinedOp;
20273 break;
20274 default:
20275 break;
20276 }
20277 }
20278 Instruction *VL0 = E->getMainOp();
20279 auto GetOperandSignedness = [&](unsigned Idx) {
20280 const TreeEntry *OpE = getOperandEntry(E, Idx);
20281 bool IsSigned = false;
20282 auto It = MinBWs.find(OpE);
20283 if (It != MinBWs.end())
20284 IsSigned = It->second.second;
20285 else
20286 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
20287 if (isa<PoisonValue>(V))
20288 return false;
20289 return !isKnownNonNegative(R, SimplifyQuery(*DL));
20290 });
20291 return IsSigned;
20292 };
20293 switch (ShuffleOrOp) {
20294 case Instruction::PHI: {
20295 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
20296 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
20297 "PHI reordering is free.");
20298 auto *PH = cast<PHINode>(VL0);
20299 Builder.SetInsertPoint(PH->getParent(),
20300 PH->getParent()->getFirstNonPHIIt());
20301 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
20302 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
20303 Value *V = NewPhi;
20304
20305 // Adjust the insertion point once all PHIs have been generated.
20306 Builder.SetInsertPoint(PH->getParent(),
20307 PH->getParent()->getFirstInsertionPt());
20308 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
20309
20310 V = FinalShuffle(V, E);
20311
20312 E->VectorizedValue = V;
20313 // If phi node is fully emitted - exit.
20314 if (NewPhi->getNumIncomingValues() != 0)
20315 return NewPhi;
20316
20317 // PHINodes may have multiple entries from the same block. We want to
20318 // visit every block once.
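// For example (illustrative IR), a phi fed by a switch with two cases landing
// in the same predecessor:
//   %p = phi i32 [ %a, %bb1 ], [ %a, %bb1 ], [ %b, %bb2 ]
// Here %bb1 must contribute a single vectorized incoming value, so repeated
// blocks reuse the operand already emitted for their first occurrence.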
20319 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
20320 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
20321 BasicBlock *IBB = PH->getIncomingBlock(I);
20322
20323 // Stop emission if all incoming values are generated.
20324 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
20325 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
20326 return NewPhi;
20327 }
20328
20329 auto Res = VisitedBBs.try_emplace(IBB, I);
20330 if (!Res.second) {
20331 TreeEntry *OpTE = getOperandEntry(E, I);
20332 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
20333 TransformedToGatherNodes.contains(OpTE)) {
20334 Value *VecOp = NewPhi->getIncomingValue(Res.first->getSecond());
20335 NewPhi->addIncoming(VecOp, IBB);
20336 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
20337 OpTE->VectorizedValue = VecOp;
20338 continue;
20339 }
20340 }
20341
20342 Builder.SetInsertPoint(IBB->getTerminator());
20343 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
20344 Value *Vec = vectorizeOperand(E, I);
20345 if (VecTy != Vec->getType()) {
20346 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
20347 MinBWs.contains(getOperandEntry(E, I))) &&
20348 "Expected item in MinBWs.");
20349 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
20350 }
20351 NewPhi->addIncoming(Vec, IBB);
20352 }
20353
20354 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
20355 "Invalid number of incoming values");
20356 assert(E->VectorizedValue && "Expected vectorized value.");
20357 return E->VectorizedValue;
20358 }
20359
20360 case Instruction::ExtractElement: {
20361 Value *V = E->getSingleOperand(0);
20362 setInsertPointAfterBundle(E);
20363 V = FinalShuffle(V, E);
20364 E->VectorizedValue = V;
20365 return V;
20366 }
20367 case Instruction::ExtractValue: {
20368 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
20369 Builder.SetInsertPoint(LI);
20370 Value *Ptr = LI->getPointerOperand();
20371 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
20372 Value *NewV = ::propagateMetadata(V, E->Scalars);
20373 NewV = FinalShuffle(NewV, E);
20374 E->VectorizedValue = NewV;
20375 return NewV;
20376 }
20377 case Instruction::InsertElement: {
20378 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20379 if (const TreeEntry *OpE = getOperandEntry(E, 1);
20380 OpE && !OpE->isGather() && OpE->hasState() &&
20381 !OpE->hasCopyableElements())
20382 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
20383 else
20384 setInsertPointAfterBundle(E);
20385 Value *V = vectorizeOperand(E, 1);
20386 ArrayRef<Value *> Op = E->getOperand(1);
20387 Type *ScalarTy = Op.front()->getType();
20388 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
20389 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20390 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
20391 assert(Res.first > 0 && "Expected item in MinBWs.");
20392 V = Builder.CreateIntCast(
20393 V,
20394 getWidenedType(
20395 ScalarTy,
20396 cast<FixedVectorType>(V->getType())->getNumElements()),
20397 Res.second);
20398 }
20399
20400 // Create InsertVector shuffle if necessary
20401 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
20402 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
20403 }));
20404 const unsigned NumElts =
20405 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
20406 const unsigned NumScalars = E->Scalars.size();
20407
20408 unsigned Offset = *getElementIndex(VL0);
20409 assert(Offset < NumElts && "Failed to find vector index offset");
20410
20411 // Create shuffle to resize vector
20412 SmallVector<int> Mask;
20413 if (!E->ReorderIndices.empty()) {
20414 inversePermutation(E->ReorderIndices, Mask);
20415 Mask.append(NumElts - NumScalars, PoisonMaskElem);
20416 } else {
20417 Mask.assign(NumElts, PoisonMaskElem);
20418 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
20419 }
20420 // Create InsertVector shuffle if necessary
20421 bool IsIdentity = true;
20422 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
20423 Mask.swap(PrevMask);
20424 for (unsigned I = 0; I < NumScalars; ++I) {
20425 Value *Scalar = E->Scalars[PrevMask[I]];
20426 unsigned InsertIdx = *getElementIndex(Scalar);
20427 IsIdentity &= InsertIdx - Offset == I;
20428 Mask[InsertIdx - Offset] = I;
20429 }
20430 if (!IsIdentity || NumElts != NumScalars) {
20431 Value *V2 = nullptr;
20432 bool IsVNonPoisonous =
20433 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
20434 SmallVector<int> InsertMask(Mask);
20435 if (NumElts != NumScalars && Offset == 0) {
20436 // Follow all insert element instructions from the current buildvector
20437 // sequence.
20438 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
20439 do {
20440 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
20441 if (!InsertIdx)
20442 break;
20443 if (InsertMask[*InsertIdx] == PoisonMaskElem)
20444 InsertMask[*InsertIdx] = *InsertIdx;
20445 if (!Ins->hasOneUse())
20446 break;
20447 Ins = dyn_cast_or_null<InsertElementInst>(
20448 Ins->getUniqueUndroppableUser());
20449 } while (Ins);
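// The chain being followed above typically looks like (hypothetical IR):
//   %i0 = insertelement <4 x float> poison, float %a, i64 0
//   %i1 = insertelement <4 x float> %i0, float %b, i64 1
//   %i2 = insertelement <4 x float> %i1, float %c, i64 2
// Each single-use link contributes its constant lane index to InsertMask so
// the whole buildvector sequence is covered.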
20450 SmallBitVector UseMask =
20451 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20452 SmallBitVector IsFirstPoison =
20453 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20454 SmallBitVector IsFirstUndef =
20455 isUndefVector(FirstInsert->getOperand(0), UseMask);
20456 if (!IsFirstPoison.all()) {
20457 unsigned Idx = 0;
20458 for (unsigned I = 0; I < NumElts; I++) {
20459 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
20460 IsFirstUndef.test(I)) {
20461 if (IsVNonPoisonous) {
20462 InsertMask[I] = I < NumScalars ? I : 0;
20463 continue;
20464 }
20465 if (!V2)
20466 V2 = UndefValue::get(V->getType());
20467 if (Idx >= NumScalars)
20468 Idx = NumScalars - 1;
20469 InsertMask[I] = NumScalars + Idx;
20470 ++Idx;
20471 } else if (InsertMask[I] != PoisonMaskElem &&
20472 Mask[I] == PoisonMaskElem) {
20473 InsertMask[I] = PoisonMaskElem;
20474 }
20475 }
20476 } else {
20477 InsertMask = Mask;
20478 }
20479 }
20480 if (!V2)
20481 V2 = PoisonValue::get(V->getType());
20482 V = Builder.CreateShuffleVector(V, V2, InsertMask);
20483 if (auto *I = dyn_cast<Instruction>(V)) {
20484 GatherShuffleExtractSeq.insert(I);
20485 CSEBlocks.insert(I->getParent());
20486 }
20487 }
20488
20489 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
20490 for (unsigned I = 0; I < NumElts; I++) {
20491 if (Mask[I] != PoisonMaskElem)
20492 InsertMask[Offset + I] = I;
20493 }
20494 SmallBitVector UseMask =
20495 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20496 SmallBitVector IsFirstUndef =
20497 isUndefVector(FirstInsert->getOperand(0), UseMask);
20498 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20499 NumElts != NumScalars) {
20500 if (IsFirstUndef.all()) {
20501 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
20502 SmallBitVector IsFirstPoison =
20503 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20504 if (!IsFirstPoison.all()) {
20505 for (unsigned I = 0; I < NumElts; I++) {
20506 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
20507 InsertMask[I] = I + NumElts;
20508 }
20509 }
20510 V = Builder.CreateShuffleVector(
20511 V,
20512 IsFirstPoison.all() ? PoisonValue::get(V->getType())
20513 : FirstInsert->getOperand(0),
20514 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
20515 if (auto *I = dyn_cast<Instruction>(V)) {
20516 GatherShuffleExtractSeq.insert(I);
20517 CSEBlocks.insert(I->getParent());
20518 }
20519 }
20520 } else {
20521 SmallBitVector IsFirstPoison =
20522 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
20523 for (unsigned I = 0; I < NumElts; I++) {
20524 if (InsertMask[I] == PoisonMaskElem)
20525 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
20526 else
20527 InsertMask[I] += NumElts;
20528 }
20529 V = Builder.CreateShuffleVector(
20530 FirstInsert->getOperand(0), V, InsertMask,
20531 cast<Instruction>(E->Scalars.back())->getName());
20532 if (auto *I = dyn_cast<Instruction>(V)) {
20533 GatherShuffleExtractSeq.insert(I);
20534 CSEBlocks.insert(I->getParent());
20535 }
20536 }
20537 }
20538
20539 ++NumVectorInstructions;
20540 E->VectorizedValue = V;
20541 return V;
20542 }
20543 case Instruction::ZExt:
20544 case Instruction::SExt:
20545 case Instruction::FPToUI:
20546 case Instruction::FPToSI:
20547 case Instruction::FPExt:
20548 case Instruction::PtrToInt:
20549 case Instruction::IntToPtr:
20550 case Instruction::SIToFP:
20551 case Instruction::UIToFP:
20552 case Instruction::Trunc:
20553 case Instruction::FPTrunc:
20554 case Instruction::BitCast: {
20555 setInsertPointAfterBundle(E);
20556
20557 Value *InVec = vectorizeOperand(E, 0);
20558
20559 auto *CI = cast<CastInst>(VL0);
20560 Instruction::CastOps VecOpcode = CI->getOpcode();
20561 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
20562 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
20563 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
20564 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20565 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
20566 // Check if the values are candidates to demote.
20567 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
20568 if (SrcIt != MinBWs.end())
20569 SrcBWSz = SrcIt->second.first;
20570 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
20571 if (BWSz == SrcBWSz) {
20572 VecOpcode = Instruction::BitCast;
20573 } else if (BWSz < SrcBWSz) {
20574 VecOpcode = Instruction::Trunc;
20575 } else if (It != MinBWs.end()) {
20576 assert(BWSz > SrcBWSz && "Invalid cast!");
20577 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20578 } else if (SrcIt != MinBWs.end()) {
20579 assert(BWSz > SrcBWSz && "Invalid cast!");
20580 VecOpcode =
20581 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20582 }
20583 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20584 !SrcIt->second.second) {
20585 VecOpcode = Instruction::UIToFP;
20586 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
20587 ScalarTy->isFPOrFPVectorTy()) {
20588 Type *OrigSrcScalarTy = CI->getSrcTy();
20589 auto *OrigSrcVectorTy =
20590 getWidenedType(OrigSrcScalarTy, E->Scalars.size());
20591 InVec =
20592 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
20593 }
20594 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20595 ? InVec
20596 : Builder.CreateCast(VecOpcode, InVec, VecTy);
20597 V = FinalShuffle(V, E);
20598
20599 E->VectorizedValue = V;
20600 ++NumVectorInstructions;
20601 return V;
20602 }
20603 case Instruction::FCmp:
20604 case Instruction::ICmp: {
20605 setInsertPointAfterBundle(E);
20606
20607 Value *L = vectorizeOperand(E, 0);
20608 Value *R = vectorizeOperand(E, 1);
20609 if (L->getType() != R->getType()) {
20610 assert((getOperandEntry(E, 0)->isGather() ||
20611 getOperandEntry(E, 1)->isGather() ||
20612 MinBWs.contains(getOperandEntry(E, 0)) ||
20613 MinBWs.contains(getOperandEntry(E, 1))) &&
20614 "Expected item in MinBWs.");
20615 if (cast<VectorType>(L->getType())
20616 ->getElementType()
20617 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
20618 ->getElementType()
20619 ->getIntegerBitWidth()) {
20620 Type *CastTy = R->getType();
20621 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
20622 } else {
20623 Type *CastTy = L->getType();
20624 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
20625 }
20626 }
20627
20628 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
20629 Value *V = Builder.CreateCmp(P0, L, R);
20630 propagateIRFlags(V, E->Scalars, VL0);
20631 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
20632 ICmp->setSameSign(/*B=*/false);
20633 // Do not cast for cmps.
20634 VecTy = cast<FixedVectorType>(V->getType());
20635 V = FinalShuffle(V, E);
20636
20637 E->VectorizedValue = V;
20638 ++NumVectorInstructions;
20639 return V;
20640 }
20641 case Instruction::Select: {
20642 setInsertPointAfterBundle(E);
20643
20644 Value *Cond = vectorizeOperand(E, 0);
20645 Value *True = vectorizeOperand(E, 1);
20646 Value *False = vectorizeOperand(E, 2);
20647 if (True->getType() != VecTy || False->getType() != VecTy) {
20648 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
20649 getOperandEntry(E, 2)->isGather() ||
20650 MinBWs.contains(getOperandEntry(E, 1)) ||
20651 MinBWs.contains(getOperandEntry(E, 2))) &&
20652 "Expected item in MinBWs.");
20653 if (True->getType() != VecTy)
20654 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
20655 if (False->getType() != VecTy)
20656 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
20657 }
20658
20659 unsigned CondNumElements = getNumElements(Cond->getType());
20660 unsigned TrueNumElements = getNumElements(True->getType());
20661 assert(TrueNumElements >= CondNumElements &&
20662 TrueNumElements % CondNumElements == 0 &&
20663 "Cannot vectorize Instruction::Select");
20664 assert(TrueNumElements == getNumElements(False->getType()) &&
20665 "Cannot vectorize Instruction::Select");
20666 if (CondNumElements != TrueNumElements) {
20667 // When the return type is i1 but the source is fixed vector type, we
20668 // need to duplicate the condition value.
20669 Cond = Builder.CreateShuffleVector(
20670 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
20671 CondNumElements));
20672 }
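// Example of the replication above (REVEC-style case, hypothetical types): for
// Cond of type <2 x i1> and True/False of type <4 x i32>, the replicated mask
// is <0, 0, 1, 1>, i.e.
//   %cond.rep = shufflevector <2 x i1> %cond, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
// so each condition bit guards the elements of its original sub-vector.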
20673 assert(getNumElements(Cond->getType()) == TrueNumElements &&
20674 "Cannot vectorize Instruction::Select");
20675 Value *V =
20676 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
20677 V = FinalShuffle(V, E);
20678
20679 E->VectorizedValue = V;
20680 ++NumVectorInstructions;
20681 return V;
20682 }
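// Illustrative sketch for the select case (lane counts hypothetical): with
// REVEC the condition may have fewer lanes than the true/false operands, so it
// is replicated via createReplicatedMask before the select, roughly:
//   %c4  = shufflevector <2 x i1> %c, <2 x i1> poison,
//                        <4 x i32> <i32 0, i32 0, i32 1, i32 1>
//   %sel = select <4 x i1> %c4, <4 x float> %t, <4 x float> %f
// The assertions above only require the true/false lane count to be a multiple
// of the condition lane count.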
20683 case Instruction::FNeg: {
20684 setInsertPointAfterBundle(E);
20685
20686 Value *Op = vectorizeOperand(E, 0);
20687
20688 Value *V = Builder.CreateUnOp(
20689 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
20690 propagateIRFlags(V, E->Scalars, VL0);
20691 if (auto *I = dyn_cast<Instruction>(V))
20692 V = ::propagateMetadata(I, E->Scalars);
20693
20694 V = FinalShuffle(V, E);
20695
20696 E->VectorizedValue = V;
20697 ++NumVectorInstructions;
20698
20699 return V;
20700 }
20701 case Instruction::Freeze: {
20702 setInsertPointAfterBundle(E);
20703
20704 Value *Op = vectorizeOperand(E, 0);
20705
20706 if (Op->getType() != VecTy) {
20707 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20708 MinBWs.contains(getOperandEntry(E, 0))) &&
20709 "Expected item in MinBWs.");
20710 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
20711 }
20712 Value *V = Builder.CreateFreeze(Op);
20713 V = FinalShuffle(V, E);
20714
20715 E->VectorizedValue = V;
20716 ++NumVectorInstructions;
20717
20718 return V;
20719 }
20720 case Instruction::Add:
20721 case Instruction::FAdd:
20722 case Instruction::Sub:
20723 case Instruction::FSub:
20724 case Instruction::Mul:
20725 case Instruction::FMul:
20726 case Instruction::UDiv:
20727 case Instruction::SDiv:
20728 case Instruction::FDiv:
20729 case Instruction::URem:
20730 case Instruction::SRem:
20731 case Instruction::FRem:
20732 case Instruction::Shl:
20733 case Instruction::LShr:
20734 case Instruction::AShr:
20735 case Instruction::And:
20736 case Instruction::Or:
20737 case Instruction::Xor: {
20738 setInsertPointAfterBundle(E);
20739
20740 Value *LHS = vectorizeOperand(E, 0);
20741 Value *RHS = vectorizeOperand(E, 1);
20742 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20743 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
20744 ArrayRef<Value *> Ops = E->getOperand(I);
20745 if (all_of(Ops, [&](Value *Op) {
20746 auto *CI = dyn_cast<ConstantInt>(Op);
20747 return CI && CI->getValue().countr_one() >= It->second.first;
20748 })) {
20749 V = FinalShuffle(I == 0 ? RHS : LHS, E);
20750 E->VectorizedValue = V;
20751 ++NumVectorInstructions;
20752 return V;
20753 }
20754 }
20755 }
20756 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
20757 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20758 getOperandEntry(E, 1)->isGather() ||
20759 MinBWs.contains(getOperandEntry(E, 0)) ||
20760 MinBWs.contains(getOperandEntry(E, 1))) &&
20761 "Expected item in MinBWs.");
20762 if (LHS->getType() != VecTy)
20763 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
20764 if (RHS->getType() != VecTy)
20765 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
20766 }
20767
20768 Value *V = Builder.CreateBinOp(
20769 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
20770 RHS);
20771 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
20772 if (auto *I = dyn_cast<Instruction>(V)) {
20773 V = ::propagateMetadata(I, E->Scalars);
20774 // Drop nuw flags for abs(sub(commutative), true).
20775 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
20776 any_of(E->Scalars, [E](Value *V) {
20777 return isa<PoisonValue>(V) ||
20778 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20779 isCommutative(cast<Instruction>(V));
20780 }))
20781 I->setHasNoUnsignedWrap(/*b=*/false);
20782 }
20783
20784 V = FinalShuffle(V, E);
20785
20786 E->VectorizedValue = V;
20787 ++NumVectorInstructions;
20788
20789 return V;
20790 }
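// Two details of the binary-op case, restated informally. For a narrowed
// 'and' whose constant operand is all-ones in the demanded (MinBWs) bits, the
// operation is skipped and the other operand is forwarded through
// FinalShuffle, e.g. roughly:
//   and <4 x i8> %x, splat (i8 -1)   ; folded away, %x used directly
// And nuw is dropped on a vector 'sub' when any scalar lane is poison,
// copyable, or commutative (its operands may have been swapped), since the
// wrap flag may not hold for those lanes. This is only a summary of the code
// above.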
20791 case Instruction::Load: {
20792 // Loads are inserted at the head of the tree because we don't want to
20793 // sink them all the way down past store instructions.
20794 setInsertPointAfterBundle(E);
20795
20796 LoadInst *LI = cast<LoadInst>(VL0);
20797 Instruction *NewLI;
20798 FixedVectorType *StridedLoadTy = nullptr;
20799 Value *PO = LI->getPointerOperand();
20800 if (E->State == TreeEntry::Vectorize) {
20801 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
20802 } else if (E->State == TreeEntry::CompressVectorize) {
20803 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
20804 CompressEntryToData.at(E);
20805 Align CommonAlignment = LI->getAlign();
20806 if (IsMasked) {
20807 unsigned VF = getNumElements(LoadVecTy);
20808 SmallVector<Constant *> MaskValues(
20809 VF / getNumElements(LI->getType()),
20810 ConstantInt::getFalse(VecTy->getContext()));
20811 for (int I : CompressMask)
20812 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
20813 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20814 assert(SLPReVec && "Only supported by REVEC.");
20815 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
20816 }
20817 Constant *MaskValue = ConstantVector::get(MaskValues);
20818 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
20819 MaskValue);
20820 } else {
20821 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
20822 }
20823 NewLI = ::propagateMetadata(NewLI, E->Scalars);
20824 // TODO: include this cost into CommonCost.
20825 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
20826 assert(SLPReVec && "FixedVectorType is not expected.");
20827 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
20828 CompressMask);
20829 }
20830 NewLI =
20831 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
20832 } else if (E->State == TreeEntry::StridedVectorize) {
20833 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
20834 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
20835 PO = IsReverseOrder ? PtrN : Ptr0;
20836 Type *StrideTy = DL->getIndexType(PO->getType());
20837 Value *StrideVal;
20838 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
20839 StridedLoadTy = SPtrInfo.Ty;
20840 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
20841 unsigned StridedLoadEC =
20842 StridedLoadTy->getElementCount().getKnownMinValue();
20843
20844 Value *Stride = SPtrInfo.StrideVal;
20845 if (!Stride) {
20846 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
20847 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
20848 SCEVExpander Expander(*SE, "strided-load-vec");
20849 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
20850 &*Builder.GetInsertPoint());
20851 }
20852 Value *NewStride =
20853 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
20854 StrideVal = Builder.CreateMul(
20855 NewStride, ConstantInt::getSigned(
20856 StrideTy, (IsReverseOrder ? -1 : 1) *
20857 static_cast<int>(
20858 DL->getTypeAllocSize(ScalarTy))));
20859 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20860 auto *Inst = Builder.CreateIntrinsic(
20861 Intrinsic::experimental_vp_strided_load,
20862 {StridedLoadTy, PO->getType(), StrideTy},
20863 {PO, StrideVal,
20864 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
20865 Builder.getInt32(StridedLoadEC)});
20866 Inst->addParamAttr(
20867 /*ArgNo=*/0,
20868 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20869 NewLI = Inst;
20870 } else {
20871 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
20872 Value *VecPtr = vectorizeOperand(E, 0);
20873 if (isa<FixedVectorType>(ScalarTy)) {
20874 assert(SLPReVec && "FixedVectorType is not expected.");
20875 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We need
20876 // to expand VecPtr if ScalarTy is a vector type.
20877 unsigned ScalarTyNumElements =
20878 cast<FixedVectorType>(ScalarTy)->getNumElements();
20879 unsigned VecTyNumElements =
20880 cast<FixedVectorType>(VecTy)->getNumElements();
20881 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
20882 "Cannot expand getelementptr.");
20883 unsigned VF = VecTyNumElements / ScalarTyNumElements;
20884 SmallVector<Constant *> Indices(VecTyNumElements);
20885 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
20886 return Builder.getInt64(I % ScalarTyNumElements);
20887 });
20888 VecPtr = Builder.CreateGEP(
20889 VecTy->getElementType(),
20890 Builder.CreateShuffleVector(
20891 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
20892 ConstantVector::get(Indices));
20893 }
20894 // Use the minimum alignment of the gathered loads.
20895 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
20896 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
20897 }
20898 Value *V = E->State == TreeEntry::CompressVectorize
20899 ? NewLI
20900 : ::propagateMetadata(NewLI, E->Scalars);
20901
20902 if (StridedLoadTy != VecTy)
20903 V = Builder.CreateBitOrPointerCast(V, VecTy);
20904 V = FinalShuffle(V, E);
20905 E->VectorizedValue = V;
20906 ++NumVectorInstructions;
20907 return V;
20908 }
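// Rough, illustrative summary of the four load strategies selected above
// (types and strides hypothetical):
//   Vectorize:          %v = load <4 x i32>, ptr %p, align 4
//   CompressVectorize:  a wide (possibly masked) load followed by a
//                       shufflevector that compresses the loaded lanes
//   StridedVectorize:   %v = call <4 x i32>
//                         @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//                           ptr %p, i64 %stride, <4 x i1> splat (i1 true), i32 4)
//   ScatterVectorize:   %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(...)
// For reversed strided order the base pointer is the last element and the
// stride is negated, as computed above.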
20909 case Instruction::Store: {
20910 auto *SI = cast<StoreInst>(VL0);
20911
20912 setInsertPointAfterBundle(E);
20913
20914 Value *VecValue = vectorizeOperand(E, 0);
20915 if (VecValue->getType() != VecTy)
20916 VecValue =
20917 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
20918 VecValue = FinalShuffle(VecValue, E);
20919
20920 Value *Ptr = SI->getPointerOperand();
20921 Instruction *ST;
20922 if (E->State == TreeEntry::Vectorize) {
20923 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
20924 } else {
20925 assert(E->State == TreeEntry::StridedVectorize &&
20926 "Expected either strided or consecutive stores.");
20927 if (!E->ReorderIndices.empty()) {
20928 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
20929 Ptr = SI->getPointerOperand();
20930 }
20931 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
20932 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
20933 auto *Inst = Builder.CreateIntrinsic(
20934 Intrinsic::experimental_vp_strided_store,
20935 {VecTy, Ptr->getType(), StrideTy},
20936 {VecValue, Ptr,
20937 ConstantInt::get(
20938 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20939 Builder.getAllOnesMask(VecTy->getElementCount()),
20940 Builder.getInt32(E->Scalars.size())});
20941 Inst->addParamAttr(
20942 /*ArgNo=*/1,
20943 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20944 ST = Inst;
20945 }
20946
20947 Value *V = ::propagateMetadata(ST, E->Scalars);
20948
20949 E->VectorizedValue = V;
20950 ++NumVectorInstructions;
20951 return V;
20952 }
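// Illustrative sketch of the store case (concrete types hypothetical):
// consecutive stores become one wide aligned store, while the strided form is
// emitted through the VP strided-store intrinsic with the negated element size
// as the stride (reversed order), roughly:
//   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
//       <4 x i32> %val, ptr %base, i64 -4,
//       <4 x i1> splat (i1 true), i32 4)
// with the common alignment attached as a parameter attribute on the pointer,
// as done above.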
20953 case Instruction::GetElementPtr: {
20954 auto *GEP0 = cast<GetElementPtrInst>(VL0);
20955 setInsertPointAfterBundle(E);
20956
20957 Value *Op0 = vectorizeOperand(E, 0);
20958
20959 SmallVector<Value *> OpVecs;
20960 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20961 Value *OpVec = vectorizeOperand(E, J);
20962 OpVecs.push_back(OpVec);
20963 }
20964
20965 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20966 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
20967 SmallVector<Value *> GEPs;
20968 for (Value *V : E->Scalars) {
20969 if (isa<GetElementPtrInst>(V))
20970 GEPs.push_back(V);
20971 }
20972 V = ::propagateMetadata(I, GEPs);
20973 }
20974
20975 V = FinalShuffle(V, E);
20976
20977 E->VectorizedValue = V;
20978 ++NumVectorInstructions;
20979
20980 return V;
20981 }
20982 case Instruction::Call: {
20983 CallInst *CI = cast<CallInst>(VL0);
20984 setInsertPointAfterBundle(E);
20985
20986 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20987
20988 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20989 CI, ID, VecTy->getNumElements(),
20990 It != MinBWs.end() ? It->second.first : 0, TTI);
20991 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
20992 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20993 VecCallCosts.first <= VecCallCosts.second;
20994
20995 Value *ScalarArg = nullptr;
20996 SmallVector<Value *> OpVecs;
20997 SmallVector<Type *, 2> TysForDecl;
20998 // Add return type if intrinsic is overloaded on it.
20999 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
21000 TysForDecl.push_back(VecTy);
21001 auto *CEI = cast<CallInst>(VL0);
21002 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
21003 // Some intrinsics have scalar arguments. This argument should not be
21004 // vectorized.
21005 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
21006 ScalarArg = CEI->getArgOperand(I);
21007 // If we decided to reduce the bitwidth of the abs intrinsic, its second
21008 // argument must be set to false (do not return poison if the value is signed min).
21009 if (ID == Intrinsic::abs && It != MinBWs.end() &&
21010 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
21011 ScalarArg = Builder.getFalse();
21012 OpVecs.push_back(ScalarArg);
21013 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
21014 TysForDecl.push_back(ScalarArg->getType());
21015 continue;
21016 }
21017
21018 Value *OpVec = vectorizeOperand(E, I);
21019 ScalarArg = CEI->getArgOperand(I);
21020 if (cast<VectorType>(OpVec->getType())->getElementType() !=
21021 ScalarArg->getType()->getScalarType() &&
21022 It == MinBWs.end()) {
21023 auto *CastTy =
21024 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
21025 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
21026 } else if (It != MinBWs.end()) {
21027 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
21028 }
21029 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
21030 OpVecs.push_back(OpVec);
21031 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
21032 TysForDecl.push_back(OpVec->getType());
21033 }
21034
21035 Function *CF;
21036 if (!UseIntrinsic) {
21037 VFShape Shape =
21038 VFShape::get(CI->getFunctionType(),
21039 ElementCount::getFixed(VecTy->getNumElements()),
21040 false /*HasGlobalPred*/);
21041 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
21042 } else {
21043 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
21044 }
21045
21047 CI->getOperandBundlesAsDefs(OpBundles);
21048 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
21049
21050 propagateIRFlags(V, E->Scalars, VL0);
21051 V = FinalShuffle(V, E);
21052
21053 E->VectorizedValue = V;
21054 ++NumVectorInstructions;
21055 return V;
21056 }
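// Illustrative sketch of the call case: the cost model chooses between a
// vector intrinsic and a vector library function found via VFDatabase. For an
// intrinsic such as llvm.smax on a narrowed type the result is roughly:
//   %v = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %a, <4 x i16> %b)
// Scalar-only operands (isVectorIntrinsicWithScalarOpAtArg) are passed
// unwidened, and for a narrowed llvm.abs the is_int_min_poison argument is
// forced to false, as handled above. The intrinsic and types shown are
// hypothetical examples.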
21057 case Instruction::ShuffleVector: {
21058 Value *V;
21059 if (SLPReVec && !E->isAltShuffle()) {
21060 setInsertPointAfterBundle(E);
21061 Value *Src = vectorizeOperand(E, 0);
21062 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
21063 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
21064 SmallVector<int> NewMask(ThisMask.size());
21065 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
21066 return SVSrc->getShuffleMask()[Mask];
21067 });
21068 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
21069 SVSrc->getOperand(1), NewMask);
21070 } else {
21071 V = Builder.CreateShuffleVector(Src, ThisMask);
21072 }
21073 propagateIRFlags(V, E->Scalars, VL0);
21074 if (auto *I = dyn_cast<Instruction>(V))
21075 V = ::propagateMetadata(I, E->Scalars);
21076 V = FinalShuffle(V, E);
21077 } else {
21078 assert(E->isAltShuffle() &&
21079 ((Instruction::isBinaryOp(E->getOpcode()) &&
21080 Instruction::isBinaryOp(E->getAltOpcode())) ||
21081 (Instruction::isCast(E->getOpcode()) &&
21082 Instruction::isCast(E->getAltOpcode())) ||
21083 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
21084 "Invalid Shuffle Vector Operand");
21085
21086 Value *LHS = nullptr, *RHS = nullptr;
21087 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
21088 setInsertPointAfterBundle(E);
21089 LHS = vectorizeOperand(E, 0);
21090 RHS = vectorizeOperand(E, 1);
21091 } else {
21092 setInsertPointAfterBundle(E);
21093 LHS = vectorizeOperand(E, 0);
21094 }
21095 if (LHS && RHS &&
21096 ((Instruction::isBinaryOp(E->getOpcode()) &&
21097 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
21098 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
21099 assert((It != MinBWs.end() ||
21100 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
21101 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
21102 MinBWs.contains(getOperandEntry(E, 0)) ||
21103 MinBWs.contains(getOperandEntry(E, 1))) &&
21104 "Expected item in MinBWs.");
21105 Type *CastTy = VecTy;
21106 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
21107 if (cast<VectorType>(LHS->getType())
21108 ->getElementType()
21109 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
21110 ->getElementType()
21111 ->getIntegerBitWidth())
21112 CastTy = RHS->getType();
21113 else
21114 CastTy = LHS->getType();
21115 }
21116 if (LHS->getType() != CastTy)
21117 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
21118 if (RHS->getType() != CastTy)
21119 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
21120 }
21121
21122 Value *V0, *V1;
21123 if (Instruction::isBinaryOp(E->getOpcode())) {
21124 V0 = Builder.CreateBinOp(
21125 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
21126 V1 = Builder.CreateBinOp(
21127 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
21128 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
21129 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
21130 auto *AltCI = cast<CmpInst>(E->getAltOp());
21131 CmpInst::Predicate AltPred = AltCI->getPredicate();
21132 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
21133 } else {
21134 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
21135 unsigned SrcBWSz = DL->getTypeSizeInBits(
21136 cast<VectorType>(LHS->getType())->getElementType());
21137 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
21138 if (BWSz <= SrcBWSz) {
21139 if (BWSz < SrcBWSz)
21140 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
21141 assert(LHS->getType() == VecTy &&
21142 "Expected same type as operand.");
21143 if (auto *I = dyn_cast<Instruction>(LHS))
21144 LHS = ::propagateMetadata(I, E->Scalars);
21145 LHS = FinalShuffle(LHS, E);
21146 E->VectorizedValue = LHS;
21147 ++NumVectorInstructions;
21148 return LHS;
21149 }
21150 }
21151 V0 = Builder.CreateCast(
21152 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
21153 V1 = Builder.CreateCast(
21154 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
21155 }
21156 // Add V0 and V1 to later analysis to try to find and remove matching
21157 // instruction, if any.
21158 for (Value *V : {V0, V1}) {
21159 if (auto *I = dyn_cast<Instruction>(V)) {
21160 GatherShuffleExtractSeq.insert(I);
21161 CSEBlocks.insert(I->getParent());
21162 }
21163 }
21164
21165 // Create shuffle to take alternate operations from the vector.
21166 // Also, gather up main and alt scalar ops to propagate IR flags to
21167 // each vector operation.
21168 ValueList OpScalars, AltScalars;
21169 SmallVector<int> Mask;
21170 E->buildAltOpShuffleMask(
21171 [E, this](Instruction *I) {
21172 assert(E->getMatchingMainOpOrAltOp(I) &&
21173 "Unexpected main/alternate opcode");
21174 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
21175 *TLI);
21176 },
21177 Mask, &OpScalars, &AltScalars);
21178
21179 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
21180 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
21181 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
21182 // Drop nuw flags for abs(sub(commutative), true).
21183 if (auto *I = dyn_cast<Instruction>(Vec);
21184 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
21185 any_of(E->Scalars, [E](Value *V) {
21186 if (isa<PoisonValue>(V))
21187 return false;
21188 if (E->hasCopyableElements() && E->isCopyableElement(V))
21189 return false;
21190 auto *IV = cast<Instruction>(V);
21191 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
21192 }))
21193 I->setHasNoUnsignedWrap(/*b=*/false);
21194 };
21195 DropNuwFlag(V0, E->getOpcode());
21196 DropNuwFlag(V1, E->getAltOpcode());
21197
21198 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
21199 assert(SLPReVec && "FixedVectorType is not expected.");
21200 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
21201 }
21202 V = Builder.CreateShuffleVector(V0, V1, Mask);
21203 if (auto *I = dyn_cast<Instruction>(V)) {
21204 V = ::propagateMetadata(I, E->Scalars);
21205 GatherShuffleExtractSeq.insert(I);
21206 CSEBlocks.insert(I->getParent());
21207 }
21208 }
21209
21210 E->VectorizedValue = V;
21211 ++NumVectorInstructions;
21212
21213 return V;
21214 }
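// Illustrative sketch of the alternate-opcode shuffle above: the main and the
// alternate operation are both emitted on the whole vectors and then blended
// with the mask produced by buildAltOpShuffleMask, roughly:
//   %add = add <4 x i32> %a, %b
//   %sub = sub <4 x i32> %a, %b
//   %v   = shufflevector <4 x i32> %add, <4 x i32> %sub,
//                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
// The concrete mask depends on which lanes use the main vs. alternate opcode;
// the one shown is hypothetical.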
21215 case TreeEntry::ReducedBitcast:
21216 case TreeEntry::ReducedBitcastBSwap: {
21217 assert(UserIgnoreList && "Expected reduction operations only.");
21218 setInsertPointAfterBundle(E);
21219 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
21220 ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
21221 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
21222 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
21223 Const->VectorizedValue = PoisonValue::get(getWidenedType(
21224 Const->Scalars.front()->getType(), Const->getVectorFactor()));
21225 Value *Op = vectorizeOperand(ZExt, 0);
21226 // Set the scalar type properly to avoid casting to the extending type.
21227 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
21228 Op = FinalShuffle(Op, E);
21229 auto *V = Builder.CreateBitCast(
21230 Op, IntegerType::get(
21231 Op->getContext(),
21232 DL->getTypeSizeInBits(ZExt->getMainOp()->getType())));
21233 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap)
21234 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
21235 E->VectorizedValue = V;
21236 ++NumVectorInstructions;
21237 return V;
21238 }
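// Illustrative sketch of the ReducedBitcast combined node (widths
// hypothetical): the vector of narrow sources is reinterpreted as a single
// wide integer, optionally byte-swapped for the BSwap variant, roughly:
//   %i = bitcast <4 x i8> %src to i32
//   %s = call i32 @llvm.bswap.i32(i32 %i)   ; ReducedBitcastBSwap only
// standing in for the reduction over the zero-extended lanes that this
// combined opcode models.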
21239 default:
21240 llvm_unreachable("unknown inst");
21241 }
21242 return nullptr;
21243}
21244
21245 Value *BoUpSLP::vectorizeTree() {
21246 ExtraValueToDebugLocsMap ExternallyUsedValues;
21247 return vectorizeTree(ExternallyUsedValues);
21248}
21249
21250 Value *BoUpSLP::vectorizeTree(
21251 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
21252 Instruction *ReductionRoot,
21253 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
21254 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
21255 // need to rebuild it.
21256 EntryToLastInstruction.clear();
21257 // All blocks must be scheduled before any instructions are inserted.
21258 for (auto &BSIter : BlocksSchedules)
21259 scheduleBlock(*this, BSIter.second.get());
21260 // Cache last instructions for the nodes to avoid side effects, which may
21261 // appear during vectorization, like extra uses, etc.
21262 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21263 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
21264 (TE->State == TreeEntry::CombinedVectorize &&
21265 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
21266 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap)))
21267 continue;
21268 (void)getLastInstructionInBundle(TE.get());
21269 }
21270
21271 if (ReductionRoot)
21272 Builder.SetInsertPoint(ReductionRoot->getParent(),
21273 ReductionRoot->getIterator());
21274 else
21275 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21276
21277 // Vectorize gather operands of the nodes with the external uses only.
21278 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
21279 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21280 if (DeletedNodes.contains(TE.get()))
21281 continue;
21282 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
21283 TE->UserTreeIndex.UserTE->hasState() &&
21284 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
21285 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
21286 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
21287 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
21288 all_of(TE->UserTreeIndex.UserTE->Scalars,
21289 [](Value *V) { return isUsedOutsideBlock(V); })) {
21290 Instruction &LastInst =
21291 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
21292 GatherEntries.emplace_back(TE.get(), &LastInst);
21293 }
21294 }
21295 for (auto &Entry : GatherEntries) {
21296 IRBuilderBase::InsertPointGuard Guard(Builder);
21297 Builder.SetInsertPoint(Entry.second);
21298 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
21299 (void)vectorizeTree(Entry.first);
21300 }
21301 // Emit gathered loads first to emit better code for the users of those
21302 // gathered loads.
21303 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21304 if (DeletedNodes.contains(TE.get()))
21305 continue;
21306 if (GatheredLoadsEntriesFirst.has_value() &&
21307 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
21308 (!TE->isGather() || TE->UserTreeIndex)) {
21309 assert((TE->UserTreeIndex ||
21310 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
21311 "Expected gathered load node.");
21312 (void)vectorizeTree(TE.get());
21313 }
21314 }
21315 (void)vectorizeTree(VectorizableTree[0].get());
21316 // Run through the list of postponed gathers and emit them, replacing the temp
21317 // emitted allocas with actual vector instructions.
21318 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
21319 SmallDenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
21320 for (const TreeEntry *E : PostponedNodes) {
21321 auto *TE = const_cast<TreeEntry *>(E);
21322 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
21323 TE->VectorizedValue = nullptr;
21324 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
21325 // If the user is a PHI node, its vector code has to be inserted right before
21326 // the block terminator. Since the node was delayed, there were some unresolved
21327 // dependencies at the moment when the stub instruction was emitted. In case
21328 // any of these dependencies turns out to be an operand of another PHI coming
21329 // from this same block, the position of the stub instruction becomes invalid.
21330 // This is because the source vector that was supposed to feed this gather node
21331 // was inserted at the end of the block [after the stub instruction]. So we need
21332 // to adjust the insertion point again to the end of the block.
21333 if (isa<PHINode>(UserI) ||
21334 (TE->UserTreeIndex.UserTE->hasState() &&
21335 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
21336 // Insert before all users.
21337 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
21338 for (User *U : PrevVec->users()) {
21339 if (U == UserI)
21340 continue;
21341 auto *UI = dyn_cast<Instruction>(U);
21342 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
21343 continue;
21344 if (UI->comesBefore(InsertPt))
21345 InsertPt = UI;
21346 }
21347 Builder.SetInsertPoint(InsertPt);
21348 } else {
21349 Builder.SetInsertPoint(PrevVec);
21350 }
21351 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
21352 Value *Vec = vectorizeTree(TE);
21353 if (auto *VecI = dyn_cast<Instruction>(Vec);
21354 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
21355 Builder.GetInsertPoint()->comesBefore(VecI))
21356 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
21357 Builder.GetInsertPoint());
21358 if (Vec->getType() != PrevVec->getType()) {
21359 assert(Vec->getType()->isIntOrIntVectorTy() &&
21360 PrevVec->getType()->isIntOrIntVectorTy() &&
21361 "Expected integer vector types only.");
21362 std::optional<bool> IsSigned;
21363 for (Value *V : TE->Scalars) {
21364 if (isVectorized(V)) {
21365 for (const TreeEntry *MNTE : getTreeEntries(V)) {
21366 auto It = MinBWs.find(MNTE);
21367 if (It != MinBWs.end()) {
21368 IsSigned = IsSigned.value_or(false) || It->second.second;
21369 if (*IsSigned)
21370 break;
21371 }
21372 }
21373 if (IsSigned.value_or(false))
21374 break;
21375 // Scan through gather nodes.
21376 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
21377 auto It = MinBWs.find(BVE);
21378 if (It != MinBWs.end()) {
21379 IsSigned = IsSigned.value_or(false) || It->second.second;
21380 if (*IsSigned)
21381 break;
21382 }
21383 }
21384 if (IsSigned.value_or(false))
21385 break;
21386 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
21387 IsSigned =
21388 IsSigned.value_or(false) ||
21389 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
21390 continue;
21391 }
21392 if (IsSigned.value_or(false))
21393 break;
21394 }
21395 }
21396 if (IsSigned.value_or(false)) {
21397 // Final attempt - check user node.
21398 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
21399 if (It != MinBWs.end())
21400 IsSigned = It->second.second;
21401 }
21402 assert(IsSigned &&
21403 "Expected user node or perfect diamond match in MinBWs.");
21404 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
21405 }
21406 PrevVec->replaceAllUsesWith(Vec);
21407 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
21408 // Replace the stub vector node, if it was used before for one of the
21409 // buildvector nodes already.
21410 auto It = PostponedValues.find(PrevVec);
21411 if (It != PostponedValues.end()) {
21412 for (TreeEntry *VTE : It->getSecond())
21413 VTE->VectorizedValue = Vec;
21414 }
21415 eraseInstruction(PrevVec);
21416 }
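// A short note on the loop above, restating the code: each postponed gather
// was first emitted as a stub instruction; once the real vector exists it
// replaces the stub, and if the two differ only by a MinBWs-narrowed element
// type, the signedness is inferred from the scalar entries (or the user node)
// and a single int cast to the stub's type is emitted, e.g. roughly:
//   %real = sext <4 x i16> %narrow to <4 x i32>
// (types hypothetical) before the stub is RAUW'ed and erased.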
21417
21418 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
21419 << " values .\n");
21420
21421 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
21422 // Maps vector instruction to original insertelement instruction
21423 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
21424 // Maps extract Scalar to the corresponding extractelement instruction in the
21425 // basic block. Only one extractelement per block should be emitted.
21426 SmallDenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
21427 ScalarToEEs;
21428 SmallDenseSet<Value *, 4> UsedInserts;
21429 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
21430 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
21431 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
21432 // Extract all of the elements with the external uses.
21433 for (const auto &ExternalUse : ExternalUses) {
21434 Value *Scalar = ExternalUse.Scalar;
21435 llvm::User *User = ExternalUse.User;
21436
21437 // Skip users that we already RAUW. This happens when one instruction
21438 // has multiple uses of the same value.
21439 if (User && !is_contained(Scalar->users(), User))
21440 continue;
21441 const TreeEntry *E = &ExternalUse.E;
21442 assert(E && "Invalid scalar");
21443 assert(!E->isGather() && "Extracting from a gather list");
21444 // Non-instruction pointers are not deleted, just skip them.
21445 if (E->getOpcode() == Instruction::GetElementPtr &&
21446 !isa<GetElementPtrInst>(Scalar))
21447 continue;
21448
21449 Value *Vec = E->VectorizedValue;
21450 assert(Vec && "Can't find vectorizable value");
21451
21452 Value *Lane = Builder.getInt32(ExternalUse.Lane);
21453 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21454 if (Scalar->getType() != Vec->getType()) {
21455 Value *Ex = nullptr;
21456 Value *ExV = nullptr;
21457 auto *Inst = dyn_cast<Instruction>(Scalar);
21458 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
21459 auto It = ScalarToEEs.find(Scalar);
21460 if (It != ScalarToEEs.end()) {
21461 // No need to emit many extracts, just move the only one in the
21462 // current block.
21463 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
21464 : Builder.GetInsertBlock());
21465 if (EEIt != It->second.end()) {
21466 Value *PrevV = EEIt->second.first;
21467 if (auto *I = dyn_cast<Instruction>(PrevV);
21468 I && !ReplaceInst &&
21469 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21470 Builder.GetInsertPoint()->comesBefore(I)) {
21471 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
21472 Builder.GetInsertPoint());
21473 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
21474 CI->moveAfter(I);
21475 }
21476 Ex = PrevV;
21477 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21478 }
21479 }
21480 if (!Ex) {
21481 // "Reuse" the existing extract to improve final codegen.
21482 if (ReplaceInst) {
21483 // Leave the instruction as is, if it is a cheaper extract and all
21484 // operands are scalar.
21485 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
21486 IgnoredExtracts.insert(EE);
21487 Ex = EE;
21488 } else {
21489 auto *CloneInst = Inst->clone();
21490 CloneInst->insertBefore(Inst->getIterator());
21491 if (Inst->hasName())
21492 CloneInst->takeName(Inst);
21493 Ex = CloneInst;
21494 }
21495 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
21496 ES && isa<Instruction>(Vec)) {
21497 Value *V = ES->getVectorOperand();
21498 auto *IVec = cast<Instruction>(Vec);
21499 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
21500 V = ETEs.front()->VectorizedValue;
21501 if (auto *IV = dyn_cast<Instruction>(V);
21502 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21503 IV->comesBefore(IVec))
21504 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
21505 else
21506 Ex = Builder.CreateExtractElement(Vec, Lane);
21507 } else if (auto *VecTy =
21508 dyn_cast<FixedVectorType>(Scalar->getType())) {
21509 assert(SLPReVec && "FixedVectorType is not expected.");
21510 unsigned VecTyNumElements = VecTy->getNumElements();
21511 // When REVEC is enabled, we need to extract a vector.
21512 // Note: The element size of Scalar may be different from the
21513 // element size of Vec.
21514 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
21515 ExternalUse.Lane * VecTyNumElements);
21516 } else {
21517 Ex = Builder.CreateExtractElement(Vec, Lane);
21518 }
21519 // If necessary, sign-extend or zero-extend ScalarRoot
21520 // to the larger type.
21521 ExV = Ex;
21522 if (Scalar->getType() != Ex->getType())
21523 ExV = Builder.CreateIntCast(
21524 Ex, Scalar->getType(),
21525 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
21526 auto *I = dyn_cast<Instruction>(Ex);
21527 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
21528 : &F->getEntryBlock(),
21529 std::make_pair(Ex, ExV));
21530 }
21531 // The then branch of the previous if may produce constants, since 0
21532 // operand might be a constant.
21533 if (auto *ExI = dyn_cast<Instruction>(Ex);
21534 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
21535 GatherShuffleExtractSeq.insert(ExI);
21536 CSEBlocks.insert(ExI->getParent());
21537 }
21538 return ExV;
21539 }
21540 assert(isa<FixedVectorType>(Scalar->getType()) &&
21541 isa<InsertElementInst>(Scalar) &&
21542 "In-tree scalar of vector type is not insertelement?");
21543 auto *IE = cast<InsertElementInst>(Scalar);
21544 VectorToInsertElement.try_emplace(Vec, IE);
21545 return Vec;
21546 };
21547 // If User == nullptr, the Scalar remains as scalar in vectorized
21548 // instructions or is used as extra arg. Generate ExtractElement instruction
21549 // and update the record for this scalar in ExternallyUsedValues.
21550 if (!User) {
21551 if (!ScalarsWithNullptrUser.insert(Scalar).second)
21552 continue;
21553 assert(
21554 (ExternallyUsedValues.count(Scalar) ||
21555 ExternalUsesWithNonUsers.count(Scalar) ||
21556 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21557 any_of(
21558 Scalar->users(),
21559 [&, TTI = TTI](llvm::User *U) {
21560 if (ExternalUsesAsOriginalScalar.contains(U))
21561 return true;
21562 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21563 return !UseEntries.empty() &&
21564 (E->State == TreeEntry::Vectorize ||
21565 E->State == TreeEntry::StridedVectorize ||
21566 E->State == TreeEntry::CompressVectorize) &&
21567 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21568 return (UseEntry->State == TreeEntry::Vectorize ||
21569 UseEntry->State ==
21570 TreeEntry::StridedVectorize ||
21571 UseEntry->State ==
21572 TreeEntry::CompressVectorize) &&
21573 doesInTreeUserNeedToExtract(
21574 Scalar, getRootEntryInstruction(*UseEntry),
21575 TLI, TTI);
21576 });
21577 })) &&
21578 "Scalar with nullptr User must be registered in "
21579 "ExternallyUsedValues map or remain as scalar in vectorized "
21580 "instructions");
21581 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21582 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
21583 if (PHI->getParent()->isLandingPad())
21584 Builder.SetInsertPoint(
21585 PHI->getParent(),
21586 std::next(
21587 PHI->getParent()->getLandingPadInst()->getIterator()));
21588 else
21589 Builder.SetInsertPoint(PHI->getParent(),
21590 PHI->getParent()->getFirstNonPHIIt());
21591 } else {
21592 Builder.SetInsertPoint(VecI->getParent(),
21593 std::next(VecI->getIterator()));
21594 }
21595 } else {
21596 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21597 }
21598 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21599 // Required to update internally referenced instructions.
21600 if (Scalar != NewInst) {
21601 assert((!isa<ExtractElementInst>(Scalar) ||
21602 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
21603 "Extractelements should not be replaced.");
21604 Scalar->replaceAllUsesWith(NewInst);
21605 }
21606 continue;
21607 }
21608
21609 if (auto *VU = dyn_cast<InsertElementInst>(User);
21610 VU && VU->getOperand(1) == Scalar) {
21611 // Skip if the scalar is another vector op or Vec is not an instruction.
21612 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
21613 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
21614 if (!UsedInserts.insert(VU).second)
21615 continue;
21616 // Need to use original vector, if the root is truncated.
21617 auto BWIt = MinBWs.find(E);
21618 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
21619 auto *ScalarTy = FTy->getElementType();
21620 auto Key = std::make_pair(Vec, ScalarTy);
21621 auto VecIt = VectorCasts.find(Key);
21622 if (VecIt == VectorCasts.end()) {
21623 IRBuilderBase::InsertPointGuard Guard(Builder);
21624 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
21625 if (IVec->getParent()->isLandingPad())
21626 Builder.SetInsertPoint(IVec->getParent(),
21627 std::next(IVec->getParent()
21628 ->getLandingPadInst()
21629 ->getIterator()));
21630 else
21631 Builder.SetInsertPoint(
21632 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
21633 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
21634 Builder.SetInsertPoint(IVec->getNextNode());
21635 }
21636 Vec = Builder.CreateIntCast(
21637 Vec,
21638 getWidenedType(
21639 ScalarTy,
21640 cast<FixedVectorType>(Vec->getType())->getNumElements()),
21641 BWIt->second.second);
21642 VectorCasts.try_emplace(Key, Vec);
21643 } else {
21644 Vec = VecIt->second;
21645 }
21646 }
21647
21648 std::optional<unsigned> InsertIdx = getElementIndex(VU);
21649 if (InsertIdx) {
21650 auto *It = find_if(
21651 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
21652 // Checks if 2 insertelements are from the same buildvector.
21653 InsertElementInst *VecInsert = Data.InsertElements.front();
21654 return areTwoInsertFromSameBuildVector(
21655 VU, VecInsert,
21656 [](InsertElementInst *II) { return II->getOperand(0); });
21657 });
21658 unsigned Idx = *InsertIdx;
21659 if (It == ShuffledInserts.end()) {
21660 (void)ShuffledInserts.emplace_back();
21661 It = std::next(ShuffledInserts.begin(),
21662 ShuffledInserts.size() - 1);
21663 }
21664 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
21665 if (Mask.empty())
21666 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
21667 Mask[Idx] = ExternalUse.Lane;
21668 It->InsertElements.push_back(cast<InsertElementInst>(User));
21669 continue;
21670 }
21671 }
21672 }
21673 }
21674
21675 // Generate extracts for out-of-tree users.
21676 // Find the insertion point for the extractelement lane.
21677 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
21678 if (PHINode *PH = dyn_cast<PHINode>(User)) {
21679 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
21680 if (PH->getIncomingValue(I) == Scalar) {
21681 Instruction *IncomingTerminator =
21682 PH->getIncomingBlock(I)->getTerminator();
21683 if (isa<CatchSwitchInst>(IncomingTerminator)) {
21684 Builder.SetInsertPoint(VecI->getParent(),
21685 std::next(VecI->getIterator()));
21686 } else {
21687 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
21688 }
21689 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21690 PH->setOperand(I, NewInst);
21691 }
21692 }
21693 } else {
21694 Builder.SetInsertPoint(cast<Instruction>(User));
21695 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21696 User->replaceUsesOfWith(Scalar, NewInst);
21697 }
21698 } else {
21699 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21700 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21701 User->replaceUsesOfWith(Scalar, NewInst);
21702 }
21703
21704 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
21705 }
21706
21707 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
21708 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
21709 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
21710 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
21711 for (int I = 0, E = Mask.size(); I < E; ++I) {
21712 if (Mask[I] < VF)
21713 CombinedMask1[I] = Mask[I];
21714 else
21715 CombinedMask2[I] = Mask[I] - VF;
21716 }
21717 ShuffleInstructionBuilder ShuffleBuilder(
21718 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
21719 ShuffleBuilder.add(V1, CombinedMask1);
21720 if (V2)
21721 ShuffleBuilder.add(V2, CombinedMask2);
21722 return ShuffleBuilder.finalize({}, {}, {});
21723 };
21724
21725 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
21726 bool ForSingleMask) {
21727 unsigned VF = Mask.size();
21728 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
21729 if (VF != VecVF) {
21730 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
21731 Vec = CreateShuffle(Vec, nullptr, Mask);
21732 return std::make_pair(Vec, true);
21733 }
21734 if (!ForSingleMask) {
21735 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
21736 for (unsigned I = 0; I < VF; ++I) {
21737 if (Mask[I] != PoisonMaskElem)
21738 ResizeMask[Mask[I]] = Mask[I];
21739 }
21740 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
21741 }
21742 }
21743
21744 return std::make_pair(Vec, false);
21745 };
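// Illustrative note on ResizeToVF above (lane counts hypothetical): when the
// vectorized value has a different number of lanes than the insertelement
// chain expects, it is resized with a single shuffle, roughly:
//   %resized = shufflevector <8 x i32> %vec, <8 x i32> poison,
//                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// The returned flag appears to tell the caller whether the external mask was
// already folded into that resize.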
21746 // Perform shuffling of the vectorize tree entries for better handling of
21747 // external extracts.
21748 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
21749 // Find the first and the last instruction in the list of insertelements.
21750 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
21751 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
21752 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
21753 Builder.SetInsertPoint(LastInsert);
21754 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
21755 Value *NewInst = performExtractsShuffleAction<Value>(
21756 MutableArrayRef(Vector.data(), Vector.size()),
21757 FirstInsert->getOperand(0),
21758 [](Value *Vec) {
21759 return cast<VectorType>(Vec->getType())
21760 ->getElementCount()
21761 .getKnownMinValue();
21762 },
21763 ResizeToVF,
21764 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
21765 ArrayRef<Value *> Vals) {
21766 assert((Vals.size() == 1 || Vals.size() == 2) &&
21767 "Expected exactly 1 or 2 input values.");
21768 if (Vals.size() == 1) {
21769 // Do not create shuffle if the mask is a simple identity
21770 // non-resizing mask.
21771 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
21772 ->getNumElements() ||
21773 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
21774 return CreateShuffle(Vals.front(), nullptr, Mask);
21775 return Vals.front();
21776 }
21777 return CreateShuffle(Vals.front() ? Vals.front()
21778 : FirstInsert->getOperand(0),
21779 Vals.back(), Mask);
21780 });
21781 auto It = ShuffledInserts[I].InsertElements.rbegin();
21782 // Rebuild buildvector chain.
21783 InsertElementInst *II = nullptr;
21784 if (It != ShuffledInserts[I].InsertElements.rend())
21785 II = *It;
21786 SmallVector<Instruction *> Inserts;
21787 while (It != ShuffledInserts[I].InsertElements.rend()) {
21788 assert(II && "Must be an insertelement instruction.");
21789 if (*It == II)
21790 ++It;
21791 else
21792 Inserts.push_back(cast<Instruction>(II));
21793 II = dyn_cast<InsertElementInst>(II->getOperand(0));
21794 }
21795 for (Instruction *II : reverse(Inserts)) {
21796 II->replaceUsesOfWith(II->getOperand(0), NewInst);
21797 if (auto *NewI = dyn_cast<Instruction>(NewInst))
21798 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
21799 II->moveAfter(NewI);
21800 NewInst = II;
21801 }
21802 LastInsert->replaceAllUsesWith(NewInst);
21803 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
21804 IE->replaceUsesOfWith(IE->getOperand(0),
21805 PoisonValue::get(IE->getOperand(0)->getType()));
21806 IE->replaceUsesOfWith(IE->getOperand(1),
21807 PoisonValue::get(IE->getOperand(1)->getType()));
21808 eraseInstruction(IE);
21809 }
21810 CSEBlocks.insert(LastInsert->getParent());
21811 }
21812
21813 SmallVector<Instruction *> RemovedInsts;
21814 // For each vectorized value:
21815 for (auto &TEPtr : VectorizableTree) {
21816 TreeEntry *Entry = TEPtr.get();
21817
21818 // No need to handle users of gathered values.
21819 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
21820 DeletedNodes.contains(Entry) ||
21821 TransformedToGatherNodes.contains(Entry))
21822 continue;
21823
21824 if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
21825 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
21826 // Skip constant node
21827 if (!Entry->hasState()) {
21828 assert(allConstant(Entry->Scalars) && "Expected constants only.");
21829 continue;
21830 }
21831 for (Value *Scalar : Entry->Scalars) {
21832 auto *I = dyn_cast<Instruction>(Scalar);
21833
21834 if (!I || Entry->isCopyableElement(I))
21835 continue;
21836 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *I << ".\n");
21837 RemovedInsts.push_back(I);
21838 }
21839 continue;
21840 }
21841
21842 assert(Entry->VectorizedValue && "Can't find vectorizable value");
21843
21844 // For each lane:
21845 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
21846 Value *Scalar = Entry->Scalars[Lane];
21847
21848 if (Entry->getOpcode() == Instruction::GetElementPtr &&
21849 !isa<GetElementPtrInst>(Scalar))
21850 continue;
21851 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
21852 EE && IgnoredExtracts.contains(EE))
21853 continue;
21854 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
21855 continue;
21856#ifndef NDEBUG
21857 Type *Ty = Scalar->getType();
21858 if (!Ty->isVoidTy()) {
21859 for (User *U : Scalar->users()) {
21860 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
21861
21862 // It is legal to delete users in the ignorelist.
21863 assert((isVectorized(U) ||
21864 (UserIgnoreList && UserIgnoreList->contains(U)) ||
21865 (isa_and_nonnull<Instruction>(U) &&
21866 isDeleted(cast<Instruction>(U)))) &&
21867 "Deleting out-of-tree value");
21868 }
21869 }
21870#endif
21871 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
21872 auto *I = cast<Instruction>(Scalar);
21873 RemovedInsts.push_back(I);
21874 }
21875 }
21876
21877 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
21878 // new vector instruction.
21879 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
21880 V->mergeDIAssignID(RemovedInsts);
21881
21882 // Clear up reduction references, if any.
21883 if (UserIgnoreList) {
21884 for (Instruction *I : RemovedInsts) {
21885 const TreeEntry *IE = getTreeEntries(I).front();
21886 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(I);
21887 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
21888 IE = SplitEntries.front();
21889 if (IE->Idx != 0 &&
21890 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
21891 (ValueToGatherNodes.lookup(I).contains(
21892 VectorizableTree.front().get()) ||
21893 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
21894 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
21895 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
21896 IE->UserTreeIndex &&
21897 is_contained(VectorizableTree.front()->Scalars, I)) &&
21898 !(GatheredLoadsEntriesFirst.has_value() &&
21899 IE->Idx >= *GatheredLoadsEntriesFirst &&
21900 VectorizableTree.front()->isGather() &&
21901 is_contained(VectorizableTree.front()->Scalars, I)) &&
21902 !(!VectorizableTree.front()->isGather() &&
21903 VectorizableTree.front()->isCopyableElement(I)))
21904 continue;
21905 SmallVector<SelectInst *> LogicalOpSelects;
21906 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
21907 // Do not replace condition of the logical op in form select <cond>.
21908 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
21909 (match(U.getUser(), m_LogicalAnd()) ||
21910 match(U.getUser(), m_LogicalOr())) &&
21911 U.getOperandNo() == 0;
21912 if (IsPoisoningLogicalOp) {
21913 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
21914 return false;
21915 }
21916 return UserIgnoreList->contains(U.getUser());
21917 });
21918 // Replace conditions of the poisoning logical ops with the non-poison
21919 // constant value.
21920 for (SelectInst *SI : LogicalOpSelects)
21921 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
21922 }
21923 }
21924 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
21925 // cache correctness.
21926 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
21927 // - instructions are not deleted until later.
21928 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
21929
21930 Builder.ClearInsertionPoint();
21931 InstrElementSize.clear();
21932
21933 const TreeEntry &RootTE = *VectorizableTree.front();
21934 Value *Vec = RootTE.VectorizedValue;
21935 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
21936 It != MinBWs.end() &&
21937 ReductionBitWidth != It->second.first) {
21938 IRBuilder<>::InsertPointGuard Guard(Builder);
21939 Builder.SetInsertPoint(ReductionRoot->getParent(),
21940 ReductionRoot->getIterator());
21941 Vec = Builder.CreateIntCast(
21942 Vec,
21943 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
21944 cast<VectorType>(Vec->getType())->getElementCount()),
21945 It->second.second);
21946 }
21947 return Vec;
21948}
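// A rough outline of the vectorizeTree() driver above, restating the code:
// (1) schedule all blocks, (2) emit gathered loads and the root entry,
// (3) patch postponed gather stubs, (4) create extractelements (plus int casts
// where MinBWs applies) for every external use, (5) rewrite insertelement
// chains through shuffles, (6) erase the now-dead scalars, and (7) if the
// reduction requires a different bit width than recorded for the root, cast
// the final vector, e.g. roughly:
//   %red = trunc <8 x i32> %root to <8 x i8>
// (types hypothetical).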
21949
21951 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
21952 << " gather sequences instructions.\n");
21953 // LICM InsertElementInst sequences.
21954 for (Instruction *I : GatherShuffleExtractSeq) {
21955 if (isDeleted(I))
21956 continue;
21957
21958 // Check if this block is inside a loop.
21959 Loop *L = LI->getLoopFor(I->getParent());
21960 if (!L)
21961 continue;
21962
21963 // Check if it has a preheader.
21964 BasicBlock *PreHeader = L->getLoopPreheader();
21965 if (!PreHeader)
21966 continue;
21967
21968 // If the vector or the element that we insert into it are
21969 // instructions that are defined in this basic block then we can't
21970 // hoist this instruction.
21971 if (any_of(I->operands(), [L](Value *V) {
21972 auto *OpI = dyn_cast<Instruction>(V);
21973 return OpI && L->contains(OpI);
21974 }))
21975 continue;
21976
21977 // We can hoist this instruction. Move it to the pre-header.
21978 I->moveBefore(PreHeader->getTerminator()->getIterator());
21979 CSEBlocks.insert(PreHeader);
21980 }
21981
21982 // Make a list of all reachable blocks in our CSE queue.
21983 SmallVector<const DomTreeNode *, 8> CSEWorkList;
21984 CSEWorkList.reserve(CSEBlocks.size());
21985 for (BasicBlock *BB : CSEBlocks)
21986 if (DomTreeNode *N = DT->getNode(BB)) {
21987 assert(DT->isReachableFromEntry(N));
21988 CSEWorkList.push_back(N);
21989 }
21990
21991 // Sort blocks by domination. This ensures we visit a block after all blocks
21992 // dominating it are visited.
21993 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
21994 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21995 "Different nodes should have different DFS numbers");
21996 return A->getDFSNumIn() < B->getDFSNumIn();
21997 });
21998
21999 // Less defined shuffles can be replaced by the more defined copies.
22000 // Between two shuffles one is less defined if it has the same vector operands
22001 // and its mask indices are the same as in the first one or undefs. E.g.
22002 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
22003 // poison, <0, 0, 0, 0>.
22004 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
22005 Instruction *I2,
22006 SmallVectorImpl<int> &NewMask) {
22007 if (I1->getType() != I2->getType())
22008 return false;
22009 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
22010 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
22011 if (!SI1 || !SI2)
22012 return I1->isIdenticalTo(I2);
22013 if (SI1->isIdenticalTo(SI2))
22014 return true;
22015 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
22016 if (SI1->getOperand(I) != SI2->getOperand(I))
22017 return false;
22018 // Check if the second instruction is more defined than the first one.
22019 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
22020 ArrayRef<int> SM1 = SI1->getShuffleMask();
22021 // Count trailing undefs in the mask to check the final number of used
22022 // registers.
22023 unsigned LastUndefsCnt = 0;
22024 for (int I = 0, E = NewMask.size(); I < E; ++I) {
22025 if (SM1[I] == PoisonMaskElem)
22026 ++LastUndefsCnt;
22027 else
22028 LastUndefsCnt = 0;
22029 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
22030 NewMask[I] != SM1[I])
22031 return false;
22032 if (NewMask[I] == PoisonMaskElem)
22033 NewMask[I] = SM1[I];
22034 }
22035 // Check if the last undefs actually change the final number of used vector
22036 // registers.
22037 return SM1.size() - LastUndefsCnt > 1 &&
22038 ::getNumberOfParts(*TTI, SI1->getType()) ==
22039 ::getNumberOfParts(
22040 *TTI, getWidenedType(SI1->getType()->getElementType(),
22041 SM1.size() - LastUndefsCnt));
22042 };
22043 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
22044 // instructions. TODO: We can further optimize this scan if we split the
22045 // instructions into different buckets based on the insert lane.
22046 SmallVector<Instruction *, 16> Visited;
22047 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
22048 assert(*I &&
22049 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
22050 "Worklist not sorted properly!");
22051 BasicBlock *BB = (*I)->getBlock();
22052 // For all instructions in blocks containing gather sequences:
22053 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
22054 if (isDeleted(&In))
22055 continue;
22056 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
22057 !GatherShuffleExtractSeq.contains(&In))
22058 continue;
22059
22060 // Check if we can replace this instruction with any of the
22061 // visited instructions.
22062 bool Replaced = false;
22063 for (Instruction *&V : Visited) {
22064 SmallVector<int> NewMask;
22065 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
22066 DT->dominates(V->getParent(), In.getParent())) {
22067 In.replaceAllUsesWith(V);
22068 eraseInstruction(&In);
22069 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
22070 if (!NewMask.empty())
22071 SI->setShuffleMask(NewMask);
22072 Replaced = true;
22073 break;
22074 }
22076 GatherShuffleExtractSeq.contains(V) &&
22077 IsIdenticalOrLessDefined(V, &In, NewMask) &&
22078 DT->dominates(In.getParent(), V->getParent())) {
22079 In.moveAfter(V);
22080 V->replaceAllUsesWith(&In);
22081 eraseInstruction(V);
22082 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
22083 if (!NewMask.empty())
22084 SI->setShuffleMask(NewMask);
22085 V = &In;
22086 Replaced = true;
22087 break;
22088 }
22089 }
22090 if (!Replaced) {
22091 assert(!is_contained(Visited, &In));
22092 Visited.push_back(&In);
22093 }
22094 }
22095 }
22096 CSEBlocks.clear();
22097 GatherShuffleExtractSeq.clear();
22098}
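// A short note on the CSE loop above, restating the code: when two
// gather/shuffle instructions are identical, or one is "less defined" (same
// operands, mask differing only in poison lanes), only one copy is kept. If
// the previously visited instruction dominates the current one, the current
// one is RAUW'ed and erased and the survivor's shuffle mask is merged;
// otherwise, when the current block dominates, the current instruction is
// moved next to the visited one, takes over its uses, and its mask is merged
// via setShuffleMask.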
22099
22100BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
22101 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
22102 auto &BundlePtr =
22103 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
22104 for (Value *V : VL) {
22105 if (S.isNonSchedulable(V))
22106 continue;
22107 auto *I = cast<Instruction>(V);
22108 if (S.isCopyableElement(V)) {
22109 // Add a copyable element model.
22110 ScheduleCopyableData &SD =
22111 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
22112 // Group the instructions to a bundle.
22113 BundlePtr->add(&SD);
22114 continue;
22115 }
22116 ScheduleData *BundleMember = getScheduleData(V);
22117 assert(BundleMember && "no ScheduleData for bundle member "
22118 "(maybe not in same basic block)");
22119 // Group the instructions to a bundle.
22120 BundlePtr->add(BundleMember);
22121 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
22122 BundlePtr.get());
22123 }
22124 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
22125 return *BundlePtr;
22126}
22127
22128// Groups the instructions to a bundle (which is then a single scheduling entity)
22129// and schedules instructions until the bundle gets ready.
22130std::optional<BoUpSLP::ScheduleBundle *>
22131BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
22132 const InstructionsState &S,
22133 const EdgeInfo &EI) {
22134 // No need to schedule PHIs, insertelement, extractelement and extractvalue
22135 // instructions.
22136 if (isa<PHINode>(S.getMainOp()) ||
22137 isVectorLikeInstWithConstOps(S.getMainOp()))
22138 return nullptr;
22139 // If the parent node is non-schedulable, the current node is copyable, and
22140 // any of the parent instructions are used outside several basic blocks or in
22141 // a bin-op node, cancel scheduling; otherwise it may cause wrong def-use deps
22142 // in the analysis, leading to a crash.
22143 // Non-scheduled nodes may not have related ScheduleData model, which may lead
22144 // to a skipped dep analysis.
22145 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22146 EI.UserTE->doesNotNeedToSchedule() &&
22147 EI.UserTE->getOpcode() != Instruction::PHI &&
22148 any_of(EI.UserTE->Scalars, [](Value *V) {
22149 auto *I = dyn_cast<Instruction>(V);
22150 if (!I || I->hasOneUser())
22151 return false;
22152 for (User *U : I->users()) {
22153 auto *UI = cast<Instruction>(U);
22154 if (isa<BinaryOperator>(UI))
22155 return true;
22156 }
22157 return false;
22158 }))
22159 return std::nullopt;
22160 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22161 EI.UserTE->hasCopyableElements() &&
22162 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
22163 all_of(VL, [&](Value *V) {
22164 if (S.isCopyableElement(V))
22165 return true;
22166 return isUsedOutsideBlock(V);
22167 }))
22168 return std::nullopt;
22169 // If any instruction is used only outside the block and its operand is placed
22170 // immediately before it, do not schedule, as it may cause a wrong def-use chain.
22171 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
22172 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
22173 return false;
22174 if (isUsedOutsideBlock(V)) {
22175 for (Value *Op : cast<Instruction>(V)->operands()) {
22176 auto *I = dyn_cast<Instruction>(Op);
22177 if (!I)
22178 continue;
22179 return SLP->isVectorized(I) && I->getNextNode() == V;
22180 }
22181 }
22182 return false;
22183 }))
22184 return std::nullopt;
22185 if (S.areInstructionsWithCopyableElements() && EI) {
22186 bool IsNonSchedulableWithParentPhiNode =
22187 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
22188 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
22189 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22190 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22191 if (IsNonSchedulableWithParentPhiNode) {
22192 SmallSet<std::pair<Value *, Value *>, 4> Values;
22193 for (const auto [Idx, V] :
22194 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
22195 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
22196 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
22197 auto *I = dyn_cast<Instruction>(Op);
22198 if (!I || !isCommutative(I))
22199 continue;
22200 if (!Values.insert(std::make_pair(V, Op)).second)
22201 return std::nullopt;
22202 }
22203 } else {
22204 // If any of the parents requires scheduling - exit, there is a complex dep
22205 // between schedulable/non-schedulable parents.
22206 if (any_of(EI.UserTE->Scalars, [&](Value *V) {
22207 if (EI.UserTE->hasCopyableElements() &&
22208 EI.UserTE->isCopyableElement(V))
22209 return false;
22210 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
22211 return any_of(Entries, [](const TreeEntry *TE) {
22212 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
22213 TE->UserTreeIndex.UserTE->hasState() &&
22214 TE->UserTreeIndex.UserTE->State !=
22215 TreeEntry::SplitVectorize &&
22216 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22217 });
22218 }))
22219 return std::nullopt;
22220 }
22221 }
22222 bool HasCopyables = S.areInstructionsWithCopyableElements();
22223 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
22224 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
22225 // If all operands were replaced by copyables, the operands of this node
22226 // might not be, so we need to recalculate dependencies for the schedule data
22227 // that was replaced by copyable schedule data.
22228 SmallVector<ScheduleData *> ControlDependentMembers;
22229 for (Value *V : VL) {
22230 auto *I = dyn_cast<Instruction>(V);
22231 if (!I || (HasCopyables && S.isCopyableElement(V)))
22232 continue;
22233 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22234 for (const Use &U : I->operands()) {
22235 unsigned &NumOps =
22236 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
22237 .first->getSecond();
22238 ++NumOps;
22239 if (auto *Op = dyn_cast<Instruction>(U.get());
22240 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
22241 if (ScheduleData *OpSD = getScheduleData(Op);
22242 OpSD && OpSD->hasValidDependencies())
22243 // TODO: investigate how to improve it instead of early exiting.
22244 return std::nullopt;
22245 }
22246 }
22247 }
22248 return nullptr;
22249 }
22250
22251 // Initialize the instruction bundle.
22252 Instruction *OldScheduleEnd = ScheduleEnd;
22253 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
22254
22255 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
22256 // Clear deps or recalculate the region, if the memory instruction is a
22257 // copyable. It may have memory deps, which must be recalculated.
22258 SmallVector<ScheduleData *> ControlDependentMembers;
22259 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
22260 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22261 for (ScheduleEntity *SE : Bundle.getBundle()) {
22262 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
22263 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
22264 BundleMember && BundleMember->hasValidDependencies()) {
22265 BundleMember->clearDirectDependencies();
22266 if (RegionHasStackSave ||
22267 !isGuaranteedToTransferExecutionToSuccessor(
22268 BundleMember->getInst()))
22269 ControlDependentMembers.push_back(BundleMember);
22270 }
22271 continue;
22272 }
22273 auto *SD = cast<ScheduleData>(SE);
22274 if (SD->hasValidDependencies() &&
22275 (!S.areInstructionsWithCopyableElements() ||
22276 !S.isCopyableElement(SD->getInst())) &&
22277 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
22278 EI.UserTE->hasState() &&
22279 (!EI.UserTE->hasCopyableElements() ||
22280 !EI.UserTE->isCopyableElement(SD->getInst())))
22281 SD->clearDirectDependencies();
22282 for (const Use &U : SD->getInst()->operands()) {
22283 unsigned &NumOps =
22284 UserOpToNumOps
22285 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
22286 .first->getSecond();
22287 ++NumOps;
22288 if (auto *Op = dyn_cast<Instruction>(U.get());
22289 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
22290 *SLP, NumOps)) {
22291 if (ScheduleData *OpSD = getScheduleData(Op);
22292 OpSD && OpSD->hasValidDependencies()) {
22293 OpSD->clearDirectDependencies();
22294 if (RegionHasStackSave ||
22295 !isGuaranteedToTransferExecutionToSuccessor(Op))
22296 ControlDependentMembers.push_back(OpSD);
22297 }
22298 }
22299 }
22300 }
22301 };
22302 // The scheduling region got new instructions at the lower end (or it is a
22303 // new region for the first bundle). This makes it necessary to
22304 // recalculate all dependencies.
22305 // It is seldom that this needs to be done a second time after adding the
22306 // initial bundle to the region.
22307 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
22308 for_each(ScheduleDataMap, [&](auto &P) {
22309 if (BB != P.first->getParent())
22310 return;
22311 ScheduleData *SD = P.second;
22312 if (isInSchedulingRegion(*SD))
22313 SD->clearDependencies();
22314 });
22315 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
22316 for_each(P.second, [&](ScheduleCopyableData *SD) {
22317 if (isInSchedulingRegion(*SD))
22318 SD->clearDependencies();
22319 });
22320 });
22321 ReSchedule = true;
22322 }
22323 // Check if the bundle data already has deps for copyable elements. In
22324 // this case we need to reset the deps and recalculate them.
22325 if (Bundle && !Bundle.getBundle().empty()) {
22326 if (S.areInstructionsWithCopyableElements() ||
22327 !ScheduleCopyableDataMap.empty())
22328 CheckIfNeedToClearDeps(Bundle);
22329 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
22330 << BB->getName() << "\n");
22331 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
22332 ControlDependentMembers);
22333 } else if (!ControlDependentMembers.empty()) {
22334 ScheduleBundle Invalid = ScheduleBundle::invalid();
22335 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
22336 ControlDependentMembers);
22337 }
22338
22339 if (ReSchedule) {
22340 resetSchedule();
22341 initialFillReadyList(ReadyInsts);
22342 }
22343
22344 // Now try to schedule the new bundle or (if no bundle) just calculate
22345 // dependencies. As soon as the bundle is "ready" it means that there are no
22346 // cyclic dependencies and we can schedule it. Note that it's important that we
22347 // don't "schedule" the bundle yet.
22348 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
22349 !ReadyInsts.empty()) {
22350 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
22351 assert(Picked->isReady() && "must be ready to schedule");
22352 schedule(*SLP, S, EI, Picked, ReadyInsts);
22353 if (Picked == &Bundle)
22354 break;
22355 }
22356 };
22357
22358 // Make sure that the scheduling region contains all
22359 // instructions of the bundle.
22360 for (Value *V : VL) {
22361 if (S.isNonSchedulable(V))
22362 continue;
22363 if (!extendSchedulingRegion(V, S)) {
22364 // The scheduling region got new instructions at the lower end (or it
22365 // is a new region for the first bundle), which makes it necessary to
22366 // recalculate all dependencies.
22367 // Otherwise the compiler may crash trying to incorrectly calculate
22368 // dependencies and emit instruction in the wrong order at the actual
22369 // scheduling.
22370 ScheduleBundle Invalid = ScheduleBundle::invalid();
22371 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
22372 return std::nullopt;
22373 }
22374 }
22375
22376 bool ReSchedule = false;
22377 for (Value *V : VL) {
22378 if (S.isNonSchedulable(V))
22379 continue;
22380 SmallVector<ScheduleCopyableData *> CopyableData =
22381 getScheduleCopyableData(cast<Instruction>(V));
22382 if (!CopyableData.empty()) {
22383 for (ScheduleCopyableData *SD : CopyableData)
22384 ReadyInsts.remove(SD);
22385 }
22386 ScheduleData *BundleMember = getScheduleData(V);
22387 assert((BundleMember || S.isCopyableElement(V)) &&
22388 "no ScheduleData for bundle member (maybe not in same basic block)");
22389 if (!BundleMember)
22390 continue;
22391
22392 // Make sure we don't leave the pieces of the bundle in the ready list when the
22393 // whole bundle might not be ready.
22394 ReadyInsts.remove(BundleMember);
22395 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
22396 !Bundles.empty()) {
22397 for (ScheduleBundle *B : Bundles)
22398 ReadyInsts.remove(B);
22399 }
22400
22401 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
22402 continue;
22403 // A bundle member was scheduled as a single instruction before and now
22404 // needs to be scheduled as part of the bundle. We just get rid of the
22405 // existing schedule.
22406 // A bundle member has deps calculated before it was a copyable element - need
22407 // to reschedule.
22408 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
22409 << " was already scheduled\n");
22410 ReSchedule = true;
22411 }
22412
22413 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
22414 TryScheduleBundleImpl(ReSchedule, Bundle);
22415 if (!Bundle.isReady()) {
22416 for (ScheduleEntity *BD : Bundle.getBundle()) {
22417 // Copyable data scheduling is just removed.
22418 if (isa<ScheduleCopyableData>(BD))
22419 continue;
22420 if (BD->isReady()) {
22421 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
22422 if (Bundles.empty()) {
22423 ReadyInsts.insert(BD);
22424 continue;
22425 }
22426 for (ScheduleBundle *B : Bundles)
22427 if (B->isReady())
22428 ReadyInsts.insert(B);
22429 }
22430 }
22431 ScheduledBundlesList.pop_back();
22432 SmallVector<ScheduleData *> ControlDependentMembers;
22433 for (Value *V : VL) {
22434 if (S.isNonSchedulable(V))
22435 continue;
22436 auto *I = cast<Instruction>(V);
22437 if (S.isCopyableElement(I)) {
22438 // Remove the copyable data from the scheduling region and restore
22439 // previous mappings.
22440 auto KV = std::make_pair(EI, I);
22441 assert(ScheduleCopyableDataMap.contains(KV) &&
22442 "no ScheduleCopyableData for copyable element");
22443 ScheduleCopyableData *SD =
22444 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
22445 ScheduleCopyableDataMapByUsers[I].remove(SD);
22446 if (EI.UserTE) {
22447 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22448 const auto *It = find(Op, I);
22449 assert(It != Op.end() && "Lane not set");
22450 SmallPtrSet<Instruction *, 4> Visited;
22451 do {
22452 int Lane = std::distance(Op.begin(), It);
22453 assert(Lane >= 0 && "Lane not set");
22454 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22455 !EI.UserTE->ReorderIndices.empty())
22456 Lane = EI.UserTE->ReorderIndices[Lane];
22457 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22458 "Couldn't find extract lane");
22459 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22460 if (!Visited.insert(In).second) {
22461 It = find(make_range(std::next(It), Op.end()), I);
22462 break;
22463 }
22464 ScheduleCopyableDataMapByInstUser
22465 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
22466 .pop_back();
22467 It = find(make_range(std::next(It), Op.end()), I);
22468 } while (It != Op.end());
22469 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
22470 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
22471 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
22472 }
22473 if (ScheduleCopyableDataMapByUsers[I].empty())
22474 ScheduleCopyableDataMapByUsers.erase(I);
22475 ScheduleCopyableDataMap.erase(KV);
22476 // Need to recalculate dependencies for the actual schedule data.
22477 if (ScheduleData *OpSD = getScheduleData(I);
22478 OpSD && OpSD->hasValidDependencies()) {
22479 OpSD->clearDirectDependencies();
22480 if (RegionHasStackSave ||
22481 !isGuaranteedToTransferExecutionToSuccessor(I))
22482 ControlDependentMembers.push_back(OpSD);
22483 }
22484 continue;
22485 }
22486 ScheduledBundles.find(I)->getSecond().pop_back();
22487 }
22488 if (!ControlDependentMembers.empty()) {
22489 ScheduleBundle Invalid = ScheduleBundle::invalid();
22490 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
22491 ControlDependentMembers);
22492 }
22493 return std::nullopt;
22494 }
22495 return &Bundle;
22496}
22497
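// Returns a fresh ScheduleData slot, allocating a new chunk when the current
// chunk is exhausted.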
22498BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22499 // Allocate a new ScheduleData for the instruction.
22500 if (ChunkPos >= ChunkSize) {
22501 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
22502 ChunkPos = 0;
22503 }
22504 return &(ScheduleDataChunks.back()[ChunkPos++]);
22505}
22506
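// Extends the scheduling region so that it contains V, searching upwards and
// downwards from the current boundaries at the same time. Returns false if the
// region would exceed ScheduleRegionSizeLimit.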
22507bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
22508 Value *V, const InstructionsState &S) {
22509 auto *I = dyn_cast<Instruction>(V);
22510 assert(I && "bundle member must be an instruction");
22511 if (getScheduleData(I))
22512 return true;
22513 if (!ScheduleStart) {
22514 // It's the first instruction in the new region.
22515 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
22516 ScheduleStart = I;
22517 ScheduleEnd = I->getNextNode();
22518 assert(ScheduleEnd && "tried to vectorize a terminator?");
22519 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
22520 return true;
22521 }
22522 // Search up and down at the same time, because we don't know if the new
22523 // instruction is above or below the existing scheduling region.
22524 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not counted
22525 // against the budget. Otherwise debug info could affect codegen.
22526 BasicBlock::reverse_iterator UpIter =
22527 ++ScheduleStart->getIterator().getReverse();
22528 BasicBlock::reverse_iterator UpperEnd = BB->rend();
22529 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
22530 BasicBlock::iterator LowerEnd = BB->end();
22531 auto IsAssumeLikeIntr = [](const Instruction &I) {
22532 if (auto *II = dyn_cast<IntrinsicInst>(&I))
22533 return II->isAssumeLikeIntrinsic();
22534 return false;
22535 };
22536 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22537 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22538 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
22539 &*DownIter != I) {
22540 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
22541 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
22542 return false;
22543 }
22544
22545 ++UpIter;
22546 ++DownIter;
22547
22548 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22549 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22550 }
22551 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
22552 assert(I->getParent() == ScheduleStart->getParent() &&
22553 "Instruction is in wrong basic block.");
22554 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
22555 ScheduleStart = I;
22556 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
22557 << "\n");
22558 return true;
22559 }
22560 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
22561 "Expected to reach top of the basic block or instruction down the "
22562 "lower end.");
22563 assert(I->getParent() == ScheduleEnd->getParent() &&
22564 "Instruction is in wrong basic block.");
22565 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
22566 nullptr);
22567 ScheduleEnd = I->getNextNode();
22568 assert(ScheduleEnd && "tried to vectorize a terminator?");
22569 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
22570 return true;
22571}
22572
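// Initializes ScheduleData for every instruction in the range [FromI, ToI) and
// links memory-accessing instructions into the region's load/store chain.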
22573void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
22574 Instruction *ToI,
22575 ScheduleData *PrevLoadStore,
22576 ScheduleData *NextLoadStore) {
22577 ScheduleData *CurrentLoadStore = PrevLoadStore;
22578 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
22579 // No need to allocate data for non-schedulable instructions.
22580 if (isa<PHINode>(I))
22581 continue;
22582 ScheduleData *SD = ScheduleDataMap.lookup(I);
22583 if (!SD) {
22584 SD = allocateScheduleDataChunks();
22585 ScheduleDataMap[I] = SD;
22586 }
22587 assert(!isInSchedulingRegion(*SD) &&
22588 "new ScheduleData already in scheduling region");
22589 SD->init(SchedulingRegionID, I);
22590
22591 auto CanIgnoreLoad = [](const Instruction *I) {
22592 const auto *LI = dyn_cast<LoadInst>(I);
22593 // If there is a simple load marked as invariant, we can ignore it.
22594 // But, in the (unlikely) case of non-simple invariant load,
22595 // we should not ignore it.
22596 return LI && LI->isSimple() &&
22597 LI->getMetadata(LLVMContext::MD_invariant_load);
22598 };
22599
22600 if (I->mayReadOrWriteMemory() &&
22601 // Simple InvariantLoad does not depend on other memory accesses.
22602 !CanIgnoreLoad(I) &&
22603 (!isa<IntrinsicInst>(I) ||
22604 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
22605 cast<IntrinsicInst>(I)->getIntrinsicID() !=
22606 Intrinsic::pseudoprobe))) {
22607 // Update the linked list of memory accessing instructions.
22608 if (CurrentLoadStore) {
22609 CurrentLoadStore->setNextLoadStore(SD);
22610 } else {
22611 FirstLoadStoreInRegion = SD;
22612 }
22613 CurrentLoadStore = SD;
22614 }
22615
22616 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22617 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22618 RegionHasStackSave = true;
22619 }
22620 if (NextLoadStore) {
22621 if (CurrentLoadStore)
22622 CurrentLoadStore->setNextLoadStore(NextLoadStore);
22623 } else {
22624 LastLoadStoreInRegion = CurrentLoadStore;
22625 }
22626}
22627
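// Calculates def-use, control and memory dependencies for the given bundle
// (plus the extra ControlDeps members) using a worklist, optionally inserting
// entities that become ready into the ready list.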
22628void BoUpSLP::BlockScheduling::calculateDependencies(
22629 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
22630 ArrayRef<ScheduleData *> ControlDeps) {
22631 SmallVector<ScheduleEntity *> WorkList;
22632 auto ProcessNode = [&](ScheduleEntity *SE) {
22633 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
22634 if (CD->hasValidDependencies())
22635 return;
22636 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
22637 CD->initDependencies();
22638 CD->resetUnscheduledDeps();
22639 const EdgeInfo &EI = CD->getEdgeInfo();
22640 if (EI.UserTE) {
22641 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
22642 const auto *It = find(Op, CD->getInst());
22643 assert(It != Op.end() && "Lane not set");
22644 SmallPtrSet<Instruction *, 4> Visited;
22645 do {
22646 int Lane = std::distance(Op.begin(), It);
22647 assert(Lane >= 0 && "Lane not set");
22648 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
22649 !EI.UserTE->ReorderIndices.empty())
22650 Lane = EI.UserTE->ReorderIndices[Lane];
22651 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22652 "Couldn't find extract lane");
22653 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
22654 if (EI.UserTE->isCopyableElement(In)) {
22655 // We may not have related copyable scheduling data if the
22656 // instruction is non-schedulable.
22657 if (ScheduleCopyableData *UseSD =
22658 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
22659 CD->incDependencies();
22660 if (!UseSD->isScheduled())
22661 CD->incrementUnscheduledDeps(1);
22662 if (!UseSD->hasValidDependencies() ||
22663 (InsertInReadyList && UseSD->isReady()))
22664 WorkList.push_back(UseSD);
22665 }
22666 } else if (Visited.insert(In).second) {
22667 if (ScheduleData *UseSD = getScheduleData(In)) {
22668 CD->incDependencies();
22669 if (!UseSD->isScheduled())
22670 CD->incrementUnscheduledDeps(1);
22671 if (!UseSD->hasValidDependencies() ||
22672 (InsertInReadyList && UseSD->isReady()))
22673 WorkList.push_back(UseSD);
22674 }
22675 }
22676 It = find(make_range(std::next(It), Op.end()), CD->getInst());
22677 } while (It != Op.end());
22678 if (CD->isReady() && CD->getDependencies() == 0 &&
22679 (EI.UserTE->hasState() &&
22680 (EI.UserTE->getMainOp()->getParent() !=
22681 CD->getInst()->getParent() ||
22682 (isa<PHINode>(EI.UserTE->getMainOp()) &&
22683 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
22684 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
22685 auto *IU = dyn_cast<Instruction>(U);
22686 if (!IU)
22687 return true;
22688 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
22689 })))))) {
22690 // If there are no uses in the block - mark as having a pseudo-use, which
22691 // cannot be scheduled.
22692 // This prevents incorrect def-use tracking between the external user and
22693 // the actual instruction.
22694 CD->incDependencies();
22695 CD->incrementUnscheduledDeps(1);
22696 }
22697 }
22698 return;
22699 }
22700 auto *BundleMember = cast<ScheduleData>(SE);
22701 if (BundleMember->hasValidDependencies())
22702 return;
22703 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
22704 BundleMember->initDependencies();
22705 BundleMember->resetUnscheduledDeps();
22706 // Handle def-use chain dependencies.
22707 SmallDenseMap<Value *, unsigned> UserToNumOps;
22708 for (User *U : BundleMember->getInst()->users()) {
22709 if (isa<PHINode>(U))
22710 continue;
22711 if (ScheduleData *UseSD = getScheduleData(U)) {
22712 // The operand is a copyable element - skip.
22713 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
22714 ++NumOps;
22715 if (areAllOperandsReplacedByCopyableData(
22716 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
22717 continue;
22718 BundleMember->incDependencies();
22719 if (!UseSD->isScheduled())
22720 BundleMember->incrementUnscheduledDeps(1);
22721 if (!UseSD->hasValidDependencies() ||
22722 (InsertInReadyList && UseSD->isReady()))
22723 WorkList.push_back(UseSD);
22724 }
22725 }
22726 for (ScheduleCopyableData *UseSD :
22727 getScheduleCopyableDataUsers(BundleMember->getInst())) {
22728 BundleMember->incDependencies();
22729 if (!UseSD->isScheduled())
22730 BundleMember->incrementUnscheduledDeps(1);
22731 if (!UseSD->hasValidDependencies() ||
22732 (InsertInReadyList && UseSD->isReady()))
22733 WorkList.push_back(UseSD);
22734 }
22735
22736 SmallPtrSet<const Instruction *, 4> Visited;
22737 auto MakeControlDependent = [&](Instruction *I) {
22738 // Do not mark control dependent twice.
22739 if (!Visited.insert(I).second)
22740 return;
22741 auto *DepDest = getScheduleData(I);
22742 assert(DepDest && "must be in schedule window");
22743 DepDest->addControlDependency(BundleMember);
22744 BundleMember->incDependencies();
22745 if (!DepDest->isScheduled())
22746 BundleMember->incrementUnscheduledDeps(1);
22747 if (!DepDest->hasValidDependencies() ||
22748 (InsertInReadyList && DepDest->isReady()))
22749 WorkList.push_back(DepDest);
22750 };
22751
22752 // Any instruction which isn't safe to speculate at the beginning of the
22753 // block is control dependent on any early exit or non-willreturn call
22754 // which precedes it.
22755 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
22756 for (Instruction *I = BundleMember->getInst()->getNextNode();
22757 I != ScheduleEnd; I = I->getNextNode()) {
22758 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
22759 continue;
22760
22761 // Add the dependency
22762 MakeControlDependent(I);
22763
22764 if (!isGuaranteedToTransferExecutionToSuccessor(I))
22765 // Everything past here must be control dependent on I.
22766 break;
22767 }
22768 }
22769
22770 if (RegionHasStackSave) {
22771 // If we have an inalloca alloca instruction, it needs to be scheduled
22772 // after any preceding stacksave. We also need to prevent any alloca
22773 // from reordering above a preceding stackrestore.
22774 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
22775 match(BundleMember->getInst(),
22776 m_Intrinsic<Intrinsic::stackrestore>())) {
22777 for (Instruction *I = BundleMember->getInst()->getNextNode();
22778 I != ScheduleEnd; I = I->getNextNode()) {
22779 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
22780 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22781 // Any allocas past here must be control dependent on I, and I
22782 // must be memory dependent on BundleMember->Inst.
22783 break;
22784
22785 if (!isa<AllocaInst>(I))
22786 continue;
22787
22788 // Add the dependency
22789 MakeControlDependent(I);
22790 }
22791 }
22792
22793 // In addition to the cases handled just above, we need to prevent
22794 // allocas and loads/stores from moving below a stacksave or a
22795 // stackrestore. Avoiding moving allocas below stackrestore is currently
22796 // thought to be conservatism. Moving loads/stores below a stackrestore
22797 // can lead to incorrect code.
22798 if (isa<AllocaInst>(BundleMember->getInst()) ||
22799 BundleMember->getInst()->mayReadOrWriteMemory()) {
22800 for (Instruction *I = BundleMember->getInst()->getNextNode();
22801 I != ScheduleEnd; I = I->getNextNode()) {
22802 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
22803 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
22804 continue;
22805
22806 // Add the dependency
22807 MakeControlDependent(I);
22808 break;
22809 }
22810 }
22811 }
22812
22813 // Handle the memory dependencies (if any).
22814 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
22815 if (!NextLoadStore)
22816 return;
22817 Instruction *SrcInst = BundleMember->getInst();
22818 assert(SrcInst->mayReadOrWriteMemory() &&
22819 "NextLoadStore list for non memory effecting bundle?");
22820 MemoryLocation SrcLoc = getLocation(SrcInst);
22821 bool SrcMayWrite = SrcInst->mayWriteToMemory();
22822 unsigned NumAliased = 0;
22823 unsigned DistToSrc = 1;
22824 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
22825
22826 for (ScheduleData *DepDest = NextLoadStore; DepDest;
22827 DepDest = DepDest->getNextLoadStore()) {
22828 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
22829
22830 // We have two limits to reduce the complexity:
22831 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
22832 // SLP->isAliased (which is the expensive part in this loop).
22833 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
22834 // the whole loop (even if the loop is fast, it's quadratic).
22835 // It's important for the loop break condition (see below) to
22836 // check this limit even between two read-only instructions.
22837 if (DistToSrc >= MaxMemDepDistance ||
22838 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
22839 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
22840 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
22841
22842 // We increment the counter only if the locations are aliased
22843 // (instead of counting all alias checks). This gives a better
22844 // balance between reduced runtime and accurate dependencies.
22845 NumAliased++;
22846
22847 DepDest->addMemoryDependency(BundleMember);
22848 BundleMember->incDependencies();
22849 if (!DepDest->isScheduled())
22850 BundleMember->incrementUnscheduledDeps(1);
22851 if (!DepDest->hasValidDependencies() ||
22852 (InsertInReadyList && DepDest->isReady()))
22853 WorkList.push_back(DepDest);
22854 }
22855
22856 // Example, explaining the loop break condition: Let's assume our
22857 // starting instruction is i0 and MaxMemDepDistance = 3.
22858 //
22859 // +--------v--v--v
22860 // i0,i1,i2,i3,i4,i5,i6,i7,i8
22861 // +--------^--^--^
22862 //
22863 // MaxMemDepDistance let us stop alias-checking at i3 and we add
22864 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
22865 // Previously we already added dependencies from i3 to i6,i7,i8
22866 // (because of MaxMemDepDistance). As we added a dependency from
22867 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
22868 // and we can abort this loop at i6.
22869 if (DistToSrc >= 2 * MaxMemDepDistance)
22870 break;
22871 DistToSrc++;
22872 }
22873 };
22874
22875 assert((Bundle || !ControlDeps.empty()) &&
22876 "expected at least one instruction to schedule");
22877 if (Bundle)
22878 WorkList.push_back(Bundle.getBundle().front());
22879 WorkList.append(ControlDeps.begin(), ControlDeps.end());
22880 SmallPtrSet<ScheduleBundle *, 16> Visited;
22881 while (!WorkList.empty()) {
22882 ScheduleEntity *SD = WorkList.pop_back_val();
22883 SmallVector<ScheduleBundle *, 1> CopyableBundle;
22884 ArrayRef<ScheduleBundle *> Bundles;
22885 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
22886 CopyableBundle.push_back(&CD->getBundle());
22887 Bundles = CopyableBundle;
22888 } else {
22889 Bundles = getScheduleBundles(SD->getInst());
22890 }
22891 if (Bundles.empty()) {
22892 if (!SD->hasValidDependencies())
22893 ProcessNode(SD);
22894 if (InsertInReadyList && SD->isReady()) {
22895 ReadyInsts.insert(SD);
22896 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
22897 }
22898 continue;
22899 }
22900 for (ScheduleBundle *Bundle : Bundles) {
22901 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
22902 continue;
22903 assert(isInSchedulingRegion(*Bundle) &&
22904 "ScheduleData not in scheduling region");
22905 for_each(Bundle->getBundle(), ProcessNode);
22906 }
22907 if (InsertInReadyList && SD->isReady()) {
22908 for (ScheduleBundle *Bundle : Bundles) {
22909 assert(isInSchedulingRegion(*Bundle) &&
22910 "ScheduleData not in scheduling region");
22911 if (!Bundle->isReady())
22912 continue;
22913 ReadyInsts.insert(Bundle);
22914 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
22915 << "\n");
22916 }
22917 }
22918 }
22919}
22920
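// Marks all schedule data, copyable data and bundles in the region as
// unscheduled and clears the ready list so the region can be rescheduled.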
22921void BoUpSLP::BlockScheduling::resetSchedule() {
22922 assert(ScheduleStart &&
22923 "tried to reset schedule on block which has not been scheduled");
22924 for_each(ScheduleDataMap, [&](auto &P) {
22925 if (BB != P.first->getParent())
22926 return;
22927 ScheduleData *SD = P.second;
22928 if (isInSchedulingRegion(*SD)) {
22929 SD->setScheduled(/*Scheduled=*/false);
22930 SD->resetUnscheduledDeps();
22931 }
22932 });
22933 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
22934 for_each(P.second, [&](ScheduleCopyableData *SD) {
22935 if (isInSchedulingRegion(*SD)) {
22936 SD->setScheduled(/*Scheduled=*/false);
22937 SD->resetUnscheduledDeps();
22938 }
22939 });
22940 });
22941 for_each(ScheduledBundles, [&](auto &P) {
22942 for_each(P.second, [&](ScheduleBundle *Bundle) {
22943 if (isInSchedulingRegion(*Bundle))
22944 Bundle->setScheduled(/*Scheduled=*/false);
22945 });
22946 });
22947 // Reset schedule data for copyable elements.
22948 for (auto &P : ScheduleCopyableDataMap) {
22949 if (isInSchedulingRegion(*P.second)) {
22950 P.second->setScheduled(/*Scheduled=*/false);
22951 P.second->resetUnscheduledDeps();
22952 }
22953 }
22954 ReadyInsts.clear();
22955}
22956
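// Performs the final scheduling of a block: dependencies are recalculated where
// needed, entities are prioritized by original instruction order, and the
// picked instructions are moved to their final positions.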
22957void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
22958 if (!BS->ScheduleStart)
22959 return;
22960
22961 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
22962
22963 // A key point - if we got here, pre-scheduling was able to find a valid
22964 // scheduling of the sub-graph of the scheduling window which consists
22965 // of all vector bundles and their transitive users. As such, we do not
22966 // need to reschedule anything *outside of* that subgraph.
22967
22968 BS->resetSchedule();
22969
22970 // For the real scheduling we use a more sophisticated ready-list: it is
22971 // sorted by the original instruction location. This lets the final schedule
22972 // be as close as possible to the original instruction order.
22973 // WARNING: If changing this order causes a correctness issue, that means
22974 // there is some missing dependence edge in the schedule data graph.
22975 struct ScheduleDataCompare {
22976 bool operator()(const ScheduleEntity *SD1,
22977 const ScheduleEntity *SD2) const {
22978 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
22979 }
22980 };
22981 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
22982
22983 // Ensure that all dependency data is updated (for nodes in the sub-graph)
22984 // and fill the ready-list with initial instructions.
22985 int Idx = 0;
22986 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22987 I = I->getNextNode()) {
22988 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
22989 if (!Bundles.empty()) {
22990 for (ScheduleBundle *Bundle : Bundles) {
22991 Bundle->setSchedulingPriority(Idx++);
22992 if (!Bundle->hasValidDependencies())
22993 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
22994 }
22995 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
22996 for (ScheduleCopyableData *SD : reverse(SDs)) {
22997 ScheduleBundle &Bundle = SD->getBundle();
22998 Bundle.setSchedulingPriority(Idx++);
22999 if (!Bundle.hasValidDependencies())
23000 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
23001 }
23002 continue;
23003 }
23004 SmallVector<ScheduleCopyableData *> CopyableData =
23005 BS->getScheduleCopyableDataUsers(I);
23006 if (ScheduleData *SD = BS->getScheduleData(I)) {
23007 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
23008 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
23009 SDTEs.front()->doesNotNeedToSchedule() ||
23011 "scheduler and vectorizer bundle mismatch");
23012 SD->setSchedulingPriority(Idx++);
23013 if (!SD->hasValidDependencies() &&
23014 (!CopyableData.empty() ||
23015 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
23016 assert(TE->isGather() && "expected gather node");
23017 return TE->hasState() && TE->hasCopyableElements() &&
23018 TE->isCopyableElement(I);
23019 }))) {
23020 // Need to calculate deps for these nodes to correctly handle copyable
23021 // dependencies, even if they were cancelled.
23022 // If the copyables bundle was cancelled, the deps are cleared and need to
23023 // be recalculated.
23024 ScheduleBundle Bundle;
23025 Bundle.add(SD);
23026 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
23027 }
23028 }
23029 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
23030 ScheduleBundle &Bundle = SD->getBundle();
23031 Bundle.setSchedulingPriority(Idx++);
23032 if (!Bundle.hasValidDependencies())
23033 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
23034 }
23035 }
23036 BS->initialFillReadyList(ReadyInsts);
23037
23038 Instruction *LastScheduledInst = BS->ScheduleEnd;
23039
23040 // Do the "real" scheduling.
23041 SmallPtrSet<Instruction *, 16> Scheduled;
23042 while (!ReadyInsts.empty()) {
23043 auto *Picked = *ReadyInsts.begin();
23044 ReadyInsts.erase(ReadyInsts.begin());
23045
23046 // Move the scheduled instruction(s) to their dedicated places, if not
23047 // there yet.
23048 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
23049 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
23050 Instruction *PickedInst = BundleMember->getInst();
23051 // If a copyable must be scheduled as part of something else, skip it.
23052 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
23053 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
23054 (!IsCopyable && !Scheduled.insert(PickedInst).second))
23055 continue;
23056 if (PickedInst->getNextNode() != LastScheduledInst)
23057 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
23058 LastScheduledInst = PickedInst;
23059 }
23060 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
23061 LastScheduledInst);
23062 } else {
23063 auto *SD = cast<ScheduleData>(Picked);
23064 Instruction *PickedInst = SD->getInst();
23065 if (PickedInst->getNextNode() != LastScheduledInst)
23066 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
23067 LastScheduledInst = PickedInst;
23068 }
23069 auto Invalid = InstructionsState::invalid();
23070 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
23071 }
23072
23073 // Check that we didn't break any of our invariants.
23074#ifdef EXPENSIVE_CHECKS
23075 BS->verify();
23076#endif
23077
23078#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
23079 // Check that all schedulable entities got scheduled
23080 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
23081 I = I->getNextNode()) {
23082 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
23083 assert(all_of(Bundles,
23084 [](const ScheduleBundle *Bundle) {
23085 return Bundle->isScheduled();
23086 }) &&
23087 "must be scheduled at this point");
23088 }
23089#endif
23090
23091 // Avoid duplicate scheduling of the block.
23092 BS->ScheduleStart = nullptr;
23093}
23094
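// Computes the element width (in bits) to use for the expression rooted at V,
// preferring the width of the loads and stores that feed it.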
23095unsigned BoUpSLP::getVectorElementSize(Value *V) {
23096 // If V is a store, just return the width of the stored value (or value
23097 // truncated just before storing) without traversing the expression tree.
23098 // This is the common case.
23099 if (auto *Store = dyn_cast<StoreInst>(V))
23100 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
23101
23102 if (auto *IEI = dyn_cast<InsertElementInst>(V))
23103 return getVectorElementSize(IEI->getOperand(1));
23104
23105 auto E = InstrElementSize.find(V);
23106 if (E != InstrElementSize.end())
23107 return E->second;
23108
23109 // If V is not a store, we can traverse the expression tree to find loads
23110 // that feed it. The type of the loaded value may indicate a more suitable
23111 // width than V's type. We want to base the vector element size on the width
23112 // of memory operations where possible.
23113 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
23114 SmallPtrSet<Instruction *, 16> Visited;
23115 if (auto *I = dyn_cast<Instruction>(V)) {
23116 Worklist.emplace_back(I, I->getParent(), 0);
23117 Visited.insert(I);
23118 }
23119
23120 // Traverse the expression tree in bottom-up order looking for loads. If we
23121 // encounter an instruction we don't yet handle, we give up.
23122 auto Width = 0u;
23123 Value *FirstNonBool = nullptr;
23124 while (!Worklist.empty()) {
23125 auto [I, Parent, Level] = Worklist.pop_back_val();
23126
23127 // We should only be looking at scalar instructions here. If the current
23128 // instruction has a vector type, skip.
23129 auto *Ty = I->getType();
23130 if (isa<VectorType>(Ty))
23131 continue;
23132 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
23133 FirstNonBool = I;
23134 if (Level > RecursionMaxDepth)
23135 continue;
23136
23137 // If the current instruction is a load, update MaxWidth to reflect the
23138 // width of the loaded value.
23139 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
23140 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
23141
23142 // Otherwise, we need to visit the operands of the instruction. We only
23143 // handle the interesting cases from buildTree here. If an operand is an
23144 // instruction we haven't yet visited and from the same basic block as the
23145 // user or the use is a PHI node, we add it to the worklist.
23146 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
23147 BinaryOperator, UnaryOperator>(I)) {
23148 for (Use &U : I->operands()) {
23149 if (auto *J = dyn_cast<Instruction>(U.get()))
23150 if (Visited.insert(J).second &&
23151 (isa<PHINode>(I) || J->getParent() == Parent)) {
23152 Worklist.emplace_back(J, J->getParent(), Level + 1);
23153 continue;
23154 }
23155 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
23156 FirstNonBool = U.get();
23157 }
23158 } else {
23159 break;
23160 }
23161 }
23162
23163 // If we didn't encounter a memory access in the expression tree, or if we
23164 // gave up for some reason, just return the width of V. Otherwise, return the
23165 // maximum width we found.
23166 if (!Width) {
23167 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
23168 V = FirstNonBool;
23169 Width = DL->getTypeSizeInBits(V->getType());
23170 }
23171
23172 for (Instruction *I : Visited)
23173 InstrElementSize[I] = Width;
23174
23175 return Width;
23176}
23177
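// Checks whether the expression rooted at the given tree entry can be computed
// in a smaller bit width; collects the indices of demotable entries in ToDemote
// and raises BitWidth to the minimal width that is still correct.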
23178bool BoUpSLP::collectValuesToDemote(
23179 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
23180 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
23181 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
23182 bool &IsProfitableToDemote, bool IsTruncRoot) const {
23183 // We can always demote constants.
23184 if (all_of(E.Scalars, IsaPred<Constant>))
23185 return true;
23186
23187 unsigned OrigBitWidth =
23188 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
23189 if (OrigBitWidth == BitWidth) {
23190 MaxDepthLevel = 1;
23191 return true;
23192 }
23193
23194 // Check if the node was analyzed already and must keep its original bitwidth.
23195 if (NodesToKeepBWs.contains(E.Idx))
23196 return false;
23197
23198 // If the value is not a vectorized instruction in the expression and not used
23199 // by the insertelement instruction and not used in multiple vector nodes, it
23200 // cannot be demoted.
23201 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
23202 if (isa<PoisonValue>(R))
23203 return false;
23204 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23205 });
23206 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
23207 if (isa<PoisonValue>(V))
23208 return true;
23209 if (getTreeEntries(V).size() > 1)
23210 return false;
23211 // For the last shuffle of sext/zext with many uses, we need to check the extra
23212 // bit for unsigned values, otherwise we may have incorrect casting for reused
23213 // scalars.
23214 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
23215 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
23216 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23217 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
23218 return true;
23219 }
23220 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
23221 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
23222 if (IsSignedNode)
23223 ++BitWidth1;
23224 if (auto *I = dyn_cast<Instruction>(V)) {
23225 APInt Mask = DB->getDemandedBits(I);
23226 unsigned BitWidth2 =
23227 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
23228 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
23229 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
23230 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
23231 break;
23232 BitWidth2 *= 2;
23233 }
23234 BitWidth1 = std::min(BitWidth1, BitWidth2);
23235 }
23236 BitWidth = std::max(BitWidth, BitWidth1);
23237 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
23238 };
23239 auto FinalAnalysis = [&, TTI = TTI]() {
23240 if (!IsProfitableToDemote)
23241 return false;
23242 bool Res = all_of(
23243 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
23244 // Demote gathers.
23245 if (Res && E.isGather()) {
23246 if (E.hasState()) {
23247 if (const TreeEntry *SameTE =
23248 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
23249 SameTE)
23250 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
23251 ToDemote, Visited, NodesToKeepBWs,
23252 MaxDepthLevel, IsProfitableToDemote,
23253 IsTruncRoot)) {
23254 ToDemote.push_back(E.Idx);
23255 return true;
23256 }
23257 }
23258 // Check possible extractelement instructions bases and final vector
23259 // length.
23260 SmallPtrSet<Value *, 4> UniqueBases;
23261 for (Value *V : E.Scalars) {
23262 auto *EE = dyn_cast<ExtractElementInst>(V);
23263 if (!EE)
23264 continue;
23265 UniqueBases.insert(EE->getVectorOperand());
23266 }
23267 const unsigned VF = E.Scalars.size();
23268 Type *OrigScalarTy = E.Scalars.front()->getType();
23269 if (UniqueBases.size() <= 2 ||
23270 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
23271 ::getNumberOfParts(
23272 *TTI,
23273 getWidenedType(
23274 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
23275 VF))) {
23276 ToDemote.push_back(E.Idx);
23277 return true;
23278 }
23279 }
23280 return Res;
23281 };
23282 if (E.isGather() || !Visited.insert(&E).second ||
23283 any_of(E.Scalars, [&](Value *V) {
23284 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
23285 return isa<InsertElementInst>(U) && !isVectorized(U);
23286 });
23287 }))
23288 return FinalAnalysis();
23289
23290 if (any_of(E.Scalars, [&](Value *V) {
23291 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
23292 return isVectorized(U) ||
23293 (E.Idx == 0 && UserIgnoreList &&
23294 UserIgnoreList->contains(U)) ||
23295 (!isa<CmpInst>(U) && U->getType()->isSized() &&
23296 !U->getType()->isScalableTy() &&
23297 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
23298 }) && !IsPotentiallyTruncated(V, BitWidth);
23299 }))
23300 return false;
23301
23302 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
23303 bool &NeedToExit) {
23304 NeedToExit = false;
23305 unsigned InitLevel = MaxDepthLevel;
23306 for (const TreeEntry *Op : Operands) {
23307 unsigned Level = InitLevel;
23308 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
23309 ToDemote, Visited, NodesToKeepBWs, Level,
23310 IsProfitableToDemote, IsTruncRoot)) {
23311 if (!IsProfitableToDemote)
23312 return false;
23313 NeedToExit = true;
23314 if (!FinalAnalysis())
23315 return false;
23316 continue;
23317 }
23318 MaxDepthLevel = std::max(MaxDepthLevel, Level);
23319 }
23320 return true;
23321 };
23322 auto AttemptCheckBitwidth =
23323 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
23324 // Try all bitwidth < OrigBitWidth.
23325 NeedToExit = false;
23326 unsigned BestFailBitwidth = 0;
23327 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
23328 if (Checker(BitWidth, OrigBitWidth))
23329 return true;
23330 if (BestFailBitwidth == 0 && FinalAnalysis())
23331 BestFailBitwidth = BitWidth;
23332 }
23333 if (BitWidth >= OrigBitWidth) {
23334 if (BestFailBitwidth == 0) {
23335 BitWidth = OrigBitWidth;
23336 return false;
23337 }
23338 MaxDepthLevel = 1;
23339 BitWidth = BestFailBitwidth;
23340 NeedToExit = true;
23341 return true;
23342 }
23343 return false;
23344 };
23345 auto TryProcessInstruction =
23346 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
23347 function_ref<bool(unsigned, unsigned)> Checker = {}) {
23348 if (Operands.empty()) {
23349 if (!IsTruncRoot)
23350 MaxDepthLevel = 1;
23351 for (Value *V : E.Scalars)
23352 (void)IsPotentiallyTruncated(V, BitWidth);
23353 } else {
23354 // Several vectorized uses? Check if we can truncate it, otherwise -
23355 // exit.
23356 if (any_of(E.Scalars, [&](Value *V) {
23357 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
23358 }))
23359 return false;
23360 bool NeedToExit = false;
23361 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
23362 return false;
23363 if (NeedToExit)
23364 return true;
23365 if (!ProcessOperands(Operands, NeedToExit))
23366 return false;
23367 if (NeedToExit)
23368 return true;
23369 }
23370
23371 ++MaxDepthLevel;
23372 // Record the entry that we can demote.
23373 ToDemote.push_back(E.Idx);
23374 return IsProfitableToDemote;
23375 };
23376
23377 if (E.State == TreeEntry::SplitVectorize)
23378 return TryProcessInstruction(
23379 BitWidth,
23380 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
23381 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
23382
23383 if (E.isAltShuffle()) {
23384 // Combining these opcodes may lead to incorrect analysis, skip for now.
23385 auto IsDangerousOpcode = [](unsigned Opcode) {
23386 switch (Opcode) {
23387 case Instruction::Shl:
23388 case Instruction::AShr:
23389 case Instruction::LShr:
23390 case Instruction::UDiv:
23391 case Instruction::SDiv:
23392 case Instruction::URem:
23393 case Instruction::SRem:
23394 return true;
23395 default:
23396 break;
23397 }
23398 return false;
23399 };
23400 if (IsDangerousOpcode(E.getAltOpcode()))
23401 return FinalAnalysis();
23402 }
23403
23404 switch (E.getOpcode()) {
23405
23406 // We can always demote truncations and extensions. Since truncations can
23407 // seed additional demotion, we save the truncated value.
23408 case Instruction::Trunc:
23409 if (IsProfitableToDemoteRoot)
23410 IsProfitableToDemote = true;
23411 return TryProcessInstruction(BitWidth);
23412 case Instruction::ZExt:
23413 case Instruction::SExt:
23414 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
23415 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
23416 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
23417 return false;
23418 IsProfitableToDemote = true;
23419 return TryProcessInstruction(BitWidth);
23420
23421 // We can demote certain binary operations if we can demote both of their
23422 // operands.
23423 case Instruction::Add:
23424 case Instruction::Sub:
23425 case Instruction::Mul:
23426 case Instruction::And:
23427 case Instruction::Or:
23428 case Instruction::Xor: {
23429 return TryProcessInstruction(
23430 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
23431 }
23432 case Instruction::Freeze:
23433 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
23434 case Instruction::Shl: {
23435 // If we are truncating the result of this SHL, and if it's a shift of an
23436 // in-range amount, we can always perform a SHL in a smaller type.
23437 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
23438 return all_of(E.Scalars, [&](Value *V) {
23439 if (isa<PoisonValue>(V))
23440 return true;
23441 if (E.isCopyableElement(V))
23442 return true;
23443 auto *I = cast<Instruction>(V);
23444 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23445 return AmtKnownBits.getMaxValue().ult(BitWidth);
23446 });
23447 };
23448 return TryProcessInstruction(
23449 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
23450 }
23451 case Instruction::LShr: {
23452 // If this is a truncate of a logical shr, we can truncate it to a smaller
23453 // lshr iff we know that the bits we would otherwise be shifting in are
23454 // already zeros.
23455 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23456 return all_of(E.Scalars, [&](Value *V) {
23457 if (isa<PoisonValue>(V))
23458 return true;
23459 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23460 if (E.isCopyableElement(V))
23461 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
23462 auto *I = cast<Instruction>(V);
23463 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23464 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23465 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
23466 SimplifyQuery(*DL));
23467 });
23468 };
23469 return TryProcessInstruction(
23470 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23471 LShrChecker);
23472 }
23473 case Instruction::AShr: {
23474 // If this is a truncate of an arithmetic shr, we can truncate it to a
23475 // smaller ashr iff we know that all the bits from the sign bit of the
23476 // original type and the sign bit of the truncate type are similar.
23477 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23478 return all_of(E.Scalars, [&](Value *V) {
23479 if (isa<PoisonValue>(V))
23480 return true;
23481 auto *I = cast<Instruction>(V);
23482 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23483 unsigned ShiftedBits = OrigBitWidth - BitWidth;
23484 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23485 ShiftedBits <
23486 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23487 });
23488 };
23489 return TryProcessInstruction(
23490 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23491 AShrChecker);
23492 }
23493 case Instruction::UDiv:
23494 case Instruction::URem: {
23495 // UDiv and URem can be truncated if all the truncated bits are zero.
23496 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23497 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23498 return all_of(E.Scalars, [&](Value *V) {
23499 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23500 if (E.hasCopyableElements() && E.isCopyableElement(V))
23501 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23502 auto *I = cast<Instruction>(V);
23503 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
23504 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23505 });
23506 };
23507 return TryProcessInstruction(
23508 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
23509 }
23510
23511 // We can demote selects if we can demote their true and false values.
23512 case Instruction::Select: {
23513 return TryProcessInstruction(
23514 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
23515 }
23516
23517 // We can demote phis if we can demote all their incoming operands.
23518 case Instruction::PHI: {
23519 const unsigned NumOps = E.getNumOperands();
23520 SmallVector<const TreeEntry *> Ops(NumOps);
23521 transform(seq<unsigned>(0, NumOps), Ops.begin(),
23522 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
23523
23524 return TryProcessInstruction(BitWidth, Ops);
23525 }
23526
23527 case Instruction::Call: {
23528 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
23529 if (!IC)
23530 break;
23531 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
23532 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
23533 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
23534 break;
23535 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
23536 function_ref<bool(unsigned, unsigned)> CallChecker;
23537 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23538 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23539 return all_of(E.Scalars, [&](Value *V) {
23540 auto *I = cast<Instruction>(V);
23541 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
23542 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23543 return MaskedValueIsZero(I->getOperand(0), Mask,
23544 SimplifyQuery(*DL)) &&
23545 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23546 }
23547 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
23548 "Expected min/max intrinsics only.");
23549 unsigned SignBits = OrigBitWidth - BitWidth;
23550 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23551 unsigned Op0SignBits =
23552 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23553 unsigned Op1SignBits =
23554 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
23555 return SignBits <= Op0SignBits &&
23556 ((SignBits != Op0SignBits &&
23557 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23558 MaskedValueIsZero(I->getOperand(0), Mask,
23559 SimplifyQuery(*DL))) &&
23560 SignBits <= Op1SignBits &&
23561 ((SignBits != Op1SignBits &&
23562 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
23563 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
23564 });
23565 };
23566 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23567 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23568 return all_of(E.Scalars, [&](Value *V) {
23569 auto *I = cast<Instruction>(V);
23570 unsigned SignBits = OrigBitWidth - BitWidth;
23571 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23572 unsigned Op0SignBits =
23573 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23574 return SignBits <= Op0SignBits &&
23575 ((SignBits != Op0SignBits &&
23576 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23577 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
23578 });
23579 };
23580 if (ID != Intrinsic::abs) {
23581 Operands.push_back(getOperandEntry(&E, 1));
23582 CallChecker = CompChecker;
23583 } else {
23584 CallChecker = AbsChecker;
23585 }
23586 InstructionCost BestCost =
23587 std::numeric_limits<InstructionCost::CostType>::max();
23588 unsigned BestBitWidth = BitWidth;
23589 unsigned VF = E.Scalars.size();
23590 // Choose the best bitwidth based on cost estimations.
23591 auto Checker = [&](unsigned BitWidth, unsigned) {
23592 unsigned MinBW = PowerOf2Ceil(BitWidth);
23593 SmallVector<Type *> ArgTys =
23594 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
23595 auto VecCallCosts = getVectorCallCosts(
23596 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
23597 TTI, TLI, ArgTys);
23598 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
23599 if (Cost < BestCost) {
23600 BestCost = Cost;
23601 BestBitWidth = BitWidth;
23602 }
23603 return false;
23604 };
23605 [[maybe_unused]] bool NeedToExit;
23606 (void)AttemptCheckBitwidth(Checker, NeedToExit);
23607 BitWidth = BestBitWidth;
23608 return TryProcessInstruction(BitWidth, Operands, CallChecker);
23609 }
23610
23611 // Otherwise, conservatively give up.
23612 default:
23613 break;
23614 }
23615 MaxDepthLevel = 1;
23616 return FinalAnalysis();
23617}
23618
23619static RecurKind getRdxKind(Value *V);
23620
23621 void BoUpSLP::computeMinimumValueSizes() {
23622 // We only attempt to truncate integer expressions.
23623 bool IsStoreOrInsertElt =
23624 VectorizableTree.front()->hasState() &&
23625 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
23626 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
23627 if ((IsStoreOrInsertElt || UserIgnoreList) &&
23628 ExtraBitWidthNodes.size() <= 1 &&
23629 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
23630 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
23631 return;
23632
23633 unsigned NodeIdx = 0;
23634 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
23635 NodeIdx = 1;
23636
23637 // Ensure the roots of the vectorizable tree don't form a cycle.
23638 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
23639 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
23640 "Unexpected tree is graph.");
23641
23642 // If the first value node for a store/insertelement is a sext/zext/trunc,
23643 // skip it and resize to the final type.
23644 bool IsTruncRoot = false;
23645 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
23646 SmallVector<unsigned> RootDemotes;
23647 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
23648 if (NodeIdx != 0 &&
23649 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23650 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23651 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
23652 IsTruncRoot = true;
23653 RootDemotes.push_back(NodeIdx);
23654 IsProfitableToDemoteRoot = true;
23655 ++NodeIdx;
23656 }
23657
23658 // Analyzed the reduction already and not profitable - exit.
23659 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
23660 return;
23661
23662 SmallVector<unsigned> ToDemote;
23663 auto ComputeMaxBitWidth =
23664 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
23665 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
23666 ToDemote.clear();
23667 // Check if the root is trunc and the next node is gather/buildvector, then
23668 // keep trunc in scalars, which is free in most cases.
23669 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
23670 !NodesToKeepBWs.contains(E.Idx) &&
23671 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
23672 all_of(E.Scalars, [&](Value *V) {
23673 return V->hasOneUse() || isa<Constant>(V) ||
23674 (!V->hasNUsesOrMore(UsesLimit) &&
23675 none_of(V->users(), [&](User *U) {
23676 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
23677 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23678 if (TEs.empty() || is_contained(TEs, UserTE))
23679 return false;
23680 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23681 SelectInst>(U) ||
23682 isa<SIToFPInst, UIToFPInst>(U) ||
23683 (UserTE->hasState() &&
23684 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23685 SelectInst>(UserTE->getMainOp()) ||
23686 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
23687 return true;
23688 unsigned UserTESz = DL->getTypeSizeInBits(
23689 UserTE->Scalars.front()->getType());
23690 if (all_of(TEs, [&](const TreeEntry *TE) {
23691 auto It = MinBWs.find(TE);
23692 return It != MinBWs.end() &&
23693 It->second.first > UserTESz;
23694 }))
23695 return true;
23696 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
23697 }));
23698 })) {
23699 ToDemote.push_back(E.Idx);
23700 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23701 auto It = MinBWs.find(UserTE);
23702 if (It != MinBWs.end())
23703 return It->second.first;
23704 unsigned MaxBitWidth =
23705 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
23706 MaxBitWidth = bit_ceil(MaxBitWidth);
23707 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23708 MaxBitWidth = 8;
23709 return MaxBitWidth;
23710 }
23711
23712 if (!E.hasState())
23713 return 0u;
23714
23715 unsigned VF = E.getVectorFactor();
23716 Type *ScalarTy = E.Scalars.front()->getType();
23717 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
23718 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
23719 if (!TreeRootIT)
23720 return 0u;
23721
23722 if (any_of(E.Scalars,
23723 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
23724 return 0u;
23725
23726 unsigned NumParts = ::getNumberOfParts(
23727 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
23728
23729 // The maximum bit width required to represent all the values that can be
23730 // demoted without loss of precision. It would be safe to truncate the roots
23731 // of the expression to this width.
23732 unsigned MaxBitWidth = 1u;
23733
23734 // True if the roots can be zero-extended back to their original type,
23735 // rather than sign-extended. We know that if the leading bits are not
23736 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
23737 // True.
23738 // Determine if the sign bit of all the roots is known to be zero. If not,
23739 // IsKnownPositive is set to False.
23740 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
23741 if (isa<PoisonValue>(R))
23742 return true;
23743 KnownBits Known = computeKnownBits(R, *DL);
23744 return Known.isNonNegative();
23745 });
23746
23747 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
23748 E.UserTreeIndex.UserTE->hasState() &&
23749 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
23750 MaxBitWidth =
23751 std::min(DL->getTypeSizeInBits(
23752 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
23753 DL->getTypeSizeInBits(ScalarTy));
23754
23755 // We first check if all the bits of the roots are demanded. If they're not,
23756 // we can truncate the roots to this narrower type.
23757 for (Value *Root : E.Scalars) {
23758 if (isa<PoisonValue>(Root))
23759 continue;
23760 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
23761 TypeSize NumTypeBits =
23762 DL->getTypeSizeInBits(Root->getType()->getScalarType());
23763 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23764 // If we can't prove that the sign bit is zero, we must add one to the
23765 // maximum bit width to account for the unknown sign bit. This preserves
23766 // the existing sign bit so we can safely sign-extend the root back to the
23767 // original type. Otherwise, if we know the sign bit is zero, we will
23768 // zero-extend the root instead.
23769 //
23770 // FIXME: This is somewhat suboptimal, as there will be cases where adding
23771 // one to the maximum bit width will yield a larger-than-necessary
23772 // type. In general, we need to add an extra bit only if we can't
23773 // prove that the upper bit of the original type is equal to the
23774 // upper bit of the proposed smaller type. If these two bits are
23775 // the same (either zero or one) we know that sign-extending from
23776 // the smaller type will result in the same value. Here, since we
23777 // can't yet prove this, we are just making the proposed smaller
23778 // type larger to ensure correctness.
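// For example, an i32 root with 24 known sign bits gives
// BitWidth1 = 32 - 24 = 8; if the value is not known to be non-negative, one
// extra bit is added and the proposed width becomes 9 (rounded up to a power
// of two later).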
23779 if (!IsKnownPositive)
23780 ++BitWidth1;
23781
23782 auto *I = dyn_cast<Instruction>(Root);
23783 if (!I) {
23784 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
23785 continue;
23786 }
23787 APInt Mask = DB->getDemandedBits(I);
23788 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23789 MaxBitWidth =
23790 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
23791 }
23792
23793 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23794 MaxBitWidth = 8;
23795
23796 // If the original type is large but the reduced type does not improve
23797 // register usage, ignore it.
23798 if (NumParts > 1 &&
23799 NumParts ==
23800 ::getNumberOfParts(
23801 *TTI, getWidenedType(IntegerType::get(F->getContext(),
23802 bit_ceil(MaxBitWidth)),
23803 VF)))
23804 return 0u;
23805
23806 unsigned Opcode = E.getOpcode();
23807 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
23808 Opcode == Instruction::SExt ||
23809 Opcode == Instruction::ZExt || NumParts > 1;
23810 // Conservatively determine if we can actually truncate the roots of the
23811 // expression. Collect the values that can be demoted in ToDemote and
23812 // additional roots that require investigating in Roots.
23813 DenseSet<const TreeEntry *> Visited;
23814 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
23815 bool NeedToDemote = IsProfitableToDemote;
23816
23817 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
23818 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
23819 NeedToDemote, IsTruncRoot) ||
23820 (MaxDepthLevel <= Limit &&
23821 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
23822 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
23823 DL->getTypeSizeInBits(TreeRootIT) /
23824 DL->getTypeSizeInBits(
23825 E.getMainOp()->getOperand(0)->getType()) >
23826 2)))))
23827 return 0u;
23828 // Round MaxBitWidth up to the next power-of-two.
23829 MaxBitWidth = bit_ceil(MaxBitWidth);
23830
23831 return MaxBitWidth;
23832 };
23833
23834 // If we can truncate the root, we must collect additional values that might
23835 // be demoted as a result. That is, those seeded by truncations we will
23836 // modify.
23837 // Add reduction ops sizes, if any.
23838 if (UserIgnoreList &&
23839 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
23840 // Convert vector_reduce_add(ZExt(<n x i1>)) to
23841 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
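// E.g. (schematically):
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// becomes a population count of the mask bits:
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %c to i32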
23842 if (all_of(*UserIgnoreList,
23843 [](Value *V) {
23844 return isa<PoisonValue>(V) ||
23845 cast<Instruction>(V)->getOpcode() == Instruction::Add;
23846 }) &&
23847 VectorizableTree.front()->State == TreeEntry::Vectorize &&
23848 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
23849 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
23850 Builder.getInt1Ty()) {
23851 ReductionBitWidth = 1;
23852 } else {
23853 for (Value *V : *UserIgnoreList) {
23854 if (isa<PoisonValue>(V))
23855 continue;
23856 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
23857 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
23858 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23859 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
23860 ++BitWidth1;
23861 unsigned BitWidth2 = BitWidth1;
23862 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
23863 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
23864 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23865 }
23866 ReductionBitWidth =
23867 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
23868 }
23869 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
23870 ReductionBitWidth = 8;
23871
23872 ReductionBitWidth = bit_ceil(ReductionBitWidth);
23873 }
23874 }
23875 bool IsTopRoot = NodeIdx == 0;
23876 while (NodeIdx < VectorizableTree.size() &&
23877 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23878 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23879 RootDemotes.push_back(NodeIdx);
23880 ++NodeIdx;
23881 IsTruncRoot = true;
23882 }
23883 bool IsSignedCmp = false;
23884 if (UserIgnoreList &&
23885 all_of(*UserIgnoreList,
23887 m_SMax(m_Value(), m_Value())))))
23888 IsSignedCmp = true;
23889 while (NodeIdx < VectorizableTree.size()) {
23890 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
23891 unsigned Limit = 2;
23892 if (IsTopRoot &&
23893 ReductionBitWidth ==
23894 DL->getTypeSizeInBits(
23895 VectorizableTree.front()->Scalars.front()->getType()))
23896 Limit = 3;
23897 unsigned MaxBitWidth = ComputeMaxBitWidth(
23898 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23899 IsTruncRoot, IsSignedCmp);
23900 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23901 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23902 ReductionBitWidth = bit_ceil(MaxBitWidth);
23903 else if (MaxBitWidth == 0)
23904 ReductionBitWidth = 0;
23905 }
23906
23907 for (unsigned Idx : RootDemotes) {
23908 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
23909 uint32_t OrigBitWidth =
23910 DL->getTypeSizeInBits(V->getType()->getScalarType());
23911 if (OrigBitWidth > MaxBitWidth) {
23912 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
23913 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23914 }
23915 return false;
23916 }))
23917 ToDemote.push_back(Idx);
23918 }
23919 RootDemotes.clear();
23920 IsTopRoot = false;
23921 IsProfitableToDemoteRoot = true;
23922
23923 if (ExtraBitWidthNodes.empty()) {
23924 NodeIdx = VectorizableTree.size();
23925 } else {
23926 unsigned NewIdx = 0;
23927 do {
23928 NewIdx = *ExtraBitWidthNodes.begin();
23929 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
23930 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23931 NodeIdx = NewIdx;
23932 IsTruncRoot =
23933 NodeIdx < VectorizableTree.size() &&
23934 VectorizableTree[NodeIdx]->UserTreeIndex &&
23935 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23936 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23937 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23938 Instruction::Trunc &&
23939 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23940 IsSignedCmp =
23941 NodeIdx < VectorizableTree.size() &&
23942 VectorizableTree[NodeIdx]->UserTreeIndex &&
23943 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23944 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23945 Instruction::ICmp &&
23946 any_of(
23947 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23948 [&](Value *V) {
23949 auto *IC = dyn_cast<ICmpInst>(V);
23950 return IC && (IC->isSigned() ||
23951 !isKnownNonNegative(IC->getOperand(0),
23952 SimplifyQuery(*DL)) ||
23953 !isKnownNonNegative(IC->getOperand(1),
23954 SimplifyQuery(*DL)));
23955 });
23956 }
23957
23958 // If the maximum bit width we compute is less than the width of the roots'
23959 // type, we can proceed with the narrowing. Otherwise, do nothing.
23960 if (MaxBitWidth == 0 ||
23961 MaxBitWidth >=
23962 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
23963 ->getBitWidth()) {
23964 if (UserIgnoreList)
23965 AnalyzedMinBWVals.insert_range(TreeRoot);
23966 NodesToKeepBWs.insert_range(ToDemote);
23967 continue;
23968 }
23969
23970 // Finally, map the values we can demote to the maximum bit width we
23971 // computed.
23972 for (unsigned Idx : ToDemote) {
23973 TreeEntry *TE = VectorizableTree[Idx].get();
23974 if (MinBWs.contains(TE))
23975 continue;
23976 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
23977 if (isa<PoisonValue>(R))
23978 return false;
23979 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23980 });
23981 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
23982 }
23983 }
23984}
23985
23986 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
23987 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
23988 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
23990 auto *AA = &AM.getResult<AAManager>(F);
23991 auto *LI = &AM.getResult<LoopAnalysis>(F);
23992 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
23993 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
23994 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
23995 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
23996
23997 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
23998 if (!Changed)
23999 return PreservedAnalyses::all();
24000
24001 PreservedAnalyses PA;
24002 PA.preserveSet<CFGAnalyses>();
24003 return PA;
24004}
24005
24006 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
24007 TargetTransformInfo *TTI_,
24008 TargetLibraryInfo *TLI_, AAResults *AA_,
24009 LoopInfo *LI_, DominatorTree *DT_,
24010 AssumptionCache *AC_, DemandedBits *DB_,
24011 OptimizationRemarkEmitter *ORE_) {
24012 if (!RunSLPVectorization)
24013 return false;
24014 SE = SE_;
24015 TTI = TTI_;
24016 TLI = TLI_;
24017 AA = AA_;
24018 LI = LI_;
24019 DT = DT_;
24020 AC = AC_;
24021 DB = DB_;
24022 DL = &F.getDataLayout();
24023
24024 Stores.clear();
24025 GEPs.clear();
24026 bool Changed = false;
24027
24028 // If the target claims to have no vector registers don't attempt
24029 // vectorization.
24030 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
24031 LLVM_DEBUG(
24032 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
24033 return false;
24034 }
24035
24036 // Don't vectorize when the attribute NoImplicitFloat is used.
24037 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
24038 return false;
24039
24040 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
24041
24042 // Use the bottom up slp vectorizer to construct chains that start with
24043 // store instructions.
24044 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
24045
24046 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
24047 // delete instructions.
24048
24049 // Update DFS numbers now so that we can use them for ordering.
24050 DT->updateDFSNumbers();
24051
24052 // Scan the blocks in the function in post order.
24053 for (auto *BB : post_order(&F.getEntryBlock())) {
24054 if (!DT->isReachableFromEntry(BB))
24055 continue;
24056
24057 // Start new block - clear the list of reduction roots.
24058 R.clearReductionData();
24059 collectSeedInstructions(BB);
24060
24061 // Vectorize trees that end at stores.
24062 if (!Stores.empty()) {
24063 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
24064 << " underlying objects.\n");
24065 Changed |= vectorizeStoreChains(R);
24066 }
24067
24068 // Vectorize trees that end at reductions.
24069 Changed |= vectorizeChainsInBlock(BB, R);
24070
24071 // Vectorize the index computations of getelementptr instructions. This
24072 // is primarily intended to catch gather-like idioms ending at
24073 // non-consecutive loads.
24074 if (!GEPs.empty()) {
24075 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
24076 << " underlying objects.\n");
24077 Changed |= vectorizeGEPIndices(BB, R);
24078 }
24079 }
24080
24081 if (Changed) {
24082 R.optimizeGatherSequence();
24083 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
24084 }
24085 return Changed;
24086}
24087
24088std::optional<bool>
24089SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
24090 unsigned Idx, unsigned MinVF,
24091 unsigned &Size) {
24092 Size = 0;
24093 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
24094 << "\n");
24095 const unsigned Sz = R.getVectorElementSize(Chain[0]);
24096 unsigned VF = Chain.size();
24097
24098 if (!has_single_bit(Sz) ||
24099 !hasFullVectorsOrPowerOf2(
24100 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
24101 VF) ||
24102 VF < 2 || VF < MinVF) {
24103 // Check if vectorizing with a non-power-of-2 VF should be considered. At
24104 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
24105 // all vector lanes are used.
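// For example, with VectorizeNonPowerOf2 enabled, a chain of 3 stores
// (VF == 3) with MinVF == 4 is still analyzed because VF + 1 == MinVF, while
// VF == 3 with MinVF == 8 is rejected.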
24106 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
24107 return false;
24108 }
24109
24110 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
24111 << "\n");
24112
24113 SetVector<Value *> ValOps;
24114 for (Value *V : Chain)
24115 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
24116 // Exit if operands do not share the same/alternate opcode or the number of unique values is not a power of 2.
24117 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
24118 InstructionsState S = Analysis.buildInstructionsState(
24119 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
24120 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
24121 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
24122 bool IsAllowedSize =
24123 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
24124 ValOps.size()) ||
24125 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
24126 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
24127 (!S.getMainOp()->isSafeToRemove() ||
24128 any_of(ValOps.getArrayRef(),
24129 [&](Value *V) {
24130 return !isa<ExtractElementInst>(V) &&
24131 (V->getNumUses() > Chain.size() ||
24132 any_of(V->users(), [&](User *U) {
24133 return !Stores.contains(U);
24134 }));
24135 }))) ||
24136 (ValOps.size() > Chain.size() / 2 && !S)) {
24137 Size = (!IsAllowedSize && S) ? 1 : 2;
24138 return false;
24139 }
24140 }
24141 if (R.isLoadCombineCandidate(Chain))
24142 return true;
24143 R.buildTree(Chain);
24144 // Check if the tree is tiny and the store itself or its value was not vectorized.
24145 if (R.isTreeTinyAndNotFullyVectorizable()) {
24146 if (R.isGathered(Chain.front()) ||
24147 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
24148 return std::nullopt;
24149 Size = R.getCanonicalGraphSize();
24150 return false;
24151 }
24152 if (R.isProfitableToReorder()) {
24153 R.reorderTopToBottom();
24154 R.reorderBottomToTop();
24155 }
24156 R.transformNodes();
24157 R.computeMinimumValueSizes();
24158
24159 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
24160 R.buildExternalUses();
24161
24162 Size = R.getCanonicalGraphSize();
24163 if (S && S.getOpcode() == Instruction::Load)
24164 Size = 2; // cut off masked gather small trees
24165 InstructionCost Cost = R.getTreeCost(TreeCost);
24166
24167 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
24168 if (Cost < -SLPCostThreshold) {
24169 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
24170
24171 using namespace ore;
24172
24173 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
24174 cast<StoreInst>(Chain[0]))
24175 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
24176 << " and with tree size "
24177 << NV("TreeSize", R.getTreeSize()));
24178
24179 R.vectorizeTree();
24180 return true;
24181 }
24182
24183 return false;
24184}
24185
24186 /// Checks if the quadratic mean deviation of the tree sizes is small relative to the mean size (i.e. Dev * 96 < Mean * Mean).
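/// For example, for the sizes {4, 4, 6} (entries equal to 1 are ignored):
/// Mean = 14 / 3 = 4, Dev = ((4-4)^2 + (4-4)^2 + (6-4)^2) / 3 = 1, and
/// Dev * 96 / (Mean * Mean) = 96 / 16 = 6 != 0, so the sizes are considered
/// too spread out; for {4, 4, 4} the deviation is 0 and the check succeeds.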
24187static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes) {
24188 unsigned Num = 0;
24189 uint64_t Sum = std::accumulate(
24190 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
24191 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24192 unsigned Size = Val.first;
24193 if (Size == 1)
24194 return V;
24195 ++Num;
24196 return V + Size;
24197 });
24198 if (Num == 0)
24199 return true;
24200 uint64_t Mean = Sum / Num;
24201 if (Mean == 0)
24202 return true;
24203 uint64_t Dev = std::accumulate(
24204 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
24205 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24206 unsigned P = Val.first;
24207 if (P == 1)
24208 return V;
24209 return V + (P - Mean) * (P - Mean);
24210 }) /
24211 Num;
24212 return Dev * 96 / (Mean * Mean) == 0;
24213}
24214
24215namespace {
24216
24217/// A group of stores that we'll try to bundle together using vector ops.
24218/// They are ordered using the signed distance of their address operand to the
24219/// address of this group's BaseInstr.
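/// For example, for i32 stores to %p, %p+1 and %p+2 (in element units), with
/// the store to %p+1 as the BaseInstr, the group maps the distances
/// {-1, 0, 1} to the indices of those stores in AllStores.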
24220class RelatedStoreInsts {
24221public:
24222 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
24223 : AllStores(AllStores) {
24224 reset(BaseInstrIdx);
24225 }
24226
24227 void reset(unsigned NewBaseInstr) {
24228 assert(NewBaseInstr < AllStores.size() &&
24229 "Instruction index out of bounds");
24230 BaseInstrIdx = NewBaseInstr;
24231 Instrs.clear();
24232 insertOrLookup(NewBaseInstr, 0);
24233 }
24234
24235 /// Tries to insert \p InstrIdx as the store with a pointer distance of
24236 /// \p PtrDist.
24237 /// Does nothing if there is already a store with that \p PtrDist.
24238 /// \returns The previously associated Instruction index, or std::nullopt
24239 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
24240 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
24241 return Inserted ? std::nullopt : std::make_optional(It->second);
24242 }
24243
24244 using DistToInstMap = std::map<int64_t, unsigned>;
24245 const DistToInstMap &getStores() const { return Instrs; }
24246
24247 /// If \p SI is related to this group of stores, return the distance of its
24248 /// pointer operand to that of the group's BaseInstr.
24249 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
24250 ScalarEvolution &SE) const {
24251 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
24252 return getPointersDiff(
24253 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
24254 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
24255 /*StrictCheck=*/true);
24256 }
24257
24258 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
24259 /// Stores whose index is less than \p MinSafeIdx will be dropped.
24260 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
24261 int64_t DistFromCurBase) {
24262 DistToInstMap PrevSet = std::move(Instrs);
24263 reset(NewBaseInstIdx);
24264
24265 // Re-insert stores that come after MinSafeIdx to try and vectorize them
24266 // again. Their distance will be "rebased" to use NewBaseInstIdx as
24267 // reference.
24268 for (auto [Dist, InstIdx] : PrevSet) {
24269 if (InstIdx >= MinSafeIdx)
24270 insertOrLookup(InstIdx, Dist - DistFromCurBase);
24271 }
24272 }
24273
24274 /// Remove all stores that have been vectorized from this group.
24275 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
24276 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
24277 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
24278 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
24279 });
24280
24281 // Get a forward iterator pointing after the last vectorized store and erase
24282 // all stores before it so we don't try to vectorize them again.
24283 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
24284 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
24285 }
24286
24287private:
24288 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
24289 unsigned BaseInstrIdx;
24290
24291 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
24292 DistToInstMap Instrs;
24293
24294 /// Reference to all the stores in the BB being analyzed.
24295 ArrayRef<StoreInst *> AllStores;
24296};
24297
24298} // end anonymous namespace
24299
24300bool SLPVectorizerPass::vectorizeStores(
24301 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
24302 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
24303 &Visited) {
24304 // We may run into multiple chains that merge into a single chain. We mark the
24305 // stores that we vectorized so that we don't visit the same store twice.
24306 BoUpSLP::ValueSet VectorizedStores;
24307 bool Changed = false;
24308
24309 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
24310 int64_t PrevDist = -1;
24311 BoUpSLP::ValueList Operands;
24312 // Collect the chain into a list.
24313 for (auto [Idx, Data] : enumerate(StoreSeq)) {
24314 auto &[Dist, InstIdx] = Data;
24315 if (Operands.empty() || Dist - PrevDist == 1) {
24316 Operands.push_back(Stores[InstIdx]);
24317 PrevDist = Dist;
24318 if (Idx != StoreSeq.size() - 1)
24319 continue;
24320 }
24321 llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
24322 Operands.clear();
24323 Operands.push_back(Stores[InstIdx]);
24324 PrevDist = Dist;
24325 });
24326
24327 if (Operands.size() <= 1 ||
24328 !Visited
24329 .insert({Operands.front(),
24330 cast<StoreInst>(Operands.front())->getValueOperand(),
24331 Operands.back(),
24332 cast<StoreInst>(Operands.back())->getValueOperand(),
24333 Operands.size()})
24334 .second)
24335 continue;
24336
24337 unsigned MaxVecRegSize = R.getMaxVecRegSize();
24338 unsigned EltSize = R.getVectorElementSize(Operands[0]);
24339 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
24340
24341 unsigned MaxVF =
24342 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
24343 auto *Store = cast<StoreInst>(Operands[0]);
24344 Type *StoreTy = Store->getValueOperand()->getType();
24345 Type *ValueTy = StoreTy;
24346 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
24347 ValueTy = Trunc->getSrcTy();
24348 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType, but
24349 // getStoreMinimumVF only supports scalar types as arguments. As a result,
24350 // we need to use the element types of StoreTy and ValueTy to retrieve the
24351 // VF and then transform it back.
24352 // Remember: VF is defined as the number of values we want to vectorize, not
24353 // the number of elements in the final vector.
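// For example (illustrative numbers): if StoreTy is <4 x i8> under REVEC and
// getStoreMinimumVF returns 8 for the scalar i8 type, then
// MinVF = max(2, 8 / 4) = 2, i.e. at least two <4 x i8> stores are needed to
// form a chain.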
24354 Type *StoreScalarTy = StoreTy->getScalarType();
24355 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
24356 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
24357 ValueTy->getScalarType()));
24358 MinVF /= getNumElements(StoreTy);
24359 MinVF = std::max<unsigned>(2, MinVF);
24360
24361 if (MaxVF < MinVF) {
24362 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
24363 << ") < "
24364 << "MinVF (" << MinVF << ")\n");
24365 continue;
24366 }
24367
24368 unsigned NonPowerOf2VF = 0;
24369 if (VectorizeNonPowerOf2) {
24370 // First try vectorizing with a non-power-of-2 VF. At the moment, only
24371 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
24372 // lanes are used.
24373 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
24374 if (has_single_bit(CandVF + 1)) {
24375 NonPowerOf2VF = CandVF;
24376 assert(NonPowerOf2VF != MaxVF &&
24377 "Non-power-of-2 VF should not be equal to MaxVF");
24378 }
24379 }
24380
24381 // MaxRegVF represents the number of instructions (scalar, or vector in
24382 // case of revec) that can be vectorized to naturally fit in a vector
24383 // register.
24384 unsigned MaxRegVF = MaxVF;
24385
24386 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
24387 if (MaxVF < MinVF) {
24388 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
24389 << ") < "
24390 << "MinVF (" << MinVF << ")\n");
24391 continue;
24392 }
24393
24394 SmallVector<unsigned> CandidateVFs;
24395 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
24396 VF = divideCeil(VF, 2))
24397 CandidateVFs.push_back(VF);
24398
24399 unsigned End = Operands.size();
24400 unsigned Repeat = 0;
24401 constexpr unsigned MaxAttempts = 4;
24402 // first: the best TreeSize seen in all prior passes over CandidateVFs;
24403 // it is updated after each pass over CandidateVFs.
24404 // second: the best TreeSize seen so far, including the current pass
24405 // over CandidateVFs.
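// E.g. RangeSizes[I] == {0, 0} marks store I as already vectorized, while
// {3, 5} roughly means the largest tree size recorded for store I in earlier
// passes was 3 and the current pass has already seen a tree of size 5.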
24406 SmallVector<std::pair<unsigned, unsigned>> RangeSizesStorage(
24407 Operands.size(), {1, 1});
24408 // The `slice` and `drop_front` interfaces are convenient
24409 const auto RangeSizes = MutableArrayRef(RangeSizesStorage);
24410 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
24411 auto IsNotVectorized = [](const std::pair<unsigned, unsigned> &P) {
24412 return P.first > 0;
24413 };
24414 auto IsVectorized = [](const std::pair<unsigned, unsigned> &P) {
24415 return P.first == 0;
24416 };
24417 auto VFIsProfitable = [](unsigned Size,
24418 const std::pair<unsigned, unsigned> &P) {
24419 return Size >= P.first;
24420 };
24421 auto FirstSizeSame = [](unsigned Size,
24422 const std::pair<unsigned, unsigned> &P) {
24423 return Size == P.first;
24424 };
24425 while (true) {
24426 ++Repeat;
24427 bool RepeatChanged = false;
24428 bool AnyProfitableGraph = false;
24429 for (unsigned VF : CandidateVFs) {
24430 AnyProfitableGraph = false;
24431 unsigned FirstUnvecStore = std::distance(
24432 RangeSizes.begin(), find_if(RangeSizes, IsNotVectorized));
24433
24434 // Form slices of size VF starting from FirstUnvecStore and try to
24435 // vectorize them.
24436 while (FirstUnvecStore < End) {
24437 unsigned FirstVecStore = std::distance(
24438 RangeSizes.begin(),
24439 find_if(RangeSizes.drop_front(FirstUnvecStore), IsVectorized));
24440 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
24441 for (unsigned SliceStartIdx = FirstUnvecStore;
24442 SliceStartIdx + VF <= MaxSliceEnd;) {
24443 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF))) {
24444 ++SliceStartIdx;
24445 continue;
24446 }
24447 ArrayRef<Value *> Slice =
24448 ArrayRef(Operands).slice(SliceStartIdx, VF);
24449 assert(all_of(Slice,
24450 [&](Value *V) {
24451 return cast<StoreInst>(V)
24452 ->getValueOperand()
24453 ->getType() ==
24454 cast<StoreInst>(Slice.front())
24455 ->getValueOperand()
24456 ->getType();
24457 }) &&
24458 "Expected all operands of same type.");
24459 if (!NonSchedulable.empty()) {
24460 auto [NonSchedSizeMax, NonSchedSizeMin] =
24461 NonSchedulable.lookup(Slice.front());
24462 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24463 // VF is too ambitious. Try to vectorize another slice before
24464 // trying a smaller VF.
24465 SliceStartIdx += NonSchedSizeMax;
24466 continue;
24467 }
24468 }
24469 unsigned TreeSize;
24470 std::optional<bool> Res =
24471 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
24472 if (!Res) {
24473 // Update the range of non schedulable VFs for slices starting
24474 // at SliceStartIdx.
24475 NonSchedulable
24476 .try_emplace(Slice.front(), std::make_pair(VF, VF))
24477 .first->getSecond()
24478 .second = VF;
24479 } else if (*Res) {
24480 // Mark the vectorized stores so that we don't vectorize them
24481 // again.
24482 VectorizedStores.insert_range(Slice);
24485 AnyProfitableGraph = RepeatChanged = Changed = true;
24486 // If we vectorized initial block, no need to try to vectorize
24487 // it again.
24488 for (std::pair<unsigned, unsigned> &P :
24489 RangeSizes.slice(SliceStartIdx, VF))
24490 P.first = P.second = 0;
24491 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24492 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
24493 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
24494 P.first = P.second = 0;
24495 FirstUnvecStore = SliceStartIdx + VF;
24496 }
24497 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24498 for (std::pair<unsigned, unsigned> &P :
24499 RangeSizes.slice(SliceStartIdx + VF,
24500 MaxSliceEnd - (SliceStartIdx + VF)))
24501 P.first = P.second = 0;
24502 if (MaxSliceEnd == End)
24503 End = SliceStartIdx;
24504 MaxSliceEnd = SliceStartIdx;
24505 }
24506 SliceStartIdx += VF;
24507 continue;
24508 }
24509 if (VF > 2 && Res &&
24510 !all_of(RangeSizes.slice(SliceStartIdx, VF),
24511 std::bind(VFIsProfitable, TreeSize, _1))) {
24512 SliceStartIdx += VF;
24513 continue;
24514 }
24515 // For very big VFs, check that we are not rebuilding the same trees,
24516 // just with a larger number of elements.
24517 if (VF > MaxRegVF && TreeSize > 1 &&
24518 all_of(RangeSizes.slice(SliceStartIdx, VF),
24519 std::bind(FirstSizeSame, TreeSize, _1))) {
24520 SliceStartIdx += VF;
24521 while (SliceStartIdx != MaxSliceEnd &&
24522 RangeSizes[SliceStartIdx].first == TreeSize)
24523 ++SliceStartIdx;
24524 continue;
24525 }
24526 if (TreeSize > 1)
24527 for (std::pair<unsigned, unsigned> &P :
24528 RangeSizes.slice(SliceStartIdx, VF))
24529 P.second = std::max(P.second, TreeSize);
24530 ++SliceStartIdx;
24531 AnyProfitableGraph = true;
24532 }
24533 if (FirstUnvecStore >= End)
24534 break;
24535 if (MaxSliceEnd - FirstUnvecStore < VF &&
24536 MaxSliceEnd - FirstUnvecStore >= MinVF)
24537 AnyProfitableGraph = true;
24538 FirstUnvecStore = std::distance(
24539 RangeSizes.begin(),
24540 find_if(RangeSizes.drop_front(MaxSliceEnd), IsNotVectorized));
24541 }
24542 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
24543 break;
24544 // For the MaxRegVF case, save RangeSizes to limit compile time
24545 if (VF == MaxRegVF)
24546 for (std::pair<unsigned, unsigned> &P : RangeSizes)
24547 if (P.first != 0)
24548 P.first = std::max(P.second, P.first);
24549 }
24550 // All values vectorized - exit.
24551 if (all_of(RangeSizes, IsVectorized))
24552 break;
24553 // Check if we have tried all attempts or there is no need for further attempts at all.
24554 if (Repeat >= MaxAttempts ||
24555 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24556 break;
24557 constexpr unsigned StoresLimit = 64;
24558 const unsigned MaxTotalNum = std::min<unsigned>(
24559 Operands.size(),
24560 static_cast<unsigned>(
24561 End -
24562 std::distance(RangeSizes.begin(),
24563 find_if(RangeSizes, IsNotVectorized)) +
24564 1));
24565 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
24566 unsigned Limit =
24567 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
24568 CandidateVFs.clear();
24569 if (bit_floor(Limit) == VF)
24570 CandidateVFs.push_back(Limit);
24571 if (VF > MaxTotalNum || VF >= StoresLimit)
24572 break;
24573 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
24574 if (P.first != 0)
24575 P.first = std::max(P.second, P.first);
24576 }
24577 // Last attempt to vectorize max number of elements, if all previous
24578 // attempts were unsuccessful because of the cost issues.
24579 CandidateVFs.push_back(VF);
24580 }
24581 }
24582 };
24583
24584 /// Groups of stores to vectorize
24585 SmallVector<RelatedStoreInsts> SortedStores;
24586
24587 // Inserts the specified store SI with the given index Idx into the set of
24588 // stores. If a store with the same distance has already been found, stop
24589 // inserting and try to vectorize the stores collected so far. If some stores
24590 // from this sequence were not vectorized, try to vectorize them together with
24591 // the new store later. This logic is applied only to the stores that come
24592 // before the previous store with the same distance.
24593 // Example:
24594 // 1. store x, %p
24595 // 2. store y, %p+1
24596 // 3. store z, %p+2
24597 // 4. store a, %p
24598 // 5. store b, %p+3
24599 // - Scan this from the last to first store. The very first bunch of stores is
24600 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
24601 // vector).
24602 // - The next store in the list - #1 - has the same distance from store #5 as
24603 // the store #4.
24604 // - Try to vectorize sequence of stores 4,2,3,5.
24605 // - If all these stores are vectorized - just drop them.
24606 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
24607 // - Start new stores sequence.
24608 // The new bunch of stores is {1, {1, 0}}.
24609 // - Add the stores from previous sequence, that were not vectorized.
24610 // Here we consider the stores in the reverse of the order in which they are
24611 // used in the IR (Stores is already reversed, see vectorizeStoreChains()).
24612 // Store #3 can be added -> comes after store #4 with the same distance as
24613 // store #1.
24614 // Store #5 cannot be added - comes before store #4.
24615 // This logic improves compile time: we assume that the stores coming after a
24616 // previous store with the same distance most likely have memory dependencies,
24617 // so there is no need to waste compile time trying to vectorize them.
24618 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
24619 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
24620 std::optional<int64_t> PtrDist;
24621 auto *RelatedStores = find_if(
24622 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
24623 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
24624 return PtrDist.has_value();
24625 });
24626
24627 // We did not find a comparable store, start a new group.
24628 if (RelatedStores == SortedStores.end()) {
24629 SortedStores.emplace_back(Idx, Stores);
24630 return;
24631 }
24632
24633 // If there is already a store in the group with the same PtrDiff, try to
24634 // vectorize the existing instructions before adding the current store.
24635 // Otherwise, insert this store and keep collecting.
24636 if (std::optional<unsigned> PrevInst =
24637 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
24638 TryToVectorize(RelatedStores->getStores());
24639 RelatedStores->clearVectorizedStores(VectorizedStores);
24640 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
24641 /*NewBaseInstIdx=*/Idx,
24642 /*DistFromCurBase=*/*PtrDist);
24643 }
24644 };
24645 Type *PrevValTy = nullptr;
24646 for (auto [I, SI] : enumerate(Stores)) {
24647 if (R.isDeleted(SI))
24648 continue;
24649 if (!PrevValTy)
24650 PrevValTy = SI->getValueOperand()->getType();
24651 // Check that we do not try to vectorize stores of different types.
24652 if (PrevValTy != SI->getValueOperand()->getType()) {
24653 for (RelatedStoreInsts &StoreSeq : SortedStores)
24654 TryToVectorize(StoreSeq.getStores());
24655 SortedStores.clear();
24656 PrevValTy = SI->getValueOperand()->getType();
24657 }
24658 FillStoresSet(I, SI);
24659 }
24660
24661 // Final vectorization attempt.
24662 for (RelatedStoreInsts &StoreSeq : SortedStores)
24663 TryToVectorize(StoreSeq.getStores());
24664
24665 return Changed;
24666}
24667
24668void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24669 // Initialize the collections. We will make a single pass over the block.
24670 Stores.clear();
24671 GEPs.clear();
24672
24673 // Visit the store and getelementptr instructions in BB and organize them in
24674 // Stores and GEPs according to the underlying objects of their pointer
24675 // operands.
24676 for (Instruction &I : *BB) {
24677 // Ignore store instructions that are volatile or have a pointer operand
24678 // that doesn't point to a scalar type.
24679 if (auto *SI = dyn_cast<StoreInst>(&I)) {
24680 if (!SI->isSimple())
24681 continue;
24682 if (!isValidElementType(SI->getValueOperand()->getType()))
24683 continue;
24684 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
24685 }
24686
24687 // Ignore getelementptr instructions that have more than one index, a
24688 // constant index, or a pointer operand that doesn't point to a scalar
24689 // type.
24690 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
24691 if (GEP->getNumIndices() != 1)
24692 continue;
24693 Value *Idx = GEP->idx_begin()->get();
24694 if (isa<Constant>(Idx))
24695 continue;
24696 if (!isValidElementType(Idx->getType()))
24697 continue;
24698 if (GEP->getType()->isVectorTy())
24699 continue;
24700 GEPs[GEP->getPointerOperand()].push_back(GEP);
24701 }
24702 }
24703}
24704
24705bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
24706 bool MaxVFOnly) {
24707 if (VL.size() < 2)
24708 return false;
24709
24710 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
24711 << VL.size() << ".\n");
24712
24713 // Check that all of the parts are instructions of the same type;
24714 // we permit an alternate opcode via InstructionsState.
24715 InstructionsState S = getSameOpcode(VL, *TLI);
24716 if (!S)
24717 return false;
24718
24719 Instruction *I0 = S.getMainOp();
24720 // Make sure invalid types (including vector type) are rejected before
24721 // determining vectorization factor for scalar instructions.
24722 for (Value *V : VL) {
24723 Type *Ty = V->getType();
24724 if (!isValidElementType(Ty)) {
24725 // NOTE: the following will give the user the internal LLVM type name,
24726 // which may not be useful.
24727 R.getORE()->emit([&]() {
24728 std::string TypeStr;
24729 llvm::raw_string_ostream OS(TypeStr);
24730 Ty->print(OS);
24731 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
24732 << "Cannot SLP vectorize list: type "
24733 << TypeStr + " is unsupported by vectorizer";
24734 });
24735 return false;
24736 }
24737 }
24738
24739 Type *ScalarTy = getValueType(VL[0]);
24740 unsigned Sz = R.getVectorElementSize(I0);
24741 unsigned MinVF = R.getMinVF(Sz);
24742 unsigned MaxVF = std::max<unsigned>(
24743 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
24744 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
24745 if (MaxVF < 2) {
24746 R.getORE()->emit([&]() {
24747 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
24748 << "Cannot SLP vectorize list: vectorization factor "
24749 << "less than 2 is not supported";
24750 });
24751 return false;
24752 }
24753
24754 bool Changed = false;
24755 bool CandidateFound = false;
24756 InstructionCost MinCost = SLPCostThreshold.getValue();
24757
24758 unsigned NextInst = 0, MaxInst = VL.size();
24759 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24760 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
24761 // No actual vectorization should happen if the number of parts is the same
24762 // as the provided vectorization factor (i.e. the scalar type is used for
24763 // vector code during codegen).
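// For example, if the target reports getNumberOfParts(<4 x i64>) == 4 (one
// 64-bit register per lane), VF == 4 is skipped because each lane would end
// up in its own register and no real vector code would be emitted.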
24764 auto *VecTy = getWidenedType(ScalarTy, VF);
24765 if (TTI->getNumberOfParts(VecTy) == VF)
24766 continue;
24767 for (unsigned I = NextInst; I < MaxInst; ++I) {
24768 unsigned ActualVF = std::min(MaxInst - I, VF);
24769
24770 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
24771 continue;
24772
24773 if (MaxVFOnly && ActualVF < MaxVF)
24774 break;
24775 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24776 break;
24777
24778 SmallVector<Value *> Ops(ActualVF, nullptr);
24779 unsigned Idx = 0;
24780 for (Value *V : VL.drop_front(I)) {
24781 // Check that a previous iteration of this loop did not delete the
24782 // Value.
24783 if (auto *Inst = dyn_cast<Instruction>(V);
24784 !Inst || !R.isDeleted(Inst)) {
24785 Ops[Idx] = V;
24786 ++Idx;
24787 if (Idx == ActualVF)
24788 break;
24789 }
24790 }
24791 // Not enough vectorizable instructions - exit.
24792 if (Idx != ActualVF)
24793 break;
24794
24795 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
24796 << "\n");
24797
24798 R.buildTree(Ops);
24799 if (R.isTreeTinyAndNotFullyVectorizable())
24800 continue;
24801 if (R.isProfitableToReorder()) {
24802 R.reorderTopToBottom();
24803 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
24804 }
24805 R.transformNodes();
24806 R.computeMinimumValueSizes();
24807 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
24808 R.buildExternalUses();
24809
24810 InstructionCost Cost = R.getTreeCost(TreeCost);
24811 CandidateFound = true;
24812 MinCost = std::min(MinCost, Cost);
24813
24814 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24815 << " for VF=" << ActualVF << "\n");
24816 if (Cost < -SLPCostThreshold) {
24817 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
24818 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
24820 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
24821 << " and with tree size "
24822 << ore::NV("TreeSize", R.getTreeSize()));
24823
24824 R.vectorizeTree();
24825 // Move to the next bundle.
24826 I += VF - 1;
24827 NextInst = I + 1;
24828 Changed = true;
24829 }
24830 }
24831 }
24832
24833 if (!Changed && CandidateFound) {
24834 R.getORE()->emit([&]() {
24835 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
24836 << "List vectorization was possible but not beneficial with cost "
24837 << ore::NV("Cost", MinCost) << " >= "
24838 << ore::NV("Threshold", -SLPCostThreshold);
24839 });
24840 } else if (!Changed) {
24841 R.getORE()->emit([&]() {
24842 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
24843 << "Cannot SLP vectorize list: vectorization was impossible"
24844 << " with available vectorization factors";
24845 });
24846 }
24847 return Changed;
24848}
24849
24850namespace {
24851
24852/// Model horizontal reductions.
24853///
24854/// A horizontal reduction is a tree of reduction instructions that has values
24855/// that can be put into a vector as its leaves. For example:
24856///
24857/// mul mul mul mul
24858/// \ / \ /
24859/// + +
24860/// \ /
24861/// +
24862/// This tree has "mul" as its leaf values and "+" as its reduction
24863/// instructions. A reduction can feed into a store or a binary operation
24864/// feeding a phi.
24865/// ...
24866/// \ /
24867/// +
24868/// |
24869/// phi +=
24870///
24871/// Or:
24872/// ...
24873/// \ /
24874/// +
24875/// |
24876/// *p =
24877///
24878class HorizontalReduction {
24879 using ReductionOpsType = SmallVector<Value *, 16>;
24880 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24881 ReductionOpsListType ReductionOps;
24882 /// List of possibly reduced values.
24884 /// Maps reduced value to the corresponding reduction operation.
24885 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24886 WeakTrackingVH ReductionRoot;
24887 /// The type of reduction operation.
24888 RecurKind RdxKind;
24889 /// Checks if the optimization of original scalar identity operations on
24890 /// matched horizontal reductions is enabled and allowed.
24891 bool IsSupportedHorRdxIdentityOp = false;
24892 /// The minimum number of the reduced values.
24893 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
24894 /// Contains vector values for reduction including their scale factor and
24895 /// signedness.
24897
24898 static bool isCmpSelMinMax(Instruction *I) {
24899 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
24900 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
24901 }
24902
24903 // And/or are potentially poison-safe logical patterns like:
24904 // select x, y, false
24905 // select x, true, y
24906 static bool isBoolLogicOp(Instruction *I) {
24907 return isa<SelectInst>(I) &&
24908 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
24909 }
24910
24911 /// Checks if instruction is associative and can be vectorized.
24912 static bool isVectorizable(RecurKind Kind, Instruction *I,
24913 bool TwoElementReduction = false) {
24914 if (Kind == RecurKind::None)
24915 return false;
24916
24917 // Integer ops that map to select instructions or intrinsics are fine.
24918 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
24919 isBoolLogicOp(I))
24920 return true;
24921
24922 // No need to check for associativity if there are only 2 reduced values.
24923 if (TwoElementReduction)
24924 return true;
24925
24926 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24927 // FP min/max are associative except for NaN and -0.0. We do not
24928 // have to rule out -0.0 here because the intrinsic semantics do not
24929 // specify a fixed result for it.
24930 return I->getFastMathFlags().noNaNs();
24931 }
24932
24933 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24934 return true;
24935
24936 return I->isAssociative();
24937 }
24938
24939 static Value *getRdxOperand(Instruction *I, unsigned Index) {
24940 // Poison-safe 'or' takes the form: select X, true, Y
24941 // To make that work with the normal operand processing, we skip the
24942 // true value operand.
24943 // TODO: Change the code and data structures to handle this without a hack.
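// E.g. for a poison-safe or of the form %r = select i1 %x, i1 true, i1 %y,
// getRdxOperand(%r, 1) returns %y (operand 2) rather than the constant true.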
24944 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
24945 return I->getOperand(2);
24946 return I->getOperand(Index);
24947 }
24948
24949 /// Creates reduction operation with the current opcode.
24950 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
24951 Value *RHS, const Twine &Name, bool UseSelect) {
24952 Type *OpTy = LHS->getType();
24953 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
24954 switch (Kind) {
24955 case RecurKind::Or: {
24956 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24957 return Builder.CreateSelectWithUnknownProfile(
24958 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
24959 RHS, DEBUG_TYPE, Name);
24960 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24961 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24962 Name);
24963 }
24964 case RecurKind::And: {
24965 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
24966 return Builder.CreateSelectWithUnknownProfile(
24967 LHS, RHS,
24968 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
24969 DEBUG_TYPE, Name);
24970 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24971 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24972 Name);
24973 }
24974 case RecurKind::Add:
24975 case RecurKind::Mul:
24976 case RecurKind::Xor:
24977 case RecurKind::FAdd:
24978 case RecurKind::FMul: {
24979 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24980 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24981 Name);
24982 }
24983 case RecurKind::SMax:
24984 case RecurKind::SMin:
24985 case RecurKind::UMax:
24986 case RecurKind::UMin:
24987 if (UseSelect) {
24988 CmpInst::Predicate Pred = getMinMaxReductionPredicate(Kind);
24989 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
24990 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
24991 Name);
24992 }
24993 [[fallthrough]];
24994 case RecurKind::FMax:
24995 case RecurKind::FMin:
24996 case RecurKind::FMaximum:
24997 case RecurKind::FMinimum:
24998 case RecurKind::FMaximumNum:
24999 case RecurKind::FMinimumNum: {
25000 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(Kind);
25001 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
25002 }
25003 default:
25004 llvm_unreachable("Unknown reduction operation.");
25005 }
25006 }
25007
25008 /// Creates reduction operation with the current opcode with the IR flags
25009 /// from \p ReductionOps, dropping nuw/nsw flags.
25010 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
25011 Value *RHS, const Twine &Name,
25012 const ReductionOpsListType &ReductionOps) {
25013 bool UseSelect = ReductionOps.size() == 2 ||
25014 // Logical or/and.
25015 (ReductionOps.size() == 1 &&
25016 any_of(ReductionOps.front(), IsaPred<SelectInst>));
25017 assert((!UseSelect || ReductionOps.size() != 2 ||
25018 isa<SelectInst>(ReductionOps[1][0])) &&
25019 "Expected cmp + select pairs for reduction");
25020 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
25021 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
25022 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
25023 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
25024 /*IncludeWrapFlags=*/false);
25025 propagateIRFlags(Op, ReductionOps[1], nullptr,
25026 /*IncludeWrapFlags=*/false);
25027 return Op;
25028 }
25029 }
25030 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
25031 return Op;
25032 }
25033
25034public:
25035 static RecurKind getRdxKind(Value *V) {
25036 auto *I = dyn_cast<Instruction>(V);
25037 if (!I)
25038 return RecurKind::None;
25039 if (match(I, m_Add(m_Value(), m_Value())))
25040 return RecurKind::Add;
25041 if (match(I, m_Mul(m_Value(), m_Value())))
25042 return RecurKind::Mul;
25043 if (match(I, m_And(m_Value(), m_Value())) ||
25044 match(I, m_LogicalAnd(m_Value(), m_Value())))
25045 return RecurKind::And;
25046 if (match(I, m_Or(m_Value(), m_Value())) ||
25047 match(I, m_LogicalOr(m_Value(), m_Value())))
25048 return RecurKind::Or;
25049 if (match(I, m_Xor(m_Value(), m_Value())))
25050 return RecurKind::Xor;
25051 if (match(I, m_FAdd(m_Value(), m_Value())))
25052 return RecurKind::FAdd;
25053 if (match(I, m_FMul(m_Value(), m_Value())))
25054 return RecurKind::FMul;
25055
25056 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
25057 return RecurKind::FMax;
25058 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
25059 return RecurKind::FMin;
25060
25061 if (match(I, m_FMaximum(m_Value(), m_Value())))
25062 return RecurKind::FMaximum;
25063 if (match(I, m_FMinimum(m_Value(), m_Value())))
25064 return RecurKind::FMinimum;
25065 // This matches either cmp+select or intrinsics. SLP is expected to handle
25066 // either form.
25067 // TODO: If we are canonicalizing to intrinsics, we can remove several
25068 // special-case paths that deal with selects.
25069 if (match(I, m_SMax(m_Value(), m_Value())))
25070 return RecurKind::SMax;
25071 if (match(I, m_SMin(m_Value(), m_Value())))
25072 return RecurKind::SMin;
25073 if (match(I, m_UMax(m_Value(), m_Value())))
25074 return RecurKind::UMax;
25075 if (match(I, m_UMin(m_Value(), m_Value())))
25076 return RecurKind::UMin;
25077
25078 if (auto *Select = dyn_cast<SelectInst>(I)) {
25079 // Try harder: look for min/max pattern based on instructions producing
25080 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
25081 // During the intermediate stages of SLP, it's very common to have
25082 // pattern like this (since optimizeGatherSequence is run only once
25083 // at the end):
25084 // %1 = extractelement <2 x i32> %a, i32 0
25085 // %2 = extractelement <2 x i32> %a, i32 1
25086 // %cond = icmp sgt i32 %1, %2
25087 // %3 = extractelement <2 x i32> %a, i32 0
25088 // %4 = extractelement <2 x i32> %a, i32 1
25089 // %select = select i1 %cond, i32 %3, i32 %4
25090 CmpPredicate Pred;
25091 Instruction *L1;
25092 Instruction *L2;
25093
25094 Value *LHS = Select->getTrueValue();
25095 Value *RHS = Select->getFalseValue();
25096 Value *Cond = Select->getCondition();
25097
25098 // TODO: Support inverse predicates.
25099 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
25100 if (!isa<ExtractElementInst>(RHS) ||
25101 !L2->isIdenticalTo(cast<Instruction>(RHS)))
25102 return RecurKind::None;
25103 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
25104 if (!isa<ExtractElementInst>(LHS) ||
25105 !L1->isIdenticalTo(cast<Instruction>(LHS)))
25106 return RecurKind::None;
25107 } else {
25108 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
25109 return RecurKind::None;
25110 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
25111 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
25112 !L2->isIdenticalTo(cast<Instruction>(RHS)))
25113 return RecurKind::None;
25114 }
25115
25116 switch (Pred) {
25117 default:
25118 return RecurKind::None;
25119 case CmpInst::ICMP_SGT:
25120 case CmpInst::ICMP_SGE:
25121 return RecurKind::SMax;
25122 case CmpInst::ICMP_SLT:
25123 case CmpInst::ICMP_SLE:
25124 return RecurKind::SMin;
25125 case CmpInst::ICMP_UGT:
25126 case CmpInst::ICMP_UGE:
25127 return RecurKind::UMax;
25128 case CmpInst::ICMP_ULT:
25129 case CmpInst::ICMP_ULE:
25130 return RecurKind::UMin;
25131 }
25132 }
25133 return RecurKind::None;
25134 }
25135
25136 /// Get the index of the first operand.
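/// For a cmp+select min/max the reduction scans the select's true/false
/// values (operands 1 and 2); the compare condition at operand 0 is
/// handled separately as part of the reduction ops.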
25137 static unsigned getFirstOperandIndex(Instruction *I) {
25138 return isCmpSelMinMax(I) ? 1 : 0;
25139 }
25140
25141private:
25142 /// Total number of operands in the reduction operation.
25143 static unsigned getNumberOfOperands(Instruction *I) {
25144 return isCmpSelMinMax(I) ? 3 : 2;
25145 }
25146
25147 /// Checks if the instruction is in basic block \p BB.
25148 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
25149 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
25150 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
25151 auto *Sel = cast<SelectInst>(I);
25152 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
25153 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
25154 }
25155 return I->getParent() == BB;
25156 }
25157
25158 /// Expected number of uses for reduction operations/reduced values.
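/// E.g., in a cmp+select min/max chain
///   %c2 = icmp sgt i32 %m1, %x
///   %m2 = select i1 %c2, i32 %m1, i32 %x
/// the previous select %m1 feeds both %c2 and %m2, hence the two expected
/// uses, while a plain arithmetic reduction op feeds only the next op.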
25159 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
25160 if (IsCmpSelMinMax) {
25161 // SelectInst must be used twice while the condition op must have single
25162 // use only.
25163 if (auto *Sel = dyn_cast<SelectInst>(I))
25164 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
25165 return I->hasNUses(2);
25166 }
25167
25168 // Arithmetic reduction operation must be used once only.
25169 return I->hasOneUse();
25170 }
25171
25172 /// Initializes the list of reduction operations.
25173 void initReductionOps(Instruction *I) {
25174 if (isCmpSelMinMax(I))
25175 ReductionOps.assign(2, ReductionOpsType());
25176 else
25177 ReductionOps.assign(1, ReductionOpsType());
25178 }
25179
25180 /// Add all reduction operations for the reduction instruction \p I.
25181 void addReductionOps(Instruction *I) {
25182 if (isCmpSelMinMax(I)) {
25183 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
25184 ReductionOps[1].emplace_back(I);
25185 } else {
25186 ReductionOps[0].emplace_back(I);
25187 }
25188 }
25189
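/// A slice of reduced values is worth keeping on its own if it has more
/// than one element, starts with a constant, or starts with a non-load
/// instruction whose opcode is valid for alternation.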
25190 static bool isGoodForReduction(ArrayRef<Value *> Data) {
25191 int Sz = Data.size();
25192 auto *I = dyn_cast<Instruction>(Data.front());
25193 return Sz > 1 || isConstant(Data.front()) ||
25194 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
25195 }
25196
25197public:
25198 HorizontalReduction() = default;
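/// Pre-seeds a reduction: \p I becomes the reduction root (with the
/// minimal reduction limit of 2) and \p Ops are recorded as its already
/// gathered reduced values.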
25199 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
25200 : ReductionRoot(I), ReductionLimit(2) {
25201 RdxKind = HorizontalReduction::getRdxKind(I);
25202 ReductionOps.emplace_back().push_back(I);
25203 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
25204 for (Value *V : Ops)
25205 ReducedValsToOps[V].push_back(I);
25206 }
25207
25208 bool matchReductionForOperands() const {
25209 // Analyze "regular" integer/FP types for reductions - no target-specific
25210 // types or pointers.
25211 assert(ReductionRoot && "Reduction root is not set!");
25212 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
25213 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
25214 return Ops.size() == 2;
25215 })))
25216 return false;
25217
25218 return true;
25219 }
25220
25221 /// Try to find a reduction tree.
25222 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
25223 ScalarEvolution &SE, const DataLayout &DL,
25224 const TargetLibraryInfo &TLI) {
25225 RdxKind = HorizontalReduction::getRdxKind(Root);
25226 if (!isVectorizable(RdxKind, Root))
25227 return false;
25228
25229 // Analyze "regular" integer/FP types for reductions - no target-specific
25230 // types or pointers.
25231 Type *Ty = Root->getType();
25232 if (!isValidElementType(Ty) || Ty->isPointerTy())
25233 return false;
25234
25235 // Though the ultimate reduction may have multiple uses, its condition must
25236 // have only single use.
25237 if (auto *Sel = dyn_cast<SelectInst>(Root))
25238 if (!Sel->getCondition()->hasOneUse())
25239 return false;
25240
25241 ReductionRoot = Root;
25242
25243 // Iterate through all the operands of the possible reduction tree and
25244 // gather all the reduced values, sorting them by their value id.
25245 BasicBlock *BB = Root->getParent();
25246 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
25247 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
25248 1, std::make_pair(Root, 0));
25249 // Checks if the operands of the \p TreeN instruction are also reduction
25250 // operations or should be treated as reduced values or an extra argument,
25251 // which is not part of the reduction.
25252 auto CheckOperands = [&](Instruction *TreeN,
25253 SmallVectorImpl<Value *> &PossibleReducedVals,
25254 SmallVectorImpl<Instruction *> &ReductionOps,
25255 unsigned Level) {
25256 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
25257 getNumberOfOperands(TreeN)))) {
25258 Value *EdgeVal = getRdxOperand(TreeN, I);
25259 ReducedValsToOps[EdgeVal].push_back(TreeN);
25260 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
25261 // If the edge is not an instruction, or it differs from the main
25262 // reduction opcode or has too many uses, treat it as a possible reduced
25263 // value. Also, do not try to reduce constant values if the operation is
25264 // not foldable.
25265 if (!EdgeInst || Level > RecursionMaxDepth ||
25266 getRdxKind(EdgeInst) != RdxKind ||
25267 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
25268 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
25269 !isVectorizable(RdxKind, EdgeInst) ||
25270 (R.isAnalyzedReductionRoot(EdgeInst) &&
25271 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
25272 PossibleReducedVals.push_back(EdgeVal);
25273 continue;
25274 }
25275 ReductionOps.push_back(EdgeInst);
25276 }
25277 };
25278 // Try to regroup the reduced values so that they become more profitable
25279 // to reduce. Values are grouped by their value ids, instructions - by
25280 // instruction op id and/or alternate op id, plus do extra analysis for
25281 // loads (grouping them by the distance between pointers) and cmp
25282 // instructions (grouping them by the predicate).
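// E.g., reduced values {load %p[0], %x, load %p[1], load %p[2]} end up with
// the three loads in one group keyed by their compatible pointer operands,
// so they can later form a single consecutive vector load.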
25283 SmallMapVector<
25284 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
25285 8>
25286 PossibleReducedVals;
25287 initReductionOps(Root);
25288 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
25289 SmallSet<size_t, 2> LoadKeyUsed;
25290
25291 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
25292 Key = hash_combine(hash_value(LI->getParent()), Key);
25293 Value *Ptr =
25294 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
25295 if (!LoadKeyUsed.insert(Key).second) {
25296 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
25297 if (LIt != LoadsMap.end()) {
25298 for (LoadInst *RLI : LIt->second) {
25299 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
25300 LI->getType(), LI->getPointerOperand(), DL, SE,
25301 /*StrictCheck=*/true))
25302 return hash_value(RLI->getPointerOperand());
25303 }
25304 for (LoadInst *RLI : LIt->second) {
25305 if (arePointersCompatible(RLI->getPointerOperand(),
25306 LI->getPointerOperand(), TLI)) {
25307 hash_code SubKey = hash_value(RLI->getPointerOperand());
25308 return SubKey;
25309 }
25310 }
25311 if (LIt->second.size() > 2) {
25312 hash_code SubKey =
25313 hash_value(LIt->second.back()->getPointerOperand());
25314 return SubKey;
25315 }
25316 }
25317 }
25318 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
25319 .first->second.push_back(LI);
25320 return hash_value(LI->getPointerOperand());
25321 };
25322
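// Walk the reduction tree from the root with a worklist: each popped
// instruction records its reduction ops and buckets its leaf reduced
// values by the (key, subkey) pair computed above.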
25323 while (!Worklist.empty()) {
25324 auto [TreeN, Level] = Worklist.pop_back_val();
25325 SmallVector<Value *> PossibleRedVals;
25326 SmallVector<Instruction *> PossibleReductionOps;
25327 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
25328 addReductionOps(TreeN);
25329 // Add reduction values. The values are sorted for better vectorization
25330 // results.
25331 for (Value *V : PossibleRedVals) {
25332 size_t Key, Idx;
25333 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
25334 /*AllowAlternate=*/false);
25335 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
25336 }
25337 for (Instruction *I : reverse(PossibleReductionOps))
25338 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
25339 }
25340 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
25341 // Sort values by the total number of value kinds so that the reduction
25342 // starts from the longest possible sequences of reduced values.
25343 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
25344 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
25345 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
25346 for (auto &Slice : PossibleRedVals) {
25347 PossibleRedValsVect.emplace_back();
25348 auto RedValsVect = Slice.second.takeVector();
25349 stable_sort(RedValsVect, llvm::less_second());
25350 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
25351 PossibleRedValsVect.back().append(Data.second, Data.first);
25352 }
25353 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
25354 return P1.size() > P2.size();
25355 });
25356 bool First = true;
25357 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
25358 if (First) {
25359 First = false;
25360 ReducedVals.emplace_back();
25361 } else if (!isGoodForReduction(Data)) {
25362 auto *LI = dyn_cast<LoadInst>(Data.front());
25363 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
25364 if (!LI || !LastLI ||
25365 getUnderlyingObject(LI->getPointerOperand()) !=
25366 getUnderlyingObject(LastLI->getPointerOperand()))
25367 ReducedVals.emplace_back();
25368 }
25369 ReducedVals.back().append(Data.rbegin(), Data.rend());
25370 }
25371 }
25372 // Sort the reduced values by number of same/alternate opcode and/or pointer
25373 // operand.
25374 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
25375 return P1.size() > P2.size();
25376 });
25377 return true;
25378 }
25379
25380 /// Attempt to vectorize the tree found by matchAssociativeReduction.
25381 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
25382 const TargetLibraryInfo &TLI, AssumptionCache *AC,
25383 DominatorTree &DT) {
25384 constexpr unsigned RegMaxNumber = 4;
25385 constexpr unsigned RedValsMaxNumber = 128;
25386 // If there are a sufficient number of reduction values, reduce
25387 // to a nearby power-of-2. We can safely generate oversized
25388 // vectors and rely on the backend to split them to legal sizes.
25389 if (unsigned NumReducedVals = std::accumulate(
25390 ReducedVals.begin(), ReducedVals.end(), 0,
25391 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
25392 if (!isGoodForReduction(Vals))
25393 return Num;
25394 return Num + Vals.size();
25395 });
25396 NumReducedVals < ReductionLimit &&
25397 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
25398 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
25399 })) {
25400 for (ReductionOpsType &RdxOps : ReductionOps)
25401 for (Value *RdxOp : RdxOps)
25402 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
25403 return nullptr;
25404 }
25405
25406 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
25407 TargetFolder(DL));
25408 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
25409
25410 // Track the reduced values in case they are replaced by extractelement
25411 // instructions because of the vectorization.
25412 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
25413 ReducedVals.front().size());
25414
25415 // The compare instruction of a min/max is the insertion point for new
25416 // instructions and may be replaced with a new compare instruction.
25417 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
25418 assert(isa<SelectInst>(RdxRootInst) &&
25419 "Expected min/max reduction to have select root instruction");
25420 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
25421 assert(isa<Instruction>(ScalarCond) &&
25422 "Expected min/max reduction to have compare condition");
25423 return cast<Instruction>(ScalarCond);
25424 };
25425
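// Boolean logic reductions (and/or on i1) need extra care below: freezes
// are inserted so that poison does not propagate across what used to be
// short-circuiting scalar selects.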
25426 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
25427 return isBoolLogicOp(cast<Instruction>(V));
25428 });
25429 // Return new VectorizedTree, based on previous value.
25430 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
25431 if (VectorizedTree) {
25432 // Update the final value in the reduction.
25433 Builder.SetCurrentDebugLocation(
25434 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
25435 if (AnyBoolLogicOp) {
25436 auto It = ReducedValsToOps.find(VectorizedTree);
25437 auto It1 = ReducedValsToOps.find(Res);
25438 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
25439 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
25440 (It != ReducedValsToOps.end() &&
25441 any_of(It->getSecond(), [&](Instruction *I) {
25442 return isBoolLogicOp(I) &&
25443 getRdxOperand(I, 0) == VectorizedTree;
25444 }))) {
25445 ;
25446 } else if (isGuaranteedNotToBePoison(Res, AC) ||
25447 (It1 != ReducedValsToOps.end() &&
25448 any_of(It1->getSecond(), [&](Instruction *I) {
25449 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
25450 }))) {
25451 std::swap(VectorizedTree, Res);
25452 } else {
25453 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
25454 }
25455 }
25456
25457 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
25458 ReductionOps);
25459 }
25460 // Initialize the final value in the reduction.
25461 return Res;
25462 };
25463 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25464 ReductionOps.front().size());
25465 for (ReductionOpsType &RdxOps : ReductionOps)
25466 for (Value *RdxOp : RdxOps) {
25467 if (!RdxOp)
25468 continue;
25469 IgnoreList.insert(RdxOp);
25470 }
25471 // Intersect the fast-math-flags from all reduction operations.
25472 FastMathFlags RdxFMF;
25473 RdxFMF.set();
25474 for (Value *U : IgnoreList)
25475 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
25476 RdxFMF &= FPMO->getFastMathFlags();
25477 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
25478
25479 // Need to track reduced vals, they may be changed during vectorization of
25480 // subvectors.
25481 for (ArrayRef<Value *> Candidates : ReducedVals)
25482 for (Value *V : Candidates)
25483 TrackedVals.try_emplace(V, V);
25484
25485 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25486 Value *V) -> unsigned & {
25487 auto *It = MV.find(V);
25488 assert(It != MV.end() && "Unable to find given key.");
25489 return It->second;
25490 };
25491
25492 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
25493 // List of the values that were reduced in other trees as part of gather
25494 // nodes and thus requiring extract if fully vectorized in other trees.
25495 SmallPtrSet<Value *, 4> RequiredExtract;
25496 WeakTrackingVH VectorizedTree = nullptr;
25497 bool CheckForReusedReductionOps = false;
25498 // Try to vectorize elements based on their type.
25499 SmallVector<InstructionsState> States;
25500 SmallVector<SmallVector<Value *>> LocalReducedVals;
25501 // Try to merge consecutive reduced values into a single vectorizable group
25502 // and check if they can be vectorized as copyable elements.
25503 for (ArrayRef<Value *> RV : ReducedVals) {
25504 // Loads are not very compatible with undefs.
25505 if (isa<UndefValue>(RV.front()) &&
25506 (States.empty() || !States.back() ||
25507 States.back().getOpcode() == Instruction::Load)) {
25508 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25509 States.push_back(InstructionsState::invalid());
25510 continue;
25511 }
25512 if (!LocalReducedVals.empty() &&
25513 isa<UndefValue>(LocalReducedVals.back().front()) &&
25514 isa<LoadInst>(RV.front())) {
25515 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25516 States.push_back(getSameOpcode(RV, TLI));
25517 continue;
25518 }
25519 SmallVector<Value *> Ops;
25520 if (!LocalReducedVals.empty())
25521 Ops = LocalReducedVals.back();
25522 Ops.append(RV.begin(), RV.end());
25523 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
25524 InstructionsState OpS =
25525 Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
25526 if (LocalReducedVals.empty()) {
25527 LocalReducedVals.push_back(Ops);
25528 States.push_back(OpS);
25529 continue;
25530 }
25531 if (OpS) {
25532 LocalReducedVals.back().swap(Ops);
25533 States.back() = OpS;
25534 continue;
25535 }
25536 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
25537 States.push_back(getSameOpcode(RV, TLI));
25538 }
25539 ReducedVals.swap(LocalReducedVals);
25540 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
25541 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
25542 InstructionsState S = States[I];
25543 SmallVector<Value *> Candidates;
25544 Candidates.reserve(2 * OrigReducedVals.size());
25545 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
25546 for (Value *ReducedVal : OrigReducedVals) {
25547 Value *RdxVal = TrackedVals.at(ReducedVal);
25548 // Check if the reduction value was not overridden by the extractelement
25549 // instruction because of the vectorization, and exclude it if it is not
25550 // compatible with other values.
25551 // Also check if the instruction was folded to constant/other value.
25552 auto *Inst = dyn_cast<Instruction>(RdxVal);
25553 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
25554 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
25555 !S.isCopyableElement(Inst)))) ||
25556 (S && !Inst && !isa<PoisonValue>(RdxVal) &&
25557 !S.isCopyableElement(RdxVal)))
25558 continue;
25559 Candidates.push_back(RdxVal);
25560 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
25561 }
25562 bool ShuffledExtracts = false;
25563 // Try to handle shuffled extractelements.
25564 if (S && S.getOpcode() == Instruction::ExtractElement &&
25565 !S.isAltShuffle() && I + 1 < E) {
25566 SmallVector<Value *> CommonCandidates(Candidates);
25567 for (Value *RV : ReducedVals[I + 1]) {
25568 Value *RdxVal = TrackedVals.at(RV);
25569 // Check if the reduction value was not overridden by the
25570 // extractelement instruction because of the vectorization, and
25571 // exclude it if it is not compatible with other values.
25572 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
25573 if (!Inst)
25574 continue;
25575 CommonCandidates.push_back(RdxVal);
25576 TrackedToOrig.try_emplace(RdxVal, RV);
25577 }
25578 SmallVector<int> Mask;
25579 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
25580 ++I;
25581 Candidates.swap(CommonCandidates);
25582 ShuffledExtracts = true;
25583 }
25584 }
25585
25586 // Emit code for constant values.
25587 if (Candidates.size() > 1 && allConstant(Candidates)) {
25588 Value *Res = Candidates.front();
25589 Value *OrigV = TrackedToOrig.at(Candidates.front());
25590 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25591 for (Value *VC : ArrayRef(Candidates).drop_front()) {
25592 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
25593 Value *OrigV = TrackedToOrig.at(VC);
25594 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25595 if (auto *ResI = dyn_cast<Instruction>(Res))
25596 V.analyzedReductionRoot(ResI);
25597 }
25598 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25599 continue;
25600 }
25601
25602 unsigned NumReducedVals = Candidates.size();
25603 if (NumReducedVals < ReductionLimit &&
25604 (NumReducedVals < 2 || !isSplat(Candidates)))
25605 continue;
25606
25607 // Check if we support repeated scalar values processing (optimization of
25608 // original scalar identity operations on matched horizontal reductions).
25609 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25610 RdxKind != RecurKind::FMul &&
25611 RdxKind != RecurKind::FMulAdd;
25612 // Gather same values.
25613 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25614 if (IsSupportedHorRdxIdentityOp)
25615 for (Value *V : Candidates) {
25616 Value *OrigV = TrackedToOrig.at(V);
25617 ++SameValuesCounter.try_emplace(OrigV).first->second;
25618 }
25619 // Used to check if the reduced values are used the same number of times.
25620 // In that case the compiler may produce better code. E.g. if reduced values are
25621 // aabbccdd (8 x values), then the first node of the tree will have a node
25622 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
25623 // Plus, the final reduction will be performed on <8 x aabbccdd>.
25624 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
25625 // x abcd) * 2.
25626 // Currently it only handles add/fadd/xor. and/or/min/max do not require
25627 // this analysis, other operations may require an extra estimation of
25628 // the profitability.
25629 bool SameScaleFactor = false;
25630 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25631 SameValuesCounter.size() != Candidates.size();
25632 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
25633 if (OptReusedScalars) {
25634 SameScaleFactor =
25635 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25636 RdxKind == RecurKind::Xor) &&
25637 all_of(drop_begin(SameValuesCounter),
25638 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
25639 return P.second == SameValuesCounter.front().second;
25640 });
25641 Candidates.resize(SameValuesCounter.size());
25642 transform(SameValuesCounter, Candidates.begin(),
25643 [&](const auto &P) { return TrackedVals.at(P.first); });
25644 NumReducedVals = Candidates.size();
25645 // Have a reduction of the same element.
25646 if (NumReducedVals == 1) {
25647 Value *OrigV = TrackedToOrig.at(Candidates.front());
25648 unsigned Cnt = At(SameValuesCounter, OrigV);
25649 Value *RedVal =
25650 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
25651 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25652 VectorizedVals.try_emplace(OrigV, Cnt);
25653 ExternallyUsedValues.insert(OrigV);
25654 continue;
25655 }
25656 }
25657
25658 unsigned MaxVecRegSize = V.getMaxVecRegSize();
25659 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
25660 const unsigned MaxElts = std::clamp<unsigned>(
25661 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
25662 RegMaxNumber * RedValsMaxNumber);
25663
25664 unsigned ReduxWidth = NumReducedVals;
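// Shrink the candidate width until the widened vector type does not need
// more registers than the target provides, preferring full-vector and
// power-of-two widths.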
25665 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
25666 unsigned NumParts, NumRegs;
25667 Type *ScalarTy = Candidates.front()->getType();
25668 ReduxWidth =
25669 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
25670 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25671 NumParts = ::getNumberOfParts(TTI, Tp);
25672 NumRegs =
25673 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25674 while (NumParts > NumRegs) {
25675 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
25676 ReduxWidth = bit_floor(ReduxWidth - 1);
25677 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
25678 NumParts = ::getNumberOfParts(TTI, Tp);
25679 NumRegs =
25680 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
25681 }
25682 if (NumParts > NumRegs / 2)
25683 ReduxWidth = bit_floor(ReduxWidth);
25684 return ReduxWidth;
25685 };
25686 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
25687 ReduxWidth = GetVectorFactor(ReduxWidth);
25688 ReduxWidth = std::min(ReduxWidth, MaxElts);
25689
25690 unsigned Start = 0;
25691 unsigned Pos = Start;
25692 // Restarts vectorization attempt with lower vector factor.
25693 unsigned PrevReduxWidth = ReduxWidth;
25694 bool CheckForReusedReductionOpsLocal = false;
25695 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
25696 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
25697 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25698 // Check if any of the reduction ops are gathered. If so, it is worth
25699 // trying again with a smaller number of reduction ops.
25700 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25701 }
25702 ++Pos;
25703 if (Pos < NumReducedVals - ReduxWidth + 1)
25704 return IsAnyRedOpGathered;
25705 Pos = Start;
25706 --ReduxWidth;
25707 if (ReduxWidth > 1)
25708 ReduxWidth = GetVectorFactor(ReduxWidth);
25709 return IsAnyRedOpGathered;
25710 };
25711 bool AnyVectorized = false;
25712 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25713 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25714 ReduxWidth >= ReductionLimit) {
25715 // Dependency in tree of the reduction ops - drop this attempt, try
25716 // later.
25717 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25718 Start == 0) {
25719 CheckForReusedReductionOps = true;
25720 break;
25721 }
25722 PrevReduxWidth = ReduxWidth;
25723 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
25724 // Been analyzed already - skip.
25725 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
25726 (!has_single_bit(ReduxWidth) &&
25727 (IgnoredCandidates.contains(
25728 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
25729 IgnoredCandidates.contains(
25730 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
25731 bit_floor(ReduxWidth))))) ||
25732 V.areAnalyzedReductionVals(VL)) {
25733 (void)AdjustReducedVals(/*IgnoreVL=*/true);
25734 continue;
25735 }
25736 // Early exit if any of the reduction values were deleted during
25737 // previous vectorization attempts.
25738 if (any_of(VL, [&V](Value *RedVal) {
25739 auto *RedValI = dyn_cast<Instruction>(RedVal);
25740 return RedValI && V.isDeleted(RedValI);
25741 }))
25742 break;
25743 V.buildTree(VL, IgnoreList);
25744 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
25745 if (!AdjustReducedVals())
25746 V.analyzedReductionVals(VL);
25747 continue;
25748 }
25749 if (V.isLoadCombineReductionCandidate(RdxKind)) {
25750 if (!AdjustReducedVals())
25751 V.analyzedReductionVals(VL);
25752 continue;
25753 }
25754 V.reorderTopToBottom();
25755 // No need to reorder the root node at all for reassociative reduction.
25756 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
25757 VL.front()->getType()->isIntOrIntVectorTy() ||
25758 ReductionLimit > 2);
25759 // Keep other extracted reduction values if they are used in the
25760 // vectorization trees.
25761 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
25762 ExternallyUsedValues);
25763 // The reduction root is used as the insertion point for new
25764 // instructions, so set it as externally used to prevent it from being
25765 // deleted.
25766 LocalExternallyUsedValues.insert(ReductionRoot);
25767 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
25768 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
25769 continue;
25770 for (Value *V : ReducedVals[Cnt])
25771 if (isa<Instruction>(V))
25772 LocalExternallyUsedValues.insert(TrackedVals[V]);
25773 }
25774 if (!IsSupportedHorRdxIdentityOp) {
25775 // Number of uses of the candidates in the vector of values.
25776 assert(SameValuesCounter.empty() &&
25777 "Reused values counter map is not empty");
25778 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25779 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25780 continue;
25781 Value *V = Candidates[Cnt];
25782 Value *OrigV = TrackedToOrig.at(V);
25783 ++SameValuesCounter.try_emplace(OrigV).first->second;
25784 }
25785 }
25786 V.transformNodes();
25787 V.computeMinimumValueSizes();
25788 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
25789
25790 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
25791 // Gather externally used values.
25792 SmallPtrSet<Value *, 4> Visited;
25793 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25794 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25795 continue;
25796 Value *RdxVal = Candidates[Cnt];
25797 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
25798 RdxVal = It->second;
25799 if (!Visited.insert(RdxVal).second)
25800 continue;
25801 // Check if the scalar was vectorized as part of the vectorization
25802 // tree but not the top node.
25803 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
25804 LocalExternallyUsedValues.insert(RdxVal);
25805 continue;
25806 }
25807 Value *OrigV = TrackedToOrig.at(RdxVal);
25808 unsigned NumOps =
25809 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
25810 if (NumOps != ReducedValsToOps.at(OrigV).size())
25811 LocalExternallyUsedValues.insert(RdxVal);
25812 }
25813 // Do not need the list of reused scalars in regular mode anymore.
25814 if (!IsSupportedHorRdxIdentityOp)
25815 SameValuesCounter.clear();
25816 for (Value *RdxVal : VL)
25817 if (RequiredExtract.contains(RdxVal))
25818 LocalExternallyUsedValues.insert(RdxVal);
25819 V.buildExternalUses(LocalExternallyUsedValues);
25820
25821 // Estimate cost.
25822 InstructionCost ReductionCost;
25823 if (V.isReducedBitcastRoot())
25824 ReductionCost = 0;
25825 else
25826 ReductionCost =
25827 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
25828 InstructionCost Cost = V.getTreeCost(TreeCost, VL, ReductionCost);
25829 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
25830 << " for reduction\n");
25831 if (!Cost.isValid())
25832 break;
25833 if (Cost >= -SLPCostThreshold) {
25834 V.getORE()->emit([&]() {
25835 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
25836 ReducedValsToOps.at(VL[0]).front())
25837 << "Vectorizing horizontal reduction is possible "
25838 << "but not beneficial with cost " << ore::NV("Cost", Cost)
25839 << " and threshold "
25840 << ore::NV("Threshold", -SLPCostThreshold);
25841 });
25842 if (!AdjustReducedVals()) {
25843 V.analyzedReductionVals(VL);
25844 unsigned Offset = Pos == Start ? Pos : Pos - 1;
25845 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
25846 // Add subvectors of VL to the list of the analyzed values.
25847 for (unsigned VF = getFloorFullVectorNumberOfElements(
25848 *TTI, VL.front()->getType(), ReduxWidth - 1);
25849 VF >= ReductionLimit;
25850 VF = getFloorFullVectorNumberOfElements(
25851 *TTI, VL.front()->getType(), VF - 1)) {
25852 if (has_single_bit(VF) &&
25853 V.getCanonicalGraphSize() != V.getTreeSize())
25854 continue;
25855 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
25856 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
25857 }
25858 }
25859 }
25860 continue;
25861 }
25862
25863 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
25864 << Cost << ". (HorRdx)\n");
25865 V.getORE()->emit([&]() {
25866 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
25867 ReducedValsToOps.at(VL[0]).front())
25868 << "Vectorized horizontal reduction with cost "
25869 << ore::NV("Cost", Cost) << " and with tree size "
25870 << ore::NV("TreeSize", V.getTreeSize());
25871 });
25872
25873 Builder.setFastMathFlags(RdxFMF);
25874
25875 // Emit a reduction. If the root is a select (min/max idiom), the insert
25876 // point is the compare condition of that select.
25877 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
25878 Instruction *InsertPt = RdxRootInst;
25879 if (IsCmpSelMinMax)
25880 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25881
25882 // Vectorize a tree.
25883 Value *VectorizedRoot = V.vectorizeTree(
25884 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
25885 // Update TrackedToOrig mapping, since the tracked values might be
25886 // updated.
25887 for (Value *RdxVal : Candidates) {
25888 Value *OrigVal = TrackedToOrig.at(RdxVal);
25889 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
25890 if (TransformedRdxVal != RdxVal)
25891 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
25892 }
25893
25894 Builder.SetInsertPoint(InsertPt);
25895
25896 // To prevent poison from leaking across what used to be sequential,
25897 // safe, scalar boolean logic operations, the reduction operand must be
25898 // frozen.
25899 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
25900 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
25901
25902 // Emit code to correctly handle reused reduced values, if required.
25903 if (OptReusedScalars && !SameScaleFactor) {
25904 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
25905 SameValuesCounter, TrackedToOrig);
25906 }
25907
25908 Type *ScalarTy = VL.front()->getType();
25909 Type *VecTy = VectorizedRoot->getType();
25910 Type *RedScalarTy = VecTy->getScalarType();
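// Record the vectorized chunk together with its reuse scale (used when all
// reduced values repeat the same number of times) and whether the narrowed
// reduction type must be sign-extended back to the original scalar type.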
25911 VectorValuesAndScales.emplace_back(
25912 VectorizedRoot,
25913 OptReusedScalars && SameScaleFactor
25914 ? SameValuesCounter.front().second
25915 : 1,
25916 RedScalarTy != ScalarTy->getScalarType()
25917 ? V.isSignedMinBitwidthRootNode()
25918 : true);
25919
25920 // Count vectorized reduced values to exclude them from final reduction.
25921 for (Value *RdxVal : VL) {
25922 Value *OrigV = TrackedToOrig.at(RdxVal);
25923 if (IsSupportedHorRdxIdentityOp) {
25924 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
25925 continue;
25926 }
25927 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25928 if (!V.isVectorized(RdxVal))
25929 RequiredExtract.insert(RdxVal);
25930 }
25931 Pos += ReduxWidth;
25932 Start = Pos;
25933 ReduxWidth = NumReducedVals - Pos;
25934 if (ReduxWidth > 1)
25935 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25936 AnyVectorized = true;
25937 }
25938 if (OptReusedScalars && !AnyVectorized) {
25939 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
25940 Value *RdxVal = TrackedVals.at(P.first);
25941 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
25942 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25943 VectorizedVals.try_emplace(P.first, P.second);
25944 }
25945 continue;
25946 }
25947 }
25948 if (!VectorValuesAndScales.empty())
25949 VectorizedTree = GetNewVectorizedTree(
25950 VectorizedTree, emitReduction(Builder, *TTI, ReductionRoot->getType(),
25951 V.isReducedBitcastRoot()));
25952
25953 if (!VectorizedTree) {
25954 if (!CheckForReusedReductionOps) {
25955 for (ReductionOpsType &RdxOps : ReductionOps)
25956 for (Value *RdxOp : RdxOps)
25957 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
25958 }
25959 return nullptr;
25960 }
25961
25962 // Reorder operands of bool logical op in the natural order to avoid
25963 // possible problem with poison propagation. If not possible to reorder
25964 // (both operands are originally RHS), emit an extra freeze instruction
25965 // for the LHS operand.
25966 // I.e., if we have original code like this:
25967 // RedOp1 = select i1 ?, i1 LHS, i1 false
25968 // RedOp2 = select i1 RHS, i1 ?, i1 false
25969
25970 // Then, we swap LHS/RHS to create a new op that matches the poison
25971 // semantics of the original code.
25972
25973 // If we have original code like this and both values could be poison:
25974 // RedOp1 = select i1 ?, i1 LHS, i1 false
25975 // RedOp2 = select i1 ?, i1 RHS, i1 false
25976
25977 // Then, we must freeze LHS in the new op.
25978 auto FixBoolLogicalOps =
25979 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
25980 Instruction *RedOp2, bool InitStep) {
25981 if (!AnyBoolLogicOp)
25982 return;
25983 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
25984 getRdxOperand(RedOp1, 0) == LHS ||
25985 isGuaranteedNotToBePoison(LHS, AC)))
25986 return;
25987 bool NeedFreeze = LHS != VectorizedTree;
25988 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
25989 getRdxOperand(RedOp2, 0) == RHS ||
25990 isGuaranteedNotToBePoison(RHS, AC))) {
25991 // If RedOp2 was used as a second operand - do not swap.
25992 if ((InitStep || RHS != VectorizedTree) &&
25993 getRdxOperand(RedOp2, 0) == RHS &&
25994 ((isBoolLogicOp(RedOp1) &&
25995 getRdxOperand(RedOp1, 1) == RedOp2) ||
25996 any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
25997 return any_of(Ops, [&](Value *Op) {
25998 auto *OpI = dyn_cast<Instruction>(Op);
25999 return OpI && isBoolLogicOp(OpI) &&
26000 getRdxOperand(OpI, 1) == RedOp2;
26001 });
26002 }))) {
26003 NeedFreeze = false;
26004 } else {
26005 std::swap(LHS, RHS);
26006 return;
26007 }
26008 }
26009 if (NeedFreeze)
26010 LHS = Builder.CreateFreeze(LHS);
26011 };
26012 // Finish the reduction.
26013 // Need to add the extra arguments and the reduction values that were not vectorized.
26014 // Try to avoid dependencies between the scalar remainders after reductions.
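// Each round pairs up the remaining (instruction, value) entries and emits
// one reduction op per pair, so the scalar remainder forms a log-depth
// tree instead of a long dependency chain.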
26015 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
26016 bool InitStep) {
26017 unsigned Sz = InstVals.size();
26018 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
26019 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
26020 Instruction *RedOp = InstVals[I + 1].first;
26021 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
26022 Value *RdxVal1 = InstVals[I].second;
26023 Value *StableRdxVal1 = RdxVal1;
26024 auto It1 = TrackedVals.find(RdxVal1);
26025 if (It1 != TrackedVals.end())
26026 StableRdxVal1 = It1->second;
26027 Value *RdxVal2 = InstVals[I + 1].second;
26028 Value *StableRdxVal2 = RdxVal2;
26029 auto It2 = TrackedVals.find(RdxVal2);
26030 if (It2 != TrackedVals.end())
26031 StableRdxVal2 = It2->second;
26032 // To prevent poison from leaking across what used to be sequential,
26033 // safe, scalar boolean logic operations, the reduction operand must be
26034 // frozen.
26035 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
26036 RedOp, InitStep);
26037 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
26038 StableRdxVal2, "op.rdx", ReductionOps);
26039 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
26040 }
26041 if (Sz % 2 == 1)
26042 ExtraReds[Sz / 2] = InstVals.back();
26043 return ExtraReds;
26044 };
26045 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
26046 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
26047 VectorizedTree);
26048 SmallPtrSet<Value *, 8> Visited;
26049 for (ArrayRef<Value *> Candidates : ReducedVals) {
26050 for (Value *RdxVal : Candidates) {
26051 if (!Visited.insert(RdxVal).second)
26052 continue;
26053 unsigned NumOps = VectorizedVals.lookup(RdxVal);
26054 for (Instruction *RedOp :
26055 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
26056 ExtraReductions.emplace_back(RedOp, RdxVal);
26057 }
26058 }
26059 // Iterate through all not-vectorized reduction values/extra arguments.
26060 bool InitStep = true;
26061 while (ExtraReductions.size() > 1) {
26062 SmallVector<std::pair<Instruction *, Value *>> NewReds =
26063 FinalGen(ExtraReductions, InitStep);
26064 ExtraReductions.swap(NewReds);
26065 InitStep = false;
26066 }
26067 VectorizedTree = ExtraReductions.front().second;
26068
26069 ReductionRoot->replaceAllUsesWith(VectorizedTree);
26070
26071 // The original scalar reduction is expected to have no remaining
26072 // uses outside the reduction tree itself. Assert that we got this
26073 // correct, replace internal uses with poison, and mark for eventual
26074 // deletion.
26075#ifndef NDEBUG
26076 SmallPtrSet<Value *, 4> IgnoreSet;
26077 for (ArrayRef<Value *> RdxOps : ReductionOps)
26078 IgnoreSet.insert_range(RdxOps);
26079#endif
26080 for (ArrayRef<Value *> RdxOps : ReductionOps) {
26081 for (Value *Ignore : RdxOps) {
26082 if (!Ignore)
26083 continue;
26084#ifndef NDEBUG
26085 for (auto *U : Ignore->users()) {
26086 assert(IgnoreSet.count(U) &&
26087 "All users must be either in the reduction ops list.");
26088 }
26089#endif
26090 if (!Ignore->use_empty()) {
26091 Value *P = PoisonValue::get(Ignore->getType());
26092 Ignore->replaceAllUsesWith(P);
26093 }
26094 }
26095 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
26096 }
26097 return VectorizedTree;
26098 }
26099
26100private:
26101 /// Creates the reduction from the given \p Vec vector value with the given
26102 /// scale \p Scale and signedness \p IsSigned.
26103 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
26104 Value *Vec, unsigned Scale, bool IsSigned, Type *DestTy,
26105 bool ReducedInTree) {
26106 Value *Rdx;
26107 if (ReducedInTree) {
26108 Rdx = Vec;
26109 } else if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
26110 unsigned DestTyNumElements = getNumElements(VecTy);
26111 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
26112 Rdx = PoisonValue::get(
26113 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
26114 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
26115 // Do reduction for each lane.
26116 // e.g., do reduce add for
26117 // VL[0] = <4 x Ty> <a, b, c, d>
26118 // VL[1] = <4 x Ty> <e, f, g, h>
26119 // Lane[0] = <2 x Ty> <a, e>
26120 // Lane[1] = <2 x Ty> <b, f>
26121 // Lane[2] = <2 x Ty> <c, g>
26122 // Lane[3] = <2 x Ty> <d, h>
26123 // result[0] = reduce add Lane[0]
26124 // result[1] = reduce add Lane[1]
26125 // result[2] = reduce add Lane[2]
26126 // result[3] = reduce add Lane[3]
26127 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
26128 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
26129 Rdx = Builder.CreateInsertElement(
26130 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
26131 }
26132 } else {
26133 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
26134 }
26135 if (Rdx->getType() != DestTy)
26136 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
26137 // Improved analysis for add/fadd/xor reductions with same scale
26138 // factor for all operands of reductions. We can emit scalar ops for
26139 // them instead.
26140 if (Scale > 1)
26141 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
26142 return Rdx;
26143 }
26144
26145 /// Calculate the cost of a reduction.
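/// Returns VectorCost - ScalarCost, so a negative value means the
/// vectorized reduction is expected to be cheaper than the scalar chain.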
26146 InstructionCost getReductionCost(TargetTransformInfo *TTI,
26147 ArrayRef<Value *> ReducedVals,
26148 bool IsCmpSelMinMax, FastMathFlags FMF,
26149 const BoUpSLP &R, DominatorTree &DT,
26150 const DataLayout &DL,
26151 const TargetLibraryInfo &TLI) {
26152 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
26153 Type *ScalarTy = ReducedVals.front()->getType();
26154 unsigned ReduxWidth = ReducedVals.size();
26155 FixedVectorType *VectorTy = R.getReductionType();
26156 InstructionCost VectorCost = 0, ScalarCost;
26157 // If all of the reduced values are constant, the vector cost is 0, since
26158 // the reduction value can be calculated at the compile time.
26159 bool AllConsts = allConstant(ReducedVals);
26160 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
26161 InstructionCost Cost = 0;
26162 // Scalar cost is repeated for N-1 elements.
26163 int Cnt = ReducedVals.size();
26164 for (Value *RdxVal : ReducedVals) {
26165 if (!isa<Instruction>(RdxVal))
26166 continue;
26167 if (Cnt == 1)
26168 break;
26169 --Cnt;
26170 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
26171 Cost += GenCostFn();
26172 continue;
26173 }
26174 InstructionCost ScalarCost = 0;
26175 for (User *U : RdxVal->users()) {
26176 auto *RdxOp = cast<Instruction>(U);
26177 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
26178 if (RdxKind == RecurKind::FAdd) {
26179 InstructionCost FMACost = canConvertToFMA(
26180 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
26181 if (FMACost.isValid()) {
26182 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
26183 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
26184 // Also, exclude scalar fmul cost.
26185 InstructionCost FMulCost =
26186 TTI->getInstructionCost(I, CostKind);
26187 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
26188 FMACost -= FMulCost;
26189 }
26190 ScalarCost += FMACost;
26191 continue;
26192 }
26193 }
26194 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
26195 continue;
26196 }
26197 ScalarCost = InstructionCost::getInvalid();
26198 break;
26199 }
26200 if (ScalarCost.isValid())
26201 Cost += ScalarCost;
26202 else
26203 Cost += GenCostFn();
26204 }
26205 return Cost;
26206 };
26207 // Require reduction cost if:
26208 // 1. This type is not a full register type and no other vectors with the
26209 // same type in the storage (first vector with small type).
26210 // 2. The storage does not have any vector with full vector use (first
26211 // vector with full register use).
26212 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
26213 switch (RdxKind) {
26214 case RecurKind::Add:
26215 case RecurKind::Mul:
26216 case RecurKind::Or:
26217 case RecurKind::And:
26218 case RecurKind::Xor:
26219 case RecurKind::FAdd:
26220 case RecurKind::FMul: {
26221 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
26222 if (!AllConsts) {
26223 if (DoesRequireReductionOp) {
26224 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
26225 assert(SLPReVec && "FixedVectorType is not expected.");
26226 unsigned ScalarTyNumElements = VecTy->getNumElements();
26227 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
26228 VectorCost += TTI->getShuffleCost(
26229 TTI::SK_PermuteSingleSrc,
26230 getWidenedType(VecTy->getElementType(),
26231 ReducedVals.size()),
26232 VectorTy,
26233 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
26234 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
26235 FMF, CostKind);
26236 }
26237 VectorCost += TTI->getScalarizationOverhead(
26238 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
26239 /*Extract*/ false, TTI::TCK_RecipThroughput);
26240 } else {
26241 Type *RedTy = VectorTy->getElementType();
26242 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
26243 std::make_pair(RedTy, true));
26244 if (RType == RedTy) {
26245 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
26246 FMF, CostKind);
26247 } else {
26248 VectorCost = TTI->getExtendedReductionCost(
26249 RdxOpcode, !IsSigned, RedTy,
26250 getWidenedType(RType, ReduxWidth), FMF, CostKind);
26251 }
26252 }
26253 } else {
26254 Type *RedTy = VectorTy->getElementType();
26255 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
26256 std::make_pair(RedTy, true));
26257 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
26258 InstructionCost FMACost = InstructionCost::getInvalid();
26259 if (RdxKind == RecurKind::FAdd) {
26260 // Check if the reduction operands can be converted to FMA.
26261 SmallVector<Value *> Ops;
26262 FastMathFlags FMF;
26263 FMF.set();
26264 for (Value *RdxVal : ReducedVals) {
26265 if (!RdxVal->hasOneUse()) {
26266 Ops.clear();
26267 break;
26268 }
26269 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
26270 FMF &= FPCI->getFastMathFlags();
26271 Ops.push_back(RdxVal->user_back());
26272 }
26273 if (!Ops.empty()) {
26274 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
26275 *TTI, TLI);
26276 if (FMACost.isValid()) {
26277 // Calculate actual FMAD cost.
26278 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
26279 {RVecTy, RVecTy, RVecTy}, FMF);
26280 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
26281
26282 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
26283 // Also, exclude vector fmul cost.
26284 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
26285 Instruction::FMul, RVecTy, CostKind);
26286 LLVM_DEBUG(dbgs()
26287 << "Minus vector FMul cost: " << FMulCost << "\n");
26288 FMACost -= FMulCost;
26289 }
26290 }
26291 }
26292 if (FMACost.isValid())
26293 VectorCost += FMACost;
26294 else
26295 VectorCost +=
26296 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
26297 if (RType != RedTy) {
26298 unsigned Opcode = Instruction::Trunc;
26299 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
26300 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
26301 VectorCost += TTI->getCastInstrCost(
26302 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
26303 }
26304 }
26305 }
26306 ScalarCost = EvaluateScalarCost([&]() {
26307 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
26308 });
26309 break;
26310 }
26311 case RecurKind::FMax:
26312 case RecurKind::FMin:
26313 case RecurKind::FMaximum:
26314 case RecurKind::FMinimum:
26315 case RecurKind::SMax:
26316 case RecurKind::SMin:
26317 case RecurKind::UMax:
26318 case RecurKind::UMin: {
26319 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
26320 if (!AllConsts) {
26321 if (DoesRequireReductionOp) {
26322 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
26323 } else {
26324 // Check if the previous reduction already exists and account it as
26325 // series of operations + single reduction.
26326 Type *RedTy = VectorTy->getElementType();
26327 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
26328 std::make_pair(RedTy, true));
26329 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
26330 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
26331 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
26332 if (RType != RedTy) {
26333 unsigned Opcode = Instruction::Trunc;
26334 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
26335 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
26336 VectorCost += TTI->getCastInstrCost(
26337 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
26338 }
26339 }
26340 }
26341 ScalarCost = EvaluateScalarCost([&]() {
26342 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
26343 return TTI->getIntrinsicInstrCost(ICA, CostKind);
26344 });
26345 break;
26346 }
26347 default:
26348 llvm_unreachable("Expected arithmetic or min/max reduction operation");
26349 }
26350
26351 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
26352 << " for reduction of " << shortBundleName(ReducedVals)
26353 << " (It is a splitting reduction)\n");
26354 return VectorCost - ScalarCost;
26355 }
26356
26357 /// Splits the values, stored in VectorValuesAndScales, into registers/free
26358 /// sub-registers, combines them with the given reduction operation as a
26359 /// vector operation and then performs single (small enough) reduction.
26360 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
26361 Type *DestTy, bool ReducedInTree) {
26362 Value *ReducedSubTree = nullptr;
26363 // Creates reduction and combines with the previous reduction.
26364 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
26365 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy,
26366 ReducedInTree);
26367 if (ReducedSubTree)
26368 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
26369 "op.rdx", ReductionOps);
26370 else
26371 ReducedSubTree = Rdx;
26372 };
26373 if (VectorValuesAndScales.size() == 1) {
26374 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
26375 CreateSingleOp(Vec, Scale, IsSigned);
26376 return ReducedSubTree;
26377 }
26378 // Scales Vec using given Cnt scale factor and then performs vector combine
26379 // with previous value of VecOp.
26380 Value *VecRes = nullptr;
26381 bool VecResSignedness = false;
26382 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
26383 Type *ScalarTy = Vec->getType()->getScalarType();
26384 // Scale Vec using given Cnt scale factor.
26385 if (Cnt > 1) {
26386 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
26387 switch (RdxKind) {
26388 case RecurKind::Add: {
26389 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
26390 unsigned VF = getNumElements(Vec->getType());
26391 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
26392 << ". (HorRdx)\n");
26393 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
26394 for (unsigned I : seq<unsigned>(Cnt))
26395 std::iota(std::next(Mask.begin(), VF * I),
26396 std::next(Mask.begin(), VF * (I + 1)), 0);
26397 ++NumVectorInstructions;
26398 Vec = Builder.CreateShuffleVector(Vec, Mask);
26399 break;
26400 }
26401 // res = mul vv, n
26402 if (ScalarTy != DestTy->getScalarType())
26403 Vec = Builder.CreateIntCast(
26404 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
26405 IsSigned);
26406 Value *Scale = ConstantVector::getSplat(
26407 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
26408 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
26409 << ". (HorRdx)\n");
26410 ++NumVectorInstructions;
26411 Vec = Builder.CreateMul(Vec, Scale);
26412 break;
26413 }
26414 case RecurKind::Xor: {
26415 // res = n % 2 ? 0 : vv
26417 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
26418 if (Cnt % 2 == 0)
26419 Vec = Constant::getNullValue(Vec->getType());
26420 break;
26421 }
26422 case RecurKind::FAdd: {
26423 // res = fmul v, n
26424 Value *Scale =
26425 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
26426 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
26427 << ". (HorRdx)\n");
26428 ++NumVectorInstructions;
26429 Vec = Builder.CreateFMul(Vec, Scale);
26430 break;
26431 }
26432 case RecurKind::And:
26433 case RecurKind::Or:
26434 case RecurKind::SMax:
26435 case RecurKind::SMin:
26436 case RecurKind::UMax:
26437 case RecurKind::UMin:
26438 case RecurKind::FMax:
26439 case RecurKind::FMin:
26440 case RecurKind::FMaximum:
26441 case RecurKind::FMinimum:
26442 // res = vv
26443 break;
26444 case RecurKind::Sub:
26445 case RecurKind::AddChainWithSubs:
26446 case RecurKind::Mul:
26447 case RecurKind::FMul:
26448 case RecurKind::FMulAdd:
26449 case RecurKind::AnyOf:
26450 case RecurKind::FindFirstIVSMin:
26451 case RecurKind::FindFirstIVUMin:
26452 case RecurKind::FindLastIVSMax:
26453 case RecurKind::FindLastIVUMax:
26454 case RecurKind::FindLast:
26455 case RecurKind::FMaxNum:
26456 case RecurKind::FMinNum:
26457 case RecurKind::FMaximumNum:
26458 case RecurKind::FMinimumNum:
26459 case RecurKind::None:
26460 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26461 }
26462 }
26463 // Combine Vec with the previous VecOp.
26464 if (!VecRes) {
26465 VecRes = Vec;
26466 VecResSignedness = IsSigned;
26467 } else {
26468 ++NumVectorInstructions;
26469 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
26470 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
26471 // Handle ctpop.
26472 unsigned VecResVF = getNumElements(VecRes->getType());
26473 unsigned VecVF = getNumElements(Vec->getType());
26474 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
26475 std::iota(Mask.begin(), Mask.end(), 0);
26476 // Ensure that VecRes is always larger than Vec
26477 if (VecResVF < VecVF) {
26478 std::swap(VecRes, Vec);
26479 std::swap(VecResVF, VecVF);
26480 }
26481 if (VecResVF != VecVF) {
26482 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
26483 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
26484 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
26485 }
26486 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
26487 return;
26488 }
26489 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
26490 VecRes = Builder.CreateIntCast(
26491 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
26492 VecResSignedness);
26493 if (ScalarTy != DestTy->getScalarType())
26494 Vec = Builder.CreateIntCast(
26495 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
26496 IsSigned);
26497 unsigned VecResVF = getNumElements(VecRes->getType());
26498 unsigned VecVF = getNumElements(Vec->getType());
26499 // Ensure that VecRes is always larger than Vec
26500 if (VecResVF < VecVF) {
26501 std::swap(VecRes, Vec);
26502 std::swap(VecResVF, VecVF);
26503 }
26504 // extract + op + insert
26505 Value *Op = VecRes;
26506 if (VecResVF != VecVF)
26507 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
26508 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
26509 if (VecResVF != VecVF)
26510 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
26511 VecRes = Op;
26512 }
26513 };
26514 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26515 CreateVecOp(Vec, Scale, IsSigned);
26516 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
26517
26518 return ReducedSubTree;
26519 }
26520
26521 /// Emit a horizontal reduction of the vectorized value.
26522 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26523 const TargetTransformInfo *TTI, Type *DestTy) {
26524 assert(VectorizedValue && "Need to have a vectorized tree node");
26525 assert(RdxKind != RecurKind::FMulAdd &&
26526 "A call to the llvm.fmuladd intrinsic is not handled yet");
26527
26528 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
26529 if (FTy->getScalarType() == Builder.getInt1Ty() &&
26530 RdxKind == RecurKind::Add &&
26531 DestTy->getScalarType() != FTy->getScalarType()) {
26532 // Convert vector_reduce_add(ZExt(<n x i1>)) to
26533 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
26534 Value *V = Builder.CreateBitCast(
26535 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
26536 ++NumVectorInstructions;
26537 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
26538 }
26539 ++NumVectorInstructions;
26540 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
26541 }
26542
26543 /// Emits optimized code for unique scalar value reused \p Cnt times.
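/// E.g., a value that occurs 3 times in an add reduction is folded to a
/// single `mul %v, 3` instead of two extra adds.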
26544 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
26545 unsigned Cnt) {
26546 assert(IsSupportedHorRdxIdentityOp &&
26547 "The optimization of matched scalar identity horizontal reductions "
26548 "must be supported.");
26549 if (Cnt == 1)
26550 return VectorizedValue;
26551 switch (RdxKind) {
26552 case RecurKind::Add: {
26553 // res = mul vv, n
26554 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
26555 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
26556 << VectorizedValue << ". (HorRdx)\n");
26557 return Builder.CreateMul(VectorizedValue, Scale);
26558 }
26559 case RecurKind::Xor: {
26560 // res = n % 2 ? 0 : vv
26561 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
26562 << ". (HorRdx)\n");
26563 if (Cnt % 2 == 0)
26564 return Constant::getNullValue(VectorizedValue->getType());
26565 return VectorizedValue;
26566 }
26567 case RecurKind::FAdd: {
26568 // res = fmul v, n
26569 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
26570 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
26571 << VectorizedValue << ". (HorRdx)\n");
26572 return Builder.CreateFMul(VectorizedValue, Scale);
26573 }
26574 case RecurKind::And:
26575 case RecurKind::Or:
26576 case RecurKind::SMax:
26577 case RecurKind::SMin:
26578 case RecurKind::UMax:
26579 case RecurKind::UMin:
26580 case RecurKind::FMax:
26581 case RecurKind::FMin:
26582 case RecurKind::FMaximum:
26583 case RecurKind::FMinimum:
26584 // res = vv
26585 return VectorizedValue;
26586 case RecurKind::Sub:
26587 case RecurKind::AddChainWithSubs:
26588 case RecurKind::Mul:
26589 case RecurKind::FMul:
26590 case RecurKind::FMulAdd:
26591 case RecurKind::AnyOf:
26592 case RecurKind::FindFirstIVSMin:
26593 case RecurKind::FindFirstIVUMin:
26594 case RecurKind::FindLastIVSMax:
26595 case RecurKind::FindLastIVUMax:
26596 case RecurKind::FindLast:
26597 case RecurKind::FMaxNum:
26598 case RecurKind::FMinNum:
26599 case RecurKind::FMaximumNum:
26600 case RecurKind::FMinimumNum:
26601 case RecurKind::None:
26602 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26603 }
26604 return nullptr;
26605 }
26606
26607 /// Emits actual operation for the scalar identity values, found during
26608 /// horizontal reduction analysis.
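  /// For example (illustrative): for an add reduction whose unique scalars
  /// <%a, %b, %c> were originally repeated 1, 2 and 1 times, the Add case
  /// below multiplies the vector by the constant <1, 2, 1> before the final
  /// reduction, while the Xor case zeroes out lanes with an even repeat count.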
26609 Value *
26610 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26611 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26612 const DenseMap<Value *, Value *> &TrackedToOrig) {
26613 assert(IsSupportedHorRdxIdentityOp &&
26614 "The optimization of matched scalar identity horizontal reductions "
26615 "must be supported.");
26616 ArrayRef<Value *> VL = R.getRootNodeScalars();
26617 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
26618 if (VTy->getElementType() != VL.front()->getType()) {
26619 VectorizedValue = Builder.CreateIntCast(
26620 VectorizedValue,
26621 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
26622 R.isSignedMinBitwidthRootNode());
26623 }
26624 switch (RdxKind) {
26625 case RecurKind::Add: {
26626 // root = mul prev_root, <1, 1, n, 1>
 26627 SmallVector<Constant *> Vals;
 26628 for (Value *V : VL) {
26629 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26630 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
26631 }
26632 auto *Scale = ConstantVector::get(Vals);
 26633 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
26634 << VectorizedValue << ". (HorRdx)\n");
26635 return Builder.CreateMul(VectorizedValue, Scale);
26636 }
26637 case RecurKind::And:
26638 case RecurKind::Or:
26639 // No need for multiple or/and(s).
26640 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
26641 << ". (HorRdx)\n");
26642 return VectorizedValue;
26643 case RecurKind::SMax:
26644 case RecurKind::SMin:
26645 case RecurKind::UMax:
26646 case RecurKind::UMin:
26647 case RecurKind::FMax:
26648 case RecurKind::FMin:
26649 case RecurKind::FMaximum:
26650 case RecurKind::FMinimum:
26651 // No need for multiple min/max(s) of the same value.
26652 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
26653 << ". (HorRdx)\n");
26654 return VectorizedValue;
26655 case RecurKind::Xor: {
26656 // Replace values with even number of repeats with 0, since
26657 // x xor x = 0.
 26658 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
 26659 // 7>, if the 4th and 6th elements have an even number of repeats.
26660 SmallVector<int> Mask(
26661 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
 26662 PoisonMaskElem);
 26663 std::iota(Mask.begin(), Mask.end(), 0);
26664 bool NeedShuffle = false;
26665 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
26666 Value *V = VL[I];
26667 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26668 if (Cnt % 2 == 0) {
26669 Mask[I] = VF;
26670 NeedShuffle = true;
26671 }
26672 }
26673 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
26674 : Mask) dbgs()
26675 << I << " ";
26676 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
26677 if (NeedShuffle)
26678 VectorizedValue = Builder.CreateShuffleVector(
26679 VectorizedValue,
26680 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
26681 return VectorizedValue;
26682 }
26683 case RecurKind::FAdd: {
26684 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
 26685 SmallVector<Constant *> Vals;
 26686 for (Value *V : VL) {
26687 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26688 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
26689 }
26690 auto *Scale = ConstantVector::get(Vals);
26691 return Builder.CreateFMul(VectorizedValue, Scale);
26692 }
26693 case RecurKind::Sub:
26694 case RecurKind::AddChainWithSubs:
26695 case RecurKind::Mul:
26696 case RecurKind::FMul:
26697 case RecurKind::FMulAdd:
26698 case RecurKind::AnyOf:
26699 case RecurKind::FindFirstIVSMin:
26700 case RecurKind::FindFirstIVUMin:
26701 case RecurKind::FindLastIVSMax:
26702 case RecurKind::FindLastIVUMax:
26703 case RecurKind::FindLast:
26704 case RecurKind::FMaxNum:
26705 case RecurKind::FMinNum:
26706 case RecurKind::FMaximumNum:
26707 case RecurKind::FMinimumNum:
26708 case RecurKind::None:
26709 llvm_unreachable("Unexpected reduction kind for reused scalars.");
26710 }
26711 return nullptr;
26712 }
26713};
26714} // end anonymous namespace
26715
26716/// Gets recurrence kind from the specified value.
26717static RecurKind getRdxKind(Value *V) {
 26718 return HorizontalReduction::getRdxKind(V);
26719}
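/// Returns the total number of scalar elements in a homogeneous aggregate
/// rooted at \p InsertInst, e.g. 4 for {<2 x float>, <2 x float>} or for
/// [2 x {float, float}] (illustrative examples), or std::nullopt if the
/// aggregate is not homogeneous.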
26720static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
26721 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
26722 return cast<FixedVectorType>(IE->getType())->getNumElements();
26723
26724 unsigned AggregateSize = 1;
26725 auto *IV = cast<InsertValueInst>(InsertInst);
26726 Type *CurrentType = IV->getType();
26727 do {
26728 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
26729 for (auto *Elt : ST->elements())
26730 if (Elt != ST->getElementType(0)) // check homogeneity
26731 return std::nullopt;
26732 AggregateSize *= ST->getNumElements();
26733 CurrentType = ST->getElementType(0);
26734 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
26735 AggregateSize *= AT->getNumElements();
26736 CurrentType = AT->getElementType();
26737 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
26738 AggregateSize *= VT->getNumElements();
26739 return AggregateSize;
26740 } else if (CurrentType->isSingleValueType()) {
26741 return AggregateSize;
26742 } else {
26743 return std::nullopt;
26744 }
26745 } while (true);
26746}
26747
26748static void findBuildAggregateRec(Instruction *LastInsertInst,
 26749 TargetTransformInfo *TTI,
 26750 SmallVectorImpl<Value *> &BuildVectorOpds,
26751 SmallVectorImpl<Value *> &InsertElts,
26752 unsigned OperandOffset, const BoUpSLP &R) {
26753 do {
26754 Value *InsertedOperand = LastInsertInst->getOperand(1);
26755 std::optional<unsigned> OperandIndex =
26756 getElementIndex(LastInsertInst, OperandOffset);
26757 if (!OperandIndex || R.isDeleted(LastInsertInst))
26758 return;
26759 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
 26760 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
 26761 BuildVectorOpds, InsertElts, *OperandIndex, R);
26762
26763 } else {
26764 BuildVectorOpds[*OperandIndex] = InsertedOperand;
26765 InsertElts[*OperandIndex] = LastInsertInst;
26766 }
26767 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
26768 } while (LastInsertInst != nullptr &&
 26769 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
 26770 LastInsertInst->hasOneUse());
26771}
26772
26773/// Recognize construction of vectors like
26774/// %ra = insertelement <4 x float> poison, float %s0, i32 0
26775/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
26776/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
26777/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
26778/// starting from the last insertelement or insertvalue instruction.
26779///
26780/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
26781/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
26782/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
26783///
26784/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
26785///
26786/// \return true if it matches.
26787static bool findBuildAggregate(Instruction *LastInsertInst,
 26788 TargetTransformInfo *TTI,
 26789 SmallVectorImpl<Value *> &BuildVectorOpds,
26790 SmallVectorImpl<Value *> &InsertElts,
26791 const BoUpSLP &R) {
26792
26793 assert((isa<InsertElementInst>(LastInsertInst) ||
26794 isa<InsertValueInst>(LastInsertInst)) &&
26795 "Expected insertelement or insertvalue instruction!");
26796
26797 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
26798 "Expected empty result vectors!");
26799
26800 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
26801 if (!AggregateSize)
26802 return false;
26803 BuildVectorOpds.resize(*AggregateSize);
26804 InsertElts.resize(*AggregateSize);
26805
26806 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
26807 llvm::erase(BuildVectorOpds, nullptr);
26808 llvm::erase(InsertElts, nullptr);
26809 if (BuildVectorOpds.size() >= 2)
26810 return true;
26811
26812 return false;
26813}
26814
26815/// Try and get a reduction instruction from a phi node.
26816///
26817/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
26818/// if they come from either \p ParentBB or a containing loop latch.
26819///
26820/// \returns A candidate reduction value if possible, or \code nullptr \endcode
26821/// if not possible.
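/// Illustrative example (not from the source tree): for
///   loop:
///     %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
///     ...
///     %sum.next = add i32 %sum, %x
/// the candidate returned for %sum is %sum.next, its incoming value from the
/// loop.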
26822static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
 26823 BasicBlock *ParentBB, LoopInfo *LI) {
26824 // There are situations where the reduction value is not dominated by the
26825 // reduction phi. Vectorizing such cases has been reported to cause
26826 // miscompiles. See PR25787.
26827 auto DominatedReduxValue = [&](Value *R) {
26828 return isa<Instruction>(R) &&
26829 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
26830 };
26831
26832 Instruction *Rdx = nullptr;
26833
26834 // Return the incoming value if it comes from the same BB as the phi node.
26835 if (P->getIncomingBlock(0) == ParentBB) {
26836 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26837 } else if (P->getIncomingBlock(1) == ParentBB) {
26838 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26839 }
26840
26841 if (Rdx && DominatedReduxValue(Rdx))
26842 return Rdx;
26843
26844 // Otherwise, check whether we have a loop latch to look at.
26845 Loop *BBL = LI->getLoopFor(ParentBB);
26846 if (!BBL)
26847 return nullptr;
26848 BasicBlock *BBLatch = BBL->getLoopLatch();
26849 if (!BBLatch)
26850 return nullptr;
26851
26852 // There is a loop latch, return the incoming value if it comes from
26853 // that. This reduction pattern occasionally turns up.
26854 if (P->getIncomingBlock(0) == BBLatch) {
26855 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
26856 } else if (P->getIncomingBlock(1) == BBLatch) {
26857 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
26858 }
26859
26860 if (Rdx && DominatedReduxValue(Rdx))
26861 return Rdx;
26862
26863 return nullptr;
26864}
26865
26866static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
26867 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
26868 return true;
26869 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
26870 return true;
26871 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
26872 return true;
26873 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
26874 return true;
26875 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
26876 return true;
 26877 if (match(I, m_SMax(m_Value(V0), m_Value(V1))))
 26878 return true;
 26879 if (match(I, m_SMin(m_Value(V0), m_Value(V1))))
 26880 return true;
 26881 if (match(I, m_UMax(m_Value(V0), m_Value(V1))))
 26882 return true;
 26883 if (match(I, m_UMin(m_Value(V0), m_Value(V1))))
 26884 return true;
26885 return false;
26886}
26887
26888/// We could have an initial reduction that is not an add.
26889/// r *= v1 + v2 + v3 + v4
26890/// In such a case start looking for a tree rooted in the first '+'.
26891/// \returns the new root if found, which may be nullptr if not an instruction.
26892static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
 26893 Instruction *Root) {
26894 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
26895 isa<IntrinsicInst>(Root)) &&
26896 "Expected binop, select, or intrinsic for reduction matching");
26897 Value *LHS =
26898 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
26899 Value *RHS =
26900 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
26901 if (LHS == Phi)
26902 return dyn_cast<Instruction>(RHS);
26903 if (RHS == Phi)
26904 return dyn_cast<Instruction>(LHS);
26905 return nullptr;
26906}
26907
26908/// \returns the first operand of \p I that does not match \p Phi. If the
26909/// operand is not an instruction, returns nullptr.
26910static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
 26911 Value *Op0 = nullptr;
26912 Value *Op1 = nullptr;
26913 if (!matchRdxBop(I, Op0, Op1))
26914 return nullptr;
26915 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
26916}
26917
26918/// \returns true if \p I is a candidate instruction for reduction vectorization.
26919static bool isReductionCandidate(Instruction *I) {
 26920 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
26921 Value *B0 = nullptr, *B1 = nullptr;
26922 bool IsBinop = matchRdxBop(I, B0, B1);
26923 return IsBinop || IsSelect;
26924}
26925
26926bool SLPVectorizerPass::vectorizeHorReduction(
26927 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
26928 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26929 if (!ShouldVectorizeHor)
26930 return false;
26931 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
26932
26933 if (Root->getParent() != BB || isa<PHINode>(Root))
26934 return false;
26935
26936 // If we can find a secondary reduction root, use that instead.
26937 auto SelectRoot = [&]() {
26938 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
26939 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
26940 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
26941 return NewRoot;
26942 return Root;
26943 };
26944
 26945 // Start the analysis from the Root instruction. If a horizontal reduction
 26946 // is found, try to vectorize it. If it is not a horizontal reduction, or
 26947 // vectorization is not possible or not effective, and the currently analyzed
 26948 // instruction is a binary operation, try to vectorize the operands using a
 26949 // pre-order DFS traversal. If the operands were not vectorized, repeat the
 26950 // same procedure, considering each operand as a possible root of a
 26951 // horizontal reduction.
 26952 // Interrupt the process if the Root instruction itself was vectorized or all
 26953 // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
 26954 // If a horizontal reduction was not matched or vectorized, we collect
 26955 // instructions for possible later vectorization attempts.
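  // For instance (illustrative), given
  //   %s = add i32 %a, %b
  //   %r = add i32 %s, %c
  // the analysis starts at %r; if no reduction rooted at %r gets vectorized,
  // %r is recorded as a postponed seed and its operands (here %s) are pushed
  // onto the worklist as new candidate roots, up to RecursionMaxDepth levels.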
26956 std::queue<std::pair<Instruction *, unsigned>> Stack;
26957 Stack.emplace(SelectRoot(), 0);
26958 SmallPtrSet<Value *, 8> VisitedInstrs;
26959 bool Res = false;
26960 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
26961 if (R.isAnalyzedReductionRoot(Inst))
26962 return nullptr;
26963 if (!isReductionCandidate(Inst))
26964 return nullptr;
26965 HorizontalReduction HorRdx;
26966 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
26967 return nullptr;
26968 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
26969 };
26970 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
26971 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26972 FutureSeed = getNonPhiOperand(Root, P);
26973 if (!FutureSeed)
26974 return false;
26975 }
26976 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
26977 // analysis is done separately.
 26978 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
 26979 PostponedInsts.push_back(FutureSeed);
26980 return true;
26981 };
26982
26983 while (!Stack.empty()) {
26984 Instruction *Inst;
26985 unsigned Level;
26986 std::tie(Inst, Level) = Stack.front();
26987 Stack.pop();
26988 // Do not try to analyze instruction that has already been vectorized.
26989 // This may happen when we vectorize instruction operands on a previous
26990 // iteration while stack was populated before that happened.
26991 if (R.isDeleted(Inst))
26992 continue;
26993 if (Value *VectorizedV = TryToReduce(Inst)) {
26994 Res = true;
26995 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
26996 // Try to find another reduction.
26997 Stack.emplace(I, Level);
26998 continue;
26999 }
27000 if (R.isDeleted(Inst))
27001 continue;
27002 } else {
27003 // We could not vectorize `Inst` so try to use it as a future seed.
27004 if (!TryAppendToPostponedInsts(Inst)) {
27005 assert(Stack.empty() && "Expected empty stack");
27006 break;
27007 }
27008 }
27009
27010 // Try to vectorize operands.
27011 // Continue analysis for the instruction from the same basic block only to
27012 // save compile time.
27013 if (++Level < RecursionMaxDepth)
27014 for (auto *Op : Inst->operand_values())
27015 if (VisitedInstrs.insert(Op).second)
27016 if (auto *I = dyn_cast<Instruction>(Op))
27017 // Do not try to vectorize CmpInst operands, this is done
27018 // separately.
 27019 if (!isa<PHINode>(I) && !isa<CmpInst>(I) &&
 27020 !R.isDeleted(I) && I->getParent() == BB)
27021 Stack.emplace(I, Level);
27022 }
27023 return Res;
27024}
27025
27026bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
27027 if (!I)
27028 return false;
27029
27030 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
27031 return false;
27032 // Skip potential FMA candidates.
27033 if ((I->getOpcode() == Instruction::FAdd ||
27034 I->getOpcode() == Instruction::FSub) &&
27035 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
27036 .isValid())
27037 return false;
27038
27039 Value *P = I->getParent();
27040
27041 // Vectorize in current basic block only.
27042 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
27043 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
27044 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
27045 R.isDeleted(Op0) || R.isDeleted(Op1))
27046 return false;
27047
27048 // First collect all possible candidates
 27049 SmallVector<std::pair<Value *, Value *>> Candidates;
 27050 Candidates.emplace_back(Op0, Op1);
27051
27052 auto *A = dyn_cast<BinaryOperator>(Op0);
27053 auto *B = dyn_cast<BinaryOperator>(Op1);
27054 // Try to skip B.
27055 if (A && B && B->hasOneUse()) {
27056 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
27057 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
27058 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
27059 Candidates.emplace_back(A, B0);
27060 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
27061 Candidates.emplace_back(A, B1);
27062 }
27063 // Try to skip A.
27064 if (B && A && A->hasOneUse()) {
27065 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
27066 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
27067 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
27068 Candidates.emplace_back(A0, B);
27069 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
27070 Candidates.emplace_back(A1, B);
27071 }
27072
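  // Illustrative summary: for I = A op B with A = A0 op A1 and B = B0 op B1,
  // Candidates may now hold the pairs {A, B}, {A, B0}, {A, B1}, {A0, B} and
  // {A1, B}; when there is more than one option, the best pair is selected
  // below via findBestRootPair.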
27073 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
 27074 ArrayRef<Value *> Ops) {
 27075 if (!isReductionCandidate(Inst))
27076 return false;
27077 Type *Ty = Inst->getType();
27078 if (!isValidElementType(Ty) || Ty->isPointerTy())
27079 return false;
27080 HorizontalReduction HorRdx(Inst, Ops);
27081 if (!HorRdx.matchReductionForOperands())
27082 return false;
27083 // Check the cost of operations.
27084 VectorType *VecTy = getWidenedType(Ty, Ops.size());
 27085 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 27086 InstructionCost ScalarCost =
27087 TTI.getScalarizationOverhead(
27088 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
27089 /*Extract=*/true, CostKind) +
27090 TTI.getInstructionCost(Inst, CostKind);
27091 InstructionCost RedCost;
27092 switch (::getRdxKind(Inst)) {
27093 case RecurKind::Add:
27094 case RecurKind::Mul:
27095 case RecurKind::Or:
27096 case RecurKind::And:
27097 case RecurKind::Xor:
27098 case RecurKind::FAdd:
27099 case RecurKind::FMul: {
27100 FastMathFlags FMF;
27101 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
27102 FMF = FPCI->getFastMathFlags();
27103 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
27104 CostKind);
27105 break;
27106 }
27107 default:
27108 return false;
27109 }
27110 if (RedCost >= ScalarCost)
27111 return false;
27112
27113 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
27114 };
27115 if (Candidates.size() == 1)
27116 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
27117
27118 // We have multiple options. Try to pick the single best.
27119 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
27120 if (!BestCandidate)
27121 return false;
27122 return (*BestCandidate == 0 &&
27123 TryToReduce(I, {Candidates[*BestCandidate].first,
27124 Candidates[*BestCandidate].second})) ||
27125 tryToVectorizeList({Candidates[*BestCandidate].first,
27126 Candidates[*BestCandidate].second},
27127 R);
27128}
27129
27130bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
27131 BasicBlock *BB, BoUpSLP &R) {
27132 SmallVector<WeakTrackingVH> PostponedInsts;
27133 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
27134 Res |= tryToVectorize(PostponedInsts, R);
27135 return Res;
27136}
27137
27138bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
27139 BoUpSLP &R) {
27140 bool Res = false;
27141 for (Value *V : Insts)
27142 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
27143 Res |= tryToVectorize(Inst, R);
27144 return Res;
27145}
27146
27147bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
27148 BasicBlock *BB, BoUpSLP &R,
27149 bool MaxVFOnly) {
27150 if (!R.canMapToVector(IVI->getType()))
27151 return false;
27152
27153 SmallVector<Value *, 16> BuildVectorOpds;
27154 SmallVector<Value *, 16> BuildVectorInsts;
27155 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
27156 return false;
27157
27158 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
27159 R.getORE()->emit([&]() {
27160 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
27161 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
27162 "trying reduction first.";
27163 });
27164 return false;
27165 }
27166 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
 27167 // Aggregate value is unlikely to be processed in a vector register.
27168 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
27169}
27170
27171bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
27172 BasicBlock *BB, BoUpSLP &R,
27173 bool MaxVFOnly) {
27174 SmallVector<Value *, 16> BuildVectorInsts;
27175 SmallVector<Value *, 16> BuildVectorOpds;
27176 SmallVector<int> Mask;
27177 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
 27178 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
 27179 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
27180 return false;
27181
27182 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
27183 R.getORE()->emit([&]() {
27184 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
27185 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
27186 "trying reduction first.";
27187 });
27188 return false;
27189 }
27190 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
27191 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
27192}
27193
27194template <typename T>
27195static bool tryToVectorizeSequence(
 27196 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
27197 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
27198 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
27199 bool MaxVFOnly, BoUpSLP &R) {
27200 bool Changed = false;
27201 // Sort by type, parent, operands.
27202 stable_sort(Incoming, Comparator);
27203
 27204 // Try to vectorize elements based on their type.
27205 SmallVector<T *> Candidates;
 27206 SmallVector<T *> VL;
 27207 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
27208 VL.clear()) {
27209 // Look for the next elements with the same type, parent and operand
27210 // kinds.
27211 auto *I = dyn_cast<Instruction>(*IncIt);
27212 if (!I || R.isDeleted(I)) {
27213 ++IncIt;
27214 continue;
27215 }
27216 auto *SameTypeIt = IncIt;
27217 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
27218 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
27219 AreCompatible(VL, *SameTypeIt))) {
27220 auto *I = dyn_cast<Instruction>(*SameTypeIt);
27221 ++SameTypeIt;
27222 if (I && !R.isDeleted(I))
27223 VL.push_back(cast<T>(I));
27224 }
27225
27226 // Try to vectorize them.
27227 unsigned NumElts = VL.size();
27228 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
27229 << NumElts << ")\n");
27230 // The vectorization is a 3-state attempt:
 27231 // 1. Try to vectorize instructions with the same/alternate opcodes with the
 27232 // maximal register size first.
 27233 // 2. Try to vectorize remaining instructions with the same type, if
 27234 // possible. This may give better vectorization results than trying to
 27235 // vectorize only instructions with the same/alternate opcodes.
27236 // 3. Final attempt to try to vectorize all instructions with the
27237 // same/alternate ops only, this may result in some extra final
27238 // vectorization.
27239 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
 27240 // Success, start over because instructions might have been changed.
27241 Changed = true;
27242 VL.swap(Candidates);
27243 Candidates.clear();
27244 for (T *V : VL) {
27245 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
27246 Candidates.push_back(V);
27247 }
27248 } else {
27249 /// \Returns the minimum number of elements that we will attempt to
27250 /// vectorize.
27251 auto GetMinNumElements = [&R](Value *V) {
27252 unsigned EltSize = R.getVectorElementSize(V);
27253 return std::max(2U, R.getMaxVecRegSize() / EltSize);
27254 };
27255 if (NumElts < GetMinNumElements(*IncIt) &&
27256 (Candidates.empty() ||
27257 Candidates.front()->getType() == (*IncIt)->getType())) {
27258 for (T *V : VL) {
27259 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
27260 Candidates.push_back(V);
27261 }
27262 }
27263 }
27264 // Final attempt to vectorize instructions with the same types.
27265 if (Candidates.size() > 1 &&
27266 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
27267 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
 27268 // Success, start over because instructions might have been changed.
27269 Changed = true;
27270 } else if (MaxVFOnly) {
27271 // Try to vectorize using small vectors.
 27272 SmallVector<T *> VL;
 27273 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
27274 VL.clear()) {
27275 auto *I = dyn_cast<Instruction>(*It);
27276 if (!I || R.isDeleted(I)) {
27277 ++It;
27278 continue;
27279 }
27280 auto *SameTypeIt = It;
27281 while (SameTypeIt != End &&
27282 (!isa<Instruction>(*SameTypeIt) ||
27283 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
27284 AreCompatible(*SameTypeIt, *It))) {
27285 auto *I = dyn_cast<Instruction>(*SameTypeIt);
27286 ++SameTypeIt;
27287 if (I && !R.isDeleted(I))
27288 VL.push_back(cast<T>(I));
27289 }
27290 unsigned NumElts = VL.size();
27291 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
27292 /*MaxVFOnly=*/false))
27293 Changed = true;
27294 It = SameTypeIt;
27295 }
27296 }
27297 Candidates.clear();
27298 }
27299
27300 // Start over at the next instruction of a different type (or the end).
27301 IncIt = SameTypeIt;
27302 }
27303 return Changed;
27304}
27305
27306/// Compare two cmp instructions. If IsCompatibility is true, the function
27307/// returns true if the two cmps have the same/swapped predicates and
27308/// compatible corresponding operands. If IsCompatibility is false, the
27309/// function implements a strict weak ordering between two cmp instructions,
27310/// returning true if the first instruction is "less" than the second, i.e.
27311/// its predicate is less than the predicate of the second, or the operand
27312/// IDs are less than the operand IDs of the second cmp instruction.
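/// Illustrative example: in the strict-weak-ordering mode, "icmp eq i32 %a, %b"
/// orders before "icmp slt i64 %c, %d" because both operand types share the
/// same type ID but the first has the smaller scalar size (32 vs. 64 bits).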
27313template <bool IsCompatibility>
27314static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
27315 const DominatorTree &DT) {
27316 assert(isValidElementType(V->getType()) &&
27317 isValidElementType(V2->getType()) &&
27318 "Expected valid element types only.");
27319 if (V == V2)
27320 return IsCompatibility;
27321 auto *CI1 = cast<CmpInst>(V);
27322 auto *CI2 = cast<CmpInst>(V2);
27323 if (CI1->getOperand(0)->getType()->getTypeID() <
27324 CI2->getOperand(0)->getType()->getTypeID())
27325 return !IsCompatibility;
27326 if (CI1->getOperand(0)->getType()->getTypeID() >
27327 CI2->getOperand(0)->getType()->getTypeID())
27328 return false;
27329 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
 27330 CI2->getOperand(0)->getType()->getScalarSizeInBits())
 27331 return !IsCompatibility;
27332 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
 27333 CI2->getOperand(0)->getType()->getScalarSizeInBits())
 27334 return false;
27335 CmpInst::Predicate Pred1 = CI1->getPredicate();
27336 CmpInst::Predicate Pred2 = CI2->getPredicate();
 27337 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
 27338 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
 27339 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
27340 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
27341 if (BasePred1 < BasePred2)
27342 return !IsCompatibility;
27343 if (BasePred1 > BasePred2)
27344 return false;
27345 // Compare operands.
27346 bool CI1Preds = Pred1 == BasePred1;
27347 bool CI2Preds = Pred2 == BasePred1;
27348 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
27349 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
27350 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
27351 if (Op1 == Op2)
27352 continue;
27353 if (Op1->getValueID() < Op2->getValueID())
27354 return !IsCompatibility;
27355 if (Op1->getValueID() > Op2->getValueID())
27356 return false;
27357 if (auto *I1 = dyn_cast<Instruction>(Op1))
27358 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
27359 if (IsCompatibility) {
27360 if (I1->getParent() != I2->getParent())
27361 return false;
27362 } else {
27363 // Try to compare nodes with same parent.
27364 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
27365 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
27366 if (!NodeI1)
27367 return NodeI2 != nullptr;
27368 if (!NodeI2)
27369 return false;
27370 assert((NodeI1 == NodeI2) ==
27371 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27372 "Different nodes should have different DFS numbers");
27373 if (NodeI1 != NodeI2)
27374 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27375 }
27376 InstructionsState S = getSameOpcode({I1, I2}, TLI);
27377 if (S && (IsCompatibility || !S.isAltShuffle()))
27378 continue;
27379 if (IsCompatibility)
27380 return false;
27381 if (I1->getOpcode() != I2->getOpcode())
27382 return I1->getOpcode() < I2->getOpcode();
27383 }
27384 }
27385 return IsCompatibility;
27386}
27387
27388template <typename ItT>
27389bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
27390 BasicBlock *BB, BoUpSLP &R) {
27391 bool Changed = false;
27392 // Try to find reductions first.
27393 for (CmpInst *I : CmpInsts) {
27394 if (R.isDeleted(I))
27395 continue;
27396 for (Value *Op : I->operands())
27397 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
27398 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
27399 if (R.isDeleted(I))
27400 break;
27401 }
27402 }
27403 // Try to vectorize operands as vector bundles.
27404 for (CmpInst *I : CmpInsts) {
27405 if (R.isDeleted(I))
27406 continue;
27407 Changed |= tryToVectorize(I, R);
27408 }
27409 // Try to vectorize list of compares.
27410 // Sort by type, compare predicate, etc.
27411 auto CompareSorter = [&](Value *V, Value *V2) {
27412 if (V == V2)
27413 return false;
27414 return compareCmp<false>(V, V2, *TLI, *DT);
27415 };
27416
27417 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
27418 if (VL.empty() || VL.back() == V1)
27419 return true;
27420 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
27421 };
27422
 27423 SmallVector<Value *> Vals;
 27424 for (Instruction *V : CmpInsts)
27425 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
27426 Vals.push_back(V);
27427 if (Vals.size() <= 1)
27428 return Changed;
 27429 Changed |= tryToVectorizeSequence<Value>(
 27430 Vals, CompareSorter, AreCompatibleCompares,
27431 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27432 // Exclude possible reductions from other blocks.
27433 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
27434 return any_of(V->users(), [V](User *U) {
27435 auto *Select = dyn_cast<SelectInst>(U);
27436 return Select &&
27437 Select->getParent() != cast<Instruction>(V)->getParent();
27438 });
27439 });
27440 if (ArePossiblyReducedInOtherBlock)
27441 return false;
27442 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27443 },
27444 /*MaxVFOnly=*/true, R);
27445 return Changed;
27446}
27447
27448bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
27449 BasicBlock *BB, BoUpSLP &R) {
27451 "This function only accepts Insert instructions");
27452 bool OpsChanged = false;
27453 SmallVector<WeakTrackingVH> PostponedInsts;
27454 for (auto *I : reverse(Instructions)) {
27455 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
27456 if (R.isDeleted(I) || isa<CmpInst>(I))
27457 continue;
27458 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27459 OpsChanged |=
27460 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
27461 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27462 OpsChanged |=
27463 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
27464 }
27465 // pass2 - try to vectorize reductions only
27466 if (R.isDeleted(I))
27467 continue;
27468 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
27469 if (R.isDeleted(I) || isa<CmpInst>(I))
27470 continue;
27471 // pass3 - try to match and vectorize a buildvector sequence.
27472 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
27473 OpsChanged |=
27474 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
27475 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
27476 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
27477 /*MaxVFOnly=*/false);
27478 }
27479 }
27480 // Now try to vectorize postponed instructions.
27481 OpsChanged |= tryToVectorize(PostponedInsts, R);
27482
27483 Instructions.clear();
27484 return OpsChanged;
27485}
27486
27487bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
27488 bool Changed = false;
27489 SmallVector<Value *, 4> Incoming;
27490 SmallPtrSet<Value *, 16> VisitedInstrs;
 27491 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
 27492 // node. This helps to identify the chains that can be vectorized in a
 27493 // better way.
27494 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
27495 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
 27496 assert(isValidElementType(V1->getType()) &&
 27497 isValidElementType(V2->getType()) &&
27498 "Expected vectorizable types only.");
27499 if (V1 == V2)
27500 return false;
27501 // It is fine to compare type IDs here, since we expect only vectorizable
 27502 // types, like ints, floats and pointers; we don't care about other types.
27503 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
27504 return true;
27505 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
27506 return false;
27507 if (V1->getType()->getScalarSizeInBits() <
27508 V2->getType()->getScalarSizeInBits())
27509 return true;
27510 if (V1->getType()->getScalarSizeInBits() >
27511 V2->getType()->getScalarSizeInBits())
27512 return false;
27513 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27514 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27515 if (Opcodes1.size() < Opcodes2.size())
27516 return true;
27517 if (Opcodes1.size() > Opcodes2.size())
27518 return false;
27519 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27520 {
27521 // Instructions come first.
27522 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
27523 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
27524 if (I1 && I2) {
27525 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27526 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27527 if (!NodeI1)
27528 return NodeI2 != nullptr;
27529 if (!NodeI2)
27530 return false;
27531 assert((NodeI1 == NodeI2) ==
27532 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27533 "Different nodes should have different DFS numbers");
27534 if (NodeI1 != NodeI2)
27535 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27536 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
27537 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27538 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
27539 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
27540 if (!E1 || !E2)
27541 continue;
27542
27543 // Sort on ExtractElementInsts primarily by vector operands. Prefer
27544 // program order of the vector operands.
27545 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
27546 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
27547 if (V1 != V2) {
27548 if (V1 && !V2)
27549 return true;
27550 if (!V1 && V2)
27551 return false;
 27552 const DomTreeNodeBase<BasicBlock> *NodeI1 =
 27553 DT->getNode(V1->getParent());
 27554 const DomTreeNodeBase<BasicBlock> *NodeI2 =
 27555 DT->getNode(V2->getParent());
27556 if (!NodeI1)
27557 return NodeI2 != nullptr;
27558 if (!NodeI2)
27559 return false;
27560 assert((NodeI1 == NodeI2) ==
27561 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27562 "Different nodes should have different DFS numbers");
27563 if (NodeI1 != NodeI2)
27564 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27565 return V1->comesBefore(V2);
27566 }
27567 // If we have the same vector operand, try to sort by constant
27568 // index.
27569 std::optional<unsigned> Id1 = getExtractIndex(E1);
27570 std::optional<unsigned> Id2 = getExtractIndex(E2);
27571 // Bring constants to the top
27572 if (Id1 && !Id2)
27573 return true;
27574 if (!Id1 && Id2)
27575 return false;
27576 // First elements come first.
27577 if (Id1 && Id2)
27578 return *Id1 < *Id2;
27579
27580 continue;
27581 }
27582 if (I1->getOpcode() == I2->getOpcode())
27583 continue;
27584 return I1->getOpcode() < I2->getOpcode();
27585 }
27586 if (I1)
27587 return true;
27588 if (I2)
27589 return false;
27590 }
27591 {
27592 // Non-undef constants come next.
27593 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
27594 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
27595 if (C1 && C2)
27596 continue;
27597 if (C1)
27598 return true;
27599 if (C2)
27600 return false;
27601 }
27602 bool U1 = isa<UndefValue>(Opcodes1[I]);
27603 bool U2 = isa<UndefValue>(Opcodes2[I]);
27604 {
27605 // Non-constant non-instructions come next.
27606 if (!U1 && !U2) {
27607 auto ValID1 = Opcodes1[I]->getValueID();
27608 auto ValID2 = Opcodes2[I]->getValueID();
27609 if (ValID1 == ValID2)
27610 continue;
27611 if (ValID1 < ValID2)
27612 return true;
27613 if (ValID1 > ValID2)
27614 return false;
27615 }
27616 if (!U1)
27617 return true;
27618 if (!U2)
27619 return false;
27620 }
27621 // Undefs come last.
27622 assert(U1 && U2 && "The only thing left should be undef & undef.");
27623 }
27624 return false;
27625 };
27626 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
27627 Value *V1) {
27628 if (VL.empty() || V1 == VL.back())
27629 return true;
27630 Value *V2 = VL.back();
27631 if (V1->getType() != V2->getType())
27632 return false;
27633 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27634 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27635 if (Opcodes1.size() != Opcodes2.size())
27636 return false;
27637 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27638 // Undefs are compatible with any other value.
27639 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
27640 continue;
27641 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
27642 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
27643 if (R.isDeleted(I1) || R.isDeleted(I2))
27644 return false;
27645 if (I1->getParent() != I2->getParent())
27646 return false;
27647 if (getSameOpcode({I1, I2}, *TLI))
27648 continue;
27649 return false;
27650 }
27651 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
27652 continue;
27653 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
27654 return false;
27655 }
27656 return true;
27657 };
27658
27659 bool HaveVectorizedPhiNodes = false;
27660 do {
27661 // Collect the incoming values from the PHIs.
27662 Incoming.clear();
27663 for (Instruction &I : *BB) {
27664 auto *P = dyn_cast<PHINode>(&I);
27665 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
27666 break;
27667
27668 // No need to analyze deleted, vectorized and non-vectorizable
27669 // instructions.
27670 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
27671 isValidElementType(P->getType()))
27672 Incoming.push_back(P);
27673 }
27674
27675 if (Incoming.size() <= 1)
27676 break;
27677
27678 // Find the corresponding non-phi nodes for better matching when trying to
27679 // build the tree.
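    // E.g. (illustrative) for
    //   %p = phi i32 [ %add, %bb1 ], [ %q, %bb2 ]
    //   %q = phi i32 [ %sub, %bb3 ], [ 0, %bb4 ]
    // the operands recorded for %p are {%add, %sub, 0}: incoming phi nodes are
    // looked through and only the non-phi values are collected.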
27680 for (Value *V : Incoming) {
27681 SmallVectorImpl<Value *> &Opcodes =
27682 PHIToOpcodes.try_emplace(V).first->getSecond();
27683 if (!Opcodes.empty())
27684 continue;
27685 SmallVector<Value *, 4> Nodes(1, V);
27686 SmallPtrSet<Value *, 4> Visited;
27687 while (!Nodes.empty()) {
27688 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
27689 if (!Visited.insert(PHI).second)
27690 continue;
27691 for (Value *V : PHI->incoming_values()) {
27692 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
27693 Nodes.push_back(PHI1);
27694 continue;
27695 }
27696 Opcodes.emplace_back(V);
27697 }
27698 }
27699 }
27700
27701 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
27702 Incoming, PHICompare, AreCompatiblePHIs,
27703 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27704 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27705 },
27706 /*MaxVFOnly=*/true, R);
27707 Changed |= HaveVectorizedPhiNodes;
27708 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
27709 auto *PHI = dyn_cast<PHINode>(P.first);
27710 return !PHI || R.isDeleted(PHI);
27711 }))
27712 PHIToOpcodes.clear();
27713 VisitedInstrs.insert_range(Incoming);
27714 } while (HaveVectorizedPhiNodes);
27715
27716 VisitedInstrs.clear();
27717
27718 InstSetVector PostProcessInserts;
27719 SmallSetVector<CmpInst *, 8> PostProcessCmps;
27720 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
27721 // also vectorizes `PostProcessCmps`.
27722 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
27723 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
27724 if (VectorizeCmps) {
27725 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
27726 PostProcessCmps.clear();
27727 }
27728 PostProcessInserts.clear();
27729 return Changed;
27730 };
27731 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
27732 auto IsInPostProcessInstrs = [&](Instruction *I) {
27733 if (auto *Cmp = dyn_cast<CmpInst>(I))
27734 return PostProcessCmps.contains(Cmp);
 27735 return isa<InsertElementInst, InsertValueInst>(I) &&
 27736 PostProcessInserts.contains(I);
27737 };
 27738 // Returns true if `I` has no users and is a suitable root: either a
 27739 // void-typed instruction (such as a terminator or a store) or a call/invoke
 27740 // whose return value is ignored.
27741 auto HasNoUsers = [](Instruction *I) {
27742 return I->use_empty() &&
27743 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
27744 };
27745 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
 27746 // Skip instructions with scalable types. The number of elements is unknown
 27747 // at compile time for scalable types.
27748 if (isa<ScalableVectorType>(It->getType()))
27749 continue;
27750
 27751 // Skip instructions marked for deletion.
27752 if (R.isDeleted(&*It))
27753 continue;
 27754 // We may go through BB multiple times, so skip the ones we have already checked.
27755 if (!VisitedInstrs.insert(&*It).second) {
27756 if (HasNoUsers(&*It) &&
27757 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
27758 // We would like to start over since some instructions are deleted
 27759 // and the iterator may become invalid.
27760 Changed = true;
27761 It = BB->begin();
27762 E = BB->end();
27763 }
27764 continue;
27765 }
27766
27767 // Try to vectorize reductions that use PHINodes.
27768 if (PHINode *P = dyn_cast<PHINode>(It)) {
27769 // Check that the PHI is a reduction PHI.
27770 if (P->getNumIncomingValues() == 2) {
27771 // Try to match and vectorize a horizontal reduction.
27772 Instruction *Root = getReductionInstr(DT, P, BB, LI);
27773 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
27774 Changed = true;
27775 It = BB->begin();
27776 E = BB->end();
27777 continue;
27778 }
27779 }
27780 // Try to vectorize the incoming values of the PHI, to catch reductions
27781 // that feed into PHIs.
27782 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
27783 // Skip if the incoming block is the current BB for now. Also, bypass
27784 // unreachable IR for efficiency and to avoid crashing.
27785 // TODO: Collect the skipped incoming values and try to vectorize them
27786 // after processing BB.
27787 if (BB == P->getIncomingBlock(I) ||
27788 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
27789 continue;
27790
27791 // Postponed instructions should not be vectorized here, delay their
27792 // vectorization.
27793 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
27794 PI && !IsInPostProcessInstrs(PI)) {
27795 bool Res =
27796 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
27797 Changed |= Res;
27798 if (Res && R.isDeleted(P)) {
27799 It = BB->begin();
27800 E = BB->end();
27801 break;
27802 }
27803 }
27804 }
27805 continue;
27806 }
27807
27808 if (HasNoUsers(&*It)) {
27809 bool OpsChanged = false;
27810 auto *SI = dyn_cast<StoreInst>(It);
27811 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
27812 if (SI) {
27813 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
 27814 // Try to vectorize the chain in the store, if this is the only store to the
27815 // address in the block.
 27816 // TODO: This is just a temporary solution to save compile time. Need
27817 // to investigate if we can safely turn on slp-vectorize-hor-store
27818 // instead to allow lookup for reduction chains in all non-vectorized
27819 // stores (need to check side effects and compile time).
27820 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
27821 SI->getValueOperand()->hasOneUse();
27822 }
27823 if (TryToVectorizeRoot) {
27824 for (auto *V : It->operand_values()) {
27825 // Postponed instructions should not be vectorized here, delay their
27826 // vectorization.
27827 if (auto *VI = dyn_cast<Instruction>(V);
27828 VI && !IsInPostProcessInstrs(VI))
27829 // Try to match and vectorize a horizontal reduction.
27830 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
27831 }
27832 }
27833 // Start vectorization of post-process list of instructions from the
27834 // top-tree instructions to try to vectorize as many instructions as
27835 // possible.
27836 OpsChanged |=
27837 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
27838 if (OpsChanged) {
27839 // We would like to start over since some instructions are deleted
 27840 // and the iterator may become invalid.
27841 Changed = true;
27842 It = BB->begin();
27843 E = BB->end();
27844 continue;
27845 }
27846 }
27847
 27848 if (isa<InsertElementInst, InsertValueInst>(It))
 27849 PostProcessInserts.insert(&*It);
27850 else if (isa<CmpInst>(It))
27851 PostProcessCmps.insert(cast<CmpInst>(&*It));
27852 }
27853
27854 return Changed;
27855}
27856
27857bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
27858 auto Changed = false;
27859 for (auto &Entry : GEPs) {
27860 // If the getelementptr list has fewer than two elements, there's nothing
27861 // to do.
27862 if (Entry.second.size() < 2)
27863 continue;
27864
27865 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
27866 << Entry.second.size() << ".\n");
27867
27868 // Process the GEP list in chunks suitable for the target's supported
27869 // vector size. If a vector register can't hold 1 element, we are done. We
27870 // are trying to vectorize the index computations, so the maximum number of
27871 // elements is based on the size of the index expression, rather than the
27872 // size of the GEP itself (the target's pointer size).
27873 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
27874 return !R.isDeleted(GEP);
27875 });
27876 if (It == Entry.second.end())
27877 continue;
27878 unsigned MaxVecRegSize = R.getMaxVecRegSize();
27879 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
27880 if (MaxVecRegSize < EltSize)
27881 continue;
27882
27883 unsigned MaxElts = MaxVecRegSize / EltSize;
27884 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
27885 auto Len = std::min<unsigned>(BE - BI, MaxElts);
27886 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
27887
 27888 // Initialize a set of candidate getelementptrs. Note that we use a
27889 // SetVector here to preserve program order. If the index computations
27890 // are vectorizable and begin with loads, we want to minimize the chance
27891 // of having to reorder them later.
27892 SetVector<Value *> Candidates(llvm::from_range, GEPList);
27893
27894 // Some of the candidates may have already been vectorized after we
 27895 // initially collected them or their index is optimized to a constant value.
27896 // If so, they are marked as deleted, so remove them from the set of
27897 // candidates.
27898 Candidates.remove_if([&R](Value *I) {
27899 return R.isDeleted(cast<Instruction>(I)) ||
27900 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
27901 });
27902
27903 // Remove from the set of candidates all pairs of getelementptrs with
27904 // constant differences. Such getelementptrs are likely not good
27905 // candidates for vectorization in a bottom-up phase since one can be
27906 // computed from the other. We also ensure all candidate getelementptr
27907 // indices are unique.
27908 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
27909 auto *GEPI = GEPList[I];
27910 if (!Candidates.count(GEPI))
27911 continue;
27912 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
27913 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
27914 auto *GEPJ = GEPList[J];
27915 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
27916 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
27917 Candidates.remove(GEPI);
27918 Candidates.remove(GEPJ);
27919 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
27920 Candidates.remove(GEPJ);
27921 }
27922 }
27923 }
27924
27925 // We break out of the above computation as soon as we know there are
27926 // fewer than two candidates remaining.
27927 if (Candidates.size() < 2)
27928 continue;
27929
27930 // Add the single, non-constant index of each candidate to the bundle. We
27931 // ensured the indices met these constraints when we originally collected
27932 // the getelementptrs.
27933 SmallVector<Value *, 16> Bundle(Candidates.size());
27934 auto BundleIndex = 0u;
27935 for (auto *V : Candidates) {
27936 auto *GEP = cast<GetElementPtrInst>(V);
27937 auto *GEPIdx = GEP->idx_begin()->get();
27938 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
27939 Bundle[BundleIndex++] = GEPIdx;
27940 }
27941
27942 // Try and vectorize the indices. We are currently only interested in
27943 // gather-like cases of the form:
27944 //
27945 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
27946 //
27947 // where the loads of "a", the loads of "b", and the subtractions can be
27948 // performed in parallel. It's likely that detecting this pattern in a
27949 // bottom-up phase will be simpler and less costly than building a
27950 // full-blown top-down phase beginning at the consecutive loads.
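      // In the example above (illustrative), the Bundle vectorized by the call
      // below holds the index values "a[0] - b[0]", "a[1] - b[1]", ..., i.e.
      // the single non-constant index of each remaining candidate
      // getelementptr.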
27951 Changed |= tryToVectorizeList(Bundle, R);
27952 }
27953 }
27954 return Changed;
27955}
27956
27957bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
27958 bool Changed = false;
 27959 // Sort by type, base pointers and value operands. Value operands must be
27960 // compatible (have the same opcode, same parent), otherwise it is
27961 // definitely not profitable to try to vectorize them.
27962 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
27963 if (V->getValueOperand()->getType()->getTypeID() <
27964 V2->getValueOperand()->getType()->getTypeID())
27965 return true;
27966 if (V->getValueOperand()->getType()->getTypeID() >
27967 V2->getValueOperand()->getType()->getTypeID())
27968 return false;
27969 if (V->getPointerOperandType()->getTypeID() <
27970 V2->getPointerOperandType()->getTypeID())
27971 return true;
27972 if (V->getPointerOperandType()->getTypeID() >
27973 V2->getPointerOperandType()->getTypeID())
27974 return false;
27975 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
27976 V2->getValueOperand()->getType()->getScalarSizeInBits())
27977 return true;
27978 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
27979 V2->getValueOperand()->getType()->getScalarSizeInBits())
27980 return false;
27981 // UndefValues are compatible with all other values.
27982 auto *I1 = dyn_cast<Instruction>(V->getValueOperand());
27983 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
27984 if (I1 && I2) {
27985 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27986 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27987 assert(NodeI1 && "Should only process reachable instructions");
27988 assert(NodeI2 && "Should only process reachable instructions");
27989 assert((NodeI1 == NodeI2) ==
27990 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27991 "Different nodes should have different DFS numbers");
27992 if (NodeI1 != NodeI2)
27993 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27994 return I1->getOpcode() < I2->getOpcode();
27995 }
27996 if (I1 && !I2)
27997 return true;
27998 if (!I1 && I2)
27999 return false;
28000 return V->getValueOperand()->getValueID() <
28001 V2->getValueOperand()->getValueID();
28002 };
28003
28004 bool SameParent = true;
28005 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
28006 if (VL.empty()) {
28007 SameParent = true;
28008 return true;
28009 }
28010 StoreInst *V2 = VL.back();
28011 if (V1 == V2)
28012 return true;
28013 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
28014 return false;
28015 if (V1->getPointerOperandType() != V2->getPointerOperandType())
28016 return false;
28017 // Undefs are compatible with any other value.
28018 if (isa<UndefValue>(V1->getValueOperand()) ||
 28019 isa<UndefValue>(V2->getValueOperand()))
 28020 return true;
28021 if (isa<Constant>(V1->getValueOperand()) &&
 28022 isa<Constant>(V2->getValueOperand()))
 28023 return true;
 28024 // Check if the operands of the stores can be vectorized. They can be
 28025 // vectorized if they have compatible operands, or if their operands can be
 28026 // vectorized as copyables.
28027 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
28028 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
28029 if (I1 || I2) {
28030 // Accept only tail-following non-compatible values for now.
28031 // TODO: investigate if it is possible to vectorize incompatible values,
28032 // if the copyables are first in the list.
28033 if (I1 && !I2)
28034 return false;
28035 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
28036 SmallVector<Value *> NewVL(VL.size() + 1);
28037 for (auto [SI, V] : zip(VL, NewVL))
28038 V = SI->getValueOperand();
28039 NewVL.back() = V1->getValueOperand();
28040 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
28041 InstructionsState S = Analysis.buildInstructionsState(
28042 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
28043 /*SkipSameCodeCheck=*/!SameParent);
28044 if (S)
28045 return true;
28046 if (!SameParent)
28047 return false;
28048 }
28049 return V1->getValueOperand()->getValueID() ==
28050 V2->getValueOperand()->getValueID();
28051 };
28052
28053 // Attempt to sort and vectorize each of the store-groups.
28054 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
28055 for (auto &Pair : Stores) {
28056 if (Pair.second.size() < 2)
28057 continue;
28058
28059 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
28060 << Pair.second.size() << ".\n");
28061
28062 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
28063 continue;
28064
 28065 // Reverse the stores to do bottom-to-top analysis. This is important if
 28066 // there are several stores to the same address: in this case we need to
 28067 // follow the store order (reversed, to meet the memory dependencies).
28068 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
28069 Pair.second.rend());
 28070 Changed |= tryToVectorizeSequence<StoreInst>(
 28071 ReversedStores, StoreSorter, AreCompatibleStores,
28072 [&](ArrayRef<StoreInst *> Candidates, bool) {
28073 return vectorizeStores(Candidates, R, Attempted);
28074 },
28075 /*MaxVFOnly=*/false, R);
28076 }
28077 return Changed;
28078}
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1421
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1677
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1411
void negate()
Negate this APInt in place.
Definition APInt.h:1483
unsigned logBase2() const
Definition APInt.h:1776
void setAllBits()
Set every bit to 1.
Definition APInt.h:1334
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1382
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:178
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:219
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:195
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:157
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:488
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:491
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:718
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static bool shouldExecute(CounterInfo &Counter)
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:863
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2554
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:546
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:574
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2620
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2576
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1711
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2249
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2411
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1654
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1440
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:154
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
T & front() const
front - Get the first element.
Definition ArrayRef.h:349
iterator end() const
Definition ArrayRef.h:343
iterator begin() const
Definition ArrayRef.h:342
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:91
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:132
void insert_range(Range &&R)
Definition SetVector.h:176
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:94
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
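Taken together, these TargetTransformInfo queries are how an SLP-style cost comparison is usually phrased: one vector operation versus VF scalar operations plus the overhead of assembling the vector operands. The helper below is a hedged illustration only (its name and structure are not taken from this pass), written against the signatures listed above and assuming the headers this file already includes.
// Illustrative helper: a negative result means the vector form is cheaper.
static InstructionCost addBundleCostDelta(const TargetTransformInfo &TTI,
                                          FixedVectorType *VecTy) {
  constexpr auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  // One vector add of type VecTy.
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  // VF scalar adds of the element type.
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy->getElementType(),
                                 CostKind);
  ScalarCost *= VecTy->getNumElements();
  // Building the vector operands from scalars is modeled as insertelements.
  APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
  InstructionCost GatherCost = TTI.getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  return VecCost + GatherCost - ScalarCost;
}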
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:180
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:259
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
iterator_range< value_op_iterator > operand_values()
Definition User.h:291
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
bool hasUseList() const
Check if this Value has a use-list.
Definition Value.h:344
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
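The use-list queries above are what a vectorizer leans on when rewriting scalar users. The snippet below is a generic hedged sketch, not code from this pass, of reconstructing a scalar from a vectorized value and redirecting its remaining uses; Builder (an IRBuilder), Vec, and Lane are assumed to exist in the caller.
// If the scalar still has users outside the vectorized tree, rebuild it from
// the vector value and point all remaining uses at the extract.
if (!Scalar->use_empty()) {
  Value *Ex = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
  Ex->takeName(Scalar);            // keep the original name for readability
  Scalar->replaceAllUsesWith(Ex);  // all users now see the extracted lane
}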
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
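A small hedged sketch of how these VectorType queries combine when forming the vector type for a bundle of scalars; the helper name is illustrative only and assumes fixed-width vectorization.
// Hypothetical helper: the fixed vector type for VF copies of ScalarTy, or
// nullptr if the element type cannot be put into a vector at all.
static VectorType *getBundleVectorType(Type *ScalarTy, unsigned VF) {
  if (!VectorType::isValidElementType(ScalarTy))
    return nullptr;
  return VectorType::get(ScalarTy, ElementCount::getFixed(VF));
}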
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isReducedBitcastRoot() const
Returns the opcode of the root node, or 0, if the root node is gather.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
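Read together, these BoUpSLP entry points suggest the usual driver sequence for one candidate bundle: build the tree, reorder, transform, compute external uses and minimum bitwidths, cost the tree, and vectorize if profitable. The sketch below is a condensed illustration of that sequence under those assumptions, not a verbatim excerpt from the pass; Roots, UserIgnoreList, and the threshold comparison are placeholders.
BoUpSLP R(F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE);
R.buildTree(Roots, UserIgnoreList);
if (R.isTreeTinyAndNotFullyVectorizable())
  return false;
R.reorderTopToBottom();
R.reorderBottomToTop();
R.transformNodes();
R.buildExternalUses();
R.computeMinimumValueSizes();
InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
InstructionCost Cost = R.getTreeCost(TreeCost);
if (Cost >= -SLPCostThreshold)   // placeholder profitability check
  return false;
R.vectorizeTree();
return true;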
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
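A brief hedged example of the matcher combinators listed above, in the style used throughout this file: recognizing an integer add whose left operand is a zero-extended load. Ptr and Other are just capture slots; the surrounding handling is elided.
using namespace llvm::PatternMatch;
Value *Ptr = nullptr, *Other = nullptr;
// Matches: add (zext (load Ptr)), Other.  Ptr binds the load's pointer
// operand; Other binds the remaining add operand.
if (match(V, m_Add(m_ZExt(m_Load(m_Value(Ptr))), m_Value(Other)))) {
  // ... widened-reduction-style handling would go here ...
}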
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< CodeNode * > Code
Definition RDFGraph.h:388
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
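For example, a horizontal integer-add reduction over an already vectorized value can be emitted with this helper; Builder (an IRBuilderBase) and VecVal are assumed to be in scope.
// Reduces all lanes of VecVal with integer addition; RecurKind::Add maps to
// the llvm.vector.reduce.add intrinsic for integer vectors.
Value *Rdx = createSimpleReduction(Builder, VecVal, RecurKind::Add);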
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2170
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
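A short sketch of these range helpers in the idiom this file uses: checking a bundle for a uniform opcode and walking it together with lane indices. VL and Opcode0 are placeholders.
// All values in the bundle share one opcode?
bool SameOpcode = all_of(VL, [&](Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  return I && I->getOpcode() == Opcode0;
});
// Visit each lane together with its index.
for (auto [Lane, V] : enumerate(VL))
  LLVM_DEBUG(dbgs() << "Lane " << Lane << ": " << *V << "\n");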
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1731
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2303
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:2029
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2016
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
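As a concrete illustration of the mask helpers, based on their documented behavior rather than on this pass: createStrideMask(0, 2, 4) yields <0, 2, 4, 6>, and createReplicatedMask(2, 4) yields <0, 0, 1, 1, 2, 2, 3, 3>. A hedged usage sketch, assuming Builder and a source vector Vec with at least eight lanes:
// Pick every other lane of Vec starting at lane 0.
SmallVector<int, 16> Mask =
    createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
Value *Strided = Builder.CreateShuffleVector(Vec, Mask);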
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1775
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:361
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
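A hedged sketch of the typical consecutive-access test built on this helper (mirroring, but not copied from, isConsecutiveAccess): two loads of the same element type are consecutive when their pointer difference is exactly one element. LoadA and LoadB are LoadInst pointers; DL and SE are the DataLayout and ScalarEvolution used by the pass.
std::optional<int64_t> Diff =
    getPointersDiff(ElemTy, LoadA->getPointerOperand(), ElemTy,
                    LoadB->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
bool Consecutive = Diff && *Diff == 1;  // LoadB immediately follows LoadA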
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1968
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Add
Sum of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
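A minimal sketch of the narrowing argument behind computeMinimumValueSizes, under the assumption of signed semantics: if the top bits are all copies of the sign bit, the value fits in a narrower integer type. V is a Value pointer and DL the DataLayout; the rounding choice is an assumption of this sketch.
unsigned OrigBits = V->getType()->getScalarSizeInBits();
unsigned SignBits = ComputeNumSignBits(V, DL);
// Keep one sign bit and drop the redundant copies above it; round up to a
// power of two so the narrowed type is more likely to be legal.
uint64_t MinBits = PowerOf2Ceil(std::max(1u, OrigBits - SignBits + 1));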
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1883
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
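These are the pass's new-pass-manager entry points. A hedged sketch of scheduling the pass explicitly, assuming the public SLPVectorizerPass name declared in llvm/Transforms/Vectorize/SLPVectorizer.h:
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
// Within an existing pipeline-construction context:
FunctionPassManager FPM;
FPM.addPass(SLPVectorizerPass());
// FPM is then adapted and run through the usual PassBuilder plumbing.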
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1437
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1446
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)