1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
127 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
150 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead but, unlike
186// the similar limit for operand ordering, this is used less frequently, so the
187// impact of a higher value is less noticeable.
188static cl::opt<int> RootLookAheadMaxDepth(
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit of the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important thing to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
254 if (SLPReVec && isa<FixedVectorType>(Ty))
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
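// Illustrative examples (derived from the predicate above): i32, float, ptr
// and half are valid element types, while x86_fp80 and ppc_fp128 are rejected.
// Under -slp-revec, a fixed vector such as <4 x i32> is accepted because the
// check is applied to its scalar type i32.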
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand; for a Cmp - the type of the compare
262/// operands; and for an insertelement - the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
264static Type *getValueType(Value *V) {
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
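// Illustrative examples: for "store i64 %v, ptr %p" this returns i64, for
// "icmp eq i32 %a, %b" it returns i32, and for
// "insertelement <4 x float> %vec, float %x, i32 0" it returns float (the type
// of the inserted scalar). For any other value, V->getType() is returned.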
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
288
289/// Returns the number of elements of the given type \p Ty, not less than
290/// \p Sz, which forms a type that \p TTI splits into whole vector types
291/// during legalization.
292static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
302
303/// Returns the number of elements of the given type \p Ty, not greater than
304/// \p Sz, which forms a type that \p TTI splits into whole vector types
305/// during legalization.
306static unsigned
307getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
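// Illustrative example (same assumed target as above, <7 x i32> -> 2 parts):
// for Ty = i32 and Sz = 7, RegVF = bit_ceil(ceil(7 / 2)) = 4, so the result is
// (7 / 4) * 4 = 4 elements.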
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323 // The ShuffleBuilder implementation use shufflevector to splat an "element".
324 // But the element have different meaning for SLP (scalar) and REVEC
325 // (vector). We need to expand Mask into masks which shufflevector can use
326 // directly.
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
335
336/// \returns the number of groups of shufflevector
337/// A group has the following features:
338/// 1. All values in a group are shufflevectors.
339/// 2. The mask of each shufflevector is an extract-subvector mask.
340/// 3. Together, the masks of the shufflevectors use all elements of the source.
341/// e.g., it is 1 group (%0)
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// it is 2 groups (%3 and %4)
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// it is 0 group
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
361 if (VL.empty())
362 return 0;
363 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
432 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
438static bool isVectorLikeInstWithConstOps(Value *V) {
439 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
440 !isa<ExtractValueInst, UndefValue>(V))
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
447 if (isa<ExtractElementInst>(I))
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns correct remaining number of elements, considering total amount \p
461/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
462/// and current register (part) \p Part.
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
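// Illustrative example: for Size = 6 and NumParts = 2, getPartNumElems returns
// min(6, bit_ceil(3)) = 4, and getNumElems yields 4 elements for part 0 and
// the remaining 2 elements for part 1.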
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
482static bool allSameBlock(ArrayRef<Value *> VL) {
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
486 Instruction *I0 = cast<Instruction>(*It);
487 if (all_of(VL, isVectorLikeInstWithConstOps))
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions).
506static bool allConstant(ArrayRef<Value *> VL) {
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
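// Illustrative examples: {%x, undef, %x} is a splat, {%x, %y} is not, and a
// list consisting only of UndefValues is not considered a splat (no defined
// value to broadcast).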
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
540static bool isCommutative(Instruction *I, Value *ValWithUses) {
541 if (auto *Cmp = dyn_cast<CmpInst>(I))
542 return Cmp->isCommutative();
543 if (auto *BO = dyn_cast<BinaryOperator>(I))
544 return BO->isCommutative() ||
545 (BO->getOpcode() == Instruction::Sub &&
546 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
547 all_of(
548 ValWithUses->uses(),
549 [](const Use &U) {
550 // Commutative, if icmp eq/ne sub, 0
551 CmpPredicate Pred;
552 if (match(U.getUser(),
553 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
554 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
555 return true;
556 // Commutative, if abs(sub nsw, true) or abs(sub, false).
557 ConstantInt *Flag;
558 return match(U.getUser(),
559 m_Intrinsic<Intrinsic::abs>(
560 m_Specific(U.get()), m_ConstantInt(Flag))) &&
561 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
562 Flag->isOne());
563 })) ||
564 (BO->getOpcode() == Instruction::FSub &&
565 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
566 all_of(ValWithUses->uses(), [](const Use &U) {
567 return match(U.getUser(),
568 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
569 }));
570 return I->isCommutative();
571}
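// Illustrative example: "%d = sub i32 %a, %b" is treated as commutative here
// when every use is of the form "icmp eq i32 %d, 0" or "icmp ne i32 %d, 0",
// since such comparisons against zero do not depend on the operand order of
// the subtraction; likewise an fsub whose only users are calls to llvm.fabs.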
572
573/// This is a helper function to check whether \p I is commutative.
574/// This is a convenience wrapper that calls the two-parameter version of
575/// isCommutative with the same instruction for both parameters. This is
576/// the common case where the instruction being checked for commutativity
577/// is the same as the instruction whose uses are analyzed for special
578/// patterns (see the two-parameter version above for details).
579/// \param I The instruction to check for commutativity
580/// \returns true if the instruction is commutative, false otherwise
581static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
582
583/// \returns number of operands of \p I, considering commutativity. Returns 2
584/// for commutative intrinsics.
585/// \param I The instruction to check for commutativity
588 // IntrinsicInst::isCommutative returns true if swapping the first "two"
589 // arguments to the intrinsic produces the same result.
590 constexpr unsigned IntrinsicNumOperands = 2;
591 return IntrinsicNumOperands;
592 }
593 return I->getNumOperands();
594}
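// Illustrative example: for a call to llvm.umax.i32 (a commutative intrinsic)
// this returns 2, the number of swappable arguments, rather than the call's
// full operand count, which also includes the callee operand.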
595
596template <typename T>
597static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
598 unsigned Offset) {
599 static_assert(std::is_same_v<T, InsertElementInst> ||
600 std::is_same_v<T, ExtractElementInst>,
601 "unsupported T");
602 int Index = Offset;
603 if (const auto *IE = dyn_cast<T>(Inst)) {
604 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
605 if (!VT)
606 return std::nullopt;
607 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
608 if (!CI)
609 return std::nullopt;
610 if (CI->getValue().uge(VT->getNumElements()))
611 return std::nullopt;
612 Index *= VT->getNumElements();
613 Index += CI->getZExtValue();
614 return Index;
615 }
616 return std::nullopt;
617}
618
619/// \returns inserting or extracting index of InsertElement, ExtractElement or
620/// InsertValue instruction, using Offset as base offset for index.
621/// \returns std::nullopt if the index is not an immediate.
622static std::optional<unsigned> getElementIndex(const Value *Inst,
623 unsigned Offset = 0) {
624 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
625 return Index;
626 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
627 return Index;
628
629 int Index = Offset;
630
631 const auto *IV = dyn_cast<InsertValueInst>(Inst);
632 if (!IV)
633 return std::nullopt;
634
635 Type *CurrentType = IV->getType();
636 for (unsigned I : IV->indices()) {
637 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
638 Index *= ST->getNumElements();
639 CurrentType = ST->getElementType(I);
640 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
641 Index *= AT->getNumElements();
642 CurrentType = AT->getElementType();
643 } else {
644 return std::nullopt;
645 }
646 Index += I;
647 }
648 return Index;
649}
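// Illustrative example: for "insertvalue [2 x %pair] %agg, i32 %v, 1, 0",
// where %pair is a struct with two elements, the flattened index is
// 1 * 2 + 0 = 2 (with Offset = 0). A non-constant insert/extract index yields
// std::nullopt.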
650
651/// \returns true if all of the values in \p VL use the same opcode.
652/// For comparison instructions, also checks if predicates match.
653/// PoisonValues are considered matching.
654/// Interchangeable instructions are not considered.
656 auto *It = find_if(VL, IsaPred<Instruction>);
657 if (It == VL.end())
658 return true;
659 Instruction *MainOp = cast<Instruction>(*It);
660 unsigned Opcode = MainOp->getOpcode();
661 bool IsCmpOp = isa<CmpInst>(MainOp);
662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
664 return std::all_of(It, VL.end(), [&](Value *V) {
665 if (auto *CI = dyn_cast<CmpInst>(V))
666 return BasePred == CI->getPredicate();
667 if (auto *I = dyn_cast<Instruction>(V))
668 return I->getOpcode() == Opcode;
669 return isa<PoisonValue>(V);
670 });
671}
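// Illustrative examples: {add, add, poison} satisfies this check, {add, sub}
// does not, and two icmp instructions only match if they use the same
// predicate.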
672
673namespace {
674/// Specifies the way the mask should be analyzed for undefs/poisonous elements
675/// in the shuffle mask.
676enum class UseMask {
677 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
678 ///< check for the mask elements for the first argument (mask
679 ///< indices are in range [0:VF)).
680 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
681 ///< for the mask elements for the second argument (mask indices
682 ///< are in range [VF:2*VF))
683 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
684 ///< future shuffle elements and mark them as ones as being used
685 ///< in future. Non-undef elements are considered as unused since
686 ///< they're already marked as used in the mask.
687};
688} // namespace
689
690/// Prepares a use bitset for the given mask either for the first argument or
691/// for the second.
692static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
693 UseMask MaskArg) {
694 SmallBitVector UseMask(VF, true);
695 for (auto [Idx, Value] : enumerate(Mask)) {
696 if (Value == PoisonMaskElem) {
697 if (MaskArg == UseMask::UndefsAsMask)
698 UseMask.reset(Idx);
699 continue;
700 }
701 if (MaskArg == UseMask::FirstArg && Value < VF)
702 UseMask.reset(Value);
703 else if (MaskArg == UseMask::SecondArg && Value >= VF)
704 UseMask.reset(Value - VF);
705 }
706 return UseMask;
707}
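// Illustrative example: for VF = 4 and Mask = {0, 5, poison, 2}, the FirstArg
// use mask clears bits 0 and 2 (leaving {1, 3} set), while the SecondArg use
// mask clears bit 1 (from mask value 5 - VF).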
708
709/// Checks if the given value is actually an undefined constant vector.
710/// Also, if the \p UseMask is not empty, tries to check if the non-masked
711/// elements actually mask the insertelement buildvector, if any.
712template <bool IsPoisonOnly = false>
714 const SmallBitVector &UseMask = {}) {
715 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
716 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
717 if (isa<T>(V))
718 return Res;
719 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
720 if (!VecTy)
721 return Res.reset();
722 auto *C = dyn_cast<Constant>(V);
723 if (!C) {
724 if (!UseMask.empty()) {
725 const Value *Base = V;
726 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
727 Base = II->getOperand(0);
728 if (isa<T>(II->getOperand(1)))
729 continue;
730 std::optional<unsigned> Idx = getElementIndex(II);
731 if (!Idx) {
732 Res.reset();
733 return Res;
734 }
735 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
736 Res.reset(*Idx);
737 }
738 // TODO: Add analysis for shuffles here too.
739 if (V == Base) {
740 Res.reset();
741 } else {
742 SmallBitVector SubMask(UseMask.size(), false);
743 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
744 }
745 } else {
746 Res.reset();
747 }
748 return Res;
749 }
750 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
751 if (Constant *Elem = C->getAggregateElement(I))
752 if (!isa<T>(Elem) &&
753 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
754 Res.reset(I);
755 }
756 return Res;
757}
758
759/// Checks if the vector of instructions can be represented as a shuffle, like:
760/// %x0 = extractelement <4 x i8> %x, i32 0
761/// %x3 = extractelement <4 x i8> %x, i32 3
762/// %y1 = extractelement <4 x i8> %y, i32 1
763/// %y2 = extractelement <4 x i8> %y, i32 2
764/// %x0x0 = mul i8 %x0, %x0
765/// %x3x3 = mul i8 %x3, %x3
766/// %y1y1 = mul i8 %y1, %y1
767/// %y2y2 = mul i8 %y2, %y2
768/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
769/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
770/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
771/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
772/// ret <4 x i8> %ins4
773/// can be transformed into:
774/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
775/// i32 6>
776/// %2 = mul <4 x i8> %1, %1
777/// ret <4 x i8> %2
778/// Mask will return the Shuffle Mask equivalent to the extracted elements.
779/// TODO: Can we split off and reuse the shuffle mask detection from
780/// ShuffleVectorInst/getShuffleCost?
781static std::optional<TargetTransformInfo::ShuffleKind>
782isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
783 AssumptionCache *AC) {
784 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
785 if (It == VL.end())
786 return std::nullopt;
787 unsigned Size =
788 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
789 auto *EI = dyn_cast<ExtractElementInst>(V);
790 if (!EI)
791 return S;
792 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
793 if (!VTy)
794 return S;
795 return std::max(S, VTy->getNumElements());
796 });
797
798 Value *Vec1 = nullptr;
799 Value *Vec2 = nullptr;
800 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
801 auto *EE = dyn_cast<ExtractElementInst>(V);
802 if (!EE)
803 return false;
804 Value *Vec = EE->getVectorOperand();
805 if (isa<UndefValue>(Vec))
806 return false;
807 return isGuaranteedNotToBePoison(Vec, AC);
808 });
809 enum ShuffleMode { Unknown, Select, Permute };
810 ShuffleMode CommonShuffleMode = Unknown;
811 Mask.assign(VL.size(), PoisonMaskElem);
812 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
813 // Undef can be represented as an undef element in a vector.
814 if (isa<UndefValue>(VL[I]))
815 continue;
816 auto *EI = cast<ExtractElementInst>(VL[I]);
817 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
818 return std::nullopt;
819 auto *Vec = EI->getVectorOperand();
820 // We can extractelement from undef or poison vector.
821 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
822 continue;
823 // All vector operands must have the same number of vector elements.
824 if (isa<UndefValue>(Vec)) {
825 Mask[I] = I;
826 } else {
827 if (isa<UndefValue>(EI->getIndexOperand()))
828 continue;
829 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
830 if (!Idx)
831 return std::nullopt;
832 // Undefined behavior if Idx is negative or >= Size.
833 if (Idx->getValue().uge(Size))
834 continue;
835 unsigned IntIdx = Idx->getValue().getZExtValue();
836 Mask[I] = IntIdx;
837 }
838 if (isUndefVector(Vec).all() && HasNonUndefVec)
839 continue;
840 // For correct shuffling we have to have at most 2 different vector operands
841 // in all extractelement instructions.
842 if (!Vec1 || Vec1 == Vec) {
843 Vec1 = Vec;
844 } else if (!Vec2 || Vec2 == Vec) {
845 Vec2 = Vec;
846 Mask[I] += Size;
847 } else {
848 return std::nullopt;
849 }
850 if (CommonShuffleMode == Permute)
851 continue;
852 // If the extract index is not the same as the operation number, it is a
853 // permutation.
854 if (Mask[I] % Size != I) {
855 CommonShuffleMode = Permute;
856 continue;
857 }
858 CommonShuffleMode = Select;
859 }
860 // If we're not crossing lanes in different vectors, consider it as blending.
861 if (CommonShuffleMode == Select && Vec2)
863 // If Vec2 was never used, we have a permutation of a single vector, otherwise
864 // we have permutation of 2 vectors.
867}
868
869/// \returns True if Extract{Value,Element} instruction extracts element Idx.
870static std::optional<unsigned> getExtractIndex(const Instruction *E) {
871 unsigned Opcode = E->getOpcode();
872 assert((Opcode == Instruction::ExtractElement ||
873 Opcode == Instruction::ExtractValue) &&
874 "Expected extractelement or extractvalue instruction.");
875 if (Opcode == Instruction::ExtractElement) {
876 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
877 if (!CI)
878 return std::nullopt;
879 return CI->getZExtValue();
880 }
881 auto *EI = cast<ExtractValueInst>(E);
882 if (EI->getNumIndices() != 1)
883 return std::nullopt;
884 return *EI->idx_begin();
885}
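// Illustrative examples: "extractelement <4 x i32> %v, i32 2" yields 2,
// "extractvalue {i32, float} %agg, 1" yields 1, and an extractelement with a
// non-constant index yields std::nullopt.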
886
887namespace llvm {
888/// Checks if the provided value does not require scheduling. It does not
889/// require scheduling if this is not an instruction or it is an instruction
890/// that does not read/write memory and all operands are either not instructions
891/// or phi nodes or instructions from different blocks.
892static bool areAllOperandsNonInsts(Value *V);
893/// Checks if the provided value does not require scheduling. It does not
894/// require scheduling if this is not an instruction or it is an instruction
895/// that does not read/write memory and all users are phi nodes or instructions
896/// from the different blocks.
897static bool isUsedOutsideBlock(Value *V);
898/// Checks if the specified value does not require scheduling. It does not
899/// require scheduling if all operands and all users do not need to be scheduled
900/// in the current basic block.
901static bool doesNotNeedToBeScheduled(Value *V);
902} // namespace llvm
903
904namespace {
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914/// Helper class that determines whether VL can use the same opcode.
915/// Alternate instructions are supported. In addition, it supports
916/// interchangeable instructions. An interchangeable instruction is an
917/// instruction that can be converted to another instruction with the same
918/// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
919class BinOpSameOpcodeHelper {
920 using MaskType = std::uint_fast16_t;
921 /// Sort SupportedOp because it is used by binary_search.
922 constexpr static std::initializer_list<unsigned> SupportedOp = {
923 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
924 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
925 enum : MaskType {
926 ShlBIT = 0b1,
927 AShrBIT = 0b10,
928 MulBIT = 0b100,
929 AddBIT = 0b1000,
930 SubBIT = 0b10000,
931 AndBIT = 0b100000,
932 OrBIT = 0b1000000,
933 XorBIT = 0b10000000,
934 MainOpBIT = 0b100000000,
936 };
937 /// Return a non-nullptr if either operand of I is a ConstantInt.
938 /// The second return value represents the operand position. We check the
939 /// right-hand side first (1). If the right hand side is not a ConstantInt and
940 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
941 /// side (0).
942 static std::pair<ConstantInt *, unsigned>
943 isBinOpWithConstantInt(const Instruction *I) {
944 unsigned Opcode = I->getOpcode();
945 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
946 (void)SupportedOp;
947 auto *BinOp = cast<BinaryOperator>(I);
948 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
949 return {CI, 1};
950 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
951 Opcode == Instruction::AShr)
952 return {nullptr, 0};
953 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
954 return {CI, 0};
955 return {nullptr, 0};
956 }
957 struct InterchangeableInfo {
958 const Instruction *I = nullptr;
959 /// Each set bit represents an opcode that MainOp can be converted to.
960 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
961 MulBIT | AShrBIT | ShlBIT;
962 /// We cannot create an interchangeable instruction that does not exist in
963 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
964 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
965 /// 1]. SeenBefore is used to know what operations have been seen before.
966 MaskType SeenBefore = 0;
967 InterchangeableInfo(const Instruction *I) : I(I) {}
968 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
969 /// instruction. Directly setting the mask would destroy the mask state,
970 /// preventing us from determining which instruction it should convert to.
971 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
972 if (Mask & InterchangeableMask) {
973 SeenBefore |= OpcodeInMaskForm;
974 Mask &= InterchangeableMask;
975 return true;
976 }
977 return false;
978 }
979 bool equal(unsigned Opcode) {
980 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
981 }
982 unsigned getOpcode() const {
983 MaskType Candidate = Mask & SeenBefore;
984 if (Candidate & MainOpBIT)
985 return I->getOpcode();
986 if (Candidate & ShlBIT)
987 return Instruction::Shl;
988 if (Candidate & AShrBIT)
989 return Instruction::AShr;
990 if (Candidate & MulBIT)
991 return Instruction::Mul;
992 if (Candidate & AddBIT)
993 return Instruction::Add;
994 if (Candidate & SubBIT)
995 return Instruction::Sub;
996 if (Candidate & AndBIT)
997 return Instruction::And;
998 if (Candidate & OrBIT)
999 return Instruction::Or;
1000 if (Candidate & XorBIT)
1001 return Instruction::Xor;
1002 llvm_unreachable("Cannot find interchangeable instruction.");
1003 }
1004
1005 /// Return true if the instruction can be converted to \p Opcode.
1006 bool hasCandidateOpcode(unsigned Opcode) const {
1007 MaskType Candidate = Mask & SeenBefore;
1008 switch (Opcode) {
1009 case Instruction::Shl:
1010 return Candidate & ShlBIT;
1011 case Instruction::AShr:
1012 return Candidate & AShrBIT;
1013 case Instruction::Mul:
1014 return Candidate & MulBIT;
1015 case Instruction::Add:
1016 return Candidate & AddBIT;
1017 case Instruction::Sub:
1018 return Candidate & SubBIT;
1019 case Instruction::And:
1020 return Candidate & AndBIT;
1021 case Instruction::Or:
1022 return Candidate & OrBIT;
1023 case Instruction::Xor:
1024 return Candidate & XorBIT;
1025 case Instruction::LShr:
1026 case Instruction::FAdd:
1027 case Instruction::FSub:
1028 case Instruction::FMul:
1029 case Instruction::SDiv:
1030 case Instruction::UDiv:
1031 case Instruction::FDiv:
1032 case Instruction::SRem:
1033 case Instruction::URem:
1034 case Instruction::FRem:
1035 return false;
1036 default:
1037 break;
1038 }
1039 llvm_unreachable("Cannot find interchangeable instruction.");
1040 }
1041
1042 SmallVector<Value *> getOperand(const Instruction *To) const {
1043 unsigned ToOpcode = To->getOpcode();
1044 unsigned FromOpcode = I->getOpcode();
1045 if (FromOpcode == ToOpcode)
1046 return SmallVector<Value *>(I->operands());
1047 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1048 auto [CI, Pos] = isBinOpWithConstantInt(I);
1049 const APInt &FromCIValue = CI->getValue();
1050 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1051 APInt ToCIValue;
1052 switch (FromOpcode) {
1053 case Instruction::Shl:
1054 if (ToOpcode == Instruction::Mul) {
1055 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1056 FromCIValue.getZExtValue());
1057 } else {
1058 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1059 ToCIValue = ToOpcode == Instruction::And
1060 ? APInt::getAllOnes(FromCIValueBitWidth)
1061 : APInt::getZero(FromCIValueBitWidth);
1062 }
1063 break;
1064 case Instruction::Mul:
1065 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1066 if (ToOpcode == Instruction::Shl) {
1067 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1068 } else {
1069 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1070 ToCIValue = ToOpcode == Instruction::And
1071 ? APInt::getAllOnes(FromCIValueBitWidth)
1072 : APInt::getZero(FromCIValueBitWidth);
1073 }
1074 break;
1075 case Instruction::Add:
1076 case Instruction::Sub:
1077 if (FromCIValue.isZero()) {
1078 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1079 } else {
1080 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1081 "Cannot convert the instruction.");
1082 ToCIValue = FromCIValue;
1083 ToCIValue.negate();
1084 }
1085 break;
1086 case Instruction::And:
1087 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1088 ToCIValue = ToOpcode == Instruction::Mul
1089 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1090 : APInt::getZero(FromCIValueBitWidth);
1091 break;
1092 default:
1093 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1094 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1095 break;
1096 }
1097 Value *LHS = I->getOperand(1 - Pos);
1098 Constant *RHS =
1099 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1100 // constant + x cannot be -constant - x
1101 // instead, it should be x - -constant
1102 if (Pos == 1 ||
1103 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1104 FromOpcode == Instruction::Xor) &&
1105 ToOpcode == Instruction::Sub))
1106 return SmallVector<Value *>({LHS, RHS});
1107 return SmallVector<Value *>({RHS, LHS});
1108 }
1109 };
1110 InterchangeableInfo MainOp;
1111 InterchangeableInfo AltOp;
1112 bool isValidForAlternation(const Instruction *I) const {
1113 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1114 ::isValidForAlternation(I->getOpcode());
1115 }
1116 bool initializeAltOp(const Instruction *I) {
1117 if (AltOp.I)
1118 return true;
1119 if (!isValidForAlternation(I))
1120 return false;
1121 AltOp.I = I;
1122 return true;
1123 }
1124
1125public:
1126 BinOpSameOpcodeHelper(const Instruction *MainOp,
1127 const Instruction *AltOp = nullptr)
1128 : MainOp(MainOp), AltOp(AltOp) {
1129 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1130 }
1131 bool add(const Instruction *I) {
1133 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1134 unsigned Opcode = I->getOpcode();
1135 MaskType OpcodeInMaskForm;
1136 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1137 switch (Opcode) {
1138 case Instruction::Shl:
1139 OpcodeInMaskForm = ShlBIT;
1140 break;
1141 case Instruction::AShr:
1142 OpcodeInMaskForm = AShrBIT;
1143 break;
1144 case Instruction::Mul:
1145 OpcodeInMaskForm = MulBIT;
1146 break;
1147 case Instruction::Add:
1148 OpcodeInMaskForm = AddBIT;
1149 break;
1150 case Instruction::Sub:
1151 OpcodeInMaskForm = SubBIT;
1152 break;
1153 case Instruction::And:
1154 OpcodeInMaskForm = AndBIT;
1155 break;
1156 case Instruction::Or:
1157 OpcodeInMaskForm = OrBIT;
1158 break;
1159 case Instruction::Xor:
1160 OpcodeInMaskForm = XorBIT;
1161 break;
1162 default:
1163 return MainOp.equal(Opcode) ||
1164 (initializeAltOp(I) && AltOp.equal(Opcode));
1165 }
1166 MaskType InterchangeableMask = OpcodeInMaskForm;
1167 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1168 if (CI) {
1169 constexpr MaskType CanBeAll =
1170 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1171 const APInt &CIValue = CI->getValue();
1172 switch (Opcode) {
1173 case Instruction::Shl:
1174 if (CIValue.ult(CIValue.getBitWidth()))
1175 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1176 break;
1177 case Instruction::Mul:
1178 if (CIValue.isOne()) {
1179 InterchangeableMask = CanBeAll;
1180 break;
1181 }
1182 if (CIValue.isPowerOf2())
1183 InterchangeableMask = MulBIT | ShlBIT;
1184 break;
1185 case Instruction::Add:
1186 case Instruction::Sub:
1187 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1188 break;
1189 case Instruction::And:
1190 if (CIValue.isAllOnes())
1191 InterchangeableMask = CanBeAll;
1192 break;
1193 case Instruction::Xor:
1194 if (CIValue.isZero())
1195 InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
1196 break;
1197 default:
1198 if (CIValue.isZero())
1199 InterchangeableMask = CanBeAll;
1200 break;
1201 }
1202 }
1203 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1204 (initializeAltOp(I) &&
1205 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1206 }
1207 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1208 /// Checks if the list of potential opcodes includes \p Opcode.
1209 bool hasCandidateOpcode(unsigned Opcode) const {
1210 return MainOp.hasCandidateOpcode(Opcode);
1211 }
1212 bool hasAltOp() const { return AltOp.I; }
1213 unsigned getAltOpcode() const {
1214 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1215 }
1216 SmallVector<Value *> getOperand(const Instruction *I) const {
1217 return MainOp.getOperand(I);
1218 }
1219};
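// Illustrative example: for VL = { %a = add i32 %x, 0, %b = shl i32 %y, 1 },
// "add %x, 0" can be interpreted as any supported opcode while "shl %y, 1" can
// only be Shl or Mul, so the helper settles on Shl as the common opcode and,
// when asked for operands for Shl, rewrites the add as "shl i32 %x, 0".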
1220
1221/// Main data required for vectorization of instructions.
1222class InstructionsState {
1223 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1224 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1225 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1226 /// isAltShuffle).
1227 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1228 /// from getMainAltOpsNoStateVL.
1229 /// For those InstructionsState that use alternate instructions, the resulting
1230 /// vectorized output ultimately comes from a shufflevector. For example,
1231 /// given a vector list (VL):
1232 /// VL[0] = add i32 a, e
1233 /// VL[1] = sub i32 b, f
1234 /// VL[2] = add i32 c, g
1235 /// VL[3] = sub i32 d, h
1236 /// The vectorized result would be:
1237 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1238 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1239 /// result = shufflevector <4 x i32> intermediated_0,
1240 /// <4 x i32> intermediated_1,
1241 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1242 /// Since shufflevector is used in the final result, when calculating the cost
1243 /// (getEntryCost), we must account for the usage of shufflevector in
1244 /// GetVectorCost.
1245 Instruction *MainOp = nullptr;
1246 Instruction *AltOp = nullptr;
1247 /// Whether the instruction state represents copyable instructions.
1248 bool HasCopyables = false;
1249
1250public:
1251 Instruction *getMainOp() const {
1252 assert(valid() && "InstructionsState is invalid.");
1253 return MainOp;
1254 }
1255
1256 Instruction *getAltOp() const {
1257 assert(valid() && "InstructionsState is invalid.");
1258 return AltOp;
1259 }
1260
1261 /// The main/alternate opcodes for the list of instructions.
1262 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1263
1264 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1265
1266 /// Some of the instructions in the list have alternate opcodes.
1267 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1268
1269 /// Checks if the instruction matches either the main or alternate opcode.
1270 /// \returns
1271 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1272 /// to it
1273 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1274 /// it
1275 /// - nullptr if \param I cannot be matched or converted to either opcode
1276 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1277 assert(MainOp && "MainOp cannot be nullptr.");
1278 if (I->getOpcode() == MainOp->getOpcode())
1279 return MainOp;
1280 // Prefer AltOp instead of interchangeable instruction of MainOp.
1281 assert(AltOp && "AltOp cannot be nullptr.");
1282 if (I->getOpcode() == AltOp->getOpcode())
1283 return AltOp;
1284 if (!I->isBinaryOp())
1285 return nullptr;
1286 BinOpSameOpcodeHelper Converter(MainOp);
1287 if (!Converter.add(I) || !Converter.add(MainOp))
1288 return nullptr;
1289 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1290 BinOpSameOpcodeHelper AltConverter(AltOp);
1291 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1292 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1293 return AltOp;
1294 }
1295 if (Converter.hasAltOp() && !isAltShuffle())
1296 return nullptr;
1297 return Converter.hasAltOp() ? AltOp : MainOp;
1298 }
1299
1300 /// Checks if main/alt instructions are shift operations.
1301 bool isShiftOp() const {
1302 return getMainOp()->isShift() && getAltOp()->isShift();
1303 }
1304
1305 /// Checks if main/alt instructions are bitwise logic operations.
1306 bool isBitwiseLogicOp() const {
1307 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1308 }
1309
1310 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1311 bool isMulDivLikeOp() const {
1312 constexpr std::array<unsigned, 8> MulDiv = {
1313 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1314 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1315 Instruction::URem, Instruction::FRem};
1316 return is_contained(MulDiv, getOpcode()) &&
1317 is_contained(MulDiv, getAltOpcode());
1318 }
1319
1320 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1321 bool isAddSubLikeOp() const {
1322 constexpr std::array<unsigned, 4> AddSub = {
1323 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1324 Instruction::FSub};
1325 return is_contained(AddSub, getOpcode()) &&
1326 is_contained(AddSub, getAltOpcode());
1327 }
1328
1329 /// Checks if main/alt instructions are cmp operations.
1330 bool isCmpOp() const {
1331 return (getOpcode() == Instruction::ICmp ||
1332 getOpcode() == Instruction::FCmp) &&
1333 getAltOpcode() == getOpcode();
1334 }
1335
1336 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1337 bool valid() const { return MainOp && AltOp; }
1338
1339 explicit operator bool() const { return valid(); }
1340
1341 InstructionsState() = delete;
1342 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1343 bool HasCopyables = false)
1344 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1345 static InstructionsState invalid() { return {nullptr, nullptr}; }
1346
1347 /// Checks if the value is a copyable element.
1348 bool isCopyableElement(Value *V) const {
1349 assert(valid() && "InstructionsState is invalid.");
1350 if (!HasCopyables)
1351 return false;
1352 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1353 return false;
1354 auto *I = dyn_cast<Instruction>(V);
1355 if (!I)
1356 return !isa<PoisonValue>(V);
1357 if (I->getParent() != MainOp->getParent() &&
1360 return true;
1361 if (I->getOpcode() == MainOp->getOpcode())
1362 return false;
1363 if (!I->isBinaryOp())
1364 return true;
1365 BinOpSameOpcodeHelper Converter(MainOp);
1366 return !Converter.add(I) || !Converter.add(MainOp) ||
1367 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1368 }
1369
1370 /// Checks if the value is non-schedulable.
1371 bool isNonSchedulable(Value *V) const {
1372 assert(valid() && "InstructionsState is invalid.");
1373 auto *I = dyn_cast<Instruction>(V);
1374 if (!HasCopyables)
1375 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1376 doesNotNeedToBeScheduled(I);
1377 // MainOp for copyables is always schedulable, to correctly identify
1378 // non-schedulable copyables.
1379 if (getMainOp() == V)
1380 return false;
1381 if (isCopyableElement(V)) {
1382 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1383 auto *I = dyn_cast<Instruction>(V);
1384 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1386 // If the copyable instruction comes after MainOp
1387 // (non-schedulable, but used in the block) - cannot vectorize
1388 // it, will possibly generate use before def.
1389 !MainOp->comesBefore(I));
1390 };
1391
1392 return IsNonSchedulableCopyableElement(V);
1393 }
1394 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1395 doesNotNeedToBeScheduled(I);
1396 }
1397
1398 /// Checks if the state represents copyable instructions.
1399 bool areInstructionsWithCopyableElements() const {
1400 assert(valid() && "InstructionsState is invalid.");
1401 return HasCopyables;
1402 }
1403};
1404
1405std::pair<Instruction *, SmallVector<Value *>>
1406convertTo(Instruction *I, const InstructionsState &S) {
1407 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1408 assert(SelectedOp && "Cannot convert the instruction.");
1409 if (I->isBinaryOp()) {
1410 BinOpSameOpcodeHelper Converter(I);
1411 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1412 }
1413 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1414}
1415
1416} // end anonymous namespace
1417
1418static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1419 const TargetLibraryInfo &TLI);
1420
1421/// Find an instruction with a specific opcode in VL.
1422/// \param VL Array of values to search through. Must contain only Instructions
1423/// and PoisonValues.
1424/// \param Opcode The instruction opcode to search for
1425/// \returns
1426/// - The first instruction found with matching opcode
1427/// - nullptr if no matching instruction is found
1428static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1429 unsigned Opcode) {
1430 for (Value *V : VL) {
1431 if (isa<PoisonValue>(V))
1432 continue;
1433 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1434 auto *Inst = cast<Instruction>(V);
1435 if (Inst->getOpcode() == Opcode)
1436 return Inst;
1437 }
1438 return nullptr;
1439}
1440
1441/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1442/// compatible instructions or constants, or just some other regular values.
1443static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1444 Value *Op1, const TargetLibraryInfo &TLI) {
1445 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1446 (isConstant(BaseOp1) && isConstant(Op1)) ||
1447 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1448 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1449 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1450 getSameOpcode({BaseOp0, Op0}, TLI) ||
1451 getSameOpcode({BaseOp1, Op1}, TLI);
1452}
1453
1454/// \returns true if a compare instruction \p CI has similar "look" and
1455/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1456/// swapped, false otherwise.
1457static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1458 const TargetLibraryInfo &TLI) {
1459 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1460 "Assessing comparisons of different types?");
1461 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1462 CmpInst::Predicate Pred = CI->getPredicate();
1463 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1464
1465 Value *BaseOp0 = BaseCI->getOperand(0);
1466 Value *BaseOp1 = BaseCI->getOperand(1);
1467 Value *Op0 = CI->getOperand(0);
1468 Value *Op1 = CI->getOperand(1);
1469
1470 return (BasePred == Pred &&
1471 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1472 (BasePred == SwappedPred &&
1473 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1474}
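// Illustrative example: "icmp sgt i32 %a, %b" is considered the same as
// "icmp slt i32 %b, %a": the swapped predicate together with the swapped
// operands matches the base comparison.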
1475
1476/// \returns analysis of the Instructions in \p VL described in
1477/// InstructionsState, the Opcode that we suppose the whole list
1478/// could be vectorized even if its structure is diverse.
1479static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1480 const TargetLibraryInfo &TLI) {
1481 // Make sure these are all Instructions.
1482 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1483 return InstructionsState::invalid();
1484
1485 auto *It = find_if(VL, IsaPred<Instruction>);
1486 if (It == VL.end())
1487 return InstructionsState::invalid();
1488
1489 Instruction *MainOp = cast<Instruction>(*It);
1490 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1491 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1492 (VL.size() == 2 && InstCnt < 2))
1493 return InstructionsState::invalid();
1494
1495 bool IsCastOp = isa<CastInst>(MainOp);
1496 bool IsBinOp = isa<BinaryOperator>(MainOp);
1497 bool IsCmpOp = isa<CmpInst>(MainOp);
1498 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1499 : CmpInst::BAD_ICMP_PREDICATE;
1500 Instruction *AltOp = MainOp;
1501 unsigned Opcode = MainOp->getOpcode();
1502 unsigned AltOpcode = Opcode;
1503
1504 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1505 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1506 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1507 UniquePreds.insert(BasePred);
1508 UniqueNonSwappedPreds.insert(BasePred);
1509 for (Value *V : VL) {
1510 auto *I = dyn_cast<CmpInst>(V);
1511 if (!I)
1512 return false;
1513 CmpInst::Predicate CurrentPred = I->getPredicate();
1514 CmpInst::Predicate SwappedCurrentPred =
1515 CmpInst::getSwappedPredicate(CurrentPred);
1516 UniqueNonSwappedPreds.insert(CurrentPred);
1517 if (!UniquePreds.contains(CurrentPred) &&
1518 !UniquePreds.contains(SwappedCurrentPred))
1519 UniquePreds.insert(CurrentPred);
1520 }
1521 // The total number of predicates is > 2, but if swapped predicates are
1522 // considered compatible there are only 2; in that case treat swappable
1523 // predicates as compatible opcodes rather than alternate ones.
1524 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1525 }();
1526 // Check for one alternate opcode from another BinaryOperator.
1527 // TODO - generalize to support all operators (types, calls etc.).
1528 Intrinsic::ID BaseID = 0;
1529 SmallVector<VFInfo> BaseMappings;
1530 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1531 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1532 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1533 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1534 return InstructionsState::invalid();
1535 }
1536 bool AnyPoison = InstCnt != VL.size();
1537 // Check MainOp too to be sure that it matches the requirements for the
1538 // instructions.
1539 for (Value *V : iterator_range(It, VL.end())) {
1540 auto *I = dyn_cast<Instruction>(V);
1541 if (!I)
1542 continue;
1543
1544 // Cannot combine poison and divisions.
1545 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1546 // intrinsics/functions only.
1547 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1548 return InstructionsState::invalid();
1549 unsigned InstOpcode = I->getOpcode();
1550 if (IsBinOp && isa<BinaryOperator>(I)) {
1551 if (BinOpHelper.add(I))
1552 continue;
1553 } else if (IsCastOp && isa<CastInst>(I)) {
1554 Value *Op0 = MainOp->getOperand(0);
1555 Type *Ty0 = Op0->getType();
1556 Value *Op1 = I->getOperand(0);
1557 Type *Ty1 = Op1->getType();
1558 if (Ty0 == Ty1) {
1559 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1560 continue;
1561 if (Opcode == AltOpcode) {
1562 assert(isValidForAlternation(Opcode) &&
1563 isValidForAlternation(InstOpcode) &&
1564 "Cast isn't safe for alternation, logic needs to be updated!");
1565 AltOpcode = InstOpcode;
1566 AltOp = I;
1567 continue;
1568 }
1569 }
1570 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1571 auto *BaseInst = cast<CmpInst>(MainOp);
1572 Type *Ty0 = BaseInst->getOperand(0)->getType();
1573 Type *Ty1 = Inst->getOperand(0)->getType();
1574 if (Ty0 == Ty1) {
1575 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1576 assert(InstOpcode == AltOpcode &&
1577 "Alternate instructions are only supported by BinaryOperator "
1578 "and CastInst.");
1579 // Check for compatible operands. If the corresponding operands are not
1580 // compatible - need to perform alternate vectorization.
1581 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1582 CmpInst::Predicate SwappedCurrentPred =
1583 CmpInst::getSwappedPredicate(CurrentPred);
1584
1585 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1586 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1587 continue;
1588
1589 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1590 continue;
1591 auto *AltInst = cast<CmpInst>(AltOp);
1592 if (MainOp != AltOp) {
1593 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1594 continue;
1595 } else if (BasePred != CurrentPred) {
1596 assert(
1597 isValidForAlternation(InstOpcode) &&
1598 "CmpInst isn't safe for alternation, logic needs to be updated!");
1599 AltOp = I;
1600 continue;
1601 }
1602 CmpInst::Predicate AltPred = AltInst->getPredicate();
1603 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1604 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1605 continue;
1606 }
1607 } else if (InstOpcode == Opcode) {
1608 assert(InstOpcode == AltOpcode &&
1609 "Alternate instructions are only supported by BinaryOperator and "
1610 "CastInst.");
1611 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1612 if (Gep->getNumOperands() != 2 ||
1613 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1614 return InstructionsState::invalid();
1615 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1616 if (!isVectorLikeInstWithConstOps(EI))
1617 return InstructionsState::invalid();
1618 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1619 auto *BaseLI = cast<LoadInst>(MainOp);
1620 if (!LI->isSimple() || !BaseLI->isSimple())
1621 return InstructionsState::invalid();
1622 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1623 auto *CallBase = cast<CallInst>(MainOp);
1624 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1625 return InstructionsState::invalid();
1626 if (Call->hasOperandBundles() &&
1627 (!CallBase->hasOperandBundles() ||
1628 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1629 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1630 CallBase->op_begin() +
1631 CallBase->getBundleOperandsStartIndex())))
1632 return InstructionsState::invalid();
1633 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1634 if (ID != BaseID)
1635 return InstructionsState::invalid();
1636 if (!ID) {
1637 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1638 if (Mappings.size() != BaseMappings.size() ||
1639 Mappings.front().ISA != BaseMappings.front().ISA ||
1640 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1641 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1642 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1643 Mappings.front().Shape.Parameters !=
1644 BaseMappings.front().Shape.Parameters)
1645 return InstructionsState::invalid();
1646 }
1647 }
1648 continue;
1649 }
1650 return InstructionsState::invalid();
1651 }
1652
1653 if (IsBinOp) {
1654 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1655 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1656 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1657 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1658 }
1659 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1660 "Incorrect implementation of allSameOpcode.");
1661 InstructionsState S(MainOp, AltOp);
1662 assert(all_of(VL,
1663 [&](Value *V) {
1664 return isa<PoisonValue>(V) ||
1665 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1666 }) &&
1667 "Invalid InstructionsState.");
1668 return S;
1669}
1670
1671/// \returns true if all of the values in \p VL have the same type or false
1672 /// otherwise.
1673 static bool allSameType(ArrayRef<Value *> VL) {
1674 Type *Ty = VL.consume_front()->getType();
1675 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1676}
1677
1678 /// \returns True if an in-tree use also needs an extract. This refers to a
1679 /// possible scalar operand in a vectorized instruction.
1680static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1681 TargetLibraryInfo *TLI,
1682 const TargetTransformInfo *TTI) {
1683 if (!UserInst)
1684 return false;
1685 unsigned Opcode = UserInst->getOpcode();
1686 switch (Opcode) {
1687 case Instruction::Load: {
1688 LoadInst *LI = cast<LoadInst>(UserInst);
1689 return (LI->getPointerOperand() == Scalar);
1690 }
1691 case Instruction::Store: {
1692 StoreInst *SI = cast<StoreInst>(UserInst);
1693 return (SI->getPointerOperand() == Scalar);
1694 }
1695 case Instruction::Call: {
1696 CallInst *CI = cast<CallInst>(UserInst);
1697 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1698 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1699 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1700 Arg.value().get() == Scalar;
1701 });
1702 }
1703 default:
1704 return false;
1705 }
1706}
1707
1708 /// \returns the AA location that is being accessed by the instruction.
1709 static MemoryLocation getLocation(Instruction *I) {
1710 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1711 return MemoryLocation::get(SI);
1712 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1713 return MemoryLocation::get(LI);
1714 return MemoryLocation();
1715}
1716
1717/// \returns True if the instruction is not a volatile or atomic load/store.
1718static bool isSimple(Instruction *I) {
1719 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1720 return LI->isSimple();
1721 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1722 return SI->isSimple();
1723 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1724 return !MI->isVolatile();
1725 return true;
1726}
1727
1728/// Shuffles \p Mask in accordance with the given \p SubMask.
1729/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1730/// one but two input vectors.
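/// Illustrative example (assumed, not part of the original comment): with
/// Mask == {1,0,3,2} and SubMask == {2,0,1,3}, the result is the composition
/// {Mask[2], Mask[0], Mask[1], Mask[3]} == {3,1,0,2}.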
1731static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1732 bool ExtendingManyInputs = false) {
1733 if (SubMask.empty())
1734 return;
1735 assert(
1736 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1737 // Check if input scalars were extended to match the size of other node.
1738 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1739 "SubMask with many inputs support must be larger than the mask.");
1740 if (Mask.empty()) {
1741 Mask.append(SubMask.begin(), SubMask.end());
1742 return;
1743 }
1744 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1745 int TermValue = std::min(Mask.size(), SubMask.size());
1746 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1747 if (SubMask[I] == PoisonMaskElem ||
1748 (!ExtendingManyInputs &&
1749 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1750 continue;
1751 NewMask[I] = Mask[SubMask[I]];
1752 }
1753 Mask.swap(NewMask);
1754}
1755
1756/// Order may have elements assigned special value (size) which is out of
1757/// bounds. Such indices only appear on places which correspond to undef values
1758 /// (see canReuseExtract for details) and are used to avoid undef values
1759 /// having an effect on the ordering of the operands.
1760/// The first loop below simply finds all unused indices and then the next loop
1761/// nest assigns these indices for undef values positions.
1762/// As an example below Order has two undef positions and they have assigned
1763/// values 3 and 7 respectively:
1764/// before: 6 9 5 4 9 2 1 0
1765/// after: 6 3 5 4 7 2 1 0
1766 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1767 const size_t Sz = Order.size();
1768 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1769 SmallBitVector MaskedIndices(Sz);
1770 for (unsigned I = 0; I < Sz; ++I) {
1771 if (Order[I] < Sz)
1772 UnusedIndices.reset(Order[I]);
1773 else
1774 MaskedIndices.set(I);
1775 }
1776 if (MaskedIndices.none())
1777 return;
1778 assert(UnusedIndices.count() == MaskedIndices.count() &&
1779 "Non-synced masked/available indices.");
1780 int Idx = UnusedIndices.find_first();
1781 int MIdx = MaskedIndices.find_first();
1782 while (MIdx >= 0) {
1783 assert(Idx >= 0 && "Indices must be synced.");
1784 Order[MIdx] = Idx;
1785 Idx = UnusedIndices.find_next(Idx);
1786 MIdx = MaskedIndices.find_next(MIdx);
1787 }
1788}
1789
1790/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1791/// Opcode1.
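/// Illustrative example (assumed): for a scalar \p ScalarTy and
/// VL == {add, sub, add, sub} with Opcode0 == Add and Opcode1 == Sub, the
/// returned bitset is {0,1,0,1}, i.e. only the sub lanes are set.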
1792 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1793 unsigned Opcode0, unsigned Opcode1) {
1794 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1795 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1796 for (unsigned Lane : seq<unsigned>(VL.size())) {
1797 if (isa<PoisonValue>(VL[Lane]))
1798 continue;
1799 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1800 OpcodeMask.set(Lane * ScalarTyNumElements,
1801 Lane * ScalarTyNumElements + ScalarTyNumElements);
1802 }
1803 return OpcodeMask;
1804}
1805
1806/// Replicates the given \p Val \p VF times.
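/// Illustrative example (assumed): Val == {C0, C1} with VF == 3 produces
/// {C0, C0, C0, C1, C1, C1}.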
1808 unsigned VF) {
1809 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1810 "Expected scalar constants.");
1811 SmallVector<Constant *> NewVal(Val.size() * VF);
1812 for (auto [I, V] : enumerate(Val))
1813 std::fill_n(NewVal.begin() + I * VF, VF, V);
1814 return NewVal;
1815}
1816
1817namespace llvm {
1818
1819 static void inversePermutation(ArrayRef<unsigned> Indices,
1820 SmallVectorImpl<int> &Mask) {
1821 Mask.clear();
1822 const unsigned E = Indices.size();
1823 Mask.resize(E, PoisonMaskElem);
1824 for (unsigned I = 0; I < E; ++I)
1825 Mask[Indices[I]] = I;
1826}
1827
1828/// Reorders the list of scalars in accordance with the given \p Mask.
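/// Illustrative example (assumed): Scalars == {a, b, c} with Mask == {2,0,1}
/// yields {b, c, a}, since element I is moved to position Mask[I].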
1829 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1830 ArrayRef<int> Mask) {
1831 assert(!Mask.empty() && "Expected non-empty mask.");
1832 SmallVector<Value *> Prev(Scalars.size(),
1833 PoisonValue::get(Scalars.front()->getType()));
1834 Prev.swap(Scalars);
1835 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1836 if (Mask[I] != PoisonMaskElem)
1837 Scalars[Mask[I]] = Prev[I];
1838}
1839
1840/// Checks if the provided value does not require scheduling. It does not
1841/// require scheduling if this is not an instruction or it is an instruction
1842 /// that does not read/write memory and all operands are either not
1843 /// instructions, or phi nodes, or instructions from different blocks.
1844 static bool areAllOperandsNonInsts(Value *V) {
1845 auto *I = dyn_cast<Instruction>(V);
1846 if (!I)
1847 return true;
1848 return !mayHaveNonDefUseDependency(*I) &&
1849 all_of(I->operands(), [I](Value *V) {
1850 auto *IO = dyn_cast<Instruction>(V);
1851 if (!IO)
1852 return true;
1853 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1854 });
1855}
1856
1857/// Checks if the provided value does not require scheduling. It does not
1858/// require scheduling if this is not an instruction or it is an instruction
1859/// that does not read/write memory and all users are phi nodes or instructions
1860 /// from different blocks.
1861static bool isUsedOutsideBlock(Value *V) {
1862 auto *I = dyn_cast<Instruction>(V);
1863 if (!I)
1864 return true;
1865 // Limits the number of uses to save compile time.
1866 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1867 all_of(I->users(), [I](User *U) {
1868 auto *IU = dyn_cast<Instruction>(U);
1869 if (!IU)
1870 return true;
1871 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1872 });
1873}
1874
1875/// Checks if the specified value does not require scheduling. It does not
1876/// require scheduling if all operands and all users do not need to be scheduled
1877/// in the current basic block.
1880}
1881
1882/// Checks if the specified array of instructions does not require scheduling.
1883 /// It is so if either all instructions have operands that do not require
1884 /// scheduling, or all their users do not require scheduling since they are
1885 /// phis or live in other basic blocks.
1886 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1887 return !VL.empty() &&
1888 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1889 }
1890
1891/// Returns true if widened type of \p Ty elements with size \p Sz represents
1892/// full vector type, i.e. adding extra element results in extra parts upon type
1893/// legalization.
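/// Illustrative example (assuming 128-bit vector registers, so <6 x i32>
/// legalizes into 2 parts): Sz == 8 with i32 elements is accepted because it
/// is a power of 2, while Sz == 6 is rejected because each legalized part
/// would hold 3 elements, which is not a power of 2.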
1894 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1895 unsigned Sz) {
1896 if (Sz <= 1)
1897 return false;
1898 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1899 return false;
1900 if (has_single_bit(Sz))
1901 return true;
1902 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1903 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1904 Sz % NumParts == 0;
1905}
1906
1907/// Returns number of parts, the type \p VecTy will be split at the codegen
1908 /// phase. If the type is going to be scalarized or does not use whole
1909/// registers, returns 1.
1910static unsigned
1911 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1912 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1913 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1914 if (NumParts == 0 || NumParts >= Limit)
1915 return 1;
1916 unsigned Sz = getNumElements(VecTy);
1917 if (NumParts >= Sz || Sz % NumParts != 0 ||
1918 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1919 return 1;
1920 return NumParts;
1921}
1922
1923namespace slpvectorizer {
1924
1925/// Bottom Up SLP Vectorizer.
1926class BoUpSLP {
1927 class TreeEntry;
1928 class ScheduleEntity;
1929 class ScheduleData;
1930 class ScheduleCopyableData;
1931 class ScheduleBundle;
1934
1935 /// If we decide to generate strided load / store, this struct contains all
1936 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1937 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1938 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1939 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1940 /// the size of an element of FixedVectorType.
1941 struct StridedPtrInfo {
1942 Value *StrideVal = nullptr;
1943 const SCEV *StrideSCEV = nullptr;
1944 FixedVectorType *Ty = nullptr;
1945 };
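// Illustrative example (assumed): for Ty == <4 x i32> and StrideVal == 2,
// consecutive vector elements are loaded 2 * 4 == 8 bytes apart.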
1946 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1947
1948public:
1949 /// Tracks the state we can represent the loads in the given sequence.
1957
1964
1965 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1966 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1967 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1968 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1969 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1970 AC(AC), DB(DB), DL(DL), ORE(ORE),
1971 Builder(Se->getContext(), TargetFolder(*DL)) {
1972 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1973 // Use the vector register size specified by the target unless overridden
1974 // by a command-line option.
1975 // TODO: It would be better to limit the vectorization factor based on
1976 // data type rather than just register size. For example, x86 AVX has
1977 // 256-bit registers, but it does not support integer operations
1978 // at that width (that requires AVX2).
1979 if (MaxVectorRegSizeOption.getNumOccurrences())
1980 MaxVecRegSize = MaxVectorRegSizeOption;
1981 else
1982 MaxVecRegSize =
1983 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1984 .getFixedValue();
1985
1986 if (MinVectorRegSizeOption.getNumOccurrences())
1987 MinVecRegSize = MinVectorRegSizeOption;
1988 else
1989 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1990 }
1991
1992 /// Vectorize the tree that starts with the elements in \p VL.
1993 /// Returns the vectorized root.
1994 Value *vectorizeTree();
1995
1996 /// Vectorize the tree but with the list of externally used values \p
1997 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1998 /// generated extractvalue instructions.
1999 Value *vectorizeTree(
2000 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2001 Instruction *ReductionRoot = nullptr,
2002 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2003
2004 /// \returns the cost incurred by unwanted spills and fills, caused by
2005 /// holding live values over call sites.
2006 InstructionCost getSpillCost();
2007
2008 /// \returns the vectorization cost of the subtree that starts at \p VL.
2009 /// A negative number means that this is profitable.
2010 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2011 InstructionCost ReductionCost = TTI::TCC_Free);
2012
2013 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2014 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2015 void buildTree(ArrayRef<Value *> Roots,
2016 const SmallDenseSet<Value *> &UserIgnoreLst);
2017
2018 /// Construct a vectorizable tree that starts at \p Roots.
2019 void buildTree(ArrayRef<Value *> Roots);
2020
2021 /// Return the scalars of the root node.
2022 ArrayRef<Value *> getRootNodeScalars() const {
2023 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2024 return VectorizableTree.front()->Scalars;
2025 }
2026
2027 /// Returns the type/is-signed info for the root node in the graph without
2028 /// casting.
2029 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2030 const TreeEntry &Root = *VectorizableTree.front();
2031 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2032 !Root.Scalars.front()->getType()->isIntegerTy())
2033 return std::nullopt;
2034 auto It = MinBWs.find(&Root);
2035 if (It != MinBWs.end())
2036 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2037 It->second.first),
2038 It->second.second);
2039 if (Root.getOpcode() == Instruction::ZExt ||
2040 Root.getOpcode() == Instruction::SExt)
2041 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2042 Root.getOpcode() == Instruction::SExt);
2043 return std::nullopt;
2044 }
2045
2046 /// Checks if the root graph node can be emitted with narrower bitwidth at
2047 /// codegen and returns its signedness, if so.
2048 bool isSignedMinBitwidthRootNode() const {
2049 return MinBWs.at(VectorizableTree.front().get()).second;
2050 }
2051
2052 /// Returns the reduction type after minbitwidth analysis.
2053 FixedVectorType *getReductionType() const {
2054 if (ReductionBitWidth == 0 ||
2055 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2056 ReductionBitWidth >=
2057 DL->getTypeSizeInBits(
2058 VectorizableTree.front()->Scalars.front()->getType()))
2059 return getWidenedType(
2060 VectorizableTree.front()->Scalars.front()->getType(),
2061 VectorizableTree.front()->getVectorFactor());
2062 return getWidenedType(
2063 IntegerType::get(
2064 VectorizableTree.front()->Scalars.front()->getContext(),
2065 ReductionBitWidth),
2066 VectorizableTree.front()->getVectorFactor());
2067 }
2068
2069 /// Builds external uses of the vectorized scalars, i.e. the list of
2070 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2071 /// ExternallyUsedValues contains additional list of external uses to handle
2072 /// vectorization of reductions.
2073 void
2074 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2075
2076 /// Transforms graph nodes to target specific representations, if profitable.
2077 void transformNodes();
2078
2079 /// Clear the internal data structures that are created by 'buildTree'.
2080 void deleteTree() {
2081 VectorizableTree.clear();
2082 ScalarToTreeEntries.clear();
2083 OperandsToTreeEntry.clear();
2084 ScalarsInSplitNodes.clear();
2085 MustGather.clear();
2086 NonScheduledFirst.clear();
2087 EntryToLastInstruction.clear();
2088 LoadEntriesToVectorize.clear();
2089 IsGraphTransformMode = false;
2090 GatheredLoadsEntriesFirst.reset();
2091 CompressEntryToData.clear();
2092 ExternalUses.clear();
2093 ExternalUsesAsOriginalScalar.clear();
2094 ExternalUsesWithNonUsers.clear();
2095 for (auto &Iter : BlocksSchedules) {
2096 BlockScheduling *BS = Iter.second.get();
2097 BS->clear();
2098 }
2099 MinBWs.clear();
2100 ReductionBitWidth = 0;
2101 BaseGraphSize = 1;
2102 CastMaxMinBWSizes.reset();
2103 ExtraBitWidthNodes.clear();
2104 InstrElementSize.clear();
2105 UserIgnoreList = nullptr;
2106 PostponedGathers.clear();
2107 ValueToGatherNodes.clear();
2108 TreeEntryToStridedPtrInfoMap.clear();
2109 }
2110
2111 unsigned getTreeSize() const { return VectorizableTree.size(); }
2112
2113 /// Returns the base graph size, before any transformations.
2114 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2115
2116 /// Perform LICM and CSE on the newly generated gather sequences.
2117 void optimizeGatherSequence();
2118
2119 /// Does this non-empty order represent an identity order? Identity
2120 /// should be represented as an empty order, so this is used to
2121 /// decide if we can canonicalize a computed order. Undef elements
2122 /// (represented as size) are ignored.
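/// Illustrative example (assumed): for Sz == 4 the order {0, 1, 4, 3} is
/// still treated as identity, because the out-of-bounds element (== Sz) is
/// ignored.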
2123 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2124 assert(!Order.empty() && "expected non-empty order");
2125 const unsigned Sz = Order.size();
2126 return all_of(enumerate(Order), [&](const auto &P) {
2127 return P.value() == P.index() || P.value() == Sz;
2128 });
2129 }
2130
2131 /// Checks if the specified gather tree entry \p TE can be represented as a
2132 /// shuffled vector entry + (possibly) permutation with other gathers. It
2133 /// implements the checks only for possibly ordered scalars (Loads,
2134 /// ExtractElement, ExtractValue), which can be part of the graph.
2135 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2136 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2137 /// node might be ignored.
2138 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2139 bool TopToBottom,
2140 bool IgnoreReorder);
2141
2142 /// Sort loads into increasing pointers offsets to allow greater clustering.
2143 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2144
2145 /// Gets reordering data for the given tree entry. If the entry is vectorized
2146 /// - just return ReorderIndices, otherwise check if the scalars can be
2147 /// reordered and return the most optimal order.
2148 /// \return std::nullopt if ordering is not important, empty order, if
2149 /// identity order is important, or the actual order.
2150 /// \param TopToBottom If true, include the order of vectorized stores and
2151 /// insertelement nodes, otherwise skip them.
2152 /// \param IgnoreReorder true, if the root node order can be ignored.
2153 std::optional<OrdersType>
2154 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2155
2156 /// Checks if it is profitable to reorder the current tree.
2157 /// If the tree does not contain many profitable reordable nodes, better to
2158 /// skip it to save compile time.
2159 bool isProfitableToReorder() const;
2160
2161 /// Reorders the current graph to the most profitable order starting from the
2162 /// root node to the leaf nodes. The best order is chosen only from the nodes
2163 /// of the same size (vectorization factor). Smaller nodes are considered
2164 /// parts of subgraph with smaller VF and they are reordered independently. We
2165 /// can make it because we still need to extend smaller nodes to the wider VF
2166 /// and we can merge reordering shuffles with the widening shuffles.
2167 void reorderTopToBottom();
2168
2169 /// Reorders the current graph to the most profitable order starting from
2170 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2171 /// number of reshuffles if the leaf nodes use the same order. In this case we
2172 /// can merge the orders and just shuffle user node instead of shuffling its
2173 /// operands. Plus, even the leaf nodes have different orders, it allows to
2174 /// sink reordering in the graph closer to the root node and merge it later
2175 /// during analysis.
2176 void reorderBottomToTop(bool IgnoreReorder = false);
2177
2178 /// \return The vector element size in bits to use when vectorizing the
2179 /// expression tree ending at \p V. If V is a store, the size is the width of
2180 /// the stored value. Otherwise, the size is the width of the largest loaded
2181 /// value reaching V. This method is used by the vectorizer to calculate
2182 /// vectorization factors.
2183 unsigned getVectorElementSize(Value *V);
2184
2185 /// Compute the minimum type sizes required to represent the entries in a
2186 /// vectorizable tree.
2188
2189 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2190 unsigned getMaxVecRegSize() const {
2191 return MaxVecRegSize;
2192 }
2193
2194 // \returns minimum vector register size as set by cl::opt.
2195 unsigned getMinVecRegSize() const {
2196 return MinVecRegSize;
2197 }
2198
2199 unsigned getMinVF(unsigned Sz) const {
2200 return std::max(2U, getMinVecRegSize() / Sz);
2201 }
2202
2203 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2204 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2205 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2206 return MaxVF ? MaxVF : UINT_MAX;
2207 }
2208
2209 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2210 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2211 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2212 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2213 ///
2214 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2215 unsigned canMapToVector(Type *T) const;
2216
2217 /// \returns True if the VectorizableTree is both tiny and not fully
2218 /// vectorizable. We do not vectorize such trees.
2219 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2220
2221 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2222 /// It may happen, if all gather nodes are loads and they cannot be
2223 /// "clusterized". In this case even subgraphs cannot be vectorized more
2224 /// effectively than the base graph.
2225 bool isTreeNotExtendable() const;
2226
2227 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2228 /// can be load combined in the backend. Load combining may not be allowed in
2229 /// the IR optimizer, so we do not want to alter the pattern. For example,
2230 /// partially transforming a scalar bswap() pattern into vector code is
2231 /// effectively impossible for the backend to undo.
2232 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2233 /// may not be necessary.
2234 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2235
2236 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2237 /// can be load combined in the backend. Load combining may not be allowed in
2238 /// the IR optimizer, so we do not want to alter the pattern. For example,
2239 /// partially transforming a scalar bswap() pattern into vector code is
2240 /// effectively impossible for the backend to undo.
2241 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2242 /// may not be necessary.
2243 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2244 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2245 Align Alignment, const int64_t Diff,
2246 const size_t Sz) const;
2247
2248 /// Return true if an array of scalar loads can be replaced with a strided
2249 /// load (with constant stride).
2250 ///
2251 /// TODO:
2252 /// It is possible that the load gets "widened". Suppose that originally each
2253 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2254 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2255 /// ...
2256 /// %b + 0 * %s + (w - 1)
2257 ///
2258 /// %b + 1 * %s + 0
2259 /// %b + 1 * %s + 1
2260 /// %b + 1 * %s + 2
2261 /// ...
2262 /// %b + 1 * %s + (w - 1)
2263 /// ...
2264 ///
2265 /// %b + (n - 1) * %s + 0
2266 /// %b + (n - 1) * %s + 1
2267 /// %b + (n - 1) * %s + 2
2268 /// ...
2269 /// %b + (n - 1) * %s + (w - 1)
2270 ///
2271 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2272 ///
2273 /// \param PointerOps list of pointer arguments of loads.
2274 /// \param ElemTy original scalar type of loads.
2275 /// \param Alignment alignment of the first load.
2276 /// \param SortedIndices is the order of PointerOps as returned by
2277 /// `sortPtrAccesses`
2278 /// \param Diff Pointer difference between the lowest and the highest pointer
2279 /// in `PointerOps` as returned by `getPointersDiff`.
2280 /// \param Ptr0 first pointer in `PointerOps`.
2281 /// \param PtrN last pointer in `PointerOps`.
2282 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2283 /// of `SPtrInfo` necessary to generate the strided load later.
2284 bool analyzeConstantStrideCandidate(
2285 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2286 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2287 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2288
2289 /// Return true if an array of scalar loads can be replaced with a strided
2290 /// load (with run-time stride).
2291 /// \param PointerOps list of pointer arguments of loads.
2292 /// \param ScalarTy type of loads.
2293 /// \param CommonAlignment common alignment of loads as computed by
2294 /// `computeCommonAlignment<LoadInst>`.
2295 /// \param SortedIndices is a list of indices computed by this function such
2296 /// that the sequence `PointerOps[SortedIndices[0]],
2297 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2298 /// ordered by the coefficient of the stride. For example, if PointerOps is
2299 /// `%base + %stride, %base, %base + 2 * %stride` the `SortedIndices` will be
2300 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` would be
2301 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2302 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2303 /// of `SPtrInfo` necessary to generate the strided load later.
2304 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2305 Align CommonAlignment,
2306 SmallVectorImpl<unsigned> &SortedIndices,
2307 StridedPtrInfo &SPtrInfo) const;
2308
2309 /// Checks if the given array of loads can be represented as a vectorized,
2310 /// scatter or just simple gather.
2311 /// \param VL list of loads.
2312 /// \param VL0 main load value.
2313 /// \param Order returned order of load instructions.
2314 /// \param PointerOps returned list of pointer operands.
2315 /// \param BestVF return best vector factor, if recursive check found better
2316 /// vectorization sequences rather than masked gather.
2317 /// \param TryRecursiveCheck used to check if long masked gather can be
2318 /// represented as a series of loads/insert subvector, if profitable.
2321 SmallVectorImpl<Value *> &PointerOps,
2322 StridedPtrInfo &SPtrInfo,
2323 unsigned *BestVF = nullptr,
2324 bool TryRecursiveCheck = true) const;
2325
2326 /// Registers a non-vectorizable sequence of loads.
2327 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2328 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2329 }
2330
2331 /// Checks if the given load sequence is known to be non-vectorizable.
2332 template <typename T>
2334 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2335 }
2336
2338
2339 /// This structure holds any data we need about the edges being traversed
2340 /// during buildTreeRec(). We keep track of:
2341 /// (i) the user TreeEntry index, and
2342 /// (ii) the index of the edge.
2343 struct EdgeInfo {
2344 EdgeInfo() = default;
2345 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2346 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2347 /// The user TreeEntry.
2348 TreeEntry *UserTE = nullptr;
2349 /// The operand index of the use.
2350 unsigned EdgeIdx = UINT_MAX;
2351#ifndef NDEBUG
2352 friend inline raw_ostream &operator<<(raw_ostream &OS,
2353 const BoUpSLP::EdgeInfo &EI) {
2354 EI.dump(OS);
2355 return OS;
2356 }
2357 /// Debug print.
2358 void dump(raw_ostream &OS) const {
2359 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2360 << " EdgeIdx:" << EdgeIdx << "}";
2361 }
2362 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2363#endif
2364 bool operator == (const EdgeInfo &Other) const {
2365 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2366 }
2367
2368 operator bool() const { return UserTE != nullptr; }
2369 };
2370 friend struct DenseMapInfo<EdgeInfo>;
2371
2372 /// A helper class used for scoring candidates for two consecutive lanes.
2373 class LookAheadHeuristics {
2374 const TargetLibraryInfo &TLI;
2375 const DataLayout &DL;
2376 ScalarEvolution &SE;
2377 const BoUpSLP &R;
2378 int NumLanes; // Total number of lanes (aka vectorization factor).
2379 int MaxLevel; // The maximum recursion depth for accumulating score.
2380
2381 public:
2382 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2383 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2384 int MaxLevel)
2385 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2386 MaxLevel(MaxLevel) {}
2387
2388 // The hard-coded scores listed here are not very important, though it shall
2389 // be higher for better matches to improve the resulting cost. When
2390 // computing the scores of matching one sub-tree with another, we are
2391 // basically counting the number of values that are matching. So even if all
2392 // scores are set to 1, we would still get a decent matching result.
2393 // However, sometimes we have to break ties. For example we may have to
2394 // choose between matching loads vs matching opcodes. This is what these
2395 // scores are helping us with: they provide the order of preference. Also,
2396 // this is important if the scalar is externally used or used in another
2397 // tree entry node in the different lane.
2398
2399 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2400 static const int ScoreConsecutiveLoads = 4;
2401 /// The same load multiple times. This should have a better score than
2402 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2403 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2404 /// for a vector load plus 1.0 for a broadcast.
2405 static const int ScoreSplatLoads = 3;
2406 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2407 static const int ScoreReversedLoads = 3;
2408 /// A load candidate for masked gather.
2409 static const int ScoreMaskedGatherCandidate = 1;
2410 /// ExtractElementInst from same vector and consecutive indexes.
2411 static const int ScoreConsecutiveExtracts = 4;
2412 /// ExtractElementInst from same vector and reversed indices.
2413 static const int ScoreReversedExtracts = 3;
2414 /// Constants.
2415 static const int ScoreConstants = 2;
2416 /// Instructions with the same opcode.
2417 static const int ScoreSameOpcode = 2;
2418 /// Instructions with alt opcodes (e.g., add + sub).
2419 static const int ScoreAltOpcodes = 1;
2420 /// Identical instructions (a.k.a. splat or broadcast).
2421 static const int ScoreSplat = 1;
2422 /// Matching with an undef is preferable to failing.
2423 static const int ScoreUndef = 1;
2424 /// Score for failing to find a decent match.
2425 static const int ScoreFail = 0;
2426 /// Score if all users are vectorized.
2427 static const int ScoreAllUserVectorized = 1;
2428
2429 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2430 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2431 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2432 /// MainAltOps.
2433 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2434 ArrayRef<Value *> MainAltOps) const {
2435 if (!isValidElementType(V1->getType()) ||
2436 !isValidElementType(V2->getType()))
2437 return LookAheadHeuristics::ScoreFail;
2438
2439 if (V1 == V2) {
2440 if (isa<LoadInst>(V1)) {
2441 // Returns true if the users of V1 and V2 won't need to be extracted.
2442 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2443 // Bail out if we have too many uses to save compilation time.
2444 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2445 return false;
2446
2447 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2448 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2449 return U == U1 || U == U2 || R.isVectorized(U);
2450 });
2451 };
2452 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2453 };
2454 // A broadcast of a load can be cheaper on some targets.
2455 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2456 ElementCount::getFixed(NumLanes)) &&
2457 ((int)V1->getNumUses() == NumLanes ||
2458 AllUsersAreInternal(V1, V2)))
2459 return LookAheadHeuristics::ScoreSplatLoads;
2460 }
2461 return LookAheadHeuristics::ScoreSplat;
2462
2463
2464 auto CheckSameEntryOrFail = [&]() {
2465 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2467 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2468 !TEs2.empty() &&
2469 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2471 }
2472 return LookAheadHeuristics::ScoreFail;
2473 };
2474
2475 auto *LI1 = dyn_cast<LoadInst>(V1);
2476 auto *LI2 = dyn_cast<LoadInst>(V2);
2477 if (LI1 && LI2) {
2478 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2479 !LI2->isSimple())
2480 return CheckSameEntryOrFail();
2481
2482 std::optional<int64_t> Dist = getPointersDiff(
2483 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2484 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2485 if (!Dist || *Dist == 0) {
2486 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2487 getUnderlyingObject(LI2->getPointerOperand()) &&
2488 R.TTI->isLegalMaskedGather(
2489 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2490 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2491 return CheckSameEntryOrFail();
2492 }
2493 // The distance is too large - still may be profitable to use masked
2494 // loads/gathers.
2495 if (std::abs(*Dist) > NumLanes / 2)
2496 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2497 // This still will detect consecutive loads, but we might have "holes"
2498 // in some cases. It is ok for non-power-2 vectorization and may produce
2499 // better results. It should not affect current vectorization.
2500 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2501 : LookAheadHeuristics::ScoreReversedLoads;
2502 }
2503
2504 auto *C1 = dyn_cast<Constant>(V1);
2505 auto *C2 = dyn_cast<Constant>(V2);
2506 if (C1 && C2)
2507 return LookAheadHeuristics::ScoreConstants;
2508
2509 // Consider constants and buildvector compatible.
2510 if ((C1 && isa<InsertElementInst>(V2)) ||
2511 (C2 && isa<InsertElementInst>(V1)))
2512 return LookAheadHeuristics::ScoreConstants;
2513
2514 // Extracts from consecutive indexes of the same vector better score as
2515 // the extracts could be optimized away.
2516 Value *EV1;
2517 ConstantInt *Ex1Idx;
2518 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2519 // Undefs are always profitable for extractelements.
2520 // Compiler can easily combine poison and extractelement <non-poison> or
2521 // undef and extractelement <poison>. But combining undef +
2522 // extractelement <non-poison-but-may-produce-poison> requires some
2523 // extra operations.
2524 if (isa<UndefValue>(V2))
2525 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2528 Value *EV2 = nullptr;
2529 ConstantInt *Ex2Idx = nullptr;
2530 if (match(V2,
2532 m_Undef())))) {
2533 // Undefs are always profitable for extractelements.
2534 if (!Ex2Idx)
2535 return LookAheadHeuristics::ScoreUndef;
2536 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2537 return LookAheadHeuristics::ScoreUndef;
2538 if (EV2 == EV1) {
2539 int Idx1 = Ex1Idx->getZExtValue();
2540 int Idx2 = Ex2Idx->getZExtValue();
2541 int Dist = Idx2 - Idx1;
2542 // The distance is too large - still may be profitable to use
2543 // shuffles.
2544 if (std::abs(Dist) == 0)
2546 if (std::abs(Dist) > NumLanes / 2)
2550 }
2551 }
2552
2553 return CheckSameEntryOrFail();
2554 }
2555
2556 auto *I1 = dyn_cast<Instruction>(V1);
2557 auto *I2 = dyn_cast<Instruction>(V2);
2558 if (I1 && I2) {
2559 if (I1->getParent() != I2->getParent())
2560 return CheckSameEntryOrFail();
2561 SmallVector<Value *, 4> Ops(MainAltOps);
2562 Ops.push_back(I1);
2563 Ops.push_back(I2);
2564 InstructionsState S = getSameOpcode(Ops, TLI);
2565 // Note: Only consider instructions with <= 2 operands to avoid
2566 // complexity explosion.
2567 if (S &&
2568 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2569 !S.isAltShuffle()) &&
2570 all_of(Ops, [&S](Value *V) {
2571 return isa<PoisonValue>(V) ||
2572 cast<Instruction>(V)->getNumOperands() ==
2573 S.getMainOp()->getNumOperands();
2574 }))
2575 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2576 : LookAheadHeuristics::ScoreSameOpcode;
2577
2578
2579 if (I1 && isa<PoisonValue>(V2))
2580 return LookAheadHeuristics::ScoreSameOpcode;
2581
2582 if (isa<UndefValue>(V2))
2583 return LookAheadHeuristics::ScoreUndef;
2584
2585 return CheckSameEntryOrFail();
2586 }
2587
2588 /// Go through the operands of \p LHS and \p RHS recursively until
2589 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2590 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2591 /// of \p U1 and \p U2), except at the beginning of the recursion where
2592 /// these are set to nullptr.
2593 ///
2594 /// For example:
2595 /// \verbatim
2596 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2597 /// \ / \ / \ / \ /
2598 /// + + + +
2599 /// G1 G2 G3 G4
2600 /// \endverbatim
2601 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2602 /// each level recursively, accumulating the score. It starts from matching
2603 /// the additions at level 0, then moves on to the loads (level 1). The
2604 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2605 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2606 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2607 /// Please note that the order of the operands does not matter, as we
2608 /// evaluate the score of all profitable combinations of operands. In
2609 /// other words the score of G1 and G4 is the same as G1 and G2. This
2610 /// heuristic is based on ideas described in:
2611 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2612 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2613 /// Luís F. W. Góes
2614 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2615 Instruction *U2, int CurrLevel,
2616 ArrayRef<Value *> MainAltOps) const {
2617
2618 // Get the shallow score of V1 and V2.
2619 int ShallowScoreAtThisLevel =
2620 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2621
2622 // If reached MaxLevel,
2623 // or if V1 and V2 are not instructions,
2624 // or if they are SPLAT,
2625 // or if they are not consecutive,
2626 // or if profitable to vectorize loads or extractelements, early return
2627 // the current cost.
2628 auto *I1 = dyn_cast<Instruction>(LHS);
2629 auto *I2 = dyn_cast<Instruction>(RHS);
2630 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2631 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2632 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2633 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2635 ShallowScoreAtThisLevel))
2636 return ShallowScoreAtThisLevel;
2637 assert(I1 && I2 && "Should have early exited.");
2638
2639 // Contains the I2 operand indexes that got matched with I1 operands.
2640 SmallSet<unsigned, 4> Op2Used;
2641
2642 // Recursion towards the operands of I1 and I2. We are trying all possible
2643 // operand pairs, and keeping track of the best score.
2644 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2645 OpIdx1 != NumOperands1; ++OpIdx1) {
2646 // Try to pair op1I with the best operand of I2.
2647 int MaxTmpScore = 0;
2648 unsigned MaxOpIdx2 = 0;
2649 bool FoundBest = false;
2650 // If I2 is commutative try all combinations.
2651 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2652 unsigned ToIdx = isCommutative(I2)
2653 ? I2->getNumOperands()
2654 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2655 assert(FromIdx <= ToIdx && "Bad index");
2656 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2657 // Skip operands already paired with OpIdx1.
2658 if (Op2Used.count(OpIdx2))
2659 continue;
2660 // Recursively calculate the cost at each level
2661 int TmpScore =
2662 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2663 I1, I2, CurrLevel + 1, {});
2664 // Look for the best score.
2665 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2666 TmpScore > MaxTmpScore) {
2667 MaxTmpScore = TmpScore;
2668 MaxOpIdx2 = OpIdx2;
2669 FoundBest = true;
2670 }
2671 }
2672 if (FoundBest) {
2673 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2674 Op2Used.insert(MaxOpIdx2);
2675 ShallowScoreAtThisLevel += MaxTmpScore;
2676 }
2677 }
2678 return ShallowScoreAtThisLevel;
2679 }
2680 };
2681 /// A helper data structure to hold the operands of a vector of instructions.
2682 /// This supports a fixed vector length for all operand vectors.
2683 class VLOperands {
2684 /// For each operand we need (i) the value, and (ii) the opcode that it
2685 /// would be attached to if the expression was in a left-linearized form.
2686 /// This is required to avoid illegal operand reordering.
2687 /// For example:
2688 /// \verbatim
2689 /// 0 Op1
2690 /// |/
2691 /// Op1 Op2 Linearized + Op2
2692 /// \ / ----------> |/
2693 /// - -
2694 ///
2695 /// Op1 - Op2 (0 + Op1) - Op2
2696 /// \endverbatim
2697 ///
2698 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2699 ///
2700 /// Another way to think of this is to track all the operations across the
2701 /// path from the operand all the way to the root of the tree and to
2702 /// calculate the operation that corresponds to this path. For example, the
2703 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2704 /// corresponding operation is a '-' (which matches the one in the
2705 /// linearized tree, as shown above).
2706 ///
2707 /// For lack of a better term, we refer to this operation as Accumulated
2708 /// Path Operation (APO).
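/// Illustrative example (assumed): in the left-linearized form
/// ((0 + Op1) - Op2) - Op3, Op1 carries APO 'false' (an effective '+'),
/// while Op2 and Op3 carry APO 'true' (an effective '-').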
2709 struct OperandData {
2710 OperandData() = default;
2711 OperandData(Value *V, bool APO, bool IsUsed)
2712 : V(V), APO(APO), IsUsed(IsUsed) {}
2713 /// The operand value.
2714 Value *V = nullptr;
2715 /// TreeEntries only allow a single opcode, or an alternate sequence of
2716 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2717 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2718 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2719 /// (e.g., Add/Mul)
2720 bool APO = false;
2721 /// Helper data for the reordering function.
2722 bool IsUsed = false;
2723 };
2724
2725 /// During operand reordering, we are trying to select the operand at lane
2726 /// that matches best with the operand at the neighboring lane. Our
2727 /// selection is based on the type of value we are looking for. For example,
2728 /// if the neighboring lane has a load, we need to look for a load that is
2729 /// accessing a consecutive address. These strategies are summarized in the
2730 /// 'ReorderingMode' enumerator.
2731 enum class ReorderingMode {
2732 Load, ///< Matching loads to consecutive memory addresses
2733 Opcode, ///< Matching instructions based on opcode (same or alternate)
2734 Constant, ///< Matching constants
2735 Splat, ///< Matching the same instruction multiple times (broadcast)
2736 Failed, ///< We failed to create a vectorizable group
2737 };
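// Illustrative example (assumed): if the operand selected in the previous
// lane is a load, ReorderingMode::Load makes the search below prefer a load
// from a consecutive address among the current lane's candidates.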
2738
2739 using OperandDataVec = SmallVector<OperandData, 2>;
2740
2741 /// A vector of operand vectors.
2742 SmallVector<OperandDataVec, 4> OpsVec;
2743 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2744 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2745 unsigned ArgSize = 0;
2746
2747 const TargetLibraryInfo &TLI;
2748 const DataLayout &DL;
2749 ScalarEvolution &SE;
2750 const BoUpSLP &R;
2751 const Loop *L = nullptr;
2752
2753 /// \returns the operand data at \p OpIdx and \p Lane.
2754 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2755 return OpsVec[OpIdx][Lane];
2756 }
2757
2758 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2759 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2760 return OpsVec[OpIdx][Lane];
2761 }
2762
2763 /// Clears the used flag for all entries.
2764 void clearUsed() {
2765 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2766 OpIdx != NumOperands; ++OpIdx)
2767 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2768 ++Lane)
2769 OpsVec[OpIdx][Lane].IsUsed = false;
2770 }
2771
2772 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2773 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2774 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2775 }
2776
2777 /// \param Lane lane of the operands under analysis.
2778 /// \param OpIdx operand index in \p Lane lane we're looking the best
2779 /// candidate for.
2780 /// \param Idx operand index of the current candidate value.
2781 /// \returns The additional score due to possible broadcasting of the
2782 /// elements in the lane. It is more profitable to have power-of-2 unique
2783 /// elements in the lane, as it will be vectorized with higher probability
2784 /// after removing duplicates. Currently the SLP vectorizer supports only
2785 /// vectorization of the power-of-2 number of unique scalars.
2786 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2787 const SmallBitVector &UsedLanes) const {
2788 Value *IdxLaneV = getData(Idx, Lane).V;
2789 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2790 isa<ExtractElementInst>(IdxLaneV))
2791 return 0;
2793 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2794 if (Ln == Lane)
2795 continue;
2796 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2797 if (!isa<Instruction>(OpIdxLnV))
2798 return 0;
2799 Uniques.try_emplace(OpIdxLnV, Ln);
2800 }
2801 unsigned UniquesCount = Uniques.size();
2802 auto IdxIt = Uniques.find(IdxLaneV);
2803 unsigned UniquesCntWithIdxLaneV =
2804 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2805 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2806 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2807 unsigned UniquesCntWithOpIdxLaneV =
2808 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2809 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2810 return 0;
2811 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2812 UniquesCntWithOpIdxLaneV,
2813 UniquesCntWithOpIdxLaneV -
2814 bit_floor(UniquesCntWithOpIdxLaneV)) -
2815 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2816 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2817 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2818 }
2819
2820 /// \param Lane lane of the operands under analysis.
2821 /// \param OpIdx operand index in \p Lane lane we're looking the best
2822 /// candidate for.
2823 /// \param Idx operand index of the current candidate value.
2824 /// \returns The additional score for the scalar which users are all
2825 /// vectorized.
2826 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2827 Value *IdxLaneV = getData(Idx, Lane).V;
2828 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2829 // Do not care about number of uses for vector-like instructions
2830 // (extractelement/extractvalue with constant indices), they are extracts
2831 // themselves and already externally used. Vectorization of such
2832 // instructions does not add extra extractelement instruction, just may
2833 // remove it.
2834 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2835 isVectorLikeInstWithConstOps(OpIdxLaneV))
2836 return LookAheadHeuristics::ScoreAllUserVectorized;
2837 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2838 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2839 return 0;
2840 return R.areAllUsersVectorized(IdxLaneI)
2841 ? LookAheadHeuristics::ScoreAllUserVectorized
2842 : 0;
2843 }
2844
2845 /// Score scaling factor for fully compatible instructions but with
2846 /// different number of external uses. Allows better selection of the
2847 /// instructions with less external uses.
2848 static const int ScoreScaleFactor = 10;
2849
2850 /// \Returns the look-ahead score, which tells us how much the sub-trees
2851 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2852 /// score. This helps break ties in an informed way when we cannot decide on
2853 /// the order of the operands by just considering the immediate
2854 /// predecessors.
2855 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2856 int Lane, unsigned OpIdx, unsigned Idx,
2857 bool &IsUsed, const SmallBitVector &UsedLanes) {
2858 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2860 // Keep track of the instruction stack as we recurse into the operands
2861 // during the look-ahead score exploration.
2862 int Score =
2863 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2864 /*CurrLevel=*/1, MainAltOps);
2865 if (Score) {
2866 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2867 if (Score <= -SplatScore) {
2868 // Failed score.
2869 Score = 0;
2870 } else {
2871 Score += SplatScore;
2872 // Scale score to see the difference between different operands
2873 // and similar operands but all vectorized/not all vectorized
2874 // uses. It does not affect actual selection of the best
2875 // compatible operand in general, just allows to select the
2876 // operand with all vectorized uses.
2877 Score *= ScoreScaleFactor;
2878 Score += getExternalUseScore(Lane, OpIdx, Idx);
2879 IsUsed = true;
2880 }
2881 }
2882 return Score;
2883 }
2884
2885 /// Best defined scores per lanes between the passes. Used to choose the
2886 /// best operand (with the highest score) between the passes.
2887 /// The key - {Operand Index, Lane}.
2888 /// The value - the best score between the passes for the lane and the
2889 /// operand.
2891 BestScoresPerLanes;
2892
2893 // Search all operands in Ops[*][Lane] for the one that best matches
2894 // Ops[OpIdx][LastLane] and return its operand index.
2895 // If no good match can be found, return std::nullopt.
2896 std::optional<unsigned>
2897 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2898 ArrayRef<ReorderingMode> ReorderingModes,
2899 ArrayRef<Value *> MainAltOps,
2900 const SmallBitVector &UsedLanes) {
2901 unsigned NumOperands = getNumOperands();
2902
2903 // The operand of the previous lane at OpIdx.
2904 Value *OpLastLane = getData(OpIdx, LastLane).V;
2905
2906 // Our strategy mode for OpIdx.
2907 ReorderingMode RMode = ReorderingModes[OpIdx];
2908 if (RMode == ReorderingMode::Failed)
2909 return std::nullopt;
2910
2911 // The linearized opcode of the operand at OpIdx, Lane.
2912 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2913
2914 // The best operand index and its score.
2915 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2916 // are using the score to differentiate between the two.
2917 struct BestOpData {
2918 std::optional<unsigned> Idx;
2919 unsigned Score = 0;
2920 } BestOp;
2921 BestOp.Score =
2922 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2923 .first->second;
2924
2925 // Track if the operand must be marked as used. If the operand is set to
2926 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2927 // want to reestimate the operands again on the following iterations).
2928 bool IsUsed = RMode == ReorderingMode::Splat ||
2929 RMode == ReorderingMode::Constant ||
2930 RMode == ReorderingMode::Load;
2931 // Iterate through all unused operands and look for the best.
2932 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2933 // Get the operand at Idx and Lane.
2934 OperandData &OpData = getData(Idx, Lane);
2935 Value *Op = OpData.V;
2936 bool OpAPO = OpData.APO;
2937
2938 // Skip already selected operands.
2939 if (OpData.IsUsed)
2940 continue;
2941
2942 // Skip if we are trying to move the operand to a position with a
2943 // different opcode in the linearized tree form. This would break the
2944 // semantics.
2945 if (OpAPO != OpIdxAPO)
2946 continue;
2947
2948 // Look for an operand that matches the current mode.
2949 switch (RMode) {
2950 case ReorderingMode::Load:
2951 case ReorderingMode::Opcode: {
2952 bool LeftToRight = Lane > LastLane;
2953 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2954 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2955 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2956 OpIdx, Idx, IsUsed, UsedLanes);
2957 if (Score > static_cast<int>(BestOp.Score) ||
2958 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2959 Idx == OpIdx)) {
2960 BestOp.Idx = Idx;
2961 BestOp.Score = Score;
2962 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2963 }
2964 break;
2965 }
2966 case ReorderingMode::Constant:
2967 if (isa<Constant>(Op) ||
2968 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2969 BestOp.Idx = Idx;
2970 if (isa<Constant>(Op)) {
2971 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2972 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2973 LookAheadHeuristics::ScoreConstants;
2974 }
2976 IsUsed = false;
2977 }
2978 break;
2979 case ReorderingMode::Splat:
2980 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2981 IsUsed = Op == OpLastLane;
2982 if (Op == OpLastLane) {
2983 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2984 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2985 LookAheadHeuristics::ScoreSplat;
2986 }
2987 BestOp.Idx = Idx;
2988 }
2989 break;
2990 case ReorderingMode::Failed:
2991 llvm_unreachable("Not expected Failed reordering mode.");
2992 }
2993 }
2994
2995 if (BestOp.Idx) {
2996 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2997 return BestOp.Idx;
2998 }
2999 // If we could not find a good match return std::nullopt.
3000 return std::nullopt;
3001 }
3002
3003 /// Helper for reorderOperandVecs.
3004 /// \returns the lane that we should start reordering from. This is the one
3005 /// which has the least number of operands that can freely move about, or is
3006 /// least profitable because it already has the most optimal set of operands.
3007 unsigned getBestLaneToStartReordering() const {
3008 unsigned Min = UINT_MAX;
3009 unsigned SameOpNumber = 0;
3010 // std::pair<unsigned, unsigned> is used to implement a simple voting
3011 // algorithm and choose the lane with the least number of operands that
3012 // can freely move about or less profitable because it already has the
3013 // most optimal set of operands. The first unsigned is a counter for
3014 // voting, the second unsigned is the counter of lanes with instructions
3015 // with same/alternate opcodes and same parent basic block.
3017 // Try to be closer to the original results, if we have multiple lanes
3018 // with same cost. If 2 lanes have the same cost, use the one with the
3019 // highest index.
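// A lane that improves on the NumOfAPOs / same-opcode criteria (re)seeds the
// vote for its operand-ordering hash, a lane that merely ties bumps the vote
// count, and the selection loop at the end returns the lane recorded for the
// hash with the fewest votes.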
3020 for (int I = getNumLanes(); I > 0; --I) {
3021 unsigned Lane = I - 1;
3022 OperandsOrderData NumFreeOpsHash =
3023 getMaxNumOperandsThatCanBeReordered(Lane);
3024 // Compare the number of operands that can move and choose the one with
3025 // the least number.
3026 if (NumFreeOpsHash.NumOfAPOs < Min) {
3027 Min = NumFreeOpsHash.NumOfAPOs;
3028 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3029 HashMap.clear();
3030 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3031 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3032 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3033 // Select the most optimal lane in terms of number of operands that
3034 // should be moved around.
3035 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3036 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3037 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3038 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3039 auto [It, Inserted] =
3040 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3041 if (!Inserted)
3042 ++It->second.first;
3043 }
3044 }
3045 // Select the lane with the minimum counter.
3046 unsigned BestLane = 0;
3047 unsigned CntMin = UINT_MAX;
3048 for (const auto &Data : reverse(HashMap)) {
3049 if (Data.second.first < CntMin) {
3050 CntMin = Data.second.first;
3051 BestLane = Data.second.second;
3052 }
3053 }
3054 return BestLane;
3055 }
3056
3057 /// Data structure that helps to reorder operands.
3058 struct OperandsOrderData {
3059 /// The best number of operands with the same APOs, which can be
3060 /// reordered.
3061 unsigned NumOfAPOs = UINT_MAX;
3062 /// Number of operands with the same/alternate instruction opcode and
3063 /// parent.
3064 unsigned NumOpsWithSameOpcodeParent = 0;
3065 /// Hash for the actual operands ordering.
3066 /// Used to count operands, actually their position id and opcode
3067 /// value. It is used in the voting mechanism to find the lane with the
3068 /// least number of operands that can freely move about or is less profitable
3069 /// because it already has the most optimal set of operands. Can be
3070 /// replaced with SmallVector<unsigned> instead but hash code is faster
3071 /// and requires less memory.
3072 unsigned Hash = 0;
3073 };
3074 /// \returns the maximum number of operands that are allowed to be reordered
3075 /// for \p Lane and the number of compatible instructions (with the same
3076 /// parent/opcode). This is used as a heuristic for selecting the first lane
3077 /// to start operand reordering.
3078 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3079 unsigned CntTrue = 0;
3080 unsigned NumOperands = getNumOperands();
3081 // Operands with the same APO can be reordered. We therefore need to count
3082 // how many of them we have for each APO, like this: Cnt[APO] = x.
3083 // Since we only have two APOs, namely true and false, we can avoid using
3084 // a map. Instead we can simply count the number of operands that
3085 // correspond to one of them (in this case the 'true' APO), and calculate
3086 // the other by subtracting it from the total number of operands.
3087 // Operands with the same instruction opcode and parent are more
3088 // profitable since we don't need to move them in many cases, with a high
3089 // probability such lane already can be vectorized effectively.
3090 bool AllUndefs = true;
3091 unsigned NumOpsWithSameOpcodeParent = 0;
3092 Instruction *OpcodeI = nullptr;
3093 BasicBlock *Parent = nullptr;
3094 unsigned Hash = 0;
3095 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3096 const OperandData &OpData = getData(OpIdx, Lane);
3097 if (OpData.APO)
3098 ++CntTrue;
3099 // Use Boyer-Moore majority voting for finding the majority opcode and
3100 // the number of times it occurs.
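// Note that getSameOpcode() also treats alternate opcodes (e.g. add vs. sub)
// as a match here, and, as with classic Boyer-Moore voting, the counter is
// only a lower bound on the number of matching operands; it is used purely as
// a relative heuristic across lanes.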
3101 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3102 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3103 I->getParent() != Parent) {
3104 if (NumOpsWithSameOpcodeParent == 0) {
3105 NumOpsWithSameOpcodeParent = 1;
3106 OpcodeI = I;
3107 Parent = I->getParent();
3108 } else {
3109 --NumOpsWithSameOpcodeParent;
3110 }
3111 } else {
3112 ++NumOpsWithSameOpcodeParent;
3113 }
3114 }
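// The hash below mixes each operand's position (OpIdx + 1) with its ValueID,
// so lanes whose operand positions hold values of the same kind produce equal
// hashes and vote for the same entry in getBestLaneToStartReordering().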
3115 Hash = hash_combine(
3116 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3117 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3118 }
3119 if (AllUndefs)
3120 return {};
3121 OperandsOrderData Data;
3122 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3123 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3124 Data.Hash = Hash;
3125 return Data;
3126 }
3127
3128 /// Go through the instructions in VL and append their operands.
3129 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3130 const InstructionsState &S) {
3131 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3132 assert((empty() || all_of(Operands,
3133 [this](const ValueList &VL) {
3134 return VL.size() == getNumLanes();
3135 })) &&
3136 "Expected same number of lanes");
3137 assert(S.valid() && "InstructionsState is invalid.");
3138 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3139 // arguments to the intrinsic produces the same result.
3140 Instruction *MainOp = S.getMainOp();
3141 unsigned NumOperands = MainOp->getNumOperands();
3142 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3143 OpsVec.resize(ArgSize);
3144 unsigned NumLanes = VL.size();
3145 for (OperandDataVec &Ops : OpsVec)
3146 Ops.resize(NumLanes);
3147 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3148 // Our tree has just 3 nodes: the root and two operands.
3149 // It is therefore trivial to get the APO. We only need to check the
3150 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3151 // operand. The LHS operand of both add and sub is never attached to an
3152 // inverse operation in the linearized form, therefore its APO is
3153 // false. The RHS is true only if V is an inverse operation.
3154
3155 // Since operand reordering is performed on groups of commutative
3156 // operations or alternating sequences (e.g., +, -), we can safely tell
3157 // the inverse operations by checking commutativity.
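// For example, for VL = {a0 + b0, a1 - b1} the operands are laid out as
// OpsVec[0] = {a0, a1} and OpsVec[1] = {b0, b1}; every operand at OpIdx == 0
// gets APO == false, while b1 gets APO == true because lane 1 is the inverse
// (sub) operation.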
3158 auto *I = dyn_cast<Instruction>(VL[Lane]);
3159 if (!I && isa<PoisonValue>(VL[Lane])) {
3160 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3161 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3162 continue;
3163 }
3164 bool IsInverseOperation = false;
3165 if (S.isCopyableElement(VL[Lane])) {
3166 // The value is a copyable element.
3167 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3168 } else {
3169 assert(I && "Expected instruction");
3170 auto [SelectedOp, Ops] = convertTo(I, S);
3171 // We cannot check commutativity by the converted instruction
3172 // (SelectedOp) because isCommutative also examines def-use
3173 // relationships.
3174 IsInverseOperation = !isCommutative(SelectedOp, I);
3175 }
3176 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3177 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3178 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3179 }
3180 }
3181 }
3182
3183 /// \returns the number of operands.
3184 unsigned getNumOperands() const { return ArgSize; }
3185
3186 /// \returns the number of lanes.
3187 unsigned getNumLanes() const { return OpsVec[0].size(); }
3188
3189 /// \returns the operand value at \p OpIdx and \p Lane.
3190 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3191 return getData(OpIdx, Lane).V;
3192 }
3193
3194 /// \returns true if the data structure is empty.
3195 bool empty() const { return OpsVec.empty(); }
3196
3197 /// Clears the data.
3198 void clear() { OpsVec.clear(); }
3199
3200 /// \returns true if there are enough operands identical to \p Op to fill
3201 /// the whole vector (possibly mixed with constants or loop-invariant values).
3202 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
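/// A typical caller is reorder(): shouldBroadcast() on the first lane is one
/// of the checks that switches an operand index into ReorderingMode::Splat
/// instead of the Opcode/Load modes.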
3203 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3204 assert(Op == getValue(OpIdx, Lane) &&
3205 "Op is expected to be getValue(OpIdx, Lane).");
3206 // Small number of loads - try load matching.
3207 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3208 return false;
3209 bool OpAPO = getData(OpIdx, Lane).APO;
3210 bool IsInvariant = L && L->isLoopInvariant(Op);
3211 unsigned Cnt = 0;
3212 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3213 if (Ln == Lane)
3214 continue;
3215 // This is set to true if we found a candidate for broadcast at Lane.
3216 bool FoundCandidate = false;
3217 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3218 OperandData &Data = getData(OpI, Ln);
3219 if (Data.APO != OpAPO || Data.IsUsed)
3220 continue;
3221 Value *OpILane = getValue(OpI, Lane);
3222 bool IsConstantOp = isa<Constant>(OpILane);
3223 // Consider the broadcast candidate if:
3224 // 1. Same value is found in one of the operands.
3225 if (Data.V == Op ||
3226 // 2. The operand in the given lane is not constant but there is a
3227 // constant operand in another lane (which can be moved to the
3228 // given lane). In this case we can represent it as a simple
3229 // permutation of constant and broadcast.
3230 (!IsConstantOp &&
3231 ((Lns > 2 && isa<Constant>(Data.V)) ||
3232 // 2.1. If we have only 2 lanes, need to check that value in the
3233 // next lane does not build same opcode sequence.
3234 (Lns == 2 &&
3235 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3236 isa<Constant>(Data.V)))) ||
3237 // 3. The operand in the current lane is loop invariant (can be
3238 // hoisted out) and another operand is also a loop invariant
3239 // (though not a constant). In this case the whole vector can be
3240 // hoisted out.
3241 // FIXME: need to teach the cost model about this case for better
3242 // estimation.
3243 (IsInvariant && !isa<Constant>(Data.V) &&
3244 !getSameOpcode({Op, Data.V}, TLI) &&
3245 L->isLoopInvariant(Data.V))) {
3246 FoundCandidate = true;
3247 Data.IsUsed = Data.V == Op;
3248 if (Data.V == Op)
3249 ++Cnt;
3250 break;
3251 }
3252 }
3253 if (!FoundCandidate)
3254 return false;
3255 }
3256 return getNumLanes() == 2 || Cnt > 1;
3257 }
3258
3259 /// Checks if there is at least a single operand in lanes other than
3260 /// \p Lane that is compatible with the operand \p Op.
3261 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3262 assert(Op == getValue(OpIdx, Lane) &&
3263 "Op is expected to be getValue(OpIdx, Lane).");
3264 bool OpAPO = getData(OpIdx, Lane).APO;
3265 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3266 if (Ln == Lane)
3267 continue;
3268 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3269 const OperandData &Data = getData(OpI, Ln);
3270 if (Data.APO != OpAPO || Data.IsUsed)
3271 return true;
3272 Value *OpILn = getValue(OpI, Ln);
3273 return (L && L->isLoopInvariant(OpILn)) ||
3274 (getSameOpcode({Op, OpILn}, TLI) &&
3275 allSameBlock({Op, OpILn}));
3276 }))
3277 return true;
3278 }
3279 return false;
3280 }
3281
3282 public:
3283 /// Initialize with all the operands of the instruction vector \p RootVL.
3284 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3285 const InstructionsState &S, const BoUpSLP &R)
3286 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3287 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3288 // Append all the operands of RootVL.
3289 appendOperands(RootVL, Operands, S);
3290 }
3291
3292 /// \returns a value vector with the operands across all lanes for the
3293 /// operand at \p OpIdx.
3294 ValueList getVL(unsigned OpIdx) const {
3295 ValueList OpVL(OpsVec[OpIdx].size());
3296 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3297 "Expected same num of lanes across all operands");
3298 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3299 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3300 return OpVL;
3301 }
3302
3303 // Performs operand reordering for 2 or more operands.
3304 // The operands are stored in OpsVec[OpIdx][Lane] and are reordered
3305 // in place, lane by lane.
3306 void reorder() {
3307 unsigned NumOperands = getNumOperands();
3308 unsigned NumLanes = getNumLanes();
3309 // Each operand has its own mode. We are using this mode to help us select
3310 // the instructions for each lane, so that they match best with the ones
3311 // we have selected so far.
3312 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3313
3314 // This is a greedy single-pass algorithm. We are going over each lane
3315 // once and deciding on the best order right away with no back-tracking.
3316 // However, in order to increase its effectiveness, we start with the lane
3317 // that has operands that can move the least. For example, given the
3318 // following lanes:
3319 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3320 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3321 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3322 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3323 // we will start at Lane 1, since the operands of the subtraction cannot
3324 // be reordered. Then we will visit the rest of the lanes in a circular
3325 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
3326
3327 // Find the first lane that we will start our search from.
3328 unsigned FirstLane = getBestLaneToStartReordering();
3329
3330 // Initialize the modes.
3331 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3332 Value *OpLane0 = getValue(OpIdx, FirstLane);
3333 // Keep track if we have instructions with all the same opcode on one
3334 // side.
3335 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3336 // Check if OpLane0 should be broadcast.
3337 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3338 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3339 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3340 else if (isa<LoadInst>(OpILane0))
3341 ReorderingModes[OpIdx] = ReorderingMode::Load;
3342 else
3343 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3344 } else if (isa<Constant>(OpLane0)) {
3345 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3346 } else if (isa<Argument>(OpLane0)) {
3347 // Our best hope is a Splat. It may save some cost in some cases.
3348 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3349 } else {
3350 llvm_unreachable("Unexpected value kind.");
3351 }
3352 }
3353
3354 // Check that we don't have the same operands. No need to reorder if the
3355 // operands are just a perfect diamond or shuffled diamond match. Do not
3356 // skip it for possible broadcasts or a non-power-of-2 number of scalars
3357 // (just for now).
3358 auto &&SkipReordering = [this]() {
3359 SmallPtrSet<Value *, 4> UniqueValues;
3360 ArrayRef<OperandData> Op0 = OpsVec.front();
3361 for (const OperandData &Data : Op0)
3362 UniqueValues.insert(Data.V);
3363 for (ArrayRef<OperandData> Op :
3364 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3365 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3366 return !UniqueValues.contains(Data.V);
3367 }))
3368 return false;
3369 }
3370 // TODO: Check if we can remove a check for non-power-2 number of
3371 // scalars after full support of non-power-2 vectorization.
3372 return UniqueValues.size() != 2 &&
3373 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3374 UniqueValues.size());
3375 };
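// For instance, with four lanes (a + b), (c + d), (b + a), (d + c) every
// second-operand value already occurs among the first operands, the four
// unique values form a full power-of-2 vector, and reordering is skipped;
// with only two unique values the node is treated as a possible broadcast
// and is still reordered.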
3376
3377 // If the initial strategy fails for any of the operand indexes, then we
3378 // perform reordering again in a second pass. This helps avoid assigning
3379 // high priority to the failed strategy, and should improve reordering for
3380 // the non-failed operand indexes.
3381 for (int Pass = 0; Pass != 2; ++Pass) {
3382 // Check if there is no need to reorder operands since they are a
3383 // perfect or shuffled diamond match.
3384 // Need to do it to avoid extra external use cost counting for
3385 // shuffled matches, which may cause regressions.
3386 if (SkipReordering())
3387 break;
3388 // Skip the second pass if the first pass did not fail.
3389 bool StrategyFailed = false;
3390 // Mark all operand data as free to use.
3391 clearUsed();
3392 // We keep the original operand order for the FirstLane, so reorder the
3393 // rest of the lanes. We are visiting the nodes in a circular fashion,
3394 // using FirstLane as the center point and increasing the radius
3395 // distance.
3396 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3397 for (unsigned I = 0; I < NumOperands; ++I)
3398 MainAltOps[I].push_back(getData(I, FirstLane).V);
3399
3400 SmallBitVector UsedLanes(NumLanes);
3401 UsedLanes.set(FirstLane);
3402 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3403 // Visit the lane on the right and then the lane on the left.
3404 for (int Direction : {+1, -1}) {
3405 int Lane = FirstLane + Direction * Distance;
3406 if (Lane < 0 || Lane >= (int)NumLanes)
3407 continue;
3408 UsedLanes.set(Lane);
3409 int LastLane = Lane - Direction;
3410 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3411 "Out of bounds");
3412 // Look for a good match for each operand.
3413 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3414 // Search for the operand that best matches the one already chosen for LastLane.
3415 std::optional<unsigned> BestIdx =
3416 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3417 MainAltOps[OpIdx], UsedLanes);
3418 // By not selecting a value, we allow the operands that follow to
3419 // select a better matching value. We will get a non-null value in
3420 // the next run of getBestOperand().
3421 if (BestIdx) {
3422 // Swap the current operand with the one returned by
3423 // getBestOperand().
3424 swap(OpIdx, *BestIdx, Lane);
3425 } else {
3426 // Enable the second pass.
3427 StrategyFailed = true;
3428 }
3429 // Try to get the alternate opcode and follow it during analysis.
3430 if (MainAltOps[OpIdx].size() != 2) {
3431 OperandData &AltOp = getData(OpIdx, Lane);
3432 InstructionsState OpS =
3433 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3434 if (OpS && OpS.isAltShuffle())
3435 MainAltOps[OpIdx].push_back(AltOp.V);
3436 }
3437 }
3438 }
3439 }
3440 // Skip second pass if the strategy did not fail.
3441 if (!StrategyFailed)
3442 break;
3443 }
3444 }
3445
3446#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3447 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3448 switch (RMode) {
3449 case ReorderingMode::Load:
3450 return "Load";
3451 case ReorderingMode::Opcode:
3452 return "Opcode";
3453 case ReorderingMode::Constant:
3454 return "Constant";
3455 case ReorderingMode::Splat:
3456 return "Splat";
3457 case ReorderingMode::Failed:
3458 return "Failed";
3459 }
3460 llvm_unreachable("Unimplemented Reordering Type");
3461 }
3462
3463 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3464 raw_ostream &OS) {
3465 return OS << getModeStr(RMode);
3466 }
3467
3468 /// Debug print.
3469 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3470 printMode(RMode, dbgs());
3471 }
3472
3473 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3474 return printMode(RMode, OS);
3475 }
3476
3477 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3478 const unsigned Indent = 2;
3479 unsigned Cnt = 0;
3480 for (const OperandDataVec &OpDataVec : OpsVec) {
3481 OS << "Operand " << Cnt++ << "\n";
3482 for (const OperandData &OpData : OpDataVec) {
3483 OS.indent(Indent) << "{";
3484 if (Value *V = OpData.V)
3485 OS << *V;
3486 else
3487 OS << "null";
3488 OS << ", APO:" << OpData.APO << "}\n";
3489 }
3490 OS << "\n";
3491 }
3492 return OS;
3493 }
3494
3495 /// Debug print.
3496 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3497#endif
3498 };
3499
3500 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3501 /// of the pair with the highest score, deemed to have the best chance to form
3502 /// the root of a profitable tree to vectorize. Return std::nullopt if no
3503 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
3504 /// limit of the cost, considered to be a good enough score.
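/// For example, when \p Candidates holds {(a[0], a[1]), (x, y)} and a[0] and
/// a[1] are consecutive loads, the index of the load pair is normally
/// returned, since consecutive loads score well above ScoreFail in the
/// look-ahead heuristics.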
3505 std::optional<int>
3506 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3507 int Limit = LookAheadHeuristics::ScoreFail) const {
3508 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3509 RootLookAheadMaxDepth);
3510 int BestScore = Limit;
3511 std::optional<int> Index;
3512 for (int I : seq<int>(0, Candidates.size())) {
3513 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3514 Candidates[I].second,
3515 /*U1=*/nullptr, /*U2=*/nullptr,
3516 /*CurrLevel=*/1, {});
3517 if (Score > BestScore) {
3518 BestScore = Score;
3519 Index = I;
3520 }
3521 }
3522 return Index;
3523 }
3524
3525 /// Checks if the instruction is marked for deletion.
3526 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3527
3528 /// Removes an instruction from its block and eventually deletes it.
3529 /// It's like Instruction::eraseFromParent() except that the actual deletion
3530 /// is delayed until BoUpSLP is destructed.
3531 void eraseInstruction(Instruction *I) {
3532 DeletedInstructions.insert(I);
3533 }
3534
3535 /// Remove instructions from the parent function and clear the operands of \p
3536 /// DeadVals instructions, marking for deletion trivially dead operands.
3537 template <typename T>
3538 void removeInstructionsAndOperands(
3539 ArrayRef<T *> DeadVals,
3540 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3541 SmallVector<WeakTrackingVH> DeadInsts;
3542 for (T *V : DeadVals) {
3543 auto *I = cast<Instruction>(V);
3544 DeletedInstructions.insert(I);
3545 }
3546 DenseSet<Value *> Processed;
3547 for (T *V : DeadVals) {
3548 if (!V || !Processed.insert(V).second)
3549 continue;
3550 auto *I = cast<Instruction>(V);
3551 salvageDebugInfo(*I);
3552 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3553 for (Use &U : I->operands()) {
3554 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3555 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3556 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3557 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3558 return Entry->VectorizedValue == OpI;
3559 })))
3560 DeadInsts.push_back(OpI);
3561 }
3562 I->dropAllReferences();
3563 }
3564 for (T *V : DeadVals) {
3565 auto *I = cast<Instruction>(V);
3566 if (!I->getParent())
3567 continue;
3568 assert((I->use_empty() || all_of(I->uses(),
3569 [&](Use &U) {
3570 return isDeleted(
3571 cast<Instruction>(U.getUser()));
3572 })) &&
3573 "trying to erase instruction with users.");
3574 I->removeFromParent();
3575 SE->forgetValue(I);
3576 }
3577 // Process the dead instruction list until empty.
3578 while (!DeadInsts.empty()) {
3579 Value *V = DeadInsts.pop_back_val();
3580 Instruction *VI = cast_or_null<Instruction>(V);
3581 if (!VI || !VI->getParent())
3582 continue;
3584 "Live instruction found in dead worklist!");
3585 assert(VI->use_empty() && "Instructions with uses are not dead.");
3586
3587 // Don't lose the debug info while deleting the instructions.
3588 salvageDebugInfo(*VI);
3589
3590 // Null out all of the instruction's operands to see if any operand
3591 // becomes dead as we go.
3592 for (Use &OpU : VI->operands()) {
3593 Value *OpV = OpU.get();
3594 if (!OpV)
3595 continue;
3596 OpU.set(nullptr);
3597
3598 if (!OpV->use_empty())
3599 continue;
3600
3601 // If the operand is an instruction that became dead as we nulled out
3602 // the operand, and if it is 'trivially' dead, delete it in a future
3603 // loop iteration.
3604 if (auto *OpI = dyn_cast<Instruction>(OpV))
3605 if (!DeletedInstructions.contains(OpI) &&
3606 (!OpI->getType()->isVectorTy() ||
3607 none_of(VectorValuesAndScales,
3608 [&](const std::tuple<Value *, unsigned, bool> &V) {
3609 return std::get<0>(V) == OpI;
3610 })) &&
3611 isInstructionTriviallyDead(OpI, TLI))
3612 DeadInsts.push_back(OpI);
3613 }
3614
3615 VI->removeFromParent();
3616 eraseInstruction(VI);
3617 SE->forgetValue(VI);
3618 }
3619 }
3620
3621 /// Checks if the instruction was already analyzed for being possible
3622 /// reduction root.
3623 bool isAnalyzedReductionRoot(Instruction *I) const {
3624 return AnalyzedReductionsRoots.count(I);
3625 }
3626 /// Register given instruction as already analyzed for being possible
3627 /// reduction root.
3628 void analyzedReductionRoot(Instruction *I) {
3629 AnalyzedReductionsRoots.insert(I);
3630 }
3631 /// Checks if the provided list of reduced values was checked already for
3632 /// vectorization.
3633 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3634 return AnalyzedReductionVals.contains(hash_value(VL));
3635 }
3636 /// Adds the list of reduced values to list of already checked values for the
3637 /// vectorization.
3638 void analyzedReductionVals(ArrayRef<Value *> VL) {
3639 AnalyzedReductionVals.insert(hash_value(VL));
3640 }
3641 /// Clear the list of the analyzed reduction root instructions.
3642 void clearReductionData() {
3643 AnalyzedReductionsRoots.clear();
3644 AnalyzedReductionVals.clear();
3645 AnalyzedMinBWVals.clear();
3646 }
3647 /// Checks if the given value is gathered in one of the nodes.
3648 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3649 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3650 }
3651 /// Checks if the given value is gathered in one of the nodes.
3652 bool isGathered(const Value *V) const {
3653 return MustGather.contains(V);
3654 }
3655 /// Checks if the specified value was not scheduled.
3656 bool isNotScheduled(const Value *V) const {
3657 return NonScheduledFirst.contains(V);
3658 }
3659
3660 /// Check if the value is vectorized in the tree.
3661 bool isVectorized(const Value *V) const {
3662 assert(V && "V cannot be nullptr.");
3663 return ScalarToTreeEntries.contains(V);
3664 }
3665
3666 ~BoUpSLP();
3667
3668private:
3669 /// Determine if a node \p E can be demoted to a smaller type with a
3670 /// truncation. We collect the entries that will be demoted in ToDemote.
3671 /// \param E Node for analysis
3672 /// \param ToDemote indices of the nodes to be demoted.
3673 bool collectValuesToDemote(
3674 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3675 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3676 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3677 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3678
3679 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3680 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3681 /// they have only one user and are reorderable).
3682 /// \param ReorderableGathers List of all gather nodes that require reordering
3683 /// (e.g., gather of extractelements or partially vectorizable loads).
3684 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3685 /// reordering, subset of \p NonVectorized.
3686 void buildReorderableOperands(
3687 TreeEntry *UserTE,
3688 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3689 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3690 SmallVectorImpl<TreeEntry *> &GatherOps);
3691
3692 /// Checks if the given \p TE is a gather node with clustered reused scalars
3693 /// and reorders it per given \p Mask.
3694 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3695
3696 /// Checks if all users of \p I are the part of the vectorization tree.
3697 bool areAllUsersVectorized(
3698 Instruction *I,
3699 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3700
3701 /// Return information about the vector formed for the specified index
3702 /// of a vector of (the same) instruction.
3703 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3704
3705 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3706 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3707 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3708 return const_cast<TreeEntry *>(
3709 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3710 }
3711
3712 /// Gets the root instruction for the given node. If the node is a strided
3713 /// load/store node with the reverse order, the root instruction is the last
3714 /// one.
3715 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3716
3717 /// \returns Cast context for the given graph node.
3718 TargetTransformInfo::CastContextHint
3719 getCastContextHint(const TreeEntry &TE) const;
3720
3721 /// \returns the cost of the vectorizable entry.
3722 InstructionCost getEntryCost(const TreeEntry *E,
3723 ArrayRef<Value *> VectorizedVals,
3724 SmallPtrSetImpl<Value *> &CheckedExtracts);
3725
3726 /// Checks if it is legal and profitable to build SplitVectorize node for the
3727 /// given \p VL.
3728 /// \param Op1 first homogeneous scalars.
3729 /// \param Op2 second homogeneous scalars.
3730 /// \param ReorderIndices indices to reorder the scalars.
3731 /// \returns true if the node was successfully built.
3732 bool canBuildSplitNode(ArrayRef<Value *> VL,
3733 const InstructionsState &LocalState,
3734 SmallVectorImpl<Value *> &Op1,
3735 SmallVectorImpl<Value *> &Op2,
3736 OrdersType &ReorderIndices) const;
3737
3738 /// This is the recursive part of buildTree.
3739 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3740 unsigned InterleaveFactor = 0);
3741
3742 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3743 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3744 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3745 /// returns false, setting \p CurrentOrder to either an empty vector or a
3746 /// non-identity permutation that allows to reuse extract instructions.
3747 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3748 /// extract order.
3749 bool canReuseExtract(ArrayRef<Value *> VL,
3750 SmallVectorImpl<unsigned> &CurrentOrder,
3751 bool ResizeAllowed = false) const;
3752
3753 /// Vectorize a single entry in the tree.
3754 Value *vectorizeTree(TreeEntry *E);
3755
3756 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3757 /// \p E.
3758 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3759
3760 /// Create a new vector from a list of scalar values. Produces a sequence
3761 /// which exploits values reused across lanes, and arranges the inserts
3762 /// for ease of later optimization.
3763 template <typename BVTy, typename ResTy, typename... Args>
3764 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3765
3766 /// Create a new vector from a list of scalar values. Produces a sequence
3767 /// which exploits values reused across lanes, and arranges the inserts
3768 /// for ease of later optimization.
3769 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3770
3771 /// Returns the instruction in the bundle, which can be used as a base point
3772 /// for scheduling. Usually it is the last instruction in the bundle, except
3773 /// for the case when all operands are external (in this case, it is the first
3774 /// instruction in the list).
3775 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3776
3777 /// Tries to find extractelement instructions with constant indices from fixed
3778 /// vector type and gather such instructions into a bunch, which highly likely
3779 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3780 /// was successful, the matched scalars are replaced by poison values in \p VL
3781 /// for future analysis.
3782 std::optional<TargetTransformInfo::ShuffleKind>
3783 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3784 SmallVectorImpl<int> &Mask) const;
3785
3786 /// Tries to find extractelement instructions with constant indices from fixed
3787 /// vector type and gather such instructions into a bunch, which highly likely
3788 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3789 /// was successful, the matched scalars are replaced by poison values in \p VL
3790 /// for future analysis.
3791 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3792 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3793 SmallVectorImpl<int> &Mask,
3794 unsigned NumParts) const;
3795
3796 /// Checks if the gathered \p VL can be represented as a single register
3797 /// shuffle(s) of previous tree entries.
3798 /// \param TE Tree entry checked for permutation.
3799 /// \param VL List of scalars (a subset of the TE scalar), checked for
3800 /// permutations. Must form single-register vector.
3801 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3802 /// commands to build the mask using the original vector value, without
3803 /// relying on the potential reordering.
3804 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3805 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3806 std::optional<TargetTransformInfo::ShuffleKind>
3807 isGatherShuffledSingleRegisterEntry(
3808 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3809 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3810 bool ForOrder);
3811
3812 /// Checks if the gathered \p VL can be represented as multi-register
3813 /// shuffle(s) of previous tree entries.
3814 /// \param TE Tree entry checked for permutation.
3815 /// \param VL List of scalars (a subset of the TE scalar), checked for
3816 /// permutations.
3817 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3818 /// commands to build the mask using the original vector value, without
3819 /// relying on the potential reordering.
3820 /// \returns per-register series of ShuffleKind, if gathered values can be
3821 /// represented as shuffles of previous tree entries. \p Mask is filled with
3822 /// the shuffle mask (also on per-register base).
3823 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3824 isGatherShuffledEntry(
3825 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3826 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3827 unsigned NumParts, bool ForOrder = false);
3828
3829 /// \returns the cost of gathering (inserting) the values in \p VL into a
3830 /// vector.
3831 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3832 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3833 Type *ScalarTy) const;
3834
3835 /// Set the Builder insert point to one after the last instruction in
3836 /// the bundle
3837 void setInsertPointAfterBundle(const TreeEntry *E);
3838
3839 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3840 /// specified, the starting vector value is poison.
3841 Value *
3842 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3843 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3844
3845 /// \returns whether the VectorizableTree is fully vectorizable and will
3846 /// be beneficial even the tree height is tiny.
3847 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3848
3849 /// Run through the list of all gathered loads in the graph and try to find
3850 /// vector loads/masked gathers instead of regular gathers. Later these loads
3851 /// are reshuffled to build final gathered nodes.
3852 void tryToVectorizeGatheredLoads(
3853 const SmallMapVector<
3854 std::tuple<BasicBlock *, Value *, Type *>,
3855 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3856 &GatheredLoads);
3857
3858 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3859 /// users of \p TE and collects the stores. It returns the map from the store
3860 /// pointers to the collected stores.
3862 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3863
3864 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3865 /// stores in \p StoresVec can form a vector instruction. If so it returns
3866 /// true and populates \p ReorderIndices with the shuffle indices of the
3867 /// stores when compared to the sorted vector.
3868 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3869 OrdersType &ReorderIndices) const;
3870
3871 /// Iterates through the users of \p TE, looking for scalar stores that can be
3872 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3873 /// their order and builds an order index vector for each store bundle. It
3874 /// returns all these order vectors found.
3875 /// We run this after the tree has formed, otherwise we may come across user
3876 /// instructions that are not yet in the tree.
3877 SmallVector<OrdersType, 1>
3878 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3879
3880 /// Tries to reorder the gathering node for better vectorization
3881 /// opportunities.
3882 void reorderGatherNode(TreeEntry &TE);
3883
3884 class TreeEntry {
3885 public:
3886 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3887 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3888
3889 /// \returns Common mask for reorder indices and reused scalars.
3890 SmallVector<int> getCommonMask() const {
3891 if (State == TreeEntry::SplitVectorize)
3892 return {};
3893 SmallVector<int> Mask;
3894 inversePermutation(ReorderIndices, Mask);
3895 ::addMask(Mask, ReuseShuffleIndices);
3896 return Mask;
3897 }
3898
3899 /// \returns The mask for split nodes.
3900 SmallVector<int> getSplitMask() const {
3901 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3902 "Expected only split vectorize node.");
3903 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3904 unsigned CommonVF = std::max<unsigned>(
3905 CombinedEntriesWithIndices.back().second,
3906 Scalars.size() - CombinedEntriesWithIndices.back().second);
3907 for (auto [Idx, I] : enumerate(ReorderIndices))
3908 Mask[I] =
3909 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3910 ? CommonVF - CombinedEntriesWithIndices.back().second
3911 : 0);
3912 return Mask;
3913 }
3914
3915 /// Updates (reorders) SplitVectorize node according to the given mask \p
3916 /// Mask and order \p MaskOrder.
3917 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3918 ArrayRef<int> MaskOrder);
3919
3920 /// \returns true if the scalars in VL are equal to this entry.
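/// For example, an entry with Scalars = {a, b} and ReuseShuffleIndices =
/// {0, 1, 0, 1} reports isSame({a, b, a, b}) as true.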
3921 bool isSame(ArrayRef<Value *> VL) const {
3922 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3923 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3924 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3925 return VL.size() == Mask.size() &&
3926 std::equal(VL.begin(), VL.end(), Mask.begin(),
3927 [Scalars](Value *V, int Idx) {
3928 return (isa<UndefValue>(V) &&
3929 Idx == PoisonMaskElem) ||
3930 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3931 });
3932 };
3933 if (!ReorderIndices.empty()) {
3934 // TODO: implement matching if the nodes are just reordered, still can
3935 // treat the vector as the same if the list of scalars matches VL
3936 // directly, without reordering.
3937 SmallVector<int> Mask;
3938 inversePermutation(ReorderIndices, Mask);
3939 if (VL.size() == Scalars.size())
3940 return IsSame(Scalars, Mask);
3941 if (VL.size() == ReuseShuffleIndices.size()) {
3942 ::addMask(Mask, ReuseShuffleIndices);
3943 return IsSame(Scalars, Mask);
3944 }
3945 return false;
3946 }
3947 return IsSame(Scalars, ReuseShuffleIndices);
3948 }
3949
3950 /// \returns true if current entry has same operands as \p TE.
3951 bool hasEqualOperands(const TreeEntry &TE) const {
3952 if (TE.getNumOperands() != getNumOperands())
3953 return false;
3954 SmallBitVector Used(getNumOperands());
3955 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3956 unsigned PrevCount = Used.count();
3957 for (unsigned K = 0; K < E; ++K) {
3958 if (Used.test(K))
3959 continue;
3960 if (getOperand(K) == TE.getOperand(I)) {
3961 Used.set(K);
3962 break;
3963 }
3964 }
3965 // Check if we actually found the matching operand.
3966 if (PrevCount == Used.count())
3967 return false;
3968 }
3969 return true;
3970 }
3971
3972 /// \return Final vectorization factor for the node. Defined by the total
3973 /// number of vectorized scalars, including those used several times in the
3974 /// entry and counted in the \a ReuseShuffleIndices, if any.
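/// For example, a node with Scalars = {a, b} and ReuseShuffleIndices =
/// {0, 1, 0, 1} has a vector factor of 4, while the same node without reused
/// scalars has a vector factor of 2.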
3975 unsigned getVectorFactor() const {
3976 if (!ReuseShuffleIndices.empty())
3977 return ReuseShuffleIndices.size();
3978 return Scalars.size();
3979 };
3980
3981 /// Checks if the current node is a gather node.
3982 bool isGather() const { return State == NeedToGather; }
3983
3984 /// A vector of scalars.
3985 ValueList Scalars;
3986
3987 /// The Scalars are vectorized into this value. It is initialized to Null.
3988 WeakTrackingVH VectorizedValue = nullptr;
3989
3990 /// Do we need to gather this sequence or vectorize it
3991 /// (either with vector instruction or with scatter/gather
3992 /// intrinsics for store/load)?
3993 enum EntryState {
3994 Vectorize, ///< The node is regularly vectorized.
3995 ScatterVectorize, ///< Masked scatter/gather node.
3996 StridedVectorize, ///< Strided loads (and stores)
3997 CompressVectorize, ///< (Masked) load with compress.
3998 NeedToGather, ///< Gather/buildvector node.
3999 CombinedVectorize, ///< Vectorized node, combined with its user into more
4000 ///< complex node like select/cmp to minmax, mul/add to
4001 ///< fma, etc. Must be used for the following nodes in
4002 ///< the pattern, not the very first one.
4003 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4004 ///< independently and then combines back.
4005 };
4006 EntryState State;
4007
4008 /// List of combined opcodes supported by the vectorizer.
4009 enum CombinedOpcode {
4010 NotCombinedOp = -1,
4011 MinMax = Instruction::OtherOpsEnd + 1,
4012 FMulAdd,
4013 };
4014 CombinedOpcode CombinedOp = NotCombinedOp;
4015
4016 /// Does this sequence require some shuffling?
4017 SmallVector<int, 4> ReuseShuffleIndices;
4018
4019 /// Does this entry require reordering?
4020 SmallVector<unsigned, 4> ReorderIndices;
4021
4022 /// Points back to the VectorizableTree.
4023 ///
4024 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4025 /// to be a pointer and needs to be able to initialize the child iterator.
4026 /// Thus we need a reference back to the container to translate the indices
4027 /// to entries.
4028 VecTreeTy &Container;
4029
4030 /// The TreeEntry index containing the user of this entry.
4031 EdgeInfo UserTreeIndex;
4032
4033 /// The index of this treeEntry in VectorizableTree.
4034 unsigned Idx = 0;
4035
4036 /// For gather/buildvector/alt opcode nodes, which are combined from
4037 /// other nodes as a series of insertvector instructions.
4038 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4039
4040 private:
4041 /// The operands of each instruction in each lane Operands[op_index][lane].
4042 /// Note: This helps avoid the replication of the code that performs the
4043 /// reordering of operands during buildTreeRec() and vectorizeTree().
4044 SmallVector<ValueList, 2> Operands;
4045
4046 /// Copyable elements of the entry node.
4047 SmallPtrSet<const Value *, 4> CopyableElements;
4048
4049 /// MainOp and AltOp are recorded inside. S should be obtained from
4050 /// newTreeEntry.
4051 InstructionsState S = InstructionsState::invalid();
4052
4053 /// Interleaving factor for interleaved loads Vectorize nodes.
4054 unsigned InterleaveFactor = 0;
4055
4056 /// True if the node does not require scheduling.
4057 bool DoesNotNeedToSchedule = false;
4058
4059 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4060 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4061 if (Operands.size() < OpIdx + 1)
4062 Operands.resize(OpIdx + 1);
4063 assert(Operands[OpIdx].empty() && "Already resized?");
4064 assert(OpVL.size() <= Scalars.size() &&
4065 "Number of operands is greater than the number of scalars.");
4066 Operands[OpIdx].resize(OpVL.size());
4067 copy(OpVL, Operands[OpIdx].begin());
4068 }
4069
4070 public:
4071 /// Returns interleave factor for interleave nodes.
4072 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4073 /// Sets interleaving factor for the interleaving nodes.
4074 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4075
4076 /// Marks the node as one that does not require scheduling.
4077 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4078 /// Returns true if the node is marked as one that does not require
4079 /// scheduling.
4080 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4081
4082 /// Set this bundle's operands from \p Operands.
4083 void setOperands(ArrayRef<ValueList> Operands) {
4084 for (unsigned I : seq<unsigned>(Operands.size()))
4085 setOperand(I, Operands[I]);
4086 }
4087
4088 /// Reorders operands of the node to the given mask \p Mask.
4089 void reorderOperands(ArrayRef<int> Mask) {
4090 for (ValueList &Operand : Operands)
4091 reorderScalars(Operand, Mask);
4092 }
4093
4094 /// \returns the \p OpIdx operand of this TreeEntry.
4095 ValueList &getOperand(unsigned OpIdx) {
4096 assert(OpIdx < Operands.size() && "Off bounds");
4097 return Operands[OpIdx];
4098 }
4099
4100 /// \returns the \p OpIdx operand of this TreeEntry.
4101 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4102 assert(OpIdx < Operands.size() && "Off bounds");
4103 return Operands[OpIdx];
4104 }
4105
4106 /// \returns the number of operands.
4107 unsigned getNumOperands() const { return Operands.size(); }
4108
4109 /// \return the single \p OpIdx operand.
4110 Value *getSingleOperand(unsigned OpIdx) const {
4111 assert(OpIdx < Operands.size() && "Off bounds");
4112 assert(!Operands[OpIdx].empty() && "No operand available");
4113 return Operands[OpIdx][0];
4114 }
4115
4116 /// Some of the instructions in the list have alternate opcodes.
4117 bool isAltShuffle() const { return S.isAltShuffle(); }
4118
4119 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4120 return S.getMatchingMainOpOrAltOp(I);
4121 }
4122
4123 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4124 /// alternate) opcode as the main operation of this entry, the key is \p Op.
4125 /// Otherwise the key is the main operation.
4126 Value *isOneOf(Value *Op) const {
4127 auto *I = dyn_cast<Instruction>(Op);
4128 if (I && getMatchingMainOpOrAltOp(I))
4129 return Op;
4130 return S.getMainOp();
4131 }
4132
4133 void setOperations(const InstructionsState &S) {
4134 assert(S && "InstructionsState is invalid.");
4135 this->S = S;
4136 }
4137
4138 Instruction *getMainOp() const { return S.getMainOp(); }
4139
4140 Instruction *getAltOp() const { return S.getAltOp(); }
4141
4142 /// The main/alternate opcodes for the list of instructions.
4143 unsigned getOpcode() const { return S.getOpcode(); }
4144
4145 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4146
4147 bool hasState() const { return S.valid(); }
4148
4149 /// Add \p V to the list of copyable elements.
4150 void addCopyableElement(Value *V) {
4151 assert(S.isCopyableElement(V) && "Not a copyable element.");
4152 CopyableElements.insert(V);
4153 }
4154
4155 /// Returns true if \p V is a copyable element.
4156 bool isCopyableElement(Value *V) const {
4157 return CopyableElements.contains(V);
4158 }
4159
4160 /// Returns true if any scalar in the list is a copyable element.
4161 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4162
4163 /// Returns the state of the operations.
4164 const InstructionsState &getOperations() const { return S; }
4165
4166 /// When ReuseShuffleIndices is empty it just returns the position of \p
4167 /// V within the vector of Scalars. Otherwise, tries to remap it to its reuse index.
4168 unsigned findLaneForValue(Value *V) const {
4169 unsigned FoundLane = getVectorFactor();
4170 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4171 std::advance(It, 1)) {
4172 if (*It != V)
4173 continue;
4174 FoundLane = std::distance(Scalars.begin(), It);
4175 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4176 if (!ReorderIndices.empty())
4177 FoundLane = ReorderIndices[FoundLane];
4178 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4179 if (ReuseShuffleIndices.empty())
4180 break;
4181 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4182 RIt != ReuseShuffleIndices.end()) {
4183 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4184 break;
4185 }
4186 }
4187 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4188 return FoundLane;
4189 }
4190
4191 /// Build a shuffle mask for graph entry which represents a merge of main
4192 /// and alternate operations.
4193 void
4194 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4195 SmallVectorImpl<int> &Mask,
4196 SmallVectorImpl<Value *> *OpScalars = nullptr,
4197 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4198
4199 /// Return true if this is a non-power-of-2 node.
4200 bool isNonPowOf2Vec() const {
4201 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4202 return IsNonPowerOf2;
4203 }
4204
4205 /// Return true if this is a node, which tries to vectorize number of
4206 /// elements, forming whole vectors.
4207 bool
4208 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4209 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4210 TTI, getValueType(Scalars.front()), Scalars.size());
4211 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4212 "Reshuffling not supported with non-power-of-2 vectors yet.");
4213 return IsNonPowerOf2;
4214 }
4215
4216 Value *getOrdered(unsigned Idx) const {
4217 assert(isGather() && "Must be used only for buildvectors/gathers.");
4218 if (ReorderIndices.empty())
4219 return Scalars[Idx];
4220 SmallVector<int> Mask;
4221 inversePermutation(ReorderIndices, Mask);
4222 return Scalars[Mask[Idx]];
4223 }
4224
4225#ifndef NDEBUG
4226 /// Debug printer.
4227 LLVM_DUMP_METHOD void dump() const {
4228 dbgs() << Idx << ".\n";
4229 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4230 dbgs() << "Operand " << OpI << ":\n";
4231 for (const Value *V : Operands[OpI])
4232 dbgs().indent(2) << *V << "\n";
4233 }
4234 dbgs() << "Scalars: \n";
4235 for (Value *V : Scalars)
4236 dbgs().indent(2) << *V << "\n";
4237 dbgs() << "State: ";
4238 if (S && hasCopyableElements())
4239 dbgs() << "[[Copyable]] ";
4240 switch (State) {
4241 case Vectorize:
4242 if (InterleaveFactor > 0) {
4243 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4244 << "\n";
4245 } else {
4246 dbgs() << "Vectorize\n";
4247 }
4248 break;
4249 case ScatterVectorize:
4250 dbgs() << "ScatterVectorize\n";
4251 break;
4252 case StridedVectorize:
4253 dbgs() << "StridedVectorize\n";
4254 break;
4255 case CompressVectorize:
4256 dbgs() << "CompressVectorize\n";
4257 break;
4258 case NeedToGather:
4259 dbgs() << "NeedToGather\n";
4260 break;
4261 case CombinedVectorize:
4262 dbgs() << "CombinedVectorize\n";
4263 break;
4264 case SplitVectorize:
4265 dbgs() << "SplitVectorize\n";
4266 break;
4267 }
4268 if (S) {
4269 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4270 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4271 } else {
4272 dbgs() << "MainOp: NULL\n";
4273 dbgs() << "AltOp: NULL\n";
4274 }
4275 dbgs() << "VectorizedValue: ";
4276 if (VectorizedValue)
4277 dbgs() << *VectorizedValue << "\n";
4278 else
4279 dbgs() << "NULL\n";
4280 dbgs() << "ReuseShuffleIndices: ";
4281 if (ReuseShuffleIndices.empty())
4282 dbgs() << "Empty";
4283 else
4284 for (int ReuseIdx : ReuseShuffleIndices)
4285 dbgs() << ReuseIdx << ", ";
4286 dbgs() << "\n";
4287 dbgs() << "ReorderIndices: ";
4288 for (unsigned ReorderIdx : ReorderIndices)
4289 dbgs() << ReorderIdx << ", ";
4290 dbgs() << "\n";
4291 dbgs() << "UserTreeIndex: ";
4292 if (UserTreeIndex)
4293 dbgs() << UserTreeIndex;
4294 else
4295 dbgs() << "<invalid>";
4296 dbgs() << "\n";
4297 if (!CombinedEntriesWithIndices.empty()) {
4298 dbgs() << "Combined entries: ";
4299 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4300 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4301 });
4302 dbgs() << "\n";
4303 }
4304 }
4305#endif
4306 };
4307
4308#ifndef NDEBUG
4309 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4310 InstructionCost VecCost, InstructionCost ScalarCost,
4311 StringRef Banner) const {
4312 dbgs() << "SLP: " << Banner << ":\n";
4313 E->dump();
4314 dbgs() << "SLP: Costs:\n";
4315 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4316 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4317 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4318 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4319 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4320 }
4321#endif
4322
4323 /// Create a new gather TreeEntry
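/// Passing an invalid ScheduleBundle to newTreeEntry() is what makes the
/// resulting entry a NeedToGather node.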
4324 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4325 const InstructionsState &S,
4326 const EdgeInfo &UserTreeIdx,
4327 ArrayRef<int> ReuseShuffleIndices = {}) {
4328 auto Invalid = ScheduleBundle::invalid();
4329 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4330 }
4331
4332 /// Create a new VectorizableTree entry.
4333 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4334 const InstructionsState &S,
4335 const EdgeInfo &UserTreeIdx,
4336 ArrayRef<int> ReuseShuffleIndices = {},
4337 ArrayRef<unsigned> ReorderIndices = {},
4338 unsigned InterleaveFactor = 0) {
4339 TreeEntry::EntryState EntryState =
4340 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4341 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4342 ReuseShuffleIndices, ReorderIndices);
4343 if (E && InterleaveFactor > 0)
4344 E->setInterleave(InterleaveFactor);
4345 return E;
4346 }
4347
4348 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4349 TreeEntry::EntryState EntryState,
4350 ScheduleBundle &Bundle, const InstructionsState &S,
4351 const EdgeInfo &UserTreeIdx,
4352 ArrayRef<int> ReuseShuffleIndices = {},
4353 ArrayRef<unsigned> ReorderIndices = {}) {
4354 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4355 EntryState == TreeEntry::SplitVectorize)) ||
4356 (Bundle && EntryState != TreeEntry::NeedToGather &&
4357 EntryState != TreeEntry::SplitVectorize)) &&
4358 "Need to vectorize gather entry?");
4359 // Gathered loads still gathered? Do not create entry, use the original one.
4360 if (GatheredLoadsEntriesFirst.has_value() &&
4361 EntryState == TreeEntry::NeedToGather && S &&
4362 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4363 !UserTreeIdx.UserTE)
4364 return nullptr;
4365 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4366 TreeEntry *Last = VectorizableTree.back().get();
4367 Last->Idx = VectorizableTree.size() - 1;
4368 Last->State = EntryState;
4369 if (UserTreeIdx.UserTE)
4370 OperandsToTreeEntry.try_emplace(
4371 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4372 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4373 // for non-power-of-two vectors.
4374 assert(
4375 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4376 ReuseShuffleIndices.empty()) &&
4377 "Reshuffling scalars not yet supported for nodes with padding");
4378 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4379 ReuseShuffleIndices.end());
4380 if (ReorderIndices.empty()) {
4381 Last->Scalars.assign(VL.begin(), VL.end());
4382 if (S)
4383 Last->setOperations(S);
4384 } else {
4385 // Reorder scalars and build final mask.
4386 Last->Scalars.assign(VL.size(), nullptr);
4387 transform(ReorderIndices, Last->Scalars.begin(),
4388 [VL](unsigned Idx) -> Value * {
4389 if (Idx >= VL.size())
4390 return UndefValue::get(VL.front()->getType());
4391 return VL[Idx];
4392 });
4393 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4394 if (S)
4395 Last->setOperations(S);
4396 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4397 }
4398 if (EntryState == TreeEntry::SplitVectorize) {
4399 assert(S && "Split nodes must have operations.");
4400 Last->setOperations(S);
4401 SmallPtrSet<Value *, 4> Processed;
4402 for (Value *V : VL) {
4403 auto *I = dyn_cast<Instruction>(V);
4404 if (!I)
4405 continue;
4406 auto It = ScalarsInSplitNodes.find(V);
4407 if (It == ScalarsInSplitNodes.end()) {
4408 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4409 (void)Processed.insert(V);
4410 } else if (Processed.insert(V).second) {
4411 assert(!is_contained(It->getSecond(), Last) &&
4412 "Value already associated with the node.");
4413 It->getSecond().push_back(Last);
4414 }
4415 }
4416 } else if (!Last->isGather()) {
4417 if (isa<PHINode>(S.getMainOp()) ||
4418 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4419 (!S.areInstructionsWithCopyableElements() &&
4420 doesNotNeedToSchedule(VL)) ||
4421 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4422 Last->setDoesNotNeedToSchedule();
4423 SmallPtrSet<Value *, 4> Processed;
4424 for (Value *V : VL) {
4425 if (isa<PoisonValue>(V))
4426 continue;
4427 if (S.isCopyableElement(V)) {
4428 Last->addCopyableElement(V);
4429 continue;
4430 }
4431 auto It = ScalarToTreeEntries.find(V);
4432 if (It == ScalarToTreeEntries.end()) {
4433 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4434 (void)Processed.insert(V);
4435 } else if (Processed.insert(V).second) {
4436 assert(!is_contained(It->getSecond(), Last) &&
4437 "Value already associated with the node.");
4438 It->getSecond().push_back(Last);
4439 }
4440 }
4441 // Update the scheduler bundle to point to this TreeEntry.
4442 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4443 "Bundle and VL out of sync");
4444 if (!Bundle.getBundle().empty()) {
4445#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4446 auto *BundleMember = Bundle.getBundle().begin();
4447 SmallPtrSet<Value *, 4> Processed;
4448 for (Value *V : VL) {
4449 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4450 continue;
4451 ++BundleMember;
4452 }
4453 assert(BundleMember == Bundle.getBundle().end() &&
4454 "Bundle and VL out of sync");
4455#endif
4456 Bundle.setTreeEntry(Last);
4457 }
4458 } else {
4459 // Build a map for gathered scalars to the nodes where they are used.
4460 bool AllConstsOrCasts = true;
4461 for (Value *V : VL) {
4462 if (S && S.areInstructionsWithCopyableElements() &&
4463 S.isCopyableElement(V))
4464 Last->addCopyableElement(V);
4465 if (!isConstant(V)) {
4466 auto *I = dyn_cast<CastInst>(V);
4467 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4468 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4469 !UserTreeIdx.UserTE->isGather())
4470 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4471 }
4472 }
4473 if (AllConstsOrCasts)
4474 CastMaxMinBWSizes =
4475 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4476 MustGather.insert_range(VL);
4477 }
4478
4479 if (UserTreeIdx.UserTE)
4480 Last->UserTreeIndex = UserTreeIdx;
4481 return Last;
4482 }
4483
4484 /// -- Vectorization State --
4485 /// Holds all of the tree entries.
4486 TreeEntry::VecTreeTy VectorizableTree;
4487
4488#ifndef NDEBUG
4489 /// Debug printer.
4490 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4491 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4492 VectorizableTree[Id]->dump();
4493 dbgs() << "\n";
4494 }
4495 }
4496#endif
4497
4498 /// Get list of vector entries, associated with the value \p V.
4499 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4500 assert(V && "V cannot be nullptr.");
4501 auto It = ScalarToTreeEntries.find(V);
4502 if (It == ScalarToTreeEntries.end())
4503 return {};
4504 return It->getSecond();
4505 }
4506
4507 /// Get list of split vector entries, associated with the value \p V.
4508 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4509 assert(V && "V cannot be nullptr.");
4510 auto It = ScalarsInSplitNodes.find(V);
4511 if (It == ScalarsInSplitNodes.end())
4512 return {};
4513 return It->getSecond();
4514 }
4515
4516 /// Returns first vector node for value \p V, matching values \p VL.
4517 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4518 bool SameVF = false) const {
4519 assert(V && "V cannot be nullptr.");
4520 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4521 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4522 return TE;
4523 return nullptr;
4524 }
4525
4526 /// Check that the operand node of the alternate node does not generate a
4527 /// buildvector sequence. If it does, it is probably not worth building an
4528 /// alternate shuffle when the number of buildvector operands plus the
4529 /// alternate instruction exceeds the number of buildvector instructions.
4530 /// \param S the instructions state of the analyzed values.
4531 /// \param VL list of the instructions with alternate opcodes.
4532 bool areAltOperandsProfitable(const InstructionsState &S,
4533 ArrayRef<Value *> VL) const;
4534
4535 /// Contains all the outputs of legality analysis for a list of values to
4536 /// vectorize.
4537 class ScalarsVectorizationLegality {
4538 InstructionsState S;
4539 bool IsLegal;
4540 bool TryToFindDuplicates;
4541 bool TrySplitVectorize;
4542
4543 public:
4544 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4545 bool TryToFindDuplicates = true,
4546 bool TrySplitVectorize = false)
4547 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4548 TrySplitVectorize(TrySplitVectorize) {
4549 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4550 "Inconsistent state");
4551 }
4552 const InstructionsState &getInstructionsState() const { return S; };
4553 bool isLegal() const { return IsLegal; }
4554 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4555 bool trySplitVectorize() const { return TrySplitVectorize; }
4556 };
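  // Illustrative sketch (not part of the original source): one way a caller
  // might consume a ScalarsVectorizationLegality result; the real consumers
  // live in the tree-building code, so the control flow below is an assumption.
  // \code
  //   ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
  //       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  //   if (!Legality.isLegal()) {
  //     if (Legality.trySplitVectorize())
  //       ; // try to split VL into independently vectorizable halves
  //     else if (Legality.tryToFindDuplicates())
  //       ; // deduplicate VL and retry, or fall back to a gather node
  //   }
  // \endcode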
4557
4558 /// Checks if the specified list of the instructions/values can be vectorized
4559 /// in general.
4560 ScalarsVectorizationLegality
4561 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4562 const EdgeInfo &UserTreeIdx,
4563 bool TryCopyableElementsVectorization) const;
4564
4565 /// Checks if the specified list of the instructions/values can be vectorized
4566 /// and fills required data before actual scheduling of the instructions.
4567 TreeEntry::EntryState getScalarsVectorizationState(
4568 const InstructionsState &S, ArrayRef<Value *> VL,
4569 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4570 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4571
4572 /// Maps a specific scalar to its tree entry(ies).
4573 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4574
4575 /// Maps the operand index and entry to the corresponding tree entry.
4576 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4577 OperandsToTreeEntry;
4578
4579 /// Scalars, used in split vectorize nodes.
4580 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4581
4582 /// Maps a value to the proposed vectorizable size.
4583 SmallDenseMap<Value *, unsigned> InstrElementSize;
4584
4585 /// A list of scalars that we found that we need to keep as scalars.
4586 ValueSet MustGather;
4587
4588 /// A set of first non-schedulable values.
4589 ValueSet NonScheduledFirst;
4590
4591 /// A map between the vectorized entries and the last instructions in the
4592 /// bundles. The bundles are built in use order, not in the def order of the
4593 /// instructions. So, we cannot rely directly on the last instruction in the
4594 /// bundle being the last instruction in program order during the
4595 /// vectorization process, since the basic blocks are affected; we need to
4596 /// pre-gather them beforehand.
4597 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4598
4599 /// List of gather nodes that depend on other gather/vector nodes and should
4600 /// be emitted after the vector instruction emission process to correctly
4601 /// handle the order of the vector instructions and shuffles.
4602 SetVector<const TreeEntry *> PostponedGathers;
4603
4604 using ValueToGatherNodesMap =
4605 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4606 ValueToGatherNodesMap ValueToGatherNodes;
4607
4608 /// A list of the load entries (node indices) that can be vectorized using a
4609 /// strided or masked gather approach, but were attempted to be represented as
4610 /// contiguous loads.
4611 SetVector<unsigned> LoadEntriesToVectorize;
4612
4613 /// True if the graph nodes transforming mode is on.
4614 bool IsGraphTransformMode = false;
4615
4616 /// The index of the first gathered load entry in the VectorizeTree.
4617 std::optional<unsigned> GatheredLoadsEntriesFirst;
4618
4619 /// Maps compress entries to their mask data for the final codegen.
4620 SmallDenseMap<const TreeEntry *,
4621 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4622 CompressEntryToData;
4623
4624 /// This POD struct describes one external user in the vectorized tree.
4625 struct ExternalUser {
4626 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4627 : Scalar(S), User(U), E(E), Lane(L) {}
4628
4629 /// Which scalar in our function.
4630 Value *Scalar = nullptr;
4631
4632 /// Which user that uses the scalar.
4633 llvm::User *User = nullptr;
4634
4635 /// Vector node, the value is part of.
4636 const TreeEntry &E;
4637
4638 /// Which lane does the scalar belong to.
4639 unsigned Lane;
4640 };
4641 using UserList = SmallVector<ExternalUser, 16>;
4642
4643 /// Checks if two instructions may access the same memory.
4644 ///
4645 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4646 /// is invariant in the calling loop.
4647 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4648 Instruction *Inst2) {
4649 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4650 // First check if the result is already in the cache.
4651 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4652 auto Res = AliasCache.try_emplace(Key);
4653 if (!Res.second)
4654 return Res.first->second;
4655 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4656 // Store the result in the cache.
4657 Res.first->getSecond() = Aliased;
4658 return Aliased;
4659 }
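  // Illustrative sketch (not part of the original source): the alias query is
  // memoized per (Inst1, Inst2) pair, so repeated queries within one BoUpSLP
  // run are served from AliasCache instead of BatchAA. "St" and "Ld" below are
  // hypothetical instructions.
  // \code
  //   MemoryLocation Loc = MemoryLocation::get(St); // location of Inst1
  //   bool A1 = isAliased(Loc, St, Ld); // queries BatchAA, fills the cache
  //   bool A2 = isAliased(Loc, St, Ld); // cache hit, same answer
  //   assert(A1 == A2);
  // \endcode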
4660
4661 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4662
4663 /// Cache for alias results.
4664 /// TODO: consider moving this to the AliasAnalysis itself.
4665 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4666
4667 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4668 // globally through SLP because we don't perform any action which
4669 // invalidates capture results.
4670 BatchAAResults BatchAA;
4671
4672 /// Temporary store for deleted instructions. Instructions will be deleted
4673 /// eventually when the BoUpSLP is destructed. The deferral is required to
4674 /// ensure that there are no incorrect collisions in the AliasCache, which
4675 /// can happen if a new instruction is allocated at the same address as a
4676 /// previously deleted instruction.
4677 DenseSet<Instruction *> DeletedInstructions;
4678
4679 /// Set of the instructions already analyzed for reductions.
4680 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4681
4682 /// Set of hashes for the list of reduction values already being analyzed.
4683 DenseSet<size_t> AnalyzedReductionVals;
4684
4685 /// Values already analyzed for minimal bitwidth and found to be
4686 /// non-profitable.
4687 DenseSet<Value *> AnalyzedMinBWVals;
4688
4689 /// A list of values that need to be extracted out of the tree.
4690 /// This list holds pairs of (Internal Scalar : External User). External User
4691 /// can be nullptr, it means that this Internal Scalar will be used later,
4692 /// after vectorization.
4693 UserList ExternalUses;
4694
4695 /// A list of GEPs which can be replaced by scalar GEPs instead of
4696 /// extractelement instructions.
4697 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4698
4699 /// A list of scalars to be extracted without a specific user, because of too
4700 /// many uses.
4701 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4702
4703 /// Values used only by @llvm.assume calls.
4704 SmallPtrSet<const Value *, 32> EphValues;
4705
4706 /// Holds all of the instructions that we gathered, shuffle instructions and
4707 /// extractelements.
4708 SetVector<Instruction *> GatherShuffleExtractSeq;
4709
4710 /// A list of blocks that we are going to CSE.
4711 DenseSet<BasicBlock *> CSEBlocks;
4712
4713 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4714 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4715
4716 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4717 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4718 /// single instruction, while ScheduleBundle represents a batch of
4719 /// instructions that are going to be grouped together. ScheduleCopyableData
4720 /// models an extra user for "copyable" instructions.
4721 class ScheduleEntity {
4722 friend class ScheduleBundle;
4723 friend class ScheduleData;
4724 friend class ScheduleCopyableData;
4725
4726 protected:
4727 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4728 Kind getKind() const { return K; }
4729 ScheduleEntity(Kind K) : K(K) {}
4730
4731 private:
4732 /// Used for getting a "good" final ordering of instructions.
4733 int SchedulingPriority = 0;
4734 /// True if this instruction (or bundle) is scheduled (or considered as
4735 /// scheduled in the dry-run).
4736 bool IsScheduled = false;
4737 /// The kind of the ScheduleEntity.
4738 const Kind K = Kind::ScheduleData;
4739
4740 public:
4741 ScheduleEntity() = delete;
4742 /// Gets/sets the scheduling priority.
4743 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4744 int getSchedulingPriority() const { return SchedulingPriority; }
4745 bool isReady() const {
4746 if (const auto *SD = dyn_cast<ScheduleData>(this))
4747 return SD->isReady();
4748 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4749 return CD->isReady();
4750 return cast<ScheduleBundle>(this)->isReady();
4751 }
4752 /// Returns true if the dependency information has been calculated.
4753 /// Note that dependency validity can vary between instructions within
4754 /// a single bundle.
4755 bool hasValidDependencies() const {
4756 if (const auto *SD = dyn_cast<ScheduleData>(this))
4757 return SD->hasValidDependencies();
4758 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4759 return CD->hasValidDependencies();
4760 return cast<ScheduleBundle>(this)->hasValidDependencies();
4761 }
4762 /// Gets the number of unscheduled dependencies.
4763 int getUnscheduledDeps() const {
4764 if (const auto *SD = dyn_cast<ScheduleData>(this))
4765 return SD->getUnscheduledDeps();
4766 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4767 return CD->getUnscheduledDeps();
4768 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4769 }
4770 /// Increments the number of unscheduled dependencies.
4771 int incrementUnscheduledDeps(int Incr) {
4772 if (auto *SD = dyn_cast<ScheduleData>(this))
4773 return SD->incrementUnscheduledDeps(Incr);
4774 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4775 }
4776 /// Gets the number of dependencies.
4777 int getDependencies() const {
4778 if (const auto *SD = dyn_cast<ScheduleData>(this))
4779 return SD->getDependencies();
4780 return cast<ScheduleCopyableData>(this)->getDependencies();
4781 }
4782 /// Gets the instruction.
4783 Instruction *getInst() const {
4784 if (const auto *SD = dyn_cast<ScheduleData>(this))
4785 return SD->getInst();
4786 return cast<ScheduleCopyableData>(this)->getInst();
4787 }
4788
4789 /// Gets/sets if the bundle is scheduled.
4790 bool isScheduled() const { return IsScheduled; }
4791 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4792
4793 static bool classof(const ScheduleEntity *) { return true; }
4794
4795#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4796 void dump(raw_ostream &OS) const {
4797 if (const auto *SD = dyn_cast<ScheduleData>(this))
4798 return SD->dump(OS);
4799 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4800 return CD->dump(OS);
4801 return cast<ScheduleBundle>(this)->dump(OS);
4802 }
4803
4804 LLVM_DUMP_METHOD void dump() const {
4805 dump(dbgs());
4806 dbgs() << '\n';
4807 }
4808#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4809 };
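  // Illustrative sketch (not part of the original source): ScheduleEntity uses
  // LLVM-style RTTI (getKind()/classof()), so scheduler code can dispatch with
  // isa/dyn_cast. The helper below is hypothetical.
  // \code
  //   static StringRef scheduleEntityKindName(const ScheduleEntity &SE) {
  //     if (isa<ScheduleCopyableData>(&SE))
  //       return "copyable data";
  //     if (isa<ScheduleBundle>(&SE))
  //       return "bundle";
  //     return "schedule data";
  //   }
  // \endcode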
4810
4811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4812 friend inline raw_ostream &operator<<(raw_ostream &OS,
4813 const BoUpSLP::ScheduleEntity &SE) {
4814 SE.dump(OS);
4815 return OS;
4816 }
4817#endif
4818
4819 /// Contains all scheduling relevant data for an instruction.
4820 /// A ScheduleData either represents a single instruction or a member of an
4821 /// instruction bundle (= a group of instructions which is combined into a
4822 /// vector instruction).
4823 class ScheduleData final : public ScheduleEntity {
4824 public:
4825 // The initial value for the dependency counters. It means that the
4826 // dependencies are not calculated yet.
4827 enum { InvalidDeps = -1 };
4828
4829 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4830 static bool classof(const ScheduleEntity *Entity) {
4831 return Entity->getKind() == Kind::ScheduleData;
4832 }
4833
4834 void init(int BlockSchedulingRegionID, Instruction *I) {
4835 NextLoadStore = nullptr;
4836 IsScheduled = false;
4837 SchedulingRegionID = BlockSchedulingRegionID;
4838 clearDependencies();
4839 Inst = I;
4840 }
4841
4842 /// Verify basic self consistency properties
4843 void verify() {
4844 if (hasValidDependencies()) {
4845 assert(UnscheduledDeps <= Dependencies && "invariant");
4846 } else {
4847 assert(UnscheduledDeps == Dependencies && "invariant");
4848 }
4849
4850 if (IsScheduled) {
4851 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4852 "unexpected scheduled state");
4853 }
4854 }
4855
4856 /// Returns true if the dependency information has been calculated.
4857 /// Note that dependency validity can vary between instructions within
4858 /// a single bundle.
4859 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4860
4861 /// Returns true if it is ready for scheduling, i.e. it has no more
4862 /// unscheduled depending instructions/bundles.
4863 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4864
4865 /// Modifies the number of unscheduled dependencies for this instruction,
4866 /// and returns the number of remaining dependencies for the containing
4867 /// bundle.
4868 int incrementUnscheduledDeps(int Incr) {
4869 assert(hasValidDependencies() &&
4870 "increment of unscheduled deps would be meaningless");
4871 UnscheduledDeps += Incr;
4872 return UnscheduledDeps;
4873 }
4874
4875 /// Sets the number of unscheduled dependencies to the number of
4876 /// dependencies.
4877 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4878
4879 /// Clears all dependency information.
4880 void clearDependencies() {
4881 clearDirectDependencies();
4882 MemoryDependencies.clear();
4883 ControlDependencies.clear();
4884 }
4885
4886 /// Clears all direct dependencies only, except for control and memory
4887 /// dependencies.
4888 /// Required for copyable elements to correctly handle control/memory deps
4889 /// and avoid extra recalculation of such deps.
4890 void clearDirectDependencies() {
4891 Dependencies = InvalidDeps;
4892 resetUnscheduledDeps();
4893 IsScheduled = false;
4894 }
4895
4896 /// Gets the number of unscheduled dependencies.
4897 int getUnscheduledDeps() const { return UnscheduledDeps; }
4898 /// Gets the number of dependencies.
4899 int getDependencies() const { return Dependencies; }
4900 /// Initializes the number of dependencies.
4901 void initDependencies() { Dependencies = 0; }
4902 /// Increments the number of dependencies.
4903 void incDependencies() { Dependencies++; }
4904
4905 /// Gets scheduling region ID.
4906 int getSchedulingRegionID() const { return SchedulingRegionID; }
4907
4908 /// Gets the instruction.
4909 Instruction *getInst() const { return Inst; }
4910
4911 /// Gets the list of memory dependencies.
4912 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4913 return MemoryDependencies;
4914 }
4915 /// Adds a memory dependency.
4916 void addMemoryDependency(ScheduleData *Dep) {
4917 MemoryDependencies.push_back(Dep);
4918 }
4919 /// Gets the list of control dependencies.
4920 ArrayRef<ScheduleData *> getControlDependencies() const {
4921 return ControlDependencies;
4922 }
4923 /// Adds a control dependency.
4924 void addControlDependency(ScheduleData *Dep) {
4925 ControlDependencies.push_back(Dep);
4926 }
4927 /// Gets/sets the next load/store instruction in the block.
4928 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4929 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4930
4931 void dump(raw_ostream &OS) const { OS << *Inst; }
4932
4933 LLVM_DUMP_METHOD void dump() const {
4934 dump(dbgs());
4935 dbgs() << '\n';
4936 }
4937
4938 private:
4939 Instruction *Inst = nullptr;
4940
4941 /// Single linked list of all memory instructions (e.g. load, store, call)
4942 /// in the block - until the end of the scheduling region.
4943 ScheduleData *NextLoadStore = nullptr;
4944
4945 /// The dependent memory instructions.
4946 /// This list is derived on demand in calculateDependencies().
4947 SmallVector<ScheduleData *> MemoryDependencies;
4948
4949 /// List of instructions which this instruction could be control dependent
4950 /// on. Allowing such nodes to be scheduled below this one could introduce
4951 /// a runtime fault which didn't exist in the original program.
4952 /// ex: this is a load or udiv following a readonly call which inf loops
4953 SmallVector<ScheduleData *> ControlDependencies;
4954
4955 /// This ScheduleData is in the current scheduling region if this matches
4956 /// the current SchedulingRegionID of BlockScheduling.
4957 int SchedulingRegionID = 0;
4958
4959 /// The number of dependencies. Consists of the number of users of the
4960 /// instruction plus the number of dependent memory instructions (if any).
4961 /// This value is calculated on demand.
4962 /// If InvalidDeps, the number of dependencies is not calculated yet.
4963 int Dependencies = InvalidDeps;
4964
4965 /// The number of dependencies minus the number of dependencies of scheduled
4966 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4967 /// for scheduling.
4968 /// Note that this is negative as long as Dependencies is not calculated.
4969 int UnscheduledDeps = InvalidDeps;
4970 };
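  // Illustrative sketch (not part of the original source): the intended life
  // cycle of the ScheduleData dependency counters; the exact call sites are in
  // calculateDependencies()/schedule(), so the ordering below is an assumption.
  // \code
  //   SD->initDependencies();     // Dependencies = 0, i.e. "calculated"
  //   SD->incDependencies();      // once per def-use/memory/control dependency
  //   SD->resetUnscheduledDeps(); // UnscheduledDeps = Dependencies
  //   // Later, as each dependency gets scheduled:
  //   if (SD->incrementUnscheduledDeps(-1) == 0)
  //     ; // SD->isReady() now holds and SD may enter the ready list
  // \endcode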
4971
4972#ifndef NDEBUG
4973 friend inline raw_ostream &operator<<(raw_ostream &OS,
4974 const BoUpSLP::ScheduleData &SD) {
4975 SD.dump(OS);
4976 return OS;
4977 }
4978#endif
4979
4980 class ScheduleBundle final : public ScheduleEntity {
4981 /// The schedule data for the instructions in the bundle.
4982 SmallVector<ScheduleEntity *> Bundle;
4983 /// True if this bundle is valid.
4984 bool IsValid = true;
4985 /// The TreeEntry that this instruction corresponds to.
4986 TreeEntry *TE = nullptr;
4987 ScheduleBundle(bool IsValid)
4988 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4989
4990 public:
4991 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4992 static bool classof(const ScheduleEntity *Entity) {
4993 return Entity->getKind() == Kind::ScheduleBundle;
4994 }
4995
4996 /// Verify basic self consistency properties
4997 void verify() const {
4998 for (const ScheduleEntity *SD : Bundle) {
4999 if (SD->hasValidDependencies()) {
5000 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5001 "invariant");
5002 } else {
5003 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5004 "invariant");
5005 }
5006
5007 if (isScheduled()) {
5008 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5009 "unexpected scheduled state");
5010 }
5011 }
5012 }
5013
5014 /// Returns the number of unscheduled dependencies in the bundle.
5015 int unscheduledDepsInBundle() const {
5016 assert(*this && "bundle must not be empty");
5017 int Sum = 0;
5018 for (const ScheduleEntity *BundleMember : Bundle) {
5019 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5020 return ScheduleData::InvalidDeps;
5021 Sum += BundleMember->getUnscheduledDeps();
5022 }
5023 return Sum;
5024 }
5025
5026 /// Returns true if the dependency information has been calculated.
5027 /// Note that dependency validity can vary between instructions within
5028 /// a single bundle.
5029 bool hasValidDependencies() const {
5030 return all_of(Bundle, [](const ScheduleEntity *SD) {
5031 return SD->hasValidDependencies();
5032 });
5033 }
5034
5035 /// Returns true if it is ready for scheduling, i.e. it has no more
5036 /// unscheduled depending instructions/bundles.
5037 bool isReady() const {
5038 assert(*this && "bundle must not be empty");
5039 return unscheduledDepsInBundle() == 0 && !isScheduled();
5040 }
5041
5042 /// Returns the bundle of scheduling data, associated with the current
5043 /// instruction.
5044 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5045 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5046 /// Adds an instruction to the bundle.
5047 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5048
5049 /// Gets/sets the associated tree entry.
5050 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5051 TreeEntry *getTreeEntry() const { return TE; }
5052
5053 static ScheduleBundle invalid() { return {false}; }
5054
5055 operator bool() const { return IsValid; }
5056
5057#ifndef NDEBUG
5058 void dump(raw_ostream &OS) const {
5059 if (!*this) {
5060 OS << "[]";
5061 return;
5062 }
5063 OS << '[';
5064 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5065 if (isa<ScheduleCopyableData>(SD))
5066 OS << "<Copyable>";
5067 OS << *SD->getInst();
5068 });
5069 OS << ']';
5070 }
5071
5072 LLVM_DUMP_METHOD void dump() const {
5073 dump(dbgs());
5074 dbgs() << '\n';
5075 }
5076#endif // NDEBUG
5077 };
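  // Illustrative sketch (not part of the original source): a bundle is ready
  // only when the sum of unscheduled dependencies over all of its members is
  // zero. "ReadyList" below is a hypothetical ready list.
  // \code
  //   ScheduleBundle &B = buildBundle(VL, S, EI); // members SD0, SD1, ...
  //   // B.unscheduledDepsInBundle() == sum of members' getUnscheduledDeps()
  //   if (B.hasValidDependencies() && B.isReady())
  //     ReadyList.insert(&B);
  // \endcode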
5078
5079#ifndef NDEBUG
5080 friend inline raw_ostream &operator<<(raw_ostream &OS,
5081 const BoUpSLP::ScheduleBundle &Bundle) {
5082 Bundle.dump(OS);
5083 return OS;
5084 }
5085#endif
5086
5087 /// Contains all scheduling relevant data for the copyable instruction.
5088 /// It models the virtual instructions that are supposed to replace the original
5089 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5090 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5091 /// instruction %virt = add %0, 0.
5092 class ScheduleCopyableData final : public ScheduleEntity {
5093 /// The source schedule data for the instruction.
5094 Instruction *Inst = nullptr;
5095 /// The edge information for the instruction.
5096 const EdgeInfo EI;
5097 /// This ScheduleData is in the current scheduling region if this matches
5098 /// the current SchedulingRegionID of BlockScheduling.
5099 int SchedulingRegionID = 0;
5100 /// Bundle, this data is part of.
5101 ScheduleBundle &Bundle;
5102
5103 public:
5104 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5105 const EdgeInfo &EI, ScheduleBundle &Bundle)
5106 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5107 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5108 static bool classof(const ScheduleEntity *Entity) {
5109 return Entity->getKind() == Kind::ScheduleCopyableData;
5110 }
5111
5112 /// Verify basic self consistency properties
5113 void verify() {
5114 if (hasValidDependencies()) {
5115 assert(UnscheduledDeps <= Dependencies && "invariant");
5116 } else {
5117 assert(UnscheduledDeps == Dependencies && "invariant");
5118 }
5119
5120 if (IsScheduled) {
5121 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5122 "unexpected scheduled state");
5123 }
5124 }
5125
5126 /// Returns true if the dependency information has been calculated.
5127 /// Note that dependency validity can vary between instructions within
5128 /// a single bundle.
5129 bool hasValidDependencies() const {
5130 return Dependencies != ScheduleData::InvalidDeps;
5131 }
5132
5133 /// Returns true if it is ready for scheduling, i.e. it has no more
5134 /// unscheduled depending instructions/bundles.
5135 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5136
5137 /// Modifies the number of unscheduled dependencies for this instruction,
5138 /// and returns the number of remaining dependencies for the containing
5139 /// bundle.
5140 int incrementUnscheduledDeps(int Incr) {
5141 assert(hasValidDependencies() &&
5142 "increment of unscheduled deps would be meaningless");
5143 UnscheduledDeps += Incr;
5144 assert(UnscheduledDeps >= 0 && "invariant");
5145 return UnscheduledDeps;
5146 }
5147
5148 /// Sets the number of unscheduled dependencies to the number of
5149 /// dependencies.
5150 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5151
5152 /// Gets the number of unscheduled dependencies.
5153 int getUnscheduledDeps() const { return UnscheduledDeps; }
5154 /// Gets the number of dependencies.
5155 int getDependencies() const { return Dependencies; }
5156 /// Initializes the number of dependencies.
5157 void initDependencies() { Dependencies = 0; }
5158 /// Increments the number of dependencies.
5159 void incDependencies() { Dependencies++; }
5160
5161 /// Gets scheduling region ID.
5162 int getSchedulingRegionID() const { return SchedulingRegionID; }
5163
5164 /// Gets the instruction.
5165 Instruction *getInst() const { return Inst; }
5166
5167 /// Clears all dependency information.
5168 void clearDependencies() {
5169 Dependencies = ScheduleData::InvalidDeps;
5170 UnscheduledDeps = ScheduleData::InvalidDeps;
5171 IsScheduled = false;
5172 }
5173
5174 /// Gets the edge information.
5175 const EdgeInfo &getEdgeInfo() const { return EI; }
5176
5177 /// Gets the bundle.
5178 ScheduleBundle &getBundle() { return Bundle; }
5179 const ScheduleBundle &getBundle() const { return Bundle; }
5180
5181#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5182 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5183
5184 LLVM_DUMP_METHOD void dump() const {
5185 dump(dbgs());
5186 dbgs() << '\n';
5187 }
5188#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5189
5190 private:
5191 /// The number of dependencies; ScheduleData::InvalidDeps if not calculated
5192 /// yet. These nodes always have only a single dependency.
5193 int Dependencies = ScheduleData::InvalidDeps;
5194
5195 /// The number of dependencies minus the number of dependencies of scheduled
5196 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5197 /// for scheduling.
5198 /// Note that this is negative as long as Dependencies is not calculated.
5199 int UnscheduledDeps = ScheduleData::InvalidDeps;
5200 };
5201
5202#ifndef NDEBUG
5203 friend inline raw_ostream &
5204 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5205 SD.dump(OS);
5206 return OS;
5207 }
5208#endif
5209
5210 friend struct GraphTraits<BoUpSLP *>;
5211 friend struct DOTGraphTraits<BoUpSLP *>;
5212
5213 /// Contains all scheduling data for a basic block.
5214 /// It does not schedule instructions that are not memory read/write
5215 /// instructions and whose operands are either constants, or arguments, or
5216 /// phis, or instructions from other blocks, or whose users are phis or from
5217 /// other blocks. The resulting vector instructions can be placed at the
5218 /// beginning of the basic block without scheduling (if the operands do not
5219 /// need to be scheduled) or at the end of the block (if the users are outside
5220 /// of the block). This allows saving some compile time and memory used by the
5221 /// compiler.
5222 /// ScheduleData is assigned to each instruction in between the boundaries of
5223 /// the tree entry, even for those which are not part of the graph. This is
5224 /// required to correctly follow the dependencies between the instructions and
5225 /// to schedule them properly. ScheduleData is not allocated for the
5226 /// instructions which do not require scheduling, like phis, nodes with
5227 /// extractelements/insertelements only, or nodes whose instructions have
5228 /// uses/operands outside of the block.
5229 struct BlockScheduling {
5230 BlockScheduling(BasicBlock *BB)
5231 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5232
5233 void clear() {
5234 ScheduledBundles.clear();
5235 ScheduledBundlesList.clear();
5236 ScheduleCopyableDataMap.clear();
5237 ScheduleCopyableDataMapByInst.clear();
5238 ScheduleCopyableDataMapByInstUser.clear();
5239 ScheduleCopyableDataMapByUsers.clear();
5240 ReadyInsts.clear();
5241 ScheduleStart = nullptr;
5242 ScheduleEnd = nullptr;
5243 FirstLoadStoreInRegion = nullptr;
5244 LastLoadStoreInRegion = nullptr;
5245 RegionHasStackSave = false;
5246
5247 // Reduce the maximum schedule region size by the size of the
5248 // previous scheduling run.
5249 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5250 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5251 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5252 ScheduleRegionSize = 0;
5253
5254 // Make a new scheduling region, i.e. all existing ScheduleData is not
5255 // in the new region yet.
5256 ++SchedulingRegionID;
5257 }
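    // Illustrative sketch (not part of the original source): bumping
    // SchedulingRegionID invalidates all ScheduleData created for previous
    // regions without clearing ScheduleDataMap, because every lookup filters
    // by the current region id:
    // \code
    //   ScheduleData *SD = ScheduleDataMap.lookup(I);
    //   bool Usable = SD && isInSchedulingRegion(*SD); // stale entries fail
    //   // Stale entries are expected to be re-initialized via init() when
    //   // the scheduling region is extended again.
    // \endcode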
5258
5259 ScheduleData *getScheduleData(Instruction *I) {
5260 if (!I)
5261 return nullptr;
5262 if (BB != I->getParent())
5263 // Avoid lookup if can't possibly be in map.
5264 return nullptr;
5265 ScheduleData *SD = ScheduleDataMap.lookup(I);
5266 if (SD && isInSchedulingRegion(*SD))
5267 return SD;
5268 return nullptr;
5269 }
5270
5271 ScheduleData *getScheduleData(Value *V) {
5272 return getScheduleData(dyn_cast<Instruction>(V));
5273 }
5274
5275 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5276 /// operand number) and value.
5277 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5278 const Value *V) const {
5279 if (ScheduleCopyableDataMap.empty())
5280 return nullptr;
5281 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5282 if (It == ScheduleCopyableDataMap.end())
5283 return nullptr;
5284 ScheduleCopyableData *SD = It->getSecond().get();
5285 if (!isInSchedulingRegion(*SD))
5286 return nullptr;
5287 return SD;
5288 }
5289
5290 /// Returns the ScheduleCopyableData for the given user \p User, operand
5291 /// number and operand \p V.
5291 /// number and operand \p V.
5292 SmallVector<ScheduleCopyableData *>
5293 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5294 const Value *V) {
5295 if (ScheduleCopyableDataMapByInstUser.empty())
5296 return {};
5297 const auto It = ScheduleCopyableDataMapByInstUser.find(
5298 std::make_pair(std::make_pair(User, OperandIdx), V));
5299 if (It == ScheduleCopyableDataMapByInstUser.end())
5300 return {};
5301 SmallVector<ScheduleCopyableData *> Res;
5302 for (ScheduleCopyableData *SD : It->getSecond()) {
5303 if (isInSchedulingRegion(*SD))
5304 Res.push_back(SD);
5305 }
5306 return Res;
5307 }
5308
5309 /// Returns true if all operands of the given instruction \p User are
5310 /// replaced by copyable data.
5311 /// \param User The user instruction.
5312 /// \param Op The operand, which might be replaced by the copyable data.
5313 /// \param SLP The SLP tree.
5314 /// \param NumOps The number of operands used. If the instruction uses the
5315 /// same operand several times, check for the first use, then the second,
5316 /// etc.
5317 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5318 Instruction *Op, BoUpSLP &SLP,
5319 unsigned NumOps) const {
5320 assert(NumOps > 0 && "No operands");
5321 if (ScheduleCopyableDataMap.empty())
5322 return false;
5323 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5324 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5325 for (const Use &U : User->operands()) {
5326 if (U.get() != Op)
5327 continue;
5328 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5329 if (Entries.empty())
5330 return false;
5331 // Check all tree entries, if they have operands replaced by copyable
5332 // data.
5333 for (TreeEntry *TE : Entries) {
5334 // Check if the user is commutative.
5335 // The commutatives are handled later, as their operands can be
5336 // reordered.
5337 // The same applies even to non-commutative cmps, because we can
5338 // potentially invert their predicate and, thus, reorder the operands.
5339 bool IsCommutativeUser =
5340 ::isCommutative(User) ||
5341 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5342 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5343 unsigned &OpCnt =
5344 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5345 EdgeInfo EI(TE, U.getOperandNo());
5346 if (!getScheduleCopyableData(EI, Op))
5347 continue;
5348 // Found copyable operand - continue.
5349 ++OpCnt;
5350 continue;
5351 }
5352 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5353 .first->getSecond();
5354 }
5355 }
5356 if (PotentiallyReorderedEntriesCount.empty())
5357 return all_of(OrderedEntriesCount,
5358 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5359 return P.second == NumOps;
5360 });
5361 // Check the commutative/cmp entries.
5362 for (auto &P : PotentiallyReorderedEntriesCount) {
5363 auto *It = find(P.first->Scalars, User);
5364 assert(It != P.first->Scalars.end() && "User is not in the tree entry");
5365 int Lane = std::distance(P.first->Scalars.begin(), It);
5366 assert(Lane >= 0 && "Lane is not found");
5367 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5368 Lane = P.first->ReorderIndices[Lane];
5369 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5370 "Couldn't find extract lane");
5371 SmallVector<unsigned> OpIndices;
5372 for (unsigned OpIdx :
5374 P.first->getMainOp()))) {
5375 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5376 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5377 --P.getSecond();
5378 }
5379 }
5380 return all_of(PotentiallyReorderedEntriesCount,
5381 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5382 return P.second == NumOps - 1;
5383 }) &&
5384 all_of(OrderedEntriesCount,
5385 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5386 return P.second == NumOps;
5387 });
5388 }
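    // Illustrative sketch (not part of the original source): a hypothetical
    // query for "%add = add i32 %x, %x", where both uses of %x are modeled as
    // copyable operands of the corresponding tree entries.
    // \code
    //   Instruction *UserI = Add, *OpI = X; // hypothetical instructions
    //   if (areAllOperandsReplacedByCopyableData(UserI, OpI, *SLP,
    //                                            /*NumOps=*/2))
    //     ; // the direct def-use dependency UserI -> OpI need not be tracked;
    //       // the copyable data nodes are assumed to carry it instead
    // \endcode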
5389
5390 SmallVector<ScheduleCopyableData *>
5391 getScheduleCopyableData(const Instruction *I) const {
5392 if (ScheduleCopyableDataMapByInst.empty())
5393 return {};
5394 const auto It = ScheduleCopyableDataMapByInst.find(I);
5395 if (It == ScheduleCopyableDataMapByInst.end())
5396 return {};
5397 SmallVector<ScheduleCopyableData *> Res;
5398 for (ScheduleCopyableData *SD : It->getSecond()) {
5399 if (isInSchedulingRegion(*SD))
5400 Res.push_back(SD);
5401 }
5402 return Res;
5403 }
5404
5405 SmallVector<ScheduleCopyableData *>
5406 getScheduleCopyableDataUsers(const Instruction *User) const {
5407 if (ScheduleCopyableDataMapByUsers.empty())
5408 return {};
5409 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5410 if (It == ScheduleCopyableDataMapByUsers.end())
5411 return {};
5412 SmallVector<ScheduleCopyableData *> Res;
5413 for (ScheduleCopyableData *SD : It->getSecond()) {
5414 if (isInSchedulingRegion(*SD))
5415 Res.push_back(SD);
5416 }
5417 return Res;
5418 }
5419
5420 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5421 Instruction *I,
5422 int SchedulingRegionID,
5423 ScheduleBundle &Bundle) {
5424 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5425 ScheduleCopyableData *CD =
5426 ScheduleCopyableDataMap
5427 .try_emplace(std::make_pair(EI, I),
5428 std::make_unique<ScheduleCopyableData>(
5429 SchedulingRegionID, I, EI, Bundle))
5430 .first->getSecond()
5431 .get();
5432 ScheduleCopyableDataMapByInst[I].push_back(CD);
5433 if (EI.UserTE) {
5434 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5435 const auto *It = find(Op, I);
5436 assert(It != Op.end() && "Lane not set");
5437 SmallPtrSet<Instruction *, 4> Visited;
5438 do {
5439 int Lane = std::distance(Op.begin(), It);
5440 assert(Lane >= 0 && "Lane not set");
5441 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5442 !EI.UserTE->ReorderIndices.empty())
5443 Lane = EI.UserTE->ReorderIndices[Lane];
5444 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5445 "Couldn't find extract lane");
5446 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5447 if (!Visited.insert(In).second) {
5448 It = find(make_range(std::next(It), Op.end()), I);
5449 continue;
5450 }
5451 ScheduleCopyableDataMapByInstUser
5452 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5453 .first->getSecond()
5454 .push_back(CD);
5455 ScheduleCopyableDataMapByUsers.try_emplace(I)
5456 .first->getSecond()
5457 .insert(CD);
5458 // Remove extra deps for users that become non-immediate users of the
5459 // instruction. This may happen if a chain of the same copyable elements
5460 // appears in the tree.
5461 if (In == I) {
5462 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5463 if (ScheduleCopyableData *UserCD =
5464 getScheduleCopyableData(UserEI, In))
5465 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5466 }
5467 It = find(make_range(std::next(It), Op.end()), I);
5468 } while (It != Op.end());
5469 } else {
5470 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5471 CD);
5472 }
5473 return *CD;
5474 }
5475
5476 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5477 auto *I = dyn_cast<Instruction>(V);
5478 if (!I)
5479 return {};
5480 auto It = ScheduledBundles.find(I);
5481 if (It == ScheduledBundles.end())
5482 return {};
5483 return It->getSecond();
5484 }
5485
5486 /// Returns true if the entity is in the scheduling region.
5487 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5488 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5489 return Data->getSchedulingRegionID() == SchedulingRegionID;
5490 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5491 return CD->getSchedulingRegionID() == SchedulingRegionID;
5492 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5493 [&](const ScheduleEntity *BundleMember) {
5494 return isInSchedulingRegion(*BundleMember);
5495 });
5496 }
5497
5498 /// Marks an instruction as scheduled and puts all dependent ready
5499 /// instructions into the ready-list.
5500 template <typename ReadyListType>
5501 void schedule(const BoUpSLP &R, const InstructionsState &S,
5502 const EdgeInfo &EI, ScheduleEntity *Data,
5503 ReadyListType &ReadyList) {
5504 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5505 ArrayRef<ScheduleBundle *> Bundles) {
5506 // Handle the def-use chain dependencies.
5507
5508 // Decrement the unscheduled counter and insert to ready list if ready.
5509 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5510 if ((IsControl || Data->hasValidDependencies()) &&
5511 Data->incrementUnscheduledDeps(-1) == 0) {
5512 // There are no more unscheduled dependencies after
5513 // decrementing, so we can put the dependent instruction
5514 // into the ready list.
5515 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5516 ArrayRef<ScheduleBundle *> Bundles;
5517 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5518 CopyableBundle.push_back(&CD->getBundle());
5519 Bundles = CopyableBundle;
5520 } else {
5521 Bundles = getScheduleBundles(Data->getInst());
5522 }
5523 if (!Bundles.empty()) {
5524 for (ScheduleBundle *Bundle : Bundles) {
5525 if (Bundle->unscheduledDepsInBundle() == 0) {
5526 assert(!Bundle->isScheduled() &&
5527 "already scheduled bundle gets ready");
5528 ReadyList.insert(Bundle);
5530 << "SLP: gets ready: " << *Bundle << "\n");
5531 }
5532 }
5533 return;
5534 }
5535 assert(!Data->isScheduled() &&
5536 "already scheduled bundle gets ready");
5538 "Expected non-copyable data");
5539 ReadyList.insert(Data);
5540 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5541 }
5542 };
5543
5544 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5545 Instruction *I) {
5546 if (!ScheduleCopyableDataMap.empty()) {
5547 SmallVector<ScheduleCopyableData *> CopyableData =
5548 getScheduleCopyableData(User, OpIdx, I);
5549 for (ScheduleCopyableData *CD : CopyableData)
5550 DecrUnsched(CD, /*IsControl=*/false);
5551 if (!CopyableData.empty())
5552 return;
5553 }
5554 if (ScheduleData *OpSD = getScheduleData(I))
5555 DecrUnsched(OpSD, /*IsControl=*/false);
5556 };
5557
5558 // If BundleMember is a vector bundle, its operands may have been
5559 // reordered during buildTree(). We therefore need to get its operands
5560 // through the TreeEntry.
5561 if (!Bundles.empty()) {
5562 auto *In = BundleMember->getInst();
5563 // Count uses of each instruction operand.
5564 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5565 unsigned TotalOpCount = 0;
5566 if (isa<ScheduleCopyableData>(BundleMember)) {
5567 // Copyable data is used only once (uses itself).
5568 TotalOpCount = OperandsUses[In] = 1;
5569 } else {
5570 for (const Use &U : In->operands()) {
5571 if (auto *I = dyn_cast<Instruction>(U.get())) {
5572 auto Res = OperandsUses.try_emplace(I, 0);
5573 ++Res.first->getSecond();
5574 ++TotalOpCount;
5575 }
5576 }
5577 }
5578 // Decrement the unscheduled counter and insert to ready list if
5579 // ready.
5580 auto DecrUnschedForInst =
5581 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5582 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5583 &Checked) {
5584 if (!ScheduleCopyableDataMap.empty()) {
5585 const EdgeInfo EI = {UserTE, OpIdx};
5586 if (ScheduleCopyableData *CD =
5587 getScheduleCopyableData(EI, I)) {
5588 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5589 return;
5590 DecrUnsched(CD, /*IsControl=*/false);
5591 return;
5592 }
5593 }
5594 auto It = OperandsUses.find(I);
5595 assert(It != OperandsUses.end() && "Operand not found");
5596 if (It->second > 0) {
5597 --It->getSecond();
5598 assert(TotalOpCount > 0 && "No more operands to decrement");
5599 --TotalOpCount;
5600 if (ScheduleData *OpSD = getScheduleData(I)) {
5601 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5602 return;
5603 DecrUnsched(OpSD, /*IsControl=*/false);
5604 }
5605 }
5606 };
5607
5608 for (ScheduleBundle *Bundle : Bundles) {
5609 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5610 break;
5611 // Need to search for the lane since the tree entry can be
5612 // reordered.
5613 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5614 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5615 do {
5616 int Lane =
5617 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5618 assert(Lane >= 0 && "Lane not set");
5619 if (isa<StoreInst>(In) &&
5620 !Bundle->getTreeEntry()->ReorderIndices.empty())
5621 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5622 assert(Lane < static_cast<int>(
5623 Bundle->getTreeEntry()->Scalars.size()) &&
5624 "Couldn't find extract lane");
5625
5626 // Since the vectorization tree is built recursively, this
5627 // assertion ensures that the tree entry has all operands set
5628 // before reaching this code. A couple of exceptions known at the
5629 // moment are extracts, where their second (immediate) operand is
5630 // not added. Since immediates do not affect scheduler behavior,
5631 // this is considered okay.
5632 assert(In &&
5633 (isa<ExtractValueInst, ExtractElementInst>(In) ||
5634 In->getNumOperands() ==
5635 Bundle->getTreeEntry()->getNumOperands() ||
5636 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5637 "Missed TreeEntry operands?");
5638
5639 for (unsigned OpIdx :
5640 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5641 if (auto *I = dyn_cast<Instruction>(
5642 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5643 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5644 << *I << "\n");
5645 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5646 }
5647 // If the parent node is schedulable, it will be handled correctly.
5648 if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
5649 break;
5650 It = std::find(std::next(It),
5651 Bundle->getTreeEntry()->Scalars.end(), In);
5652 } while (It != Bundle->getTreeEntry()->Scalars.end());
5653 }
5654 } else {
5655 // If BundleMember is a stand-alone instruction, no operand reordering
5656 // has taken place, so we directly access its operands.
5657 for (Use &U : BundleMember->getInst()->operands()) {
5658 if (auto *I = dyn_cast<Instruction>(U.get())) {
5660 << "SLP: check for readiness (def): " << *I << "\n");
5661 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5662 }
5663 }
5664 }
5665 // Handle the memory dependencies.
5666 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5667 if (!SD)
5668 return;
5669 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5670 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5671 if (!VisitedMemory.insert(MemoryDep).second)
5672 continue;
5673 // There are no more unscheduled dependencies after decrementing,
5674 // so we can put the dependent instruction into the ready list.
5675 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5676 << *MemoryDep << "\n");
5677 DecrUnsched(MemoryDep);
5678 }
5679 // Handle the control dependencies.
5680 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5681 for (ScheduleData *Dep : SD->getControlDependencies()) {
5682 if (!VisitedControl.insert(Dep).second)
5683 continue;
5684 // There are no more unscheduled dependencies after decrementing,
5685 // so we can put the dependent instruction into the ready list.
5687 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5688 DecrUnsched(Dep, /*IsControl=*/true);
5689 }
5690 };
5691 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5692 SD->setScheduled(/*Scheduled=*/true);
5693 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5694 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5695 SmallVector<ScheduleBundle *> Bundles;
5696 Instruction *In = SD->getInst();
5697 if (R.isVectorized(In)) {
5698 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5699 for (TreeEntry *TE : Entries) {
5701 In->getNumOperands() != TE->getNumOperands())
5702 continue;
5703 auto &BundlePtr =
5704 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5705 BundlePtr->setTreeEntry(TE);
5706 BundlePtr->add(SD);
5707 Bundles.push_back(BundlePtr.get());
5708 }
5709 }
5710 ProcessBundleMember(SD, Bundles);
5711 } else {
5712 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5713 Bundle.setScheduled(/*Scheduled=*/true);
5714 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5715 auto AreAllBundlesScheduled =
5716 [&](const ScheduleEntity *SD,
5717 ArrayRef<ScheduleBundle *> SDBundles) {
5718 if (isa<ScheduleCopyableData>(SD))
5719 return true;
5720 return !SDBundles.empty() &&
5721 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5722 return SDBundle->isScheduled();
5723 });
5724 };
5725 for (ScheduleEntity *SD : Bundle.getBundle()) {
5726 ArrayRef<ScheduleBundle *> SDBundles;
5727 if (!isa<ScheduleCopyableData>(SD))
5728 SDBundles = getScheduleBundles(SD->getInst());
5729 if (AreAllBundlesScheduled(SD, SDBundles)) {
5730 SD->setScheduled(/*Scheduled=*/true);
5731 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5732 : SDBundles);
5733 }
5734 }
5735 }
5736 }
5737
5738 /// Verify basic self consistency properties of the data structure.
5739 void verify() {
5740 if (!ScheduleStart)
5741 return;
5742
5743 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5744 ScheduleStart->comesBefore(ScheduleEnd) &&
5745 "Not a valid scheduling region?");
5746
5747 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5748 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5749 if (!Bundles.empty()) {
5750 for (ScheduleBundle *Bundle : Bundles) {
5751 assert(isInSchedulingRegion(*Bundle) &&
5752 "primary schedule data not in window?");
5753 Bundle->verify();
5754 }
5755 continue;
5756 }
5757 auto *SD = getScheduleData(I);
5758 if (!SD)
5759 continue;
5760 assert(isInSchedulingRegion(*SD) &&
5761 "primary schedule data not in window?");
5762 SD->verify();
5763 }
5764
5765 assert(all_of(ReadyInsts,
5766 [](const ScheduleEntity *Bundle) {
5767 return Bundle->isReady();
5768 }) &&
5769 "item in ready list not ready?");
5770 }
5771
5772 /// Put all instructions into the ReadyList which are ready for scheduling.
5773 template <typename ReadyListType>
5774 void initialFillReadyList(ReadyListType &ReadyList) {
5775 SmallPtrSet<ScheduleBundle *, 16> Visited;
5776 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5777 ScheduleData *SD = getScheduleData(I);
5778 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5779 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5780 !Bundles.empty()) {
5781 for (ScheduleBundle *Bundle : Bundles) {
5782 if (!Visited.insert(Bundle).second)
5783 continue;
5784 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5785 ReadyList.insert(Bundle);
5786 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5787 << *Bundle << "\n");
5788 }
5789 }
5790 continue;
5791 }
5792 ReadyList.insert(SD);
5794 << "SLP: initially in ready list: " << *SD << "\n");
5795 }
5796 }
5797 }
5798
5799 /// Build a bundle from the ScheduleData nodes corresponding to the
5800 /// scalar instruction for each lane.
5801 /// \param VL The list of scalar instructions.
5802 /// \param S The state of the instructions.
5803 /// \param EI The edge in the SLP graph or the user node/operand number.
5804 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5805 const InstructionsState &S, const EdgeInfo &EI);
5806
5807 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5808 /// cyclic dependencies. This is only a dry-run, no instructions are
5809 /// actually moved at this stage.
5810 /// \returns the scheduling bundle. The returned Optional value is not
5811 /// std::nullopt if \p VL is allowed to be scheduled.
5812 std::optional<ScheduleBundle *>
5813 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5814 const InstructionsState &S, const EdgeInfo &EI);
5815
5816 /// Allocates schedule data chunk.
5817 ScheduleData *allocateScheduleDataChunks();
5818
5819 /// Extends the scheduling region so that V is inside the region.
5820 /// \returns true if the region size is within the limit.
5821 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5822
5823 /// Initialize the ScheduleData structures for new instructions in the
5824 /// scheduling region.
5825 void initScheduleData(Instruction *FromI, Instruction *ToI,
5826 ScheduleData *PrevLoadStore,
5827 ScheduleData *NextLoadStore);
5828
5829 /// Updates the dependency information of a bundle and of all instructions/
5830 /// bundles which depend on the original bundle.
5831 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5832 BoUpSLP *SLP,
5833 ArrayRef<ScheduleData *> ControlDeps = {});
5834
5835 /// Sets all instructions in the scheduling region to un-scheduled.
5836 void resetSchedule();
5837
5838 BasicBlock *BB;
5839
5840 /// Simple memory allocation for ScheduleData.
5841 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5842
5843 /// The size of a ScheduleData array in ScheduleDataChunks.
5844 int ChunkSize;
5845
5846 /// The allocator position in the current chunk, which is the last entry
5847 /// of ScheduleDataChunks.
5848 int ChunkPos;
5849
5850 /// Attaches ScheduleData to Instruction.
5851 /// Note that the mapping survives during all vectorization iterations, i.e.
5852 /// ScheduleData structures are recycled.
5853 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5854
5855 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5856 /// number) and the operand instruction, represented as copyable element.
5857 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5858 std::unique_ptr<ScheduleCopyableData>>
5859 ScheduleCopyableDataMap;
5860
5861 /// Represents mapping between instruction and all related
5862 /// ScheduleCopyableData (for all uses in the tree, represented as a copyable
5863 /// element). The SLP tree may contain several representations of the same
5864 /// instruction.
5865 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5866 ScheduleCopyableDataMapByInst;
5867
5868 /// Represents mapping between user value and operand number, the operand
5869 /// value and all related ScheduleCopyableData. The relation is 1:n, because
5870 /// the same user may reference the same operand in different tree entries
5871 /// and the operand may be modeled by a different copyable data element.
5872 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5873 SmallVector<ScheduleCopyableData *>>
5874 ScheduleCopyableDataMapByInstUser;
5875
5876 /// Represents mapping between instruction and all related
5877 /// ScheduleCopyableData. It represents the mapping between the actual
5878 /// instruction and the last copyable data element in the chain. E.g., if
5879 /// the graph models the following instructions:
5880 /// %0 = non-add instruction ...
5881 /// ...
5882 /// %4 = add %3, 1
5883 /// %5 = add %4, 1
5884 /// %6 = insertelement poison, %0, 0
5885 /// %7 = insertelement %6, %5, 1
5886 /// And the graph is modeled as:
5887 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5888 /// -> [1, 0] -> [%1, 0]
5889 ///
5890 /// this map will map %0 only to the copyable element <1>, which is the last
5891 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5892 /// keep the map to <0>, not the %0.
5893 SmallDenseMap<const Instruction *,
5894 SmallSetVector<ScheduleCopyableData *, 4>>
5895 ScheduleCopyableDataMapByUsers;
5896
5897 /// Attaches ScheduleBundle to Instruction.
5898 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5899 ScheduledBundles;
5900 /// The list of ScheduleBundles.
5901 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5902
5903 /// The ready-list for scheduling (only used for the dry-run).
5904 SetVector<ScheduleEntity *> ReadyInsts;
5905
5906 /// The first instruction of the scheduling region.
5907 Instruction *ScheduleStart = nullptr;
5908
5909 /// The first instruction _after_ the scheduling region.
5910 Instruction *ScheduleEnd = nullptr;
5911
5912 /// The first memory accessing instruction in the scheduling region
5913 /// (can be null).
5914 ScheduleData *FirstLoadStoreInRegion = nullptr;
5915
5916 /// The last memory accessing instruction in the scheduling region
5917 /// (can be null).
5918 ScheduleData *LastLoadStoreInRegion = nullptr;
5919
5920 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5921 /// region? Used to optimize the dependence calculation for the
5922 /// common case where there isn't.
5923 bool RegionHasStackSave = false;
5924
5925 /// The current size of the scheduling region.
5926 int ScheduleRegionSize = 0;
5927
5928 /// The maximum size allowed for the scheduling region.
5929 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5930
5931 /// The ID of the scheduling region. For a new vectorization iteration this
5932 /// is incremented which "removes" all ScheduleData from the region.
5933 /// Make sure that the initial SchedulingRegionID is greater than the
5934 /// initial SchedulingRegionID in ScheduleData (which is 0).
5935 int SchedulingRegionID = 1;
5936 };
5937
5938 /// Attaches the BlockScheduling structures to basic blocks.
5939 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
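  // Illustrative sketch (not part of the original source): the per-block
  // scheduler is created lazily and keyed by its basic block.
  // \code
  //   auto &BSRef = BlocksSchedules[BB];
  //   if (!BSRef)
  //     BSRef = std::make_unique<BlockScheduling>(BB);
  //   BlockScheduling &BS = *BSRef;
  // \endcode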
5940
5941 /// Performs the "real" scheduling. Done before vectorization is actually
5942 /// performed in a basic block.
5943 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5944
5945 /// List of users to ignore during scheduling and that don't need extracting.
5946 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5947
5948 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5949 /// sorted SmallVectors of unsigned.
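/// The empty and tombstone keys below are single-element orders holding ~1U
/// and ~2U; these are assumed safe sentinels because a real order only stores
/// indices bounded by the vector factor and can never collide with them.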
5950 struct OrdersTypeDenseMapInfo {
5951 static OrdersType getEmptyKey() {
5952 OrdersType V;
5953 V.push_back(~1U);
5954 return V;
5955 }
5956
5957 static OrdersType getTombstoneKey() {
5958 OrdersType V;
5959 V.push_back(~2U);
5960 return V;
5961 }
5962
5963 static unsigned getHashValue(const OrdersType &V) {
5964 return static_cast<unsigned>(hash_combine_range(V));
5965 }
5966
5967 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5968 return LHS == RHS;
5969 }
5970 };
5971
5972 // Analysis and block reference.
5973 Function *F;
5974 ScalarEvolution *SE;
5975 TargetTransformInfo *TTI;
5976 TargetLibraryInfo *TLI;
5977 LoopInfo *LI;
5978 DominatorTree *DT;
5979 AssumptionCache *AC;
5980 DemandedBits *DB;
5981 const DataLayout *DL;
5982 OptimizationRemarkEmitter *ORE;
5983
5984 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5985 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5986
5987 /// Instruction builder to construct the vectorized tree.
5988 IRBuilder<TargetFolder> Builder;
5989
5990 /// A map of scalar integer values to the smallest bit width with which they
5991 /// can legally be represented. The values map to (width, signed) pairs,
5992 /// where "width" indicates the minimum bit width and "signed" is True if the
5993 /// value must be signed-extended, rather than zero-extended, back to its
5994 /// original width.
5995 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5996
5997 /// Final size of the reduced vector, if the current graph represents the
5998 /// input for the reduction and it was possible to narrow the size of the
5999 /// reduction.
6000 unsigned ReductionBitWidth = 0;
6001
6002 /// Canonical graph size before the transformations.
6003 unsigned BaseGraphSize = 1;
6004
6005 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6006 /// type sizes, used in the tree.
6007 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6008
6009 /// Indices of the vectorized nodes, which are supposed to be the roots of the
6010 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
6011 DenseSet<unsigned> ExtraBitWidthNodes;
6012};
6013
6014} // end namespace slpvectorizer
6015
6016 template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
6017 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6018 using SecondInfo = DenseMapInfo<unsigned>;
6019 static BoUpSLP::EdgeInfo getEmptyKey() {
6020 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6021 SecondInfo::getEmptyKey());
6022 }
6023
6024 static BoUpSLP::EdgeInfo getTombstoneKey() {
6025 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6026 SecondInfo::getTombstoneKey());
6027 }
6028
6029 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6030 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6031 SecondInfo::getHashValue(Val.EdgeIdx));
6032 }
6033
6034 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6035 const BoUpSLP::EdgeInfo &RHS) {
6036 return LHS == RHS;
6037 }
6038};
6039
6040template <> struct GraphTraits<BoUpSLP *> {
6041 using TreeEntry = BoUpSLP::TreeEntry;
6042
6043 /// NodeRef has to be a pointer per the GraphWriter.
6044 using NodeRef = TreeEntry *;
6045
6046 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6047
6048 /// Add the VectorizableTree to the index iterator to be able to return
6049 /// TreeEntry pointers.
6050 struct ChildIteratorType
6051 : public iterator_adaptor_base<
6052 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6053 ContainerTy &VectorizableTree;
6054
6055 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
6056 ContainerTy &VT)
6057 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
6058
6059 NodeRef operator*() { return I->UserTE; }
6060 };
6061
6062 static NodeRef getEntryNode(BoUpSLP &R) {
6063 return R.VectorizableTree[0].get();
6064 }
6065
6066 static ChildIteratorType child_begin(NodeRef N) {
6067 return {&N->UserTreeIndex, N->Container};
6068 }
6069
6070 static ChildIteratorType child_end(NodeRef N) {
6071 return {&N->UserTreeIndex + 1, N->Container};
6072 }
6073
6074 /// For the node iterator we just need to turn the TreeEntry iterator into a
6075 /// TreeEntry* iterator so that it dereferences to NodeRef.
6076 class nodes_iterator {
6077 using ItTy = ContainerTy::iterator;
6078 ItTy It;
6079
6080 public:
6081 nodes_iterator(const ItTy &It2) : It(It2) {}
6082 NodeRef operator*() { return It->get(); }
6083 nodes_iterator operator++() {
6084 ++It;
6085 return *this;
6086 }
6087 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6088 };
6089
6090 static nodes_iterator nodes_begin(BoUpSLP *R) {
6091 return nodes_iterator(R->VectorizableTree.begin());
6092 }
6093
6094 static nodes_iterator nodes_end(BoUpSLP *R) {
6095 return nodes_iterator(R->VectorizableTree.end());
6096 }
6097
6098 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6099};
6100
6101template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6102 using TreeEntry = BoUpSLP::TreeEntry;
6103
6104 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6105
6106 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6107 std::string Str;
6108 raw_string_ostream OS(Str);
6109 OS << Entry->Idx << ".\n";
6110 if (isSplat(Entry->Scalars))
6111 OS << "<splat> ";
6112 for (auto *V : Entry->Scalars) {
6113 OS << *V;
6114 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6115 return EU.Scalar == V;
6116 }))
6117 OS << " <extract>";
6118 OS << "\n";
6119 }
6120 return Str;
6121 }
6122
6123 static std::string getNodeAttributes(const TreeEntry *Entry,
6124 const BoUpSLP *) {
6125 if (Entry->isGather())
6126 return "color=red";
6127 if (Entry->State == TreeEntry::ScatterVectorize ||
6128 Entry->State == TreeEntry::StridedVectorize ||
6129 Entry->State == TreeEntry::CompressVectorize)
6130 return "color=blue";
6131 return "";
6132 }
6133};
6134
6135} // end namespace llvm
6136
6137 BoUpSLP::~BoUpSLP() {
6138 SmallVector<WeakTrackingVH> DeadInsts;
6139 for (auto *I : DeletedInstructions) {
6140 if (!I->getParent()) {
6141 // Temporarily insert instructions back into the parent so that they can be
6142 // erased from the parent and from memory later.
6143 if (isa<PHINode>(I))
6144 // Phi nodes must be the very first instructions in the block.
6145 I->insertBefore(F->getEntryBlock(),
6146 F->getEntryBlock().getFirstNonPHIIt());
6147 else
6148 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6149 continue;
6150 }
6151 for (Use &U : I->operands()) {
6152 auto *Op = dyn_cast<Instruction>(U.get());
6153 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6154 wouldInstructionBeTriviallyDead(Op, TLI))
6155 DeadInsts.emplace_back(Op);
6156 }
6157 I->dropAllReferences();
6158 }
6159 for (auto *I : DeletedInstructions) {
6160 assert(I->use_empty() &&
6161 "trying to erase instruction with users.");
6162 I->eraseFromParent();
6163 }
6164
6165 // Cleanup any dead scalar code feeding the vectorized instructions
6166 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6167
6168#ifdef EXPENSIVE_CHECKS
6169 // If we could guarantee that this call is not extremely slow, we could
6170 // remove the ifdef limitation (see PR47712).
6171 assert(!verifyFunction(*F, &dbgs()));
6172#endif
6173}
6174
6175 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6176 /// contains the original mask for the scalars reused in the node. The
6177 /// procedure transforms this mask in accordance with the given \p Mask.
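/// E.g. (illustrative), for Reuses = {1, 0, 3, 2} and Mask = {2, 3, 0, 1} the
/// original value at position I is moved to position Mask[I], yielding
/// Reuses = {3, 2, 1, 0}.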
6178 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6179 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6180 "Expected non-empty mask.");
6181 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6182 Prev.swap(Reuses);
6183 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6184 if (Mask[I] != PoisonMaskElem)
6185 Reuses[Mask[I]] = Prev[I];
6186}
6187
6188 /// Reorders the given \p Order according to the given \p Mask. \p Order is
6189 /// the original order of the scalars. The procedure transforms the provided
6190 /// order in accordance with the given \p Mask. If the resulting \p Order is
6191 /// just an identity order, \p Order is cleared.
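/// E.g. (illustrative), an empty \p Order (treated as identity) combined with
/// Mask = {1, 0, 3, 2} produces Order = {1, 0, 3, 2}.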
6192 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6193 bool BottomOrder = false) {
6194 assert(!Mask.empty() && "Expected non-empty mask.");
6195 unsigned Sz = Mask.size();
6196 if (BottomOrder) {
6197 SmallVector<unsigned> PrevOrder;
6198 if (Order.empty()) {
6199 PrevOrder.resize(Sz);
6200 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6201 } else {
6202 PrevOrder.swap(Order);
6203 }
6204 Order.assign(Sz, Sz);
6205 for (unsigned I = 0; I < Sz; ++I)
6206 if (Mask[I] != PoisonMaskElem)
6207 Order[I] = PrevOrder[Mask[I]];
6208 if (all_of(enumerate(Order), [&](const auto &Data) {
6209 return Data.value() == Sz || Data.index() == Data.value();
6210 })) {
6211 Order.clear();
6212 return;
6213 }
6214 fixupOrderingIndices(Order);
6215 return;
6216 }
6217 SmallVector<int> MaskOrder;
6218 if (Order.empty()) {
6219 MaskOrder.resize(Sz);
6220 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6221 } else {
6222 inversePermutation(Order, MaskOrder);
6223 }
6224 reorderReuses(MaskOrder, Mask);
6225 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6226 Order.clear();
6227 return;
6228 }
6229 Order.assign(Sz, Sz);
6230 for (unsigned I = 0; I < Sz; ++I)
6231 if (MaskOrder[I] != PoisonMaskElem)
6232 Order[MaskOrder[I]] = I;
6233 fixupOrderingIndices(Order);
6234}
6235
6236std::optional<BoUpSLP::OrdersType>
6237BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6238 bool TopToBottom, bool IgnoreReorder) {
6239 assert(TE.isGather() && "Expected gather node only.");
6240 // Try to find subvector extract/insert patterns and reorder only such
6241 // patterns.
6242 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6243 Type *ScalarTy = GatheredScalars.front()->getType();
6244 size_t NumScalars = GatheredScalars.size();
6245 if (!isValidElementType(ScalarTy))
6246 return std::nullopt;
6247 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6248 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6249 SmallVector<int> ExtractMask;
6250 SmallVector<int> Mask;
6251 SmallVector<SmallVector<const TreeEntry *>> Entries;
6252 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6253 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6254 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6255 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6256 /*ForOrder=*/true);
6257 // No shuffled operands - ignore.
6258 if (GatherShuffles.empty() && ExtractShuffles.empty())
6259 return std::nullopt;
6260 OrdersType CurrentOrder(NumScalars, NumScalars);
6261 if (GatherShuffles.size() == 1 &&
6262 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6263 Entries.front().front()->isSame(TE.Scalars)) {
6264 // If the fully matched node is considered during whole-tree rotation, there
6265 // is no need to account for the matching order - the whole tree is rotated.
6266 if (TopToBottom)
6267 return std::nullopt;
6268 // No need to keep the order for the same user node.
6269 if (Entries.front().front()->UserTreeIndex.UserTE ==
6270 TE.UserTreeIndex.UserTE)
6271 return std::nullopt;
6272 // No need to keep the order for the matched root node, if it can be freely
6273 // reordered.
6274 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6275 return std::nullopt;
6276 // If only 2 elements are shuffled and the matching node has reversed reuses,
6277 // there is no need to compute the order - both orders work equally well.
6278 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6279 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6280 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6281 [](const auto &P) {
6282 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6283 }))
6284 return std::nullopt;
6285
6286 // Perfect match in the graph, will reuse the previously vectorized
6287 // node. Cost is 0.
6288 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6289 return CurrentOrder;
6290 }
6291 auto IsSplatMask = [](ArrayRef<int> Mask) {
6292 int SingleElt = PoisonMaskElem;
6293 return all_of(Mask, [&](int I) {
6294 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6295 SingleElt = I;
6296 return I == PoisonMaskElem || I == SingleElt;
6297 });
6298 };
6299 // Exclusive broadcast mask - ignore.
6300 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6301 (Entries.size() != 1 ||
6302 Entries.front().front()->ReorderIndices.empty())) ||
6303 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6304 return std::nullopt;
6305 SmallBitVector ShuffledSubMasks(NumParts);
6306 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6307 ArrayRef<int> Mask, int PartSz, int NumParts,
6308 function_ref<unsigned(unsigned)> GetVF) {
6309 for (int I : seq<int>(0, NumParts)) {
6310 if (ShuffledSubMasks.test(I))
6311 continue;
6312 const int VF = GetVF(I);
6313 if (VF == 0)
6314 continue;
6315 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6316 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6317 // Shuffle of at least 2 vectors - ignore.
6318 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6319 llvm::fill(Slice, NumScalars);
6320 ShuffledSubMasks.set(I);
6321 continue;
6322 }
6323 // Try to include as many elements from the mask as possible.
6324 int FirstMin = INT_MAX;
6325 bool SecondVecFound = false;
6326 for (int K : seq<int>(Limit)) {
6327 int Idx = Mask[I * PartSz + K];
6328 if (Idx == PoisonMaskElem) {
6329 Value *V = GatheredScalars[I * PartSz + K];
6330 if (isConstant(V) && !isa<PoisonValue>(V)) {
6331 SecondVecFound = true;
6332 break;
6333 }
6334 continue;
6335 }
6336 if (Idx < VF) {
6337 if (FirstMin > Idx)
6338 FirstMin = Idx;
6339 } else {
6340 SecondVecFound = true;
6341 break;
6342 }
6343 }
6344 FirstMin = (FirstMin / PartSz) * PartSz;
6345 // Shuffle of at least 2 vectors - ignore.
6346 if (SecondVecFound) {
6347 llvm::fill(Slice, NumScalars);
6348 ShuffledSubMasks.set(I);
6349 continue;
6350 }
6351 for (int K : seq<int>(Limit)) {
6352 int Idx = Mask[I * PartSz + K];
6353 if (Idx == PoisonMaskElem)
6354 continue;
6355 Idx -= FirstMin;
6356 if (Idx >= PartSz) {
6357 SecondVecFound = true;
6358 break;
6359 }
6360 if (CurrentOrder[I * PartSz + Idx] >
6361 static_cast<unsigned>(I * PartSz + K) &&
6362 CurrentOrder[I * PartSz + Idx] !=
6363 static_cast<unsigned>(I * PartSz + Idx))
6364 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6365 }
6366 // Shuffle of at least 2 vectors - ignore.
6367 if (SecondVecFound) {
6368 llvm::fill(Slice, NumScalars);
6369 ShuffledSubMasks.set(I);
6370 continue;
6371 }
6372 }
6373 };
6374 int PartSz = getPartNumElems(NumScalars, NumParts);
6375 if (!ExtractShuffles.empty())
6376 TransformMaskToOrder(
6377 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6378 if (!ExtractShuffles[I])
6379 return 0U;
6380 unsigned VF = 0;
6381 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6382 for (unsigned Idx : seq<unsigned>(Sz)) {
6383 int K = I * PartSz + Idx;
6384 if (ExtractMask[K] == PoisonMaskElem)
6385 continue;
6386 if (!TE.ReuseShuffleIndices.empty())
6387 K = TE.ReuseShuffleIndices[K];
6388 if (K == PoisonMaskElem)
6389 continue;
6390 if (!TE.ReorderIndices.empty())
6391 K = std::distance(TE.ReorderIndices.begin(),
6392 find(TE.ReorderIndices, K));
6393 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6394 if (!EI)
6395 continue;
6396 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6397 ->getElementCount()
6398 .getKnownMinValue());
6399 }
6400 return VF;
6401 });
6402 // Check special corner case - single shuffle of the same entry.
6403 if (GatherShuffles.size() == 1 && NumParts != 1) {
6404 if (ShuffledSubMasks.any())
6405 return std::nullopt;
6406 PartSz = NumScalars;
6407 NumParts = 1;
6408 }
6409 if (!Entries.empty())
6410 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6411 if (!GatherShuffles[I])
6412 return 0U;
6413 return std::max(Entries[I].front()->getVectorFactor(),
6414 Entries[I].back()->getVectorFactor());
6415 });
6416 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6417 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6418 return std::nullopt;
6419 return std::move(CurrentOrder);
6420}
6421
6422static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6423 const TargetLibraryInfo &TLI,
6424 bool CompareOpcodes = true) {
6425 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6426 getUnderlyingObject(Ptr2, RecursionMaxDepth))
6427 return false;
6428 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6429 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6430 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6431 (!GEP2 || GEP2->getNumOperands() == 2) &&
6432 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6433 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6434 !CompareOpcodes ||
6435 (GEP1 && GEP2 &&
6436 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6437}
6438
6439/// Calculates minimal alignment as a common alignment.
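/// E.g. (illustrative), for loads with alignments {8, 4, 16} the common
/// alignment is 4.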
6440template <typename T>
6441 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6442 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6443 for (Value *V : VL)
6444 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6445 return CommonAlignment;
6446}
6447
6448/// Check if \p Order represents reverse order.
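/// E.g. (illustrative), Order = {3, 2, 1, 0} is a reverse order for 4 scalars;
/// entries equal to the order size are treated as unspecified and accepted.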
6449 static bool isReverseOrder(ArrayRef<unsigned> Order) {
6450 assert(!Order.empty() &&
6451 "Order is empty. Please check it before using isReverseOrder.");
6452 unsigned Sz = Order.size();
6453 return all_of(enumerate(Order), [&](const auto &Pair) {
6454 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6455 });
6456}
6457
6458 /// Checks if the provided list of pointers \p PointerOps represents strided
6459 /// pointers for type \p ElemTy. If they are not, nullptr is returned.
6460 /// Otherwise, the SCEV of the stride value is returned.
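/// E.g. (illustrative), four pointers that are provably %p, %p + %s,
/// %p + 2 * %s and %p + 3 * %s apart for a runtime value %s yield the SCEV of
/// the stride in elements; \p SortedIndices is filled only if the pointers are
/// not already in consecutive order.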
6461static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6462 const DataLayout &DL, ScalarEvolution &SE,
6463 SmallVectorImpl<unsigned> &SortedIndices) {
6464 SmallVector<const SCEV *> SCEVs;
6465 const SCEV *PtrSCEVLowest = nullptr;
6466 const SCEV *PtrSCEVHighest = nullptr;
6467 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6468 // addresses).
6469 for (Value *Ptr : PointerOps) {
6470 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6471 if (!PtrSCEV)
6472 return nullptr;
6473 SCEVs.push_back(PtrSCEV);
6474 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6475 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6476 continue;
6477 }
6478 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6479 if (isa<SCEVCouldNotCompute>(Diff))
6480 return nullptr;
6481 if (Diff->isNonConstantNegative()) {
6482 PtrSCEVLowest = PtrSCEV;
6483 continue;
6484 }
6485 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6486 if (isa<SCEVCouldNotCompute>(Diff1))
6487 return nullptr;
6488 if (Diff1->isNonConstantNegative()) {
6489 PtrSCEVHighest = PtrSCEV;
6490 continue;
6491 }
6492 }
6493 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6494 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6495 if (isa<SCEVCouldNotCompute>(Dist))
6496 return nullptr;
6497 int Size = DL.getTypeStoreSize(ElemTy);
6498 auto TryGetStride = [&](const SCEV *Dist,
6499 const SCEV *Multiplier) -> const SCEV * {
6500 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6501 if (M->getOperand(0) == Multiplier)
6502 return M->getOperand(1);
6503 if (M->getOperand(1) == Multiplier)
6504 return M->getOperand(0);
6505 return nullptr;
6506 }
6507 if (Multiplier == Dist)
6508 return SE.getConstant(Dist->getType(), 1);
6509 return SE.getUDivExactExpr(Dist, Multiplier);
6510 };
6511 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6512 const SCEV *Stride = nullptr;
6513 if (Size != 1 || SCEVs.size() > 2) {
6514 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6515 Stride = TryGetStride(Dist, Sz);
6516 if (!Stride)
6517 return nullptr;
6518 }
6519 if (!Stride || isa<SCEVConstant>(Stride))
6520 return nullptr;
6521 // Iterate through all pointers and check if all distances are
6522 // unique multiples of Stride.
6523 using DistOrdPair = std::pair<int64_t, int>;
6524 auto Compare = llvm::less_first();
6525 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6526 int Cnt = 0;
6527 bool IsConsecutive = true;
6528 for (const SCEV *PtrSCEV : SCEVs) {
6529 unsigned Dist = 0;
6530 if (PtrSCEV != PtrSCEVLowest) {
6531 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6532 const SCEV *Coeff = TryGetStride(Diff, Stride);
6533 if (!Coeff)
6534 return nullptr;
6535 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6536 if (!SC || isa<SCEVCouldNotCompute>(SC))
6537 return nullptr;
6538 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6539 SE.getMulExpr(Stride, SC)))
6540 ->isZero())
6541 return nullptr;
6542 Dist = SC->getAPInt().getZExtValue();
6543 }
6544 // If the strides are not the same or repeated, we can't vectorize.
6545 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6546 return nullptr;
6547 auto Res = Offsets.emplace(Dist, Cnt);
6548 if (!Res.second)
6549 return nullptr;
6550 // Consecutive order if the inserted element is the last one.
6551 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6552 ++Cnt;
6553 }
6554 if (Offsets.size() != SCEVs.size())
6555 return nullptr;
6556 SortedIndices.clear();
6557 if (!IsConsecutive) {
6558 // Fill SortedIndices array only if it is non-consecutive.
6559 SortedIndices.resize(PointerOps.size());
6560 Cnt = 0;
6561 for (const std::pair<int64_t, int> &Pair : Offsets) {
6562 SortedIndices[Cnt] = Pair.second;
6563 ++Cnt;
6564 }
6565 }
6566 return Stride;
6567}
6568
6569static std::pair<InstructionCost, InstructionCost>
6570getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6571 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6572 Type *ScalarTy, VectorType *VecTy);
6573
6574/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6575 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
6576 /// insert subvector pattern.
6577static InstructionCost
6578 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6579 VectorType *Tp, ArrayRef<int> Mask = {},
6580 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6581 int Index = 0, VectorType *SubTp = nullptr,
6582 ArrayRef<const Value *> Args = {}) {
6583 VectorType *DstTy = Tp;
6584 if (!Mask.empty())
6585 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6586
6587 if (Kind != TTI::SK_PermuteTwoSrc)
6588 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6589 Args);
6590 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6591 int NumSubElts;
6593 Mask, NumSrcElts, NumSubElts, Index)) {
6594 if (Index + NumSubElts > NumSrcElts &&
6595 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6596 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6597 TTI::TCK_RecipThroughput, Index, Tp);
6598 }
6599 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6600 Args);
6601}
6602
6603/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6604/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6605/// instead of a scalar.
6606static InstructionCost
6607 getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6608 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6609 bool Extract, TTI::TargetCostKind CostKind,
6610 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6612 "ScalableVectorType is not supported.");
6613 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6614 getNumElements(Ty) &&
6615 "Incorrect usage.");
6616 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6617 assert(SLPReVec && "Only supported by REVEC.");
6618 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6619 // of CreateInsertElement.
6620 unsigned ScalarTyNumElements = VecTy->getNumElements();
6621 InstructionCost Cost = 0;
6622 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6623 if (!DemandedElts[I])
6624 continue;
6625 if (Insert)
6627 I * ScalarTyNumElements, VecTy);
6628 if (Extract)
6630 I * ScalarTyNumElements, VecTy);
6631 }
6632 return Cost;
6633 }
6634 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6635 CostKind, ForPoisonSrc, VL);
6636}
6637
6638/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6639/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6641 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6642 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6643 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6644 if (Opcode == Instruction::ExtractElement) {
6645 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6646 assert(SLPReVec && "Only supported by REVEC.");
6647 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6649 cast<VectorType>(Val), {}, CostKind,
6650 Index * VecTy->getNumElements(), VecTy);
6651 }
6652 }
6653 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6654 ScalarUserAndIdx);
6655}
6656
6657/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6658/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6660 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6661 VectorType *VecTy, unsigned Index,
6663 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6664 assert(SLPReVec && "Only supported by REVEC.");
6665 auto *SubTp =
6666 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6668 Index * ScalarTy->getNumElements(), SubTp) +
6669 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6670 CostKind);
6671 }
6672 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6673}
6674
6675 /// Creates a subvector insert. Generates the shuffle using \p Generator or
6676 /// a default shuffle otherwise.
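/// E.g. (illustrative), inserting a 2-element \p V into an 8-element \p Vec at
/// \p Index 4 uses the mask {0, 1, 2, 3, 8, 9, 6, 7}, where elements 8 and 9
/// refer to the resized \p V.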
6677 static Value *createInsertVector(
6678 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6679 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6680 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6681 return Vec;
6682 const unsigned SubVecVF = getNumElements(V->getType());
6683 // Create a shuffle; insertvector requires that the index is a multiple of
6684 // the subvector length.
6685 const unsigned VecVF = getNumElements(Vec->getType());
6686 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6687 if (isa<PoisonValue>(Vec)) {
6688 auto *Begin = std::next(Mask.begin(), Index);
6689 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6690 Vec = Builder.CreateShuffleVector(V, Mask);
6691 return Vec;
6692 }
6693 std::iota(Mask.begin(), Mask.end(), 0);
6694 std::iota(std::next(Mask.begin(), Index),
6695 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6696 if (Generator)
6697 return Generator(Vec, V, Mask);
6698 // 1. Resize V to the size of Vec.
6699 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6700 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6701 V = Builder.CreateShuffleVector(V, ResizeMask);
6702 // 2. Insert V into Vec.
6703 return Builder.CreateShuffleVector(Vec, V, Mask);
6704}
6705
6706/// Generates subvector extract using \p Generator or using default shuffle.
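/// E.g. (illustrative), extracting 4 elements starting at \p Index 2 uses the
/// mask {2, 3, 4, 5}.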
6707 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6708 unsigned SubVecVF, unsigned Index) {
6709 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6710 std::iota(Mask.begin(), Mask.end(), Index);
6711 return Builder.CreateShuffleVector(Vec, Mask);
6712}
6713
6714/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6715/// with \p Order.
6716/// \return true if the mask represents strided access, false - otherwise.
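/// E.g. (illustrative), pointers at element offsets {0, 2, 4, 6} produce
/// CompressMask = {0, 2, 4, 6} and return true (stride 2); offsets
/// {0, 1, 3, 6} produce CompressMask = {0, 1, 3, 6} and return false.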
6717 static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6718 ArrayRef<unsigned> Order, Type *ScalarTy,
6719 const DataLayout &DL, ScalarEvolution &SE,
6720 SmallVectorImpl<int> &CompressMask) {
6721 const unsigned Sz = PointerOps.size();
6722 CompressMask.assign(Sz, PoisonMaskElem);
6723 // The first element is always set.
6724 CompressMask[0] = 0;
6725 // Check if the mask represents strided access.
6726 std::optional<unsigned> Stride = 0;
6727 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6728 for (unsigned I : seq<unsigned>(1, Sz)) {
6729 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6730 std::optional<int64_t> OptPos =
6731 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6732 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6733 return false;
6734 unsigned Pos = static_cast<unsigned>(*OptPos);
6735 CompressMask[I] = Pos;
6736 if (!Stride)
6737 continue;
6738 if (*Stride == 0) {
6739 *Stride = Pos;
6740 continue;
6741 }
6742 if (Pos != *Stride * I)
6743 Stride.reset();
6744 }
6745 return Stride.has_value();
6746}
6747
6748 /// Checks if \p VL can be transformed into a (masked) load + compress or a
6749 /// (masked) interleaved load.
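/// E.g. (illustrative), loads at element offsets {0, 1, 3, 4} from a common
/// base can be emitted as one (possibly masked) load of 5 consecutive elements
/// followed by a compressing shuffle with mask {0, 1, 3, 4}, if the cost model
/// considers that cheaper than gathering the scalars.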
6750 static bool isMaskedLoadCompress(
6751 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6752 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6753 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6754 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6755 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6756 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6757 VectorType *&LoadVecTy) {
6758 InterleaveFactor = 0;
6759 Type *ScalarTy = VL.front()->getType();
6760 const size_t Sz = VL.size();
6761 auto *VecTy = getWidenedType(ScalarTy, Sz);
6762 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6763 SmallVector<int> Mask;
6764 if (!Order.empty())
6765 inversePermutation(Order, Mask);
6766 // Check external uses.
6767 for (const auto [I, V] : enumerate(VL)) {
6768 if (AreAllUsersVectorized(V))
6769 continue;
6770 InstructionCost ExtractCost =
6771 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6772 Mask.empty() ? I : Mask[I]);
6773 InstructionCost ScalarCost =
6774 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6775 if (ExtractCost <= ScalarCost)
6776 return false;
6777 }
6778 Value *Ptr0;
6779 Value *PtrN;
6780 if (Order.empty()) {
6781 Ptr0 = PointerOps.front();
6782 PtrN = PointerOps.back();
6783 } else {
6784 Ptr0 = PointerOps[Order.front()];
6785 PtrN = PointerOps[Order.back()];
6786 }
6787 std::optional<int64_t> Diff =
6788 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6789 if (!Diff)
6790 return false;
6791 const size_t MaxRegSize =
6792 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6793 .getFixedValue();
6794 // Check for very large distances between elements.
6795 if (*Diff / Sz >= MaxRegSize / 8)
6796 return false;
6797 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6798 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6799 Align CommonAlignment = LI->getAlign();
6800 IsMasked = !isSafeToLoadUnconditionally(
6801 Ptr0, LoadVecTy, CommonAlignment, DL,
6802 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6803 &TLI);
6804 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6805 LI->getPointerAddressSpace()))
6806 return false;
6807 // TODO: perform the analysis of each scalar load for better
6808 // safe-load-unconditionally analysis.
6809 bool IsStrided =
6810 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6811 assert(CompressMask.size() >= 2 && "At least two elements are required");
6812 SmallVector<Value *> OrderedPointerOps(PointerOps);
6813 if (!Order.empty())
6814 reorderScalars(OrderedPointerOps, Mask);
6815 auto [ScalarGEPCost, VectorGEPCost] =
6816 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6817 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6818 // The cost of scalar loads.
6819 InstructionCost ScalarLoadsCost =
6820 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6821 [&](InstructionCost C, Value *V) {
6822 return C + TTI.getInstructionCost(cast<Instruction>(V),
6823 CostKind);
6824 }) +
6825 ScalarGEPCost;
6826 APInt DemandedElts = APInt::getAllOnes(Sz);
6827 InstructionCost GatherCost =
6828 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6829 /*Insert=*/true,
6830 /*Extract=*/false, CostKind) +
6831 ScalarLoadsCost;
6832 InstructionCost LoadCost = 0;
6833 if (IsMasked) {
6834 LoadCost =
6835 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6836 LI->getPointerAddressSpace(), CostKind);
6837 } else {
6838 LoadCost =
6839 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6840 LI->getPointerAddressSpace(), CostKind);
6841 }
6842 if (IsStrided && !IsMasked && Order.empty()) {
6843 // Check for potential segmented(interleaved) loads.
6844 VectorType *AlignedLoadVecTy = getWidenedType(
6845 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6846 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6847 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6848 &TLI))
6849 AlignedLoadVecTy = LoadVecTy;
6850 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6851 CommonAlignment,
6852 LI->getPointerAddressSpace())) {
6853 InstructionCost InterleavedCost =
6854 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6855 Instruction::Load, AlignedLoadVecTy,
6856 CompressMask[1], {}, CommonAlignment,
6857 LI->getPointerAddressSpace(), CostKind, IsMasked);
6858 if (InterleavedCost < GatherCost) {
6859 InterleaveFactor = CompressMask[1];
6860 LoadVecTy = AlignedLoadVecTy;
6861 return true;
6862 }
6863 }
6864 }
6865 InstructionCost CompressCost = ::getShuffleCost(
6866 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6867 if (!Order.empty()) {
6868 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6869 for (unsigned I : seq<unsigned>(Sz)) {
6870 NewMask[I] = CompressMask[Mask[I]];
6871 }
6872 CompressMask.swap(NewMask);
6873 }
6874 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6875 return TotalVecCost < GatherCost;
6876}
6877
6878 /// Checks if \p VL can be transformed into a (masked) load + compress or a
6879 /// (masked) interleaved load.
6880static bool
6881 isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6882 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6883 const DataLayout &DL, ScalarEvolution &SE,
6884 AssumptionCache &AC, const DominatorTree &DT,
6885 const TargetLibraryInfo &TLI,
6886 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6887 bool IsMasked;
6888 unsigned InterleaveFactor;
6889 SmallVector<int> CompressMask;
6890 VectorType *LoadVecTy;
6891 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6892 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6893 CompressMask, LoadVecTy);
6894}
6895
6896/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6897/// PointerOps:
6898/// 1. Target with strided load support is detected.
6899/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6900/// potential stride <= MaxProfitableLoadStride and the potential stride is
6901/// power-of-2 (to avoid perf regressions for the very small number of loads)
6902/// and max distance > number of loads, or potential stride is -1.
6903/// 3. The loads are ordered, or number of unordered loads <=
6904/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6905/// to avoid extra costs for very expensive shuffles).
6906 /// 4. Any pointer operand is an instruction with users outside of the
6907 /// current graph (for masked gathers extra extractelement instructions
6908 /// might be required).
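/// E.g. (illustrative), four loads whose pointers are {0, 3, 6, 9} elements
/// apart (Diff == 9) can form a strided load with stride 3, subject to the
/// legality and profitability checks below.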
6909 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
6910 Align Alignment, const int64_t Diff,
6911 const size_t Sz) const {
6912 if (Diff % (Sz - 1) != 0)
6913 return false;
6914
6915 // Try to generate strided load node.
6916 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6917 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6918 return !isVectorized(U) && !MustGather.contains(U);
6919 });
6920 });
6921
6922 const uint64_t AbsoluteDiff = std::abs(Diff);
6923 auto *VecTy = getWidenedType(ScalarTy, Sz);
6924 if (IsAnyPointerUsedOutGraph ||
6925 (AbsoluteDiff > Sz &&
6927 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6928 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6929 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6930 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6931 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6932 return false;
6933 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6934 return false;
6935 return true;
6936 }
6937 return false;
6938}
6939
6941 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
6942 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
6943 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
6944 const size_t Sz = PointerOps.size();
6945 if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
6946 return false;
6947
6948 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6949
6950 // Iterate through all pointers and check if all distances are
6951 // unique multiples of Stride.
6952 SmallDenseSet<int64_t> Dists;
6953 for (Value *Ptr : PointerOps) {
6954 int64_t Dist = 0;
6955 if (Ptr == PtrN)
6956 Dist = Diff;
6957 else if (Ptr != Ptr0)
6958 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
6959 // If the distances are not multiples of Stride, or any distance repeats, we
6960 // can't vectorize.
6961 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6962 break;
6963 }
6964 if (Dists.size() == Sz) {
6965 Type *StrideTy = DL->getIndexType(Ptr0->getType());
6966 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6967 SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
6968 return true;
6969 }
6970 return false;
6971}
6972
6973 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
6974 Type *ScalarTy, Align CommonAlignment,
6975 SmallVectorImpl<unsigned> &SortedIndices,
6976 StridedPtrInfo &SPtrInfo) const {
6977 const unsigned Sz = PointerOps.size();
6978 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
6979 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6980 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6981 return false;
6982 if (const SCEV *Stride =
6983 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
6984 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
6985 SPtrInfo.StrideSCEV = Stride;
6986 return true;
6987 }
6988 return false;
6989}
6990
6991 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
6992 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
6993 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
6994 unsigned *BestVF, bool TryRecursiveCheck) const {
6995 // Check that a vectorized load would load the same memory as a scalar
6996 // load. For example, we don't want to vectorize loads that are smaller
6997 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6998 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6999 // from such a struct, we read/write packed bits disagreeing with the
7000 // unvectorized version.
7001 if (BestVF)
7002 *BestVF = 0;
7004 return LoadsState::Gather;
7005 Type *ScalarTy = VL0->getType();
7006
7007 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7008 return LoadsState::Gather;
7009
7010 // Make sure all loads in the bundle are simple - we can't vectorize
7011 // atomic or volatile loads.
7012 PointerOps.clear();
7013 const size_t Sz = VL.size();
7014 PointerOps.resize(Sz);
7015 auto *POIter = PointerOps.begin();
7016 for (Value *V : VL) {
7017 auto *L = dyn_cast<LoadInst>(V);
7018 if (!L || !L->isSimple())
7019 return LoadsState::Gather;
7020 *POIter = L->getPointerOperand();
7021 ++POIter;
7022 }
7023
7024 Order.clear();
7025 // Check the order of pointer operands or that all pointers are the same.
7026 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7027
7028 auto *VecTy = getWidenedType(ScalarTy, Sz);
7029 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7030 if (!IsSorted) {
7031 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7032 SPtrInfo))
7033 return LoadsState::StridedVectorize;
7034
7035 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7036 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7037 return LoadsState::Gather;
7038
7039 if (!all_of(PointerOps, [&](Value *P) {
7040 return arePointersCompatible(P, PointerOps.front(), *TLI);
7041 }))
7042 return LoadsState::Gather;
7043
7044 } else {
7045 Value *Ptr0;
7046 Value *PtrN;
7047 if (Order.empty()) {
7048 Ptr0 = PointerOps.front();
7049 PtrN = PointerOps.back();
7050 } else {
7051 Ptr0 = PointerOps[Order.front()];
7052 PtrN = PointerOps[Order.back()];
7053 }
7054 std::optional<int64_t> Diff =
7055 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7056 // Check that the sorted loads are consecutive.
7057 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7058 return LoadsState::Vectorize;
7059 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7060 *TLI, [&](Value *V) {
7061 return areAllUsersVectorized(
7062 cast<Instruction>(V), UserIgnoreList);
7063 }))
7064 return LoadsState::CompressVectorize;
7065 Align Alignment =
7066 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7067 ->getAlign();
7068 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7069 *Diff, Ptr0, PtrN, SPtrInfo))
7070 return LoadsState::StridedVectorize;
7071 }
7072 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7073 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7074 return LoadsState::Gather;
7075 // Compare the cost of loads + shuffles against strided/masked gather loads.
7076 // Returns true if the vectorized + shuffles representation is better than
7077 // just gather.
7078 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7079 unsigned *BestVF,
7080 bool ProfitableGatherPointers) {
7081 if (BestVF)
7082 *BestVF = 0;
7083 // Compare masked gather cost and loads + insert subvector costs.
7085 auto [ScalarGEPCost, VectorGEPCost] =
7086 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7087 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7088 // Estimate the cost of masked gather GEP. If not a splat, roughly
7089 // estimate as a buildvector, otherwise estimate as splat.
7090 APInt DemandedElts = APInt::getAllOnes(Sz);
7091 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7092 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7093 if (static_cast<unsigned>(count_if(
7094 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7095 any_of(PointerOps, [&](Value *V) {
7096 return getUnderlyingObject(V) !=
7097 getUnderlyingObject(PointerOps.front());
7098 }))
7099 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7100 DemandedElts, /*Insert=*/true,
7101 /*Extract=*/false, CostKind);
7102 else
7103 VectorGEPCost +=
7105 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7106 /*Insert=*/true, /*Extract=*/false, CostKind) +
7107 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7108 // The cost of scalar loads.
7109 InstructionCost ScalarLoadsCost =
7110 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7111 [&](InstructionCost C, Value *V) {
7112 return C + TTI.getInstructionCost(
7114 }) +
7115 ScalarGEPCost;
7116 // The cost of masked gather.
7117 InstructionCost MaskedGatherCost =
7118 TTI.getGatherScatterOpCost(
7119 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7120 /*VariableMask=*/false, CommonAlignment, CostKind) +
7121 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7122 InstructionCost GatherCost =
7123 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7124 /*Insert=*/true,
7125 /*Extract=*/false, CostKind) +
7126 ScalarLoadsCost;
7127 // The list of loads is small, or we already performed the partial check -
7128 // directly compare the masked gather cost and the gather cost.
7129 constexpr unsigned ListLimit = 4;
7130 if (!TryRecursiveCheck || VL.size() < ListLimit)
7131 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7132
7133 // FIXME: The following code has not been updated for non-power-of-2
7134 // vectors (and not whole registers). The splitting logic here does not
7135 // cover the original vector if the vector factor is not a power of two.
7136 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7137 return false;
7138
7139 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7140 unsigned MinVF = getMinVF(2 * Sz);
7141 DemandedElts.clearAllBits();
7142 // Iterate through possible vectorization factors and check if vectorized +
7143 // shuffles is better than just gather.
7144 for (unsigned VF =
7145 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7146 VF >= MinVF;
7147 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7148 SmallVector<LoadsState> States;
7149 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7150 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7151 OrdersType Order;
7152 SmallVector<Value *> PointerOps;
7153 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7154 PointerOps, SPtrInfo, BestVF,
7155 /*TryRecursiveCheck=*/false);
7156 // Check that the sorted loads are consecutive.
7157 if (LS == LoadsState::Gather) {
7158 if (BestVF) {
7159 DemandedElts.setAllBits();
7160 break;
7161 }
7162 DemandedElts.setBits(Cnt, Cnt + VF);
7163 continue;
7164 }
7165 // If a reorder is needed - consider it as a high-cost masked gather for now.
7166 if ((LS == LoadsState::Vectorize ||
7167 LS == LoadsState::StridedVectorize ||
7168 LS == LoadsState::CompressVectorize) &&
7169 !Order.empty() && !isReverseOrder(Order))
7170 LS = LoadsState::ScatterVectorize;
7171 States.push_back(LS);
7172 }
7173 if (DemandedElts.isAllOnes())
7174 // All loads gathered - try smaller VF.
7175 continue;
7176 // Can be vectorized later as a series of loads/insertelements.
7177 InstructionCost VecLdCost = 0;
7178 if (!DemandedElts.isZero()) {
7179 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7180 /*Insert=*/true,
7181 /*Extract=*/false, CostKind) +
7182 ScalarGEPCost;
7183 for (unsigned Idx : seq<unsigned>(VL.size()))
7184 if (DemandedElts[Idx])
7185 VecLdCost +=
7186 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7187 }
7188 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7189 for (auto [I, LS] : enumerate(States)) {
7190 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7191 InstructionCost VectorGEPCost =
7192 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7193 ? 0
7194 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7195 LI0->getPointerOperand(),
7196 Instruction::GetElementPtr, CostKind, ScalarTy,
7197 SubVecTy)
7198 .second;
7199 if (LS == LoadsState::ScatterVectorize) {
7200 if (static_cast<unsigned>(
7201 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7202 PointerOps.size() - 1 ||
7203 any_of(PointerOps, [&](Value *V) {
7204 return getUnderlyingObject(V) !=
7205 getUnderlyingObject(PointerOps.front());
7206 }))
7207 VectorGEPCost += getScalarizationOverhead(
7208 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7209 /*Insert=*/true, /*Extract=*/false, CostKind);
7210 else
7211 VectorGEPCost +=
7213 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7214 /*Insert=*/true, /*Extract=*/false, CostKind) +
7215 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7216 CostKind);
7217 }
7218 switch (LS) {
7219 case LoadsState::Vectorize:
7220 VecLdCost +=
7221 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7222 LI0->getPointerAddressSpace(), CostKind,
7224 VectorGEPCost;
7225 break;
7226 case LoadsState::StridedVectorize:
7227 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7228 LI0->getPointerOperand(),
7229 /*VariableMask=*/false,
7230 CommonAlignment, CostKind) +
7231 VectorGEPCost;
7232 break;
7233 case LoadsState::CompressVectorize:
7234 VecLdCost += TTI.getMaskedMemoryOpCost(
7235 Instruction::Load, SubVecTy, CommonAlignment,
7236 LI0->getPointerAddressSpace(), CostKind) +
7237 VectorGEPCost +
7239 {}, CostKind);
7240 break;
7241 case LoadsState::ScatterVectorize:
7242 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7243 LI0->getPointerOperand(),
7244 /*VariableMask=*/false,
7245 CommonAlignment, CostKind) +
7246 VectorGEPCost;
7247 break;
7248 case LoadsState::Gather:
7249 // Gathers are already calculated - ignore.
7250 continue;
7251 }
7252 SmallVector<int> ShuffleMask(VL.size());
7253 for (int Idx : seq<int>(0, VL.size()))
7254 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7255 if (I > 0)
7256 VecLdCost +=
7257 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7258 CostKind, I * VF, SubVecTy);
7259 }
7260 // If masked gather cost is higher - better to vectorize, so
7261 // consider it as a gather node. It will be better estimated
7262 // later.
7263 if (MaskedGatherCost >= VecLdCost &&
7264 VecLdCost - GatherCost < -SLPCostThreshold) {
7265 if (BestVF)
7266 *BestVF = VF;
7267 return true;
7268 }
7269 }
7270 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7271 };
7272 // TODO: need to improve analysis of the pointers, if not all of them are
7273 // GEPs or have > 2 operands, we end up with a gather node, which just
7274 // increases the cost.
7275 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7276 bool ProfitableGatherPointers =
7277 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7278 return L->isLoopInvariant(V);
7279 })) <= Sz / 2;
7280 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7281 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7282 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7283 (GEP && GEP->getNumOperands() == 2 &&
7284 isa<Constant, Instruction>(GEP->getOperand(1)));
7285 })) {
7286 // Check if potential masked gather can be represented as series
7287 // of loads + insertsubvectors.
7288 // If masked gather cost is higher - better to vectorize, so
7289 // consider it as a gather node. It will be better estimated
7290 // later.
7291 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7292 ProfitableGatherPointers))
7293 return LoadsState::ScatterVectorize;
7294 }
7295
7296 return LoadsState::Gather;
7297}
7298
7299 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7300 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7301 const DataLayout &DL, ScalarEvolution &SE,
7302 SmallVectorImpl<unsigned> &SortedIndices) {
7303 assert(
7304 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7305 "Expected list of pointer operands.");
7306 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7307 // Ptr into, sort and return the sorted indices with values next to one
7308 // another.
7310 std::pair<BasicBlock *, Value *>,
7312 Bases;
7313 Bases
7314 .try_emplace(std::make_pair(
7316 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7317
7318 SortedIndices.clear();
7319 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7320 auto Key = std::make_pair(BBs[Cnt + 1],
7322 bool Found = any_of(Bases.try_emplace(Key).first->second,
7323 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7324 std::optional<int64_t> Diff =
7325 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7326 ElemTy, Ptr, DL, SE,
7327 /*StrictCheck=*/true);
7328 if (!Diff)
7329 return false;
7330
7331 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7332 return true;
7333 });
7334
7335 if (!Found) {
7336 // If we haven't found enough to usefully cluster, return early.
7337 if (Bases.size() > VL.size() / 2 - 1)
7338 return false;
7339
7340 // Not found already - add a new Base
7341 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7342 }
7343 }
7344
7345 if (Bases.size() == VL.size())
7346 return false;
7347
7348 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7349 Bases.front().second.size() == VL.size()))
7350 return false;
7351
7352 // For each of the bases, sort the pointers by Offset and check if any of the
7353 // bases becomes a consecutive sequence.
7354 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7355 SmallPtrSet<Value *, 13> FirstPointers;
7356 SmallPtrSet<Value *, 13> SecondPointers;
7357 Value *P1 = Ptr1;
7358 Value *P2 = Ptr2;
7359 unsigned Depth = 0;
7360 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7361 if (P1 == P2 || Depth > RecursionMaxDepth)
7362 return false;
7363 FirstPointers.insert(P1);
7364 SecondPointers.insert(P2);
7365 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7366 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7367 ++Depth;
7368 }
7369 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7370 "Unable to find matching root.");
7371 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7372 };
7373 for (auto &Base : Bases) {
7374 for (auto &Vec : Base.second) {
7375 if (Vec.size() > 1) {
7377 int64_t InitialOffset = std::get<1>(Vec[0]);
7378 bool AnyConsecutive =
7379 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7380 return std::get<1>(P.value()) ==
7381 int64_t(P.index()) + InitialOffset;
7382 });
7383 // Fill the SortedIndices array only if it looks worthwhile to sort the
7384 // pointers.
7385 if (!AnyConsecutive)
7386 return false;
7387 }
7388 }
7389 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7390 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7391 });
7392 }
7393
7394 for (auto &T : Bases)
7395 for (const auto &Vec : T.second)
7396 for (const auto &P : Vec)
7397 SortedIndices.push_back(std::get<2>(P));
7398
7399 assert(SortedIndices.size() == VL.size() &&
7400 "Expected SortedIndices to be the size of VL");
7401 return true;
7402}
7403
7404std::optional<BoUpSLP::OrdersType>
7405BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7406 assert(TE.isGather() && "Expected gather node only.");
7407 Type *ScalarTy = TE.Scalars[0]->getType();
7408
7409 SmallVector<Value *> Ptrs;
7410 Ptrs.reserve(TE.Scalars.size());
7411 SmallVector<BasicBlock *> BBs;
7412 BBs.reserve(TE.Scalars.size());
7413 for (Value *V : TE.Scalars) {
7414 auto *L = dyn_cast<LoadInst>(V);
7415 if (!L || !L->isSimple())
7416 return std::nullopt;
7417 Ptrs.push_back(L->getPointerOperand());
7418 BBs.push_back(L->getParent());
7419 }
7420
7421 BoUpSLP::OrdersType Order;
7422 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7423 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7424 return std::move(Order);
7425 return std::nullopt;
7426}
7427
7428/// Check if two insertelement instructions are from the same buildvector.
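/// E.g. (illustrative):
///   %v0 = insertelement <2 x float> poison, float %a, i32 0
///   %v1 = insertelement <2 x float> %v0, float %b, i32 1
/// Here %v1 and %v0 belong to the same buildvector sequence.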
7429 static bool areTwoInsertFromSameBuildVector(
7430 InsertElementInst *VU, InsertElementInst *V,
7431 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7432 // Instructions must be from the same basic blocks.
7433 if (VU->getParent() != V->getParent())
7434 return false;
7435 // Checks if 2 insertelements are from the same buildvector.
7436 if (VU->getType() != V->getType())
7437 return false;
7438 // Multiple used inserts are separate nodes.
7439 if (!VU->hasOneUse() && !V->hasOneUse())
7440 return false;
7441 auto *IE1 = VU;
7442 auto *IE2 = V;
7443 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7444 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7445 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7446 return false;
7447 // Go through the vector operand of insertelement instructions trying to find
7448 // either VU as the original vector for IE2 or V as the original vector for
7449 // IE1.
7450 SmallBitVector ReusedIdx(
7451 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7452 bool IsReusedIdx = false;
7453 do {
7454 if (IE2 == VU && !IE1)
7455 return VU->hasOneUse();
7456 if (IE1 == V && !IE2)
7457 return V->hasOneUse();
7458 if (IE1 && IE1 != V) {
7459 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7460 IsReusedIdx |= ReusedIdx.test(Idx1);
7461 ReusedIdx.set(Idx1);
7462 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7463 IE1 = nullptr;
7464 else
7465 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7466 }
7467 if (IE2 && IE2 != VU) {
7468 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7469 IsReusedIdx |= ReusedIdx.test(Idx2);
7470 ReusedIdx.set(Idx2);
7471 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7472 IE2 = nullptr;
7473 else
7474 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7475 }
7476 } while (!IsReusedIdx && (IE1 || IE2));
7477 return false;
7478}
7479
7480/// Checks if the specified instruction \p I is an alternate operation for
7481/// the given \p MainOp and \p AltOp instructions.
7482static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7483 Instruction *AltOp,
7484 const TargetLibraryInfo &TLI);
7485
7486std::optional<BoUpSLP::OrdersType>
7487BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7488 bool IgnoreReorder) {
7489 // No need to reorder if we need to shuffle reuses - the node still has to be
7490 // shuffled anyway.
7491 if (!TE.ReuseShuffleIndices.empty()) {
7492 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7493 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7494 "Reshuffling scalars not yet supported for nodes with padding");
7495
7496 if (isSplat(TE.Scalars))
7497 return std::nullopt;
7498 // Check if reuse shuffle indices can be improved by reordering.
7499 // For this, check that the reuse mask is "clustered", i.e. each scalar value
7500 // is used once in each submask of size <number_of_scalars>.
7501 // Example: 4 scalar values.
7502 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7503 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7504 // element 3 is used twice in the second submask.
7505 unsigned Sz = TE.Scalars.size();
7506 if (TE.isGather()) {
7507 if (std::optional<OrdersType> CurrentOrder =
7508 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7509 SmallVector<int> Mask;
7510 fixupOrderingIndices(*CurrentOrder);
7511 inversePermutation(*CurrentOrder, Mask);
7512 ::addMask(Mask, TE.ReuseShuffleIndices);
7513 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7514 unsigned Sz = TE.Scalars.size();
7515 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7516 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7517 if (Idx != PoisonMaskElem)
7518 Res[Idx + K * Sz] = I + K * Sz;
7519 }
7520 return std::move(Res);
7521 }
7522 }
7523 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7524 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7525 2 * TE.getVectorFactor())) == 1)
7526 return std::nullopt;
7527 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7528 return std::nullopt;
7529 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7530 Sz)) {
7531 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7532 if (TE.ReorderIndices.empty())
7533 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7534 else
7535 inversePermutation(TE.ReorderIndices, ReorderMask);
7536 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7537 unsigned VF = ReorderMask.size();
7538 OrdersType ResOrder(VF, VF);
7539 unsigned NumParts = divideCeil(VF, Sz);
7540 SmallBitVector UsedVals(NumParts);
7541 for (unsigned I = 0; I < VF; I += Sz) {
7542 int Val = PoisonMaskElem;
7543 unsigned UndefCnt = 0;
7544 unsigned Limit = std::min(Sz, VF - I);
7545 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7546 [&](int Idx) {
7547 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7548 Val = Idx;
7549 if (Idx == PoisonMaskElem)
7550 ++UndefCnt;
7551 return Idx != PoisonMaskElem && Idx != Val;
7552 }) ||
7553 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7554 UndefCnt > Sz / 2)
7555 return std::nullopt;
7556 UsedVals.set(Val);
7557 for (unsigned K = 0; K < NumParts; ++K) {
7558 unsigned Idx = Val + Sz * K;
7559 if (Idx < VF && I + K < VF)
7560 ResOrder[Idx] = I + K;
7561 }
7562 }
7563 return std::move(ResOrder);
7564 }
7565 unsigned VF = TE.getVectorFactor();
7566 // Try to build the correct order for extractelement instructions.
7567 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7568 TE.ReuseShuffleIndices.end());
7569 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7570 all_of(TE.Scalars, [Sz](Value *V) {
7571 if (isa<PoisonValue>(V))
7572 return true;
7573 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7574 return Idx && *Idx < Sz;
7575 })) {
7576 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7577 "by BinaryOperator and CastInst.");
7578 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7579 if (TE.ReorderIndices.empty())
7580 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7581 else
7582 inversePermutation(TE.ReorderIndices, ReorderMask);
7583 for (unsigned I = 0; I < VF; ++I) {
7584 int &Idx = ReusedMask[I];
7585 if (Idx == PoisonMaskElem)
7586 continue;
7587 Value *V = TE.Scalars[ReorderMask[Idx]];
7588 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7589 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7590 }
7591 }
7592 // Build the order of VF size; the reuse shuffles need to be reordered, as they
7593 // are always of VF size.
7594 OrdersType ResOrder(VF);
7595 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7596 auto *It = ResOrder.begin();
7597 for (unsigned K = 0; K < VF; K += Sz) {
7598 OrdersType CurrentOrder(TE.ReorderIndices);
7599 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7600 if (SubMask.front() == PoisonMaskElem)
7601 std::iota(SubMask.begin(), SubMask.end(), 0);
7602 reorderOrder(CurrentOrder, SubMask);
7603 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7604 std::advance(It, Sz);
7605 }
7606 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7607 return Data.index() == Data.value();
7608 }))
7609 return std::nullopt; // No need to reorder.
7610 return std::move(ResOrder);
7611 }
7612 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7613 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7614 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7615 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7616 return std::nullopt;
7617 if (TE.State == TreeEntry::SplitVectorize ||
7618 ((TE.State == TreeEntry::Vectorize ||
7619 TE.State == TreeEntry::StridedVectorize ||
7620 TE.State == TreeEntry::CompressVectorize) &&
7621 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7622 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7623 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7624 "Alternate instructions are only supported by "
7625 "BinaryOperator and CastInst.");
7626 return TE.ReorderIndices;
7627 }
7628 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7629 TE.isAltShuffle()) {
7630 assert(TE.ReuseShuffleIndices.empty() &&
7631 "ReuseShuffleIndices should be "
7632 "empty for alternate instructions.");
7633 SmallVector<int> Mask;
7634 TE.buildAltOpShuffleMask(
7635 [&](Instruction *I) {
7636 assert(TE.getMatchingMainOpOrAltOp(I) &&
7637 "Unexpected main/alternate opcode");
7638 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7639 },
7640 Mask);
7641 const int VF = TE.getVectorFactor();
7642 OrdersType ResOrder(VF, VF);
7643 for (unsigned I : seq<unsigned>(VF)) {
7644 if (Mask[I] == PoisonMaskElem)
7645 continue;
7646 ResOrder[Mask[I] % VF] = I;
7647 }
7648 return std::move(ResOrder);
7649 }
7650 if (!TE.ReorderIndices.empty())
7651 return TE.ReorderIndices;
7652 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7653 if (!TE.ReorderIndices.empty())
7654 return TE.ReorderIndices;
7655
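// For each PHI result, walk its first insertelement user back through the
// operand-0 chain (single-use, same basic block) to find the head of the
// buildvector-like sequence; the PHIs are later ordered by these heads and by
// their first users.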
7656 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7657 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7658 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7659 continue;
7660 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7661 if (!II)
7662 continue;
7663 Instruction *BVHead = nullptr;
7664 BasicBlock *BB = II->getParent();
7665 while (II && II->hasOneUse() && II->getParent() == BB) {
7666 BVHead = II;
7667 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7668 }
7669 I = BVHead;
7670 }
7671
7672 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7673 assert(BB1 != BB2 && "Expected different basic blocks.");
7674 if (!DT->isReachableFromEntry(BB1))
7675 return false;
7676 if (!DT->isReachableFromEntry(BB2))
7677 return true;
7678 auto *NodeA = DT->getNode(BB1);
7679 auto *NodeB = DT->getNode(BB2);
7680 assert(NodeA && "Should only process reachable instructions");
7681 assert(NodeB && "Should only process reachable instructions");
7682 assert((NodeA == NodeB) ==
7683 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7684 "Different nodes should have different DFS numbers");
7685 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7686 };
7687 auto PHICompare = [&](unsigned I1, unsigned I2) {
7688 Value *V1 = TE.Scalars[I1];
7689 Value *V2 = TE.Scalars[I2];
7690 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7691 return false;
7692 if (isa<PoisonValue>(V1))
7693 return true;
7694 if (isa<PoisonValue>(V2))
7695 return false;
7696 if (V1->getNumUses() < V2->getNumUses())
7697 return true;
7698 if (V1->getNumUses() > V2->getNumUses())
7699 return false;
7700 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7701 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7702 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7703 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7704 FirstUserOfPhi2->getParent());
7705 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7706 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7707 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7708 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7709 if (IE1 && !IE2)
7710 return true;
7711 if (!IE1 && IE2)
7712 return false;
7713 if (IE1 && IE2) {
7714 if (UserBVHead[I1] && !UserBVHead[I2])
7715 return true;
7716 if (!UserBVHead[I1])
7717 return false;
7718 if (UserBVHead[I1] == UserBVHead[I2])
7719 return getElementIndex(IE1) < getElementIndex(IE2);
7720 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7721 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7722 UserBVHead[I2]->getParent());
7723 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7724 }
7725 if (EE1 && !EE2)
7726 return true;
7727 if (!EE1 && EE2)
7728 return false;
7729 if (EE1 && EE2) {
7730 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7731 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7732 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7733 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7734 if (!Inst2 && !P2)
7735 return Inst1 || P1;
7736 if (EE1->getOperand(0) == EE2->getOperand(0))
7737 return getElementIndex(EE1) < getElementIndex(EE2);
7738 if (!Inst1 && Inst2)
7739 return false;
7740 if (Inst1 && Inst2) {
7741 if (Inst1->getParent() != Inst2->getParent())
7742 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7743 return Inst1->comesBefore(Inst2);
7744 }
7745 if (!P1 && P2)
7746 return false;
7747 assert(P1 && P2 &&
7748 "Expected either instructions or arguments vector operands.");
7749 return P1->getArgNo() < P2->getArgNo();
7750 }
7751 return false;
7752 };
7753 OrdersType Phis(TE.Scalars.size());
7754 std::iota(Phis.begin(), Phis.end(), 0);
7755 stable_sort(Phis, PHICompare);
7756 if (isIdentityOrder(Phis))
7757 return std::nullopt; // No need to reorder.
7758 return std::move(Phis);
7759 }
7760 if (TE.isGather() &&
7761 (!TE.hasState() || !TE.isAltShuffle() ||
7762 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7763 allSameType(TE.Scalars)) {
7764 // TODO: add analysis of other gather nodes with extractelement
7765 // instructions and other values/instructions, not only undefs.
7766 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7767 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7768 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7769 all_of(TE.Scalars, [](Value *V) {
7770 auto *EE = dyn_cast<ExtractElementInst>(V);
7771 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7772 })) {
7773 // Check that gather of extractelements can be represented as
7774 // just a shuffle of a single vector.
7775 OrdersType CurrentOrder;
7776 bool Reuse =
7777 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7778 if (Reuse || !CurrentOrder.empty())
7779 return std::move(CurrentOrder);
7780 }
7781 // If the gather node is <undef, v, .., poison> and
7782 // insertelement poison, v, 0 [+ permute]
7783 // is cheaper than
7784 // insertelement poison, v, n - try to reorder.
7785 // If rotating the whole graph, exclude the permute cost, the whole graph
7786 // might be transformed.
7787 int Sz = TE.Scalars.size();
7788 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7789 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7790 const auto *It = find_if_not(TE.Scalars, isConstant);
7791 if (It == TE.Scalars.begin())
7792 return OrdersType();
7793 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7794 if (It != TE.Scalars.end()) {
7795 OrdersType Order(Sz, Sz);
7796 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7797 Order[Idx] = 0;
7798 fixupOrderingIndices(Order);
7799 SmallVector<int> Mask;
7800 inversePermutation(Order, Mask);
7801 InstructionCost PermuteCost =
7802 TopToBottom
7803 ? 0
7804 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7805 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7806 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7807 PoisonValue::get(Ty), *It);
7808 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7809 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7810 PoisonValue::get(Ty), *It);
7811 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7812 OrdersType Order(Sz, Sz);
7813 Order[Idx] = 0;
7814 return std::move(Order);
7815 }
7816 }
7817 }
7818 if (isSplat(TE.Scalars))
7819 return std::nullopt;
7820 if (TE.Scalars.size() >= 3)
7821 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7822 return Order;
7823 // Check if we can include the order of vectorized loads. For masked gathers, do
7824 // extra analysis later, so include such nodes in a special list.
7825 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7826 SmallVector<Value *> PointerOps;
7827 StridedPtrInfo SPtrInfo;
7828 OrdersType CurrentOrder;
7829 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7830 CurrentOrder, PointerOps, SPtrInfo);
7831 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7832 Res == LoadsState::CompressVectorize)
7833 return std::move(CurrentOrder);
7834 }
7835 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7836 // has been audited for correctness with non-power-of-two vectors.
7837 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7838 if (std::optional<OrdersType> CurrentOrder =
7839 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7840 return CurrentOrder;
7841 }
7842 return std::nullopt;
7843}
7844
7845/// Checks if the given mask is a "clustered" mask with the same clusters of
7846/// size \p Sz, which are not identity submasks.
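/// Illustrative example: with \p Sz == 2, the mask {1, 0, 1, 0} is a repeated
/// non-identity clustered mask (returns true), while {0, 1, 0, 1} is rejected
/// because its first cluster is an identity submask.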
7847 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7848 unsigned Sz) {
7849 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7850 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7851 return false;
7852 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7853 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7854 if (Cluster != FirstCluster)
7855 return false;
7856 }
7857 return true;
7858}
7859
7860void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7861 // Reorder reuses mask.
7862 reorderReuses(TE.ReuseShuffleIndices, Mask);
7863 const unsigned Sz = TE.Scalars.size();
7864 // For vectorized nodes and non-clustered reuses, no need to do anything else.
7865 if (!TE.isGather() ||
7866 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7867 Sz) ||
7868 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7869 return;
7870 SmallVector<int> NewMask;
7871 inversePermutation(TE.ReorderIndices, NewMask);
7872 addMask(NewMask, TE.ReuseShuffleIndices);
7873 // Clear reorder since it is going to be applied to the new mask.
7874 TE.ReorderIndices.clear();
7875 // Try to improve gathered nodes with clustered reuses, if possible.
7876 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7877 SmallVector<unsigned> NewOrder(Slice);
7878 inversePermutation(NewOrder, NewMask);
7879 reorderScalars(TE.Scalars, NewMask);
7880 // Fill the reuses mask with the identity submasks.
7881 for (auto *It = TE.ReuseShuffleIndices.begin(),
7882 *End = TE.ReuseShuffleIndices.end();
7883 It != End; std::advance(It, Sz))
7884 std::iota(It, std::next(It, Sz), 0);
7885}
7886
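/// Fills the unset positions of \p Order (encoded as Order.size()): with the
/// position's own index when \p SecondaryOrder is empty, or with the value from
/// \p SecondaryOrder otherwise, in both cases only if the target index is not
/// already used. Illustrative example: for Order == {3, 4, 1, 4} (4 meaning
/// "unset") and SecondaryOrder == {0, 2, 1, 3}, position 1 becomes 2 while
/// position 3 stays unset because index 3 is already taken.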
7887 static void combineOrders(MutableArrayRef<unsigned> Order,
7888 ArrayRef<unsigned> SecondaryOrder) {
7889 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7890 "Expected same size of orders");
7891 size_t Sz = Order.size();
7892 SmallBitVector UsedIndices(Sz);
7893 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7894 if (Order[Idx] != Sz)
7895 UsedIndices.set(Order[Idx]);
7896 }
7897 if (SecondaryOrder.empty()) {
7898 for (unsigned Idx : seq<unsigned>(0, Sz))
7899 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7900 Order[Idx] = Idx;
7901 } else {
7902 for (unsigned Idx : seq<unsigned>(0, Sz))
7903 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7904 !UsedIndices.test(SecondaryOrder[Idx]))
7905 Order[Idx] = SecondaryOrder[Idx];
7906 }
7907}
7908
7911 return false;
7912
7913 constexpr unsigned TinyVF = 2;
7914 constexpr unsigned TinyTree = 10;
7915 constexpr unsigned PhiOpsLimit = 12;
7916 constexpr unsigned GatherLoadsLimit = 2;
7917 if (VectorizableTree.size() <= TinyTree)
7918 return true;
7919 if (VectorizableTree.front()->hasState() &&
7920 !VectorizableTree.front()->isGather() &&
7921 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7922 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7923 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7924 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7925 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7926 VectorizableTree.front()->ReorderIndices.empty()) {
7927 // Check if the tree has only a single store and a single (unordered) load node,
7928 // while the other nodes are phis or geps/binops combined with phis, and/or a
7929 // single gather load node.
7930 if (VectorizableTree.front()->hasState() &&
7931 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7932 VectorizableTree.front()->Scalars.size() == TinyVF &&
7933 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7934 return false;
7935 // Single node which requires reordering - skip.
7936 if (VectorizableTree.front()->hasState() &&
7937 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7938 VectorizableTree.front()->ReorderIndices.empty()) {
7939 const unsigned ReorderedSplitsCnt =
7940 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7941 return TE->State == TreeEntry::SplitVectorize &&
7942 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7943 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7944 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7945 });
7946 if (ReorderedSplitsCnt <= 1 &&
7947 static_cast<unsigned>(count_if(
7948 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7949 return ((!TE->isGather() &&
7950 (TE->ReorderIndices.empty() ||
7951 (TE->UserTreeIndex.UserTE &&
7952 TE->UserTreeIndex.UserTE->State ==
7953 TreeEntry::Vectorize &&
7954 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7955 .empty()))) ||
7956 (TE->isGather() && TE->ReorderIndices.empty() &&
7957 (!TE->hasState() || TE->isAltShuffle() ||
7958 TE->getOpcode() == Instruction::Load ||
7959 TE->getOpcode() == Instruction::ZExt ||
7960 TE->getOpcode() == Instruction::SExt))) &&
7961 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7962 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7963 return !isConstant(V) && isVectorized(V);
7964 }));
7965 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7966 return false;
7967 }
7968 bool HasPhis = false;
7969 bool HasLoad = true;
7970 unsigned GatherLoads = 0;
7971 for (const std::unique_ptr<TreeEntry> &TE :
7972 ArrayRef(VectorizableTree).drop_front()) {
7973 if (TE->State == TreeEntry::SplitVectorize)
7974 continue;
7975 if (!TE->hasState()) {
7976 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7978 continue;
7979 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7981 continue;
7982 return true;
7983 }
7984 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7985 if (!TE->isGather()) {
7986 HasLoad = false;
7987 continue;
7988 }
7989 if (HasLoad)
7990 return true;
7991 ++GatherLoads;
7992 if (GatherLoads >= GatherLoadsLimit)
7993 return true;
7994 }
7995 if (TE->getOpcode() == Instruction::GetElementPtr ||
7996 Instruction::isBinaryOp(TE->getOpcode()))
7997 continue;
7998 if (TE->getOpcode() != Instruction::PHI &&
7999 (!TE->hasCopyableElements() ||
8000 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8001 TE->Scalars.size() / 2))
8002 return true;
8003 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8004 TE->getNumOperands() > PhiOpsLimit)
8005 return false;
8006 HasPhis = true;
8007 }
8008 return !HasPhis;
8009 }
8010 return true;
8011}
8012
8013void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8014 ArrayRef<int> MaskOrder) {
8015 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8016 SmallVector<int> NewMask(getVectorFactor());
8017 SmallVector<int> NewMaskOrder(getVectorFactor());
8018 std::iota(NewMask.begin(), NewMask.end(), 0);
8019 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8020 if (Idx == 0) {
8021 copy(Mask, NewMask.begin());
8022 copy(MaskOrder, NewMaskOrder.begin());
8023 } else {
8024 assert(Idx == 1 && "Expected either 0 or 1 index.");
8025 unsigned Offset = CombinedEntriesWithIndices.back().second;
8026 for (unsigned I : seq<unsigned>(Mask.size())) {
8027 NewMask[I + Offset] = Mask[I] + Offset;
8028 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8029 }
8030 }
8031 reorderScalars(Scalars, NewMask);
8032 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8033 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8034 ReorderIndices.clear();
8035}
8036
8037 void BoUpSLP::reorderTopToBottom() {
8038 // Maps VF to the graph nodes.
8039 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8040 // ExtractElement gather nodes which can be vectorized and need to handle
8041 // their ordering.
8042 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8043
8044 // Phi nodes can have preferred ordering based on their result users
8045 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8046
8047 // AltShuffles can also have a preferred ordering that leads to fewer
8048 // instructions, e.g., the addsub instruction in x86.
8049 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8050
8051 // Maps a TreeEntry to the reorder indices of external users.
8052 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8053 ExternalUserReorderMap;
8054 // Find all reorderable nodes with the given VF.
8055 // Currently these are vectorized stores, loads, extracts + some gathering of
8056 // extracts.
8057 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8058 const std::unique_ptr<TreeEntry> &TE) {
8059 // Look for external users that will probably be vectorized.
8060 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8061 findExternalStoreUsersReorderIndices(TE.get());
8062 if (!ExternalUserReorderIndices.empty()) {
8063 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8064 ExternalUserReorderMap.try_emplace(TE.get(),
8065 std::move(ExternalUserReorderIndices));
8066 }
8067
8068 // Patterns like [fadd,fsub] can be combined into a single instruction in
8069 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8070 // to take into account their order when looking for the most used order.
8071 if (TE->hasState() && TE->isAltShuffle() &&
8072 TE->State != TreeEntry::SplitVectorize) {
8073 Type *ScalarTy = TE->Scalars[0]->getType();
8074 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8075 unsigned Opcode0 = TE->getOpcode();
8076 unsigned Opcode1 = TE->getAltOpcode();
8077 SmallBitVector OpcodeMask(
8078 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8079 // If this pattern is supported by the target then we consider the order.
8080 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8081 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8082 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8083 }
8084 // TODO: Check the reverse order too.
8085 }
8086
8087 bool IgnoreReorder =
8088 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8089 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8090 VectorizableTree.front()->getOpcode() == Instruction::Store);
8091 if (std::optional<OrdersType> CurrentOrder =
8092 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8093 // Do not include ordering for nodes used in the alt opcode vectorization,
8094 // better to reorder them during the bottom-to-top stage. If we follow the order
8095 // here, it causes reordering of the whole graph, though actually it is
8096 // profitable just to reorder the subgraph that starts from the alternate
8097 // opcode vectorization node. Such nodes already end up with the shuffle
8098 // instruction and it is just enough to change this shuffle rather than
8099 // rotate the scalars for the whole graph.
8100 unsigned Cnt = 0;
8101 const TreeEntry *UserTE = TE.get();
8102 while (UserTE && Cnt < RecursionMaxDepth) {
8103 if (!UserTE->UserTreeIndex)
8104 break;
8105 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8106 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8107 UserTE->UserTreeIndex.UserTE->Idx != 0)
8108 return;
8109 UserTE = UserTE->UserTreeIndex.UserTE;
8110 ++Cnt;
8111 }
8112 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8113 if (!(TE->State == TreeEntry::Vectorize ||
8114 TE->State == TreeEntry::StridedVectorize ||
8115 TE->State == TreeEntry::SplitVectorize ||
8116 TE->State == TreeEntry::CompressVectorize) ||
8117 !TE->ReuseShuffleIndices.empty())
8118 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8119 if (TE->State == TreeEntry::Vectorize &&
8120 TE->getOpcode() == Instruction::PHI)
8121 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8122 }
8123 });
8124
8125 // Reorder the graph nodes according to their vectorization factor.
8126 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8127 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8128 auto It = VFToOrderedEntries.find(VF);
8129 if (It == VFToOrderedEntries.end())
8130 continue;
8131 // Try to find the most profitable order. We just are looking for the most
8132 // used order and reorder scalar elements in the nodes according to this
8133 // mostly used order.
8134 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8135 // Delete VF entry upon exit.
8136 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8137
8138 // All operands are reordered and used only in this node - propagate the
8139 // most used order to the user node.
8140 MapVector<OrdersType, unsigned,
8141 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8142 OrdersUses;
8143 for (const TreeEntry *OpTE : OrderedEntries) {
8144 // No need to reorder these nodes; still need to extend and to use a shuffle,
8145 // just need to merge the reordering shuffle and the reuse shuffle.
8146 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8147 OpTE->State != TreeEntry::SplitVectorize)
8148 continue;
8149 // Count number of orders uses.
8150 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8151 &PhisToOrders]() -> const OrdersType & {
8152 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8153 auto It = GathersToOrders.find(OpTE);
8154 if (It != GathersToOrders.end())
8155 return It->second;
8156 }
8157 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8158 auto It = AltShufflesToOrders.find(OpTE);
8159 if (It != AltShufflesToOrders.end())
8160 return It->second;
8161 }
8162 if (OpTE->State == TreeEntry::Vectorize &&
8163 OpTE->getOpcode() == Instruction::PHI) {
8164 auto It = PhisToOrders.find(OpTE);
8165 if (It != PhisToOrders.end())
8166 return It->second;
8167 }
8168 return OpTE->ReorderIndices;
8169 }();
8170 // First consider the order of the external scalar users.
8171 auto It = ExternalUserReorderMap.find(OpTE);
8172 if (It != ExternalUserReorderMap.end()) {
8173 const auto &ExternalUserReorderIndices = It->second;
8174 // If the OpTE vector factor != number of scalars - use natural order,
8175 // it is an attempt to reorder node with reused scalars but with
8176 // external uses.
8177 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8178 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8179 ExternalUserReorderIndices.size();
8180 } else {
8181 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8182 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8183 }
8184 // No other useful reorder data in this entry.
8185 if (Order.empty())
8186 continue;
8187 }
8188 // Stores actually store the mask, not the order, need to invert.
8189 if (OpTE->State == TreeEntry::Vectorize &&
8190 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8191 assert(!OpTE->isAltShuffle() &&
8192 "Alternate instructions are only supported by BinaryOperator "
8193 "and CastInst.");
8194 SmallVector<int> Mask;
8195 inversePermutation(Order, Mask);
8196 unsigned E = Order.size();
8197 OrdersType CurrentOrder(E, E);
8198 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8199 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8200 });
8201 fixupOrderingIndices(CurrentOrder);
8202 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8203 } else {
8204 ++OrdersUses.try_emplace(Order, 0).first->second;
8205 }
8206 }
8207 if (OrdersUses.empty())
8208 continue;
8209 // Choose the most used order.
8210 unsigned IdentityCnt = 0;
8211 unsigned FilledIdentityCnt = 0;
8212 OrdersType IdentityOrder(VF, VF);
8213 for (auto &Pair : OrdersUses) {
8214 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8215 if (!Pair.first.empty())
8216 FilledIdentityCnt += Pair.second;
8217 IdentityCnt += Pair.second;
8218 combineOrders(IdentityOrder, Pair.first);
8219 }
8220 }
8221 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8222 unsigned Cnt = IdentityCnt;
8223 for (auto &Pair : OrdersUses) {
8224 // Prefer the identity order. But if a filled identity (non-empty order) is
8225 // found with the same number of uses as the new candidate order, we can
8226 // choose the candidate order instead.
8227 if (Cnt < Pair.second ||
8228 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8229 Cnt == Pair.second && !BestOrder.empty() &&
8230 isIdentityOrder(BestOrder))) {
8231 combineOrders(Pair.first, BestOrder);
8232 BestOrder = Pair.first;
8233 Cnt = Pair.second;
8234 } else {
8235 combineOrders(BestOrder, Pair.first);
8236 }
8237 }
8238 // Set order of the user node.
8239 if (isIdentityOrder(BestOrder))
8240 continue;
8241 fixupOrderingIndices(BestOrder);
8242 SmallVector<int> Mask;
8243 inversePermutation(BestOrder, Mask);
8244 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8245 unsigned E = BestOrder.size();
8246 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8247 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8248 });
8249 // Do an actual reordering, if profitable.
8250 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8251 // Just do the reordering for the nodes with the given VF.
8252 if (TE->Scalars.size() != VF) {
8253 if (TE->ReuseShuffleIndices.size() == VF) {
8254 assert(TE->State != TreeEntry::SplitVectorize &&
8255 "Split vectorized not expected.");
8256 // Need to reorder the reuses masks of the operands with smaller VF to
8257 // be able to find the match between the graph nodes and scalar
8258 // operands of the given node during vectorization/cost estimation.
8259 assert(
8260 (!TE->UserTreeIndex ||
8261 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8262 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8263 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8264 "All users must be of VF size.");
8265 if (SLPReVec) {
8266 assert(SLPReVec && "Only supported by REVEC.");
8267 // ShuffleVectorInst does not do reorderOperands (and it should not
8268 // because ShuffleVectorInst supports only a limited set of
8269 // patterns). Only do reorderNodeWithReuses if the user is not
8270 // ShuffleVectorInst.
8271 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8272 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8273 continue;
8274 }
8275 // Update ordering of the operands with the smaller VF than the given
8276 // one.
8277 reorderNodeWithReuses(*TE, Mask);
8278 // Update orders in user split vectorize nodes.
8279 if (TE->UserTreeIndex &&
8280 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8281 TE->UserTreeIndex.UserTE->reorderSplitNode(
8282 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8283 }
8284 continue;
8285 }
8286 if ((TE->State == TreeEntry::SplitVectorize &&
8287 TE->ReuseShuffleIndices.empty()) ||
8288 ((TE->State == TreeEntry::Vectorize ||
8289 TE->State == TreeEntry::StridedVectorize ||
8290 TE->State == TreeEntry::CompressVectorize) &&
8291 (isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
8292 InsertElementInst>(TE->getMainOp()) ||
8293 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8294 assert(
8295 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8296 TE->ReuseShuffleIndices.empty())) &&
8297 "Alternate instructions are only supported by BinaryOperator "
8298 "and CastInst.");
8299 // Build correct orders for extract{element,value}, loads,
8300 // stores and alternate (split) nodes.
8301 reorderOrder(TE->ReorderIndices, Mask);
8302 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8303 TE->reorderOperands(Mask);
8304 } else {
8305 // Reorder the node and its operands.
8306 TE->reorderOperands(Mask);
8307 assert(TE->ReorderIndices.empty() &&
8308 "Expected empty reorder sequence.");
8309 reorderScalars(TE->Scalars, Mask);
8310 }
8311 if (!TE->ReuseShuffleIndices.empty()) {
8312 // Apply reversed order to keep the original ordering of the reused
8313 // elements to avoid extra reorder indices shuffling.
8314 OrdersType CurrentOrder;
8315 reorderOrder(CurrentOrder, MaskOrder);
8316 SmallVector<int> NewReuses;
8317 inversePermutation(CurrentOrder, NewReuses);
8318 addMask(NewReuses, TE->ReuseShuffleIndices);
8319 TE->ReuseShuffleIndices.swap(NewReuses);
8320 } else if (TE->UserTreeIndex &&
8321 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8322 // Update orders in user split vectorize nodes.
8323 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8324 Mask, MaskOrder);
8325 }
8326 }
8327}
8328
8329void BoUpSLP::buildReorderableOperands(
8330 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8331 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8332 SmallVectorImpl<TreeEntry *> &GatherOps) {
8333 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8334 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8335 return OpData.first == I &&
8336 (OpData.second->State == TreeEntry::Vectorize ||
8337 OpData.second->State == TreeEntry::StridedVectorize ||
8338 OpData.second->State == TreeEntry::CompressVectorize ||
8339 OpData.second->State == TreeEntry::SplitVectorize);
8340 }))
8341 continue;
8342 // Do not request operands, if they do not exist.
8343 if (UserTE->hasState()) {
8344 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8345 UserTE->getOpcode() == Instruction::ExtractValue)
8346 continue;
8347 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8348 continue;
8349 if (UserTE->getOpcode() == Instruction::Store &&
8350 UserTE->State == TreeEntry::Vectorize && I == 1)
8351 continue;
8352 if (UserTE->getOpcode() == Instruction::Load &&
8353 (UserTE->State == TreeEntry::Vectorize ||
8354 UserTE->State == TreeEntry::StridedVectorize ||
8355 UserTE->State == TreeEntry::CompressVectorize))
8356 continue;
8357 }
8358 TreeEntry *TE = getOperandEntry(UserTE, I);
8359 assert(TE && "Expected operand entry.");
8360 if (!TE->isGather()) {
8361 // Add the node to the list of the ordered nodes with the identity
8362 // order.
8363 Edges.emplace_back(I, TE);
8364 // Add ScatterVectorize nodes to the list of operands, where just
8365 // reordering of the scalars is required. Similar to the gathers, so
8366 // simply add to the list of gathered ops.
8367 // If there are reused scalars, process this node as a regular vectorize
8368 // node, just reorder reuses mask.
8369 if (TE->State == TreeEntry::ScatterVectorize &&
8370 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8371 GatherOps.push_back(TE);
8372 continue;
8373 }
8374 if (ReorderableGathers.contains(TE))
8375 GatherOps.push_back(TE);
8376 }
8377}
8378
8379void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
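// Comparator for the reordering work queue: entries are keyed by their user
// node's index when available, so operands of the same user node come out of
// the queue next to each other and can be processed as a single batch below.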
8380 struct TreeEntryCompare {
8381 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8382 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8383 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8384 return LHS->Idx < RHS->Idx;
8385 }
8386 };
8387 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8388 DenseSet<const TreeEntry *> GathersToOrders;
8389 // Find all reorderable leaf nodes with the given VF.
8390 // Currently these are vectorized loads, extracts without alternate operands +
8391 // some gathering of extracts.
8393 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8394 if (TE->State != TreeEntry::Vectorize &&
8395 TE->State != TreeEntry::StridedVectorize &&
8396 TE->State != TreeEntry::CompressVectorize &&
8397 TE->State != TreeEntry::SplitVectorize)
8398 NonVectorized.insert(TE.get());
8399 if (std::optional<OrdersType> CurrentOrder =
8400 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8401 Queue.push(TE.get());
8402 if (!(TE->State == TreeEntry::Vectorize ||
8403 TE->State == TreeEntry::StridedVectorize ||
8404 TE->State == TreeEntry::CompressVectorize ||
8405 TE->State == TreeEntry::SplitVectorize) ||
8406 !TE->ReuseShuffleIndices.empty())
8407 GathersToOrders.insert(TE.get());
8408 }
8409 }
8410
8411 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8412 // I.e., if the node has operands that are reordered, try to keep at least
8413 // one operand in the natural order, reorder the others, and reorder the
8414 // user node itself.
8415 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8416 while (!Queue.empty()) {
8417 // 1. Filter out only reordered nodes.
8418 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8419 TreeEntry *TE = Queue.top();
8420 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8421 Queue.pop();
8422 SmallVector<TreeEntry *> OrderedOps(1, TE);
8423 while (!Queue.empty()) {
8424 TE = Queue.top();
8425 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8426 break;
8427 Queue.pop();
8428 OrderedOps.push_back(TE);
8429 }
8430 for (TreeEntry *TE : OrderedOps) {
8431 if (!(TE->State == TreeEntry::Vectorize ||
8432 TE->State == TreeEntry::StridedVectorize ||
8433 TE->State == TreeEntry::CompressVectorize ||
8434 TE->State == TreeEntry::SplitVectorize ||
8435 (TE->isGather() && GathersToOrders.contains(TE))) ||
8436 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8437 !Visited.insert(TE).second)
8438 continue;
8439 // Build a map between user nodes and their operand order to speed up the
8440 // search. The graph currently does not provide this dependency directly.
8441 Users.first = TE->UserTreeIndex.UserTE;
8442 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8443 }
8444 if (Users.first) {
8445 auto &Data = Users;
8446 if (Data.first->State == TreeEntry::SplitVectorize) {
8447 assert(
8448 Data.second.size() <= 2 &&
8449 "Expected not greater than 2 operands for split vectorize node.");
8450 if (any_of(Data.second,
8451 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8452 continue;
8453 // Update orders in user split vectorize nodes.
8454 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8455 "Expected exactly 2 entries.");
8456 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8457 TreeEntry &OpTE = *VectorizableTree[P.first];
8458 OrdersType Order = OpTE.ReorderIndices;
8459 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8460 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8461 continue;
8462 const auto BestOrder =
8463 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8464 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8465 continue;
8466 Order = *BestOrder;
8467 }
8468 fixupOrderingIndices(Order);
8469 SmallVector<int> Mask;
8470 inversePermutation(Order, Mask);
8471 const unsigned E = Order.size();
8472 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8473 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8474 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8475 });
8476 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8477 // Clear ordering of the operand.
8478 if (!OpTE.ReorderIndices.empty()) {
8479 OpTE.ReorderIndices.clear();
8480 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8481 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8482 } else {
8483 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8484 reorderScalars(OpTE.Scalars, Mask);
8485 }
8486 }
8487 if (Data.first->ReuseShuffleIndices.empty() &&
8488 !Data.first->ReorderIndices.empty()) {
8489 // Insert user node to the list to try to sink reordering deeper in
8490 // the graph.
8491 Queue.push(Data.first);
8492 }
8493 continue;
8494 }
8495 // Check that operands are used only in the User node.
8496 SmallVector<TreeEntry *> GatherOps;
8497 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8498 GatherOps);
8499 // All operands are reordered and used only in this node - propagate the
8500 // most used order to the user node.
8501 MapVector<OrdersType, unsigned,
8502 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8503 OrdersUses;
8504 // Do the analysis for each tree entry only once, otherwise the order of
8505 // the same node may be considered several times, though it might not be
8506 // profitable.
8507 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8508 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8509 for (const auto &Op : Data.second) {
8510 TreeEntry *OpTE = Op.second;
8511 if (!VisitedOps.insert(OpTE).second)
8512 continue;
8513 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8514 continue;
8515 const auto Order = [&]() -> const OrdersType {
8516 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8517 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8518 IgnoreReorder)
8519 .value_or(OrdersType(1));
8520 return OpTE->ReorderIndices;
8521 }();
8522 // The order is partially ordered, skip it in favor of fully non-ordered
8523 // orders.
8524 if (Order.size() == 1)
8525 continue;
8526
8527 // Check that the reordering does not increase the number of shuffles, i.e.
8528 // same-values nodes have the same parents or their parents have the same parents.
8529 if (!Order.empty() && !isIdentityOrder(Order)) {
8530 Value *Root = OpTE->hasState()
8531 ? OpTE->getMainOp()
8532 : *find_if_not(OpTE->Scalars, isConstant);
8533 auto GetSameNodesUsers = [&](Value *Root) {
8534 SmallSetVector<TreeEntry *, 4> Res;
8535 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8536 if (TE != OpTE && TE->UserTreeIndex &&
8537 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8538 TE->Scalars.size() == OpTE->Scalars.size() &&
8539 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8540 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8541 Res.insert(TE->UserTreeIndex.UserTE);
8542 }
8543 for (const TreeEntry *TE : getTreeEntries(Root)) {
8544 if (TE != OpTE && TE->UserTreeIndex &&
8545 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8546 TE->Scalars.size() == OpTE->Scalars.size() &&
8547 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8548 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8549 Res.insert(TE->UserTreeIndex.UserTE);
8550 }
8551 return Res.takeVector();
8552 };
8553 auto GetNumOperands = [](const TreeEntry *TE) {
8554 if (TE->State == TreeEntry::SplitVectorize)
8555 return TE->getNumOperands();
8556 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8557 return CI->arg_size();
8558 return TE->getNumOperands();
8559 };
8560 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8561 const TreeEntry *TE) {
8563 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8565 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8568 continue;
8569 const TreeEntry *Op = getOperandEntry(TE, Idx);
8570 if (Op->isGather() && Op->hasState()) {
8571 const TreeEntry *VecOp =
8572 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8573 if (VecOp)
8574 Op = VecOp;
8575 }
8576 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8577 return false;
8578 }
8579 return true;
8580 };
8581 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8582 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8583 if (!RevisitedOps.insert(UTE).second)
8584 return false;
8585 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8586 !UTE->ReuseShuffleIndices.empty() ||
8587 (UTE->UserTreeIndex &&
8588 UTE->UserTreeIndex.UserTE == Data.first) ||
8589 (Data.first->UserTreeIndex &&
8590 Data.first->UserTreeIndex.UserTE == UTE) ||
8591 (IgnoreReorder && UTE->UserTreeIndex &&
8592 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8593 NodeShouldBeReorderedWithOperands(UTE);
8594 }))
8595 continue;
8596 for (TreeEntry *UTE : Users) {
8598 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8600 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8603 continue;
8604 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8605 Visited.erase(Op);
8606 Queue.push(const_cast<TreeEntry *>(Op));
8607 }
8608 }
8609 }
8610 unsigned NumOps = count_if(
8611 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8612 return P.second == OpTE;
8613 });
8614 // Stores actually store the mask, not the order, need to invert.
8615 if (OpTE->State == TreeEntry::Vectorize &&
8616 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8617 assert(!OpTE->isAltShuffle() &&
8618 "Alternate instructions are only supported by BinaryOperator "
8619 "and CastInst.");
8620 SmallVector<int> Mask;
8621 inversePermutation(Order, Mask);
8622 unsigned E = Order.size();
8623 OrdersType CurrentOrder(E, E);
8624 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8625 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8626 });
8627 fixupOrderingIndices(CurrentOrder);
8628 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8629 } else {
8630 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8631 }
8632 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8633 const auto AllowsReordering = [&](const TreeEntry *TE) {
8634 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8635 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8636 (IgnoreReorder && TE->Idx == 0))
8637 return true;
8638 if (TE->isGather()) {
8639 if (GathersToOrders.contains(TE))
8640 return !getReorderingData(*TE, /*TopToBottom=*/false,
8641 IgnoreReorder)
8642 .value_or(OrdersType(1))
8643 .empty();
8644 return true;
8645 }
8646 return false;
8647 };
8648 if (OpTE->UserTreeIndex) {
8649 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8650 if (!VisitedUsers.insert(UserTE).second)
8651 continue;
8652 // May reorder user node if it requires reordering, has reused
8653 // scalars, is an alternate op vectorize node or its op nodes require
8654 // reordering.
8655 if (AllowsReordering(UserTE))
8656 continue;
8657 // Check if users allow reordering.
8658 // Currently look up just 1 level of operands to avoid an increase in
8659 // compile time.
8660 // Profitable to reorder if definitely more operands allow
8661 // reordering rather than those with natural order.
8663 if (static_cast<unsigned>(count_if(
8664 Ops, [UserTE, &AllowsReordering](
8665 const std::pair<unsigned, TreeEntry *> &Op) {
8666 return AllowsReordering(Op.second) &&
8667 Op.second->UserTreeIndex.UserTE == UserTE;
8668 })) <= Ops.size() / 2)
8669 ++Res.first->second;
8670 }
8671 }
8672 if (OrdersUses.empty()) {
8673 Visited.insert_range(llvm::make_second_range(Data.second));
8674 continue;
8675 }
8676 // Choose the most used order.
8677 unsigned IdentityCnt = 0;
8678 unsigned VF = Data.second.front().second->getVectorFactor();
8679 OrdersType IdentityOrder(VF, VF);
8680 for (auto &Pair : OrdersUses) {
8681 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8682 IdentityCnt += Pair.second;
8683 combineOrders(IdentityOrder, Pair.first);
8684 }
8685 }
8686 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8687 unsigned Cnt = IdentityCnt;
8688 for (auto &Pair : OrdersUses) {
8689 // Prefer the identity order. But if a filled identity (non-empty order) is
8690 // found with the same number of uses as the new candidate order, we can
8691 // choose the candidate order instead.
8692 if (Cnt < Pair.second) {
8693 combineOrders(Pair.first, BestOrder);
8694 BestOrder = Pair.first;
8695 Cnt = Pair.second;
8696 } else {
8697 combineOrders(BestOrder, Pair.first);
8698 }
8699 }
8700 // Set order of the user node.
8701 if (isIdentityOrder(BestOrder)) {
8702 Visited.insert_range(llvm::make_second_range(Data.second));
8703 continue;
8704 }
8705 fixupOrderingIndices(BestOrder);
8706 // Erase operands from OrderedEntries list and adjust their orders.
8707 VisitedOps.clear();
8708 SmallVector<int> Mask;
8709 inversePermutation(BestOrder, Mask);
8710 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8711 unsigned E = BestOrder.size();
8712 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8713 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8714 });
8715 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8716 TreeEntry *TE = Op.second;
8717 if (!VisitedOps.insert(TE).second)
8718 continue;
8719 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8720 reorderNodeWithReuses(*TE, Mask);
8721 continue;
8722 }
8723 // Gathers are processed separately.
8724 if (TE->State != TreeEntry::Vectorize &&
8725 TE->State != TreeEntry::StridedVectorize &&
8726 TE->State != TreeEntry::CompressVectorize &&
8727 TE->State != TreeEntry::SplitVectorize &&
8728 (TE->State != TreeEntry::ScatterVectorize ||
8729 TE->ReorderIndices.empty()))
8730 continue;
8731 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8732 TE->ReorderIndices.empty()) &&
8733 "Non-matching sizes of user/operand entries.");
8734 reorderOrder(TE->ReorderIndices, Mask);
8735 if (IgnoreReorder && TE == VectorizableTree.front().get())
8736 IgnoreReorder = false;
8737 }
8738 // For gathers just need to reorder its scalars.
8739 for (TreeEntry *Gather : GatherOps) {
8740 assert(Gather->ReorderIndices.empty() &&
8741 "Unexpected reordering of gathers.");
8742 if (!Gather->ReuseShuffleIndices.empty()) {
8743 // Just reorder reuses indices.
8744 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8745 continue;
8746 }
8747 reorderScalars(Gather->Scalars, Mask);
8748 Visited.insert(Gather);
8749 }
8750 // Reorder operands of the user node and set the ordering for the user
8751 // node itself.
8752 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8753 return TE.isAltShuffle() &&
8754 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8755 TE.ReorderIndices.empty());
8756 };
8757 if (Data.first->State != TreeEntry::Vectorize ||
8758 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8759 Data.first->getMainOp()) ||
8760 IsNotProfitableAltCodeNode(*Data.first))
8761 Data.first->reorderOperands(Mask);
8762 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8763 IsNotProfitableAltCodeNode(*Data.first) ||
8764 Data.first->State == TreeEntry::StridedVectorize ||
8765 Data.first->State == TreeEntry::CompressVectorize) {
8766 reorderScalars(Data.first->Scalars, Mask);
8767 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8768 /*BottomOrder=*/true);
8769 if (Data.first->ReuseShuffleIndices.empty() &&
8770 !Data.first->ReorderIndices.empty() &&
8771 !IsNotProfitableAltCodeNode(*Data.first)) {
8772 // Insert user node to the list to try to sink reordering deeper in
8773 // the graph.
8774 Queue.push(Data.first);
8775 }
8776 } else {
8777 reorderOrder(Data.first->ReorderIndices, Mask);
8778 }
8779 }
8780 }
8781 // If the reordering is unnecessary, just remove the reorder.
8782 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8783 VectorizableTree.front()->ReuseShuffleIndices.empty())
8784 VectorizableTree.front()->ReorderIndices.clear();
8785}
8786
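/// Returns the instruction that acts as the root scalar of \p Entry: for
/// strided loads/stores vectorized with a reverse order this is the scalar
/// selected by the first reorder index, otherwise the first scalar of the
/// entry.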
8787Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8788 if (Entry.hasState() &&
8789 (Entry.getOpcode() == Instruction::Store ||
8790 Entry.getOpcode() == Instruction::Load) &&
8791 Entry.State == TreeEntry::StridedVectorize &&
8792 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8793 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8794 return dyn_cast<Instruction>(Entry.Scalars.front());
8795}
8796
8797 void BoUpSLP::buildExternalUses(
8798 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8799 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8800 DenseMap<Value *, unsigned> ScalarToExtUses;
8801 SmallPtrSet<Value *, 4> ExternalUsers;
8802 // Collect the values that we need to extract from the tree.
8803 for (auto &TEPtr : VectorizableTree) {
8804 TreeEntry *Entry = TEPtr.get();
8805
8806 // No need to handle users of gathered values.
8807 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8808 continue;
8809
8810 // For each lane:
8811 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8812 Value *Scalar = Entry->Scalars[Lane];
8813 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8814 continue;
8815
8816 // All uses must be replaced already? No need to do it again.
8817 auto It = ScalarToExtUses.find(Scalar);
8818 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8819 continue;
8820
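// Too many uses to scan cheaply: if the scalar has at least as many uses as
// there are vectorized scalars plus one, conservatively assume an external
// user and record an extract with an unknown (null) user.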
8821 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8822 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8823 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8824 << " from " << *Scalar << "for many users.\n");
8825 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8826 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8827 ExternalUsesWithNonUsers.insert(Scalar);
8828 continue;
8829 }
8830
8831 // Check if the scalar is externally used as an extra arg.
8832 const auto ExtI = ExternallyUsedValues.find(Scalar);
8833 if (ExtI != ExternallyUsedValues.end()) {
8834 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8835 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8836 << FoundLane << " from " << *Scalar << ".\n");
8837 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8838 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8839 continue;
8840 }
8841 for (User *U : Scalar->users()) {
8842 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8843
8844 Instruction *UserInst = dyn_cast<Instruction>(U);
8845 if (!UserInst || isDeleted(UserInst))
8846 continue;
8847
8848 // Ignore users in the user ignore list.
8849 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8850 continue;
8851
8852 // Skip in-tree scalars that become vectors
8853 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8854 !UseEntries.empty()) {
8855 // Some in-tree scalars will remain as scalar in vectorized
8856 // instructions. If that is the case, the one in FoundLane will
8857 // be used.
8858 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8859 isa<LoadInst, StoreInst>(UserInst)) ||
8860 isa<CallInst>(UserInst)) ||
8861 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8862 return UseEntry->State == TreeEntry::ScatterVectorize ||
8863 !doesInTreeUserNeedToExtract(
8864 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8865 TTI);
8866 })) {
8867 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8868 << ".\n");
8869 assert(none_of(UseEntries,
8870 [](TreeEntry *UseEntry) {
8871 return UseEntry->isGather();
8872 }) &&
8873 "Bad state");
8874 continue;
8875 }
8876 U = nullptr;
8877 if (It != ScalarToExtUses.end()) {
8878 ExternalUses[It->second].User = nullptr;
8879 break;
8880 }
8881 }
8882
8883 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8884 U = nullptr;
8885 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8886 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8887 << " from lane " << FoundLane << " from " << *Scalar
8888 << ".\n");
8889 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8890 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8891 ExternalUsesWithNonUsers.insert(Scalar);
8892 if (!U)
8893 break;
8894 }
8895 }
8896 }
8897}
8898
8899 SmallVector<SmallVector<StoreInst *>>
8900 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8901 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
8902 SmallVector<StoreInst *>>
8903 PtrToStoresMap;
8904 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8905 Value *V = TE->Scalars[Lane];
8906 // Don't iterate over the users of constant data.
8907 if (!isa<Instruction>(V))
8908 continue;
8909 // To save compilation time we don't visit if we have too many users.
8910 if (V->hasNUsesOrMore(UsesLimit))
8911 break;
8912
8913 // Collect stores per pointer object.
8914 for (User *U : V->users()) {
8915 auto *SI = dyn_cast<StoreInst>(U);
8916 // Test whether we can handle the store. V might be a global, which could
8917 // be used in a different function.
8918 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8919 !isValidElementType(SI->getValueOperand()->getType()))
8920 continue;
8921 // Skip entry if already vectorized.
8922 if (isVectorized(U))
8923 continue;
8924
8925 Value *Ptr =
8926 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8927 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8928 SI->getValueOperand()->getType(), Ptr}];
8929 // For now just keep one store per pointer object per lane.
8930 // TODO: Extend this to support multiple stores per pointer per lane
8931 if (StoresVec.size() > Lane)
8932 continue;
8933 if (!StoresVec.empty()) {
8934 std::optional<int64_t> Diff = getPointersDiff(
8935 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8936 SI->getValueOperand()->getType(),
8937 StoresVec.front()->getPointerOperand(), *DL, *SE,
8938 /*StrictCheck=*/true);
8939 // We failed to compare the pointers so just abandon this store.
8940 if (!Diff)
8941 continue;
8942 }
8943 StoresVec.push_back(SI);
8944 }
8945 }
8946 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8947 unsigned I = 0;
8948 for (auto &P : PtrToStoresMap) {
8949 Res[I].swap(P.second);
8950 ++I;
8951 }
8952 return Res;
8953}
8954
8955bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8956 OrdersType &ReorderIndices) const {
8957 // We check whether the stores in StoresVec can form a vector by sorting them
8958 // and checking whether they are consecutive.
8959
8960 // To avoid calling getPointersDiff() while sorting we create a vector of
8961 // pairs {store, offset from first} and sort this instead.
8962 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8963 StoreInst *S0 = StoresVec[0];
8964 StoreOffsetVec.emplace_back(0, 0);
8965 Type *S0Ty = S0->getValueOperand()->getType();
8966 Value *S0Ptr = S0->getPointerOperand();
8967 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8968 StoreInst *SI = StoresVec[Idx];
8969 std::optional<int64_t> Diff =
8970 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8971 SI->getPointerOperand(), *DL, *SE,
8972 /*StrictCheck=*/true);
8973 StoreOffsetVec.emplace_back(*Diff, Idx);
8974 }
8975
8976 // Check if the stores are consecutive by checking if their difference is 1.
8977 if (StoreOffsetVec.size() != StoresVec.size())
8978 return false;
8979 sort(StoreOffsetVec, llvm::less_first());
8980 unsigned Idx = 0;
8981 int64_t PrevDist = 0;
8982 for (const auto &P : StoreOffsetVec) {
8983 if (Idx > 0 && P.first != PrevDist + 1)
8984 return false;
8985 PrevDist = P.first;
8986 ++Idx;
8987 }
8988
8989 // Calculate the shuffle indices according to their offset against the sorted
8990 // StoreOffsetVec.
8991 ReorderIndices.assign(StoresVec.size(), 0);
8992 bool IsIdentity = true;
8993 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8994 ReorderIndices[P.second] = I;
8995 IsIdentity &= P.second == I;
8996 }
8997 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8998 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8999 // same convention here.
9000 if (IsIdentity)
9001 ReorderIndices.clear();
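// NOTE (editorial illustration, not part of the original source): element
// offsets {0, 2, 1, 3} relative to StoresVec[0] are consecutive once sorted,
// and the loop above yields ReorderIndices = {0, 2, 1, 3}; offsets that are
// already {0, 1, 2, 3} produce the identity order, which is cleared to the
// empty order per the convention described above.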
9002
9003 return true;
9004}
9005
9006#ifndef NDEBUG
9008 for (unsigned Idx : Order)
9009 dbgs() << Idx << ", ";
9010 dbgs() << "\n";
9011}
9012#endif
9013
9014SmallVector<BoUpSLP::OrdersType, 1>
9015BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9016 unsigned NumLanes = TE->Scalars.size();
9017
9018 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9019
9020 // Holds the reorder indices for each candidate store vector that is a user of
9021 // the current TreeEntry.
9022 SmallVector<OrdersType, 1> ExternalReorderIndices;
9023
9024 // Now inspect the stores collected per pointer and look for vectorization
9025 // candidates. For each candidate calculate the reorder index vector and push
9026 // it into `ExternalReorderIndices`
9027 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9028 // If we have fewer than NumLanes stores, then we can't form a vector.
9029 if (StoresVec.size() != NumLanes)
9030 continue;
9031
9032 // If the stores are not consecutive then abandon this StoresVec.
9033 OrdersType ReorderIndices;
9034 if (!canFormVector(StoresVec, ReorderIndices))
9035 continue;
9036
9037 // We now know that the scalars in StoresVec can form a vector instruction,
9038 // so set the reorder indices.
9039 ExternalReorderIndices.push_back(ReorderIndices);
9040 }
9041 return ExternalReorderIndices;
9042}
9043
9044void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9045 const SmallDenseSet<Value *> &UserIgnoreLst) {
9046 deleteTree();
9047 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9048 "TreeEntryToStridedPtrInfoMap is not cleared");
9049 UserIgnoreList = &UserIgnoreLst;
9050 if (!allSameType(Roots))
9051 return;
9052 buildTreeRec(Roots, 0, EdgeInfo());
9053}
9054
9055void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9056 deleteTree();
9057 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9058 "TreeEntryToStridedPtrInfoMap is not cleared");
9059 if (!allSameType(Roots))
9060 return;
9061 buildTreeRec(Roots, 0, EdgeInfo());
9062}
9063
9064/// Tries to find a subvector of loads and builds a new vector of only loads if
9065/// it can be profitable.
9066static void gatherPossiblyVectorizableLoads(
9067 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9068 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9069 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9070 bool AddNew = true) {
9071 if (VL.empty())
9072 return;
9073 Type *ScalarTy = getValueType(VL.front());
9074 if (!isValidElementType(ScalarTy))
9075 return;
9076 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9077 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9078 for (Value *V : VL) {
9079 auto *LI = dyn_cast<LoadInst>(V);
9080 if (!LI)
9081 continue;
9082 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9083 continue;
9084 bool IsFound = false;
9085 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9086 assert(LI->getParent() == Data.front().first->getParent() &&
9087 LI->getType() == Data.front().first->getType() &&
9088 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9089 getUnderlyingObject(Data.front().first->getPointerOperand(),
9090 RecursionMaxDepth) &&
9091 "Expected loads with the same type, same parent and same "
9092 "underlying pointer.");
9093 std::optional<int64_t> Dist = getPointersDiff(
9094 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9095 Data.front().first->getPointerOperand(), DL, SE,
9096 /*StrictCheck=*/true);
9097 if (!Dist)
9098 continue;
9099 auto It = Map.find(*Dist);
9100 if (It != Map.end() && It->second != LI)
9101 continue;
9102 if (It == Map.end()) {
9103 Data.emplace_back(LI, *Dist);
9104 Map.try_emplace(*Dist, LI);
9105 }
9106 IsFound = true;
9107 break;
9108 }
9109 if (!IsFound) {
9110 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9111 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9112 }
9113 }
9114 auto FindMatchingLoads =
9115 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9116 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9117 &GatheredLoads,
9118 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9119 int64_t &Offset, unsigned &Start) {
9120 if (Loads.empty())
9121 return GatheredLoads.end();
9122 LoadInst *LI = Loads.front().first;
9123 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9124 if (Idx < Start)
9125 continue;
9126 ToAdd.clear();
9127 if (LI->getParent() != Data.front().first->getParent() ||
9128 LI->getType() != Data.front().first->getType())
9129 continue;
9130 std::optional<int64_t> Dist =
9131 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9132 Data.front().first->getType(),
9133 Data.front().first->getPointerOperand(), DL, SE,
9134 /*StrictCheck=*/true);
9135 if (!Dist)
9136 continue;
9137 SmallSet<int64_t, 4> DataDists;
9138 SmallPtrSet<LoadInst *, 4> DataLoads;
9139 for (std::pair<LoadInst *, int64_t> P : Data) {
9140 DataDists.insert(P.second);
9141 DataLoads.insert(P.first);
9142 }
9143 // Found matching gathered loads - check if all loads are unique or
9144 // can be effectively vectorized.
9145 unsigned NumUniques = 0;
9146 for (auto [Cnt, Pair] : enumerate(Loads)) {
9147 bool Used = DataLoads.contains(Pair.first);
9148 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9149 ++NumUniques;
9150 ToAdd.insert(Cnt);
9151 } else if (Used) {
9152 Repeated.insert(Cnt);
9153 }
9154 }
9155 if (NumUniques > 0 &&
9156 (Loads.size() == NumUniques ||
9157 (Loads.size() - NumUniques >= 2 &&
9158 Loads.size() - NumUniques >= Loads.size() / 2 &&
9159 (has_single_bit(Data.size() + NumUniques) ||
9160 bit_ceil(Data.size()) <
9161 bit_ceil(Data.size() + NumUniques))))) {
9162 Offset = *Dist;
9163 Start = Idx + 1;
9164 return std::next(GatheredLoads.begin(), Idx);
9165 }
9166 }
9167 ToAdd.clear();
9168 return GatheredLoads.end();
9169 };
9170 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9171 unsigned Start = 0;
9172 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9173 int64_t Offset = 0;
9174 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9175 Offset, Start);
9176 while (It != GatheredLoads.end()) {
9177 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9178 for (unsigned Idx : LocalToAdd)
9179 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9180 ToAdd.insert_range(LocalToAdd);
9181 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9182 Start);
9183 }
9184 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9185 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9186 })) {
9187 auto AddNewLoads =
9188 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9189 for (unsigned Idx : seq<unsigned>(Data.size())) {
9190 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9191 continue;
9192 Loads.push_back(Data[Idx]);
9193 }
9194 };
9195 if (!AddNew) {
9196 LoadInst *LI = Data.front().first;
9197 It = find_if(
9198 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9199 return PD.front().first->getParent() == LI->getParent() &&
9200 PD.front().first->getType() == LI->getType();
9201 });
9202 while (It != GatheredLoads.end()) {
9203 AddNewLoads(*It);
9204 It = std::find_if(
9205 std::next(It), GatheredLoads.end(),
9206 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9207 return PD.front().first->getParent() == LI->getParent() &&
9208 PD.front().first->getType() == LI->getType();
9209 });
9210 }
9211 }
9212 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9213 AddNewLoads(GatheredLoads.emplace_back());
9214 }
9215 }
9216}
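// NOTE (editorial illustration, not part of the original source): for
// VL = {load A[0], load A[2], load A[1]} from one block, all three loads join
// a single cluster with distances {0, 2, 1} from the first load; a load whose
// distance to the cluster base cannot be computed (e.g. a variable offset)
// starts a new cluster. FindMatchingLoads then tries to fold each cluster
// into an existing GatheredLoads list for the same block/type, rebasing the
// distances by the offset between the two base loads, so only genuinely new
// loads are appended.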
9217
9218void BoUpSLP::tryToVectorizeGatheredLoads(
9219 const SmallMapVector<
9220 std::tuple<BasicBlock *, Value *, Type *>,
9221 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9222 &GatheredLoads) {
9223 GatheredLoadsEntriesFirst = VectorizableTree.size();
9224
9225 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9226 LoadEntriesToVectorize.size());
9227 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9228 Set.insert_range(VectorizableTree[Idx]->Scalars);
9229
9230 // Sort loads by distance.
9231 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9232 const std::pair<LoadInst *, int64_t> &L2) {
9233 return L1.second > L2.second;
9234 };
9235
9236 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9237 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9238 Loads.size());
9239 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9240 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9241 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9242 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9243 };
9244
9245 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9246 BoUpSLP::ValueSet &VectorizedLoads,
9247 SmallVectorImpl<LoadInst *> &NonVectorized,
9248 bool Final, unsigned MaxVF) {
9249 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9250 unsigned StartIdx = 0;
9251 SmallVector<int> CandidateVFs;
9252 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9253 CandidateVFs.push_back(MaxVF);
9254 for (int NumElts = getFloorFullVectorNumberOfElements(
9255 *TTI, Loads.front()->getType(), MaxVF);
9256 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9257 *TTI, Loads.front()->getType(), NumElts - 1)) {
9258 CandidateVFs.push_back(NumElts);
9259 if (VectorizeNonPowerOf2 && NumElts > 2)
9260 CandidateVFs.push_back(NumElts - 1);
9261 }
9262
9263 if (Final && CandidateVFs.empty())
9264 return Results;
9265
9266 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9267 for (unsigned NumElts : CandidateVFs) {
9268 if (Final && NumElts > BestVF)
9269 continue;
9270 SmallVector<unsigned> MaskedGatherVectorized;
9271 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9272 ++Cnt) {
9273 ArrayRef<LoadInst *> Slice =
9274 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9275 if (VectorizedLoads.count(Slice.front()) ||
9276 VectorizedLoads.count(Slice.back()) ||
9278 continue;
9279 // Check if it is profitable to try vectorizing gathered loads. It is
9280 // profitable if we have at least 3 consecutive loads or if we have
9281 // fewer but all their users are vectorized or deleted.
9282 bool AllowToVectorize = false;
9283 // Check if it is profitable to vectorize 2-elements loads.
9284 if (NumElts == 2) {
9285 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9286 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9287 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9288 for (LoadInst *LI : Slice) {
9289 // If single use/user - allow to vectorize.
9290 if (LI->hasOneUse())
9291 continue;
9292 // 1. Check if number of uses equals number of users.
9293 // 2. All users are deleted.
9294 // 3. The load broadcasts are not allowed or the load is not
9295 // broadcasted.
9296 if (static_cast<unsigned int>(std::distance(
9297 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9298 return false;
9299 if (!IsLegalBroadcastLoad)
9300 continue;
9301 if (LI->hasNUsesOrMore(UsesLimit))
9302 return false;
9303 for (User *U : LI->users()) {
9304 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9305 continue;
9306 for (const TreeEntry *UTE : getTreeEntries(U)) {
9307 for (int I : seq<int>(UTE->getNumOperands())) {
9308 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9309 return V == LI || isa<PoisonValue>(V);
9310 }))
9311 // Found legal broadcast - do not vectorize.
9312 return false;
9313 }
9314 }
9315 }
9316 }
9317 return true;
9318 };
9319 AllowToVectorize = CheckIfAllowed(Slice);
9320 } else {
9321 AllowToVectorize =
9322 (NumElts >= 3 ||
9323 any_of(ValueToGatherNodes.at(Slice.front()),
9324 [=](const TreeEntry *TE) {
9325 return TE->Scalars.size() == 2 &&
9326 ((TE->Scalars.front() == Slice.front() &&
9327 TE->Scalars.back() == Slice.back()) ||
9328 (TE->Scalars.front() == Slice.back() &&
9329 TE->Scalars.back() == Slice.front()));
9330 })) &&
9331 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9332 Slice.size());
9333 }
9334 if (AllowToVectorize) {
9335 SmallVector<Value *> PointerOps;
9336 OrdersType CurrentOrder;
9337 // Try to build vector load.
9338 ArrayRef<Value *> Values(
9339 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9340 StridedPtrInfo SPtrInfo;
9341 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9342 PointerOps, SPtrInfo, &BestVF);
9343 if (LS != LoadsState::Gather ||
9344 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9345 if (LS == LoadsState::ScatterVectorize) {
9346 if (MaskedGatherVectorized.empty() ||
9347 Cnt >= MaskedGatherVectorized.back() + NumElts)
9348 MaskedGatherVectorized.push_back(Cnt);
9349 continue;
9350 }
9351 if (LS != LoadsState::Gather) {
9352 Results.emplace_back(Values, LS);
9353 VectorizedLoads.insert_range(Slice);
9354 // If we vectorized initial block, no need to try to vectorize it
9355 // again.
9356 if (Cnt == StartIdx)
9357 StartIdx += NumElts;
9358 }
9359 // Check if the whole array was vectorized already - exit.
9360 if (StartIdx >= Loads.size())
9361 break;
9362 // Erase last masked gather candidate, if another candidate within
9363 // the range is found to be better.
9364 if (!MaskedGatherVectorized.empty() &&
9365 Cnt < MaskedGatherVectorized.back() + NumElts)
9366 MaskedGatherVectorized.pop_back();
9367 Cnt += NumElts - 1;
9368 continue;
9369 }
9370 }
9371 if (!AllowToVectorize || BestVF == 0)
9373 }
9374 // Mark masked gathers candidates as vectorized, if any.
9375 for (unsigned Cnt : MaskedGatherVectorized) {
9376 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9377 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9378 ArrayRef<Value *> Values(
9379 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9380 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9381 VectorizedLoads.insert_range(Slice);
9382 // If we vectorized initial block, no need to try to vectorize it again.
9383 if (Cnt == StartIdx)
9384 StartIdx += NumElts;
9385 }
9386 }
9387 for (LoadInst *LI : Loads) {
9388 if (!VectorizedLoads.contains(LI))
9389 NonVectorized.push_back(LI);
9390 }
9391 return Results;
9392 };
9393 auto ProcessGatheredLoads =
9394 [&, &TTI = *TTI](
9396 bool Final = false) {
9397 SmallVector<LoadInst *> NonVectorized;
9398 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9399 GatheredLoads) {
9400 if (LoadsDists.size() <= 1) {
9401 NonVectorized.push_back(LoadsDists.back().first);
9402 continue;
9403 }
9404 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9405 LoadsDists);
9406 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9407 stable_sort(LocalLoadsDists, LoadSorter);
9408 SmallVector<LoadInst *> Loads;
9409 unsigned MaxConsecutiveDistance = 0;
9410 unsigned CurrentConsecutiveDist = 1;
9411 int64_t LastDist = LocalLoadsDists.front().second;
9412 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9413 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9414 if (isVectorized(L.first))
9415 continue;
9416 assert(LastDist >= L.second &&
9417 "Expected first distance always not less than second");
9418 if (static_cast<uint64_t>(LastDist - L.second) ==
9419 CurrentConsecutiveDist) {
9420 ++CurrentConsecutiveDist;
9421 MaxConsecutiveDistance =
9422 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9423 Loads.push_back(L.first);
9424 continue;
9425 }
9426 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9427 !Loads.empty())
9428 Loads.pop_back();
9429 CurrentConsecutiveDist = 1;
9430 LastDist = L.second;
9431 Loads.push_back(L.first);
9432 }
9433 if (Loads.size() <= 1)
9434 continue;
9435 if (AllowMaskedGather)
9436 MaxConsecutiveDistance = Loads.size();
9437 else if (MaxConsecutiveDistance < 2)
9438 continue;
9439 BoUpSLP::ValueSet VectorizedLoads;
9440 SmallVector<LoadInst *> SortedNonVectorized;
9441 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9442 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9443 Final, MaxConsecutiveDistance);
9444 if (!Results.empty() && !SortedNonVectorized.empty() &&
9445 OriginalLoads.size() == Loads.size() &&
9446 MaxConsecutiveDistance == Loads.size() &&
9448 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9449 return P.second == LoadsState::ScatterVectorize;
9450 })) {
9451 VectorizedLoads.clear();
9452 SmallVector<LoadInst *> UnsortedNonVectorized;
9453 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9454 UnsortedResults =
9455 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9456 UnsortedNonVectorized, Final,
9457 OriginalLoads.size());
9458 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9459 SortedNonVectorized.swap(UnsortedNonVectorized);
9460 Results.swap(UnsortedResults);
9461 }
9462 }
9463 for (auto [Slice, _] : Results) {
9464 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9465 << Slice.size() << ")\n");
9466 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9467 for (Value *L : Slice)
9468 if (!isVectorized(L))
9469 SortedNonVectorized.push_back(cast<LoadInst>(L));
9470 continue;
9471 }
9472
9473 // Select maximum VF as a maximum of user gathered nodes and
9474 // distance between scalar loads in these nodes.
9475 unsigned MaxVF = Slice.size();
9476 unsigned UserMaxVF = 0;
9477 unsigned InterleaveFactor = 0;
9478 if (MaxVF == 2) {
9479 UserMaxVF = MaxVF;
9480 } else {
9481 // Find the distance between segments of the interleaved loads.
9482 std::optional<unsigned> InterleavedLoadsDistance = 0;
9483 unsigned Order = 0;
9484 std::optional<unsigned> CommonVF = 0;
9485 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9486 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9487 for (auto [Idx, V] : enumerate(Slice)) {
9488 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9489 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9490 unsigned Pos =
9491 EntryToPosition.try_emplace(E, Idx).first->second;
9492 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9493 if (CommonVF) {
9494 if (*CommonVF == 0) {
9495 CommonVF = E->Scalars.size();
9496 continue;
9497 }
9498 if (*CommonVF != E->Scalars.size())
9499 CommonVF.reset();
9500 }
9501 // Check if the load is part of an interleaved load.
9502 if (Pos != Idx && InterleavedLoadsDistance) {
9503 if (!DeinterleavedNodes.contains(E) &&
9504 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9505 if (isa<Constant>(V))
9506 return false;
9507 if (isVectorized(V))
9508 return true;
9509 const auto &Nodes = ValueToGatherNodes.at(V);
9510 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9511 !is_contained(Slice, V);
9512 })) {
9513 InterleavedLoadsDistance.reset();
9514 continue;
9515 }
9516 DeinterleavedNodes.insert(E);
9517 if (*InterleavedLoadsDistance == 0) {
9518 InterleavedLoadsDistance = Idx - Pos;
9519 continue;
9520 }
9521 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9522 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9523 InterleavedLoadsDistance.reset();
9524 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9525 }
9526 }
9527 }
9528 DeinterleavedNodes.clear();
9529 // Check if the large load represents an interleaved load operation.
9530 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9531 CommonVF.value_or(0) != 0) {
9532 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9533 unsigned VF = *CommonVF;
9534 OrdersType Order;
9535 SmallVector<Value *> PointerOps;
9536 StridedPtrInfo SPtrInfo;
9537 // Segmented load detected - vectorize at maximum vector factor.
9538 if (InterleaveFactor <= Slice.size() &&
9539 TTI.isLegalInterleavedAccessType(
9540 getWidenedType(Slice.front()->getType(), VF),
9541 InterleaveFactor,
9542 cast<LoadInst>(Slice.front())->getAlign(),
9543 cast<LoadInst>(Slice.front())
9544 ->getPointerAddressSpace()) &&
9545 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9546 SPtrInfo) == LoadsState::Vectorize) {
9547 UserMaxVF = InterleaveFactor * VF;
9548 } else {
9549 InterleaveFactor = 0;
9550 }
9551 }
9552 // Cannot represent the loads as consecutive vectorizable nodes -
9553 // just exit.
9554 unsigned ConsecutiveNodesSize = 0;
9555 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9556 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9557 [&, Slice = Slice](const auto &P) {
9558 const auto *It = find_if(Slice, [&](Value *V) {
9559 return std::get<1>(P).contains(V);
9560 });
9561 if (It == Slice.end())
9562 return false;
9563 const TreeEntry &TE =
9564 *VectorizableTree[std::get<0>(P)];
9565 ArrayRef<Value *> VL = TE.Scalars;
9566 OrdersType Order;
9567 SmallVector<Value *> PointerOps;
9568 StridedPtrInfo SPtrInfo;
9569 LoadsState State = canVectorizeLoads(
9570 VL, VL.front(), Order, PointerOps, SPtrInfo);
9571 if (State == LoadsState::ScatterVectorize ||
9573 return false;
9574 ConsecutiveNodesSize += VL.size();
9575 size_t Start = std::distance(Slice.begin(), It);
9576 size_t Sz = Slice.size() - Start;
9577 return Sz < VL.size() ||
9578 Slice.slice(Start, VL.size()) != VL;
9579 }))
9580 continue;
9581 // Try to build long masked gather loads.
9582 UserMaxVF = bit_ceil(UserMaxVF);
9583 if (InterleaveFactor == 0 &&
9584 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9585 [&, Slice = Slice](unsigned Idx) {
9586 OrdersType Order;
9587 SmallVector<Value *> PointerOps;
9588 StridedPtrInfo SPtrInfo;
9589 return canVectorizeLoads(
9590 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9591 Slice[Idx * UserMaxVF], Order, PointerOps,
9592 SPtrInfo) == LoadsState::ScatterVectorize;
9593 }))
9594 UserMaxVF = MaxVF;
9595 if (Slice.size() != ConsecutiveNodesSize)
9596 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9597 }
9598 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9599 bool IsVectorized = true;
9600 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9601 ArrayRef<Value *> SubSlice =
9602 Slice.slice(I, std::min(VF, E - I));
9603 if (isVectorized(SubSlice.front()))
9604 continue;
9605 // Check if the subslice is a to-be-vectorized entry, which is not
9606 // equal to the entry itself.
9607 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9608 [&](const auto &P) {
9609 return !SubSlice.equals(
9610 VectorizableTree[std::get<0>(P)]
9611 ->Scalars) &&
9612 set_is_subset(SubSlice, std::get<1>(P));
9613 }))
9614 continue;
9615 unsigned Sz = VectorizableTree.size();
9616 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9617 if (Sz == VectorizableTree.size()) {
9618 IsVectorized = false;
9619 // Try non-interleaved vectorization with smaller vector
9620 // factor.
9621 if (InterleaveFactor > 0) {
9622 VF = 2 * (MaxVF / InterleaveFactor);
9623 InterleaveFactor = 0;
9624 }
9625 continue;
9626 }
9627 }
9628 if (IsVectorized)
9629 break;
9630 }
9631 }
9632 NonVectorized.append(SortedNonVectorized);
9633 }
9634 return NonVectorized;
9635 };
9636 for (const auto &GLs : GatheredLoads) {
9637 const auto &Ref = GLs.second;
9638 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9639 if (!Ref.empty() && !NonVectorized.empty() &&
9640 std::accumulate(
9641 Ref.begin(), Ref.end(), 0u,
9642 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9643 -> unsigned { return S + LoadsDists.size(); }) !=
9644 NonVectorized.size() &&
9645 IsMaskedGatherSupported(NonVectorized)) {
9647 FinalGatheredLoads;
9648 for (LoadInst *LI : NonVectorized) {
9649 // Reinsert non-vectorized loads into another list of loads with the
9650 // same base pointers.
9651 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9652 FinalGatheredLoads,
9653 /*AddNew=*/false);
9654 }
9655 // Final attempt to vectorize non-vectorized loads.
9656 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9657 }
9658 }
9659 // Try to vectorize postponed load entries, previously marked as gathered.
9660 for (unsigned Idx : LoadEntriesToVectorize) {
9661 const TreeEntry &E = *VectorizableTree[Idx];
9662 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9663 // Avoid reordering, if possible.
9664 if (!E.ReorderIndices.empty()) {
9665 // Build a mask out of the reorder indices and reorder scalars per this
9666 // mask.
9667 SmallVector<int> ReorderMask;
9668 inversePermutation(E.ReorderIndices, ReorderMask);
9669 reorderScalars(GatheredScalars, ReorderMask);
9670 }
9671 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9672 }
9673 // If no new entries were created, treat it as if no gathered-load entries
9674 // need to be handled.
9675 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9676 VectorizableTree.size())
9677 GatheredLoadsEntriesFirst.reset();
9678}
9679
9680/// Generates a key/subkey pair for the given value to provide effective sorting
9681/// of the values and better detection of vectorizable value sequences. The
9682/// keys/subkeys can be used for better sorting of the values themselves (keys)
9683/// and within value subgroups (subkeys).
9684static std::pair<size_t, size_t> generateKeySubkey(
9685 Value *V, const TargetLibraryInfo *TLI,
9686 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9687 bool AllowAlternate) {
9688 hash_code Key = hash_value(V->getValueID() + 2);
9689 hash_code SubKey = hash_value(0);
9690 // Sort the loads by the distance between the pointers.
9691 if (auto *LI = dyn_cast<LoadInst>(V)) {
9692 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9693 if (LI->isSimple())
9694 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9695 else
9696 Key = SubKey = hash_value(LI);
9697 } else if (isVectorLikeInstWithConstOps(V)) {
9698 // Sort extracts by the vector operands.
9699 if (isa<ExtractElementInst, UndefValue>(V))
9700 Key = hash_value(Value::UndefValueVal + 1);
9701 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9702 if (!isUndefVector(EI->getVectorOperand()).all() &&
9703 !isa<UndefValue>(EI->getIndexOperand()))
9704 SubKey = hash_value(EI->getVectorOperand());
9705 }
9706 } else if (auto *I = dyn_cast<Instruction>(V)) {
9707 // Sort other instructions just by the opcodes except for CMPInst.
9708 // For CMP also sort by the predicate kind.
9709 if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
9710 isValidForAlternation(I->getOpcode())) {
9711 if (AllowAlternate)
9712 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9713 else
9714 Key = hash_combine(hash_value(I->getOpcode()), Key);
9715 SubKey = hash_combine(
9716 hash_value(I->getOpcode()), hash_value(I->getType()),
9717 hash_value(isa<BinaryOperator>(I)
9718 ? I->getType()
9719 : cast<CastInst>(I)->getOperand(0)->getType()));
9720 // For casts, look through the only operand to improve compile time.
9721 if (isa<CastInst>(I)) {
9722 std::pair<size_t, size_t> OpVals =
9723 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9724 /*AllowAlternate=*/true);
9725 Key = hash_combine(OpVals.first, Key);
9726 SubKey = hash_combine(OpVals.first, SubKey);
9727 }
9728 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9729 CmpInst::Predicate Pred = CI->getPredicate();
9730 if (CI->isCommutative())
9731 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9732 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9733 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9734 hash_value(SwapPred),
9735 hash_value(CI->getOperand(0)->getType()));
9736 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9737 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9738 if (isTriviallyVectorizable(ID)) {
9739 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9740 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9741 SubKey = hash_combine(hash_value(I->getOpcode()),
9742 hash_value(Call->getCalledFunction()));
9743 } else {
9745 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9746 }
9747 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9748 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9749 hash_value(Op.Tag), SubKey);
9750 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9751 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9752 SubKey = hash_value(Gep->getPointerOperand());
9753 else
9754 SubKey = hash_value(Gep);
9755 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9756 !isa<ConstantInt>(I->getOperand(1))) {
9757 // Do not try to vectorize instructions with potentially high cost.
9758 SubKey = hash_value(I);
9759 } else {
9760 SubKey = hash_value(I->getOpcode());
9761 }
9762 Key = hash_combine(hash_value(I->getParent()), Key);
9763 }
9764 return std::make_pair(Key, SubKey);
9765}
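// NOTE (editorial illustration, not part of the original source): two simple
// loads of the same type in one block get the same Key (value kind, Load
// opcode, type), while their SubKey is produced by the caller-provided
// LoadsSubkeyGenerator, so the caller decides how loads are subgrouped (e.g.
// by pointer proximity). For casts the subkey additionally folds in the key
// of the single operand, so two zexts of i8 loads also land in one subgroup.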
9766
9767/// Checks if the specified instruction \p I is a main operation for the given
9768/// \p MainOp and \p AltOp instructions.
9769static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9770 Instruction *AltOp, const TargetLibraryInfo &TLI);
9771
9772bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9773 ArrayRef<Value *> VL) const {
9774 Type *ScalarTy = S.getMainOp()->getType();
9775 unsigned Opcode0 = S.getOpcode();
9776 unsigned Opcode1 = S.getAltOpcode();
9777 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9778 // If this pattern is supported by the target then consider it profitable.
9779 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9780 Opcode1, OpcodeMask))
9781 return true;
9782 SmallVector<ValueList> Operands;
9783 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9784 Operands.emplace_back();
9785 // Prepare the operand vector.
9786 for (Value *V : VL) {
9787 if (isa<PoisonValue>(V)) {
9788 Operands.back().push_back(
9789 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9790 continue;
9791 }
9792 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9793 }
9794 }
9795 if (Operands.size() == 2) {
9796 // Try to find the best operand candidates.
9797 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9798 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9799 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9800 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9801 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9802 std::optional<int> Res = findBestRootPair(Candidates);
9803 switch (Res.value_or(0)) {
9804 case 0:
9805 break;
9806 case 1:
9807 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9808 break;
9809 case 2:
9810 std::swap(Operands[0][I], Operands[1][I]);
9811 break;
9812 default:
9813 llvm_unreachable("Unexpected index.");
9814 }
9815 }
9816 }
9817 DenseSet<unsigned> UniqueOpcodes;
9818 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9819 unsigned NonInstCnt = 0;
9820 // Estimate number of instructions, required for the vectorized node and for
9821 // the buildvector node.
9822 unsigned UndefCnt = 0;
9823 // Count the number of extra shuffles, required for vector nodes.
9824 unsigned ExtraShuffleInsts = 0;
9825 // Check that operands do not contain same values and create either perfect
9826 // diamond match or shuffled match.
9827 if (Operands.size() == 2) {
9828 // Do not count same operands twice.
9829 if (Operands.front() == Operands.back()) {
9830 Operands.erase(Operands.begin());
9831 } else if (!allConstant(Operands.front()) &&
9832 all_of(Operands.front(), [&](Value *V) {
9833 return is_contained(Operands.back(), V);
9834 })) {
9835 Operands.erase(Operands.begin());
9836 ++ExtraShuffleInsts;
9837 }
9838 }
9839 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9840 // Vectorize the node if:
9841 // 1. At least a single operand is constant or splat.
9842 // 2. Operands have many loop invariants (the instructions are not loop
9843 // invariants).
9844 // 3. At least a single unique operand is supposed to be vectorized.
9845 return none_of(Operands,
9846 [&](ArrayRef<Value *> Op) {
9847 if (allConstant(Op) ||
9848 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9849 getSameOpcode(Op, *TLI)))
9850 return false;
9851 DenseMap<Value *, unsigned> Uniques;
9852 for (Value *V : Op) {
9854 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9855 if (isa<UndefValue>(V))
9856 ++UndefCnt;
9857 continue;
9858 }
9859 auto Res = Uniques.try_emplace(V, 0);
9860 // Found first duplicate - need to add shuffle.
9861 if (!Res.second && Res.first->second == 1)
9862 ++ExtraShuffleInsts;
9863 ++Res.first->getSecond();
9864 if (auto *I = dyn_cast<Instruction>(V))
9865 UniqueOpcodes.insert(I->getOpcode());
9866 else if (Res.second)
9867 ++NonInstCnt;
9868 }
9869 return none_of(Uniques, [&](const auto &P) {
9870 return P.first->hasNUsesOrMore(P.second + 1) &&
9871 none_of(P.first->users(), [&](User *U) {
9872 return isVectorized(U) || Uniques.contains(U);
9873 });
9874 });
9875 }) ||
9876 // Do not vectorize node, if estimated number of vector instructions is
9877 // more than estimated number of buildvector instructions. Number of
9878 // vector operands is number of vector instructions + number of vector
9879 // instructions for operands (buildvectors). Number of buildvector
9880 // instructions is just number_of_operands * number_of_scalars.
9881 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9882 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9883 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9884}
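// NOTE (editorial illustration, not part of the original source): for
// VL = {a+b, c-d, e+f, g-h} on a target with a native add/sub instruction,
// isLegalAltInstr() accepts the opcode mask and the bundle is treated as
// profitable immediately. Otherwise the heuristic above roughly compares an
// estimate of the alternate-node cost (main + alt + blend shuffle plus the
// operand buildvectors) with plain buildvector code and gives up on bundles
// whose operands are mostly unique single-use scalars.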
9885
9886/// Builds the argument types vector for the given call instruction with the
9887/// given \p ID for the specified vector factor.
9888static SmallVector<Type *>
9889buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9890 const unsigned VF, unsigned MinBW,
9891 const TargetTransformInfo *TTI) {
9892 SmallVector<Type *> ArgTys;
9893 for (auto [Idx, Arg] : enumerate(CI->args())) {
9894 if (ID != Intrinsic::not_intrinsic) {
9895 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9896 ArgTys.push_back(Arg->getType());
9897 continue;
9898 }
9899 if (MinBW > 0) {
9900 ArgTys.push_back(
9901 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9902 continue;
9903 }
9904 }
9905 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9906 }
9907 return ArgTys;
9908}
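// NOTE (editorial illustration, not part of the original source): for four
// calls to llvm.powi.f32.i32 widened with VF = 4, the exponent is a scalar
// operand of the intrinsic, so the result is {<4 x float>, i32}; if the node
// is being demoted by the min-bitwidth analysis (e.g. MinBW = 16), the
// non-scalar arguments are emitted as <4 x i16> instead.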
9909
9910/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9911/// function (if possible) calls. Returns invalid cost for the corresponding
9912/// calls, if they cannot be vectorized/will be scalarized.
9913static std::pair<InstructionCost, InstructionCost>
9914getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9915 TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef<Type *> ArgTys) {
9917 auto Shape = VFShape::get(CI->getFunctionType(),
9918 ElementCount::getFixed(VecTy->getNumElements()),
9919 false /*HasGlobalPred*/);
9920 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9921 auto LibCost = InstructionCost::getInvalid();
9922 if (!CI->isNoBuiltin() && VecFunc) {
9923 // Calculate the cost of the vector library call.
9924 // If the corresponding vector call is cheaper, return its cost.
9925 LibCost =
9926 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9927 }
9928 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9929
9930 // Calculate the cost of the vector intrinsic call.
9931 FastMathFlags FMF;
9932 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9933 FMF = FPCI->getFastMathFlags();
9934 const InstructionCost ScalarLimit = 10000;
9935 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9936 LibCost.isValid() ? LibCost : ScalarLimit);
9937 auto IntrinsicCost =
9938 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9939 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9940 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9941 IntrinsicCost = InstructionCost::getInvalid();
9942
9943 return {IntrinsicCost, LibCost};
9944}
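// NOTE (editorial illustration, not part of the original source): when a
// vector library provides a mapping for the call, its call cost becomes the
// budget the intrinsic cost has to beat; without a mapping the intrinsic cost
// only has to stay below the ScalarLimit cap above. If both returned costs end
// up invalid, the caller gathers the bundle instead of vectorizing the calls.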
9945
9946BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9947 const InstructionsState &S, ArrayRef<Value *> VL,
9948 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9949 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9950 assert(S.getMainOp() &&
9951 "Expected instructions with same/alternate opcodes only.");
9952
9953 unsigned ShuffleOrOp =
9954 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9955 Instruction *VL0 = S.getMainOp();
9956 switch (ShuffleOrOp) {
9957 case Instruction::PHI: {
9958 // Too many operands - gather, most probably won't be vectorized.
9959 if (VL0->getNumOperands() > MaxPHINumOperands)
9960 return TreeEntry::NeedToGather;
9961 // Check for terminator values (e.g. invoke).
9962 for (Value *V : VL) {
9963 auto *PHI = dyn_cast<PHINode>(V);
9964 if (!PHI)
9965 continue;
9966 for (Value *Incoming : PHI->incoming_values()) {
9967 Instruction *Term = dyn_cast<Instruction>(Incoming);
9968 if (Term && Term->isTerminator()) {
9969 LLVM_DEBUG(dbgs()
9970 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9971 return TreeEntry::NeedToGather;
9972 }
9973 }
9974 }
9975
9976 return TreeEntry::Vectorize;
9977 }
9978 case Instruction::ExtractElement:
9979 if (any_of(VL, [&](Value *V) {
9980 auto *EI = dyn_cast<ExtractElementInst>(V);
9981 if (!EI)
9982 return true;
9983 return isVectorized(EI->getOperand(0));
9984 }))
9985 return TreeEntry::NeedToGather;
9986 [[fallthrough]];
9987 case Instruction::ExtractValue: {
9988 bool Reuse = canReuseExtract(VL, CurrentOrder);
9989 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9990 // non-full registers).
9991 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9992 return TreeEntry::NeedToGather;
9993 if (Reuse || !CurrentOrder.empty())
9994 return TreeEntry::Vectorize;
9995 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9996 return TreeEntry::NeedToGather;
9997 }
9998 case Instruction::InsertElement: {
9999 // Check that we have a buildvector and not a shuffle of 2 or more
10000 // different vectors.
10001 ValueSet SourceVectors;
10002 for (Value *V : VL) {
10003 if (isa<PoisonValue>(V)) {
10004 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10005 return TreeEntry::NeedToGather;
10006 }
10007 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10008 assert(getElementIndex(V) != std::nullopt &&
10009 "Non-constant or undef index?");
10010 }
10011
10012 if (count_if(VL, [&SourceVectors](Value *V) {
10013 return !SourceVectors.contains(V);
10014 }) >= 2) {
10015 // Found 2nd source vector - cancel.
10016 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10017 "different source vectors.\n");
10018 return TreeEntry::NeedToGather;
10019 }
10020
10021 if (any_of(VL, [&SourceVectors](Value *V) {
10022 // The last InsertElement can have multiple uses.
10023 return SourceVectors.contains(V) && !V->hasOneUse();
10024 })) {
10025 assert(SLPReVec && "Only supported by REVEC.");
10026 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10027 "multiple uses.\n");
10028 return TreeEntry::NeedToGather;
10029 }
10030
10031 return TreeEntry::Vectorize;
10032 }
10033 case Instruction::Load: {
10034 // Check that a vectorized load would load the same memory as a scalar
10035 // load. For example, we don't want to vectorize loads that are smaller
10036 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10037 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10038 // from such a struct, we read/write packed bits disagreeing with the
10039 // unvectorized version.
10040 auto IsGatheredNode = [&]() {
10041 if (!GatheredLoadsEntriesFirst)
10042 return false;
10043 return all_of(VL, [&](Value *V) {
10044 if (isa<PoisonValue>(V))
10045 return true;
10046 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10047 return TE->Idx >= *GatheredLoadsEntriesFirst;
10048 });
10049 });
10050 };
10051 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10052 case LoadsState::Vectorize:
10053 return TreeEntry::Vectorize;
10054 case LoadsState::CompressVectorize:
10055 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10056 // Delay slow vectorized nodes for better vectorization attempts.
10057 LoadEntriesToVectorize.insert(VectorizableTree.size());
10058 return TreeEntry::NeedToGather;
10059 }
10060 return IsGatheredNode() ? TreeEntry::NeedToGather
10061 : TreeEntry::CompressVectorize;
10062 case LoadsState::ScatterVectorize:
10063 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10064 // Delay slow vectorized nodes for better vectorization attempts.
10065 LoadEntriesToVectorize.insert(VectorizableTree.size());
10066 return TreeEntry::NeedToGather;
10067 }
10068 return IsGatheredNode() ? TreeEntry::NeedToGather
10069 : TreeEntry::ScatterVectorize;
10070 case LoadsState::StridedVectorize:
10071 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10072 // Delay slow vectorized nodes for better vectorization attempts.
10073 LoadEntriesToVectorize.insert(VectorizableTree.size());
10074 return TreeEntry::NeedToGather;
10075 }
10076 return IsGatheredNode() ? TreeEntry::NeedToGather
10077 : TreeEntry::StridedVectorize;
10078 case LoadsState::Gather:
10079#ifndef NDEBUG
10080 Type *ScalarTy = VL0->getType();
10081 if (DL->getTypeSizeInBits(ScalarTy) !=
10082 DL->getTypeAllocSizeInBits(ScalarTy))
10083 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10084 else if (any_of(VL, [](Value *V) {
10085 auto *LI = dyn_cast<LoadInst>(V);
10086 return !LI || !LI->isSimple();
10087 }))
10088 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10089 else
10090 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10091#endif // NDEBUG
10093 return TreeEntry::NeedToGather;
10094 }
10095 llvm_unreachable("Unexpected state of loads");
10096 }
10097 case Instruction::ZExt:
10098 case Instruction::SExt:
10099 case Instruction::FPToUI:
10100 case Instruction::FPToSI:
10101 case Instruction::FPExt:
10102 case Instruction::PtrToInt:
10103 case Instruction::IntToPtr:
10104 case Instruction::SIToFP:
10105 case Instruction::UIToFP:
10106 case Instruction::Trunc:
10107 case Instruction::FPTrunc:
10108 case Instruction::BitCast: {
10109 Type *SrcTy = VL0->getOperand(0)->getType();
10110 for (Value *V : VL) {
10111 if (isa<PoisonValue>(V))
10112 continue;
10113 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10114 if (Ty != SrcTy || !isValidElementType(Ty)) {
10115 LLVM_DEBUG(
10116 dbgs() << "SLP: Gathering casts with different src types.\n");
10117 return TreeEntry::NeedToGather;
10118 }
10119 }
10120 return TreeEntry::Vectorize;
10121 }
10122 case Instruction::ICmp:
10123 case Instruction::FCmp: {
10124 // Check that all of the compares have the same predicate.
10125 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10126 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10127 Type *ComparedTy = VL0->getOperand(0)->getType();
10128 for (Value *V : VL) {
10129 if (isa<PoisonValue>(V))
10130 continue;
10131 auto *Cmp = cast<CmpInst>(V);
10132 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10133 Cmp->getOperand(0)->getType() != ComparedTy) {
10134 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10135 return TreeEntry::NeedToGather;
10136 }
10137 }
10138 return TreeEntry::Vectorize;
10139 }
10140 case Instruction::Select:
10141 case Instruction::FNeg:
10142 case Instruction::Add:
10143 case Instruction::FAdd:
10144 case Instruction::Sub:
10145 case Instruction::FSub:
10146 case Instruction::Mul:
10147 case Instruction::FMul:
10148 case Instruction::UDiv:
10149 case Instruction::SDiv:
10150 case Instruction::FDiv:
10151 case Instruction::URem:
10152 case Instruction::SRem:
10153 case Instruction::FRem:
10154 case Instruction::Shl:
10155 case Instruction::LShr:
10156 case Instruction::AShr:
10157 case Instruction::And:
10158 case Instruction::Or:
10159 case Instruction::Xor:
10160 case Instruction::Freeze:
10161 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10162 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10163 auto *I = dyn_cast<Instruction>(V);
10164 return I && I->isBinaryOp() && !I->isFast();
10165 }))
10166 return TreeEntry::NeedToGather;
10167 return TreeEntry::Vectorize;
10168 case Instruction::GetElementPtr: {
10169 // We don't combine GEPs with complicated (nested) indexing.
10170 for (Value *V : VL) {
10171 auto *I = dyn_cast<GetElementPtrInst>(V);
10172 if (!I)
10173 continue;
10174 if (I->getNumOperands() != 2) {
10175 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10176 return TreeEntry::NeedToGather;
10177 }
10178 }
10179
10180 // We can't combine several GEPs into one vector if they operate on
10181 // different types.
10182 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10183 for (Value *V : VL) {
10184 auto *GEP = dyn_cast<GEPOperator>(V);
10185 if (!GEP)
10186 continue;
10187 Type *CurTy = GEP->getSourceElementType();
10188 if (Ty0 != CurTy) {
10189 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10190 return TreeEntry::NeedToGather;
10191 }
10192 }
10193
10194 // We don't combine GEPs with non-constant indexes.
10195 Type *Ty1 = VL0->getOperand(1)->getType();
10196 for (Value *V : VL) {
10197 auto *I = dyn_cast<GetElementPtrInst>(V);
10198 if (!I)
10199 continue;
10200 auto *Op = I->getOperand(1);
10201 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10202 (Op->getType() != Ty1 &&
10203 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10204 Op->getType()->getScalarSizeInBits() >
10205 DL->getIndexSizeInBits(
10206 V->getType()->getPointerAddressSpace())))) {
10207 LLVM_DEBUG(
10208 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10209 return TreeEntry::NeedToGather;
10210 }
10211 }
10212
10213 return TreeEntry::Vectorize;
10214 }
10215 case Instruction::Store: {
10216 // Check if the stores are consecutive or if we need to swizzle them.
10217 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10218 // Avoid types that are padded when being allocated as scalars, while
10219 // being packed together in a vector (such as i1).
10220 if (DL->getTypeSizeInBits(ScalarTy) !=
10221 DL->getTypeAllocSizeInBits(ScalarTy)) {
10222 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10223 return TreeEntry::NeedToGather;
10224 }
10225 // Make sure all stores in the bundle are simple - we can't vectorize
10226 // atomic or volatile stores.
10227 for (Value *V : VL) {
10228 auto *SI = cast<StoreInst>(V);
10229 if (!SI->isSimple()) {
10230 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10231 return TreeEntry::NeedToGather;
10232 }
10233 PointerOps.push_back(SI->getPointerOperand());
10234 }
10235
10236 // Check the order of pointer operands.
10237 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10238 Value *Ptr0;
10239 Value *PtrN;
10240 if (CurrentOrder.empty()) {
10241 Ptr0 = PointerOps.front();
10242 PtrN = PointerOps.back();
10243 } else {
10244 Ptr0 = PointerOps[CurrentOrder.front()];
10245 PtrN = PointerOps[CurrentOrder.back()];
10246 }
10247 std::optional<int64_t> Dist =
10248 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10249 // Check that the sorted pointer operands are consecutive.
10250 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10251 return TreeEntry::Vectorize;
10252 }
10253
10254 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10255 return TreeEntry::NeedToGather;
10256 }
10257 case Instruction::Call: {
10258 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10259 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10260 auto *I = dyn_cast<Instruction>(V);
10261 return I && !I->isFast();
10262 }))
10263 return TreeEntry::NeedToGather;
10264 // Check if the calls are all to the same vectorizable intrinsic or
10265 // library function.
10266 CallInst *CI = cast<CallInst>(VL0);
10267 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10268
10269 VFShape Shape = VFShape::get(
10270 CI->getFunctionType(),
10271 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10272 false /*HasGlobalPred*/);
10273 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10274
10275 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10276 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10277 return TreeEntry::NeedToGather;
10278 }
10279 Function *F = CI->getCalledFunction();
10280 unsigned NumArgs = CI->arg_size();
10281 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10282 for (unsigned J = 0; J != NumArgs; ++J)
10283 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10284 ScalarArgs[J] = CI->getArgOperand(J);
10285 for (Value *V : VL) {
10286 CallInst *CI2 = dyn_cast<CallInst>(V);
10287 if (!CI2 || CI2->getCalledFunction() != F ||
10288 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10289 (VecFunc &&
10290 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10292 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10293 << "\n");
10294 return TreeEntry::NeedToGather;
10295 }
10296 // Some intrinsics have scalar arguments and should be same in order for
10297 // them to be vectorized.
10298 for (unsigned J = 0; J != NumArgs; ++J) {
10299 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10300 Value *A1J = CI2->getArgOperand(J);
10301 if (ScalarArgs[J] != A1J) {
10303 << "SLP: mismatched arguments in call:" << *CI
10304 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10305 return TreeEntry::NeedToGather;
10306 }
10307 }
10308 }
10309 // Verify that the bundle operands are identical between the two calls.
10310 if (CI->hasOperandBundles() &&
10311 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10312 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10313 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10314 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10315 << "!=" << *V << '\n');
10316 return TreeEntry::NeedToGather;
10317 }
10318 }
10319 SmallVector<Type *> ArgTys =
10320 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10321 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10322 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10323 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10324 return TreeEntry::NeedToGather;
10325
10326 return TreeEntry::Vectorize;
10327 }
10328 case Instruction::ShuffleVector: {
10329 if (!S.isAltShuffle()) {
10330 // REVEC can support non alternate shuffle.
10332 return TreeEntry::Vectorize;
10333 // If this is not an alternate sequence of opcode like add-sub
10334 // then do not vectorize this instruction.
10335 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10336 return TreeEntry::NeedToGather;
10337 }
10338 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10339 LLVM_DEBUG(
10340 dbgs()
10341 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10342 "the whole alt sequence is not profitable.\n");
10343 return TreeEntry::NeedToGather;
10344 }
10345
10346 return TreeEntry::Vectorize;
10347 }
10348 default:
10349 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10350 return TreeEntry::NeedToGather;
10351 }
10352}
10353
10354namespace {
10355/// Allows correct handling of the operands of phi nodes, based on the order of
10356/// incoming basic blocks/values in the \p Main PHINode.
10357class PHIHandler {
10358 DominatorTree &DT;
10359 PHINode *Main = nullptr;
10360 SmallVector<Value *> Phis;
10361 SmallVector<SmallVector<Value *>> Operands;
10362
10363public:
10364 PHIHandler() = delete;
10365 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10366 : DT(DT), Main(Main), Phis(Phis),
10367 Operands(Main->getNumIncomingValues(),
10368 SmallVector<Value *>(Phis.size(), nullptr)) {}
10369 void buildOperands() {
10370 constexpr unsigned FastLimit = 4;
10371 if (Main->getNumIncomingValues() <= FastLimit) {
10372 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10373 BasicBlock *InBB = Main->getIncomingBlock(I);
10374 if (!DT.isReachableFromEntry(InBB)) {
10375 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10376 continue;
10377 }
10378 // Prepare the operand vector.
10379 for (auto [Idx, V] : enumerate(Phis)) {
10380 auto *P = dyn_cast<PHINode>(V);
10381 if (!P) {
10383 "Expected isa instruction or poison value.");
10384 Operands[I][Idx] = V;
10385 continue;
10386 }
10387 if (P->getIncomingBlock(I) == InBB)
10388 Operands[I][Idx] = P->getIncomingValue(I);
10389 else
10390 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10391 }
10392 }
10393 return;
10394 }
10395 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10396 Blocks;
10397 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10398 BasicBlock *InBB = Main->getIncomingBlock(I);
10399 if (!DT.isReachableFromEntry(InBB)) {
10400 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10401 continue;
10402 }
10403 Blocks.try_emplace(InBB).first->second.push_back(I);
10404 }
10405 for (auto [Idx, V] : enumerate(Phis)) {
10406 if (isa<PoisonValue>(V)) {
10407 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10408 Operands[I][Idx] = V;
10409 continue;
10410 }
10411 auto *P = cast<PHINode>(V);
10412 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10413 BasicBlock *InBB = P->getIncomingBlock(I);
10414 if (InBB == Main->getIncomingBlock(I)) {
10415 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10416 continue;
10417 Operands[I][Idx] = P->getIncomingValue(I);
10418 continue;
10419 }
10420 auto *It = Blocks.find(InBB);
10421 if (It == Blocks.end())
10422 continue;
10423 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10424 }
10425 }
10426 for (const auto &P : Blocks) {
10427 ArrayRef<unsigned> IncomingValues = P.second;
10428 if (IncomingValues.size() <= 1)
10429 continue;
10430 unsigned BasicI = IncomingValues.consume_front();
10431 for (unsigned I : IncomingValues) {
10432 assert(all_of(enumerate(Operands[I]),
10433 [&](const auto &Data) {
10434 return !Data.value() ||
10435 Data.value() == Operands[BasicI][Data.index()];
10436 }) &&
10437 "Expected empty operands list.");
10438 Operands[I] = Operands[BasicI];
10439 }
10440 }
10441 }
10442 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10443};
10444} // namespace
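// NOTE (editorial illustration, not part of the original source): for
//   %p0 = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
//   %p1 = phi i32 [ %d, %bb2 ], [ %c, %bb1 ]
// with %p0 as Main, PHIHandler::buildOperands() matches incoming values by
// block rather than by operand position, so getOperands(0) returns {%a, %c}
// (the %bb1 values) and getOperands(1) returns {%b, %d}; incoming blocks that
// are unreachable from the entry are padded with poison.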
10445
10446/// Returns main/alternate instructions for the given \p VL. Unlike
10447/// getSameOpcode, it supports non-compatible instructions for better
10448/// SplitVectorize node support.
10449/// \returns the first main/alt instructions if the list contains only poisons
10450/// and instructions with only 2 opcodes. Returns a pair of nullptrs otherwise.
10451static std::pair<Instruction *, Instruction *>
10452getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10453 Instruction *MainOp = nullptr;
10454 Instruction *AltOp = nullptr;
10455 for (Value *V : VL) {
10456 if (isa<PoisonValue>(V))
10457 continue;
10458 auto *I = dyn_cast<Instruction>(V);
10459 if (!I)
10460 return {};
10461 if (!MainOp) {
10462 MainOp = I;
10463 continue;
10464 }
10465 if (MainOp->getOpcode() == I->getOpcode()) {
10466 if (I->getParent() != MainOp->getParent())
10467 return {};
10468 continue;
10469 }
10470 if (!AltOp) {
10471 AltOp = I;
10472 continue;
10473 }
10474 if (AltOp->getOpcode() == I->getOpcode()) {
10475 if (I->getParent() != AltOp->getParent())
10476 return {};
10477 continue;
10478 }
10479 return {};
10480 }
10481 if (!AltOp)
10482 return {};
10483 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10484 "Expected different main and alt instructions.");
10485 return std::make_pair(MainOp, AltOp);
10486}
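// NOTE (editorial illustration, not part of the original source): a bundle
// {add, sub, poison, add} from one block yields the pair (add, sub); a third
// distinct opcode, an instruction from a different block, or a bundle with no
// second opcode at all makes the helper return {nullptr, nullptr}, so the
// split-node path is not attempted.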
10487
10488/// Checks that every instruction appears once in the list and if not, packs
10489/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10490/// unique scalars is extended by poison values to the whole register size.
10491///
10492/// \returns false if \p VL could not be uniquified, in which case \p VL is
10493/// unchanged and \p ReuseShuffleIndices is empty.
10494static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10495 SmallVectorImpl<int> &ReuseShuffleIndices,
10496 const TargetTransformInfo &TTI,
10497 const TargetLibraryInfo &TLI,
10498 const InstructionsState &S,
10499 const BoUpSLP::EdgeInfo &UserTreeIdx,
10500 bool TryPad = false) {
10501 // Check that every instruction appears once in this bundle.
10502 SmallVector<Value *> UniqueValues;
10503 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10504 for (Value *V : VL) {
10505 if (isConstant(V)) {
10506 // Constants are always considered distinct, even if the same constant
10507 // appears multiple times in VL.
10508 ReuseShuffleIndices.emplace_back(
10509 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10510 UniqueValues.emplace_back(V);
10511 continue;
10512 }
10513 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10514 ReuseShuffleIndices.emplace_back(Res.first->second);
10515 if (Res.second)
10516 UniqueValues.emplace_back(V);
10517 }
10518
10519 // Easy case: VL has unique values and a "natural" size
10520 size_t NumUniqueScalarValues = UniqueValues.size();
10521 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10522 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10523 if (NumUniqueScalarValues == VL.size() &&
10524 (VectorizeNonPowerOf2 || IsFullVectors)) {
10525 ReuseShuffleIndices.clear();
10526 return true;
10527 }
10528
10529 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10530 if ((UserTreeIdx.UserTE &&
10531 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10533 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10534 "for nodes with padding.\n");
10535 ReuseShuffleIndices.clear();
10536 return false;
10537 }
10538
10539 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10540 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10541 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10542 return isa<UndefValue>(V) || !isConstant(V);
10543 }))) {
10544 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10545 S.getMainOp()->isSafeToRemove() &&
10546 (S.areInstructionsWithCopyableElements() ||
10547 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10548 // Find the number of elements, which forms full vectors.
10549 unsigned PWSz = getFullVectorNumberOfElements(
10550 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10551 PWSz = std::min<unsigned>(PWSz, VL.size());
10552 if (PWSz == VL.size()) {
10553 // We ended up with the same size after removing duplicates and
10554 // upgrading the resulting vector size to a "nice size". Just keep
10555 // the initial VL then.
10556 ReuseShuffleIndices.clear();
10557 } else {
10558 // Pad unique values with poison to grow the vector to a "nice" size
10559 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10560 UniqueValues.end());
10561 PaddedUniqueValues.append(
10562 PWSz - UniqueValues.size(),
10563 PoisonValue::get(UniqueValues.front()->getType()));
10564 // Check that the operations, extended with poisons/copyables, are still
10565 // valid for vectorization (div/rem are not allowed).
10566 if ((!S.areInstructionsWithCopyableElements() &&
10567 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10568 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10569 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10570 isa<CallInst>(S.getMainOp())))) {
10571 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10572 ReuseShuffleIndices.clear();
10573 return false;
10574 }
10575 VL = std::move(PaddedUniqueValues);
10576 }
10577 return true;
10578 }
10579 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10580 ReuseShuffleIndices.clear();
10581 return false;
10582 }
10583 VL = std::move(UniqueValues);
10584 return true;
10585}
10586
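// Checks whether VL can be vectorized as a split (two-opcode) node: partitions
// the scalars into Op1/Op2 by main/alternate opcode, fills ReorderIndices, and
// applies legality and cost checks to both halves.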
10587bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10588 const InstructionsState &LocalState,
10589 SmallVectorImpl<Value *> &Op1,
10590 SmallVectorImpl<Value *> &Op2,
10591 OrdersType &ReorderIndices) const {
10592 constexpr unsigned SmallNodeSize = 4;
10593 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10595 return false;
10596
10597 // Check if this is a duplicate of another split entry.
10598 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10599 << ".\n");
10600 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10601 if (E->isSame(VL)) {
10602 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10603 << *LocalState.getMainOp() << ".\n");
10604 return false;
10605 }
10606 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10607 if (all_of(VL, [&](Value *V) {
10608 return isa<PoisonValue>(V) || Values.contains(V);
10609 })) {
10610 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10611 return false;
10612 }
10613 }
10614
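 // Partition the scalars: main-opcode instructions and non-instruction values
 // go to Op1, alternate-opcode instructions go to Op2; Op1Indices remembers
 // which lanes went to Op1.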
10615 ReorderIndices.assign(VL.size(), VL.size());
10616 SmallBitVector Op1Indices(VL.size());
10617 for (auto [Idx, V] : enumerate(VL)) {
10618 auto *I = dyn_cast<Instruction>(V);
10619 if (!I) {
10620 Op1.push_back(V);
10621 Op1Indices.set(Idx);
10622 continue;
10623 }
10624 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10625 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10626 *TLI)) ||
10627 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10628 !isAlternateInstruction(I, LocalState.getMainOp(),
10629 LocalState.getAltOp(), *TLI))) {
10630 Op1.push_back(V);
10631 Op1Indices.set(Idx);
10632 continue;
10633 }
10634 Op2.push_back(V);
10635 }
10636 Type *ScalarTy = getValueType(VL.front());
10637 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10638 unsigned Opcode0 = LocalState.getOpcode();
10639 unsigned Opcode1 = LocalState.getAltOpcode();
10640 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10641 // Enable the split node only if the scalars do not form a legal alternate
10642 // instruction (like X86 addsub).
10643 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10644 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10645 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10646 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10647 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10648 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10649 return false;
10650 // Enable the split node only if both halves are power-of-2/full-register sized.
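 // Build the lane order that places all Op1 lanes first, followed by all Op2
 // lanes, e.g. lanes {main, alt, main, alt} give ReorderIndices {0, 2, 1, 3}.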
10651 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10652 for (unsigned Idx : seq<unsigned>(VL.size())) {
10653 if (Op1Indices.test(Idx)) {
10654 ReorderIndices[Op1Cnt] = Idx;
10655 ++Op1Cnt;
10656 } else {
10657 ReorderIndices[Op2Cnt] = Idx;
10658 ++Op2Cnt;
10659 }
10660 }
10661 if (isIdentityOrder(ReorderIndices))
10662 ReorderIndices.clear();
10663 SmallVector<int> Mask;
10664 if (!ReorderIndices.empty())
10665 inversePermutation(ReorderIndices, Mask);
10666 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10667 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10668 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10669 // Check for non-profitable single-register ops, which are better
10670 // represented as alternate ops.
10671 if (NumParts >= VL.size())
10672 return false;
10673 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10674 InstructionCost InsertCost = ::getShuffleCost(
10675 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10676 FixedVectorType *SubVecTy =
10677 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10678 InstructionCost NewShuffleCost =
10679 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10680 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10681 (Mask.empty() || InsertCost >= NewShuffleCost))
10682 return false;
10683 if ((LocalState.getMainOp()->isBinaryOp() &&
10684 LocalState.getAltOp()->isBinaryOp() &&
10685 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10686 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10687 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10688 (LocalState.getMainOp()->isUnaryOp() &&
10689 LocalState.getAltOp()->isUnaryOp())) {
10690 InstructionCost OriginalVecOpsCost =
10691 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10692 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10693 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10694 for (unsigned Idx : seq<unsigned>(VL.size())) {
10695 if (isa<PoisonValue>(VL[Idx]))
10696 continue;
10697 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10698 }
10699 InstructionCost OriginalCost =
10700 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10701 VecTy, OriginalMask, Kind);
10702 InstructionCost NewVecOpsCost =
10703 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10704 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10705 InstructionCost NewCost =
10706 NewVecOpsCost + InsertCost +
10707 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10708 VectorizableTree.front()->getOpcode() == Instruction::Store
10709 ? NewShuffleCost
10710 : 0);
10711 // If not profitable to split - exit.
10712 if (NewCost >= OriginalCost)
10713 return false;
10714 }
10715 return true;
10716}
10717
10718namespace {
10719/// Class accepts an incoming list of values, checks if it is able to model
10720/// "copyable" values as compatible operations, and generates the list of values
10721/// for scheduling and the list of operands for the new nodes.
10722class InstructionsCompatibilityAnalysis {
10723 DominatorTree &DT;
10724 const DataLayout &DL;
10725 const TargetTransformInfo &TTI;
10726 const TargetLibraryInfo &TLI;
10727 unsigned MainOpcode = 0;
10728 Instruction *MainOp = nullptr;
10729
10730 /// Checks if the opcode is supported as the main opcode for copyable
10731 /// elements.
10732 static bool isSupportedOpcode(const unsigned Opcode) {
10733 return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10734 Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10735 Opcode == Instruction::UDiv || Opcode == Instruction::And ||
10736 Opcode == Instruction::Or || Opcode == Instruction::Xor;
10737 }
10738
10739 /// Identifies the best candidate value, which represents the main opcode
10740 /// operation.
10741 /// Currently the best candidate is the Add instruction whose parent block
10742 /// has the highest DFS incoming number (the block that dominates the others).
10743 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10744 BasicBlock *Parent = nullptr;
10745 // Checks if the instruction has a supported opcode.
10746 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10747 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10748 return false;
10749 return I && isSupportedOpcode(I->getOpcode()) &&
10750 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10751 };
10752 // Exclude operand instructions immediately to improve compile time; they
10753 // cannot be scheduled anyway.
10754 SmallDenseSet<Value *, 8> Operands;
10755 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10756 bool AnyUndef = false;
10757 for (Value *V : VL) {
10758 auto *I = dyn_cast<Instruction>(V);
10759 if (!I) {
10760 AnyUndef |= isa<UndefValue>(V);
10761 continue;
10762 }
10763 if (!DT.isReachableFromEntry(I->getParent()))
10764 continue;
10765 if (Candidates.empty()) {
10766 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10767 Parent = I->getParent();
10768 Operands.insert(I->op_begin(), I->op_end());
10769 continue;
10770 }
10771 if (Parent == I->getParent()) {
10772 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10773 Operands.insert(I->op_begin(), I->op_end());
10774 continue;
10775 }
10776 auto *NodeA = DT.getNode(Parent);
10777 auto *NodeB = DT.getNode(I->getParent());
10778 assert(NodeA && "Should only process reachable instructions");
10779 assert(NodeB && "Should only process reachable instructions");
10780 assert((NodeA == NodeB) ==
10781 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10782 "Different nodes should have different DFS numbers");
10783 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10784 Candidates.clear();
10785 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10786 Parent = I->getParent();
10787 Operands.clear();
10788 Operands.insert(I->op_begin(), I->op_end());
10789 }
10790 }
10791 unsigned BestOpcodeNum = 0;
10792 MainOp = nullptr;
10793 for (const auto &P : Candidates) {
10794 if (P.second.size() < BestOpcodeNum)
10795 continue;
10796 for (Instruction *I : P.second) {
10797 if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
10798 MainOp = I;
10799 BestOpcodeNum = P.second.size();
10800 break;
10801 }
10802 }
10803 }
10804 if (MainOp) {
10805 // Do not match if any copyable is a terminator from the same block as
10806 // the main operation.
10807 if (any_of(VL, [&](Value *V) {
10808 auto *I = dyn_cast<Instruction>(V);
10809 return I && I->getParent() == MainOp->getParent() &&
10810 I->isTerminator();
10811 })) {
10812 MainOp = nullptr;
10813 return;
10814 }
10815 MainOpcode = MainOp->getOpcode();
10816 }
10817 }
10818
10819 /// Returns the idempotent value for the \p MainOp with the detected \p
10820 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10821 /// the operand itself, since V or V == V.
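 /// E.g. a copyable element modeled as Add uses 0 (add %x, 0 == %x), while
 /// one modeled as And would use -1 (and %x, -1 == %x).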
10822 Value *selectBestIdempotentValue() const {
10823 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10824 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10825 !MainOp->isCommutative());
10826 }
10827
10828 /// Returns the value and operands for \p V, considering whether it is an
10829 /// original instruction whose actual operands should be returned, or a
10830 /// copyable element that should be represented as an idempotent instruction.
10831 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10832 if (isa<PoisonValue>(V))
10833 return {V, V};
10834 if (!S.isCopyableElement(V))
10835 return convertTo(cast<Instruction>(V), S).second;
10836 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10837 return {V, selectBestIdempotentValue()};
10838 }
10839
10840 /// Builds operands for the original instructions.
10841 void
10842 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10843 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10844
10845 unsigned ShuffleOrOp =
10846 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10847 Instruction *VL0 = S.getMainOp();
10848
10849 switch (ShuffleOrOp) {
10850 case Instruction::PHI: {
10851 auto *PH = cast<PHINode>(VL0);
10852
10853 // Keeps the reordered operands to avoid code duplication.
10854 PHIHandler Handler(DT, PH, VL);
10855 Handler.buildOperands();
10856 Operands.assign(PH->getNumOperands(), {});
10857 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10858 Operands[I].assign(Handler.getOperands(I).begin(),
10859 Handler.getOperands(I).end());
10860 return;
10861 }
10862 case Instruction::ExtractValue:
10863 case Instruction::ExtractElement:
10864 // This is a special case, as it does not gather, but at the same time
10865 // we are not extending buildTreeRec() towards the operands.
10866 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10867 return;
10868 case Instruction::InsertElement:
10869 Operands.assign(2, {VL.size(), nullptr});
10870 for (auto [Idx, V] : enumerate(VL)) {
10871 auto *IE = cast<InsertElementInst>(V);
10872 for (auto [OpIdx, Ops] : enumerate(Operands))
10873 Ops[Idx] = IE->getOperand(OpIdx);
10874 }
10875 return;
10876 case Instruction::Load:
10877 Operands.assign(
10878 1, {VL.size(),
10879 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10880 for (auto [V, Op] : zip(VL, Operands.back())) {
10881 auto *LI = dyn_cast<LoadInst>(V);
10882 if (!LI)
10883 continue;
10884 Op = LI->getPointerOperand();
10885 }
10886 return;
10887 case Instruction::ZExt:
10888 case Instruction::SExt:
10889 case Instruction::FPToUI:
10890 case Instruction::FPToSI:
10891 case Instruction::FPExt:
10892 case Instruction::PtrToInt:
10893 case Instruction::IntToPtr:
10894 case Instruction::SIToFP:
10895 case Instruction::UIToFP:
10896 case Instruction::Trunc:
10897 case Instruction::FPTrunc:
10898 case Instruction::BitCast:
10899 case Instruction::ICmp:
10900 case Instruction::FCmp:
10901 case Instruction::Select:
10902 case Instruction::FNeg:
10903 case Instruction::Add:
10904 case Instruction::FAdd:
10905 case Instruction::Sub:
10906 case Instruction::FSub:
10907 case Instruction::Mul:
10908 case Instruction::FMul:
10909 case Instruction::UDiv:
10910 case Instruction::SDiv:
10911 case Instruction::FDiv:
10912 case Instruction::URem:
10913 case Instruction::SRem:
10914 case Instruction::FRem:
10915 case Instruction::Shl:
10916 case Instruction::LShr:
10917 case Instruction::AShr:
10918 case Instruction::And:
10919 case Instruction::Or:
10920 case Instruction::Xor:
10921 case Instruction::Freeze:
10922 case Instruction::Store:
10923 case Instruction::ShuffleVector:
10924 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10925 for (auto [Idx, V] : enumerate(VL)) {
10926 auto *I = dyn_cast<Instruction>(V);
10927 if (!I) {
10928 for (auto [OpIdx, Ops] : enumerate(Operands))
10929 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10930 continue;
10931 }
10932 auto [Op, ConvertedOps] = convertTo(I, S);
10933 for (auto [OpIdx, Ops] : enumerate(Operands))
10934 Ops[Idx] = ConvertedOps[OpIdx];
10935 }
10936 return;
10937 case Instruction::GetElementPtr: {
10938 Operands.assign(2, {VL.size(), nullptr});
10939 // Need to cast all indices to the same type before vectorization to
10940 // avoid a crash.
10941 // Required to be able to find correct matches between different gather
10942 // nodes and reuse the vectorized values rather than trying to gather them
10943 // again.
10944 const unsigned IndexIdx = 1;
10945 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10946 Type *Ty =
10947 all_of(VL,
10948 [&](Value *V) {
10949 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10950 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10951 })
10952 ? VL0Ty
10953 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10954 ->getPointerOperandType()
10955 ->getScalarType());
10956 for (auto [Idx, V] : enumerate(VL)) {
10957 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10958 if (!GEP) {
10959 Operands[0][Idx] = V;
10960 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10961 continue;
10962 }
10963 Operands[0][Idx] = GEP->getPointerOperand();
10964 auto *Op = GEP->getOperand(IndexIdx);
10965 auto *CI = dyn_cast<ConstantInt>(Op);
10966 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10967 CI, Ty, CI->getValue().isSignBitSet(), DL)
10968 : Op;
10969 }
10970 return;
10971 }
10972 case Instruction::Call: {
10973 auto *CI = cast<CallInst>(VL0);
10975 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10977 continue;
10978 auto &Ops = Operands.emplace_back();
10979 for (Value *V : VL) {
10980 auto *I = dyn_cast<Instruction>(V);
10981 Ops.push_back(I ? I->getOperand(Idx)
10982 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10983 }
10984 }
10985 return;
10986 }
10987 default:
10988 break;
10989 }
10990 llvm_unreachable("Unexpected vectorization of the instructions.");
10991 }
10992
10993public:
10994 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10995 const TargetTransformInfo &TTI,
10996 const TargetLibraryInfo &TLI)
10997 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10998
10999 InstructionsState
11000 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11001 bool TryCopyableElementsVectorization,
11002 bool WithProfitabilityCheck = false,
11003 bool SkipSameCodeCheck = false) {
11004 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11005 ? InstructionsState::invalid()
11006 : getSameOpcode(VL, TLI);
11007 if (S)
11008 return S;
11009 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11010 return S;
11011 findAndSetMainInstruction(VL, R);
11012 if (!MainOp)
11013 return InstructionsState::invalid();
11014 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11015 if (!WithProfitabilityCheck)
11016 return S;
11017 // Check if it is profitable to vectorize the instruction.
11018 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11019 auto BuildCandidates =
11020 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11021 Value *V2) {
11022 if (V1 != V2 && isa<PHINode>(V1))
11023 return;
11024 auto *I1 = dyn_cast<Instruction>(V1);
11025 auto *I2 = dyn_cast<Instruction>(V2);
11026 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11027 I1->getParent() != I2->getParent())
11028 return;
11029 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11030 };
11031 if (VL.size() == 2) {
11032 // Check if the operands allow better vectorization.
11033 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11034 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11035 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11036 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11037 R.findBestRootPair(Candidates1) &&
11038 R.findBestRootPair(Candidates2);
11039 if (!Res && isCommutative(MainOp)) {
11040 Candidates1.clear();
11041 Candidates2.clear();
11042 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11043 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11044 Res = !Candidates1.empty() && !Candidates2.empty() &&
11045 R.findBestRootPair(Candidates1) &&
11046 R.findBestRootPair(Candidates2);
11047 }
11048 if (!Res)
11049 return InstructionsState::invalid();
11050 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11051 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11052 InstructionCost VectorCost;
11053 FixedVectorType *VecTy =
11054 getWidenedType(S.getMainOp()->getType(), VL.size());
11055 switch (MainOpcode) {
11056 case Instruction::Add:
11057 case Instruction::LShr:
11058 case Instruction::Shl:
11059 case Instruction::SDiv:
11060 case Instruction::UDiv:
11061 case Instruction::And:
11062 case Instruction::Or:
11063 case Instruction::Xor:
11064 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11065 break;
11066 default:
11067 llvm_unreachable("Unexpected instruction.");
11068 }
11069 if (VectorCost > ScalarCost)
11070 return InstructionsState::invalid();
11071 return S;
11072 }
11073 assert(Operands.size() == 2 && "Unexpected number of operands!");
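 // Count the lanes modeled as copyable; if less than half of the bundle is
 // copyable, accept the state without further profitability checks.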
11074 unsigned CopyableNum =
11075 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11076 if (CopyableNum < VL.size() / 2)
11077 return S;
11078 // Too many phi copyables - exit.
11079 const unsigned Limit = VL.size() / 24;
11080 if ((CopyableNum >= VL.size() - Limit ||
11081 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11082 CopyableNum >= MaxPHINumOperands) &&
11083 all_of(VL, [&](Value *V) {
11084 return isa<PHINode>(V) || !S.isCopyableElement(V);
11085 }))
11086 return InstructionsState::invalid();
11087 // Check profitability if the number of copyables >= VL.size() / 2.
11088 // 1. Reorder operands for better matching.
11089 if (isCommutative(MainOp)) {
11090 for (auto &Ops : Operands) {
11091 // Make instructions the first operands.
11092 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11093 std::swap(Ops.front(), Ops.back());
11094 continue;
11095 }
11096 // Make constants the second operands.
11097 if (isa<Constant>(Ops.front())) {
11098 std::swap(Ops.front(), Ops.back());
11099 continue;
11100 }
11101 }
11102 }
11103 // 2. Check if the operands can be vectorized.
11104 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11105 return InstructionsState::invalid();
11106 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11107 if (allConstant(Ops) || isSplat(Ops))
11108 return true;
11109 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11110 // single one is different.
11111 constexpr unsigned Limit = 4;
11112 if (Operands.front().size() >= Limit) {
11113 SmallDenseMap<const Value *, unsigned> Counters;
11114 for (Value *V : Ops) {
11115 if (isa<UndefValue>(V))
11116 continue;
11117 ++Counters[V];
11118 }
11119 if (Counters.size() == 2 &&
11120 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11121 return C.second == 1;
11122 }))
11123 return true;
11124 }
11125 // First operand not a constant or splat? Last attempt - check for
11126 // potential vectorization.
11127 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11128 InstructionsState OpS = Analysis.buildInstructionsState(
11129 Ops, R, /*TryCopyableElementsVectorization=*/true);
11130 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11131 return false;
11132 unsigned CopyableNum =
11133 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11134 return CopyableNum <= VL.size() / 2;
11135 };
11136 if (!CheckOperand(Operands.front()))
11137 return InstructionsState::invalid();
11138
11139 return S;
11140 }
11141
11142 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11143 ArrayRef<Value *> VL) {
11144 assert(S && "Invalid state!");
11145 SmallVector<BoUpSLP::ValueList> Operands;
11146 if (S.areInstructionsWithCopyableElements()) {
11147 MainOp = S.getMainOp();
11148 MainOpcode = S.getOpcode();
11149 Operands.assign(MainOp->getNumOperands(),
11150 BoUpSLP::ValueList(VL.size(), nullptr));
11151 for (auto [Idx, V] : enumerate(VL)) {
11152 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11153 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11154 Operands[OperandIdx][Idx] = Operand;
11155 }
11156 } else {
11157 buildOriginalOperands(S, VL, Operands);
11158 }
11159 return Operands;
11160 }
11161};
11162} // namespace
11163
11164BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11165 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11166 bool TryCopyableElementsVectorization) const {
11167 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11168
11169 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11170 InstructionsState S = Analysis.buildInstructionsState(
11171 VL, *this, TryCopyableElementsVectorization,
11172 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11173
11174 // Don't go into catchswitch blocks, which can happen with PHIs.
11175 // Such blocks can only have PHIs and the catchswitch. There is no
11176 // place to insert a shuffle if we need to, so just avoid that issue.
11177 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11178 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11179 // Do not try to pack to avoid extra instructions here.
11180 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11181 /*TryToFindDuplicates=*/false);
11182 }
11183
11184 // Check if this is a duplicate of another entry.
11185 if (S) {
11186 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11187 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11188 if (E->isSame(VL)) {
11189 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11190 << ".\n");
11191 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11192 }
11193 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11194 if (all_of(VL, [&](Value *V) {
11195 return isa<PoisonValue>(V) || Values.contains(V) ||
11196 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11197 LI->getLoopFor(S.getMainOp()->getParent()) &&
11198 isVectorized(V));
11199 })) {
11200 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11201 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11202 }
11203 }
11204 }
11205
11206 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11207 // a load), in which case peek through to include it in the tree, without
11208 // ballooning over-budget.
11209 if (Depth >= RecursionMaxDepth &&
11210 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11211 (match(S.getMainOp(), m_Load(m_Value())) ||
11212 all_of(VL, [&S](const Value *I) {
11213 return match(I,
11215 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11216 })))) {
11217 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11218 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11219 }
11220
11221 // Don't handle scalable vectors
11222 if (S && S.getOpcode() == Instruction::ExtractElement &&
11223 isa<ScalableVectorType>(
11224 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11225 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11226 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11227 }
11228
11229 // Don't handle vectors.
11230 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11231 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11232 // Do not try to pack to avoid extra instructions here.
11233 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11234 /*TryToFindDuplicates=*/false);
11235 }
11236
11237 // If all of the operands are identical or constant we have a simple solution.
11238 // If we deal with insert/extract instructions, they all must have constant
11239 // indices, otherwise we should gather them, not try to vectorize.
11240 // If this is an alternate op node with 2 elements and gathered operands, do
11241 // not vectorize.
11242 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11243 if (!S || !S.isAltShuffle() || VL.size() > 2)
11244 return false;
11245 if (VectorizableTree.size() < MinTreeSize)
11246 return false;
11247 if (Depth >= RecursionMaxDepth - 1)
11248 return true;
11249 // Check if all operands are extracts, part of vector node or can build a
11250 // regular vectorize node.
11251 SmallVector<unsigned, 8> InstsCount;
11252 for (Value *V : VL) {
11253 auto *I = cast<Instruction>(V);
11254 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11255 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11256 }));
11257 }
11258 bool IsCommutative =
11259 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11260 if ((IsCommutative &&
11261 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11262 (!IsCommutative &&
11263 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11264 return true;
11265 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11266 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11267 auto *I1 = cast<Instruction>(VL.front());
11268 auto *I2 = cast<Instruction>(VL.back());
11269 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11270 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11271 I2->getOperand(Op));
11272 if (static_cast<unsigned>(count_if(
11273 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11275 })) >= S.getMainOp()->getNumOperands() / 2)
11276 return false;
11277 if (S.getMainOp()->getNumOperands() > 2)
11278 return true;
11279 if (IsCommutative) {
11280 // Check permuted operands.
11281 Candidates.clear();
11282 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11283 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11284 I2->getOperand((Op + 1) % E));
11285 if (any_of(
11286 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11288 }))
11289 return false;
11290 }
11291 return true;
11292 };
11293 SmallVector<unsigned> SortedIndices;
11294 BasicBlock *BB = nullptr;
11295 bool IsScatterVectorizeUserTE =
11296 UserTreeIdx.UserTE &&
11297 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11298 bool AreAllSameBlock = S.valid();
11299 bool AreScatterAllGEPSameBlock =
11300 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11301 VL.size() > 2 &&
11302 all_of(VL,
11303 [&BB](Value *V) {
11304 auto *I = dyn_cast<GetElementPtrInst>(V);
11305 if (!I)
11306 return doesNotNeedToBeScheduled(V);
11307 if (!BB)
11308 BB = I->getParent();
11309 return BB == I->getParent() && I->getNumOperands() == 2;
11310 }) &&
11311 BB &&
11312 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11313 SortedIndices));
11314 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11315 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11316 (S &&
11318 S.getMainOp()) &&
11320 NotProfitableForVectorization(VL)) {
11321 if (!S) {
11322 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11323 "C,S,B,O, small shuffle. \n";
11324 dbgs() << "[";
11325 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11326 dbgs() << "]\n");
11327 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11328 /*TryToFindDuplicates=*/true,
11329 /*TrySplitVectorize=*/true);
11330 }
11331 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11332 dbgs() << "[";
11333 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11334 dbgs() << "]\n");
11335 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11336 }
11337
11338 // Don't vectorize ephemeral values.
11339 if (S && !EphValues.empty()) {
11340 for (Value *V : VL) {
11341 if (EphValues.count(V)) {
11342 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11343 << ") is ephemeral.\n");
11344 // Do not try to pack to avoid extra instructions here.
11345 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11346 /*TryToFindDuplicates=*/false);
11347 }
11348 }
11349 }
11350
11351 // We now know that this is a vector of instructions of the same type from
11352 // the same block.
11353
11354 // Check whether any of the instructions in the bundle are already in the
11355 // tree, in which case the node may not be profitable to vectorize as a
11356 // small alternate node.
11357 if (S && S.isAltShuffle()) {
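 // Vectorized has a bit set for every lane that is not already part of the
 // tree; Extracted marks lanes whose extra scalar users would require an
 // extractelement after vectorization.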
11358 auto GetNumVectorizedExtracted = [&]() {
11359 APInt Extracted = APInt::getZero(VL.size());
11360 APInt Vectorized = APInt::getAllOnes(VL.size());
11361 for (auto [Idx, V] : enumerate(VL)) {
11362 auto *I = dyn_cast<Instruction>(V);
11363 if (!I || doesNotNeedToBeScheduled(I) ||
11364 all_of(I->operands(), [&](const Use &U) {
11365 return isa<ExtractElementInst>(U.get());
11366 }))
11367 continue;
11368 if (isVectorized(I))
11369 Vectorized.clearBit(Idx);
11370 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11371 Extracted.setBit(Idx);
11372 }
11373 return std::make_pair(Vectorized, Extracted);
11374 };
11375 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11376 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11377 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11378 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11379 // Rough cost estimation to check if the vector code (+ potential extracts)
11380 // is more profitable than the scalar code + buildvector.
11381 Type *ScalarTy = VL.front()->getType();
11382 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11383 InstructionCost VectorizeCostEstimate =
11384 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11385 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11386 /*Insert=*/false, /*Extract=*/true, Kind);
11387 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11388 *TTI, ScalarTy, VecTy, Vectorized,
11389 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11390 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11391 }
11392 if (PreferScalarize) {
11393 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11394 "node is not profitable.\n");
11395 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11396 }
11397 }
11398
11399 // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
11400 if (UserIgnoreList && !UserIgnoreList->empty()) {
11401 for (Value *V : VL) {
11402 if (UserIgnoreList->contains(V)) {
11403 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11404 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11405 }
11406 }
11407 }
11408
11409 // Special processing for sorted pointers for ScatterVectorize node with
11410 // constant indices only.
11411 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11412 assert(VL.front()->getType()->isPointerTy() &&
11414 "Expected pointers only.");
11415 // Reset S to make it a GetElementPtr kind of node.
11416 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11417 assert(It != VL.end() && "Expected at least one GEP.");
11418 S = getSameOpcode(*It, *TLI);
11419 }
11420
11421 // Check that all of the users of the scalars that we want to vectorize are
11422 // schedulable.
11423 Instruction *VL0 = S.getMainOp();
11424 BB = VL0->getParent();
11425
11426 if (S &&
11428 !DT->isReachableFromEntry(BB))) {
11429 // Don't go into unreachable blocks. They may contain instructions with
11430 // dependency cycles which confuse the final scheduling.
11431 // Do not vectorize EH and non-returning blocks, not profitable in most
11432 // cases.
11433 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11434 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11435 }
11436 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11437}
11438
11439void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11440 const EdgeInfo &UserTreeIdx,
11441 unsigned InterleaveFactor) {
11442 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11443
11444 SmallVector<int> ReuseShuffleIndices;
11445 SmallVector<Value *> VL(VLRef);
11446
11447 // Tries to build split node.
11448 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11449 SmallVector<Value *> Op1, Op2;
11450 OrdersType ReorderIndices;
11451 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11452 return false;
11453
11454 auto Invalid = ScheduleBundle::invalid();
11455 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11456 UserTreeIdx, {}, ReorderIndices);
11457 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11458 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11459 InstructionsState S = getSameOpcode(Op, *TLI);
11460 if (S && (isa<LoadInst>(S.getMainOp()) ||
11461 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11462 // Build gather node for loads, they will be gathered later.
11463 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11464 Idx == 0 ? 0 : Op1.size());
11465 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11466 } else {
11467 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11468 Idx == 0 ? 0 : Op1.size());
11469 buildTreeRec(Op, Depth, {TE, Idx});
11470 }
11471 };
11472 AddNode(Op1, 0);
11473 AddNode(Op2, 1);
11474 return true;
11475 };
11476
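 // Detect bundles made up only of constants (at least one) and PHI nodes;
 // such bundles are gathered immediately below.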
11477 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11478 bool AreConsts = false;
11479 for (Value *V : VL) {
11480 if (isa<PoisonValue>(V))
11481 continue;
11482 if (isa<Constant>(V)) {
11483 AreConsts = true;
11484 continue;
11485 }
11486 if (!isa<PHINode>(V))
11487 return false;
11488 }
11489 return AreConsts;
11490 };
11491 if (AreOnlyConstsWithPHIs(VL)) {
11492 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11493 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11494 return;
11495 }
11496
11497 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11498 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11499 InstructionsState S = Legality.getInstructionsState();
11500 if (!Legality.isLegal()) {
11501 if (Legality.trySplitVectorize()) {
11502 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11503 // Last chance to try to vectorize alternate node.
11504 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11505 return;
11506 }
11507 if (!S)
11508 Legality = getScalarsVectorizationLegality(
11509 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11510 if (!Legality.isLegal()) {
11511 if (Legality.tryToFindDuplicates())
11512 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11513 UserTreeIdx);
11514
11515 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11516 return;
11517 }
11518 S = Legality.getInstructionsState();
11519 }
11520
11521 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11522 if (S.isAltShuffle() && TrySplitNode(S))
11523 return;
11524
11525 // Check that every instruction appears once in this bundle.
11526 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11527 /*TryPad=*/true)) {
11528 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11529 return;
11530 }
11531
11532 // Perform specific checks for each particular instruction kind.
11533 bool IsScatterVectorizeUserTE =
11534 UserTreeIdx.UserTE &&
11535 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11536 OrdersType CurrentOrder;
11537 SmallVector<Value *> PointerOps;
11538 StridedPtrInfo SPtrInfo;
11539 TreeEntry::EntryState State = getScalarsVectorizationState(
11540 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11541 if (State == TreeEntry::NeedToGather) {
11542 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11543 return;
11544 }
11545
11546 Instruction *VL0 = S.getMainOp();
11547 BasicBlock *BB = VL0->getParent();
11548 auto &BSRef = BlocksSchedules[BB];
11549 if (!BSRef)
11550 BSRef = std::make_unique<BlockScheduling>(BB);
11551
11552 BlockScheduling &BS = *BSRef;
11553
11554 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11555 std::optional<ScheduleBundle *> BundlePtr =
11556 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11557#ifdef EXPENSIVE_CHECKS
11558 // Make sure we didn't break any internal invariants
11559 BS.verify();
11560#endif
11561 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11562 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11563 // Last chance to try to vectorize alternate node.
11564 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11565 return;
11566 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11567 NonScheduledFirst.insert(VL.front());
11568 if (S.getOpcode() == Instruction::Load &&
11569 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11571 return;
11572 }
11573 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11574 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11575 ScheduleBundle Empty;
11576 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11577 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11578
11579 unsigned ShuffleOrOp =
11580 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11581 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11582 // Postpone building tree nodes for PHI operands.
11583 SmallVector<unsigned> PHIOps;
11584 for (unsigned I : seq<unsigned>(Operands.size())) {
11585 ArrayRef<Value *> Op = Operands[I];
11586 if (Op.empty())
11587 continue;
11588 InstructionsState S = getSameOpcode(Op, *TLI);
11589 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11590 buildTreeRec(Op, Depth + 1, {TE, I});
11591 else
11592 PHIOps.push_back(I);
11593 }
11594 for (unsigned I : PHIOps)
11595 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11596 };
11597 switch (ShuffleOrOp) {
11598 case Instruction::PHI: {
11599 TreeEntry *TE =
11600 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11601 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11602 TE->dump());
11603
11604 TE->setOperands(Operands);
11605 CreateOperandNodes(TE, Operands);
11606 return;
11607 }
11608 case Instruction::ExtractValue:
11609 case Instruction::ExtractElement: {
11610 if (CurrentOrder.empty()) {
11611 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11612 } else {
11613 LLVM_DEBUG({
11614 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11615 "with order";
11616 for (unsigned Idx : CurrentOrder)
11617 dbgs() << " " << Idx;
11618 dbgs() << "\n";
11619 });
11620 fixupOrderingIndices(CurrentOrder);
11621 }
11622 // Insert new order with initial value 0, if it does not exist,
11623 // otherwise return the iterator to the existing one.
11624 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11625 ReuseShuffleIndices, CurrentOrder);
11626 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11627 "(ExtractValueInst/ExtractElementInst).\n";
11628 TE->dump());
11629 // This is a special case, as it does not gather, but at the same time
11630 // we are not extending buildTreeRec() towards the operands.
11631 TE->setOperands(Operands);
11632 return;
11633 }
11634 case Instruction::InsertElement: {
11635 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11636
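 // Order the lanes by their constant insert index: the lane inserting into
 // the smallest index becomes the first element of the operand order.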
11637 auto OrdCompare = [](const std::pair<int, int> &P1,
11638 const std::pair<int, int> &P2) {
11639 return P1.first > P2.first;
11640 };
11641 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11642 decltype(OrdCompare)>
11643 Indices(OrdCompare);
11644 for (int I = 0, E = VL.size(); I < E; ++I) {
11645 unsigned Idx = *getElementIndex(VL[I]);
11646 Indices.emplace(Idx, I);
11647 }
11648 OrdersType CurrentOrder(VL.size(), VL.size());
11649 bool IsIdentity = true;
11650 for (int I = 0, E = VL.size(); I < E; ++I) {
11651 CurrentOrder[Indices.top().second] = I;
11652 IsIdentity &= Indices.top().second == I;
11653 Indices.pop();
11654 }
11655 if (IsIdentity)
11656 CurrentOrder.clear();
11657 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11658 {}, CurrentOrder);
11659 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11660 TE->dump());
11661
11662 TE->setOperands(Operands);
11663 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11664 return;
11665 }
11666 case Instruction::Load: {
11667 // Check that a vectorized load would load the same memory as a scalar
11668 // load. For example, we don't want to vectorize loads that are smaller
11669 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11670 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11671 // from such a struct, we read/write packed bits disagreeing with the
11672 // unvectorized version.
11673 TreeEntry *TE = nullptr;
11674 fixupOrderingIndices(CurrentOrder);
11675 switch (State) {
11676 case TreeEntry::Vectorize:
11677 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11678 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11679 if (CurrentOrder.empty())
11680 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11681 TE->dump());
11682 else
11684 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11685 TE->dump());
11686 break;
11687 case TreeEntry::CompressVectorize:
11688 // Vectorizing non-consecutive loads with (masked)load + compress.
11689 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11690 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11691 LLVM_DEBUG(
11692 dbgs()
11693 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11694 TE->dump());
11695 break;
11696 case TreeEntry::StridedVectorize:
11697 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11698 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11699 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11700 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11701 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11702 TE->dump());
11703 break;
11704 case TreeEntry::ScatterVectorize:
11705 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11706 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11707 UserTreeIdx, ReuseShuffleIndices);
11708 LLVM_DEBUG(
11709 dbgs()
11710 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11711 TE->dump());
11712 break;
11713 case TreeEntry::CombinedVectorize:
11714 case TreeEntry::SplitVectorize:
11715 case TreeEntry::NeedToGather:
11716 llvm_unreachable("Unexpected loads state.");
11717 }
11718 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11719 assert(Operands.size() == 1 && "Expected a single operand only");
11720 SmallVector<int> Mask;
11721 inversePermutation(CurrentOrder, Mask);
11722 reorderScalars(Operands.front(), Mask);
11723 }
11724 TE->setOperands(Operands);
11725 if (State == TreeEntry::ScatterVectorize)
11726 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11727 return;
11728 }
11729 case Instruction::ZExt:
11730 case Instruction::SExt:
11731 case Instruction::FPToUI:
11732 case Instruction::FPToSI:
11733 case Instruction::FPExt:
11734 case Instruction::PtrToInt:
11735 case Instruction::IntToPtr:
11736 case Instruction::SIToFP:
11737 case Instruction::UIToFP:
11738 case Instruction::Trunc:
11739 case Instruction::FPTrunc:
11740 case Instruction::BitCast: {
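 // Record the widest and narrowest integer widths seen at the endpoints of
 // ext/trunc casts; this feeds the later bit-width minimization analysis.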
11741 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11742 std::make_pair(std::numeric_limits<unsigned>::min(),
11743 std::numeric_limits<unsigned>::max()));
11744 if (ShuffleOrOp == Instruction::ZExt ||
11745 ShuffleOrOp == Instruction::SExt) {
11746 CastMaxMinBWSizes = std::make_pair(
11747 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11748 PrevMaxBW),
11749 std::min<unsigned>(
11750 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11751 PrevMinBW));
11752 } else if (ShuffleOrOp == Instruction::Trunc) {
11753 CastMaxMinBWSizes = std::make_pair(
11754 std::max<unsigned>(
11755 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11756 PrevMaxBW),
11757 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11758 PrevMinBW));
11759 }
11760 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11761 ReuseShuffleIndices);
11762 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11763 TE->dump());
11764
11765 TE->setOperands(Operands);
11766 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11767 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11768 if (ShuffleOrOp == Instruction::Trunc) {
11769 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11770 } else if (ShuffleOrOp == Instruction::SIToFP ||
11771 ShuffleOrOp == Instruction::UIToFP) {
11772 unsigned NumSignBits =
11773 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11774 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11775 APInt Mask = DB->getDemandedBits(OpI);
11776 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11777 }
11778 if (NumSignBits * 2 >=
11779 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11780 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11781 }
11782 return;
11783 }
11784 case Instruction::ICmp:
11785 case Instruction::FCmp: {
11786 // Check that all of the compares have the same predicate.
11787 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11788 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11789 ReuseShuffleIndices);
11790 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11791 TE->dump());
11792
11793 VLOperands Ops(VL, Operands, S, *this);
11794 if (cast<CmpInst>(VL0)->isCommutative()) {
11795 // Commutative predicate - collect + sort operands of the instructions
11796 // so that each side is more likely to have the same opcode.
11798 "Commutative Predicate mismatch");
11799 Ops.reorder();
11800 Operands.front() = Ops.getVL(0);
11801 Operands.back() = Ops.getVL(1);
11802 } else {
11803 // Collect operands - commute if it uses the swapped predicate.
11804 for (auto [Idx, V] : enumerate(VL)) {
11805 if (isa<PoisonValue>(V))
11806 continue;
11807 auto *Cmp = cast<CmpInst>(V);
11808 if (Cmp->getPredicate() != P0)
11809 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11810 }
11811 }
11812 TE->setOperands(Operands);
11813 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11814 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11815 if (ShuffleOrOp == Instruction::ICmp) {
11816 unsigned NumSignBits0 =
11817 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11818 if (NumSignBits0 * 2 >=
11819 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11820 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11821 unsigned NumSignBits1 =
11822 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11823 if (NumSignBits1 * 2 >=
11824 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11825 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11826 }
11827 return;
11828 }
11829 case Instruction::Select:
11830 case Instruction::FNeg:
11831 case Instruction::Add:
11832 case Instruction::FAdd:
11833 case Instruction::Sub:
11834 case Instruction::FSub:
11835 case Instruction::Mul:
11836 case Instruction::FMul:
11837 case Instruction::UDiv:
11838 case Instruction::SDiv:
11839 case Instruction::FDiv:
11840 case Instruction::URem:
11841 case Instruction::SRem:
11842 case Instruction::FRem:
11843 case Instruction::Shl:
11844 case Instruction::LShr:
11845 case Instruction::AShr:
11846 case Instruction::And:
11847 case Instruction::Or:
11848 case Instruction::Xor:
11849 case Instruction::Freeze: {
11850 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11851 ReuseShuffleIndices);
11852 LLVM_DEBUG(
11853 dbgs() << "SLP: added a new TreeEntry "
11854 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11855 TE->dump());
11856
11857 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11858 VLOperands Ops(VL, Operands, S, *this);
11859 Ops.reorder();
11860 Operands[0] = Ops.getVL(0);
11861 Operands[1] = Ops.getVL(1);
11862 }
11863 TE->setOperands(Operands);
11864 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11865 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11866 return;
11867 }
11868 case Instruction::GetElementPtr: {
11869 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11870 ReuseShuffleIndices);
11871 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11872 TE->dump());
11873 TE->setOperands(Operands);
11874
11875 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11876 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11877 return;
11878 }
11879 case Instruction::Store: {
11880 bool Consecutive = CurrentOrder.empty();
11881 if (!Consecutive)
11882 fixupOrderingIndices(CurrentOrder);
11883 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11884 ReuseShuffleIndices, CurrentOrder);
11885 if (Consecutive)
11886 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11887 TE->dump());
11888 else
11889 LLVM_DEBUG(
11890 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11891 TE->dump());
11892 TE->setOperands(Operands);
11893 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11894 return;
11895 }
11896 case Instruction::Call: {
11897 // Check if the calls are all to the same vectorizable intrinsic or
11898 // library function.
11899 CallInst *CI = cast<CallInst>(VL0);
11901
11902 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11903 ReuseShuffleIndices);
11904 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11905 TE->dump());
11906 if (isCommutative(VL0)) {
11907 VLOperands Ops(VL, Operands, S, *this);
11908 Ops.reorder();
11909 Operands[0] = Ops.getVL(0);
11910 Operands[1] = Ops.getVL(1);
11911 }
11912 TE->setOperands(Operands);
11913 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11914 // For scalar operands there is no need to create an entry since there is
11915 // nothing to vectorize.
11917 continue;
11918 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11919 }
11920 return;
11921 }
11922 case Instruction::ShuffleVector: {
11923 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11924 ReuseShuffleIndices);
11925 if (S.isAltShuffle()) {
11926 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11927 TE->dump());
11928 } else {
11929 assert(SLPReVec && "Only supported by REVEC.");
11930 LLVM_DEBUG(
11931 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11932 TE->dump());
11933 }
11934
11935 // Reorder operands if reordering would enable vectorization.
11936 auto *CI = dyn_cast<CmpInst>(VL0);
11937 if (CI && any_of(VL, [](Value *V) {
11938 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11939 })) {
11940 auto *MainCI = cast<CmpInst>(S.getMainOp());
11941 auto *AltCI = cast<CmpInst>(S.getAltOp());
11942 CmpInst::Predicate MainP = MainCI->getPredicate();
11943 CmpInst::Predicate AltP = AltCI->getPredicate();
11944 assert(MainP != AltP &&
11945 "Expected different main/alternate predicates.");
11946 // Collect operands - commute if it uses the swapped predicate or
11947 // alternate operation.
11948 for (auto [Idx, V] : enumerate(VL)) {
11949 if (isa<PoisonValue>(V))
11950 continue;
11951 auto *Cmp = cast<CmpInst>(V);
11952
11953 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11954 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11955 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11956 } else {
11957 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11958 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11959 }
11960 }
11961 TE->setOperands(Operands);
11962 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11963 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11964 return;
11965 }
11966
11967 if (isa<BinaryOperator>(VL0) || CI) {
11968 VLOperands Ops(VL, Operands, S, *this);
11969 Ops.reorder();
11970 Operands[0] = Ops.getVL(0);
11971 Operands[1] = Ops.getVL(1);
11972 }
11973 TE->setOperands(Operands);
11974 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11975 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11976 return;
11977 }
11978 default:
11979 break;
11980 }
11981 llvm_unreachable("Unexpected vectorization of the instructions.");
11982}
11983
11984unsigned BoUpSLP::canMapToVector(Type *T) const {
11985 unsigned N = 1;
11986 Type *EltTy = T;
11987
11988 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
11989 if (EltTy->isEmptyTy())
11990 return 0;
11991 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11992 // Check that struct is homogeneous.
11993 for (const auto *Ty : ST->elements())
11994 if (Ty != *ST->element_begin())
11995 return 0;
11996 N *= ST->getNumElements();
11997 EltTy = *ST->element_begin();
11998 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11999 N *= AT->getNumElements();
12000 EltTy = AT->getElementType();
12001 } else {
12002 auto *VT = cast<FixedVectorType>(EltTy);
12003 N *= VT->getNumElements();
12004 EltTy = VT->getElementType();
12005 }
12006 }
12007
12008 if (!isValidElementType(EltTy))
12009 return 0;
12010 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12011 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12012 VTSize != DL->getTypeStoreSizeInBits(T))
12013 return 0;
12014 return N;
12015}
12016
12017bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12018 SmallVectorImpl<unsigned> &CurrentOrder,
12019 bool ResizeAllowed) const {
12020 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12021 assert(It != VL.end() && "Expected at least one extract instruction.");
12022 auto *E0 = cast<Instruction>(*It);
12023 assert(
12025 "Invalid opcode");
12026 // Check if all of the extracts come from the same vector and from the
12027 // correct offset.
12028 Value *Vec = E0->getOperand(0);
12029
12030 CurrentOrder.clear();
12031
12032 // We have to extract from a vector/aggregate with the same number of elements.
12033 unsigned NElts;
12034 if (E0->getOpcode() == Instruction::ExtractValue) {
12035 NElts = canMapToVector(Vec->getType());
12036 if (!NElts)
12037 return false;
12038 // Check if load can be rewritten as load of vector.
12039 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12040 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12041 return false;
12042 } else {
12043 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12044 }
12045
12046 unsigned E = VL.size();
12047 if (!ResizeAllowed && NElts != E)
12048 return false;
12049 SmallVector<int> Indices(E, PoisonMaskElem);
12050 unsigned MinIdx = NElts, MaxIdx = 0;
12051 for (auto [I, V] : enumerate(VL)) {
12052 auto *Inst = dyn_cast<Instruction>(V);
12053 if (!Inst)
12054 continue;
12055 if (Inst->getOperand(0) != Vec)
12056 return false;
12057 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12058 if (isa<UndefValue>(EE->getIndexOperand()))
12059 continue;
12060 std::optional<unsigned> Idx = getExtractIndex(Inst);
12061 if (!Idx)
12062 return false;
12063 const unsigned ExtIdx = *Idx;
12064 if (ExtIdx >= NElts)
12065 continue;
12066 Indices[I] = ExtIdx;
12067 if (MinIdx > ExtIdx)
12068 MinIdx = ExtIdx;
12069 if (MaxIdx < ExtIdx)
12070 MaxIdx = ExtIdx;
12071 }
12072 if (MaxIdx - MinIdx + 1 > E)
12073 return false;
12074 if (MaxIdx + 1 <= E)
12075 MinIdx = 0;
12076
12077 // Check that all of the indices extract from the correct offset.
12078 bool ShouldKeepOrder = true;
12079 // Assign to all items the initial value E so we can check if the extract
12080 // instruction index was used already.
12081 // Also, later we can check that all the indices are used and we have a
12082 // consecutive access in the extract instructions, by checking that no
12083 // element of CurrentOrder still has value E.
12084 CurrentOrder.assign(E, E);
12085 for (unsigned I = 0; I < E; ++I) {
12086 if (Indices[I] == PoisonMaskElem)
12087 continue;
12088 const unsigned ExtIdx = Indices[I] - MinIdx;
12089 if (CurrentOrder[ExtIdx] != E) {
12090 CurrentOrder.clear();
12091 return false;
12092 }
12093 ShouldKeepOrder &= ExtIdx == I;
12094 CurrentOrder[ExtIdx] = I;
12095 }
12096 if (ShouldKeepOrder)
12097 CurrentOrder.clear();
12098
12099 return ShouldKeepOrder;
12100}
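// Illustrative sketch (the IR below is assumed, not taken from a real test):
// for extracts that all read the same source vector, the function records the
// permutation that would make the extracted lanes consecutive.
// \code
//   VL = { extractelement <4 x float> %v, i64 2,
//          extractelement <4 x float> %v, i64 3,
//          extractelement <4 x float> %v, i64 0,
//          extractelement <4 x float> %v, i64 1 }
//   => returns false, CurrentOrder = {2, 3, 0, 1} (reordering required)
//   VL with indices 0, 1, 2, 3 => returns true, CurrentOrder cleared (identity)
// \endcode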
12101
12102bool BoUpSLP::areAllUsersVectorized(
12103 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12104 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12105 all_of(I->users(), [this](User *U) {
12106 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12107 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12108 });
12109}
12110
12111void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12112 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12113 SmallVectorImpl<Value *> *OpScalars,
12114 SmallVectorImpl<Value *> *AltScalars) const {
12115 unsigned Sz = Scalars.size();
12116 Mask.assign(Sz, PoisonMaskElem);
12117 SmallVector<int> OrderMask;
12118 if (!ReorderIndices.empty())
12119 inversePermutation(ReorderIndices, OrderMask);
12120 for (unsigned I = 0; I < Sz; ++I) {
12121 unsigned Idx = I;
12122 if (!ReorderIndices.empty())
12123 Idx = OrderMask[I];
12124 if (isa<PoisonValue>(Scalars[Idx]))
12125 continue;
12126 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12127 if (IsAltOp(OpInst)) {
12128 Mask[I] = Sz + Idx;
12129 if (AltScalars)
12130 AltScalars->push_back(OpInst);
12131 } else {
12132 Mask[I] = Idx;
12133 if (OpScalars)
12134 OpScalars->push_back(OpInst);
12135 }
12136 }
12137 if (!ReuseShuffleIndices.empty()) {
12138 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12139 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12140 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12141 });
12142 Mask.swap(NewMask);
12143 }
12144}
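// Illustrative sketch (Sz == 4, values assumed): for an add/sub alternating
// node with Scalars = {a0+b0, a1-b1, a2+b2, a3-b3} and IsAltOp selecting the
// subtractions, lane I is taken from the "main" vector for adds and lane
// Sz + I from the "alternate" vector for subs:
// \code
//   Mask = {0, 5, 2, 7}   // OpScalars = the adds, AltScalars = the subs
// \endcode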
12145
12146 static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12147 Instruction *AltOp,
12148 const TargetLibraryInfo &TLI) {
12149 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12150}
12151
12152 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12153 Instruction *AltOp,
12154 const TargetLibraryInfo &TLI) {
12155 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12156 auto *AltCI = cast<CmpInst>(AltOp);
12157 CmpInst::Predicate MainP = MainCI->getPredicate();
12158 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12159 assert(MainP != AltP && "Expected different main/alternate predicates.");
12160 auto *CI = cast<CmpInst>(I);
12161 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12162 return false;
12163 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12164 return true;
12165 CmpInst::Predicate P = CI->getPredicate();
12166 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12167
12168 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12169 "CmpInst expected to match either main or alternate predicate or "
12170 "their swap.");
12171 return MainP != P && MainP != SwappedP;
12172 }
12173 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12174}
12175
12176TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12177 assert(!Ops.empty());
12178 const auto *Op0 = Ops.front();
12179
12180 const bool IsConstant = all_of(Ops, [](Value *V) {
12181 // TODO: We should allow undef elements here
12182 return isConstant(V) && !isa<UndefValue>(V);
12183 });
12184 const bool IsUniform = all_of(Ops, [=](Value *V) {
12185 // TODO: We should allow undef elements here
12186 return V == Op0;
12187 });
12188 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12189 // TODO: We should allow undef elements here
12190 if (auto *CI = dyn_cast<ConstantInt>(V))
12191 return CI->getValue().isPowerOf2();
12192 return false;
12193 });
12194 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12195 // TODO: We should allow undef elements here
12196 if (auto *CI = dyn_cast<ConstantInt>(V))
12197 return CI->getValue().isNegatedPowerOf2();
12198 return false;
12199 });
12200
12201 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12202 if (IsConstant && IsUniform)
12203 VK = TTI::OK_UniformConstantValue;
12204 else if (IsConstant)
12205 VK = TTI::OK_NonUniformConstantValue;
12206 else if (IsUniform)
12207 VK = TTI::OK_UniformValue;
12208
12209 TTI::OperandValueProperties VP = TTI::OP_None;
12210 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12211 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12212
12213 return {VK, VP};
12214}
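// Illustrative sketch (the constants below are assumed): the returned
// {OperandValueKind, OperandValueProperties} pair summarizes the operand set.
// \code
//   {4, 4, 4, 4}      -> {OK_UniformConstantValue,    OP_PowerOf2}
//   {1, 2, 4, 8}      -> {OK_NonUniformConstantValue, OP_PowerOf2}
//   {%x, %x, %x, %x}  -> {OK_UniformValue,            OP_None}
// \endcode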
12215
12216namespace {
12217/// The base class for shuffle instruction emission and shuffle cost estimation.
12218class BaseShuffleAnalysis {
12219protected:
12220 Type *ScalarTy = nullptr;
12221
12222 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12223
12224 /// V is expected to be a vectorized value.
12225 /// When REVEC is disabled, there is no difference between VF and
12226 /// VNumElements.
12227 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12228 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12229 /// of 8.
12230 unsigned getVF(Value *V) const {
12231 assert(V && "V cannot be nullptr");
12232 assert(isa<FixedVectorType>(V->getType()) &&
12233 "V does not have FixedVectorType");
12234 assert(ScalarTy && "ScalarTy cannot be nullptr");
12235 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12236 unsigned VNumElements =
12237 cast<FixedVectorType>(V->getType())->getNumElements();
12238 assert(VNumElements > ScalarTyNumElements &&
12239 "the number of elements of V is not large enough");
12240 assert(VNumElements % ScalarTyNumElements == 0 &&
12241 "the number of elements of V is not a vectorized value");
12242 return VNumElements / ScalarTyNumElements;
12243 }
12244
12245 /// Checks if the mask is an identity mask.
12246 /// \param IsStrict if true, the function returns false if the mask size does
12247 /// not match vector size.
12248 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12249 bool IsStrict) {
12250 int Limit = Mask.size();
12251 int VF = VecTy->getNumElements();
12252 int Index = -1;
12253 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12254 return true;
12255 if (!IsStrict) {
12256 // Consider extract subvector starting from index 0.
12257 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12258 Index == 0)
12259 return true;
12260 // All VF-size submasks are identity (e.g.
12261 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12262 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12263 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12264 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12265 ShuffleVectorInst::isIdentityMask(Slice, VF);
12266 }))
12267 return true;
12268 }
12269 return false;
12270 }
12271
12272 /// Tries to combine 2 different masks into a single one.
12273 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12274 /// change the size of the vector, \p LocalVF is the original size of the
12275 /// shuffled vector.
12276 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12277 ArrayRef<int> ExtMask) {
12278 unsigned VF = Mask.size();
12279 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12280 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12281 if (ExtMask[I] == PoisonMaskElem)
12282 continue;
12283 int MaskedIdx = Mask[ExtMask[I] % VF];
12284 NewMask[I] =
12285 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12286 }
12287 Mask.swap(NewMask);
12288 }
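// Illustrative sketch (LocalVF == 4, masks assumed): combineMasks composes
// \p ExtMask on top of \p Mask, i.e. NewMask[I] = Mask[ExtMask[I]]:
// \code
//   Mask    = {1, 0, 3, 2}
//   ExtMask = {2, 3, 0, 1}
//   result  = {3, 2, 1, 0}
// \endcode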
12289
12290 /// Looks through shuffles trying to reduce final number of shuffles in the
12291 /// code. The function looks through the previously emitted shuffle
12292 /// instructions and properly marks indices in the mask as undef.
12293 /// For example, given the code
12294 /// \code
12295 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12296 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12297 /// \endcode
12298 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12299 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12300 /// <0, 1, 2, 3> for the shuffle.
12301 /// If 2 operands are of different size, the smallest one will be resized and
12302 /// the mask recalculated properly.
12303 /// For example, given the code
12304 /// \code
12305 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12306 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12307 /// \endcode
12308 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12309 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12310 /// <0, 1, 2, 3> for the shuffle.
12311 /// So, it tries to transform permutations to simple vector merge, if
12312 /// possible.
12313 /// \param V The input vector which must be shuffled using the given \p Mask.
12314 /// If the better candidate is found, \p V is set to this best candidate
12315 /// vector.
12316 /// \param Mask The input mask for the shuffle. If the best candidate is found
12317 /// during looking-through-shuffles attempt, it is updated accordingly.
12318 /// \param SinglePermute true if the shuffle operation is originally a
12319 /// single-value-permutation. In this case the look-through-shuffles procedure
12320 /// may look for resizing shuffles as the best candidates.
12321 /// \return true if the shuffle results in the non-resizing identity shuffle
12322 /// (and thus can be ignored), false - otherwise.
12323 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12324 bool SinglePermute) {
12325 Value *Op = V;
12326 ShuffleVectorInst *IdentityOp = nullptr;
12327 SmallVector<int> IdentityMask;
12328 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12329 // Exit if not a fixed vector type or changing size shuffle.
12330 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12331 if (!SVTy)
12332 break;
12333 // Remember the identity or broadcast mask, if it is not a resizing
12334 // shuffle. If no better candidates are found, this Op and Mask will be
12335 // used in the final shuffle.
12336 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12337 if (!IdentityOp || !SinglePermute ||
12338 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12339 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12340 IdentityMask.size()))) {
12341 IdentityOp = SV;
12342 // Store the current mask in IdentityMask so that we do not lose
12343 // this info if IdentityOp is selected as the best candidate for the
12344 // permutation.
12345 IdentityMask.assign(Mask);
12346 }
12347 }
12348 // Remember the broadcast mask. If no better candidates are found, this Op
12349 // and Mask will be used in the final shuffle.
12350 // Zero splat can be used as identity too, since it might be used with
12351 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12352 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12353 // expensive, and the analysis finds out that the source vector is just a
12354 // broadcast, the original mask can be transformed to the identity mask <0,
12355 // 1, 2, 3>.
12356 // \code
12357 // %0 = shuffle %v, poison, zeroinitalizer
12358 // %res = shuffle %0, poison, <3, 1, 2, 0>
12359 // \endcode
12360 // may be transformed to
12361 // \code
12362 // %0 = shuffle %v, poison, zeroinitalizer
12363 // %res = shuffle %0, poison, <0, 1, 2, 3>
12364 // \endcode
12365 if (SV->isZeroEltSplat()) {
12366 IdentityOp = SV;
12367 IdentityMask.assign(Mask);
12368 }
12369 int LocalVF = Mask.size();
12370 if (auto *SVOpTy =
12371 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12372 LocalVF = SVOpTy->getNumElements();
12373 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12374 for (auto [Idx, I] : enumerate(Mask)) {
12375 if (I == PoisonMaskElem ||
12376 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12377 continue;
12378 ExtMask[Idx] = SV->getMaskValue(I);
12379 }
12380 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12381 SV->getOperand(0),
12382 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12383 .all();
12384 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12385 SV->getOperand(1),
12386 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12387 .all();
12388 if (!IsOp1Undef && !IsOp2Undef) {
12389 // Update mask and mark undef elems.
12390 for (int &I : Mask) {
12391 if (I == PoisonMaskElem)
12392 continue;
12393 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12394 PoisonMaskElem)
12395 I = PoisonMaskElem;
12396 }
12397 break;
12398 }
12399 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12400 combineMasks(LocalVF, ShuffleMask, Mask);
12401 Mask.swap(ShuffleMask);
12402 if (IsOp2Undef)
12403 Op = SV->getOperand(0);
12404 else
12405 Op = SV->getOperand(1);
12406 }
12407 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12408 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12409 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12410 if (IdentityOp) {
12411 V = IdentityOp;
12412 assert(Mask.size() == IdentityMask.size() &&
12413 "Expected masks of same sizes.");
12414 // Clear known poison elements.
12415 for (auto [I, Idx] : enumerate(Mask))
12416 if (Idx == PoisonMaskElem)
12417 IdentityMask[I] = PoisonMaskElem;
12418 Mask.swap(IdentityMask);
12419 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12420 return SinglePermute &&
12421 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12422 /*IsStrict=*/true) ||
12423 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12424 Shuffle->isZeroEltSplat() &&
12426 all_of(enumerate(Mask), [&](const auto &P) {
12427 return P.value() == PoisonMaskElem ||
12428 Shuffle->getShuffleMask()[P.index()] == 0;
12429 })));
12430 }
12431 V = Op;
12432 return false;
12433 }
12434 V = Op;
12435 return true;
12436 }
12437
12438 /// Smart shuffle instruction emission, walks through shuffle trees and
12439 /// tries to find the best matching vector for the actual shuffle
12440 /// instruction.
12441 template <typename T, typename ShuffleBuilderTy>
12442 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12443 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12444 assert(V1 && "Expected at least one vector value.");
12445 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12446 SmallVector<int> NewMask(Mask);
12447 if (ScalarTyNumElements != 1) {
12448 assert(SLPReVec && "FixedVectorType is not expected.");
12449 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12450 Mask = NewMask;
12451 }
12452 if (V2)
12453 Builder.resizeToMatch(V1, V2);
12454 int VF = Mask.size();
12455 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12456 VF = FTy->getNumElements();
12457 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12458 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12459 .all()) {
12460 // Peek through shuffles.
12461 Value *Op1 = V1;
12462 Value *Op2 = V2;
12463 int VF =
12464 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12465 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12466 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12467 for (int I = 0, E = Mask.size(); I < E; ++I) {
12468 if (Mask[I] < VF)
12469 CombinedMask1[I] = Mask[I];
12470 else
12471 CombinedMask2[I] = Mask[I] - VF;
12472 }
12473 Value *PrevOp1;
12474 Value *PrevOp2;
12475 do {
12476 PrevOp1 = Op1;
12477 PrevOp2 = Op2;
12478 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12479 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12480 // Check if we have 2 resizing shuffles - need to peek through operands
12481 // again.
12482 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12483 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12484 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12485 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12486 if (I == PoisonMaskElem)
12487 continue;
12488 ExtMask1[Idx] = SV1->getMaskValue(I);
12489 }
12490 SmallBitVector UseMask1 = buildUseMask(
12491 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12492 ->getNumElements(),
12493 ExtMask1, UseMask::SecondArg);
12494 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12495 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12496 if (I == PoisonMaskElem)
12497 continue;
12498 ExtMask2[Idx] = SV2->getMaskValue(I);
12499 }
12500 SmallBitVector UseMask2 = buildUseMask(
12501 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12502 ->getNumElements(),
12503 ExtMask2, UseMask::SecondArg);
12504 if (SV1->getOperand(0)->getType() ==
12505 SV2->getOperand(0)->getType() &&
12506 SV1->getOperand(0)->getType() != SV1->getType() &&
12507 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12508 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12509 Op1 = SV1->getOperand(0);
12510 Op2 = SV2->getOperand(0);
12511 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12512 int LocalVF = ShuffleMask1.size();
12513 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12514 LocalVF = FTy->getNumElements();
12515 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12516 CombinedMask1.swap(ShuffleMask1);
12517 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12518 LocalVF = ShuffleMask2.size();
12519 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12520 LocalVF = FTy->getNumElements();
12521 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12522 CombinedMask2.swap(ShuffleMask2);
12523 }
12524 }
12525 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12526 Builder.resizeToMatch(Op1, Op2);
12527 VF = std::max(cast<VectorType>(Op1->getType())
12528 ->getElementCount()
12529 .getKnownMinValue(),
12530 cast<VectorType>(Op2->getType())
12531 ->getElementCount()
12532 .getKnownMinValue());
12533 for (int I = 0, E = Mask.size(); I < E; ++I) {
12534 if (CombinedMask2[I] != PoisonMaskElem) {
12535 assert(CombinedMask1[I] == PoisonMaskElem &&
12536 "Expected undefined mask element");
12537 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12538 }
12539 }
12540 if (Op1 == Op2 &&
12541 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12542 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12543 isa<ShuffleVectorInst>(Op1) &&
12544 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12545 ArrayRef(CombinedMask1))))
12546 return Builder.createIdentity(Op1);
12547 return Builder.createShuffleVector(
12548 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12549 CombinedMask1);
12550 }
12551 if (isa<PoisonValue>(V1))
12552 return Builder.createPoison(
12553 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12554 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12555 assert(V1 && "Expected non-null value after looking through shuffles.");
12556
12557 if (!IsIdentity)
12558 return Builder.createShuffleVector(V1, NewMask);
12559 return Builder.createIdentity(V1);
12560 }
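// Illustrative sketch (the IR below is assumed): when the composed mask turns
// out to be an identity over a single source, no new shuffle is emitted and
// the source vector is reused via Builder.createIdentity:
// \code
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   ; request: shuffle %s with mask <3, 2, 1, 0>
//   ; peekThroughShuffles composes the masks into <0, 1, 2, 3> over %v,
//   ; so %v is returned directly instead of emitting another shuffle.
// \endcode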
12561
12562 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12563 /// shuffle emission.
12564 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12565 ArrayRef<int> Mask) {
12566 for (unsigned I : seq<unsigned>(CommonMask.size()))
12567 if (Mask[I] != PoisonMaskElem)
12568 CommonMask[I] = I;
12569 }
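// Illustrative sketch (masks assumed, P = PoisonMaskElem): after a shuffle for
// \p Mask has been emitted, every element that shuffle produced now lives at
// its own position in the new vector, so CommonMask is rewritten to point there:
// \code
//   CommonMask = {4, 5, 1, 0}, Mask = {4, 5, P, P}
//   => CommonMask = {0, 1, 1, 0}
// \endcode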
12570};
12571} // namespace
12572
12573/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
12574static std::pair<InstructionCost, InstructionCost>
12575 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12576 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12577 Type *ScalarTy, VectorType *VecTy) {
12578 InstructionCost ScalarCost = 0;
12579 InstructionCost VecCost = 0;
12580 // Here we differentiate two cases: (1) when Ptrs represent a regular
12581 // vectorization tree node (as they are pointer arguments of scattered
12582 // loads) or (2) when Ptrs are the arguments of loads or stores being
12583 // vectorized as a plain wide unit-stride load/store since all the
12584 // loads/stores are known to be from/to adjacent locations.
12585 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12586 // Case 2: estimate costs for pointer related costs when vectorizing to
12587 // a wide load/store.
12588 // Scalar cost is estimated as a set of pointers with known relationship
12589 // between them.
12590 // For vector code we will use BasePtr as argument for the wide load/store
12591 // but we also need to account all the instructions which are going to
12592 // stay in vectorized code due to uses outside of these scalar
12593 // loads/stores.
12594 ScalarCost = TTI.getPointersChainCost(
12595 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12596 CostKind);
12597
12598 SmallVector<const Value *> PtrsRetainedInVecCode;
12599 for (Value *V : Ptrs) {
12600 if (V == BasePtr) {
12601 PtrsRetainedInVecCode.push_back(V);
12602 continue;
12603 }
12604 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12605 // For simplicity assume Ptr to stay in vectorized code if it's not a
12606 // GEP instruction. We don't care since its cost is considered free.
12607 // TODO: We should check for any uses outside of vectorizable tree
12608 // rather than just single use.
12609 if (!Ptr || !Ptr->hasOneUse())
12610 PtrsRetainedInVecCode.push_back(V);
12611 }
12612
12613 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12614 // If all pointers stay in vectorized code then we don't have
12615 // any savings on that.
12616 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12617 }
12618 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12619 TTI::PointersChainInfo::getKnownStride(),
12620 VecTy, CostKind);
12621 } else {
12622 // Case 1: Ptrs are the arguments of loads that we are going to transform
12623 // into masked gather load intrinsic.
12624 // All the scalar GEPs will be removed as a result of vectorization.
12626 // For any external uses of some lanes, extractelement instructions will
12627 // be generated (their cost is estimated separately).
12627 TTI::PointersChainInfo PtrsInfo =
12628 all_of(Ptrs,
12629 [](const Value *V) {
12630 const auto *Ptr = dyn_cast<GEPOperator>(V);
12631 return Ptr && !Ptr->hasAllConstantIndices();
12632 })
12633 ? TTI::PointersChainInfo::getUnknownStride()
12634 : TTI::PointersChainInfo::getKnownStride();
12635
12636 ScalarCost =
12637 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12638 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12639 if (!BaseGEP) {
12640 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12641 if (It != Ptrs.end())
12642 BaseGEP = cast<GEPOperator>(*It);
12643 }
12644 if (BaseGEP) {
12645 SmallVector<const Value *> Indices(BaseGEP->indices());
12646 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12647 BaseGEP->getPointerOperand(), Indices, VecTy,
12648 CostKind);
12649 }
12650 }
12651
12652 return std::make_pair(ScalarCost, VecCost);
12653}
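// Illustrative sketch (the IR below is assumed): for four adjacent loads that
// are vectorized into one wide load, the scalar cost covers the whole GEP
// chain while the vector cost only keeps pointers that survive vectorization
// (here only the base pointer), so the folded GEP chain counts as a saving.
// \code
//   %p1 = getelementptr inbounds float, ptr %base, i64 1
//   %p2 = getelementptr inbounds float, ptr %base, i64 2
//   %p3 = getelementptr inbounds float, ptr %base, i64 3
//   ; scalar: 4 loads from %base, %p1..%p3
//   ; vector: one <4 x float> load from %base, single-use GEPs removed
// \endcode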
12654
12655void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12656 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12657 "Expected gather node without reordering.");
12658 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12659 SmallSet<size_t, 2> LoadKeyUsed;
12660
12661 // Do not reorder the node if it is small (just 2 elements), all-constant, or
12662 // all instructions already have the same opcode.
12663 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12664 all_of(TE.Scalars, isConstant))
12665 return;
12666
12667 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12668 return VectorizableTree[Idx]->isSame(TE.Scalars);
12669 }))
12670 return;
12671
12672 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12673 Key = hash_combine(hash_value(LI->getParent()), Key);
12674 Value *Ptr =
12675 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12676 if (LoadKeyUsed.contains(Key)) {
12677 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12678 if (LIt != LoadsMap.end()) {
12679 for (LoadInst *RLI : LIt->second) {
12680 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12681 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12682 /*StrictCheck=*/true))
12683 return hash_value(RLI->getPointerOperand());
12684 }
12685 for (LoadInst *RLI : LIt->second) {
12686 if (arePointersCompatible(RLI->getPointerOperand(),
12687 LI->getPointerOperand(), *TLI)) {
12688 hash_code SubKey = hash_value(RLI->getPointerOperand());
12689 return SubKey;
12690 }
12691 }
12692 if (LIt->second.size() > 2) {
12693 hash_code SubKey =
12694 hash_value(LIt->second.back()->getPointerOperand());
12695 return SubKey;
12696 }
12697 }
12698 }
12699 LoadKeyUsed.insert(Key);
12700 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12701 return hash_value(LI->getPointerOperand());
12702 };
12703 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12704 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12705 bool IsOrdered = true;
12706 unsigned NumInstructions = 0;
12707 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12708 // nodes.
12709 for (auto [I, V] : enumerate(TE.Scalars)) {
12710 size_t Key = 1, Idx = 1;
12711 if (auto *Inst = dyn_cast<Instruction>(V);
12713 !isDeleted(Inst) && !isVectorized(V)) {
12714 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12715 /*AllowAlternate=*/false);
12716 ++NumInstructions;
12717 }
12718 auto &Container = SortedValues[Key];
12719 if (IsOrdered && !KeyToIndex.contains(V) &&
12720 !(isa<Constant, ExtractElementInst>(V) ||
12721 isVectorLikeInstWithConstOps(V)) &&
12722 ((Container.contains(Idx) &&
12723 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12724 (!Container.empty() && !Container.contains(Idx) &&
12725 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12726 IsOrdered = false;
12727 auto &KTI = KeyToIndex[V];
12728 if (KTI.empty())
12729 Container[Idx].push_back(V);
12730 KTI.push_back(I);
12731 }
12732 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12733 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12734 if (!IsOrdered && NumInstructions > 1) {
12735 unsigned Cnt = 0;
12736 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12737 for (const auto &D : SortedValues) {
12738 for (const auto &P : D.second) {
12739 unsigned Sz = 0;
12740 for (Value *V : P.second) {
12741 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12742 for (auto [K, Idx] : enumerate(Indices)) {
12743 TE.ReorderIndices[Cnt + K] = Idx;
12744 TE.Scalars[Cnt + K] = V;
12745 }
12746 Sz += Indices.size();
12747 Cnt += Indices.size();
12748 }
12749 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12750 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12751 *TTI, TE.Scalars.front()->getType(), Sz);
12752 SubVectors.emplace_back(Cnt - Sz, SubVF);
12753 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12754 DemandedElts.clearBit(I);
12755 } else if (!P.second.empty() && isConstant(P.second.front())) {
12756 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12757 DemandedElts.clearBit(I);
12758 }
12759 }
12760 }
12761 }
12762 // Reuses always require shuffles, so consider it as profitable.
12763 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12764 return;
12765 // Do simple cost estimation.
12766 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12767 InstructionCost Cost = TTI::TCC_Free;
12768 auto *ScalarTy = TE.Scalars.front()->getType();
12769 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12770 for (auto [Idx, Sz] : SubVectors) {
12771 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12772 Idx, getWidenedType(ScalarTy, Sz));
12773 }
12774 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12775 /*Insert=*/true,
12776 /*Extract=*/false, CostKind);
12777 int Sz = TE.Scalars.size();
12778 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12779 TE.ReorderIndices.end());
12780 for (unsigned I : seq<unsigned>(Sz)) {
12781 Value *V = TE.getOrdered(I);
12782 if (isa<PoisonValue>(V)) {
12783 ReorderMask[I] = PoisonMaskElem;
12784 } else if (isConstant(V) || DemandedElts[I]) {
12785 ReorderMask[I] = I + TE.ReorderIndices.size();
12786 }
12787 }
12788 Cost += ::getShuffleCost(*TTI,
12789 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12790 ? TTI::SK_PermuteTwoSrc
12791 : TTI::SK_PermuteSingleSrc,
12792 VecTy, ReorderMask);
12793 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12794 ReorderMask.assign(Sz, PoisonMaskElem);
12795 for (unsigned I : seq<unsigned>(Sz)) {
12796 Value *V = TE.getOrdered(I);
12797 if (isConstant(V)) {
12798 DemandedElts.clearBit(I);
12799 if (!isa<PoisonValue>(V))
12800 ReorderMask[I] = I;
12801 } else {
12802 ReorderMask[I] = I + Sz;
12803 }
12804 }
12805 InstructionCost BVCost =
12806 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12807 /*Insert=*/true, /*Extract=*/false, CostKind);
12808 if (!DemandedElts.isAllOnes())
12809 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12810 if (Cost >= BVCost) {
12811 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12812 reorderScalars(TE.Scalars, Mask);
12813 TE.ReorderIndices.clear();
12814 }
12815}
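// Illustrative sketch (scalars assumed): a gather node whose scalars
// interleave two unrelated load chains may be reordered so that each cluster
// can later form its own vectorizable subnode, as long as the estimated cost
// does not exceed the plain buildvector cost:
// \code
//   Scalars = {a0, b0, a1, b1}           // a*, b* from two different chains
//   after clustering:
//   Scalars = {a0, a1, b0, b1}, ReorderIndices = {0, 2, 1, 3}
// \endcode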
12816
12817/// Check if we can convert fadd/fsub sequence to FMAD.
12818/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
12819 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12820 const InstructionsState &S,
12821 DominatorTree &DT, const DataLayout &DL,
12822 const TargetTransformInfo &TTI,
12823 const TargetLibraryInfo &TLI) {
12824 assert(all_of(VL,
12825 [](Value *V) {
12826 return V->getType()->getScalarType()->isFloatingPointTy();
12827 }) &&
12828 "Can only convert to FMA for floating point types");
12829 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12830
12831 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12832 FastMathFlags FMF;
12833 FMF.set();
12834 for (Value *V : VL) {
12835 auto *I = dyn_cast<Instruction>(V);
12836 if (!I)
12837 continue;
12838 if (S.isCopyableElement(I))
12839 continue;
12840 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12841 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12842 continue;
12843 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12844 FMF &= FPCI->getFastMathFlags();
12845 }
12846 return FMF.allowContract();
12847 };
12848 if (!CheckForContractable(VL))
12849 return InstructionCost::getInvalid();
12850 // The fmul also has to be contractable.
12851 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12852 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12853
12854 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12855 if (!OpS.valid())
12856 return InstructionCost::getInvalid();
12857
12858 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12859 return InstructionCost::getInvalid();
12860 if (!CheckForContractable(Operands.front()))
12861 return InstructionCost::getInvalid();
12862 // Compare the costs.
12863 InstructionCost FMulPlusFAddCost = 0;
12864 InstructionCost FMACost = 0;
12865 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12866 FastMathFlags FMF;
12867 FMF.set();
12868 for (Value *V : VL) {
12869 auto *I = dyn_cast<Instruction>(V);
12870 if (!I)
12871 continue;
12872 if (!S.isCopyableElement(I))
12873 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12874 FMF &= FPCI->getFastMathFlags();
12875 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12876 }
12877 unsigned NumOps = 0;
12878 for (auto [V, Op] : zip(VL, Operands.front())) {
12879 if (S.isCopyableElement(V))
12880 continue;
12881 auto *I = dyn_cast<Instruction>(Op);
12882 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12883 if (auto *OpI = dyn_cast<Instruction>(V))
12884 FMACost += TTI.getInstructionCost(OpI, CostKind);
12885 if (I)
12886 FMACost += TTI.getInstructionCost(I, CostKind);
12887 continue;
12888 }
12889 ++NumOps;
12890 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12891 FMF &= FPCI->getFastMathFlags();
12892 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12893 }
12894 Type *Ty = VL.front()->getType();
12895 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12896 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12897 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12898}
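// Illustrative sketch (the IR below is assumed): a single-use fmul feeding an
// fadd, both carrying the 'contract' fast-math flag, is the pattern this
// helper prices against the equivalent fused call:
// \code
//   %m = fmul contract float %a, %b
//   %r = fadd contract float %m, %c
//   ; compared against:
//   %r = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
// \endcode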
12899
12900 void BoUpSLP::transformNodes() {
12901 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12902 BaseGraphSize = VectorizableTree.size();
12903 // Turn graph transforming mode on and off, when done.
12904 class GraphTransformModeRAAI {
12905 bool &SavedIsGraphTransformMode;
12906
12907 public:
12908 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12909 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12910 IsGraphTransformMode = true;
12911 }
12912 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12913 } TransformContext(IsGraphTransformMode);
12914 // Operands are profitable if they are:
12915 // 1. At least one constant
12916 // or
12917 // 2. Splats
12918 // or
12919 // 3. Results in good vectorization opportunity, i.e. may generate vector
12920 // nodes and reduce cost of the graph.
12921 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12922 const InstructionsState &S) {
12923 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12924 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12925 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12926 I2->getOperand(Op));
12927 return all_of(
12928 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12929 return all_of(Cand,
12930 [](const std::pair<Value *, Value *> &P) {
12931 return isa<Constant>(P.first) ||
12932 isa<Constant>(P.second) || P.first == P.second;
12933 }) ||
12934 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12935 });
12936 };
12937
12938 // Try to reorder gather nodes for better vectorization opportunities.
12939 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12940 TreeEntry &E = *VectorizableTree[Idx];
12941 if (E.isGather())
12942 reorderGatherNode(E);
12943 }
12944
12945 // Better to use the full gathered-loads analysis if there are only 2 gathered
12946 // load nodes, each having fewer than 16 elements.
12947 constexpr unsigned VFLimit = 16;
12948 bool ForceLoadGather =
12949 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12950 return TE->isGather() && TE->hasState() &&
12951 TE->getOpcode() == Instruction::Load &&
12952 TE->getVectorFactor() < VFLimit;
12953 }) == 2;
12954
12955 // Checks if the scalars are used in another node.
12956 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12957 function_ref<bool(Value *)> CheckContainer) {
12958 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12959 if (isa<PoisonValue>(V))
12960 return true;
12961 auto *I = dyn_cast<Instruction>(V);
12962 if (!I)
12963 return false;
12964 return is_contained(TE->Scalars, I) || CheckContainer(I);
12965 });
12966 };
12967 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12968 if (E.hasState()) {
12969 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12970 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12971 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12972 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12973 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12974 return is_contained(TEs, TE);
12975 });
12976 });
12977 }))
12978 return true;
12979 ;
12980 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12981 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12982 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12983 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12984 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12985 return is_contained(TEs, TE);
12986 });
12987 });
12988 }))
12989 return true;
12990 } else {
12991 // Check if the gather node is a full copy of a split node.
12992 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12993 if (It != E.Scalars.end()) {
12994 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12995 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12996 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12997 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12998 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12999 return is_contained(TEs, TE);
13000 });
13001 });
13002 }))
13003 return true;
13004 }
13005 }
13006 return false;
13007 };
13008 // The tree may grow here, so iterate over nodes, built before.
13009 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13010 TreeEntry &E = *VectorizableTree[Idx];
13011 if (E.isGather()) {
13012 ArrayRef<Value *> VL = E.Scalars;
13013 const unsigned Sz = getVectorElementSize(VL.front());
13014 unsigned MinVF = getMinVF(2 * Sz);
13015 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13016 // same opcode and same parent block or all constants.
13017 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13018 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13019 // We use allSameOpcode instead of isAltShuffle because we don't
13020 // want to use interchangeable instruction here.
13021 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13022 allConstant(VL) || isSplat(VL))
13023 continue;
13024 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13025 continue;
13026 // Check if the node is a copy of other vector nodes.
13027 if (CheckForSameVectorNodes(E))
13028 continue;
13029 // Try to find vectorizable sequences and transform them into a series of
13030 // insertvector instructions.
13031 unsigned StartIdx = 0;
13032 unsigned End = VL.size();
13033 for (unsigned VF = getFloorFullVectorNumberOfElements(
13034 *TTI, VL.front()->getType(), VL.size() - 1);
13035 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13036 *TTI, VL.front()->getType(), VF - 1)) {
13037 if (StartIdx + VF > End)
13038 continue;
13039 SmallVector<std::pair<unsigned, unsigned>> Slices;
13040 bool AllStrided = true;
13041 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13042 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13043 // If any instruction is vectorized already - do not try again.
13044 // Reuse the existing node, if it fully matches the slice.
13045 if (isVectorized(Slice.front()) &&
13046 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13047 continue;
13048 // Constant already handled effectively - skip.
13049 if (allConstant(Slice))
13050 continue;
13051 // Do not try to vectorize small splats (smaller than a vector register and
13052 // with only a single non-undef element).
13053 bool IsSplat = isSplat(Slice);
13054 bool IsTwoRegisterSplat = true;
13055 if (IsSplat && VF == 2) {
13056 unsigned NumRegs2VF = ::getNumberOfParts(
13057 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13058 IsTwoRegisterSplat = NumRegs2VF == 2;
13059 }
13060 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13061 count(Slice, Slice.front()) ==
13062 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13063 : 1)) {
13064 if (IsSplat)
13065 continue;
13066 InstructionsState S = getSameOpcode(Slice, *TLI);
13067 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13068 (S.getOpcode() == Instruction::Load &&
13069 areKnownNonVectorizableLoads(Slice)) ||
13070 (S.getOpcode() != Instruction::Load &&
13071 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13072 continue;
13073 if (VF == 2) {
13074 // Try to vectorize reduced values or if all users are vectorized.
13075 // For expensive instructions extra extracts might be profitable.
13076 if ((!UserIgnoreList || E.Idx != 0) &&
13077 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13078 TTI::TCC_Expensive &&
13079 !all_of(Slice, [&](Value *V) {
13080 if (isa<PoisonValue>(V))
13081 return true;
13082 return areAllUsersVectorized(cast<Instruction>(V),
13083 UserIgnoreList);
13084 }))
13085 continue;
13086 if (S.getOpcode() == Instruction::Load) {
13087 OrdersType Order;
13088 SmallVector<Value *> PointerOps;
13089 StridedPtrInfo SPtrInfo;
13090 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13091 PointerOps, SPtrInfo);
13092 AllStrided &= Res == LoadsState::StridedVectorize ||
13093 Res == LoadsState::ScatterVectorize ||
13094 Res == LoadsState::Gather;
13095 // Do not vectorize gathers.
13096 if (Res == LoadsState::ScatterVectorize ||
13097 Res == LoadsState::Gather) {
13098 if (Res == LoadsState::Gather) {
13099 registerNonVectorizableLoads(Slice);
13100 // If reductions and the scalars from the root node are
13101 // analyzed - mark as non-vectorizable reduction.
13102 if (UserIgnoreList && E.Idx == 0)
13103 analyzedReductionVals(Slice);
13104 }
13105 continue;
13106 }
13107 } else if (S.getOpcode() == Instruction::ExtractElement ||
13108 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13109 TTI::TCC_Expensive &&
13110 !CheckOperandsProfitability(
13111 S.getMainOp(),
13112 cast<Instruction>(*find_if(reverse(Slice),
13113 IsaPred<Instruction>)),
13114 S))) {
13115 // Do not vectorize extractelements (handled effectively
13116 // already). Do not vectorize non-profitable instructions (with
13117 // low cost and non-vectorizable operands).
13118 continue;
13119 }
13120 }
13121 }
13122 Slices.emplace_back(Cnt, Slice.size());
13123 }
13124 // Do not try to vectorize if all slices are strided or gathered with
13125 // vector factor 2 and there are more than 2 slices. Better to handle
13126 // them in gathered loads analysis, may result in better vectorization.
13127 if (VF == 2 && AllStrided && Slices.size() > 2)
13128 continue;
13129 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13130 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13131 if (StartIdx == Cnt)
13132 StartIdx = Cnt + Sz;
13133 if (End == Cnt + Sz)
13134 End = Cnt;
13135 };
13136 for (auto [Cnt, Sz] : Slices) {
13137 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13138 const TreeEntry *SameTE = nullptr;
13139 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13140 It != Slice.end()) {
13141 // If any instruction is vectorized already - do not try again.
13142 SameTE = getSameValuesTreeEntry(*It, Slice);
13143 }
13144 unsigned PrevSize = VectorizableTree.size();
13145 [[maybe_unused]] unsigned PrevEntriesSize =
13146 LoadEntriesToVectorize.size();
13147 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13148 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13149 VectorizableTree[PrevSize]->isGather() &&
13150 VectorizableTree[PrevSize]->hasState() &&
13151 VectorizableTree[PrevSize]->getOpcode() !=
13152 Instruction::ExtractElement &&
13153 !isSplat(Slice)) {
13154 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13155 analyzedReductionVals(Slice);
13156 VectorizableTree.pop_back();
13157 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13158 "LoadEntriesToVectorize expected to remain the same");
13159 continue;
13160 }
13161 AddCombinedNode(PrevSize, Cnt, Sz);
13162 }
13163 }
13164 // Restore ordering, if no extra vectorization happened.
13165 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13166 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13167 reorderScalars(E.Scalars, Mask);
13168 E.ReorderIndices.clear();
13169 }
13170 }
13171 if (!E.hasState())
13172 continue;
13173 switch (E.getOpcode()) {
13174 case Instruction::Load: {
13175 // No need to reorder masked gather loads, just reorder the scalar
13176 // operands.
13177 if (E.State != TreeEntry::Vectorize)
13178 break;
13179 Type *ScalarTy = E.getMainOp()->getType();
13180 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13181 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13182 // Check if profitable to represent consecutive load + reverse as strided
13183 // load with stride -1.
13184 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13185 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13186 SmallVector<int> Mask;
13187 inversePermutation(E.ReorderIndices, Mask);
13188 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13189 InstructionCost OriginalVecCost =
13190 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13191 BaseLI->getPointerAddressSpace(), CostKind,
13192 TTI::OperandValueInfo()) +
13193 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13194 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13195 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13196 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13197 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13198 // Strided load is more profitable than consecutive load + reverse -
13199 // transform the node to strided load.
13200 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13201 ->getPointerOperand()
13202 ->getType());
13203 StridedPtrInfo SPtrInfo;
13204 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13205 SPtrInfo.Ty = VecTy;
13206 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13207 E.State = TreeEntry::StridedVectorize;
13208 }
13209 }
13210 break;
13211 }
13212 case Instruction::Store: {
13213 Type *ScalarTy =
13214 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13215 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13216 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13217 // Check if profitable to represent consecutive store + reverse as strided
13218 // store with stride -1.
13219 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13220 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13221 SmallVector<int> Mask;
13222 inversePermutation(E.ReorderIndices, Mask);
13223 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13224 InstructionCost OriginalVecCost =
13225 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13226 BaseSI->getPointerAddressSpace(), CostKind,
13227 TTI::OperandValueInfo()) +
13228 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13229 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13230 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13231 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13232 if (StridedCost < OriginalVecCost)
13233 // Strided store is more profitable than reverse + consecutive store -
13234 // transform the node to strided store.
13235 E.State = TreeEntry::StridedVectorize;
13236 } else if (!E.ReorderIndices.empty()) {
13237 // Check for interleaved stores.
13238 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13239 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13240 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13241 if (Mask.size() < 4)
13242 return 0u;
13243 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13244 if (ShuffleVectorInst::isInterleaveMask(
13245 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13246 TTI.isLegalInterleavedAccessType(
13247 VecTy, Factor, BaseSI->getAlign(),
13248 BaseSI->getPointerAddressSpace()))
13249 return Factor;
13250 }
13251
13252 return 0u;
13253 };
13254 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13255 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13256 if (InterleaveFactor != 0)
13257 E.setInterleave(InterleaveFactor);
13258 }
13259 break;
13260 }
13261 case Instruction::Select: {
13262 if (E.State != TreeEntry::Vectorize)
13263 break;
13264 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13265 if (MinMaxID == Intrinsic::not_intrinsic)
13266 break;
13267 // This node is a minmax node.
13268 E.CombinedOp = TreeEntry::MinMax;
13269 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13270 if (SelectOnly && CondEntry->UserTreeIndex &&
13271 CondEntry->State == TreeEntry::Vectorize) {
13272 // The condition node is part of the combined minmax node.
13273 CondEntry->State = TreeEntry::CombinedVectorize;
13274 }
13275 break;
13276 }
13277 case Instruction::FSub:
13278 case Instruction::FAdd: {
13279 // Check if possible to convert (a*b)+c to fma.
13280 if (E.State != TreeEntry::Vectorize ||
13281 !E.getOperations().isAddSubLikeOp())
13282 break;
13283 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13284 .isValid())
13285 break;
13286 // This node is a fmuladd node.
13287 E.CombinedOp = TreeEntry::FMulAdd;
13288 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13289 if (FMulEntry->UserTreeIndex &&
13290 FMulEntry->State == TreeEntry::Vectorize) {
13291 // The FMul node is part of the combined fmuladd node.
13292 FMulEntry->State = TreeEntry::CombinedVectorize;
13293 }
13294 break;
13295 }
13296 default:
13297 break;
13298 }
13299 }
13300
13301 if (LoadEntriesToVectorize.empty()) {
13302 // Single load node - exit.
13303 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13304 VectorizableTree.front()->getOpcode() == Instruction::Load)
13305 return;
13306 // Small graph with small VF - exit.
13307 constexpr unsigned SmallTree = 3;
13308 constexpr unsigned SmallVF = 2;
13309 if ((VectorizableTree.size() <= SmallTree &&
13310 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13311 (VectorizableTree.size() <= 2 && UserIgnoreList))
13312 return;
13313
13314 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13315 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13316 getCanonicalGraphSize() <= SmallTree &&
13317 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13318 [](const std::unique_ptr<TreeEntry> &TE) {
13319 return TE->isGather() && TE->hasState() &&
13320 TE->getOpcode() == Instruction::Load &&
13321 !allSameBlock(TE->Scalars);
13322 }) == 1)
13323 return;
13324 }
13325
13326 // A list of loads to be gathered during the vectorization process. We can
13327 // try to vectorize them at the end, if profitable.
13328 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13329 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13330 GatheredLoads;
13331
13332 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13333 TreeEntry &E = *TE;
13334 if (E.isGather() &&
13335 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13336 (!E.hasState() && any_of(E.Scalars,
13337 [&](Value *V) {
13338 return isa<LoadInst>(V) &&
13339 !isVectorized(V) &&
13340 !isDeleted(cast<Instruction>(V));
13341 }))) &&
13342 !isSplat(E.Scalars)) {
13343 for (Value *V : E.Scalars) {
13344 auto *LI = dyn_cast<LoadInst>(V);
13345 if (!LI)
13346 continue;
13347 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13348 continue;
13349 gatherPossiblyVectorizableLoads(
13350 *this, V, *DL, *SE, *TTI,
13351 GatheredLoads[std::make_tuple(
13352 LI->getParent(),
13353 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13354 LI->getType())]);
13355 }
13356 }
13357 }
13358 // Try to vectorize gathered loads if this is not just a gather of loads.
13359 if (!GatheredLoads.empty())
13360 tryToVectorizeGatheredLoads(GatheredLoads);
13361}
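// Illustrative sketch (the IR below is assumed): one of the transforms above
// turns a consecutive load whose lanes are used in reverse order into a single
// strided access when the target reports strided loads as cheaper:
// \code
//   ; consecutive load + reverse shuffle
//   %v   = load <4 x float>, ptr %p, align 4
//   %rev = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   ; may instead be emitted as one strided load that produces the lanes
//   ; already reversed, avoiding the separate reverse shuffle.
// \endcode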
13362
13363/// Merges shuffle masks and emits final shuffle instruction, if required. It
13364/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
13365/// when the actual shuffle instruction is generated only if this is actually
13366/// required. Otherwise, the shuffle instruction emission is delayed till the
13367/// end of the process, to reduce the number of emitted instructions and further
13368/// analysis/transformations.
13369class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13370 bool IsFinalized = false;
13371 SmallVector<int> CommonMask;
13372 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13373 const TargetTransformInfo &TTI;
13374 InstructionCost Cost = 0;
13375 SmallDenseSet<Value *> VectorizedVals;
13376 BoUpSLP &R;
13377 SmallPtrSetImpl<Value *> &CheckedExtracts;
13378 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13379 /// While set, we are still trying to estimate the cost for the same nodes, so
13380 /// the actual cost estimation (virtual shuffle instruction emission) can be
13381 /// delayed. This may help to better estimate the cost if the same nodes must be
13382 /// permuted and allows moving most of the long shuffle cost estimation to TTI.
13383 bool SameNodesEstimated = true;
13384
13385 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13386 if (Ty->getScalarType()->isPointerTy()) {
13387 Constant *Res = ConstantExpr::getIntToPtr(
13388 ConstantInt::getAllOnesValue(
13389 IntegerType::get(Ty->getContext(),
13390 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13391 Ty->getScalarType());
13392 if (auto *VTy = dyn_cast<VectorType>(Ty))
13393 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13394 return Res;
13395 }
13396 return Constant::getAllOnesValue(Ty);
13397 }
13398
13399 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13400 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13401 return TTI::TCC_Free;
13402 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13403 InstructionCost GatherCost = 0;
13404 SmallVector<Value *> Gathers(VL);
13405 if (!Root && isSplat(VL)) {
13406 // Found the broadcasting of the single scalar, calculate the cost as
13407 // the broadcast.
13408 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13409 assert(It != VL.end() && "Expected at least one non-undef value.");
13410 // Add broadcast for non-identity shuffle only.
13411 bool NeedShuffle =
13412 count(VL, *It) > 1 &&
13413 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13414 if (!NeedShuffle) {
13415 if (isa<FixedVectorType>(ScalarTy)) {
13416 assert(SLPReVec && "FixedVectorType is not expected.");
13417 return TTI.getShuffleCost(
13418 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13419 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13420 cast<FixedVectorType>(ScalarTy));
13421 }
13422 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13423 CostKind, std::distance(VL.begin(), It),
13424 PoisonValue::get(VecTy), *It);
13425 }
13426
13427 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13428 transform(VL, ShuffleMask.begin(), [](Value *V) {
13429 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13430 });
13431 InstructionCost InsertCost =
13432 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13433 PoisonValue::get(VecTy), *It);
13434 return InsertCost + ::getShuffleCost(TTI,
13435 TTI::SK_Broadcast,
13436 VecTy, ShuffleMask, CostKind,
13437 /*Index=*/0, /*SubTp=*/nullptr,
13438 /*Args=*/*It);
13439 }
13440 return GatherCost +
13441 (all_of(Gathers, IsaPred<UndefValue>)
13442 ? TTI::TCC_Free
13443 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13444 ScalarTy));
13445 };
13446
13447 /// Compute the cost of creating a vector containing the extracted values from
13448 /// \p VL.
13449 InstructionCost
13450 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13451 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13452 unsigned NumParts) {
13453 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13454 unsigned NumElts =
13455 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13456 auto *EE = dyn_cast<ExtractElementInst>(V);
13457 if (!EE)
13458 return Sz;
13459 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13460 if (!VecTy)
13461 return Sz;
13462 return std::max(Sz, VecTy->getNumElements());
13463 });
13464 // FIXME: this must be moved to TTI for better estimation.
13465 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13466 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13467 SmallVectorImpl<unsigned> &Indices,
13468 SmallVectorImpl<unsigned> &SubVecSizes)
13469 -> std::optional<TTI::ShuffleKind> {
13470 if (NumElts <= EltsPerVector)
13471 return std::nullopt;
13472 int OffsetReg0 =
13473 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13474 [](int S, int I) {
13475 if (I == PoisonMaskElem)
13476 return S;
13477 return std::min(S, I);
13478 }),
13479 EltsPerVector);
13480 int OffsetReg1 = OffsetReg0;
13481 DenseSet<int> RegIndices;
13482 // Check if we are trying to permute the same single/2 input vectors.
13483 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13484 int FirstRegId = -1;
13485 Indices.assign(1, OffsetReg0);
13486 for (auto [Pos, I] : enumerate(Mask)) {
13487 if (I == PoisonMaskElem)
13488 continue;
13489 int Idx = I - OffsetReg0;
13490 int RegId =
13491 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13492 if (FirstRegId < 0)
13493 FirstRegId = RegId;
13494 RegIndices.insert(RegId);
13495 if (RegIndices.size() > 2)
13496 return std::nullopt;
13497 if (RegIndices.size() == 2) {
13498 ShuffleKind = TTI::SK_PermuteTwoSrc;
13499 if (Indices.size() == 1) {
13500 OffsetReg1 = alignDown(
13501 std::accumulate(
13502 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13503 [&](int S, int I) {
13504 if (I == PoisonMaskElem)
13505 return S;
13506 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13507 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13508 if (RegId == FirstRegId)
13509 return S;
13510 return std::min(S, I);
13511 }),
13512 EltsPerVector);
13513 unsigned Index = OffsetReg1 % NumElts;
13514 Indices.push_back(Index);
13515 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13516 }
13517 Idx = I - OffsetReg1;
13518 }
13519 I = (Idx % NumElts) % EltsPerVector +
13520 (RegId == FirstRegId ? 0 : EltsPerVector);
13521 }
13522 return ShuffleKind;
13523 };
13524 InstructionCost Cost = 0;
13525
13526 // Process extracts in blocks of EltsPerVector to check if the source vector
13527 // operand can be re-used directly. If not, add the cost of creating a
13528 // shuffle to extract the values into a vector register.
13529 for (unsigned Part : seq<unsigned>(NumParts)) {
13530 if (!ShuffleKinds[Part])
13531 continue;
13532 ArrayRef<int> MaskSlice = Mask.slice(
13533 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13534 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13535 copy(MaskSlice, SubMask.begin());
13536 SmallVector<unsigned, 2> Indices;
13537 SmallVector<unsigned, 2> SubVecSizes;
13538 std::optional<TTI::ShuffleKind> RegShuffleKind =
13539 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13540 if (!RegShuffleKind) {
13541 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13542 !ShuffleVectorInst::isIdentityMask(
13543 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13544 Cost +=
13545 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13546 getWidenedType(ScalarTy, NumElts), MaskSlice);
13547 continue;
13548 }
13549 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13550 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13551 Cost +=
13552 ::getShuffleCost(TTI, *RegShuffleKind,
13553 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13554 }
13555 const unsigned BaseVF = getFullVectorNumberOfElements(
13556 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13557 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13558 assert((Idx + SubVecSize) <= BaseVF &&
13559 "SK_ExtractSubvector index out of range");
13561 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13562 Idx, getWidenedType(ScalarTy, SubVecSize));
13563 }
13564 // Second attempt to check if just a permute is estimated cheaper than a
13565 // subvector extract.
13566 SubMask.assign(NumElts, PoisonMaskElem);
13567 copy(MaskSlice, SubMask.begin());
13568 InstructionCost OriginalCost = ::getShuffleCost(
13569 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13570 if (OriginalCost < Cost)
13571 Cost = OriginalCost;
13572 }
13573 return Cost;
13574 }
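// Illustrative example (editorial sketch, not in the original source): with 8
// extracted scalars, NumParts == 2 and EltsPerVector == 4, the mask is checked
// in two 4-element slices. A slice {8, 9, 10, 11} taken from one <16 x ..>
// source rebases to the identity {0, 1, 2, 3} within its register, so no
// per-register shuffle is charged, while a slice {0, 2, 9, 11} touches two
// registers, rebases to {0, 2, 5, 7}, and is charged as an SK_PermuteTwoSrc on
// a 4-element type plus an SK_ExtractSubvector of a 4-element subvector; a
// plain whole-vector permute is also estimated and used instead if cheaper.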
13575 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
13576 /// given mask \p Mask and register number \p Part, which includes
13577 /// \p SliceSize elements.
13578 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13579 ArrayRef<int> Mask, unsigned Part,
13580 unsigned SliceSize) {
13581 if (SameNodesEstimated) {
13582 // Delay the cost estimation if the same nodes are being reshuffled.
13583 // If we already requested the cost of reshuffling E1 and E2 before, there
13584 // is no need to estimate another cost with the sub-Mask; instead, include
13585 // this sub-Mask into the CommonMask to estimate it later and avoid double
13586 // cost estimation.
13587 if ((InVectors.size() == 2 &&
13588 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13589 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13590 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13591 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13592 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13593 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13594 "Expected all poisoned elements.");
13595 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13596 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13597 return;
13598 }
13599 // Found non-matching nodes - need to estimate the cost for the matched
13600 // nodes and transform the mask.
13601 Cost += createShuffle(InVectors.front(),
13602 InVectors.size() == 1 ? nullptr : InVectors.back(),
13603 CommonMask);
13604 transformMaskAfterShuffle(CommonMask, CommonMask);
13605 } else if (InVectors.size() == 2) {
13606 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13607 transformMaskAfterShuffle(CommonMask, CommonMask);
13608 }
13609 SameNodesEstimated = false;
13610 if (!E2 && InVectors.size() == 1) {
13611 unsigned VF = E1.getVectorFactor();
13612 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13613 VF = std::max(VF, getVF(V1));
13614 } else {
13615 const auto *E = cast<const TreeEntry *>(InVectors.front());
13616 VF = std::max(VF, E->getVectorFactor());
13617 }
13618 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13619 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13620 CommonMask[Idx] = Mask[Idx] + VF;
13621 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13622 transformMaskAfterShuffle(CommonMask, CommonMask);
13623 } else {
13624 auto P = InVectors.front();
13625 Cost += createShuffle(&E1, E2, Mask);
13626 unsigned VF = Mask.size();
13627 if (Value *V1 = dyn_cast<Value *>(P)) {
13628 VF = std::max(VF,
13629 getNumElements(V1->getType()));
13630 } else {
13631 const auto *E = cast<const TreeEntry *>(P);
13632 VF = std::max(VF, E->getVectorFactor());
13633 }
13634 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13635 if (Mask[Idx] != PoisonMaskElem)
13636 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13637 Cost += createShuffle(P, InVectors.front(), CommonMask);
13638 transformMaskAfterShuffle(CommonMask, CommonMask);
13639 }
13640 }
13641
13642 class ShuffleCostBuilder {
13643 const TargetTransformInfo &TTI;
13644
13645 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13646 int Index = -1;
13647 return Mask.empty() ||
13648 (VF == Mask.size() &&
13649 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13650 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13651 Index == 0);
13652 }
13653
13654 public:
13655 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13656 ~ShuffleCostBuilder() = default;
13657 InstructionCost createShuffleVector(Value *V1, Value *,
13658 ArrayRef<int> Mask) const {
13659 // An empty mask or an identity mask is free.
13660 unsigned VF =
13661 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13662 if (isEmptyOrIdentity(Mask, VF))
13663 return TTI::TCC_Free;
13664 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13665 cast<VectorType>(V1->getType()), Mask);
13666 }
13667 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13668 // An empty mask or an identity mask is free.
13669 unsigned VF =
13670 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13671 if (isEmptyOrIdentity(Mask, VF))
13672 return TTI::TCC_Free;
13673 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13674 cast<VectorType>(V1->getType()), Mask);
13675 }
13676 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13677 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13678 return TTI::TCC_Free;
13679 }
13680 void resizeToMatch(Value *&, Value *&) const {}
13681 };
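// Illustrative example (editorial sketch, not in the original source): the
// builder above lets BaseShuffleAnalysis::createShuffle be reused for cost
// modeling. On a <4 x i32> input an empty mask or the identity {0, 1, 2, 3}
// is TCC_Free, {3, 2, 1, 0} is priced as an SK_PermuteSingleSrc, and a mask
// mixing two sources as an SK_PermuteTwoSrc.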
13682
13683 /// Smart shuffle instruction emission: walks through shuffle trees and
13684 /// tries to find the best matching vector for the actual shuffle
13685 /// instruction.
13686 InstructionCost
13687 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13688 const PointerUnion<Value *, const TreeEntry *> &P2,
13689 ArrayRef<int> Mask) {
13690 ShuffleCostBuilder Builder(TTI);
13691 SmallVector<int> CommonMask(Mask);
13692 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13693 unsigned CommonVF = Mask.size();
13694 InstructionCost ExtraCost = 0;
13695 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13696 unsigned VF) -> InstructionCost {
13697 if (E.isGather() && allConstant(E.Scalars))
13698 return TTI::TCC_Free;
13699 Type *EScalarTy = E.Scalars.front()->getType();
13700 bool IsSigned = true;
13701 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13702 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13703 IsSigned = It->second.second;
13704 }
13705 if (EScalarTy != ScalarTy) {
13706 unsigned CastOpcode = Instruction::Trunc;
13707 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13708 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13709 if (DstSz > SrcSz)
13710 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13711 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13712 getWidenedType(EScalarTy, VF),
13713 TTI::CastContextHint::None, CostKind);
13714 }
13715 return TTI::TCC_Free;
13716 };
13717 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13718 if (isa<Constant>(V))
13719 return TTI::TCC_Free;
13720 auto *VecTy = cast<VectorType>(V->getType());
13721 Type *EScalarTy = VecTy->getElementType();
13722 if (EScalarTy != ScalarTy) {
13723 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13724 unsigned CastOpcode = Instruction::Trunc;
13725 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13726 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13727 if (DstSz > SrcSz)
13728 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13729 return TTI.getCastInstrCost(
13730 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13731 VecTy, TTI::CastContextHint::None, CostKind);
13732 }
13733 return TTI::TCC_Free;
13734 };
13735 if (!V1 && !V2 && !P2.isNull()) {
13736 // Shuffle 2 entry nodes.
13737 const TreeEntry *E = cast<const TreeEntry *>(P1);
13738 unsigned VF = E->getVectorFactor();
13739 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13740 CommonVF = std::max(VF, E2->getVectorFactor());
13741 assert(all_of(Mask,
13742 [=](int Idx) {
13743 return Idx < 2 * static_cast<int>(CommonVF);
13744 }) &&
13745 "All elements in mask must be less than 2 * CommonVF.");
13746 if (E->Scalars.size() == E2->Scalars.size()) {
13747 SmallVector<int> EMask = E->getCommonMask();
13748 SmallVector<int> E2Mask = E2->getCommonMask();
13749 if (!EMask.empty() || !E2Mask.empty()) {
13750 for (int &Idx : CommonMask) {
13751 if (Idx == PoisonMaskElem)
13752 continue;
13753 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13754 Idx = EMask[Idx];
13755 else if (Idx >= static_cast<int>(CommonVF))
13756 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13757 E->Scalars.size();
13758 }
13759 }
13760 CommonVF = E->Scalars.size();
13761 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13762 GetNodeMinBWAffectedCost(*E2, CommonVF);
13763 } else {
13764 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13765 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13766 }
13767 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13768 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13769 } else if (!V1 && P2.isNull()) {
13770 // Shuffle single entry node.
13771 const TreeEntry *E = cast<const TreeEntry *>(P1);
13772 unsigned VF = E->getVectorFactor();
13773 CommonVF = VF;
13774 assert(
13775 all_of(Mask,
13776 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13777 "All elements in mask must be less than CommonVF.");
13778 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13779 SmallVector<int> EMask = E->getCommonMask();
13780 assert(!EMask.empty() && "Expected non-empty common mask.");
13781 for (int &Idx : CommonMask) {
13782 if (Idx != PoisonMaskElem)
13783 Idx = EMask[Idx];
13784 }
13785 CommonVF = E->Scalars.size();
13786 } else if (unsigned Factor = E->getInterleaveFactor();
13787 Factor > 0 && E->Scalars.size() != Mask.size() &&
13788 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13789 Factor)) {
13790 // Deinterleaved nodes are free.
13791 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13792 }
13793 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13794 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13795 // Not identity/broadcast? Try to see if the original vector is better.
13796 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13797 CommonVF == CommonMask.size() &&
13798 any_of(enumerate(CommonMask),
13799 [](const auto &&P) {
13800 return P.value() != PoisonMaskElem &&
13801 static_cast<unsigned>(P.value()) != P.index();
13802 }) &&
13803 any_of(CommonMask,
13804 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13805 SmallVector<int> ReorderMask;
13806 inversePermutation(E->ReorderIndices, ReorderMask);
13807 ::addMask(CommonMask, ReorderMask);
13808 }
13809 } else if (V1 && P2.isNull()) {
13810 // Shuffle single vector.
13811 ExtraCost += GetValueMinBWAffectedCost(V1);
13812 CommonVF = getVF(V1);
13813 assert(
13814 all_of(Mask,
13815 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13816 "All elements in mask must be less than CommonVF.");
13817 } else if (V1 && !V2) {
13818 // Shuffle vector and tree node.
13819 unsigned VF = getVF(V1);
13820 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13821 CommonVF = std::max(VF, E2->getVectorFactor());
13822 assert(all_of(Mask,
13823 [=](int Idx) {
13824 return Idx < 2 * static_cast<int>(CommonVF);
13825 }) &&
13826 "All elements in mask must be less than 2 * CommonVF.");
13827 if (E2->Scalars.size() == VF && VF != CommonVF) {
13828 SmallVector<int> E2Mask = E2->getCommonMask();
13829 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13830 for (int &Idx : CommonMask) {
13831 if (Idx == PoisonMaskElem)
13832 continue;
13833 if (Idx >= static_cast<int>(CommonVF))
13834 Idx = E2Mask[Idx - CommonVF] + VF;
13835 }
13836 CommonVF = VF;
13837 }
13838 ExtraCost += GetValueMinBWAffectedCost(V1);
13839 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13840 ExtraCost += GetNodeMinBWAffectedCost(
13841 *E2, std::min(CommonVF, E2->getVectorFactor()));
13842 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13843 } else if (!V1 && V2) {
13844 // Shuffle vector and tree node.
13845 unsigned VF = getVF(V2);
13846 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13847 CommonVF = std::max(VF, E1->getVectorFactor());
13848 assert(all_of(Mask,
13849 [=](int Idx) {
13850 return Idx < 2 * static_cast<int>(CommonVF);
13851 }) &&
13852 "All elements in mask must be less than 2 * CommonVF.");
13853 if (E1->Scalars.size() == VF && VF != CommonVF) {
13854 SmallVector<int> E1Mask = E1->getCommonMask();
13855 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13856 for (int &Idx : CommonMask) {
13857 if (Idx == PoisonMaskElem)
13858 continue;
13859 if (Idx >= static_cast<int>(CommonVF))
13860 Idx = E1Mask[Idx - CommonVF] + VF;
13861 else
13862 Idx = E1Mask[Idx];
13863 }
13864 CommonVF = VF;
13865 }
13866 ExtraCost += GetNodeMinBWAffectedCost(
13867 *E1, std::min(CommonVF, E1->getVectorFactor()));
13868 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13869 ExtraCost += GetValueMinBWAffectedCost(V2);
13870 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13871 } else {
13872 assert(V1 && V2 && "Expected both vectors.");
13873 unsigned VF = getVF(V1);
13874 CommonVF = std::max(VF, getVF(V2));
13875 assert(all_of(Mask,
13876 [=](int Idx) {
13877 return Idx < 2 * static_cast<int>(CommonVF);
13878 }) &&
13879 "All elements in mask must be less than 2 * CommonVF.");
13880 ExtraCost +=
13881 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13882 if (V1->getType() != V2->getType()) {
13883 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13884 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13885 } else {
13886 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13887 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13888 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13889 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13890 }
13891 }
13892 InVectors.front() =
13893 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13894 if (InVectors.size() == 2)
13895 InVectors.pop_back();
13896 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13897 V1, V2, CommonMask, Builder, ScalarTy);
13898 }
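// Illustrative example (editorial sketch, not in the original source): if a
// shuffled entry was demoted by MinBWs to i16 while ScalarTy is i32,
// GetNodeMinBWAffectedCost above adds the cost of a vector sext/zext (or a
// trunc when narrowing) between <VF x i16> and <VF x i32> on top of the
// shuffle itself, so demoted operands are not costed as if they already had
// the wider type.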
13899
13900public:
13901 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13902 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13903 SmallPtrSetImpl<Value *> &CheckedExtracts)
13904 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13905 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13906 CheckedExtracts(CheckedExtracts) {}
13907 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13908 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13909 unsigned NumParts, bool &UseVecBaseAsInput) {
13910 UseVecBaseAsInput = false;
13911 if (Mask.empty())
13912 return nullptr;
13913 Value *VecBase = nullptr;
13914 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13915 if (!E->ReorderIndices.empty()) {
13916 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13917 E->ReorderIndices.end());
13918 reorderScalars(VL, ReorderMask);
13919 }
13920 // Check if the extracts can be considered reused, i.e. the same
13921 // extractelements were already vectorized.
13922 bool PrevNodeFound = any_of(
13923 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13924 [&](const std::unique_ptr<TreeEntry> &TE) {
13925 return ((TE->hasState() && !TE->isAltShuffle() &&
13926 TE->getOpcode() == Instruction::ExtractElement) ||
13927 TE->isGather()) &&
13928 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13929 return VL.size() > Data.index() &&
13930 (Mask[Data.index()] == PoisonMaskElem ||
13931 isa<UndefValue>(VL[Data.index()]) ||
13932 Data.value() == VL[Data.index()]);
13933 });
13934 });
13935 SmallPtrSet<Value *, 4> UniqueBases;
13936 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13937 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13938 for (unsigned Part : seq<unsigned>(NumParts)) {
13939 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13940 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13941 for (auto [I, V] :
13942 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13943 // Ignore non-extractelement scalars.
13944 if (isa<UndefValue>(V) ||
13945 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13946 continue;
13947 // If all users of instruction are going to be vectorized and this
13948 // instruction itself is not going to be vectorized, consider this
13949 // instruction as dead and remove its cost from the final cost of the
13950 // vectorized tree.
13951 // Also, avoid adjusting the cost for extractelements with multiple uses
13952 // in different graph entries.
13953 auto *EE = cast<ExtractElementInst>(V);
13954 VecBase = EE->getVectorOperand();
13955 UniqueBases.insert(VecBase);
13956 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13957 if (!CheckedExtracts.insert(V).second ||
13958 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13959 any_of(EE->users(),
13960 [&](User *U) {
13961 return isa<GetElementPtrInst>(U) &&
13962 !R.areAllUsersVectorized(cast<Instruction>(U),
13963 &VectorizedVals);
13964 }) ||
13965 (!VEs.empty() && !is_contained(VEs, E)))
13966 continue;
13967 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13968 if (!EEIdx)
13969 continue;
13970 unsigned Idx = *EEIdx;
13971 // Take credit for instruction that will become dead.
13972 if (EE->hasOneUse() || !PrevNodeFound) {
13973 Instruction *Ext = EE->user_back();
13974 if (isa<SExtInst, ZExtInst>(Ext) &&
13975 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13976 // Use getExtractWithExtendCost() to calculate the cost of
13977 // extractelement/ext pair.
13978 Cost -= TTI.getExtractWithExtendCost(
13979 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13980 Idx, CostKind);
13981 // Add back the cost of s|zext which is subtracted separately.
13982 Cost += TTI.getCastInstrCost(
13983 Ext->getOpcode(), Ext->getType(), EE->getType(),
13984 TTI::CastContextHint::None, CostKind);
13985 continue;
13986 }
13987 }
13988 APInt &DemandedElts =
13989 VectorOpsToExtracts
13990 .try_emplace(VecBase,
13991 APInt::getZero(getNumElements(VecBase->getType())))
13992 .first->getSecond();
13993 DemandedElts.setBit(Idx);
13994 }
13995 }
13996 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13997 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13998 DemandedElts, /*Insert=*/false,
13999 /*Extract=*/true, CostKind);
14000 // Check that the gather of extractelements can be represented as just a
14001 // shuffle of one or two vectors the scalars are extracted from.
14002 // We found a bunch of extractelement instructions that must be gathered
14003 // into a vector and that can be represented as a permutation of elements
14004 // from a single input vector or from two input vectors.
14005 // Skipped if the same extractelements were already vectorized (reused).
14006 if (!PrevNodeFound)
14007 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14008 InVectors.assign(1, E);
14009 CommonMask.assign(Mask.begin(), Mask.end());
14010 transformMaskAfterShuffle(CommonMask, CommonMask);
14011 SameNodesEstimated = false;
14012 if (NumParts != 1 && UniqueBases.size() != 1) {
14013 UseVecBaseAsInput = true;
14014 VecBase =
14015 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14016 }
14017 return VecBase;
14018 }
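// Illustrative example (editorial sketch, not in the original source): when
// the gathered scalars are extracts of lanes 0..3 of a single <8 x float>
// %src and all their users are vectorized, adjustExtracts subtracts the
// scalarization overhead of those soon-to-be-dead extracts from Cost and
// returns %src as VecBase, so the node is shuffled directly out of the source
// vector instead of being rebuilt with a gather.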
14019 /// Checks if the specified entry \p E needs to be delayed because of its
14020 /// dependency nodes.
14021 std::optional<InstructionCost>
14022 needToDelay(const TreeEntry *,
14023 ArrayRef<SmallVector<const TreeEntry *>>) const {
14024 // No need to delay the cost estimation during analysis.
14025 return std::nullopt;
14026 }
14027 /// Reset the builder to handle perfect diamond match.
14028 void resetForSameNode() {
14029 IsFinalized = false;
14030 CommonMask.clear();
14031 InVectors.clear();
14032 Cost = 0;
14033 VectorizedVals.clear();
14034 SameNodesEstimated = true;
14035 }
14036 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14037 if (&E1 == &E2) {
14038 assert(all_of(Mask,
14039 [&](int Idx) {
14040 return Idx < static_cast<int>(E1.getVectorFactor());
14041 }) &&
14042 "Expected single vector shuffle mask.");
14043 add(E1, Mask);
14044 return;
14045 }
14046 if (InVectors.empty()) {
14047 CommonMask.assign(Mask.begin(), Mask.end());
14048 InVectors.assign({&E1, &E2});
14049 return;
14050 }
14051 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14052 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14053 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14054 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14055 const auto *It =
14056 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14057 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14058 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14059 }
14060 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14061 if (InVectors.empty()) {
14062 CommonMask.assign(Mask.begin(), Mask.end());
14063 InVectors.assign(1, &E1);
14064 return;
14065 }
14066 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14067 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14068 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14069 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14070 const auto *It =
14071 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14072 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14073 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14074 if (!SameNodesEstimated && InVectors.size() == 1)
14075 InVectors.emplace_back(&E1);
14076 }
14077 /// Adds 2 input vectors and the mask for their shuffling.
14078 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14079 // May come only for shuffling of 2 vectors with extractelements, already
14080 // handled in adjustExtracts.
14081 assert(InVectors.size() == 1 &&
14082 all_of(enumerate(CommonMask),
14083 [&](auto P) {
14084 if (P.value() == PoisonMaskElem)
14085 return Mask[P.index()] == PoisonMaskElem;
14086 auto *EI = cast<ExtractElementInst>(
14087 cast<const TreeEntry *>(InVectors.front())
14088 ->getOrdered(P.index()));
14089 return EI->getVectorOperand() == V1 ||
14090 EI->getVectorOperand() == V2;
14091 }) &&
14092 "Expected extractelement vectors.");
14093 }
14094 /// Adds another one input vector and the mask for the shuffling.
14095 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14096 if (InVectors.empty()) {
14097 assert(CommonMask.empty() && !ForExtracts &&
14098 "Expected empty input mask/vectors.");
14099 CommonMask.assign(Mask.begin(), Mask.end());
14100 InVectors.assign(1, V1);
14101 return;
14102 }
14103 if (ForExtracts) {
14104 // No need to add vectors here, already handled them in adjustExtracts.
14105 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14106 !CommonMask.empty() &&
14107 all_of(enumerate(CommonMask),
14108 [&](auto P) {
14109 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14110 ->getOrdered(P.index());
14111 if (P.value() == PoisonMaskElem)
14112 return P.value() == Mask[P.index()] ||
14113 isa<UndefValue>(Scalar);
14114 if (isa<Constant>(V1))
14115 return true;
14116 auto *EI = cast<ExtractElementInst>(Scalar);
14117 return EI->getVectorOperand() == V1;
14118 }) &&
14119 "Expected only tree entry for extractelement vectors.");
14120 return;
14121 }
14122 assert(!InVectors.empty() && !CommonMask.empty() &&
14123 "Expected only tree entries from extracts/reused buildvectors.");
14124 unsigned VF = getVF(V1);
14125 if (InVectors.size() == 2) {
14126 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14127 transformMaskAfterShuffle(CommonMask, CommonMask);
14128 VF = std::max<unsigned>(VF, CommonMask.size());
14129 } else if (const auto *InTE =
14130 InVectors.front().dyn_cast<const TreeEntry *>()) {
14131 VF = std::max(VF, InTE->getVectorFactor());
14132 } else {
14133 VF = std::max(
14134 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14135 ->getNumElements());
14136 }
14137 InVectors.push_back(V1);
14138 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14139 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14140 CommonMask[Idx] = Mask[Idx] + VF;
14141 }
14142 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14143 Value *Root = nullptr) {
14144 Cost += getBuildVectorCost(VL, Root);
14145 if (!Root) {
14146 // FIXME: Need to find a way to avoid use of getNullValue here.
14148 unsigned VF = VL.size();
14149 if (MaskVF != 0)
14150 VF = std::min(VF, MaskVF);
14151 Type *VLScalarTy = VL.front()->getType();
14152 for (Value *V : VL.take_front(VF)) {
14153 Type *ScalarTy = VLScalarTy->getScalarType();
14154 if (isa<PoisonValue>(V)) {
14155 Vals.push_back(PoisonValue::get(ScalarTy));
14156 continue;
14157 }
14158 if (isa<UndefValue>(V)) {
14159 Vals.push_back(UndefValue::get(ScalarTy));
14160 continue;
14161 }
14162 Vals.push_back(Constant::getNullValue(ScalarTy));
14163 }
14164 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14165 assert(SLPReVec && "FixedVectorType is not expected.");
14166 // When REVEC is enabled, we need to expand vector types into scalar
14167 // types.
14168 Vals = replicateMask(Vals, VecTy->getNumElements());
14169 }
14170 return ConstantVector::get(Vals);
14171 }
14172 return ConstantVector::getSplat(
14173 ElementCount::getFixed(
14174 cast<FixedVectorType>(Root->getType())->getNumElements()),
14175 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14176 }
14177 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
14178 /// Finalize emission of the shuffles.
14179 InstructionCost
14180 finalize(ArrayRef<int> ExtMask,
14181 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14182 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14183 function_ref<void(Value *&, SmallVectorImpl<int> &,
14184 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14185 Action = {}) {
14186 IsFinalized = true;
14187 if (Action) {
14188 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14189 if (InVectors.size() == 2)
14190 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14191 else
14192 Cost += createShuffle(Vec, nullptr, CommonMask);
14193 transformMaskAfterShuffle(CommonMask, CommonMask);
14194 assert(VF > 0 &&
14195 "Expected vector length for the final value before action.");
14196 Value *V = cast<Value *>(Vec);
14197 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14198 Cost += createShuffle(V1, V2, Mask);
14199 return V1;
14200 });
14201 InVectors.front() = V;
14202 }
14203 if (!SubVectors.empty()) {
14204 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14205 if (InVectors.size() == 2)
14206 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14207 else
14208 Cost += createShuffle(Vec, nullptr, CommonMask);
14209 transformMaskAfterShuffle(CommonMask, CommonMask);
14210 // Add subvectors permutation cost.
14211 if (!SubVectorsMask.empty()) {
14212 assert(SubVectorsMask.size() <= CommonMask.size() &&
14213 "Expected same size of masks for subvectors and common mask.");
14214 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14215 copy(SubVectorsMask, SVMask.begin());
14216 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14217 if (I2 != PoisonMaskElem) {
14218 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14219 I1 = I2 + CommonMask.size();
14220 }
14221 }
14222 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14223 getWidenedType(ScalarTy, CommonMask.size()),
14224 SVMask, CostKind);
14225 }
14226 for (auto [E, Idx] : SubVectors) {
14227 Type *EScalarTy = E->Scalars.front()->getType();
14228 bool IsSigned = true;
14229 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14230 EScalarTy =
14231 IntegerType::get(EScalarTy->getContext(), It->second.first);
14232 IsSigned = It->second.second;
14233 }
14234 if (ScalarTy != EScalarTy) {
14235 unsigned CastOpcode = Instruction::Trunc;
14236 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14237 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14238 if (DstSz > SrcSz)
14239 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14240 Cost += TTI.getCastInstrCost(
14241 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14242 getWidenedType(EScalarTy, E->getVectorFactor()),
14243 TTI::CastContextHint::None, CostKind);
14244 }
14245 Cost += ::getShuffleCost(
14246 TTI, TTI::SK_InsertSubvector,
14247 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14248 getWidenedType(ScalarTy, E->getVectorFactor()));
14249 if (!CommonMask.empty()) {
14250 std::iota(std::next(CommonMask.begin(), Idx),
14251 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14252 Idx);
14253 }
14254 }
14255 }
14256
14257 if (!ExtMask.empty()) {
14258 if (CommonMask.empty()) {
14259 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14260 } else {
14261 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14262 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14263 if (ExtMask[I] == PoisonMaskElem)
14264 continue;
14265 NewMask[I] = CommonMask[ExtMask[I]];
14266 }
14267 CommonMask.swap(NewMask);
14268 }
14269 }
14270 if (CommonMask.empty()) {
14271 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14272 return Cost;
14273 }
14274 return Cost +
14275 createShuffle(InVectors.front(),
14276 InVectors.size() == 2 ? InVectors.back() : nullptr,
14277 CommonMask);
14278 }
14279
14281 assert((IsFinalized || CommonMask.empty()) &&
14282 "Shuffle construction must be finalized.");
14283 }
14284};
14285
14286const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14287 unsigned Idx) const {
14288 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14289 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14290 return Op;
14291}
14292
14293TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14294 if (TE.State == TreeEntry::ScatterVectorize ||
14295 TE.State == TreeEntry::StridedVectorize)
14296 return TTI::CastContextHint::GatherScatter;
14297 if (TE.State == TreeEntry::CompressVectorize)
14298 return TTI::CastContextHint::Masked;
14299 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14300 !TE.isAltShuffle()) {
14301 if (TE.ReorderIndices.empty())
14302 return TTI::CastContextHint::Normal;
14303 SmallVector<int> Mask;
14304 inversePermutation(TE.ReorderIndices, Mask);
14305 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14306 return TTI::CastContextHint::Reversed;
14307 }
14308 return TTI::CastContextHint::None;
14309}
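// Illustrative example (editorial sketch, not in the original source): a cast
// fed by a scatter/strided load entry is costed with the GatherScatter hint,
// a compressed (masked) load with Masked, a plain vectorized load with
// Normal, and a load whose reorder mask is a reverse with Reversed; anything
// else falls back to None.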
14310
14311InstructionCost
14312BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14313 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14314 ArrayRef<Value *> VL = E->Scalars;
14315
14316 Type *ScalarTy = getValueType(VL[0]);
14317 if (!isValidElementType(ScalarTy))
14318 return InstructionCost::getInvalid();
14319 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14320
14321 // If we have computed a smaller type for the expression, update VecTy so
14322 // that the costs will be accurate.
14323 auto It = MinBWs.find(E);
14324 Type *OrigScalarTy = ScalarTy;
14325 if (It != MinBWs.end()) {
14326 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14327 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14328 if (VecTy)
14329 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14330 }
14331 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14332 unsigned EntryVF = E->getVectorFactor();
14333 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14334
14335 if (E->isGather()) {
14336 if (allConstant(VL))
14337 return 0;
14338 if (isa<InsertElementInst>(VL[0]))
14339 return InstructionCost::getInvalid();
14340 if (isa<CmpInst>(VL.front()))
14341 ScalarTy = VL.front()->getType();
14342 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14343 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14344 }
14345 if (E->State == TreeEntry::SplitVectorize) {
14346 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14347 "Expected exactly 2 combined entries.");
14348 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14349 InstructionCost VectorCost = 0;
14350 if (E->ReorderIndices.empty()) {
14351 VectorCost = ::getShuffleCost(
14352 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14353 E->CombinedEntriesWithIndices.back().second,
14354 getWidenedType(
14355 ScalarTy,
14356 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14357 ->getVectorFactor()));
14358 } else {
14359 unsigned CommonVF =
14360 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14361 ->getVectorFactor(),
14362 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14363 ->getVectorFactor());
14364 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14365 getWidenedType(ScalarTy, CommonVF),
14366 E->getSplitMask(), CostKind);
14367 }
14368 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14369 return VectorCost;
14370 }
14371 InstructionCost CommonCost = 0;
14372 SmallVector<int> Mask;
14373 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14374 (E->State != TreeEntry::StridedVectorize ||
14375 !isReverseOrder(E->ReorderIndices))) {
14376 SmallVector<int> NewMask;
14377 if (E->getOpcode() == Instruction::Store) {
14378 // For stores the order is actually a mask.
14379 NewMask.resize(E->ReorderIndices.size());
14380 copy(E->ReorderIndices, NewMask.begin());
14381 } else {
14382 inversePermutation(E->ReorderIndices, NewMask);
14383 }
14384 ::addMask(Mask, NewMask);
14385 }
14386 if (!E->ReuseShuffleIndices.empty())
14387 ::addMask(Mask, E->ReuseShuffleIndices);
14388 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14389 CommonCost =
14390 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14391 assert((E->State == TreeEntry::Vectorize ||
14392 E->State == TreeEntry::ScatterVectorize ||
14393 E->State == TreeEntry::StridedVectorize ||
14394 E->State == TreeEntry::CompressVectorize) &&
14395 "Unhandled state");
14396 assert(E->getOpcode() &&
14397 ((allSameType(VL) && allSameBlock(VL)) ||
14398 (E->getOpcode() == Instruction::GetElementPtr &&
14399 E->getMainOp()->getType()->isPointerTy()) ||
14400 E->hasCopyableElements()) &&
14401 "Invalid VL");
14402 Instruction *VL0 = E->getMainOp();
14403 unsigned ShuffleOrOp =
14404 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14405 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14406 ShuffleOrOp = E->CombinedOp;
14407 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14408 const unsigned Sz = UniqueValues.size();
14409 SmallBitVector UsedScalars(Sz, false);
14410 for (unsigned I = 0; I < Sz; ++I) {
14411 if (isa<Instruction>(UniqueValues[I]) &&
14412 !E->isCopyableElement(UniqueValues[I]) &&
14413 getTreeEntries(UniqueValues[I]).front() == E)
14414 continue;
14415 UsedScalars.set(I);
14416 }
14417 auto GetCastContextHint = [&](Value *V) {
14418 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14419 return getCastContextHint(*OpTEs.front());
14420 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14421 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14422 !SrcState.isAltShuffle())
14423 return TTI::CastContextHint::GatherScatter;
14424 return TTI::CastContextHint::None;
14425 };
14426 auto GetCostDiff =
14427 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14428 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14429 // Calculate the cost of this instruction.
14430 InstructionCost ScalarCost = 0;
14431 if (isa<CastInst, CallInst>(VL0)) {
14432 // For some of the instructions there is no need to calculate the cost
14433 // for each particular instruction; we can use the cost of a single
14434 // instruction multiplied by the total number of scalar instructions.
14435 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14436 } else {
14437 for (unsigned I = 0; I < Sz; ++I) {
14438 if (UsedScalars.test(I))
14439 continue;
14440 ScalarCost += ScalarEltCost(I);
14441 }
14442 }
14443
14444 InstructionCost VecCost = VectorCost(CommonCost);
14445 // Check if the current node must be resized, if the parent node is not
14446 // resized.
14447 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14448 E->Idx != 0 &&
14449 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14450 const EdgeInfo &EI = E->UserTreeIndex;
14451 if (!EI.UserTE->hasState() ||
14452 EI.UserTE->getOpcode() != Instruction::Select ||
14453 EI.EdgeIdx != 0) {
14454 auto UserBWIt = MinBWs.find(EI.UserTE);
14455 Type *UserScalarTy =
14456 (EI.UserTE->isGather() ||
14457 EI.UserTE->State == TreeEntry::SplitVectorize)
14458 ? EI.UserTE->Scalars.front()->getType()
14459 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14460 if (UserBWIt != MinBWs.end())
14461 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14462 UserBWIt->second.first);
14463 if (ScalarTy != UserScalarTy) {
14464 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14465 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14466 unsigned VecOpcode;
14467 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14468 if (BWSz > SrcBWSz)
14469 VecOpcode = Instruction::Trunc;
14470 else
14471 VecOpcode =
14472 It->second.second ? Instruction::SExt : Instruction::ZExt;
14473 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14474 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14475 CostKind);
14476 }
14477 }
14478 }
14479 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14480 ScalarCost, "Calculated costs for Tree"));
14481 return VecCost - ScalarCost;
14482 };
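// Illustrative example (editorial sketch, not in the original source):
// GetCostDiff returns VecCost - ScalarCost, so a negative result means
// vectorizing this entry is profitable. Assuming four unique scalar adds
// costing 1 each and a <4 x i32> add costing 1 with CommonCost == 0, the
// returned diff is 1 - 4 = -3 in favour of the vector form.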
14483 // Calculate cost difference from vectorizing set of GEPs.
14484 // Negative value means vectorizing is profitable.
14485 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14486 assert((E->State == TreeEntry::Vectorize ||
14487 E->State == TreeEntry::StridedVectorize ||
14488 E->State == TreeEntry::CompressVectorize) &&
14489 "Entry state expected to be Vectorize, StridedVectorize or "
14490 "MaskedLoadCompressVectorize here.");
14491 InstructionCost ScalarCost = 0;
14492 InstructionCost VecCost = 0;
14493 std::tie(ScalarCost, VecCost) = getGEPCosts(
14494 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14495 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14496 "Calculated GEPs cost for Tree"));
14497
14498 return VecCost - ScalarCost;
14499 };
14500
14501 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14502 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14503 if (MinMaxID == Intrinsic::not_intrinsic)
14505 Type *CanonicalType = Ty;
14506 if (CanonicalType->isPtrOrPtrVectorTy())
14507 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14508 CanonicalType->getContext(),
14509 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14510
14511 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14512 {CanonicalType, CanonicalType});
14513 InstructionCost IntrinsicCost =
14514 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14515 // If the selects are the only uses of the compares, they will be
14516 // dead and we can adjust the cost by removing their cost.
14517 if (VI && SelectOnly) {
14518 assert((!Ty->isVectorTy() || SLPReVec) &&
14519 "Expected only for scalar type.");
14520 auto *CI = cast<CmpInst>(VI->getOperand(0));
14521 IntrinsicCost -= TTI->getCmpSelInstrCost(
14522 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14523 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14524 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14525 }
14526 return IntrinsicCost;
14527 };
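// Illustrative example (editorial sketch, not in the original source): for
// the pattern select (icmp slt %a, %b), %a, %b GetMinMaxCost prices the pair
// as a single smin intrinsic, and when the compare is only used by that
// select its getCmpSelInstrCost is subtracted so the compare is not counted
// twice.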
14528 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14529 Instruction *VI) {
14530 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14531 return Cost;
14532 };
14533 switch (ShuffleOrOp) {
14534 case Instruction::PHI: {
14535 // Count reused scalars.
14536 InstructionCost ScalarCost = 0;
14537 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14538 for (Value *V : UniqueValues) {
14539 auto *PHI = dyn_cast<PHINode>(V);
14540 if (!PHI)
14541 continue;
14542
14543 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14544 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14545 Value *Op = PHI->getIncomingValue(I);
14546 Operands[I] = Op;
14547 }
14548 if (const TreeEntry *OpTE =
14549 getSameValuesTreeEntry(Operands.front(), Operands))
14550 if (CountedOps.insert(OpTE).second &&
14551 !OpTE->ReuseShuffleIndices.empty())
14552 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14553 OpTE->Scalars.size());
14554 }
14555
14556 return CommonCost - ScalarCost;
14557 }
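// Illustrative example (editorial sketch, not in the original source): a PHI
// entry itself adds nothing beyond CommonCost; the loop above only credits
// TCC_Basic for every extra reused lane of an incoming-operand entry whose
// ReuseShuffleIndices is longer than its scalar list, which lowers the
// returned diff and makes such PHIs cheaper to vectorize.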
14558 case Instruction::ExtractValue:
14559 case Instruction::ExtractElement: {
14560 APInt DemandedElts;
14561 VectorType *SrcVecTy = nullptr;
14562 auto GetScalarCost = [&](unsigned Idx) {
14563 if (isa<PoisonValue>(UniqueValues[Idx]))
14564 return InstructionCost(TTI::TCC_Free);
14565
14566 auto *I = cast<Instruction>(UniqueValues[Idx]);
14567 if (!SrcVecTy) {
14568 if (ShuffleOrOp == Instruction::ExtractElement) {
14569 auto *EE = cast<ExtractElementInst>(I);
14570 SrcVecTy = EE->getVectorOperandType();
14571 } else {
14572 auto *EV = cast<ExtractValueInst>(I);
14573 Type *AggregateTy = EV->getAggregateOperand()->getType();
14574 unsigned NumElts;
14575 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14576 NumElts = ATy->getNumElements();
14577 else
14578 NumElts = AggregateTy->getStructNumElements();
14579 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14580 }
14581 }
14582 if (I->hasOneUse()) {
14583 Instruction *Ext = I->user_back();
14584 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14585 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14586 // Use getExtractWithExtendCost() to calculate the cost of
14587 // extractelement/ext pair.
14588 InstructionCost Cost = TTI->getExtractWithExtendCost(
14589 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14590 CostKind);
14591 // Subtract the cost of s|zext which is subtracted separately.
14592 Cost -= TTI->getCastInstrCost(
14593 Ext->getOpcode(), Ext->getType(), I->getType(),
14594 TTI::CastContextHint::None, CostKind);
14595 return Cost;
14596 }
14597 }
14598 if (DemandedElts.isZero())
14599 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14600 DemandedElts.setBit(*getExtractIndex(I));
14601 return InstructionCost(TTI::TCC_Free);
14602 };
14603 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14604 return CommonCost - (DemandedElts.isZero()
14605 ? TTI::TCC_Free
14606 : TTI.getScalarizationOverhead(
14607 SrcVecTy, DemandedElts, /*Insert=*/false,
14608 /*Extract=*/true, CostKind));
14609 };
14610 return GetCostDiff(GetScalarCost, GetVectorCost);
14611 }
14612 case Instruction::InsertElement: {
14613 assert(E->ReuseShuffleIndices.empty() &&
14614 "Unique insertelements only are expected.");
14615 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14616 unsigned const NumElts = SrcVecTy->getNumElements();
14617 unsigned const NumScalars = VL.size();
14618
14619 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14620
14621 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14622 unsigned OffsetBeg = *getElementIndex(VL.front());
14623 unsigned OffsetEnd = OffsetBeg;
14624 InsertMask[OffsetBeg] = 0;
14625 for (auto [I, V] : enumerate(VL.drop_front())) {
14626 unsigned Idx = *getElementIndex(V);
14627 if (OffsetBeg > Idx)
14628 OffsetBeg = Idx;
14629 else if (OffsetEnd < Idx)
14630 OffsetEnd = Idx;
14631 InsertMask[Idx] = I + 1;
14632 }
14633 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14634 if (NumOfParts > 0 && NumOfParts < NumElts)
14635 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14636 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14637 VecScalarsSz;
14638 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14639 unsigned InsertVecSz = std::min<unsigned>(
14640 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14641 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14642 bool IsWholeSubvector =
14643 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14644 // Check if we can safely insert a subvector. If it is not possible, just
14645 // generate a whole-sized vector and shuffle the source vector and the new
14646 // subvector.
14647 if (OffsetBeg + InsertVecSz > VecSz) {
14648 // Align OffsetBeg to generate correct mask.
14649 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14650 InsertVecSz = VecSz;
14651 }
14652
14653 APInt DemandedElts = APInt::getZero(NumElts);
14654 // TODO: Add support for Instruction::InsertValue.
14655 SmallVector<int> Mask;
14656 if (!E->ReorderIndices.empty()) {
14657 inversePermutation(E->ReorderIndices, Mask);
14658 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14659 } else {
14660 Mask.assign(VecSz, PoisonMaskElem);
14661 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14662 }
14663 bool IsIdentity = true;
14664 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14665 Mask.swap(PrevMask);
14666 for (unsigned I = 0; I < NumScalars; ++I) {
14667 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14668 DemandedElts.setBit(InsertIdx);
14669 IsIdentity &= InsertIdx - OffsetBeg == I;
14670 Mask[InsertIdx - OffsetBeg] = I;
14671 }
14672 assert(Offset < NumElts && "Failed to find vector index offset");
14673
14675 Cost -=
14676 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14677 /*Insert*/ true, /*Extract*/ false, CostKind);
14678
14679 // First cost - resize to actual vector size if not identity shuffle or
14680 // need to shift the vector.
14681 // Do not calculate the cost if the actual size is the register size and
14682 // we can merge this shuffle with the following SK_Select.
14683 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14684 if (!IsIdentity)
14685 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14686 InsertVecTy, Mask);
14687 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14688 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14689 }));
14690 // Second cost - permutation with subvector, if some elements are from the
14691 // initial vector or inserting a subvector.
14692 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14693 // subvector of ActualVecTy.
14694 SmallBitVector InMask =
14695 isUndefVector(FirstInsert->getOperand(0),
14696 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14697 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14698 if (InsertVecSz != VecSz) {
14699 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14700 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14701 CostKind, OffsetBeg - Offset, InsertVecTy);
14702 } else {
14703 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14704 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14705 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14706 I <= End; ++I)
14707 if (Mask[I] != PoisonMaskElem)
14708 Mask[I] = I + VecSz;
14709 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14710 Mask[I] =
14711 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14712 Cost +=
14713 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14714 }
14715 }
14716 return Cost;
14717 }
14718 case Instruction::ZExt:
14719 case Instruction::SExt:
14720 case Instruction::FPToUI:
14721 case Instruction::FPToSI:
14722 case Instruction::FPExt:
14723 case Instruction::PtrToInt:
14724 case Instruction::IntToPtr:
14725 case Instruction::SIToFP:
14726 case Instruction::UIToFP:
14727 case Instruction::Trunc:
14728 case Instruction::FPTrunc:
14729 case Instruction::BitCast: {
14730 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14731 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14732 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14733 unsigned Opcode = ShuffleOrOp;
14734 unsigned VecOpcode = Opcode;
14735 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14736 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14737 // Check if the values are candidates to demote.
14738 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14739 if (SrcIt != MinBWs.end()) {
14740 SrcBWSz = SrcIt->second.first;
14741 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14742 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14743 SrcVecTy =
14744 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14745 }
14746 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14747 if (BWSz == SrcBWSz) {
14748 VecOpcode = Instruction::BitCast;
14749 } else if (BWSz < SrcBWSz) {
14750 VecOpcode = Instruction::Trunc;
14751 } else if (It != MinBWs.end()) {
14752 assert(BWSz > SrcBWSz && "Invalid cast!");
14753 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14754 } else if (SrcIt != MinBWs.end()) {
14755 assert(BWSz > SrcBWSz && "Invalid cast!");
14756 VecOpcode =
14757 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14758 }
14759 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14760 !SrcIt->second.second) {
14761 VecOpcode = Instruction::UIToFP;
14762 }
14763 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14764 assert(Idx == 0 && "Expected 0 index only");
14765 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14766 VL0->getOperand(0)->getType(),
14767 TTI::CastContextHint::None, CostKind);
14768 };
14769 auto GetVectorCost = [=](InstructionCost CommonCost) {
14770 // Do not count cost here if minimum bitwidth is in effect and it is just
14771 // a bitcast (here it is just a noop).
14772 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14773 return CommonCost;
14774 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14775 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14776
14777 bool IsArithmeticExtendedReduction =
14778 E->Idx == 0 && UserIgnoreList &&
14779 all_of(*UserIgnoreList, [](Value *V) {
14780 auto *I = cast<Instruction>(V);
14781 return is_contained({Instruction::Add, Instruction::FAdd,
14782 Instruction::Mul, Instruction::FMul,
14783 Instruction::And, Instruction::Or,
14784 Instruction::Xor},
14785 I->getOpcode());
14786 });
14787 if (IsArithmeticExtendedReduction &&
14788 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14789 return CommonCost;
14790 return CommonCost +
14791 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14792 VecOpcode == Opcode ? VI : nullptr);
14793 };
14794 return GetCostDiff(GetScalarCost, GetVectorCost);
14795 }
14796 case Instruction::FCmp:
14797 case Instruction::ICmp:
14798 case Instruction::Select: {
14799 CmpPredicate VecPred, SwappedVecPred;
14800 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14801 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14802 match(VL0, MatchCmp))
14803 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14804 else
14805 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14806 ? CmpInst::BAD_FCMP_PREDICATE
14807 : CmpInst::BAD_ICMP_PREDICATE;
14808 auto GetScalarCost = [&](unsigned Idx) {
14809 if (isa<PoisonValue>(UniqueValues[Idx]))
14810 return InstructionCost(TTI::TCC_Free);
14811
14812 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14813 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14814 ? CmpInst::BAD_FCMP_PREDICATE
14815 : CmpInst::BAD_ICMP_PREDICATE;
14816 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14817 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14818 !match(VI, MatchCmp)) ||
14819 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14820 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14821 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14822 ? CmpInst::BAD_FCMP_PREDICATE
14823 : CmpInst::BAD_ICMP_PREDICATE;
14824
14825 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14826 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14827 CostKind, getOperandInfo(VI->getOperand(0)),
14828 getOperandInfo(VI->getOperand(1)), VI);
14829 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14830 if (IntrinsicCost.isValid())
14831 ScalarCost = IntrinsicCost;
14832
14833 return ScalarCost;
14834 };
14835 auto GetVectorCost = [&](InstructionCost CommonCost) {
14836 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14837
14838 InstructionCost VecCost =
14839 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14840 CostKind, getOperandInfo(E->getOperand(0)),
14841 getOperandInfo(E->getOperand(1)), VL0);
14842 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14843 auto *CondType =
14844 getWidenedType(SI->getCondition()->getType(), VL.size());
14845 unsigned CondNumElements = CondType->getNumElements();
14846 unsigned VecTyNumElements = getNumElements(VecTy);
14847 assert(VecTyNumElements >= CondNumElements &&
14848 VecTyNumElements % CondNumElements == 0 &&
14849 "Cannot vectorize Instruction::Select");
14850 if (CondNumElements != VecTyNumElements) {
14851 // When the return type is i1 but the source is a fixed vector type, we
14852 // need to duplicate the condition value.
14853 VecCost += ::getShuffleCost(
14854 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14855 createReplicatedMask(VecTyNumElements / CondNumElements,
14856 CondNumElements));
14857 }
14858 }
14859 return VecCost + CommonCost;
14860 };
14861 return GetCostDiff(GetScalarCost, GetVectorCost);
14862 }
14863 case TreeEntry::MinMax: {
14864 auto GetScalarCost = [&](unsigned Idx) {
14865 return GetMinMaxCost(OrigScalarTy);
14866 };
14867 auto GetVectorCost = [&](InstructionCost CommonCost) {
14868 InstructionCost VecCost = GetMinMaxCost(VecTy);
14869 return VecCost + CommonCost;
14870 };
14871 return GetCostDiff(GetScalarCost, GetVectorCost);
14872 }
14873 case TreeEntry::FMulAdd: {
14874 auto GetScalarCost = [&](unsigned Idx) {
14875 if (isa<PoisonValue>(UniqueValues[Idx]))
14876 return InstructionCost(TTI::TCC_Free);
14877 return GetFMulAddCost(E->getOperations(),
14878 cast<Instruction>(UniqueValues[Idx]));
14879 };
14880 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14881 FastMathFlags FMF;
14882 FMF.set();
14883 for (Value *V : E->Scalars) {
14884 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14885 FMF &= FPCI->getFastMathFlags();
14886 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14887 FMF &= FPCIOp->getFastMathFlags();
14888 }
14889 }
14890 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14891 {VecTy, VecTy, VecTy}, FMF);
14892 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14893 return VecCost + CommonCost;
14894 };
14895 return GetCostDiff(GetScalarCost, GetVectorCost);
14896 }
14897 case Instruction::FNeg:
14898 case Instruction::Add:
14899 case Instruction::FAdd:
14900 case Instruction::Sub:
14901 case Instruction::FSub:
14902 case Instruction::Mul:
14903 case Instruction::FMul:
14904 case Instruction::UDiv:
14905 case Instruction::SDiv:
14906 case Instruction::FDiv:
14907 case Instruction::URem:
14908 case Instruction::SRem:
14909 case Instruction::FRem:
14910 case Instruction::Shl:
14911 case Instruction::LShr:
14912 case Instruction::AShr:
14913 case Instruction::And:
14914 case Instruction::Or:
14915 case Instruction::Xor: {
14916 auto GetScalarCost = [&](unsigned Idx) {
14917 if (isa<PoisonValue>(UniqueValues[Idx]))
14918 return InstructionCost(TTI::TCC_Free);
14919
14920 // We cannot retrieve the operand from UniqueValues[Idx] because an
14921 // interchangeable instruction may be used. The order and the actual
14922 // operand might differ from what is retrieved from UniqueValues[Idx].
14923 Value *Op1 = E->getOperand(0)[Idx];
14924 Value *Op2;
14925 SmallVector<const Value *, 2> Operands(1, Op1);
14926 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14927 Op2 = Op1;
14928 } else {
14929 Op2 = E->getOperand(1)[Idx];
14930 Operands.push_back(Op2);
14931 }
14932 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14933 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14934 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14935 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14936 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14937 I && (ShuffleOrOp == Instruction::FAdd ||
14938 ShuffleOrOp == Instruction::FSub)) {
14939 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14940 if (IntrinsicCost.isValid())
14941 ScalarCost = IntrinsicCost;
14942 }
14943 return ScalarCost;
14944 };
14945 auto GetVectorCost = [=](InstructionCost CommonCost) {
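// If this 'and' node is being narrowed (MinBWs) and one operand consists of
// constants whose low demanded bits are all ones, the 'and' is a no-op after
// truncation, so only the common shuffle cost remains.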
14946 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14947 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14948 ArrayRef<Value *> Ops = E->getOperand(I);
14949 if (all_of(Ops, [&](Value *Op) {
14950 auto *CI = dyn_cast<ConstantInt>(Op);
14951 return CI && CI->getValue().countr_one() >= It->second.first;
14952 }))
14953 return CommonCost;
14954 }
14955 }
14956 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14957 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14958 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14959 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14960 Op2Info, {}, nullptr, TLI) +
14961 CommonCost;
14962 };
14963 return GetCostDiff(GetScalarCost, GetVectorCost);
14964 }
14965 case Instruction::GetElementPtr: {
14966 return CommonCost + GetGEPCostDiff(VL, VL0);
14967 }
14968 case Instruction::Load: {
14969 auto GetScalarCost = [&](unsigned Idx) {
14970 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14971 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14972 VI->getAlign(), VI->getPointerAddressSpace(),
14973 CostKind, TTI::OperandValueInfo(), VI);
14974 };
14975 auto *LI0 = cast<LoadInst>(VL0);
14976 auto GetVectorCost = [&](InstructionCost CommonCost) {
14977 InstructionCost VecLdCost;
14978 switch (E->State) {
14979 case TreeEntry::Vectorize:
14980 if (unsigned Factor = E->getInterleaveFactor()) {
14981 VecLdCost = TTI->getInterleavedMemoryOpCost(
14982 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14983 LI0->getPointerAddressSpace(), CostKind);
14984
14985 } else {
14986 VecLdCost = TTI->getMemoryOpCost(
14987 Instruction::Load, VecTy, LI0->getAlign(),
14988 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14989 }
14990 break;
14991 case TreeEntry::StridedVectorize: {
14992 Align CommonAlignment =
14993 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14994 VecLdCost = TTI->getStridedMemoryOpCost(
14995 Instruction::Load, VecTy, LI0->getPointerOperand(),
14996 /*VariableMask=*/false, CommonAlignment, CostKind);
14997 break;
14998 }
14999 case TreeEntry::CompressVectorize: {
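// Compressed loads: cost a wider vector load (interleaved, masked or plain,
// as detected by isMaskedLoadCompress) plus a single-source shuffle that
// compresses the loaded lanes down to the scalars used by this node.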
15000 bool IsMasked;
15001 unsigned InterleaveFactor;
15002 SmallVector<int> CompressMask;
15003 VectorType *LoadVecTy;
15004 SmallVector<Value *> Scalars(VL);
15005 if (!E->ReorderIndices.empty()) {
15006 SmallVector<int> Mask(E->ReorderIndices.begin(),
15007 E->ReorderIndices.end());
15008 reorderScalars(Scalars, Mask);
15009 }
15010 SmallVector<Value *> PointerOps(Scalars.size());
15011 for (auto [I, V] : enumerate(Scalars))
15012 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15013 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15014 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15015 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15016 CompressMask, LoadVecTy);
15017 assert(IsVectorized && "Failed to vectorize load");
15018 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15019 InterleaveFactor, IsMasked);
15020 Align CommonAlignment = LI0->getAlign();
15021 if (InterleaveFactor) {
15022 VecLdCost = TTI->getInterleavedMemoryOpCost(
15023 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15024 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15025 } else if (IsMasked) {
15026 VecLdCost = TTI->getMaskedMemoryOpCost(
15027 Instruction::Load, LoadVecTy, CommonAlignment,
15028 LI0->getPointerAddressSpace(), CostKind);
15029 // TODO: include this cost into CommonCost.
15030 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15031 LoadVecTy, CompressMask, CostKind);
15032 } else {
15033 VecLdCost = TTI->getMemoryOpCost(
15034 Instruction::Load, LoadVecTy, CommonAlignment,
15035 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15036 // TODO: include this cost into CommonCost.
15037 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15038 LoadVecTy, CompressMask, CostKind);
15039 }
15040 break;
15041 }
15042 case TreeEntry::ScatterVectorize: {
15043 Align CommonAlignment =
15044 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15045 VecLdCost = TTI->getGatherScatterOpCost(
15046 Instruction::Load, VecTy, LI0->getPointerOperand(),
15047 /*VariableMask=*/false, CommonAlignment, CostKind);
15048 break;
15049 }
15050 case TreeEntry::CombinedVectorize:
15051 case TreeEntry::SplitVectorize:
15052 case TreeEntry::NeedToGather:
15053 llvm_unreachable("Unexpected vectorization state.");
15054 }
15055 return VecLdCost + CommonCost;
15056 };
15057
15058 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15059 // If this node generates masked gather load then it is not a terminal node.
15060 // Hence address operand cost is estimated separately.
15061 if (E->State == TreeEntry::ScatterVectorize)
15062 return Cost;
15063
15064 // Estimate cost of GEPs since this tree node is a terminator.
15065 SmallVector<Value *> PointerOps(VL.size());
15066 for (auto [I, V] : enumerate(VL))
15067 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15068 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15069 }
15070 case Instruction::Store: {
15071 bool IsReorder = !E->ReorderIndices.empty();
15072 auto GetScalarCost = [=](unsigned Idx) {
15073 auto *VI = cast<StoreInst>(VL[Idx]);
15074 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15075 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15076 VI->getAlign(), VI->getPointerAddressSpace(),
15077 CostKind, OpInfo, VI);
15078 };
15079 auto *BaseSI =
15080 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15081 auto GetVectorCost = [=](InstructionCost CommonCost) {
15082 // We know that we can merge the stores. Calculate the cost.
15083 InstructionCost VecStCost;
15084 if (E->State == TreeEntry::StridedVectorize) {
15085 Align CommonAlignment =
15086 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15087 VecStCost = TTI->getStridedMemoryOpCost(
15088 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
15089 /*VariableMask=*/false, CommonAlignment, CostKind);
15090 } else {
15091 assert(E->State == TreeEntry::Vectorize &&
15092 "Expected either strided or consecutive stores.");
15093 if (unsigned Factor = E->getInterleaveFactor()) {
15094 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15095 "No reused shuffles expected");
15096 CommonCost = 0;
15097 VecStCost = TTI->getInterleavedMemoryOpCost(
15098 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15099 BaseSI->getPointerAddressSpace(), CostKind);
15100 } else {
15101 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15102 VecStCost = TTI->getMemoryOpCost(
15103 Instruction::Store, VecTy, BaseSI->getAlign(),
15104 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15105 }
15106 }
15107 return VecStCost + CommonCost;
15108 };
15109 SmallVector<Value *> PointerOps(VL.size());
15110 for (auto [I, V] : enumerate(VL)) {
15111 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15112 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15113 }
15114
15115 return GetCostDiff(GetScalarCost, GetVectorCost) +
15116 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15117 }
15118 case Instruction::Call: {
15119 auto GetScalarCost = [&](unsigned Idx) {
15120 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15121 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15122 if (ID != Intrinsic::not_intrinsic) {
15123 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15124 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15125 }
15126 return TTI->getCallInstrCost(CI->getCalledFunction(),
15127 CI->getFunctionType()->getReturnType(),
15128 CI->getFunctionType()->params(), CostKind);
15129 };
15130 auto GetVectorCost = [=](InstructionCost CommonCost) {
15131 auto *CI = cast<CallInst>(VL0);
15132 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15133 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15134 CI, ID, VecTy->getNumElements(),
15135 It != MinBWs.end() ? It->second.first : 0, TTI);
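// getVectorCallCosts returns both the vectorized library-call cost and the
// vector intrinsic cost; the cheaper alternative is added to the common cost.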
15136 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15137 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15138 };
15139 return GetCostDiff(GetScalarCost, GetVectorCost);
15140 }
15141 case Instruction::ShuffleVector: {
15142 if (!SLPReVec || E->isAltShuffle())
15143 assert(E->isAltShuffle() &&
15144 ((Instruction::isBinaryOp(E->getOpcode()) &&
15145 Instruction::isBinaryOp(E->getAltOpcode())) ||
15146 (Instruction::isCast(E->getOpcode()) &&
15147 Instruction::isCast(E->getAltOpcode())) ||
15148 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15149 "Invalid Shuffle Vector Operand");
15150 // Try to find the previous shuffle node with the same operands and same
15151 // main/alternate ops.
15152 auto TryFindNodeWithEqualOperands = [=]() {
15153 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15154 if (TE.get() == E)
15155 break;
15156 if (TE->hasState() && TE->isAltShuffle() &&
15157 ((TE->getOpcode() == E->getOpcode() &&
15158 TE->getAltOpcode() == E->getAltOpcode()) ||
15159 (TE->getOpcode() == E->getAltOpcode() &&
15160 TE->getAltOpcode() == E->getOpcode())) &&
15161 TE->hasEqualOperands(*E))
15162 return true;
15163 }
15164 return false;
15165 };
15166 auto GetScalarCost = [&](unsigned Idx) {
15167 if (isa<PoisonValue>(UniqueValues[Idx]))
15168 return InstructionCost(TTI::TCC_Free);
15169
15170 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15171 assert(E->getMatchingMainOpOrAltOp(VI) &&
15172 "Unexpected main/alternate opcode");
15173 (void)E;
15174 return TTI->getInstructionCost(VI, CostKind);
15175 };
15176 // Need to clear CommonCost since the final shuffle cost is included into
15177 // vector cost.
15178 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15179 // VecCost is equal to sum of the cost of creating 2 vectors
15180 // and the cost of creating shuffle.
15181 InstructionCost VecCost = 0;
15182 if (TryFindNodeWithEqualOperands()) {
15183 LLVM_DEBUG({
15184 dbgs() << "SLP: diamond match for alternate node found.\n";
15185 E->dump();
15186 });
15187 // No need to add new vector costs here since we're going to reuse
15188 // same main/alternate vector ops, just do different shuffling.
15189 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15190 VecCost =
15191 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15192 VecCost +=
15193 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15194 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15195 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15196 VecCost = TTIRef.getCmpSelInstrCost(
15197 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15198 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15199 VL0);
15200 VecCost += TTIRef.getCmpSelInstrCost(
15201 E->getOpcode(), VecTy, MaskTy,
15202 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15203 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15204 E->getAltOp());
15205 } else {
15206 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15207 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15208 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15209 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15210 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15211 unsigned SrcBWSz =
15212 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15213 if (SrcIt != MinBWs.end()) {
15214 SrcBWSz = SrcIt->second.first;
15215 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15216 SrcTy = getWidenedType(SrcSclTy, VL.size());
15217 }
15218 if (BWSz <= SrcBWSz) {
15219 if (BWSz < SrcBWSz)
15220 VecCost =
15221 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15222 TTI::CastContextHint::None, CostKind);
15223 LLVM_DEBUG({
15224 dbgs()
15225 << "SLP: alternate extension, which should be truncated.\n";
15226 E->dump();
15227 });
15228 return VecCost;
15229 }
15230 }
15231 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15232 TTI::CastContextHint::None, CostKind);
15233 VecCost +=
15234 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15235 TTI::CastContextHint::None, CostKind);
15236 }
15237 SmallVector<int> Mask;
15238 E->buildAltOpShuffleMask(
15239 [&](Instruction *I) {
15240 assert(E->getMatchingMainOpOrAltOp(I) &&
15241 "Unexpected main/alternate opcode");
15242 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15243 *TLI);
15244 },
15245 Mask);
15246 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15247 FinalVecTy, Mask, CostKind);
15248 // Patterns like [fadd,fsub] can be combined into a single instruction
15249 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15250 // need to take into account their order when looking for the most used
15251 // order.
15252 unsigned Opcode0 = E->getOpcode();
15253 unsigned Opcode1 = E->getAltOpcode();
15254 SmallBitVector OpcodeMask(
15255 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15256 // If this pattern is supported by the target then we consider the
15257 // order.
15258 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15259 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15260 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15261 return AltVecCost < VecCost ? AltVecCost : VecCost;
15262 }
15263 // TODO: Check the reverse order too.
15264 return VecCost;
15265 };
15266 if (SLPReVec && !E->isAltShuffle())
15267 return GetCostDiff(
15268 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15269 // If a group uses mask in order, the shufflevector can be
15270 // eliminated by instcombine. Then the cost is 0.
15271 assert(isa<ShuffleVectorInst>(VL.front()) &&
15272 "Not supported shufflevector usage.");
15273 auto *SV = cast<ShuffleVectorInst>(VL.front());
15274 unsigned SVNumElements =
15275 cast<FixedVectorType>(SV->getOperand(0)->getType())
15276 ->getNumElements();
15277 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15278 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15279 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15280 int NextIndex = 0;
15281 if (!all_of(Group, [&](Value *V) {
15282 assert(isa<ShuffleVectorInst>(V) &&
15283 "Not supported shufflevector usage.");
15284 auto *SV = cast<ShuffleVectorInst>(V);
15285 int Index;
15286 [[maybe_unused]] bool IsExtractSubvectorMask =
15287 SV->isExtractSubvectorMask(Index);
15288 assert(IsExtractSubvectorMask &&
15289 "Not supported shufflevector usage.");
15290 if (NextIndex != Index)
15291 return false;
15292 NextIndex += SV->getShuffleMask().size();
15293 return true;
15294 }))
15295 return ::getShuffleCost(
15296 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15297 calculateShufflevectorMask(E->Scalars));
15298 }
15299 return TTI::TCC_Free;
15300 });
15301 return GetCostDiff(GetScalarCost, GetVectorCost);
15302 }
15303 case Instruction::Freeze:
15304 return CommonCost;
15305 default:
15306 llvm_unreachable("Unknown instruction");
15307 }
15308}
15309
15310bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15311 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15312 << VectorizableTree.size() << " is fully vectorizable.\n");
15313
15314 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15315 SmallVector<int> Mask;
15316 return TE->isGather() &&
15317 !any_of(TE->Scalars,
15318 [this](Value *V) { return EphValues.contains(V); }) &&
15319 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15320 TE->Scalars.size() < Limit ||
15321 (((TE->hasState() &&
15322 TE->getOpcode() == Instruction::ExtractElement) ||
15323 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15324 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15325 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15326 !TE->isAltShuffle()) ||
15327 any_of(TE->Scalars, IsaPred<LoadInst>));
15328 };
15329
15330 // We only handle trees of heights 1 and 2.
15331 if (VectorizableTree.size() == 1 &&
15332 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15333 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15334 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15335 (ForReduction &&
15336 AreVectorizableGathers(VectorizableTree[0].get(),
15337 VectorizableTree[0]->Scalars.size()) &&
15338 VectorizableTree[0]->getVectorFactor() > 2)))
15339 return true;
15340
15341 if (VectorizableTree.size() != 2)
15342 return false;
15343
15344 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15345 // with second gather nodes if they have fewer scalar operands than the
15346 // initial tree element (it may be profitable to shuffle the second gather)
15347 // or they are extractelements, which form shuffle.
15348 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15349 AreVectorizableGathers(VectorizableTree[1].get(),
15350 VectorizableTree[0]->Scalars.size()))
15351 return true;
15352
15353 // Gathering cost would be too much for tiny trees.
15354 if (VectorizableTree[0]->isGather() ||
15355 (VectorizableTree[1]->isGather() &&
15356 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15357 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15358 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15359 return false;
15360
15361 return true;
15362}
15363
15364static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15365 TargetTransformInfo *TTI,
15366 bool MustMatchOrInst) {
15367 // Look past the root to find a source value. Arbitrarily follow the
15368 // path through operand 0 of any 'or'. Also, peek through optional
15369 // shift-left-by-multiple-of-8-bits.
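// A typical candidate is a chain such as
//   (zext i8 %a to i32) | ((zext i8 %b to i32) << 8) | (... << 16) | (... << 24)
// where every leaf is a zero-extended load; the backend can fold such a chain
// into a single wide load.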
15370 Value *ZextLoad = Root;
15371 const APInt *ShAmtC;
15372 bool FoundOr = false;
15373 while (!isa<ConstantExpr>(ZextLoad) &&
15374 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15375 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15376 ShAmtC->urem(8) == 0))) {
15377 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15378 ZextLoad = BinOp->getOperand(0);
15379 if (BinOp->getOpcode() == Instruction::Or)
15380 FoundOr = true;
15381 }
15382 // Check if the input is an extended load of the required or/shift expression.
15383 Value *Load;
15384 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15385 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15386 return false;
15387
15388 // Require that the total load bit width is a legal integer type.
15389 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15390 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15391 Type *SrcTy = Load->getType();
15392 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15393 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15394 return false;
15395
15396 // Everything matched - assume that we can fold the whole sequence using
15397 // load combining.
15398 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15399 << *(cast<Instruction>(Root)) << "\n");
15400
15401 return true;
15402}
15403
15404 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15405 if (RdxKind != RecurKind::Or)
15406 return false;
15407
15408 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15409 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15410 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15411 /* MatchOr */ false);
15412}
15413
15414 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15415 // Peek through a final sequence of stores and check if all operations are
15416 // likely to be load-combined.
15417 unsigned NumElts = Stores.size();
15418 for (Value *Scalar : Stores) {
15419 Value *X;
15420 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15421 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15422 return false;
15423 }
15424 return true;
15425}
15426
15427bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15428 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15429 return true;
15430
15431 // Graph is empty - do nothing.
15432 if (VectorizableTree.empty()) {
15433 assert(ExternalUses.empty() && "We shouldn't have any external users");
15434
15435 return true;
15436 }
15437
15438 // No need to vectorize inserts of gathered values.
15439 if (VectorizableTree.size() == 2 &&
15440 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15441 VectorizableTree[1]->isGather() &&
15442 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15443 !(isSplat(VectorizableTree[1]->Scalars) ||
15444 allConstant(VectorizableTree[1]->Scalars))))
15445 return true;
15446
15447 // If the graph includes only PHI nodes and gathers, it is definitely not
15448 // profitable for the vectorization, we can skip it, if the cost threshold is
15449 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15450 // gathers/buildvectors.
15451 constexpr int Limit = 4;
15452 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15453 !VectorizableTree.empty() &&
15454 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15455 return (TE->isGather() &&
15456 (!TE->hasState() ||
15457 TE->getOpcode() != Instruction::ExtractElement) &&
15458 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15459 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15460 }))
15461 return true;
15462
15463 // Do not vectorize small tree of phis only, if all vector phis are also
15464 // gathered.
15465 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15466 VectorizableTree.size() <= Limit &&
15467 all_of(VectorizableTree,
15468 [&](const std::unique_ptr<TreeEntry> &TE) {
15469 return (TE->isGather() &&
15470 (!TE->hasState() ||
15471 TE->getOpcode() != Instruction::ExtractElement) &&
15472 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15473 Limit) ||
15474 (TE->hasState() &&
15475 (TE->getOpcode() == Instruction::InsertElement ||
15476 (TE->getOpcode() == Instruction::PHI &&
15477 all_of(TE->Scalars, [&](Value *V) {
15478 return isa<PoisonValue>(V) || MustGather.contains(V);
15479 }))));
15480 }) &&
15481 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15482 return TE->State == TreeEntry::Vectorize &&
15483 TE->getOpcode() == Instruction::PHI;
15484 }))
15485 return true;
15486
15487 // If the tree contains only phis, buildvectors, split nodes and
15488 // small nodes with reuses, we can skip it.
15489 SmallVector<const TreeEntry *> StoreLoadNodes;
15490 unsigned NumGathers = 0;
15491 constexpr int LimitTreeSize = 36;
15492 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15493 all_of(VectorizableTree,
15494 [&](const std::unique_ptr<TreeEntry> &TE) {
15495 if (!TE->isGather() && TE->hasState() &&
15496 (TE->getOpcode() == Instruction::Load ||
15497 TE->getOpcode() == Instruction::Store)) {
15498 StoreLoadNodes.push_back(TE.get());
15499 return true;
15500 }
15501 if (TE->isGather())
15502 ++NumGathers;
15503 return TE->State == TreeEntry::SplitVectorize ||
15504 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15505 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15506 VectorizableTree.size() > LimitTreeSize) ||
15507 (TE->isGather() &&
15508 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15509 (TE->hasState() &&
15510 (TE->getOpcode() == Instruction::PHI ||
15511 (TE->hasCopyableElements() &&
15512 static_cast<unsigned>(count_if(
15513 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15514 TE->Scalars.size() / 2) ||
15515 ((!TE->ReuseShuffleIndices.empty() ||
15516 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15517 TE->Scalars.size() == 2)));
15518 }) &&
15519 (StoreLoadNodes.empty() ||
15520 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15521 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15522 return TE->getOpcode() == Instruction::Store ||
15523 all_of(TE->Scalars, [&](Value *V) {
15524 return !isa<LoadInst>(V) ||
15525 areAllUsersVectorized(cast<Instruction>(V));
15526 });
15527 })))))
15528 return true;
15529
15530 // If the tree contains only buildvector, 2 non-buildvectors (with root user
15531 // tree node) and other buildvectors, we can skip it.
15532 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15533 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15534 VectorizableTree.size() >= Limit &&
15535 count_if(ArrayRef(VectorizableTree).drop_front(),
15536 [&](const std::unique_ptr<TreeEntry> &TE) {
15537 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15538 TE->UserTreeIndex.UserTE->Idx == 0;
15539 }) == 2)
15540 return true;
15541
15542 // If the tree contains only vectorization of the phi node from the
15543 // buildvector - skip it.
15544 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15545 VectorizableTree.size() > 2 &&
15546 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15547 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15548 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15549 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15550 all_of(
15551 ArrayRef(VectorizableTree).drop_front(2),
15552 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15553 return true;
15554
15555 // We can vectorize the tree if its size is greater than or equal to the
15556 // minimum size specified by the MinTreeSize command line option.
15557 if (VectorizableTree.size() >= MinTreeSize)
15558 return false;
15559
15560 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15561 // can vectorize it if we can prove it fully vectorizable.
15562 if (isFullyVectorizableTinyTree(ForReduction))
15563 return false;
15564
15565 // Check if any of the gather node forms an insertelement buildvector
15566 // somewhere.
15567 bool IsAllowedSingleBVNode =
15568 VectorizableTree.size() > 1 ||
15569 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15570 !VectorizableTree.front()->isAltShuffle() &&
15571 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15572 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15573 allSameBlock(VectorizableTree.front()->Scalars));
15574 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15575 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15576 return isa<ExtractElementInst, Constant>(V) ||
15577 (IsAllowedSingleBVNode &&
15578 !V->hasNUsesOrMore(UsesLimit) &&
15579 any_of(V->users(), IsaPred<InsertElementInst>));
15580 });
15581 }))
15582 return false;
15583
15584 if (VectorizableTree.back()->isGather() &&
15585 VectorizableTree.back()->hasState() &&
15586 VectorizableTree.back()->isAltShuffle() &&
15587 VectorizableTree.back()->getVectorFactor() > 2 &&
15588 allSameBlock(VectorizableTree.back()->Scalars) &&
15589 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15590 TTI->getScalarizationOverhead(
15591 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15592 VectorizableTree.back()->getVectorFactor()),
15593 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15594 /*Insert=*/true, /*Extract=*/false,
15596 return false;
15597
15598 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15599 // vectorizable.
15600 return true;
15601}
15602
15603 bool BoUpSLP::isTreeNotExtendable() const {
15604 if (getCanonicalGraphSize() != getTreeSize()) {
15605 constexpr unsigned SmallTree = 3;
15606 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15607 getCanonicalGraphSize() <= SmallTree &&
15608 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15609 [](const std::unique_ptr<TreeEntry> &TE) {
15610 return TE->isGather() && TE->hasState() &&
15611 TE->getOpcode() == Instruction::Load &&
15612 !allSameBlock(TE->Scalars);
15613 }) == 1)
15614 return true;
15615 return false;
15616 }
15617 bool Res = false;
15618 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15619 TreeEntry &E = *VectorizableTree[Idx];
15620 if (E.State == TreeEntry::SplitVectorize)
15621 return false;
15622 if (!E.isGather())
15623 continue;
15624 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15625 (!E.hasState() &&
15626 all_of(E.Scalars, IsaPred<ExtractElementInst, Constant>)) ||
15627 (isa<ExtractElementInst>(E.Scalars.front()) &&
15628 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15629 return false;
15630 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15631 continue;
15632 Res = true;
15633 }
15634 return Res;
15635}
15636
15637 InstructionCost BoUpSLP::getSpillCost() {
15638 // Walk from the bottom of the tree to the top, tracking which values are
15639 // live. When we see a call instruction that is not part of our tree,
15640 // query TTI to see if there is a cost to keeping values live over it
15641 // (for example, if spills and fills are required).
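// For every such call, the cost of keeping the operand's vector value live
// across it (TTI::getCostOfKeepingLiveOverCall) is added to the result.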
15642
15643 const TreeEntry *Root = VectorizableTree.front().get();
15644 if (Root->isGather())
15645 return 0;
15646
15647 InstructionCost Cost = 0;
15648 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15649 EntriesToOperands;
15650 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15651 SmallPtrSet<const Instruction *, 8> LastInstructions;
15652 for (const auto &TEPtr : VectorizableTree) {
15653 if (!TEPtr->isGather()) {
15654 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15655 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15656 LastInstructions.insert(LastInst);
15657 }
15658 if (TEPtr->UserTreeIndex)
15659 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15660 }
15661
15662 auto NoCallIntrinsic = [this](const Instruction *I) {
15663 const auto *II = dyn_cast<IntrinsicInst>(I);
15664 if (!II)
15665 return false;
15666 if (II->isAssumeLikeIntrinsic())
15667 return true;
15668 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15669 InstructionCost IntrCost =
15670 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15671 InstructionCost CallCost = TTI->getCallInstrCost(
15672 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15673 return IntrCost < CallCost;
15674 };
15675
15676 // Maps the last instruction of an entry to the last instruction of one of
15677 // its operand entries plus a flag. If the flag is true, there are no calls
15678 // in between these instructions.
15679 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15680 CheckedInstructions;
15681 unsigned Budget = 0;
15682 const unsigned BudgetLimit =
15683 ScheduleRegionSizeBudget / VectorizableTree.size();
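// The budget bounds how many instructions are scanned while looking for
// calls; if it is exhausted, the walk conservatively behaves as if a
// non-vectorized call had been found.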
15684 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15685 const Instruction *Last) {
15686 assert(First->getParent() == Last->getParent() &&
15687 "Expected instructions in same block.");
15688 if (auto It = CheckedInstructions.find(Last);
15689 It != CheckedInstructions.end()) {
15690 const Instruction *Checked = It->second.getPointer();
15691 if (Checked == First || Checked->comesBefore(First))
15692 return It->second.getInt() != 0;
15693 Last = Checked;
15694 } else if (Last == First || Last->comesBefore(First)) {
15695 return true;
15696 }
15697 BasicBlock::reverse_iterator InstIt =
15698 ++First->getIterator().getReverse(),
15699 PrevInstIt =
15700 Last->getIterator().getReverse();
15701 SmallVector<const Instruction *> LastInstsInRange;
15702 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15703 // Debug information does not impact spill cost.
15704 // Vectorized calls, represented as vector intrinsics, do not impact spill
15705 // cost.
15706 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15707 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15708 for (const Instruction *LastInst : LastInstsInRange)
15709 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15710 return false;
15711 }
15712 if (LastInstructions.contains(&*PrevInstIt))
15713 LastInstsInRange.push_back(&*PrevInstIt);
15714
15715 ++PrevInstIt;
15716 ++Budget;
15717 }
15718 for (const Instruction *LastInst : LastInstsInRange)
15719 CheckedInstructions.try_emplace(
15720 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15721 Budget <= BudgetLimit ? 1 : 0);
15722 return Budget <= BudgetLimit;
15723 };
15724 auto AddCosts = [&](const TreeEntry *Op) {
15725 Type *ScalarTy = Op->Scalars.front()->getType();
15726 auto It = MinBWs.find(Op);
15727 if (It != MinBWs.end())
15728 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15729 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15730 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15731 if (ScalarTy->isVectorTy()) {
15732 // Handle revec dead vector instructions.
15733 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15734 }
15735 };
15736 // Memoize the relationship between blocks, i.e. if there is (at least one)
15737 // non-vectorized call between the blocks. This allows skipping the analysis of
15738 // the same block paths multiple times.
15739 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15740 ParentOpParentToPreds;
15741 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15742 BasicBlock *OpParent) {
15743 auto Key = std::make_pair(Root, OpParent);
15744 if (auto It = ParentOpParentToPreds.find(Key);
15745 It != ParentOpParentToPreds.end())
15746 return It->second;
15747 SmallVector<BasicBlock *> Worklist;
15748 if (Pred)
15749 Worklist.push_back(Pred);
15750 else
15751 Worklist.append(pred_begin(Root), pred_end(Root));
15752 SmallPtrSet<BasicBlock *, 16> Visited;
15753 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 8>
15754 ParentsPairsToAdd;
15755 bool Res = false;
15756 auto Cleanup = make_scope_exit([&]() {
15757 for (const auto &KeyPair : ParentsPairsToAdd) {
15758 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15759 "Should not have been added before.");
15760 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15761 }
15762 });
15763 while (!Worklist.empty()) {
15764 BasicBlock *BB = Worklist.pop_back_val();
15765 if (BB == OpParent || !Visited.insert(BB).second)
15766 continue;
15767 auto Pair = std::make_pair(BB, OpParent);
15768 if (auto It = ParentOpParentToPreds.find(Pair);
15769 It != ParentOpParentToPreds.end()) {
15770 Res = It->second;
15771 return Res;
15772 }
15773 ParentsPairsToAdd.insert(Pair);
15774 unsigned BlockSize = BB->size();
15775 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15776 return Res;
15777 Budget += BlockSize;
15778 if (Budget > BudgetLimit)
15779 return Res;
15780 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15781 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15782 BB->getTerminator()))
15783 return Res;
15784 Worklist.append(pred_begin(BB), pred_end(BB));
15785 }
15786 Res = true;
15787 return Res;
15788 };
15789 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15790 while (!LiveEntries.empty()) {
15791 const TreeEntry *Entry = LiveEntries.pop_back_val();
15792 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15793 if (Operands.empty())
15794 continue;
15795 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15796 BasicBlock *Parent = LastInst->getParent();
15797 for (const TreeEntry *Op : Operands) {
15798 if (!Op->isGather())
15799 LiveEntries.push_back(Op);
15800 if (Entry->State == TreeEntry::SplitVectorize ||
15801 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15802 (Op->isGather() && allConstant(Op->Scalars)))
15803 continue;
15804 Budget = 0;
15805 BasicBlock *Pred = nullptr;
15806 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15807 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15808 BasicBlock *OpParent;
15809 Instruction *OpLastInst;
15810 if (Op->isGather()) {
15811 assert(Entry->getOpcode() == Instruction::PHI &&
15812 "Expected phi node only.");
15813 OpParent = cast<PHINode>(Entry->getMainOp())
15814 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15815 OpLastInst = OpParent->getTerminator();
15816 for (Value *V : Op->Scalars) {
15817 auto *Inst = dyn_cast<Instruction>(V);
15818 if (!Inst)
15819 continue;
15820 if (isVectorized(V)) {
15821 OpParent = Inst->getParent();
15822 OpLastInst = Inst;
15823 break;
15824 }
15825 }
15826 } else {
15827 OpLastInst = EntriesToLastInstruction.at(Op);
15828 OpParent = OpLastInst->getParent();
15829 }
15830 // Check the call instructions within the same basic blocks.
15831 if (OpParent == Parent) {
15832 if (Entry->getOpcode() == Instruction::PHI) {
15833 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15834 AddCosts(Op);
15835 continue;
15836 }
15837 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15838 AddCosts(Op);
15839 continue;
15840 }
15841 // Check for call instruction in between blocks.
15842 // 1. Check entry's block to the head.
15843 if (Entry->getOpcode() != Instruction::PHI &&
15844 !CheckForNonVecCallsInSameBlock(
15845 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15846 LastInst)) {
15847 AddCosts(Op);
15848 continue;
15849 }
15850 // 2. Check op's block from the end.
15851 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15852 OpParent->getTerminator())) {
15853 AddCosts(Op);
15854 continue;
15855 }
15856 // 3. Check the predecessors of entry's block till op's block.
15857 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15858 AddCosts(Op);
15859 continue;
15860 }
15861 }
15862 }
15863
15864 return Cost;
15865}
15866
15867 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in the
15868/// buildvector sequence.
15869 static bool isFirstInsertElement(const InsertElementInst *IE1,
15870 const InsertElementInst *IE2) {
15871 if (IE1 == IE2)
15872 return false;
15873 const auto *I1 = IE1;
15874 const auto *I2 = IE2;
15875 const InsertElementInst *PrevI1;
15876 const InsertElementInst *PrevI2;
15877 unsigned Idx1 = *getElementIndex(IE1);
15878 unsigned Idx2 = *getElementIndex(IE2);
15879 do {
15880 if (I2 == IE1)
15881 return true;
15882 if (I1 == IE2)
15883 return false;
15884 PrevI1 = I1;
15885 PrevI2 = I2;
15886 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15887 getElementIndex(I1).value_or(Idx2) != Idx2)
15888 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15889 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15890 getElementIndex(I2).value_or(Idx1) != Idx1)
15891 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15892 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15893 llvm_unreachable("Two different buildvectors not expected.");
15894}
15895
15896namespace {
15897/// Returns incoming Value *, if the requested type is Value * too, or a default
15898/// value, otherwise.
15899struct ValueSelect {
15900 template <typename U>
15901 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15902 return V;
15903 }
15904 template <typename U>
15905 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15906 return U();
15907 }
15908};
15909} // namespace
15910
15911/// Does the analysis of the provided shuffle masks and performs the requested
15912/// actions on the vectors with the given shuffle masks. It tries to do it in
15913/// several steps.
15914/// 1. If the Base vector is not undef vector, resizing the very first mask to
15915/// have common VF and perform action for 2 input vectors (including non-undef
15916/// Base). Other shuffle masks are combined with the resulting after the 1 stage
15917/// and processed as a shuffle of 2 elements.
15918/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
15919/// action only for 1 vector with the given mask, if it is not the identity
15920/// mask.
15921/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15922 /// vectors, combining the masks properly between the steps.
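/// For example, with an undef \p Base and two inputs of VF 4 whose masks are
/// <0,1,u,u> and <u,u,2,3>, step 3 merges them into the two-source mask
/// <0,1,6,7> before invoking the action.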
15923template <typename T>
15924 static T *performExtractsShuffleAction(
15925 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15926 function_ref<unsigned(T *)> GetVF,
15927 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15928 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15929 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15930 SmallVector<int> Mask(ShuffleMask.begin()->second);
15931 auto VMIt = std::next(ShuffleMask.begin());
15932 T *Prev = nullptr;
15933 SmallBitVector UseMask =
15934 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15935 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15936 if (!IsBaseUndef.all()) {
15937 // Base is not undef, need to combine it with the next subvectors.
15938 std::pair<T *, bool> Res =
15939 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15940 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15941 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15942 if (Mask[Idx] == PoisonMaskElem)
15943 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15944 else
15945 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15946 }
15947 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15948 assert((!V || GetVF(V) == Mask.size()) &&
15949 "Expected base vector of VF number of elements.");
15950 Prev = Action(Mask, {nullptr, Res.first});
15951 } else if (ShuffleMask.size() == 1) {
15952 // Base is undef and only 1 vector is shuffled - perform the action only for a
15953 // single vector, if the mask is not the identity mask.
15954 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15955 /*ForSingleMask=*/true);
15956 if (Res.second)
15957 // Identity mask is found.
15958 Prev = Res.first;
15959 else
15960 Prev = Action(Mask, {ShuffleMask.begin()->first});
15961 } else {
15962 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15963 // shuffles step by step, combining shuffle between the steps.
15964 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15965 unsigned Vec2VF = GetVF(VMIt->first);
15966 if (Vec1VF == Vec2VF) {
15967 // No need to resize the input vectors since they are of the same size, we
15968 // can shuffle them directly.
15969 ArrayRef<int> SecMask = VMIt->second;
15970 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15971 if (SecMask[I] != PoisonMaskElem) {
15972 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15973 Mask[I] = SecMask[I] + Vec1VF;
15974 }
15975 }
15976 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15977 } else {
15978 // Vectors of different sizes - resize and reshuffle.
15979 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15980 /*ForSingleMask=*/false);
15981 std::pair<T *, bool> Res2 =
15982 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15983 ArrayRef<int> SecMask = VMIt->second;
15984 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15985 if (Mask[I] != PoisonMaskElem) {
15986 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15987 if (Res1.second)
15988 Mask[I] = I;
15989 } else if (SecMask[I] != PoisonMaskElem) {
15990 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15991 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15992 }
15993 }
15994 Prev = Action(Mask, {Res1.first, Res2.first});
15995 }
15996 VMIt = std::next(VMIt);
15997 }
15998 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15999 // Perform requested actions for the remaining masks/vectors.
16000 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16001 // Shuffle other input vectors, if any.
16002 std::pair<T *, bool> Res =
16003 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16004 ArrayRef<int> SecMask = VMIt->second;
16005 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16006 if (SecMask[I] != PoisonMaskElem) {
16007 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16008 "Multiple uses of scalars.");
16009 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16010 } else if (Mask[I] != PoisonMaskElem) {
16011 Mask[I] = I;
16012 }
16013 }
16014 Prev = Action(Mask, {Prev, Res.first});
16015 }
16016 return Prev;
16017}
16018
16019namespace {
16020/// Data type for handling buildvector sequences with the reused scalars from
16021/// other tree entries.
16022template <typename T> struct ShuffledInsertData {
16023 /// List of insertelements to be replaced by shuffles.
16024 SmallVector<InsertElementInst *> InsertElements;
16025 /// The parent vectors and shuffle mask for the given list of inserts.
16026 MapVector<T, SmallVector<int>> ValueMasks;
16027};
16028} // namespace
16029
16030 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
16031 InstructionCost ReductionCost) {
16032 InstructionCost Cost = ReductionCost;
16033 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16034 << VectorizableTree.size() << ".\n");
16035
16036 SmallPtrSet<Value *, 4> CheckedExtracts;
16037 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
16038 TreeEntry &TE = *VectorizableTree[I];
16039 // No need to count the cost for combined entries, they are combined and
16040 // just skip their cost.
16041 if (TE.State == TreeEntry::CombinedVectorize) {
16042 LLVM_DEBUG(
16043 dbgs() << "SLP: Skipping cost for combined node that starts with "
16044 << *TE.Scalars[0] << ".\n";
16045 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16046 continue;
16047 }
16048 if (TE.hasState() &&
16049 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16050 if (const TreeEntry *E =
16051 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16052 E && E->getVectorFactor() == TE.getVectorFactor()) {
16053 // Some gather nodes might be absolutely the same as some vectorizable
16054 // nodes after reordering, need to handle it.
16055 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16056 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16057 << "SLP: Current total cost = " << Cost << "\n");
16058 continue;
16059 }
16060 }
16061
16062 // Exclude cost of gather loads nodes which are not used. These nodes were
16063 // built as part of the final attempt to vectorize gathered loads.
16064 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16065 "Expected gather nodes with users only.");
16066
16067 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16068 Cost += C;
16069 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16070 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16071 << "SLP: Current total cost = " << Cost << "\n");
16072 }
16073
16074 if (Cost >= -SLPCostThreshold &&
16075 none_of(ExternalUses, [](const ExternalUser &EU) {
16076 return isa_and_nonnull<InsertElementInst>(EU.User);
16077 }))
16078 return Cost;
16079
16080 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16081 InstructionCost ExtractCost = 0;
16082 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16083 SmallVector<APInt> DemandedElts;
16084 SmallDenseSet<Value *, 4> UsedInserts;
16085 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
16086 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16087 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
16088 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16089 // Keep track {Scalar, Index, User} tuple.
16090 // On AArch64, this helps in fusing a mov instruction, associated with
16091 // extractelement, with fmul in the backend so that extractelement is free.
16092 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
16093 for (ExternalUser &EU : ExternalUses) {
16094 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16095 }
16096 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16097 for (ExternalUser &EU : ExternalUses) {
16098 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16099 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16100 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16101 else dbgs() << " User: nullptr\n");
16102 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16103
16104 // Uses by ephemeral values are free (because the ephemeral value will be
16105 // removed prior to code generation, and so the extraction will be
16106 // removed as well).
16107 if (EphValues.count(EU.User))
16108 continue;
16109
16110 // Check if the scalar for the given user or all users is accounted already.
16111 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16112 (EU.User &&
16113 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16114 continue;
16115
16116 // Used in unreachable blocks or in EH pads (rarely executed) or is
16117 // terminated with unreachable instruction.
16118 if (BasicBlock *UserParent =
16119 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16120 UserParent &&
16121 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16122 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16123 continue;
16124
16125 // We only add extract cost once for the same scalar.
16126 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16127 !ExtractCostCalculated.insert(EU.Scalar).second)
16128 continue;
16129
16130 // No extract cost for vector "scalar" if REVEC is disabled
16131 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16132 continue;
16133
16134 // If found user is an insertelement, do not calculate extract cost but try
16135 // to detect it as a final shuffled/identity match.
16136 // TODO: what if a user is insertvalue when REVEC is enabled?
16137 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16138 VU && VU->getOperand(1) == EU.Scalar) {
16139 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16140 if (!UsedInserts.insert(VU).second)
16141 continue;
16142 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16143 if (InsertIdx) {
16144 const TreeEntry *ScalarTE = &EU.E;
16145 auto *It = find_if(
16146 ShuffledInserts,
16147 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16148 // Checks if 2 insertelements are from the same buildvector.
16149 InsertElementInst *VecInsert = Data.InsertElements.front();
16150 return areTwoInsertFromSameBuildVector(
16151 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16152 Value *Op0 = II->getOperand(0);
16153 if (isVectorized(II) && !isVectorized(Op0))
16154 return nullptr;
16155 return Op0;
16156 });
16157 });
16158 int VecId = -1;
16159 if (It == ShuffledInserts.end()) {
16160 auto &Data = ShuffledInserts.emplace_back();
16161 Data.InsertElements.emplace_back(VU);
16162 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16163 VecId = ShuffledInserts.size() - 1;
16164 auto It = MinBWs.find(ScalarTE);
16165 if (It != MinBWs.end() &&
16166 VectorCasts
16167 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16168 .second) {
16169 unsigned BWSz = It->second.first;
16170 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16171 unsigned VecOpcode;
16172 if (DstBWSz < BWSz)
16173 VecOpcode = Instruction::Trunc;
16174 else
16175 VecOpcode =
16176 It->second.second ? Instruction::SExt : Instruction::ZExt;
16178 InstructionCost C = TTI->getCastInstrCost(
16179 VecOpcode, FTy,
16180 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16181 FTy->getNumElements()),
16182 TTI::CastContextHint::None, CostKind);
16183 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16184 << " for extending externally used vector with "
16185 "non-equal minimum bitwidth.\n");
16186 Cost += C;
16187 }
16188 } else {
16189 if (isFirstInsertElement(VU, It->InsertElements.front()))
16190 It->InsertElements.front() = VU;
16191 VecId = std::distance(ShuffledInserts.begin(), It);
16192 }
16193 int InIdx = *InsertIdx;
16194 SmallVectorImpl<int> &Mask =
16195 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16196 if (Mask.empty())
16197 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16198 Mask[InIdx] = EU.Lane;
16199 DemandedElts[VecId].setBit(InIdx);
16200 continue;
16201 }
16202 }
16203 }
16204
16206 // If we plan to rewrite the tree in a smaller type, we will need to sign
16207 // extend the extracted value back to the original type. Here, we account
16208 // for the extract and the added cost of the sign extend if needed.
16209 InstructionCost ExtraCost = TTI::TCC_Free;
16210 auto *ScalarTy = EU.Scalar->getType();
16211 const unsigned BundleWidth = EU.E.getVectorFactor();
16212 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16213 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16214 const TreeEntry *Entry = &EU.E;
16215 auto It = MinBWs.find(Entry);
16216 if (It != MinBWs.end()) {
16217 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16218 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16219 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16220 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16221 ? Instruction::ZExt
16222 : Instruction::SExt;
16223 VecTy = getWidenedType(MinTy, BundleWidth);
16224 ExtraCost =
16225 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16226 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16227 << ExtraCost << "\n");
16228 } else {
16229 ExtraCost =
16230 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16231 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16232 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16233 << *VecTy << ": " << ExtraCost << "\n");
16234 }
16235 // Leave the scalar instructions as is if they are cheaper than extracts.
16236 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16237 Entry->getOpcode() == Instruction::Load) {
16238 // Checks if the user of the external scalar is phi in loop body.
16239 auto IsPhiInLoop = [&](const ExternalUser &U) {
16240 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16241 auto *I = cast<Instruction>(U.Scalar);
16242 const Loop *L = LI->getLoopFor(Phi->getParent());
16243 return L && (Phi->getParent() == I->getParent() ||
16244 L == LI->getLoopFor(I->getParent()));
16245 }
16246 return false;
16247 };
16248 if (!ValueToExtUses) {
16249 ValueToExtUses.emplace();
16250 for (const auto &P : enumerate(ExternalUses)) {
16251 // Ignore phis in loops.
16252 if (IsPhiInLoop(P.value()))
16253 continue;
16254
16255 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16256 }
16257 }
16258 // Can use original instruction, if no operands vectorized or they are
16259 // marked as externally used already.
16260 auto *Inst = cast<Instruction>(EU.Scalar);
16261 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16262 auto OperandIsScalar = [&](Value *V) {
16263 if (!isVectorized(V)) {
16264 // Some extractelements might be not vectorized, but
16265 // transformed into shuffle and removed from the function,
16266 // consider it here.
16267 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16268 return !EE->hasOneUse() || !MustGather.contains(EE);
16269 return true;
16270 }
16271 return ValueToExtUses->contains(V);
16272 };
16273 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16274 bool CanBeUsedAsScalarCast = false;
16275 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16276 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16277 Op && all_of(Op->operands(), OperandIsScalar)) {
16278 InstructionCost OpCost =
16279 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16280 ? TTI->getInstructionCost(Op, CostKind)
16281 : 0;
16282 if (ScalarCost + OpCost <= ExtraCost) {
16283 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16284 ScalarCost += OpCost;
16285 }
16286 }
16287 }
16288 if (CanBeUsedAsScalar) {
16289 bool KeepScalar = ScalarCost <= ExtraCost;
16290 // Try to keep original scalar if the user is the phi node from the same
16291 // block as the root phis, currently vectorized. It allows to keep
16292 // better ordering info of PHIs, being vectorized currently.
16293 bool IsProfitablePHIUser =
16294 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16295 VectorizableTree.front()->Scalars.size() > 2)) &&
16296 VectorizableTree.front()->hasState() &&
16297 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16298 !Inst->hasNUsesOrMore(UsesLimit) &&
16299 none_of(Inst->users(),
16300 [&](User *U) {
16301 auto *PHIUser = dyn_cast<PHINode>(U);
16302 return (!PHIUser ||
16303 PHIUser->getParent() !=
16304 cast<Instruction>(
16305 VectorizableTree.front()->getMainOp())
16306 ->getParent()) &&
16307 !isVectorized(U);
16308 }) &&
16309 count_if(Entry->Scalars, [&](Value *V) {
16310 return ValueToExtUses->contains(V);
16311 }) <= 2;
16312 if (IsProfitablePHIUser) {
16313 KeepScalar = true;
16314 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16315 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16316 (!GatheredLoadsEntriesFirst.has_value() ||
16317 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16318 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16319 return ValueToExtUses->contains(V);
16320 });
16321 auto It = ExtractsCount.find(Entry);
16322 if (It != ExtractsCount.end()) {
16323 assert(ScalarUsesCount >= It->getSecond().size() &&
16324 "Expected total number of external uses not less than "
16325 "number of scalar uses.");
16326 ScalarUsesCount -= It->getSecond().size();
16327 }
16328 // Keep original scalar if number of externally used instructions in
16329 // the same entry is not a power of 2. It may help to do some extra
16330 // vectorization for now.
16331 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16332 }
16333 if (KeepScalar) {
16334 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16335 for (Value *V : Inst->operands()) {
16336 auto It = ValueToExtUses->find(V);
16337 if (It != ValueToExtUses->end()) {
16338 // Replace all uses to avoid compiler crash.
16339 ExternalUses[It->second].User = nullptr;
16340 }
16341 }
16342 ExtraCost = ScalarCost;
16343 if (!IsPhiInLoop(EU))
16344 ExtractsCount[Entry].insert(Inst);
16345 if (CanBeUsedAsScalarCast) {
16346 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16347 // Update the users of the operands of the cast operand to avoid
16348 // compiler crash.
16349 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16350 for (Value *V : IOp->operands()) {
16351 auto It = ValueToExtUses->find(V);
16352 if (It != ValueToExtUses->end()) {
16353 // Replace all uses to avoid compiler crash.
16354 ExternalUses[It->second].User = nullptr;
16355 }
16356 }
16357 }
16358 }
16359 }
16360 }
16361 }
16362
16363 ExtractCost += ExtraCost;
16364 }
16365 // Insert external uses for the operands of casts that are to be emitted as
16366 // scalars instead of extractelement instructions.
16367 for (Value *V : ScalarOpsFromCasts) {
16368 ExternalUsesAsOriginalScalar.insert(V);
16369 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16370 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16371 TEs.front()->findLaneForValue(V));
16372 }
16373 }
16374 // Add reduced value cost, if resized.
16375 if (!VectorizedVals.empty()) {
16376 const TreeEntry &Root = *VectorizableTree.front();
16377 auto BWIt = MinBWs.find(&Root);
16378 if (BWIt != MinBWs.end()) {
16379 Type *DstTy = Root.Scalars.front()->getType();
16380 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16381 unsigned SrcSz =
16382 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16383 if (OriginalSz != SrcSz) {
16384 unsigned Opcode = Instruction::Trunc;
16385 if (OriginalSz > SrcSz)
16386 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16387 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16388 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16389 assert(SLPReVec && "Only supported by REVEC.");
16390 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16391 }
16392 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16393 TTI::CastContextHint::None,
16394 TTI::TCK_RecipThroughput);
16395 }
16396 }
16397 }
16398
16399 Cost += ExtractCost;
16400 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16401 bool ForSingleMask) {
16402 InstructionCost C = 0;
16403 unsigned VF = Mask.size();
16404 unsigned VecVF = TE->getVectorFactor();
16405 bool HasLargeIndex =
16406 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16407 if ((VF != VecVF && HasLargeIndex) ||
16408 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16409
16410 if (HasLargeIndex) {
16411 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16412 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16413 OrigMask.begin());
16414 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16415 getWidenedType(TE->getMainOp()->getType(), VecVF),
16416 OrigMask);
16417 LLVM_DEBUG(
16418 dbgs() << "SLP: Adding cost " << C
16419 << " for final shuffle of insertelement external users.\n";
16420 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16421 Cost += C;
16422 return std::make_pair(TE, true);
16423 }
16424
16425 if (!ForSingleMask) {
16426 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16427 for (unsigned I = 0; I < VF; ++I) {
16428 if (Mask[I] != PoisonMaskElem)
16429 ResizeMask[Mask[I]] = Mask[I];
16430 }
16431 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16432 C = ::getShuffleCost(
16433 *TTI, TTI::SK_PermuteSingleSrc,
16434 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16435 LLVM_DEBUG(
16436 dbgs() << "SLP: Adding cost " << C
16437 << " for final shuffle of insertelement external users.\n";
16438 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16439
16440 Cost += C;
16441 }
16442 }
16443 return std::make_pair(TE, false);
16444 };
16445 // Calculate the cost of the reshuffled vectors, if any.
16446 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16447 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16448 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16449 unsigned VF = 0;
16450 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16451 ArrayRef<const TreeEntry *> TEs) {
16452 assert((TEs.size() == 1 || TEs.size() == 2) &&
16453 "Expected exactly 1 or 2 tree entries.");
16454 if (TEs.size() == 1) {
16455 if (VF == 0)
16456 VF = TEs.front()->getVectorFactor();
16457 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16458 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16459 !all_of(enumerate(Mask), [=](const auto &Data) {
16460 return Data.value() == PoisonMaskElem ||
16461 (Data.index() < VF &&
16462 static_cast<int>(Data.index()) == Data.value());
16463 })) {
16464 InstructionCost C =
16465 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16466 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16467 << " for final shuffle of insertelement "
16468 "external users.\n";
16469 TEs.front()->dump();
16470 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16471 Cost += C;
16472 }
16473 } else {
16474 if (VF == 0) {
16475 if (TEs.front() &&
16476 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16477 VF = TEs.front()->getVectorFactor();
16478 else
16479 VF = Mask.size();
16480 }
16481 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16482 InstructionCost C =
16483 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16484 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16485 << " for final shuffle of vector node and external "
16486 "insertelement users.\n";
16487 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16488 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16489 Cost += C;
16490 }
16491 VF = Mask.size();
16492 return TEs.back();
16493 };
16494 (void)performExtractsShuffleAction<const TreeEntry>(
16495 MutableArrayRef(Vector.data(), Vector.size()), Base,
16496 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16497 EstimateShufflesCost);
16498 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16499 cast<FixedVectorType>(
16500 ShuffledInserts[I].InsertElements.front()->getType()),
16501 DemandedElts[I],
16502 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16503 Cost -= InsertCost;
16504 }
16505
16506 // Add the cost for reduced value resize (if required).
16507 if (ReductionBitWidth != 0) {
16508 assert(UserIgnoreList && "Expected reduction tree.");
16509 const TreeEntry &E = *VectorizableTree.front();
16510 auto It = MinBWs.find(&E);
16511 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16512 unsigned SrcSize = It->second.first;
16513 unsigned DstSize = ReductionBitWidth;
16514 unsigned Opcode = Instruction::Trunc;
16515 if (SrcSize < DstSize) {
16516 bool IsArithmeticExtendedReduction =
16517 all_of(*UserIgnoreList, [](Value *V) {
16518 auto *I = cast<Instruction>(V);
16519 return is_contained({Instruction::Add, Instruction::FAdd,
16520 Instruction::Mul, Instruction::FMul,
16521 Instruction::And, Instruction::Or,
16522 Instruction::Xor},
16523 I->getOpcode());
16524 });
16525 if (IsArithmeticExtendedReduction)
16526 Opcode =
16527 Instruction::BitCast; // Handle it by getExtendedReductionCost
16528 else
16529 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16530 }
16531 if (Opcode != Instruction::BitCast) {
16532 auto *SrcVecTy =
16533 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16534 auto *DstVecTy =
16535 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16536 TTI::CastContextHint CCH = getCastContextHint(E);
16537 InstructionCost CastCost;
16538 switch (E.getOpcode()) {
16539 case Instruction::SExt:
16540 case Instruction::ZExt:
16541 case Instruction::Trunc: {
16542 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16543 CCH = getCastContextHint(*OpTE);
16544 break;
16545 }
16546 default:
16547 break;
16548 }
16549 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16550 TTI::TCK_RecipThroughput);
16551 Cost += CastCost;
16552 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16553 << " for final resize for reduction from " << SrcVecTy
16554 << " to " << DstVecTy << "\n";
16555 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16556 }
16557 }
16558 }
16559
16560 std::optional<InstructionCost> SpillCost;
16561 if (Cost < -SLPCostThreshold) {
16562 SpillCost = getSpillCost();
16563 Cost += *SpillCost;
16564 }
16565#ifndef NDEBUG
16566 SmallString<256> Str;
16567 {
16568 raw_svector_ostream OS(Str);
16569 OS << "SLP: Spill Cost = ";
16570 if (SpillCost)
16571 OS << *SpillCost;
16572 else
16573 OS << "<skipped>";
16574 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16575 << "SLP: Total Cost = " << Cost << ".\n";
16576 }
16577 LLVM_DEBUG(dbgs() << Str);
16578 if (ViewSLPTree)
16579 ViewGraph(this, "SLP" + F->getName(), false, Str);
16580#endif
16581
16582 return Cost;
16583}
16584
16585 /// Tries to find extractelement instructions with constant indices from a
16586 /// fixed vector type and gathers such instructions into a bunch, which is
16587 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
16588 /// attempt is successful, the matched scalars are replaced by poison values in
16589 /// \p VL for future analysis.
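/// For illustration only (a hypothetical gather, not taken from the sources):
/// given
/// \code
/// %e0 = extractelement <4 x i32> %v, i32 0
/// %e2 = extractelement <4 x i32> %v, i32 2
/// \endcode
/// in \p VL together with two undef scalars, the whole bunch could be
/// described by the single-source shuffle mask <0, 2, poison, poison>, and the
/// matched scalars would then be replaced by poison in \p VL.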
16590std::optional<TTI::ShuffleKind>
16591BoUpSLP::tryToGatherSingleRegisterExtractElements(
16592 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16593 // Scan list of gathered scalars for extractelements that can be represented
16594 // as shuffles.
16595 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16596 SmallVector<int> UndefVectorExtracts;
16597 for (int I = 0, E = VL.size(); I < E; ++I) {
16598 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16599 if (!EI) {
16600 if (isa<UndefValue>(VL[I]))
16601 UndefVectorExtracts.push_back(I);
16602 continue;
16603 }
16604 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16605 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16606 continue;
16607 std::optional<unsigned> Idx = getExtractIndex(EI);
16608 // Undefined index.
16609 if (!Idx) {
16610 UndefVectorExtracts.push_back(I);
16611 continue;
16612 }
16613 if (Idx >= VecTy->getNumElements()) {
16614 UndefVectorExtracts.push_back(I);
16615 continue;
16616 }
16617 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16618 ExtractMask.reset(*Idx);
16619 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16620 UndefVectorExtracts.push_back(I);
16621 continue;
16622 }
16623 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16624 }
16625 // Sort the vector operands by the maximum number of uses in extractelements.
16626 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16627 VectorOpToIdx.takeVector();
16628 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16629 return P1.second.size() > P2.second.size();
16630 });
16631 // Find the best pair of the vectors or a single vector.
16632 const int UndefSz = UndefVectorExtracts.size();
16633 unsigned SingleMax = 0;
16634 unsigned PairMax = 0;
16635 if (!Vectors.empty()) {
16636 SingleMax = Vectors.front().second.size() + UndefSz;
16637 if (Vectors.size() > 1) {
16638 auto *ItNext = std::next(Vectors.begin());
16639 PairMax = SingleMax + ItNext->second.size();
16640 }
16641 }
16642 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16643 return std::nullopt;
16644 // Check if better to perform a shuffle of 2 vectors or just of a single
16645 // vector.
16646 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16647 SmallVector<Value *> GatheredExtracts(
16648 VL.size(), PoisonValue::get(VL.front()->getType()));
16649 if (SingleMax >= PairMax && SingleMax) {
16650 for (int Idx : Vectors.front().second)
16651 std::swap(GatheredExtracts[Idx], VL[Idx]);
16652 } else if (!Vectors.empty()) {
16653 for (unsigned Idx : {0, 1})
16654 for (int Idx : Vectors[Idx].second)
16655 std::swap(GatheredExtracts[Idx], VL[Idx]);
16656 }
16657 // Add extracts from undefs too.
16658 for (int Idx : UndefVectorExtracts)
16659 std::swap(GatheredExtracts[Idx], VL[Idx]);
16660 // Check that gather of extractelements can be represented as just a
16661 // shuffle of a single/two vectors the scalars are extracted from.
16662 std::optional<TTI::ShuffleKind> Res =
16663 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16664 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16665 // TODO: try to check other subsets if possible.
16666 // Restore the original VL if attempt was not successful.
16667 copy(SavedVL, VL.begin());
16668 return std::nullopt;
16669 }
16670 // Restore unused scalars from mask, if some of the extractelements were not
16671 // selected for shuffle.
16672 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16673 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16674 isa<UndefValue>(GatheredExtracts[I])) {
16675 std::swap(VL[I], GatheredExtracts[I]);
16676 continue;
16677 }
16678 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16679 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16680 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16681 is_contained(UndefVectorExtracts, I))
16682 continue;
16683 }
16684 return Res;
16685}
16686
16687 /// Tries to find extractelement instructions with constant indices from a
16688 /// fixed vector type and gathers such instructions into a bunch, which is
16689 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. The list
16690 /// \p VL is processed in \p NumParts register-sized slices; the matched
16691 /// scalars are replaced by poison values in \p VL for future analysis.
16692 SmallVector<std::optional<TTI::ShuffleKind>>
16693 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16694 SmallVectorImpl<int> &Mask,
16695 unsigned NumParts) const {
16696 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16697 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16698 Mask.assign(VL.size(), PoisonMaskElem);
16699 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16700 for (unsigned Part : seq<unsigned>(NumParts)) {
16701 // Scan list of gathered scalars for extractelements that can be represented
16702 // as shuffles.
16703 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16704 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16705 SmallVector<int> SubMask;
16706 std::optional<TTI::ShuffleKind> Res =
16707 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16708 ShufflesRes[Part] = Res;
16709 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16710 }
16711 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16712 return Res.has_value();
16713 }))
16714 ShufflesRes.clear();
16715 return ShufflesRes;
16716}
16717
16718std::optional<TargetTransformInfo::ShuffleKind>
16719BoUpSLP::isGatherShuffledSingleRegisterEntry(
16720 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16721 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16722 Entries.clear();
16723 // TODO: currently checking only for Scalars in the tree entry, need to count
16724 // reused elements too for better cost estimation.
16725 auto GetUserEntry = [&](const TreeEntry *TE) {
16726 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16727 TE = TE->UserTreeIndex.UserTE;
16728 if (TE == VectorizableTree.front().get())
16729 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16730 return TE->UserTreeIndex;
16731 };
16732 auto HasGatherUser = [&](const TreeEntry *TE) {
16733 while (TE->Idx != 0 && TE->UserTreeIndex) {
16734 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16735 return true;
16736 TE = TE->UserTreeIndex.UserTE;
16737 }
16738 return false;
16739 };
16740 const EdgeInfo TEUseEI = GetUserEntry(TE);
16741 if (!TEUseEI)
16742 return std::nullopt;
16743 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16744 const BasicBlock *TEInsertBlock = nullptr;
16745 // Main node of PHI entries keeps the correct order of operands/incoming
16746 // blocks.
16747 if (auto *PHI = dyn_cast_or_null<PHINode>(
16748 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16749 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16750 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16751 TEInsertPt = TEInsertBlock->getTerminator();
16752 } else {
16753 TEInsertBlock = TEInsertPt->getParent();
16754 }
16755 if (!DT->isReachableFromEntry(TEInsertBlock))
16756 return std::nullopt;
16757 auto *NodeUI = DT->getNode(TEInsertBlock);
16758 assert(NodeUI && "Should only process reachable instructions");
16759 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16760 auto CheckOrdering = [&](const Instruction *InsertPt) {
16761 // Argument InsertPt is an instruction where vector code for some other
16762 // tree entry (one that shares one or more scalars with TE) is going to be
16763 // generated. This lambda returns true if insertion point of vector code
16764 // for the TE dominates that point (otherwise dependency is the other way
16765 // around). The other node is not limited to be of a gather kind. Gather
16766 // nodes are not scheduled and their vector code is inserted before their
16767 // first user. If user is PHI, that is supposed to be at the end of a
16768 // predecessor block. Otherwise it is the last instruction among scalars of
16769 // the user node. So, instead of checking dependency between instructions
16770 // themselves, we check dependency between their insertion points for vector
16771 // code (since each scalar instruction ends up as a lane of a vector
16772 // instruction).
16773 const BasicBlock *InsertBlock = InsertPt->getParent();
16774 auto *NodeEUI = DT->getNode(InsertBlock);
16775 if (!NodeEUI)
16776 return false;
16777 assert((NodeUI == NodeEUI) ==
16778 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16779 "Different nodes should have different DFS numbers");
16780 // Check the order of the gather nodes users.
16781 if (TEInsertPt->getParent() != InsertBlock &&
16782 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16783 return false;
16784 if (TEInsertPt->getParent() == InsertBlock &&
16785 TEInsertPt->comesBefore(InsertPt))
16786 return false;
16787 return true;
16788 };
16789 // Find all tree entries used by the gathered values. If no common entries
16790 // found - not a shuffle.
16791 // Here we build a set of tree nodes for each gathered value and try to
16792 // find the intersection between these sets. If we have at least one common
16793 // tree node for each gathered value - we have just a permutation of a
16794 // single vector. If we have 2 different sets, we are in a situation where
16795 // we have a permutation of 2 input vectors.
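// A hypothetical illustration (values a, b, c, d and entries T1, T2 are made
// up): if a and c are used only by tree entry T1, while b and d are used by
// both T1 and T2, the per-value sets are {T1}, {T1, T2}, {T1}, {T1, T2}.
// Every set intersects with {T1}, so the gather is just a permutation of
// T1's vector; two disjoint sets would instead mean a permutation of 2
// input vectors.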
16796 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16797 SmallDenseMap<Value *, int> UsedValuesEntry;
16798 SmallPtrSet<const Value *, 16> VisitedValue;
16799 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16800 // The node is reused - exit.
16801 if ((TEPtr->getVectorFactor() != VL.size() &&
16802 TEPtr->Scalars.size() != VL.size()) ||
16803 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16804 return false;
16805 UsedTEs.clear();
16806 UsedTEs.emplace_back().insert(TEPtr);
16807 for (Value *V : VL) {
16808 if (isConstant(V))
16809 continue;
16810 UsedValuesEntry.try_emplace(V, 0);
16811 }
16812 return true;
16813 };
16814 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16815 unsigned EdgeIdx) {
16816 const TreeEntry *Ptr1 = User1;
16817 const TreeEntry *Ptr2 = User2;
16818 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16819 while (Ptr2) {
16820 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16821 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16822 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16823 }
16824 while (Ptr1) {
16825 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16826 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16827 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16828 return Idx < It->second;
16829 }
16830 return false;
16831 };
16832 for (Value *V : VL) {
16833 if (isConstant(V) || !VisitedValue.insert(V).second)
16834 continue;
16835 // Build a list of tree entries where V is used.
16836 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16837 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16838 if (TEPtr == TE || TEPtr->Idx == 0)
16839 continue;
16840 assert(any_of(TEPtr->Scalars,
16841 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16842 "Must contain at least single gathered value.");
16843 assert(TEPtr->UserTreeIndex &&
16844 "Expected only single user of a gather node.");
16845 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16846
16847 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16848 UseEI.UserTE->hasState())
16849 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16850 : nullptr;
16851 Instruction *InsertPt =
16852 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16853 : &getLastInstructionInBundle(UseEI.UserTE);
16854 if (TEInsertPt == InsertPt) {
16855 // Check nodes, which might be emitted first.
16856 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16857 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16858 TEUseEI.UserTE->isAltShuffle()) &&
16859 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16860 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16861 (UseEI.UserTE->hasState() &&
16862 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16863 !UseEI.UserTE->isAltShuffle()) ||
16864 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16865 continue;
16866 }
16867
16868 // If the schedulable insertion point is used in multiple entries - just
16869 // exit, no known ordering at this point, available only after real
16870 // scheduling.
16871 if (!doesNotNeedToBeScheduled(InsertPt) &&
16872 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16873 continue;
16874 // If the users are the PHI nodes with the same incoming blocks - skip.
16875 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16876 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16877 UseEI.UserTE->State == TreeEntry::Vectorize &&
16878 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16879 TEUseEI.UserTE != UseEI.UserTE)
16880 continue;
16881 // If 2 gathers are operands of the same entry (regardless of whether
16882 // user is PHI or else), compare operands indices, use the earlier one
16883 // as the base.
16884 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16885 continue;
16886 // If the user instruction is used for some reason in different
16887 // vectorized nodes - make it depend on index.
16888 if (TEUseEI.UserTE != UseEI.UserTE &&
16889 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16890 HasGatherUser(TEUseEI.UserTE)))
16891 continue;
16892 // If the user node is the operand of the other user node - skip.
16893 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16894 continue;
16895 }
16896
16897 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16898 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16899 UseEI.UserTE->doesNotNeedToSchedule() &&
16900 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16901 continue;
16902 // Check if the user node of the TE comes after user node of TEPtr,
16903 // otherwise TEPtr depends on TE.
16904 if ((TEInsertBlock != InsertPt->getParent() ||
16905 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16906 !CheckOrdering(InsertPt))
16907 continue;
16908 // The node is reused - exit.
16909 if (CheckAndUseSameNode(TEPtr))
16910 break;
16911 VToTEs.insert(TEPtr);
16912 }
16913 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16914 const auto *It = find_if(
16915 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16916 if (It != VTEs.end()) {
16917 const TreeEntry *VTE = *It;
16918 if (none_of(TE->CombinedEntriesWithIndices,
16919 [&](const auto &P) { return P.first == VTE->Idx; })) {
16920 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16921 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16922 continue;
16923 }
16924 // The node is reused - exit.
16925 if (CheckAndUseSameNode(VTE))
16926 break;
16927 VToTEs.insert(VTE);
16928 }
16929 }
16930 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16931 const TreeEntry *VTE = VTEs.front();
16932 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16933 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16934 VTEs = VTEs.drop_front();
16935 // Iterate through all vectorized nodes.
16936 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16937 return MTE->State == TreeEntry::Vectorize;
16938 });
16939 if (MIt == VTEs.end())
16940 continue;
16941 VTE = *MIt;
16942 }
16943 if (none_of(TE->CombinedEntriesWithIndices,
16944 [&](const auto &P) { return P.first == VTE->Idx; })) {
16945 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16946 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16947 continue;
16948 }
16949 // The node is reused - exit.
16950 if (CheckAndUseSameNode(VTE))
16951 break;
16952 VToTEs.insert(VTE);
16953 }
16954 if (VToTEs.empty())
16955 continue;
16956 if (UsedTEs.empty()) {
16957 // The first iteration, just insert the list of nodes to vector.
16958 UsedTEs.push_back(VToTEs);
16959 UsedValuesEntry.try_emplace(V, 0);
16960 } else {
16961 // Need to check if there are any previously used tree nodes which use V.
16962 // If there are no such nodes, consider that we have one more input
16963 // vector.
16964 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16965 unsigned Idx = 0;
16966 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16967 // Do we have a non-empty intersection of previously listed tree entries
16968 // and tree entries using current V?
16969 set_intersect(VToTEs, Set);
16970 if (!VToTEs.empty()) {
16971 // Yes, write the new subset and continue analysis for the next
16972 // scalar.
16973 Set.swap(VToTEs);
16974 break;
16975 }
16976 VToTEs = SavedVToTEs;
16977 ++Idx;
16978 }
16979 // No non-empty intersection found - need to add a second set of possible
16980 // source vectors.
16981 if (Idx == UsedTEs.size()) {
16982 // If the number of input vectors is greater than 2 - not a permutation,
16983 // fallback to the regular gather.
16984 // TODO: support multiple reshuffled nodes.
16985 if (UsedTEs.size() == 2)
16986 continue;
16987 UsedTEs.push_back(SavedVToTEs);
16988 Idx = UsedTEs.size() - 1;
16989 }
16990 UsedValuesEntry.try_emplace(V, Idx);
16991 }
16992 }
16993
16994 if (UsedTEs.empty()) {
16995 Entries.clear();
16996 return std::nullopt;
16997 }
16998
16999 unsigned VF = 0;
17000 if (UsedTEs.size() == 1) {
17001 // Keep the order to avoid non-determinism.
17002 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17003 UsedTEs.front().end());
17004 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17005 return TE1->Idx < TE2->Idx;
17006 });
17007 // Try to find the perfect match in another gather node at first.
17008 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17009 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17010 });
17011 if (It != FirstEntries.end() &&
17012 ((*It)->getVectorFactor() == VL.size() ||
17013 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17014 TE->ReuseShuffleIndices.size() == VL.size() &&
17015 (*It)->isSame(TE->Scalars)))) {
17016 Entries.push_back(*It);
17017 if ((*It)->getVectorFactor() == VL.size()) {
17018 std::iota(std::next(Mask.begin(), Part * VL.size()),
17019 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17020 } else {
17021 SmallVector<int> CommonMask = TE->getCommonMask();
17022 copy(CommonMask, Mask.begin());
17023 }
17024 // Clear undef scalars.
17025 for (unsigned I : seq<unsigned>(VL.size()))
17026 if (isa<PoisonValue>(VL[I]))
17027 Mask[Part * VL.size() + I] = PoisonMaskElem;
17028 return TargetTransformInfo::SK_PermuteSingleSrc;
17029 }
17030 // No perfect match, just shuffle, so choose the first tree node from the
17031 // tree.
17032 Entries.push_back(FirstEntries.front());
17033 // Update mapping between values and corresponding tree entries.
17034 for (auto &P : UsedValuesEntry)
17035 P.second = 0;
17036 VF = FirstEntries.front()->getVectorFactor();
17037 } else {
17038 // Try to find nodes with the same vector factor.
17039 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17040 // Keep the order of tree nodes to avoid non-determinism.
17041 DenseMap<int, const TreeEntry *> VFToTE;
17042 for (const TreeEntry *TE : UsedTEs.front()) {
17043 unsigned VF = TE->getVectorFactor();
17044 auto It = VFToTE.find(VF);
17045 if (It != VFToTE.end()) {
17046 if (It->second->Idx > TE->Idx)
17047 It->getSecond() = TE;
17048 continue;
17049 }
17050 VFToTE.try_emplace(VF, TE);
17051 }
17052 // Same, keep the order to avoid non-determinism.
17053 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17054 UsedTEs.back().end());
17055 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17056 return TE1->Idx < TE2->Idx;
17057 });
17058 for (const TreeEntry *TE : SecondEntries) {
17059 auto It = VFToTE.find(TE->getVectorFactor());
17060 if (It != VFToTE.end()) {
17061 VF = It->first;
17062 Entries.push_back(It->second);
17063 Entries.push_back(TE);
17064 break;
17065 }
17066 }
17067 // No 2 source vectors with the same vector factor - just choose 2 with max
17068 // index.
17069 if (Entries.empty()) {
17070 Entries.push_back(*llvm::max_element(
17071 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17072 return TE1->Idx < TE2->Idx;
17073 }));
17074 Entries.push_back(SecondEntries.front());
17075 VF = std::max(Entries.front()->getVectorFactor(),
17076 Entries.back()->getVectorFactor());
17077 } else {
17078 VF = Entries.front()->getVectorFactor();
17079 }
17080 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17081 for (const TreeEntry *E : Entries)
17082 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17083 E->Scalars.end());
17084 // Update mapping between values and corresponding tree entries.
17085 for (auto &P : UsedValuesEntry) {
17086 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17087 if (ValuesToEntries[Idx].contains(P.first)) {
17088 P.second = Idx;
17089 break;
17090 }
17091 }
17092 }
17093
17094 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17095 // Checks if the 2 PHIs are compatible in terms of high possibility to be
17096 // vectorized.
17097 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17098 auto *PHI = cast<PHINode>(V);
17099 auto *PHI1 = cast<PHINode>(V1);
17100 // Check that all incoming values are compatible/from same parent (if they
17101 // are instructions).
17102 // The incoming values are compatible if they all are constants, or
17103 // instruction with the same/alternate opcodes from the same basic block.
17104 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17105 Value *In = PHI->getIncomingValue(I);
17106 Value *In1 = PHI1->getIncomingValue(I);
17107 if (isConstant(In) && isConstant(In1))
17108 continue;
17109 if (!getSameOpcode({In, In1}, *TLI))
17110 return false;
17111 if (cast<Instruction>(In)->getParent() !=
17112 cast<Instruction>(In1)->getParent())
17113 return false;
17114 }
17115 return true;
17116 };
17117 // Check if the value can be ignored during analysis for shuffled gathers.
17118 // We suppose it is better to ignore instructions which do not form splats,
17119 // are not vectorized/not extractelements (these instructions will be handled
17120 // by extractelements processing) or may form a vector node in the future.
17121 auto MightBeIgnored = [=](Value *V) {
17122 auto *I = dyn_cast<Instruction>(V);
17123 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17124 !isVectorLikeInstWithConstOps(I) &&
17125 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17126 };
17127 // Check that the neighbor instruction may form a full vector node with the
17128 // current instruction V. It is possible, if they have same/alternate opcode
17129 // and same parent basic block.
17130 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17131 Value *V1 = VL[Idx];
17132 bool UsedInSameVTE = false;
17133 auto It = UsedValuesEntry.find(V1);
17134 if (It != UsedValuesEntry.end())
17135 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17136 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17137 getSameOpcode({V, V1}, *TLI) &&
17138 cast<Instruction>(V)->getParent() ==
17139 cast<Instruction>(V1)->getParent() &&
17140 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17141 };
17142 // Build a shuffle mask for better cost estimation and vector emission.
17143 SmallBitVector UsedIdxs(Entries.size());
17144 SmallVector<std::pair<unsigned, int>> EntryLanes;
17145 for (int I = 0, E = VL.size(); I < E; ++I) {
17146 Value *V = VL[I];
17147 auto It = UsedValuesEntry.find(V);
17148 if (It == UsedValuesEntry.end())
17149 continue;
17150 // Do not try to shuffle scalars, if they are constants, or instructions
17151 // that can be vectorized as a result of the following vector build
17152 // vectorization.
17153 if (isConstant(V) || (MightBeIgnored(V) &&
17154 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17155 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17156 continue;
17157 unsigned Idx = It->second;
17158 EntryLanes.emplace_back(Idx, I);
17159 UsedIdxs.set(Idx);
17160 }
17161 // Iterate through all shuffled scalars and select entries, which can be used
17162 // for final shuffle.
17163 SmallVector<const TreeEntry *> TempEntries;
17164 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17165 if (!UsedIdxs.test(I))
17166 continue;
17167 // Fix the entry number for the given scalar. If it is the first entry, set
17168 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17169 // These indices are used when calculating final shuffle mask as the vector
17170 // offset.
17171 for (std::pair<unsigned, int> &Pair : EntryLanes)
17172 if (Pair.first == I)
17173 Pair.first = TempEntries.size();
17174 TempEntries.push_back(Entries[I]);
17175 }
17176 Entries.swap(TempEntries);
17177 if (EntryLanes.size() == Entries.size() &&
17178 !VL.equals(ArrayRef(TE->Scalars)
17179 .slice(Part * VL.size(),
17180 std::min<int>(VL.size(), TE->Scalars.size())))) {
17181 // We may have only 1 or 2 entries here. If the number of scalars is equal
17182 // to the number of entries, there is no need to do the analysis, it is not
17183 // very profitable. Since VL is not the same as TE->Scalars, it means we
17184 // already have some shuffles before. Cut off the non-profitable case.
17185 Entries.clear();
17186 return std::nullopt;
17187 }
17188 // Build the final mask, check for the identity shuffle, if possible.
17189 bool IsIdentity = Entries.size() == 1;
17190 // Pair.first is the offset to the vector, while Pair.second is the index of
17191 // scalar in the list.
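// A minimal worked example (illustrative numbers only): with VF == 4, a
// scalar found at lane 1 of the second selected entry (Pair.first == 1) gets
// the mask value 1 * 4 + 1 == 5, i.e. it is taken from the second input of
// the two-source shuffle.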
17192 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17193 unsigned Idx = Part * VL.size() + Pair.second;
17194 Mask[Idx] =
17195 Pair.first * VF +
17196 (ForOrder ? std::distance(
17197 Entries[Pair.first]->Scalars.begin(),
17198 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17199 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17200 IsIdentity &= Mask[Idx] == Pair.second;
17201 }
17202 if (ForOrder || IsIdentity || Entries.empty()) {
17203 switch (Entries.size()) {
17204 case 1:
17205 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17206 return TargetTransformInfo::SK_PermuteSingleSrc;
17207 break;
17208 case 2:
17209 if (EntryLanes.size() > 2 || VL.size() <= 2)
17210 return TargetTransformInfo::SK_PermuteTwoSrc;
17211 break;
17212 default:
17213 break;
17214 }
17215 } else if (!isa<VectorType>(VL.front()->getType()) &&
17216 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17217 // Do the cost estimation to check whether a shuffle is more beneficial than a buildvector.
17218 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17219 std::next(Mask.begin(), (Part + 1) * VL.size()));
17220 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17221 for (int Idx : SubMask) {
17222 if (Idx == PoisonMaskElem)
17223 continue;
17224 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17225 MinElement = Idx;
17226 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17227 MaxElement = Idx;
17228 }
17229 assert(MaxElement >= 0 && MinElement >= 0 &&
17230 MaxElement % VF >= MinElement % VF &&
17231 "Expected at least single element.");
17232 unsigned NewVF = std::max<unsigned>(
17233 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17234 (MaxElement % VF) -
17235 (MinElement % VF) + 1));
17236 if (NewVF < VF) {
17237 for (int &Idx : SubMask) {
17238 if (Idx == PoisonMaskElem)
17239 continue;
17240 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17241 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17242 }
17243 } else {
17244 NewVF = VF;
17245 }
17246
17247 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17248 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17249 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17250 auto GetShuffleCost = [&,
17251 &TTI = *TTI](ArrayRef<int> Mask,
17252 ArrayRef<const TreeEntry *> Entries,
17253 VectorType *VecTy) -> InstructionCost {
17254 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17255 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17256 Mask, Entries.front()->getInterleaveFactor()))
17257 return TTI::TCC_Free;
17258 return ::getShuffleCost(TTI,
17259 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17260 : TTI::SK_PermuteSingleSrc,
17261 VecTy, Mask, CostKind);
17262 };
17263 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17264 InstructionCost FirstShuffleCost = 0;
17265 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17266 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17267 FirstShuffleCost = ShuffleCost;
17268 } else {
17269 // Transform mask to include only first entry.
17270 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17271 bool IsIdentity = true;
17272 for (auto [I, Idx] : enumerate(FirstMask)) {
17273 if (Idx >= static_cast<int>(NewVF)) {
17274 Idx = PoisonMaskElem;
17275 } else {
17276 DemandedElts.clearBit(I);
17277 if (Idx != PoisonMaskElem)
17278 IsIdentity &= static_cast<int>(I) == Idx;
17279 }
17280 }
17281 if (!IsIdentity)
17282 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17283 FirstShuffleCost += getScalarizationOverhead(
17284 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17285 /*Extract=*/false, CostKind);
17286 }
17287 InstructionCost SecondShuffleCost = 0;
17288 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17289 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17290 SecondShuffleCost = ShuffleCost;
17291 } else {
17292 // Transform mask to include only the second entry.
17293 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17294 bool IsIdentity = true;
17295 for (auto [I, Idx] : enumerate(SecondMask)) {
17296 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17297 Idx = PoisonMaskElem;
17298 } else {
17299 DemandedElts.clearBit(I);
17300 if (Idx != PoisonMaskElem) {
17301 Idx -= NewVF;
17302 IsIdentity &= static_cast<int>(I) == Idx;
17303 }
17304 }
17305 }
17306 if (!IsIdentity)
17307 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17308 SecondShuffleCost += getScalarizationOverhead(
17309 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17310 /*Extract=*/false, CostKind);
17311 }
17312 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17313 for (auto [I, Idx] : enumerate(SubMask))
17314 if (Idx == PoisonMaskElem)
17315 DemandedElts.clearBit(I);
17316 InstructionCost BuildVectorCost = getScalarizationOverhead(
17317 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17318 /*Extract=*/false, CostKind);
17319 const TreeEntry *BestEntry = nullptr;
17320 if (FirstShuffleCost < ShuffleCost) {
17321 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17322 std::next(Mask.begin(), (Part + 1) * VL.size()),
17323 [&](int &Idx) {
17324 if (Idx >= static_cast<int>(VF))
17325 Idx = PoisonMaskElem;
17326 });
17327 BestEntry = Entries.front();
17328 ShuffleCost = FirstShuffleCost;
17329 }
17330 if (SecondShuffleCost < ShuffleCost) {
17331 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17332 std::next(Mask.begin(), (Part + 1) * VL.size()),
17333 [&](int &Idx) {
17334 if (Idx < static_cast<int>(VF))
17335 Idx = PoisonMaskElem;
17336 else
17337 Idx -= VF;
17338 });
17339 BestEntry = Entries[1];
17340 ShuffleCost = SecondShuffleCost;
17341 }
17342 if (BuildVectorCost >= ShuffleCost) {
17343 if (BestEntry) {
17344 Entries.clear();
17345 Entries.push_back(BestEntry);
17346 }
17347 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17348 : TargetTransformInfo::SK_PermuteSingleSrc;
17349 }
17350 }
17351 Entries.clear();
17352 // Clear the corresponding mask elements.
17353 std::fill(std::next(Mask.begin(), Part * VL.size()),
17354 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17355 return std::nullopt;
17356}
17357
17358 SmallVector<std::optional<TTI::ShuffleKind>>
17359 BoUpSLP::isGatherShuffledEntry(
17360 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17361 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17362 bool ForOrder) {
17363 assert(NumParts > 0 && NumParts < VL.size() &&
17364 "Expected positive number of registers.");
17365 Entries.clear();
17366 // No need to check for the topmost gather node.
17367 if (TE == VectorizableTree.front().get() &&
17368 (!GatheredLoadsEntriesFirst.has_value() ||
17369 none_of(ArrayRef(VectorizableTree).drop_front(),
17370 [](const std::unique_ptr<TreeEntry> &TE) {
17371 return !TE->isGather();
17372 })))
17373 return {};
17374 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17375 // implemented yet.
17376 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17377 return {};
17378 Mask.assign(VL.size(), PoisonMaskElem);
17379 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17380 "Expected only single user of the gather node.");
17381 assert(VL.size() % NumParts == 0 &&
17382 "Number of scalars must be divisible by NumParts.");
17383 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17384 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17385 (TE->Idx == 0 ||
17386 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17387 isSplat(TE->Scalars) ||
17388 (TE->hasState() &&
17389 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17390 return {};
17391 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17392 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17393 for (unsigned Part : seq<unsigned>(NumParts)) {
17394 ArrayRef<Value *> SubVL =
17395 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17396 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17397 std::optional<TTI::ShuffleKind> SubRes =
17398 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17399 ForOrder);
17400 if (!SubRes)
17401 SubEntries.clear();
17402 Res.push_back(SubRes);
17403 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17404 SubEntries.front()->getVectorFactor() == VL.size() &&
17405 (SubEntries.front()->isSame(TE->Scalars) ||
17406 SubEntries.front()->isSame(VL))) {
17407 SmallVector<const TreeEntry *> LocalSubEntries;
17408 LocalSubEntries.swap(SubEntries);
17409 Entries.clear();
17410 Res.clear();
17411 std::iota(Mask.begin(), Mask.end(), 0);
17412 // Clear undef scalars.
17413 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17414 if (isa<PoisonValue>(VL[I]))
17415 Mask[I] = PoisonMaskElem;
17416 Entries.emplace_back(1, LocalSubEntries.front());
17417 Res.push_back(TTI::SK_PermuteSingleSrc);
17418 return Res;
17419 }
17420 }
17421 if (all_of(Res,
17422 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17423 Entries.clear();
17424 return {};
17425 }
17426 return Res;
17427}
17428
17429InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17430 Type *ScalarTy) const {
17431 const unsigned VF = VL.size();
17432 auto *VecTy = getWidenedType(ScalarTy, VF);
17433 // Find the cost of inserting/extracting values from the vector.
17434 // Check if the same elements are inserted several times and count them as
17435 // shuffle candidates.
17436 APInt DemandedElements = APInt::getZero(VF);
17437 InstructionCost Cost;
17438 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17439 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17440 DemandedElements.setBit(I);
17441 if (V->getType() != ScalarTy)
17442 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17443 TTI::CastContextHint::None, CostKind);
17444 };
17445 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17446 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17447 for (auto [I, V] : enumerate(VL)) {
17448 // No need to shuffle duplicates for constants.
17449 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17450 continue;
17451
17452 if (isConstant(V)) {
17453 ConstantShuffleMask[I] = I + VF;
17454 continue;
17455 }
17456 EstimateInsertCost(I, V);
17457 }
17458 // FIXME: add a cost for constant vector materialization.
17459 bool IsAnyNonUndefConst =
17460 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17461 // 1. Shuffle input source vector and constant vector.
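// A hypothetical illustration: for VL = {%x, 5, %y, 7} with VF == 4, the
// constant lanes are redirected to the second source, giving
// ConstantShuffleMask == <0, 5, 2, 7>, i.e. a blend of the inserted
// non-constant lanes with a materialized constant vector.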
17462 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17463 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
17464 ConstantShuffleMask);
17465 }
17466
17467 // 2. Insert unique non-constants.
17468 if (!DemandedElements.isZero())
17469 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17470 /*Insert=*/true,
17471 /*Extract=*/false, CostKind,
17472 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17473 return Cost;
17474}
17475
17476Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17477 auto It = EntryToLastInstruction.find(E);
17478 if (It != EntryToLastInstruction.end())
17479 return *cast<Instruction>(It->second);
17480 Instruction *Res = nullptr;
17481 // Get the basic block this bundle is in. All instructions in the bundle
17482 // should be in this block (except for extractelement-like instructions with
17483 // constant indices or gathered loads or copyables).
17484 Instruction *Front;
17485 unsigned Opcode;
17486 if (E->hasState()) {
17487 Front = E->getMainOp();
17488 Opcode = E->getOpcode();
17489 } else {
17490 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17491 Opcode = Front->getOpcode();
17492 }
17493 auto *BB = Front->getParent();
17494 assert(
17495 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17496 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17497 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17498 all_of(E->Scalars,
17499 [=](Value *V) -> bool {
17500 if (Opcode == Instruction::GetElementPtr &&
17501 !isa<GetElementPtrInst>(V))
17502 return true;
17503 auto *I = dyn_cast<Instruction>(V);
17504 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17505 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17506 })) &&
17507 "Expected gathered loads or GEPs or instructions from same basic "
17508 "block.");
17509
17510 auto FindLastInst = [&]() {
17511 Instruction *LastInst = Front;
17512 for (Value *V : E->Scalars) {
17513 auto *I = dyn_cast<Instruction>(V);
17514 if (!I)
17515 continue;
17516 if (E->isCopyableElement(I))
17517 continue;
17518 if (LastInst->getParent() == I->getParent()) {
17519 if (LastInst->comesBefore(I))
17520 LastInst = I;
17521 continue;
17522 }
17523 assert(((Opcode == Instruction::GetElementPtr &&
17524 !isa<GetElementPtrInst>(I)) ||
17525 E->State == TreeEntry::SplitVectorize ||
17526 (isVectorLikeInstWithConstOps(LastInst) &&
17527 isVectorLikeInstWithConstOps(I)) ||
17528 (GatheredLoadsEntriesFirst.has_value() &&
17529 Opcode == Instruction::Load && E->isGather() &&
17530 E->Idx < *GatheredLoadsEntriesFirst)) &&
17531 "Expected vector-like or non-GEP in GEP node insts only.");
17532 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17533 LastInst = I;
17534 continue;
17535 }
17536 if (!DT->isReachableFromEntry(I->getParent()))
17537 continue;
17538 auto *NodeA = DT->getNode(LastInst->getParent());
17539 auto *NodeB = DT->getNode(I->getParent());
17540 assert(NodeA && "Should only process reachable instructions");
17541 assert(NodeB && "Should only process reachable instructions");
17542 assert((NodeA == NodeB) ==
17543 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17544 "Different nodes should have different DFS numbers");
17545 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17546 LastInst = I;
17547 }
17548 BB = LastInst->getParent();
17549 return LastInst;
17550 };
17551
17552 auto FindFirstInst = [&]() {
17553 Instruction *FirstInst = Front;
17554 for (Value *V : E->Scalars) {
17555 auto *I = dyn_cast<Instruction>(V);
17556 if (!I)
17557 continue;
17558 if (E->isCopyableElement(I))
17559 continue;
17560 if (FirstInst->getParent() == I->getParent()) {
17561 if (I->comesBefore(FirstInst))
17562 FirstInst = I;
17563 continue;
17564 }
17565 assert(((Opcode == Instruction::GetElementPtr &&
17566 !isa<GetElementPtrInst>(I)) ||
17567 (isVectorLikeInstWithConstOps(FirstInst) &&
17568 isVectorLikeInstWithConstOps(I))) &&
17569 "Expected vector-like or non-GEP in GEP node insts only.");
17570 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17571 FirstInst = I;
17572 continue;
17573 }
17574 if (!DT->isReachableFromEntry(I->getParent()))
17575 continue;
17576 auto *NodeA = DT->getNode(FirstInst->getParent());
17577 auto *NodeB = DT->getNode(I->getParent());
17578 assert(NodeA && "Should only process reachable instructions");
17579 assert(NodeB && "Should only process reachable instructions");
17580 assert((NodeA == NodeB) ==
17581 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17582 "Different nodes should have different DFS numbers");
17583 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17584 FirstInst = I;
17585 }
17586 return FirstInst;
17587 };
17588
17589 if (E->State == TreeEntry::SplitVectorize) {
17590 Res = FindLastInst();
17591 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17592 for (auto *E : Entries) {
17593 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17594 if (!I)
17595 I = &getLastInstructionInBundle(E);
17596 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17597 Res = I;
17598 }
17599 }
17600 EntryToLastInstruction.try_emplace(E, Res);
17601 return *Res;
17602 }
17603
17604 // Set insertpoint for gathered loads to the very first load.
17605 if (GatheredLoadsEntriesFirst.has_value() &&
17606 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17607 Opcode == Instruction::Load) {
17608 Res = FindFirstInst();
17609 EntryToLastInstruction.try_emplace(E, Res);
17610 return *Res;
17611 }
17612
17613 // Set the insert point to the beginning of the basic block if the entry
17614 // should not be scheduled.
17615 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17616 if (E->isGather())
17617 return nullptr;
17618 // It was previously found that the instructions do not need to be scheduled.
17619 const auto *It = BlocksSchedules.find(BB);
17620 if (It == BlocksSchedules.end())
17621 return nullptr;
17622 for (Value *V : E->Scalars) {
17623 auto *I = dyn_cast<Instruction>(V);
17624 if (!I || isa<PHINode>(I) ||
17625 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17626 continue;
17627 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17628 if (Bundles.empty())
17629 continue;
17630 const auto *It = find_if(
17631 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17632 if (It != Bundles.end())
17633 return *It;
17634 }
17635 return nullptr;
17636 };
17637 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17638 if (!E->isGather() && !Bundle) {
17639 if ((Opcode == Instruction::GetElementPtr &&
17640 any_of(E->Scalars,
17641 [](Value *V) {
17642 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17643 })) ||
17644 all_of(E->Scalars, [&](Value *V) {
17645 return isa<PoisonValue>(V) ||
17646 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17647 E->isCopyableElement(V) ||
17648 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17649 }))
17650 Res = FindLastInst();
17651 else
17652 Res = FindFirstInst();
17653 EntryToLastInstruction.try_emplace(E, Res);
17654 return *Res;
17655 }
17656
17657 // Find the last instruction. The common case should be that BB has been
17658 // scheduled, and the last instruction is VL.back(). So we start with
17659 // VL.back() and iterate over schedule data until we reach the end of the
17660 // bundle. The end of the bundle is marked by null ScheduleData.
17661 if (Bundle) {
17662 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17663 Res = Bundle->getBundle().back()->getInst();
17664 EntryToLastInstruction.try_emplace(E, Res);
17665 return *Res;
17666 }
17667
17668 // LastInst can still be null at this point if there's either not an entry
17669 // for BB in BlocksSchedules or there's no ScheduleData available for
17670 // VL.back(). This can be the case if buildTreeRec aborts for various
17671 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17672 // size is reached, etc.). ScheduleData is initialized in the scheduling
17673 // "dry-run".
17674 //
17675 // If this happens, we can still find the last instruction by brute force. We
17676 // iterate forwards from Front (inclusive) until we either see all
17677 // instructions in the bundle or reach the end of the block. If Front is the
17678 // last instruction in program order, LastInst will be set to Front, and we
17679 // will visit all the remaining instructions in the block.
17680 //
17681 // One of the reasons we exit early from buildTreeRec is to place an upper
17682 // bound on compile-time. Thus, taking an additional compile-time hit here is
17683 // not ideal. However, this should be exceedingly rare since it requires that
17684 // we both exit early from buildTreeRec and that the bundle be out-of-order
17685 // (causing us to iterate all the way to the end of the block).
17686 if (!Res)
17687 Res = FindLastInst();
17688 assert(Res && "Failed to find last instruction in bundle");
17689 EntryToLastInstruction.try_emplace(E, Res);
17690 return *Res;
17691}
17692
17693void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17694 auto *Front = E->getMainOp();
17695 Instruction *LastInst = &getLastInstructionInBundle(E);
17696 assert(LastInst && "Failed to find last instruction in bundle");
17697 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17698 // If the instruction is PHI, set the insert point after all the PHIs.
17699 bool IsPHI = isa<PHINode>(LastInst);
17700 if (IsPHI) {
17701 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17702 if (LastInstIt != LastInst->getParent()->end() &&
17703 LastInstIt->getParent()->isLandingPad())
17704 LastInstIt = std::next(LastInstIt);
17705 }
17706 if (IsPHI ||
17707 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17708 (E->doesNotNeedToSchedule() ||
17709 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
17710 isUsedOutsideBlock(LastInst)))) ||
17711 (GatheredLoadsEntriesFirst.has_value() &&
17712 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17713 E->getOpcode() == Instruction::Load)) {
17714 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17715 } else {
17716 // Set the insertion point after the last instruction in the bundle. Set the
17717 // debug location to Front.
17718 Builder.SetInsertPoint(
17719 LastInst->getParent(),
17720 LastInst->getNextNode()->getIterator());
17721 }
17722 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17723}
17724
17725Value *BoUpSLP::gather(
17726 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17727 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17728 // List of instructions/lanes from the current block and/or the blocks which
17729 // are part of the current loop. These instructions will be inserted at the
17730 // end to make it possible to optimize loops and hoist invariant instructions
17731 // out of the loop body with better chances for success.
17732 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17733 SmallSet<int, 4> PostponedIndices;
17734 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17735 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17736 SmallPtrSet<BasicBlock *, 4> Visited;
17737 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17738 InsertBB = InsertBB->getSinglePredecessor();
17739 return InsertBB && InsertBB == InstBB;
17740 };
17741 for (int I = 0, E = VL.size(); I < E; ++I) {
17742 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17743 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17744 isVectorized(Inst) ||
17745 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17746 PostponedIndices.insert(I).second)
17747 PostponedInsts.emplace_back(Inst, I);
17748 }
17749
17750 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17751 Type *Ty) {
17752 Value *Scalar = V;
17753 if (Scalar->getType() != Ty) {
17754 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17755 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17756 Value *V = Scalar;
17757 if (auto *CI = dyn_cast<CastInst>(Scalar);
17758 isa_and_present<SExtInst, ZExtInst>(CI)) {
17759 Value *Op = CI->getOperand(0);
17760 if (auto *IOp = dyn_cast<Instruction>(Op);
17761 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17762 V = Op;
17763 }
17764 Scalar = Builder.CreateIntCast(
17765 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17766 }
17767
17768 Instruction *InsElt;
17769 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17770 assert(SLPReVec && "FixedVectorType is not expected.");
17771 Vec =
17772 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17773 auto *II = dyn_cast<Instruction>(Vec);
17774 if (!II)
17775 return Vec;
17776 InsElt = II;
17777 } else {
17778 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17779 InsElt = dyn_cast<InsertElementInst>(Vec);
17780 if (!InsElt)
17781 return Vec;
17782 }
17783 GatherShuffleExtractSeq.insert(InsElt);
17784 CSEBlocks.insert(InsElt->getParent());
17785 // Add to our 'need-to-extract' list.
17786 if (isa<Instruction>(V)) {
17787 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17788 // Find which lane we need to extract.
17789 User *UserOp = nullptr;
17790 if (Scalar != V) {
17791 if (auto *SI = dyn_cast<Instruction>(Scalar))
17792 UserOp = SI;
17793 } else {
17794 if (V->getType()->isVectorTy()) {
17795 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17796 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17797 // Find shufflevector, caused by resize.
17798 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17799 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17800 if (SV->getOperand(0) == V)
17801 return SV;
17802 if (SV->getOperand(1) == V)
17803 return SV;
17804 }
17805 return nullptr;
17806 };
17807 InsElt = nullptr;
17808 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17809 InsElt = User;
17810 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17811 InsElt = User;
17812 assert(InsElt &&
17813 "Failed to find shufflevector, caused by resize.");
17814 }
17815 }
17816 UserOp = InsElt;
17817 }
17818 if (UserOp) {
17819 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17820 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17821 }
17822 }
17823 }
17824 return Vec;
17825 };
17826 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17827 Value *Vec = PoisonValue::get(VecTy);
17828 SmallVector<int> NonConsts;
17829 SmallVector<int> Mask(VL.size());
17830 std::iota(Mask.begin(), Mask.end(), 0);
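  // If the root is itself a single-source shufflevector over a vector of the
  // expected type, look through it: take its source as the new Root and start
  // from its shuffle mask, so the final blend needs only one shuffle.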
17831 Value *OriginalRoot = Root;
17832 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17833 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17834 SV->getOperand(0)->getType() == VecTy) {
17835 Root = SV->getOperand(0);
17836 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17837 }
17838 // Insert the constant values first.
17839 for (int I = 0, E = VL.size(); I < E; ++I) {
17840 if (PostponedIndices.contains(I))
17841 continue;
17842 if (!isConstant(VL[I])) {
17843 NonConsts.push_back(I);
17844 continue;
17845 }
17846 if (isa<PoisonValue>(VL[I]))
17847 continue;
17848 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17849 Mask[I] = I + E;
17850 }
17851 if (Root) {
17852 if (isa<PoisonValue>(Vec)) {
17853 Vec = OriginalRoot;
17854 } else {
17855 Vec = CreateShuffle(Root, Vec, Mask);
17856 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17857 OI && OI->use_empty() &&
17858 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17859 return TE->VectorizedValue == OI;
17860 }))
17861 eraseInstruction(OI);
17862 }
17863 }
17864 // Insert non-constant values.
17865 for (int I : NonConsts)
17866 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17867 // Append instructions, which are/may be part of the loop, at the end to make
17868 // it possible to hoist non-loop-based instructions.
17869 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17870 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17871
17872 return Vec;
17873}
17874
17875/// Merges shuffle masks and emits final shuffle instruction, if required. It
17876/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
17877/// the actual shuffle instruction is generated only if it is actually
17878/// required. Otherwise, the shuffle instruction emission is delayed till the
17879/// end of the process, to reduce the number of emitted instructions and ease
17880/// further analysis/transformations.
17881/// The class will also look through the previously emitted shuffle instructions
17882/// and properly mark indices in the mask as undef.
17883/// For example, given the code
17884/// \code
17885/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17886/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17887/// \endcode
17888/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17889/// look through %s1 and %s2 and emit
17890/// \code
17891/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17892/// \endcode
17893/// instead.
17894/// If 2 operands are of different size, the smallest one will be resized and
17895/// the mask recalculated properly.
17896/// For example, given the code
17897/// \code
17898/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17899/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17900/// \endcode
17901/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17902/// look through %s1 and %s2 and emit
17903/// \code
17904/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17905/// \endcode
17906/// instead.
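/// A minimal usage sketch (simplified; real callers also pass tree entries,
/// reuse masks and subvector lists):
/// \code
///   ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, R);
///   ShuffleBuilder.add(V, Mask);   // queue an operand and its mask
///   Value *Res = ShuffleBuilder.finalize({}, {}, {});
/// \endcode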
17907class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17908 bool IsFinalized = false;
17909 /// Combined mask for all applied operands and masks. It is built during
17910 /// analysis and actual emission of shuffle vector instructions.
17911 SmallVector<int> CommonMask;
17912 /// List of operands for the shuffle vector instruction. It holds at most 2
17913 /// operands; if a 3rd one is going to be added, the first 2 are combined into
17914 /// a shuffle with the \p CommonMask mask, the first operand is set to the
17915 /// resulting shuffle and the second operand is set to the newly added
17916 /// operand. The \p CommonMask is transformed in the proper way after that.
17917 SmallVector<Value *, 2> InVectors;
17918 IRBuilderBase &Builder;
17919 BoUpSLP &R;
17920
17921 class ShuffleIRBuilder {
17922 IRBuilderBase &Builder;
17923 /// Holds all of the instructions that we gathered.
17924 SetVector<Instruction *> &GatherShuffleExtractSeq;
17925 /// A list of blocks that we are going to CSE.
17926 DenseSet<BasicBlock *> &CSEBlocks;
17927 /// Data layout.
17928 const DataLayout &DL;
17929
17930 public:
17931 ShuffleIRBuilder(IRBuilderBase &Builder,
17932 SetVector<Instruction *> &GatherShuffleExtractSeq,
17933 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17934 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17935 CSEBlocks(CSEBlocks), DL(DL) {}
17936 ~ShuffleIRBuilder() = default;
17937 /// Creates shufflevector for the 2 operands with the given mask.
17938 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17939 if (V1->getType() != V2->getType()) {
17940 assert(V1->getType()->isIntOrIntVectorTy() &&
17941 V2->getType()->isIntOrIntVectorTy() &&
17942 "Expected integer vector types only.");
17943 if (V1->getType() != V2->getType()) {
17944 if (cast<VectorType>(V2->getType())
17945 ->getElementType()
17946 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17947 ->getElementType()
17948 ->getIntegerBitWidth())
17949 V2 = Builder.CreateIntCast(
17950 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17951 else
17952 V1 = Builder.CreateIntCast(
17953 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17954 }
17955 }
17956 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17957 if (auto *I = dyn_cast<Instruction>(Vec)) {
17958 GatherShuffleExtractSeq.insert(I);
17959 CSEBlocks.insert(I->getParent());
17960 }
17961 return Vec;
17962 }
17963 /// Creates a permutation of the single vector operand with the given mask,
17964 /// if it is not an identity mask.
17965 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17966 if (Mask.empty())
17967 return V1;
17968 unsigned VF = Mask.size();
17969 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17970 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17971 return V1;
17972 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17973 if (auto *I = dyn_cast<Instruction>(Vec)) {
17974 GatherShuffleExtractSeq.insert(I);
17975 CSEBlocks.insert(I->getParent());
17976 }
17977 return Vec;
17978 }
17979 Value *createIdentity(Value *V) { return V; }
17980 Value *createPoison(Type *Ty, unsigned VF) {
17981 return PoisonValue::get(getWidenedType(Ty, VF));
17982 }
17983 /// Resizes 2 input vectors to match their sizes, if they are not equal
17984 /// yet. The smallest vector is resized to the size of the larger vector.
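    /// For example, shuffling a <2 x i32> operand against a <4 x i32> one
    /// first widens the former roughly as
    /// \code
    ///   %wide = shufflevector <2 x i32> %small, <2 x i32> poison,
    ///                         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
    /// \endcode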
17985 void resizeToMatch(Value *&V1, Value *&V2) {
17986 if (V1->getType() == V2->getType())
17987 return;
17988 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17989 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17990 int VF = std::max(V1VF, V2VF);
17991 int MinVF = std::min(V1VF, V2VF);
17992 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17993 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17994 0);
17995 Value *&Op = MinVF == V1VF ? V1 : V2;
17996 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17997 if (auto *I = dyn_cast<Instruction>(Op)) {
17998 GatherShuffleExtractSeq.insert(I);
17999 CSEBlocks.insert(I->getParent());
18000 }
18001 if (MinVF == V1VF)
18002 V1 = Op;
18003 else
18004 V2 = Op;
18005 }
18006 };
18007
18008 /// Smart shuffle instruction emission, walks through shuffle trees and
18009 /// tries to find the best matching vector for the actual shuffle
18010 /// instruction.
18011 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18012 assert(V1 && "Expected at least one vector value.");
18013 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18014 R.CSEBlocks, *R.DL);
18015 return BaseShuffleAnalysis::createShuffle<Value *>(
18016 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18017 }
18018
18019 /// Cast value \p V to the vector type with the same number of elements, but
18020 /// the base type \p ScalarTy.
18021 Value *castToScalarTyElem(Value *V,
18022 std::optional<bool> IsSigned = std::nullopt) {
18023 auto *VecTy = cast<VectorType>(V->getType());
18024 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18025 if (VecTy->getElementType() == ScalarTy->getScalarType())
18026 return V;
18027 return Builder.CreateIntCast(
18028 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18029 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18030 }
18031
18032 Value *getVectorizedValue(const TreeEntry &E) {
18033 Value *Vec = E.VectorizedValue;
18034 if (!Vec->getType()->isIntOrIntVectorTy())
18035 return Vec;
18036 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18037 return !isa<PoisonValue>(V) &&
18038 !isKnownNonNegative(
18039 V, SimplifyQuery(*R.DL));
18040 }));
18041 }
18042
18043public:
18044 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18045 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18046
18047 /// Adjusts extractelements after reusing them.
18048 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18049 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18050 unsigned NumParts, bool &UseVecBaseAsInput) {
18051 UseVecBaseAsInput = false;
18052 SmallPtrSet<Value *, 4> UniqueBases;
18053 Value *VecBase = nullptr;
18054 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18055 if (!E->ReorderIndices.empty()) {
18056 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18057 E->ReorderIndices.end());
18058 reorderScalars(VL, ReorderMask);
18059 }
18060 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18061 int Idx = Mask[I];
18062 if (Idx == PoisonMaskElem)
18063 continue;
18064 auto *EI = cast<ExtractElementInst>(VL[I]);
18065 VecBase = EI->getVectorOperand();
18066 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18067 VecBase = TEs.front()->VectorizedValue;
18068 assert(VecBase && "Expected vectorized value.");
18069 UniqueBases.insert(VecBase);
18070 // If the only use is vectorized, we can delete the extractelement
18071 // itself.
18072 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18073 (NumParts != 1 && count(VL, EI) > 1) ||
18074 any_of(EI->users(), [&](User *U) {
18075 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18076 return UTEs.empty() || UTEs.size() > 1 ||
18077 (isa<GetElementPtrInst>(U) &&
18078 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18079 (!UTEs.empty() &&
18080 count_if(R.VectorizableTree,
18081 [&](const std::unique_ptr<TreeEntry> &TE) {
18082 return TE->UserTreeIndex.UserTE ==
18083 UTEs.front() &&
18084 is_contained(VL, EI);
18085 }) != 1);
18086 }))
18087 continue;
18088 R.eraseInstruction(EI);
18089 }
18090 if (NumParts == 1 || UniqueBases.size() == 1) {
18091 assert(VecBase && "Expected vectorized value.");
18092 return castToScalarTyElem(VecBase);
18093 }
18094 UseVecBaseAsInput = true;
18095 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18096 for (auto [I, Idx] : enumerate(Mask))
18097 if (Idx != PoisonMaskElem)
18098 Idx = I;
18099 };
18100 // Perform a multi-register vector shuffle, joining the parts into a single
18101 // virtual long vector.
18102 // We need to shuffle each part independently and then insert all these parts
18103 // into a long virtual vector register, forming the original vector.
18104 Value *Vec = nullptr;
18105 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18106 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18107 for (unsigned Part : seq<unsigned>(NumParts)) {
18108 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18109 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18110 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18111 constexpr int MaxBases = 2;
18112 SmallVector<Value *, MaxBases> Bases(MaxBases);
18113 auto VLMask = zip(SubVL, SubMask);
18114 const unsigned VF = std::accumulate(
18115 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18116 if (std::get<1>(D) == PoisonMaskElem)
18117 return S;
18118 Value *VecOp =
18119 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18120 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18121 !TEs.empty())
18122 VecOp = TEs.front()->VectorizedValue;
18123 assert(VecOp && "Expected vectorized value.");
18124 const unsigned Size =
18125 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18126 return std::max(S, Size);
18127 });
18128 for (const auto [V, I] : VLMask) {
18129 if (I == PoisonMaskElem)
18130 continue;
18131 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18132 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18133 VecOp = TEs.front()->VectorizedValue;
18134 assert(VecOp && "Expected vectorized value.");
18135 VecOp = castToScalarTyElem(VecOp);
18136 Bases[I / VF] = VecOp;
18137 }
18138 if (!Bases.front())
18139 continue;
18140 Value *SubVec;
18141 if (Bases.back()) {
18142 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18143 TransformToIdentity(SubMask);
18144 } else {
18145 SubVec = Bases.front();
18146 }
18147 if (!Vec) {
18148 Vec = SubVec;
18149 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18150 [&](unsigned P) {
18151 ArrayRef<int> SubMask =
18152 Mask.slice(P * SliceSize,
18153 getNumElems(Mask.size(),
18154 SliceSize, P));
18155 return all_of(SubMask, [](int Idx) {
18156 return Idx == PoisonMaskElem;
18157 });
18158 })) &&
18159 "Expected first part or all previous parts masked.");
18160 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18161 } else {
18162 unsigned NewVF =
18163 cast<FixedVectorType>(Vec->getType())->getNumElements();
18164 if (Vec->getType() != SubVec->getType()) {
18165 unsigned SubVecVF =
18166 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18167 NewVF = std::max(NewVF, SubVecVF);
18168 }
18169 // Adjust SubMask.
18170 for (int &Idx : SubMask)
18171 if (Idx != PoisonMaskElem)
18172 Idx += NewVF;
18173 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18174 Vec = createShuffle(Vec, SubVec, VecMask);
18175 TransformToIdentity(VecMask);
18176 }
18177 }
18178 copy(VecMask, Mask.begin());
18179 return Vec;
18180 }
18181 /// Checks if the specified entry \p E needs to be delayed because of its
18182 /// dependency nodes.
18183 std::optional<Value *>
18184 needToDelay(const TreeEntry *E,
18185 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18186 // No need to delay emission if all deps are ready.
18187 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18188 return all_of(
18189 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18190 }))
18191 return std::nullopt;
18192 // Postpone gather emission; it will be emitted after the end of the
18193 // process to keep the correct order.
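      // The aligned load of a poison pointer created below is only a typed
      // placeholder: it has the right vector type so users can be built now,
      // and it is replaced once the postponed gather is actually emitted
      // (see PostponedGathers in processBuildVector).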
18194 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18195 return Builder.CreateAlignedLoad(
18196 ResVecTy,
18197 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18198 MaybeAlign());
18199 }
18200 /// Reset the builder to handle perfect diamond match.
18201 void resetForSameNode() {
18202 IsFinalized = false;
18203 CommonMask.clear();
18204 InVectors.clear();
18205 }
18206 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18207 /// shuffling.
18208 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18209 Value *V1 = getVectorizedValue(E1);
18210 Value *V2 = getVectorizedValue(E2);
18211 add(V1, V2, Mask);
18212 }
18213 /// Adds single input vector (in form of tree entry) and the mask for its
18214 /// shuffling.
18215 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18216 Value *V1 = getVectorizedValue(E1);
18217 add(V1, Mask);
18218 }
18219 /// Adds 2 input vectors and the mask for their shuffling.
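  /// For example, if CommonMask is currently <0, 1, poison, poison> over a
  /// 4-wide vector and a new pair is added with mask <poison, poison, 0, 1>,
  /// the pair is first combined into one vector with that mask and CommonMask
  /// becomes <0, 1, 6, 7>, i.e. lanes 2 and 3 now select from the newly
  /// combined vector.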
18220 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18221 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18222 assert(isa<FixedVectorType>(V1->getType()) &&
18223 isa<FixedVectorType>(V2->getType()) &&
18224 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18225 V1 = castToScalarTyElem(V1);
18226 V2 = castToScalarTyElem(V2);
18227 if (InVectors.empty()) {
18228 InVectors.push_back(V1);
18229 InVectors.push_back(V2);
18230 CommonMask.assign(Mask.begin(), Mask.end());
18231 return;
18232 }
18233 Value *Vec = InVectors.front();
18234 if (InVectors.size() == 2) {
18235 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18236 transformMaskAfterShuffle(CommonMask, CommonMask);
18237 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18238 Mask.size()) {
18239 Vec = createShuffle(Vec, nullptr, CommonMask);
18240 transformMaskAfterShuffle(CommonMask, CommonMask);
18241 }
18242 V1 = createShuffle(V1, V2, Mask);
18243 unsigned VF = std::max(getVF(V1), getVF(Vec));
18244 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18245 if (Mask[Idx] != PoisonMaskElem)
18246 CommonMask[Idx] = Idx + VF;
18247 InVectors.front() = Vec;
18248 if (InVectors.size() == 2)
18249 InVectors.back() = V1;
18250 else
18251 InVectors.push_back(V1);
18252 }
18253 /// Adds one more input vector and the mask for the shuffling.
18254 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18255 assert(isa<FixedVectorType>(V1->getType()) &&
18256 "castToScalarTyElem expects V1 to be FixedVectorType");
18257 V1 = castToScalarTyElem(V1);
18258 if (InVectors.empty()) {
18259 InVectors.push_back(V1);
18260 CommonMask.assign(Mask.begin(), Mask.end());
18261 return;
18262 }
18263 const auto *It = find(InVectors, V1);
18264 if (It == InVectors.end()) {
18265 if (InVectors.size() == 2 ||
18266 InVectors.front()->getType() != V1->getType()) {
18267 Value *V = InVectors.front();
18268 if (InVectors.size() == 2) {
18269 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18270 transformMaskAfterShuffle(CommonMask, CommonMask);
18271 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18272 CommonMask.size()) {
18273 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18274 transformMaskAfterShuffle(CommonMask, CommonMask);
18275 }
18276 unsigned VF = std::max(CommonMask.size(), Mask.size());
18277 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18278 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18279 CommonMask[Idx] = V->getType() != V1->getType()
18280 ? Idx + VF
18281 : Mask[Idx] + getVF(V1);
18282 if (V->getType() != V1->getType())
18283 V1 = createShuffle(V1, nullptr, Mask);
18284 InVectors.front() = V;
18285 if (InVectors.size() == 2)
18286 InVectors.back() = V1;
18287 else
18288 InVectors.push_back(V1);
18289 return;
18290 }
18291 // Check if second vector is required if the used elements are already
18292 // used from the first one.
18293 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18294 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18295 InVectors.push_back(V1);
18296 break;
18297 }
18298 }
18299 unsigned VF = 0;
18300 for (Value *V : InVectors)
18301 VF = std::max(VF, getVF(V));
18302 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18303 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18304 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18305 }
18306 /// Adds one more input vector and the mask for the shuffling.
18307 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18308 SmallVector<int> NewMask;
18309 inversePermutation(Order, NewMask);
18310 add(V1, NewMask);
18311 }
18312 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18313 Value *Root = nullptr) {
18314 return R.gather(VL, Root, ScalarTy,
18315 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18316 return createShuffle(V1, V2, Mask);
18317 });
18318 }
18319 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18320 /// Finalize emission of the shuffles.
18321 /// \param Action the action (if any) to be performed before final applying of
18322 /// the \p ExtMask mask.
18323 Value *finalize(
18324 ArrayRef<int> ExtMask,
18325 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18326 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18327 function_ref<void(Value *&, SmallVectorImpl<int> &,
18328 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18329 Action = {}) {
18330 IsFinalized = true;
18331 if (Action) {
18332 Value *Vec = InVectors.front();
18333 if (InVectors.size() == 2) {
18334 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18335 InVectors.pop_back();
18336 } else {
18337 Vec = createShuffle(Vec, nullptr, CommonMask);
18338 }
18339 transformMaskAfterShuffle(CommonMask, CommonMask);
18340 assert(VF > 0 &&
18341 "Expected vector length for the final value before action.");
18342 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18343 if (VecVF < VF) {
18344 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18345 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18346 Vec = createShuffle(Vec, nullptr, ResizeMask);
18347 }
18348 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18349 return createShuffle(V1, V2, Mask);
18350 });
18351 InVectors.front() = Vec;
18352 }
18353 if (!SubVectors.empty()) {
18354 Value *Vec = InVectors.front();
18355 if (InVectors.size() == 2) {
18356 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18357 InVectors.pop_back();
18358 } else {
18359 Vec = createShuffle(Vec, nullptr, CommonMask);
18360 }
18361 transformMaskAfterShuffle(CommonMask, CommonMask);
18362 auto CreateSubVectors = [&](Value *Vec,
18363 SmallVectorImpl<int> &CommonMask) {
18364 for (auto [E, Idx] : SubVectors) {
18365 Value *V = getVectorizedValue(*E);
18366 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18367 // Use the scalar version of ScalarTy to correctly handle shuffles
18368 // for revectorization. The revectorization mode operates on the
18369 // vectors, but here we need to operate on the scalars, because the
18370 // masks were already transformed for the vector elements and we don't
18371 // need to do this transformation again.
18372 Type *OrigScalarTy = ScalarTy;
18373 ScalarTy = ScalarTy->getScalarType();
18374 Vec = createInsertVector(
18375 Builder, Vec, V, InsertionIndex,
18376 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18377 _3));
18378 ScalarTy = OrigScalarTy;
18379 if (!CommonMask.empty()) {
18380 std::iota(std::next(CommonMask.begin(), Idx),
18381 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18382 Idx);
18383 }
18384 }
18385 return Vec;
18386 };
18387 if (SubVectorsMask.empty()) {
18388 Vec = CreateSubVectors(Vec, CommonMask);
18389 } else {
18390 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18391 copy(SubVectorsMask, SVMask.begin());
18392 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18393 if (I2 != PoisonMaskElem) {
18394 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18395 I1 = I2 + CommonMask.size();
18396 }
18397 }
18398 Value *InsertVec =
18399 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18400 Vec = createShuffle(InsertVec, Vec, SVMask);
18401 transformMaskAfterShuffle(CommonMask, SVMask);
18402 }
18403 InVectors.front() = Vec;
18404 }
18405
18406 if (!ExtMask.empty()) {
18407 if (CommonMask.empty()) {
18408 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18409 } else {
18410 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18411 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18412 if (ExtMask[I] == PoisonMaskElem)
18413 continue;
18414 NewMask[I] = CommonMask[ExtMask[I]];
18415 }
18416 CommonMask.swap(NewMask);
18417 }
18418 }
18419 if (CommonMask.empty()) {
18420 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18421 return InVectors.front();
18422 }
18423 if (InVectors.size() == 2)
18424 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18425 return createShuffle(InVectors.front(), nullptr, CommonMask);
18426 }
18427
18428 ~ShuffleInstructionBuilder() {
18429 assert((IsFinalized || CommonMask.empty()) &&
18430 "Shuffle construction must be finalized.");
18431 }
18432};
18433
18434Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18435 return vectorizeTree(getOperandEntry(E, NodeIdx));
18436}
18437
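// Build-vector (gather) emission works in stages: scalars covered by combined
// sub-entries are blanked out first, then the code tries to reuse
// extractelement sources and already vectorized tree entries via shuffles, and
// only the remaining scalars are materialized with insertelement chains
// (constants and non-constants packed separately). A freeze is appended when
// undef lanes had to be served by a possibly poisonous broadcast.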
18438template <typename BVTy, typename ResTy, typename... Args>
18439ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18440 Args &...Params) {
18441 assert(E->isGather() && "Expected gather node.");
18442 unsigned VF = E->getVectorFactor();
18443
18444 bool NeedFreeze = false;
18445 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18446 // Clear values, to be replaced by insertvector instructions.
18447 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18448 for_each(MutableArrayRef(GatheredScalars)
18449 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18450 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18451 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18452 E->CombinedEntriesWithIndices.size());
18453 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18454 [&](const auto &P) {
18455 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18456 });
18457 // Build a mask out of the reorder indices and reorder scalars per this
18458 // mask.
18459 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18460 E->ReorderIndices.end());
18461 if (!ReorderMask.empty())
18462 reorderScalars(GatheredScalars, ReorderMask);
18463 SmallVector<int> SubVectorsMask;
18464 inversePermutation(E->ReorderIndices, SubVectorsMask);
18465 // Transform non-clustered elements in the mask to poison (-1).
18466 // "Clustered" operations will be reordered using this mask later.
18467 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18468 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18469 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18470 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18471 } else {
18472 SubVectorsMask.clear();
18473 }
18474 SmallVector<Value *> StoredGS(GatheredScalars);
18475 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18476 unsigned I, unsigned SliceSize,
18477 bool IsNotPoisonous) {
18478 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18479 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18480 }))
18481 return false;
18482 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18483 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18484 if (UserTE->getNumOperands() != 2)
18485 return false;
18486 if (!IsNotPoisonous) {
18487 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18488 [=](const std::unique_ptr<TreeEntry> &TE) {
18489 return TE->UserTreeIndex.UserTE == UserTE &&
18490 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18491 });
18492 if (It == VectorizableTree.end())
18493 return false;
18494 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18495 if (!(*It)->ReorderIndices.empty()) {
18496 inversePermutation((*It)->ReorderIndices, ReorderMask);
18497 reorderScalars(GS, ReorderMask);
18498 }
18499 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18500 Value *V0 = std::get<0>(P);
18501 Value *V1 = std::get<1>(P);
18502 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18503 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18504 is_contained(E->Scalars, V1));
18505 }))
18506 return false;
18507 }
18508 int Idx;
18509 if ((Mask.size() < InputVF &&
18510 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18511 Idx == 0) ||
18512 (Mask.size() == InputVF &&
18513 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18514 std::iota(
18515 std::next(Mask.begin(), I * SliceSize),
18516 std::next(Mask.begin(),
18517 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18518 0);
18519 } else {
18520 unsigned IVal =
18521 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18522 std::fill(
18523 std::next(Mask.begin(), I * SliceSize),
18524 std::next(Mask.begin(),
18525 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18526 IVal);
18527 }
18528 return true;
18529 };
18530 BVTy ShuffleBuilder(ScalarTy, Params...);
18531 ResTy Res = ResTy();
18532 SmallVector<int> Mask;
18533 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18534 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18535 Value *ExtractVecBase = nullptr;
18536 bool UseVecBaseAsInput = false;
18537 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18538 SmallVector<SmallVector<const TreeEntry *>> Entries;
18539 Type *OrigScalarTy = GatheredScalars.front()->getType();
18540 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18541 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18542 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18543 // Check for gathered extracts.
18544 bool Resized = false;
18545 ExtractShuffles =
18546 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18547 if (!ExtractShuffles.empty()) {
18548 SmallVector<const TreeEntry *> ExtractEntries;
18549 for (auto [Idx, I] : enumerate(ExtractMask)) {
18550 if (I == PoisonMaskElem)
18551 continue;
18552 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18553 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18554 !TEs.empty())
18555 ExtractEntries.append(TEs.begin(), TEs.end());
18556 }
18557 if (std::optional<ResTy> Delayed =
18558 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18559 // Delay emission of gathers which are not ready yet.
18560 PostponedGathers.insert(E);
18561 // Postpone gather emission; it will be emitted after the end of the
18562 // process to keep the correct order.
18563 return *Delayed;
18564 }
18565 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18566 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18567 ExtractVecBase = VecBase;
18568 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18569 if (VF == VecBaseTy->getNumElements() &&
18570 GatheredScalars.size() != VF) {
18571 Resized = true;
18572 GatheredScalars.append(VF - GatheredScalars.size(),
18573 PoisonValue::get(OrigScalarTy));
18574 NumParts =
18575 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18576 }
18577 }
18578 }
18579 // Gather extracts after we check for full matched gathers only.
18580 if (!ExtractShuffles.empty() || !E->hasState() ||
18581 E->getOpcode() != Instruction::Load ||
18582 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18583 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18584 any_of(E->Scalars,
18585 [this](Value *V) {
18586 return isa<LoadInst>(V) && isVectorized(V);
18587 })) ||
18588 (E->hasState() && E->isAltShuffle()) ||
18589 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18590 isSplat(E->Scalars) ||
18591 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18592 GatherShuffles =
18593 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18594 }
18595 if (!GatherShuffles.empty()) {
18596 if (std::optional<ResTy> Delayed =
18597 ShuffleBuilder.needToDelay(E, Entries)) {
18598 // Delay emission of gathers which are not ready yet.
18599 PostponedGathers.insert(E);
18600 // Postpone gather emission; it will be emitted after the end of the
18601 // process to keep the correct order.
18602 return *Delayed;
18603 }
18604 if (GatherShuffles.size() == 1 &&
18605 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18606 Entries.front().front()->isSame(E->Scalars)) {
18607 // Perfect match in the graph, will reuse the previously vectorized
18608 // node. Cost is 0.
18609 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18610 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18611 // Restore the mask for previous partially matched values.
18612 Mask.resize(E->Scalars.size());
18613 const TreeEntry *FrontTE = Entries.front().front();
18614 if (FrontTE->ReorderIndices.empty() &&
18615 ((FrontTE->ReuseShuffleIndices.empty() &&
18616 E->Scalars.size() == FrontTE->Scalars.size()) ||
18617 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18618 std::iota(Mask.begin(), Mask.end(), 0);
18619 } else {
18620 for (auto [I, V] : enumerate(E->Scalars)) {
18621 if (isa<PoisonValue>(V)) {
18622 Mask[I] = PoisonMaskElem;
18623 continue;
18624 }
18625 Mask[I] = FrontTE->findLaneForValue(V);
18626 }
18627 }
18628 // Reset the builder(s) to correctly handle perfect diamond matched
18629 // nodes.
18630 ShuffleBuilder.resetForSameNode();
18631 ShuffleBuilder.add(*FrontTE, Mask);
18632 // Full matched entry found, no need to insert subvectors.
18633 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18634 return Res;
18635 }
18636 if (!Resized) {
18637 if (GatheredScalars.size() != VF &&
18638 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18639 return any_of(TEs, [&](const TreeEntry *TE) {
18640 return TE->getVectorFactor() == VF;
18641 });
18642 }))
18643 GatheredScalars.append(VF - GatheredScalars.size(),
18644 PoisonValue::get(OrigScalarTy));
18645 }
18646 // Remove shuffled elements from list of gathers.
18647 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18648 if (Mask[I] != PoisonMaskElem)
18649 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18650 }
18651 }
18652 }
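  // TryPackScalars below compacts the remaining scalars into a single
  // build-vector. For a splat such as {%a, undef, %a, %a} it keeps one copy of
  // %a and rewrites the reuse mask to <0, 0, 0, 0> when %a is known not to be
  // poison (or is already used in a sibling node); otherwise the undef lanes
  // stay poison in the mask and the whole result is frozen at the end
  // (NeedFreeze).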
18653 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18654 SmallVectorImpl<int> &ReuseMask,
18655 bool IsRootPoison) {
18656 // For splats we can emit broadcasts instead of gathers, so try to find
18657 // such sequences.
18658 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18659 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18660 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18661 SmallVector<int> UndefPos;
18662 DenseMap<Value *, unsigned> UniquePositions;
18663 // Gather unique non-const values and all constant values.
18664 // For repeated values, just shuffle them.
18665 int NumNonConsts = 0;
18666 int SinglePos = 0;
18667 for (auto [I, V] : enumerate(Scalars)) {
18668 if (isa<UndefValue>(V)) {
18669 if (!isa<PoisonValue>(V)) {
18670 ReuseMask[I] = I;
18671 UndefPos.push_back(I);
18672 }
18673 continue;
18674 }
18675 if (isConstant(V)) {
18676 ReuseMask[I] = I;
18677 continue;
18678 }
18679 ++NumNonConsts;
18680 SinglePos = I;
18681 Value *OrigV = V;
18682 Scalars[I] = PoisonValue::get(OrigScalarTy);
18683 if (IsSplat) {
18684 Scalars.front() = OrigV;
18685 ReuseMask[I] = 0;
18686 } else {
18687 const auto Res = UniquePositions.try_emplace(OrigV, I);
18688 Scalars[Res.first->second] = OrigV;
18689 ReuseMask[I] = Res.first->second;
18690 }
18691 }
18692 if (NumNonConsts == 1) {
18693 // Restore single insert element.
18694 if (IsSplat) {
18695 ReuseMask.assign(VF, PoisonMaskElem);
18696 std::swap(Scalars.front(), Scalars[SinglePos]);
18697 if (!UndefPos.empty() && UndefPos.front() == 0)
18698 Scalars.front() = UndefValue::get(OrigScalarTy);
18699 }
18700 ReuseMask[SinglePos] = SinglePos;
18701 } else if (!UndefPos.empty() && IsSplat) {
18702 // For undef values, try to replace them with the simple broadcast.
18703 // We can do it if the broadcasted value is guaranteed to be
18704 // non-poisonous, or by freezing the incoming scalar value first.
18705 auto *It = find_if(Scalars, [this, E](Value *V) {
18706 return !isa<UndefValue>(V) &&
18707 (isGuaranteedNotToBePoison(V, AC) ||
18708 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18709 // Check if the value is already used in the same operation in
18710 // one of the nodes.
18711 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18712 is_contained(E->UserTreeIndex.UserTE->Scalars,
18713 U.getUser());
18714 })));
18715 });
18716 if (It != Scalars.end()) {
18717 // Replace undefs by the non-poisoned scalars and emit broadcast.
18718 int Pos = std::distance(Scalars.begin(), It);
18719 for (int I : UndefPos) {
18720 // Set the undef position to the non-poisoned scalar.
18721 ReuseMask[I] = Pos;
18722 // Replace the undef by poison; in the mask it is already replaced by
18723 // the non-poisoned scalar.
18724 if (I != Pos)
18725 Scalars[I] = PoisonValue::get(OrigScalarTy);
18726 }
18727 } else {
18728 // Replace undefs by the poisons, emit broadcast and then emit
18729 // freeze.
18730 for (int I : UndefPos) {
18731 ReuseMask[I] = PoisonMaskElem;
18732 if (isa<UndefValue>(Scalars[I]))
18733 Scalars[I] = PoisonValue::get(OrigScalarTy);
18734 }
18735 NeedFreeze = true;
18736 }
18737 }
18738 };
18739 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18740 bool IsNonPoisoned = true;
18741 bool IsUsedInExpr = true;
18742 Value *Vec1 = nullptr;
18743 if (!ExtractShuffles.empty()) {
18744 // Gather of extractelements can be represented as just a shuffle of
18745 // a single/two vectors the scalars are extracted from.
18746 // Find input vectors.
18747 Value *Vec2 = nullptr;
18748 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18749 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18750 ExtractMask[I] = PoisonMaskElem;
18751 }
18752 if (UseVecBaseAsInput) {
18753 Vec1 = ExtractVecBase;
18754 } else {
18755 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18756 if (ExtractMask[I] == PoisonMaskElem)
18757 continue;
18758 if (isa<UndefValue>(StoredGS[I]))
18759 continue;
18760 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18761 Value *VecOp = EI->getVectorOperand();
18762 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18763 !TEs.empty() && TEs.front()->VectorizedValue)
18764 VecOp = TEs.front()->VectorizedValue;
18765 if (!Vec1) {
18766 Vec1 = VecOp;
18767 } else if (Vec1 != VecOp) {
18768 assert((!Vec2 || Vec2 == VecOp) &&
18769 "Expected only 1 or 2 vectors shuffle.");
18770 Vec2 = VecOp;
18771 }
18772 }
18773 }
18774 if (Vec2) {
18775 IsUsedInExpr = false;
18776 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18777 isGuaranteedNotToBePoison(Vec2, AC);
18778 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18779 } else if (Vec1) {
18780 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18781 IsUsedInExpr &= FindReusedSplat(
18782 ExtractMask,
18783 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18784 ExtractMask.size(), IsNotPoisonedVec);
18785 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18786 IsNonPoisoned &= IsNotPoisonedVec;
18787 } else {
18788 IsUsedInExpr = false;
18789 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18790 /*ForExtracts=*/true);
18791 }
18792 }
18793 if (!GatherShuffles.empty()) {
18794 unsigned SliceSize =
18795 getPartNumElems(E->Scalars.size(),
18796 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18797 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18798 for (const auto [I, TEs] : enumerate(Entries)) {
18799 if (TEs.empty()) {
18800 assert(!GatherShuffles[I] &&
18801 "No shuffles with empty entries list expected.");
18802 continue;
18803 }
18804 assert((TEs.size() == 1 || TEs.size() == 2) &&
18805 "Expected shuffle of 1 or 2 entries.");
18806 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18807 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18808 VecMask.assign(VecMask.size(), PoisonMaskElem);
18809 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18810 if (TEs.size() == 1) {
18811 bool IsNotPoisonedVec =
18812 TEs.front()->VectorizedValue
18813 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18814 : true;
18815 IsUsedInExpr &=
18816 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18817 SliceSize, IsNotPoisonedVec);
18818 ShuffleBuilder.add(*TEs.front(), VecMask);
18819 IsNonPoisoned &= IsNotPoisonedVec;
18820 } else {
18821 IsUsedInExpr = false;
18822 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18823 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18824 IsNonPoisoned &=
18825 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18826 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18827 }
18828 }
18829 }
18830 // Try to figure out the best way to combine values: build a shuffle and
18831 // insert elements or just build several shuffles.
18832 // Insert non-constant scalars.
18833 SmallVector<Value *> NonConstants(GatheredScalars);
18834 int EMSz = ExtractMask.size();
18835 int MSz = Mask.size();
18836 // Try to build constant vector and shuffle with it only if currently we
18837 // have a single permutation and more than 1 scalar constants.
18838 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18839 bool IsIdentityShuffle =
18840 ((UseVecBaseAsInput ||
18841 all_of(ExtractShuffles,
18842 [](const std::optional<TTI::ShuffleKind> &SK) {
18843 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18844 TTI::SK_PermuteSingleSrc;
18845 })) &&
18846 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18847 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18848 (!GatherShuffles.empty() &&
18849 all_of(GatherShuffles,
18850 [](const std::optional<TTI::ShuffleKind> &SK) {
18851 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18852 TTI::SK_PermuteSingleSrc;
18853 }) &&
18854 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18855 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18856 bool EnoughConstsForShuffle =
18857 IsSingleShuffle &&
18858 (none_of(GatheredScalars,
18859 [](Value *V) {
18860 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18861 }) ||
18862 any_of(GatheredScalars,
18863 [](Value *V) {
18864 return isa<Constant>(V) && !isa<UndefValue>(V);
18865 })) &&
18866 (!IsIdentityShuffle ||
18867 (GatheredScalars.size() == 2 &&
18868 any_of(GatheredScalars,
18869 [](Value *V) { return !isa<UndefValue>(V); })) ||
18870 count_if(GatheredScalars, [](Value *V) {
18871 return isa<Constant>(V) && !isa<PoisonValue>(V);
18872 }) > 1);
18873 // The NonConstants array contains just the non-constant values; GatheredScalars
18874 // contains only constants to build the final vector and then shuffle.
18875 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18876 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18877 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18878 else
18879 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18880 }
18881 // Generate constants for final shuffle and build a mask for them.
18882 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18883 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18884 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18885 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18886 ShuffleBuilder.add(BV, BVMask);
18887 }
18888 if (all_of(NonConstants, [=](Value *V) {
18889 return isa<PoisonValue>(V) ||
18890 (IsSingleShuffle && ((IsIdentityShuffle &&
18891 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18892 }))
18893 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18894 SubVectorsMask);
18895 else
18896 Res = ShuffleBuilder.finalize(
18897 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18898 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18899 bool IsSplat = isSplat(NonConstants);
18900 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18901 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18902 auto CheckIfSplatIsProfitable = [&]() {
18903 // Estimate the cost of splatting + shuffle and compare with
18904 // insert + shuffle.
18905 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18906 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18907 if (isa<ExtractElementInst>(V) || isVectorized(V))
18908 return false;
18909 InstructionCost SplatCost = TTI->getVectorInstrCost(
18910 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18911 PoisonValue::get(VecTy), V);
18912 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18913 for (auto [Idx, I] : enumerate(BVMask))
18914 if (I != PoisonMaskElem)
18915 NewMask[Idx] = Mask.size();
18916 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18917 NewMask, CostKind);
18918 InstructionCost BVCost = TTI->getVectorInstrCost(
18919 Instruction::InsertElement, VecTy, CostKind,
18920 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18921 Vec, V);
18922 // Shuffle required?
18923 if (count(BVMask, PoisonMaskElem) <
18924 static_cast<int>(BVMask.size() - 1)) {
18925 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18926 for (auto [Idx, I] : enumerate(BVMask))
18927 if (I != PoisonMaskElem)
18928 NewMask[Idx] = I;
18929 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18930 VecTy, NewMask, CostKind);
18931 }
18932 return SplatCost <= BVCost;
18933 };
18934 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18935 for (auto [Idx, I] : enumerate(BVMask))
18936 if (I != PoisonMaskElem)
18937 Mask[Idx] = I;
18938 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18939 } else {
18940 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18941 SmallVector<Value *> Values(NonConstants.size(),
18942 PoisonValue::get(ScalarTy));
18943 Values[0] = V;
18944 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18945 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18946 transform(BVMask, SplatMask.begin(), [](int I) {
18947 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18948 });
18949 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18950 BV = CreateShuffle(BV, nullptr, SplatMask);
18951 for (auto [Idx, I] : enumerate(BVMask))
18952 if (I != PoisonMaskElem)
18953 Mask[Idx] = BVMask.size() + Idx;
18954 Vec = CreateShuffle(Vec, BV, Mask);
18955 for (auto [Idx, I] : enumerate(Mask))
18956 if (I != PoisonMaskElem)
18957 Mask[Idx] = Idx;
18958 }
18959 });
18960 } else if (!allConstant(GatheredScalars)) {
18961 // Gather unique scalars and all constants.
18962 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18963 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18964 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18965 ShuffleBuilder.add(BV, ReuseMask);
18966 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18967 SubVectorsMask);
18968 } else {
18969 // Gather all constants.
18970 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18971 for (auto [I, V] : enumerate(GatheredScalars)) {
18972 if (!isa<PoisonValue>(V))
18973 Mask[I] = I;
18974 }
18975 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18976 ShuffleBuilder.add(BV, Mask);
18977 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18978 SubVectorsMask);
18979 }
18980
18981 if (NeedFreeze)
18982 Res = ShuffleBuilder.createFreeze(Res);
18983 return Res;
18984}
18985
18986Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18987 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18988 (void)vectorizeTree(VectorizableTree[EIdx].get());
18989 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18990 Builder, *this);
18991}
18992
18993/// \returns \p Inst after propagating metadata from \p VL only for instructions
18994/// in \p VL.
18995static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18996 SmallVector<Value *> Insts;
18997 for (Value *V : VL)
18998 if (isa<Instruction>(V))
18999 Insts.push_back(V);
19000 return llvm::propagateMetadata(Inst, Insts);
19001}
19002
19003static DebugLoc getDebugLocFromPHI(PHINode &PN) {
19004 if (DebugLoc DL = PN.getDebugLoc())
19005 return DL;
19006 return DebugLoc::getUnknown();
19007}
19008
19009Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19010 IRBuilderBase::InsertPointGuard Guard(Builder);
19011
19012 Value *V = E->Scalars.front();
19013 Type *ScalarTy = V->getType();
19014 if (!isa<CmpInst>(V))
19015 ScalarTy = getValueType(V);
19016 auto It = MinBWs.find(E);
19017 if (It != MinBWs.end()) {
19018 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19019 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19020 if (VecTy)
19021 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19022 }
19023 if (E->VectorizedValue)
19024 return E->VectorizedValue;
19025 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19026 if (E->isGather()) {
19027 // Set insert point for non-reduction initial nodes.
19028 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19029 setInsertPointAfterBundle(E);
19030 Value *Vec = createBuildVector(E, ScalarTy);
19031 E->VectorizedValue = Vec;
19032 return Vec;
19033 }
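  // Split vectorization: the node is emitted as two independently vectorized
  // halves. Each sub-entry is vectorized, extended back to the common element
  // type if it was shrunk via MinBWs, widened to a common VF if needed, and
  // the halves are recombined with an insertvector (back-to-back layout) or a
  // single two-source shufflevector driven by the split mask.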
19034 if (E->State == TreeEntry::SplitVectorize) {
19035 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19036 "Expected exactly 2 combined entries.");
19037 setInsertPointAfterBundle(E);
19038 TreeEntry &OpTE1 =
19039 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19040 assert(OpTE1.isSame(
19041 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19042 "Expected same first part of scalars.");
19043 Value *Op1 = vectorizeTree(&OpTE1);
19044 TreeEntry &OpTE2 =
19045 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19046 assert(
19047 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19048 "Expected same second part of scalars.");
19049 Value *Op2 = vectorizeTree(&OpTE2);
19050 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19051 bool IsSigned = false;
19052 auto It = MinBWs.find(OpE);
19053 if (It != MinBWs.end())
19054 IsSigned = It->second.second;
19055 else
19056 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19057 if (isa<PoisonValue>(R))
19058 return false;
19059 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19060 });
19061 return IsSigned;
19062 };
19063 if (cast<VectorType>(Op1->getType())->getElementType() !=
19064 ScalarTy->getScalarType()) {
19065 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19066 Op1 = Builder.CreateIntCast(
19067 Op1,
19068 getWidenedType(
19069 ScalarTy,
19070 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19071 GetOperandSignedness(&OpTE1));
19072 }
19073 if (cast<VectorType>(Op2->getType())->getElementType() !=
19074 ScalarTy->getScalarType()) {
19075 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19076 Op2 = Builder.CreateIntCast(
19077 Op2,
19078 getWidenedType(
19079 ScalarTy,
19080 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19081 GetOperandSignedness(&OpTE2));
19082 }
19083 if (E->ReorderIndices.empty()) {
19084 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19085 std::iota(
19086 Mask.begin(),
19087 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19088 0);
19089 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19090 if (ScalarTyNumElements != 1) {
19091 assert(SLPReVec && "Only supported by REVEC.");
19092 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19093 }
19094 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19095 Vec = createInsertVector(Builder, Vec, Op2,
19096 E->CombinedEntriesWithIndices.back().second *
19097 ScalarTyNumElements);
19098 E->VectorizedValue = Vec;
19099 return Vec;
19100 }
19101 unsigned CommonVF =
19102 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19103 if (getNumElements(Op1->getType()) != CommonVF) {
19104 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19105 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19106 0);
19107 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19108 }
19109 if (getNumElements(Op2->getType()) != CommonVF) {
19110 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19111 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19112 0);
19113 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19114 }
19115 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19116 E->VectorizedValue = Vec;
19117 return Vec;
19118 }
19119
19120 bool IsReverseOrder =
19121 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19122 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19123 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19124 if (E->getOpcode() == Instruction::Store &&
19125 E->State == TreeEntry::Vectorize) {
19126 ArrayRef<int> Mask =
19127 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19128 E->ReorderIndices.size());
19129 ShuffleBuilder.add(V, Mask);
19130 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19131 E->State == TreeEntry::CompressVectorize) {
19132 ShuffleBuilder.addOrdered(V, {});
19133 } else {
19134 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19135 }
19136 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19137 E->CombinedEntriesWithIndices.size());
19138 transform(
19139 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19140 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19141 });
19142 assert(
19143 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19144 "Expected either combined subnodes or reordering");
19145 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19146 };
19147
19148 assert(!E->isGather() && "Unhandled state");
19149 unsigned ShuffleOrOp =
19150 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19151 Instruction *VL0 = E->getMainOp();
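  // Operands narrowed via MinBWs must be widened back with the right
  // signedness (sext for signed values, zext otherwise). GetOperandSignedness
  // below answers that per operand, falling back to value tracking
  // (isKnownNonNegative) when the operand entry has no MinBWs record.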
19152 auto GetOperandSignedness = [&](unsigned Idx) {
19153 const TreeEntry *OpE = getOperandEntry(E, Idx);
19154 bool IsSigned = false;
19155 auto It = MinBWs.find(OpE);
19156 if (It != MinBWs.end())
19157 IsSigned = It->second.second;
19158 else
19159 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19160 if (isa<PoisonValue>(R))
19161 return false;
19162 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19163 });
19164 return IsSigned;
19165 };
19166 switch (ShuffleOrOp) {
19167 case Instruction::PHI: {
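    // Vector PHIs are built in two steps to tolerate cycles: an empty vector
    // PHI is created and registered as the vectorized value first, and only
    // then are the incoming values vectorized, with the insertion point at
    // each predecessor's terminator. If recursion already filled in the
    // incoming values, the PHI is returned as-is; repeated predecessor blocks
    // reuse the value created on the first visit.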
19168 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19169 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19170 "PHI reordering is free.");
19171 auto *PH = cast<PHINode>(VL0);
19172 Builder.SetInsertPoint(PH->getParent(),
19173 PH->getParent()->getFirstNonPHIIt());
19174 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19175 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19176 Value *V = NewPhi;
19177
19178 // Adjust insertion point once all PHIs have been generated.
19179 Builder.SetInsertPoint(PH->getParent(),
19180 PH->getParent()->getFirstInsertionPt());
19181 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19182
19183 V = FinalShuffle(V, E);
19184
19185 E->VectorizedValue = V;
19186 // If phi node is fully emitted - exit.
19187 if (NewPhi->getNumIncomingValues() != 0)
19188 return NewPhi;
19189
19190 // PHINodes may have multiple entries from the same block. We want to
19191 // visit every block once.
19192 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19193
19194 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19195 BasicBlock *IBB = PH->getIncomingBlock(I);
19196
19197 // Stop emission if all incoming values are generated.
19198 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19199 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19200 return NewPhi;
19201 }
19202
19203 if (!VisitedBBs.insert(IBB).second) {
19204 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19205 NewPhi->addIncoming(VecOp, IBB);
19206 TreeEntry *OpTE = getOperandEntry(E, I);
19207 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19208 OpTE->VectorizedValue = VecOp;
19209 continue;
19210 }
19211
19212 Builder.SetInsertPoint(IBB->getTerminator());
19213 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19214 Value *Vec = vectorizeOperand(E, I);
19215 if (VecTy != Vec->getType()) {
19216 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19217 MinBWs.contains(getOperandEntry(E, I))) &&
19218 "Expected item in MinBWs.");
19219 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19220 }
19221 NewPhi->addIncoming(Vec, IBB);
19222 }
19223
19224 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19225 "Invalid number of incoming values");
19226 assert(E->VectorizedValue && "Expected vectorized value.");
19227 return E->VectorizedValue;
19228 }
19229
19230 case Instruction::ExtractElement: {
19231 Value *V = E->getSingleOperand(0);
19232 setInsertPointAfterBundle(E);
19233 V = FinalShuffle(V, E);
19234 E->VectorizedValue = V;
19235 return V;
19236 }
19237 case Instruction::ExtractValue: {
19238 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19239 Builder.SetInsertPoint(LI);
19240 Value *Ptr = LI->getPointerOperand();
19241 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19242 Value *NewV = ::propagateMetadata(V, E->Scalars);
19243 NewV = FinalShuffle(NewV, E);
19244 E->VectorizedValue = NewV;
19245 return NewV;
19246 }
19247 case Instruction::InsertElement: {
19248 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19249 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19250 OpE && !OpE->isGather() && OpE->hasState() &&
19251 !OpE->hasCopyableElements())
19252 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19253 else
19254 setInsertPointAfterBundle(E);
19255 Value *V = vectorizeOperand(E, 1);
19256 ArrayRef<Value *> Op = E->getOperand(1);
19257 Type *ScalarTy = Op.front()->getType();
19258 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19259 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19260 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19261 assert(Res.first > 0 && "Expected item in MinBWs.");
19262 V = Builder.CreateIntCast(
19263 V,
19264 getWidenedType(
19265 ScalarTy,
19266 cast<FixedVectorType>(V->getType())->getNumElements()),
19267 Res.second);
19268 }
19269
19270 // Create InsertVector shuffle if necessary
19271 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19272 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19273 }));
19274 const unsigned NumElts =
19275 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19276 const unsigned NumScalars = E->Scalars.size();
19277
19278 unsigned Offset = *getElementIndex(VL0);
19279 assert(Offset < NumElts && "Failed to find vector index offset");
19280
19281 // Create shuffle to resize vector
19282 SmallVector<int> Mask;
19283 if (!E->ReorderIndices.empty()) {
19284 inversePermutation(E->ReorderIndices, Mask);
19285 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19286 } else {
19287 Mask.assign(NumElts, PoisonMaskElem);
19288 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19289 }
19290 // Create InsertVector shuffle if necessary
19291 bool IsIdentity = true;
19292 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19293 Mask.swap(PrevMask);
19294 for (unsigned I = 0; I < NumScalars; ++I) {
19295 Value *Scalar = E->Scalars[PrevMask[I]];
19296 unsigned InsertIdx = *getElementIndex(Scalar);
19297 IsIdentity &= InsertIdx - Offset == I;
19298 Mask[InsertIdx - Offset] = I;
19299 }
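// For illustration: a buildvector of 4 scalars inserted at lanes 2..5 of an
// 8-wide vector yields Offset == 2 and Mask == {0, 1, 2, 3, P, P, P, P}
// (P == PoisonMaskElem), so IsIdentity remains true; a resizing shuffle is
// still emitted below because NumElts != NumScalars.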
19300 if (!IsIdentity || NumElts != NumScalars) {
19301 Value *V2 = nullptr;
19302 bool IsVNonPoisonous =
19303 isGuaranteedNotToBePoison(V) && !isConstant(V);
19304 SmallVector<int> InsertMask(Mask);
19305 if (NumElts != NumScalars && Offset == 0) {
19306 // Follow all insert element instructions from the current buildvector
19307 // sequence.
19308 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19309 do {
19310 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19311 if (!InsertIdx)
19312 break;
19313 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19314 InsertMask[*InsertIdx] = *InsertIdx;
19315 if (!Ins->hasOneUse())
19316 break;
19317 Ins = dyn_cast_or_null<InsertElementInst>(
19318 Ins->getUniqueUndroppableUser());
19319 } while (Ins);
19320 SmallBitVector UseMask =
19321 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19322 SmallBitVector IsFirstPoison =
19323 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19324 SmallBitVector IsFirstUndef =
19325 isUndefVector(FirstInsert->getOperand(0), UseMask);
19326 if (!IsFirstPoison.all()) {
19327 unsigned Idx = 0;
19328 for (unsigned I = 0; I < NumElts; I++) {
19329 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19330 IsFirstUndef.test(I)) {
19331 if (IsVNonPoisonous) {
19332 InsertMask[I] = I < NumScalars ? I : 0;
19333 continue;
19334 }
19335 if (!V2)
19336 V2 = UndefValue::get(V->getType());
19337 if (Idx >= NumScalars)
19338 Idx = NumScalars - 1;
19339 InsertMask[I] = NumScalars + Idx;
19340 ++Idx;
19341 } else if (InsertMask[I] != PoisonMaskElem &&
19342 Mask[I] == PoisonMaskElem) {
19343 InsertMask[I] = PoisonMaskElem;
19344 }
19345 }
19346 } else {
19347 InsertMask = Mask;
19348 }
19349 }
19350 if (!V2)
19351 V2 = PoisonValue::get(V->getType());
19352 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19353 if (auto *I = dyn_cast<Instruction>(V)) {
19354 GatherShuffleExtractSeq.insert(I);
19355 CSEBlocks.insert(I->getParent());
19356 }
19357 }
19358
19359 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19360 for (unsigned I = 0; I < NumElts; I++) {
19361 if (Mask[I] != PoisonMaskElem)
19362 InsertMask[Offset + I] = I;
19363 }
19364 SmallBitVector UseMask =
19365 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19366 SmallBitVector IsFirstUndef =
19367 isUndefVector(FirstInsert->getOperand(0), UseMask);
19368 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19369 NumElts != NumScalars) {
19370 if (IsFirstUndef.all()) {
19371 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19372 SmallBitVector IsFirstPoison =
19373 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19374 if (!IsFirstPoison.all()) {
19375 for (unsigned I = 0; I < NumElts; I++) {
19376 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19377 InsertMask[I] = I + NumElts;
19378 }
19379 }
19380 V = Builder.CreateShuffleVector(
19381 V,
19382 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19383 : FirstInsert->getOperand(0),
19384 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19385 if (auto *I = dyn_cast<Instruction>(V)) {
19386 GatherShuffleExtractSeq.insert(I);
19387 CSEBlocks.insert(I->getParent());
19388 }
19389 }
19390 } else {
19391 SmallBitVector IsFirstPoison =
19392 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19393 for (unsigned I = 0; I < NumElts; I++) {
19394 if (InsertMask[I] == PoisonMaskElem)
19395 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19396 else
19397 InsertMask[I] += NumElts;
19398 }
19399 V = Builder.CreateShuffleVector(
19400 FirstInsert->getOperand(0), V, InsertMask,
19401 cast<Instruction>(E->Scalars.back())->getName());
19402 if (auto *I = dyn_cast<Instruction>(V)) {
19403 GatherShuffleExtractSeq.insert(I);
19404 CSEBlocks.insert(I->getParent());
19405 }
19406 }
19407 }
19408
19409 ++NumVectorInstructions;
19410 E->VectorizedValue = V;
19411 return V;
19412 }
19413 case Instruction::ZExt:
19414 case Instruction::SExt:
19415 case Instruction::FPToUI:
19416 case Instruction::FPToSI:
19417 case Instruction::FPExt:
19418 case Instruction::PtrToInt:
19419 case Instruction::IntToPtr:
19420 case Instruction::SIToFP:
19421 case Instruction::UIToFP:
19422 case Instruction::Trunc:
19423 case Instruction::FPTrunc:
19424 case Instruction::BitCast: {
19425 setInsertPointAfterBundle(E);
19426
19427 Value *InVec = vectorizeOperand(E, 0);
19428
19429 auto *CI = cast<CastInst>(VL0);
19430 Instruction::CastOps VecOpcode = CI->getOpcode();
19431 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19432 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19433 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19434 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19435 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19436 // Check if the values are candidates to demote.
19437 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19438 if (SrcIt != MinBWs.end())
19439 SrcBWSz = SrcIt->second.first;
19440 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19441 if (BWSz == SrcBWSz) {
19442 VecOpcode = Instruction::BitCast;
19443 } else if (BWSz < SrcBWSz) {
19444 VecOpcode = Instruction::Trunc;
19445 } else if (It != MinBWs.end()) {
19446 assert(BWSz > SrcBWSz && "Invalid cast!");
19447 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19448 } else if (SrcIt != MinBWs.end()) {
19449 assert(BWSz > SrcBWSz && "Invalid cast!");
19450 VecOpcode =
19451 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19452 }
19453 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19454 !SrcIt->second.second) {
19455 VecOpcode = Instruction::UIToFP;
19456 }
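// For example, if the original scalar was 'zext i8 %x to i32' and both this
// node and its operand were demoted to i16 in MinBWs, then BWSz == SrcBWSz,
// VecOpcode becomes BitCast, and the operand vector is reused below without
// emitting any cast at all.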
19457 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19458 ? InVec
19459 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19460 V = FinalShuffle(V, E);
19461
19462 E->VectorizedValue = V;
19463 ++NumVectorInstructions;
19464 return V;
19465 }
19466 case Instruction::FCmp:
19467 case Instruction::ICmp: {
19468 setInsertPointAfterBundle(E);
19469
19470 Value *L = vectorizeOperand(E, 0);
19471 Value *R = vectorizeOperand(E, 1);
19472 if (L->getType() != R->getType()) {
19473 assert((getOperandEntry(E, 0)->isGather() ||
19474 getOperandEntry(E, 1)->isGather() ||
19475 MinBWs.contains(getOperandEntry(E, 0)) ||
19476 MinBWs.contains(getOperandEntry(E, 1))) &&
19477 "Expected item in MinBWs.");
19478 if (cast<VectorType>(L->getType())
19479 ->getElementType()
19480 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19481 ->getElementType()
19482 ->getIntegerBitWidth()) {
19483 Type *CastTy = R->getType();
19484 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19485 } else {
19486 Type *CastTy = L->getType();
19487 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19488 }
19489 }
19490
19491 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19492 Value *V = Builder.CreateCmp(P0, L, R);
19493 propagateIRFlags(V, E->Scalars, VL0);
19494 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19495 ICmp->setSameSign(/*B=*/false);
19496 // Do not cast for cmps.
19497 VecTy = cast<FixedVectorType>(V->getType());
19498 V = FinalShuffle(V, E);
19499
19500 E->VectorizedValue = V;
19501 ++NumVectorInstructions;
19502 return V;
19503 }
19504 case Instruction::Select: {
19505 setInsertPointAfterBundle(E);
19506
19507 Value *Cond = vectorizeOperand(E, 0);
19508 Value *True = vectorizeOperand(E, 1);
19509 Value *False = vectorizeOperand(E, 2);
19510 if (True->getType() != VecTy || False->getType() != VecTy) {
19511 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19512 getOperandEntry(E, 2)->isGather() ||
19513 MinBWs.contains(getOperandEntry(E, 1)) ||
19514 MinBWs.contains(getOperandEntry(E, 2))) &&
19515 "Expected item in MinBWs.");
19516 if (True->getType() != VecTy)
19517 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19518 if (False->getType() != VecTy)
19519 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19520 }
19521
19522 unsigned CondNumElements = getNumElements(Cond->getType());
19523 unsigned TrueNumElements = getNumElements(True->getType());
19524 assert(TrueNumElements >= CondNumElements &&
19525 TrueNumElements % CondNumElements == 0 &&
19526 "Cannot vectorize Instruction::Select");
19527 assert(TrueNumElements == getNumElements(False->getType()) &&
19528 "Cannot vectorize Instruction::Select");
19529 if (CondNumElements != TrueNumElements) {
19530 // When the return type is i1 but the source is fixed vector type, we
19531 // need to duplicate the condition value.
19532 Cond = Builder.CreateShuffleVector(
19533 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19534 CondNumElements));
19535 }
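// E.g. with REVEC, a <4 x i1> condition selecting between <8 x i32> values
// (two i32 lanes per original scalar) is widened with the replicated mask
// <0, 0, 1, 1, 2, 2, 3, 3>, so every payload lane has a matching condition
// lane.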
19536 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19537 "Cannot vectorize Instruction::Select");
19538 Value *V =
19539 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
19540 V = FinalShuffle(V, E);
19541
19542 E->VectorizedValue = V;
19543 ++NumVectorInstructions;
19544 return V;
19545 }
19546 case Instruction::FNeg: {
19547 setInsertPointAfterBundle(E);
19548
19549 Value *Op = vectorizeOperand(E, 0);
19550
19551 Value *V = Builder.CreateUnOp(
19552 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19553 propagateIRFlags(V, E->Scalars, VL0);
19554 if (auto *I = dyn_cast<Instruction>(V))
19555 V = ::propagateMetadata(I, E->Scalars);
19556
19557 V = FinalShuffle(V, E);
19558
19559 E->VectorizedValue = V;
19560 ++NumVectorInstructions;
19561
19562 return V;
19563 }
19564 case Instruction::Freeze: {
19565 setInsertPointAfterBundle(E);
19566
19567 Value *Op = vectorizeOperand(E, 0);
19568
19569 if (Op->getType() != VecTy) {
19570 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19571 MinBWs.contains(getOperandEntry(E, 0))) &&
19572 "Expected item in MinBWs.");
19573 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19574 }
19575 Value *V = Builder.CreateFreeze(Op);
19576 V = FinalShuffle(V, E);
19577
19578 E->VectorizedValue = V;
19579 ++NumVectorInstructions;
19580
19581 return V;
19582 }
19583 case Instruction::Add:
19584 case Instruction::FAdd:
19585 case Instruction::Sub:
19586 case Instruction::FSub:
19587 case Instruction::Mul:
19588 case Instruction::FMul:
19589 case Instruction::UDiv:
19590 case Instruction::SDiv:
19591 case Instruction::FDiv:
19592 case Instruction::URem:
19593 case Instruction::SRem:
19594 case Instruction::FRem:
19595 case Instruction::Shl:
19596 case Instruction::LShr:
19597 case Instruction::AShr:
19598 case Instruction::And:
19599 case Instruction::Or:
19600 case Instruction::Xor: {
19601 setInsertPointAfterBundle(E);
19602
19603 Value *LHS = vectorizeOperand(E, 0);
19604 Value *RHS = vectorizeOperand(E, 1);
19605 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19606 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19607 ArrayRef<Value *> Ops = E->getOperand(I);
19608 if (all_of(Ops, [&](Value *Op) {
19609 auto *CI = dyn_cast<ConstantInt>(Op);
19610 return CI && CI->getValue().countr_one() >= It->second.first;
19611 })) {
19612 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
19613 E->VectorizedValue = V;
19614 ++NumVectorInstructions;
19615 return V;
19616 }
19617 }
19618 }
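// Example of the shortcut above: if this node was demoted to i16 and one
// operand of the scalar 'and i32 %x, 65535' is the constant 65535 (at least
// 16 trailing ones), the mask is all-ones after truncation, so the other
// operand is forwarded directly instead of emitting a vector 'and'.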
19619 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19620 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19621 getOperandEntry(E, 1)->isGather() ||
19622 MinBWs.contains(getOperandEntry(E, 0)) ||
19623 MinBWs.contains(getOperandEntry(E, 1))) &&
19624 "Expected item in MinBWs.");
19625 if (LHS->getType() != VecTy)
19626 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19627 if (RHS->getType() != VecTy)
19628 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19629 }
19630
19631 Value *V = Builder.CreateBinOp(
19632 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19633 RHS);
19634 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19635 if (auto *I = dyn_cast<Instruction>(V)) {
19636 V = ::propagateMetadata(I, E->Scalars);
19637 // Drop nuw flags for abs(sub(commutative), true).
19638 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19639 any_of(E->Scalars, [](Value *V) {
19640 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19641 }))
19642 I->setHasNoUnsignedWrap(/*b=*/false);
19643 }
19644
19645 V = FinalShuffle(V, E);
19646
19647 E->VectorizedValue = V;
19648 ++NumVectorInstructions;
19649
19650 return V;
19651 }
19652 case Instruction::Load: {
19653 // Loads are inserted at the head of the tree because we don't want to
19654 // sink them all the way down past store instructions.
19655 setInsertPointAfterBundle(E);
19656
19657 LoadInst *LI = cast<LoadInst>(VL0);
19658 Instruction *NewLI;
19659 FixedVectorType *StridedLoadTy = nullptr;
19660 Value *PO = LI->getPointerOperand();
19661 if (E->State == TreeEntry::Vectorize) {
19662 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19663 } else if (E->State == TreeEntry::CompressVectorize) {
19664 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19665 CompressEntryToData.at(E);
19666 Align CommonAlignment = LI->getAlign();
19667 if (IsMasked) {
19668 unsigned VF = getNumElements(LoadVecTy);
19669 SmallVector<Constant *> MaskValues(
19670 VF / getNumElements(LI->getType()),
19671 ConstantInt::getFalse(VecTy->getContext()));
19672 for (int I : CompressMask)
19673 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19674 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19675 assert(SLPReVec && "Only supported by REVEC.");
19676 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19677 }
19678 Constant *MaskValue = ConstantVector::get(MaskValues);
19679 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19680 MaskValue);
19681 } else {
19682 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19683 }
19684 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19685 // TODO: include this cost into CommonCost.
19686 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19687 assert(SLPReVec && "FixedVectorType is not expected.");
19688 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19689 CompressMask);
19690 }
19691 NewLI =
19692 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
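// Roughly, for scalars loaded from offsets {0, 2, 3, 7} of a common base this
// path emits something like (names illustrative):
//   %wide = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %base, i32 4,
//               <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>,
//               <8 x i32> poison)
//   %v = shufflevector <8 x i32> %wide, <8 x i32> poison,
//               <4 x i32> <i32 0, i32 2, i32 3, i32 7>
// (an ordinary wide load is used instead of the masked one when IsMasked is
// false).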
19693 } else if (E->State == TreeEntry::StridedVectorize) {
19694 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19695 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19696 PO = IsReverseOrder ? PtrN : Ptr0;
19697 Type *StrideTy = DL->getIndexType(PO->getType());
19698 Value *StrideVal;
19699 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19700 StridedLoadTy = SPtrInfo.Ty;
19701 assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry.");
19702 unsigned StridedLoadEC =
19703 StridedLoadTy->getElementCount().getKnownMinValue();
19704
19705 Value *Stride = SPtrInfo.StrideVal;
19706 if (!Stride) {
19707 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19708 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19709 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19710 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19711 &*Builder.GetInsertPoint());
19712 }
19713 Value *NewStride =
19714 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19715 StrideVal = Builder.CreateMul(
19716 NewStride, ConstantInt::get(
19717 StrideTy, (IsReverseOrder ? -1 : 1) *
19718 static_cast<int>(
19719 DL->getTypeAllocSize(ScalarTy))));
19720 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19721 auto *Inst = Builder.CreateIntrinsic(
19722 Intrinsic::experimental_vp_strided_load,
19723 {StridedLoadTy, PO->getType(), StrideTy},
19724 {PO, StrideVal,
19725 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19726 Builder.getInt32(StridedLoadEC)});
19727 Inst->addParamAttr(
19728 /*ArgNo=*/0,
19729 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19730 NewLI = Inst;
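// The emitted call looks roughly like (names illustrative):
//   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr align 4 %base, i64 %stride.bytes,
//            <4 x i1> splat (i1 true), i32 4)
// where %stride.bytes is the element stride scaled by the allocation size of
// ScalarTy (negated for reversed orders) and the pointer alignment is the
// common alignment of the scalar loads.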
19731 } else {
19732 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19733 Value *VecPtr = vectorizeOperand(E, 0);
19734 if (isa<FixedVectorType>(ScalarTy)) {
19735 assert(SLPReVec && "FixedVectorType is not expected.");
19736 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
19737 // need to expand VecPtr if ScalarTy is a vector type.
19738 unsigned ScalarTyNumElements =
19739 cast<FixedVectorType>(ScalarTy)->getNumElements();
19740 unsigned VecTyNumElements =
19741 cast<FixedVectorType>(VecTy)->getNumElements();
19742 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19743 "Cannot expand getelementptr.");
19744 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19745 SmallVector<Constant *> Indices(VecTyNumElements);
19746 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19747 return Builder.getInt64(I % ScalarTyNumElements);
19748 });
19749 VecPtr = Builder.CreateGEP(
19750 VecTy->getElementType(),
19751 Builder.CreateShuffleVector(
19752 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19753 ConstantVector::get(Indices));
19754 }
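// For example, with ScalarTy == <2 x float> and VecTy == <8 x float>, the four
// gathered pointers are replicated with mask <0, 0, 1, 1, 2, 2, 3, 3> and then
// offset by indices <0, 1, 0, 1, ...>, so each pair of pointer lanes addresses
// both elements behind one original scalar pointer.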
19755 // Use the minimum alignment of the gathered loads.
19756 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19757 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19758 }
19759 Value *V = E->State == TreeEntry::CompressVectorize
19760 ? NewLI
19761 : ::propagateMetadata(NewLI, E->Scalars);
19762
19763 V = FinalShuffle(V, E);
19764 E->VectorizedValue = V;
19765 ++NumVectorInstructions;
19766 return V;
19767 }
19768 case Instruction::Store: {
19769 auto *SI = cast<StoreInst>(VL0);
19770
19771 setInsertPointAfterBundle(E);
19772
19773 Value *VecValue = vectorizeOperand(E, 0);
19774 if (VecValue->getType() != VecTy)
19775 VecValue =
19776 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19777 VecValue = FinalShuffle(VecValue, E);
19778
19779 Value *Ptr = SI->getPointerOperand();
19780 Instruction *ST;
19781 if (E->State == TreeEntry::Vectorize) {
19782 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19783 } else {
19784 assert(E->State == TreeEntry::StridedVectorize &&
19785 "Expected either strided or consecutive stores.");
19786 if (!E->ReorderIndices.empty()) {
19787 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19788 Ptr = SI->getPointerOperand();
19789 }
19790 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19791 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19792 auto *Inst = Builder.CreateIntrinsic(
19793 Intrinsic::experimental_vp_strided_store,
19794 {VecTy, Ptr->getType(), StrideTy},
19795 {VecValue, Ptr,
19796 ConstantInt::get(
19797 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19798 Builder.getAllOnesMask(VecTy->getElementCount()),
19799 Builder.getInt32(E->Scalars.size())});
19800 Inst->addParamAttr(
19801 /*ArgNo=*/1,
19802 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19803 ST = Inst;
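// The strided-store path produces roughly (names illustrative):
//   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
//       <4 x i32> %val, ptr align 4 %base, i64 -4,
//       <4 x i1> splat (i1 true), i32 4)
// i.e. a fixed negative stride of one element size, so lane i is stored at
// %base + i * (-4) bytes.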
19804 }
19805
19806 Value *V = ::propagateMetadata(ST, E->Scalars);
19807
19808 E->VectorizedValue = V;
19809 ++NumVectorInstructions;
19810 return V;
19811 }
19812 case Instruction::GetElementPtr: {
19813 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19814 setInsertPointAfterBundle(E);
19815
19816 Value *Op0 = vectorizeOperand(E, 0);
19817
19818 SmallVector<Value *> OpVecs;
19819 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19820 Value *OpVec = vectorizeOperand(E, J);
19821 OpVecs.push_back(OpVec);
19822 }
19823
19824 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19825 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19826 SmallVector<Value *> GEPs;
19827 for (Value *V : E->Scalars) {
19828 if (isa<GetElementPtrInst>(V))
19829 GEPs.push_back(V);
19830 }
19831 V = ::propagateMetadata(I, GEPs);
19832 }
19833
19834 V = FinalShuffle(V, E);
19835
19836 E->VectorizedValue = V;
19837 ++NumVectorInstructions;
19838
19839 return V;
19840 }
19841 case Instruction::Call: {
19842 CallInst *CI = cast<CallInst>(VL0);
19843 setInsertPointAfterBundle(E);
19844
19844 
19845 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19846 
19847 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19848 CI, ID, VecTy->getNumElements(),
19849 It != MinBWs.end() ? It->second.first : 0, TTI);
19850 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19851 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19852 VecCallCosts.first <= VecCallCosts.second;
19853
19854 Value *ScalarArg = nullptr;
19855 SmallVector<Value *> OpVecs;
19856 SmallVector<Type *, 2> TysForDecl;
19857 // Add return type if intrinsic is overloaded on it.
19858 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19859 TysForDecl.push_back(VecTy);
19860 auto *CEI = cast<CallInst>(VL0);
19861 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19862 // Some intrinsics have scalar arguments. Such arguments should not be
19863 // vectorized.
19864 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19865 ScalarArg = CEI->getArgOperand(I);
19866 // If we decided to reduce the bitwidth of the abs intrinsic, its second
19867 // argument must be set to false (do not return poison if value is signed min).
19868 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19869 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19870 ScalarArg = Builder.getFalse();
19871 OpVecs.push_back(ScalarArg);
19872 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19873 TysForDecl.push_back(ScalarArg->getType());
19874 continue;
19875 }
19876
19877 Value *OpVec = vectorizeOperand(E, I);
19878 ScalarArg = CEI->getArgOperand(I);
19879 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19880 ScalarArg->getType()->getScalarType() &&
19881 It == MinBWs.end()) {
19882 auto *CastTy =
19883 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19884 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19885 } else if (It != MinBWs.end()) {
19886 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19887 }
19888 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19889 OpVecs.push_back(OpVec);
19890 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19891 TysForDecl.push_back(OpVec->getType());
19892 }
19893
19894 Function *CF;
19895 if (!UseIntrinsic) {
19896 VFShape Shape =
19897 VFShape::get(CI->getFunctionType(),
19898 ElementCount::getFixed(VecTy->getNumElements()),
19899 false /*HasGlobalPred*/);
19900 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19901 } else {
19902 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19903 }
19904
19905 SmallVector<OperandBundleDef, 1> OpBundles;
19906 CI->getOperandBundlesAsDefs(OpBundles);
19907 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19908
19909 propagateIRFlags(V, E->Scalars, VL0);
19910 V = FinalShuffle(V, E);
19911
19912 E->VectorizedValue = V;
19913 ++NumVectorInstructions;
19914 return V;
19915 }
19916 case Instruction::ShuffleVector: {
19917 Value *V;
19918 if (SLPReVec && !E->isAltShuffle()) {
19919 setInsertPointAfterBundle(E);
19920 Value *Src = vectorizeOperand(E, 0);
19921 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19922 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19923 SmallVector<int> NewMask(ThisMask.size());
19924 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19925 return SVSrc->getShuffleMask()[Mask];
19926 });
19927 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19928 SVSrc->getOperand(1), NewMask);
19929 } else {
19930 V = Builder.CreateShuffleVector(Src, ThisMask);
19931 }
19932 propagateIRFlags(V, E->Scalars, VL0);
19933 if (auto *I = dyn_cast<Instruction>(V))
19934 V = ::propagateMetadata(I, E->Scalars);
19935 V = FinalShuffle(V, E);
19936 } else {
19937 assert(E->isAltShuffle() &&
19938 ((Instruction::isBinaryOp(E->getOpcode()) &&
19939 Instruction::isBinaryOp(E->getAltOpcode())) ||
19940 (Instruction::isCast(E->getOpcode()) &&
19941 Instruction::isCast(E->getAltOpcode())) ||
19942 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19943 "Invalid Shuffle Vector Operand");
19944
19945 Value *LHS = nullptr, *RHS = nullptr;
19946 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19947 setInsertPointAfterBundle(E);
19948 LHS = vectorizeOperand(E, 0);
19949 RHS = vectorizeOperand(E, 1);
19950 } else {
19951 setInsertPointAfterBundle(E);
19952 LHS = vectorizeOperand(E, 0);
19953 }
19954 if (LHS && RHS &&
19955 ((Instruction::isBinaryOp(E->getOpcode()) &&
19956 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19957 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19958 assert((It != MinBWs.end() ||
19959 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19960 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19961 MinBWs.contains(getOperandEntry(E, 0)) ||
19962 MinBWs.contains(getOperandEntry(E, 1))) &&
19963 "Expected item in MinBWs.");
19964 Type *CastTy = VecTy;
19965 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19966 if (cast<VectorType>(LHS->getType())
19967 ->getElementType()
19968 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19969 ->getElementType()
19970 ->getIntegerBitWidth())
19971 CastTy = RHS->getType();
19972 else
19973 CastTy = LHS->getType();
19974 }
19975 if (LHS->getType() != CastTy)
19976 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19977 if (RHS->getType() != CastTy)
19978 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19979 }
19980
19981 Value *V0, *V1;
19982 if (Instruction::isBinaryOp(E->getOpcode())) {
19983 V0 = Builder.CreateBinOp(
19984 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19985 V1 = Builder.CreateBinOp(
19986 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19987 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19988 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19989 auto *AltCI = cast<CmpInst>(E->getAltOp());
19990 CmpInst::Predicate AltPred = AltCI->getPredicate();
19991 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19992 } else {
19993 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19994 unsigned SrcBWSz = DL->getTypeSizeInBits(
19995 cast<VectorType>(LHS->getType())->getElementType());
19996 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19997 if (BWSz <= SrcBWSz) {
19998 if (BWSz < SrcBWSz)
19999 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20000 assert(LHS->getType() == VecTy &&
20001 "Expected same type as operand.");
20002 if (auto *I = dyn_cast<Instruction>(LHS))
20003 LHS = ::propagateMetadata(I, E->Scalars);
20004 LHS = FinalShuffle(LHS, E);
20005 E->VectorizedValue = LHS;
20006 ++NumVectorInstructions;
20007 return LHS;
20008 }
20009 }
20010 V0 = Builder.CreateCast(
20011 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20012 V1 = Builder.CreateCast(
20013 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20014 }
20015 // Add V0 and V1 to later analysis to try to find and remove matching
20016 // instruction, if any.
20017 for (Value *V : {V0, V1}) {
20018 if (auto *I = dyn_cast<Instruction>(V)) {
20019 GatherShuffleExtractSeq.insert(I);
20020 CSEBlocks.insert(I->getParent());
20021 }
20022 }
20023
20024 // Create shuffle to take alternate operations from the vector.
20025 // Also, gather up main and alt scalar ops to propagate IR flags to
20026 // each vector operation.
20027 ValueList OpScalars, AltScalars;
20028 SmallVector<int> Mask;
20029 E->buildAltOpShuffleMask(
20030 [E, this](Instruction *I) {
20031 assert(E->getMatchingMainOpOrAltOp(I) &&
20032 "Unexpected main/alternate opcode");
20033 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20034 *TLI);
20035 },
20036 Mask, &OpScalars, &AltScalars);
20037
20038 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20039 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20040 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20041 // Drop nuw flags for abs(sub(commutative), true).
20042 if (auto *I = dyn_cast<Instruction>(Vec);
20043 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20044 any_of(E->Scalars, [](Value *V) {
20045 if (isa<PoisonValue>(V))
20046 return false;
20047 auto *IV = cast<Instruction>(V);
20048 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20049 }))
20050 I->setHasNoUnsignedWrap(/*b=*/false);
20051 };
20052 DropNuwFlag(V0, E->getOpcode());
20053 DropNuwFlag(V1, E->getAltOpcode());
20054
20055 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20056 assert(SLPReVec && "FixedVectorType is not expected.");
20057 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20058 }
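// For an alternating sequence such as {a0+b0, a1-b1, a2+b2, a3-b3}, V0 is the
// vector 'add', V1 is the vector 'sub', and Mask comes out as <0, 5, 2, 7>,
// so the shuffle below takes the even lanes from V0 and the odd lanes from V1.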
20059 V = Builder.CreateShuffleVector(V0, V1, Mask);
20060 if (auto *I = dyn_cast<Instruction>(V)) {
20061 V = ::propagateMetadata(I, E->Scalars);
20062 GatherShuffleExtractSeq.insert(I);
20063 CSEBlocks.insert(I->getParent());
20064 }
20065 }
20066
20067 E->VectorizedValue = V;
20068 ++NumVectorInstructions;
20069
20070 return V;
20071 }
20072 default:
20073 llvm_unreachable("unknown inst");
20074 }
20075 return nullptr;
20076}
20077
20078Value *BoUpSLP::vectorizeTree() {
20079 ExtraValueToDebugLocsMap ExternallyUsedValues;
20080 return vectorizeTree(ExternallyUsedValues);
20081}
20082
20083Value *BoUpSLP::vectorizeTree(
20084 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20085 Instruction *ReductionRoot,
20086 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20087 // Clean the Entry-to-LastInstruction table. It can be affected by scheduling,
20088 // so it needs to be rebuilt.
20089 EntryToLastInstruction.clear();
20090 // All blocks must be scheduled before any instructions are inserted.
20091 for (auto &BSIter : BlocksSchedules)
20092 scheduleBlock(*this, BSIter.second.get());
20093 // Cache last instructions for the nodes to avoid side effects, which may
20094 // appear during vectorization, like extra uses, etc.
20095 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20096 if (TE->isGather())
20097 continue;
20098 (void)getLastInstructionInBundle(TE.get());
20099 }
20100
20101 if (ReductionRoot)
20102 Builder.SetInsertPoint(ReductionRoot->getParent(),
20103 ReductionRoot->getIterator());
20104 else
20105 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20106
20107 // Vectorize gather operands of the nodes with the external uses only.
20108 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20109 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20110 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20111 TE->UserTreeIndex.UserTE->hasState() &&
20112 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20113 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20114 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20115 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20116 all_of(TE->UserTreeIndex.UserTE->Scalars,
20117 [](Value *V) { return isUsedOutsideBlock(V); })) {
20118 Instruction &LastInst =
20119 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20120 GatherEntries.emplace_back(TE.get(), &LastInst);
20121 }
20122 }
20123 for (auto &Entry : GatherEntries) {
20124 IRBuilderBase::InsertPointGuard Guard(Builder);
20125 Builder.SetInsertPoint(Entry.second);
20126 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20127 (void)vectorizeTree(Entry.first);
20128 }
20129 // Emit gathered loads first to emit better code for the users of those
20130 // gathered loads.
20131 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20132 if (GatheredLoadsEntriesFirst.has_value() &&
20133 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20134 (!TE->isGather() || TE->UserTreeIndex)) {
20135 assert((TE->UserTreeIndex ||
20136 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20137 "Expected gathered load node.");
20138 (void)vectorizeTree(TE.get());
20139 }
20140 }
20141 (void)vectorizeTree(VectorizableTree[0].get());
20142 // Run through the list of postponed gathers and emit them, replacing the temp
20143 // emitted allocas with actual vector instructions.
20144 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20145 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20146 for (const TreeEntry *E : PostponedNodes) {
20147 auto *TE = const_cast<TreeEntry *>(E);
20148 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20149 TE->VectorizedValue = nullptr;
20150 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20151 // If the user is a PHI node, its vector code has to be inserted right before
20152 // the block terminator. Since the node was delayed, there were some unresolved
20153 // dependencies at the moment when the stub instruction was emitted. In a case
20154 // when any of these dependencies turns out to be an operand of another PHI
20155 // coming from this same block, the position of the stub instruction becomes
20156 // invalid, because the source vector that is supposed to feed this gather node
20157 // was inserted at the end of the block [after the stub instruction]. So we
20158 // need to adjust the insertion point again to the end of the block.
20159 if (isa<PHINode>(UserI) ||
20160 (TE->UserTreeIndex.UserTE->hasState() &&
20161 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20162 // Insert before all users.
20163 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20164 for (User *U : PrevVec->users()) {
20165 if (U == UserI)
20166 continue;
20167 auto *UI = dyn_cast<Instruction>(U);
20168 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20169 continue;
20170 if (UI->comesBefore(InsertPt))
20171 InsertPt = UI;
20172 }
20173 Builder.SetInsertPoint(InsertPt);
20174 } else {
20175 Builder.SetInsertPoint(PrevVec);
20176 }
20177 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20178 Value *Vec = vectorizeTree(TE);
20179 if (auto *VecI = dyn_cast<Instruction>(Vec);
20180 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20181 Builder.GetInsertPoint()->comesBefore(VecI))
20182 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20183 Builder.GetInsertPoint());
20184 if (Vec->getType() != PrevVec->getType()) {
20185 assert(Vec->getType()->isIntOrIntVectorTy() &&
20186 PrevVec->getType()->isIntOrIntVectorTy() &&
20187 "Expected integer vector types only.");
20188 std::optional<bool> IsSigned;
20189 for (Value *V : TE->Scalars) {
20190 if (isVectorized(V)) {
20191 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20192 auto It = MinBWs.find(MNTE);
20193 if (It != MinBWs.end()) {
20194 IsSigned = IsSigned.value_or(false) || It->second.second;
20195 if (*IsSigned)
20196 break;
20197 }
20198 }
20199 if (IsSigned.value_or(false))
20200 break;
20201 // Scan through gather nodes.
20202 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20203 auto It = MinBWs.find(BVE);
20204 if (It != MinBWs.end()) {
20205 IsSigned = IsSigned.value_or(false) || It->second.second;
20206 if (*IsSigned)
20207 break;
20208 }
20209 }
20210 if (IsSigned.value_or(false))
20211 break;
20212 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20213 IsSigned =
20214 IsSigned.value_or(false) ||
20215 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20216 continue;
20217 }
20218 if (IsSigned.value_or(false))
20219 break;
20220 }
20221 }
20222 if (IsSigned.value_or(false)) {
20223 // Final attempt - check user node.
20224 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20225 if (It != MinBWs.end())
20226 IsSigned = It->second.second;
20227 }
20228 assert(IsSigned &&
20229 "Expected user node or perfect diamond match in MinBWs.");
20230 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20231 }
20232 PrevVec->replaceAllUsesWith(Vec);
20233 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20234 // Replace the stub vector node, if it was used before for one of the
20235 // buildvector nodes already.
20236 auto It = PostponedValues.find(PrevVec);
20237 if (It != PostponedValues.end()) {
20238 for (TreeEntry *VTE : It->getSecond())
20239 VTE->VectorizedValue = Vec;
20240 }
20241 eraseInstruction(PrevVec);
20242 }
20243
20244 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20245 << " values .\n");
20246
20247 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20248 // Maps vector instruction to original insertelement instruction
20249 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20250 // Maps extract Scalar to the corresponding extractelement instruction in the
20251 // basic block. Only one extractelement per block should be emitted.
20252 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20253 ScalarToEEs;
20254 SmallDenseSet<Value *, 4> UsedInserts;
20255 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20256 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20257 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20258 // Extract all of the elements with the external uses.
20259 for (const auto &ExternalUse : ExternalUses) {
20260 Value *Scalar = ExternalUse.Scalar;
20261 llvm::User *User = ExternalUse.User;
20262
20263 // Skip users that we already RAUW. This happens when one instruction
20264 // has multiple uses of the same value.
20265 if (User && !is_contained(Scalar->users(), User))
20266 continue;
20267 const TreeEntry *E = &ExternalUse.E;
20268 assert(E && "Invalid scalar");
20269 assert(!E->isGather() && "Extracting from a gather list");
20270 // Non-instruction pointers are not deleted, just skip them.
20271 if (E->getOpcode() == Instruction::GetElementPtr &&
20272 !isa<GetElementPtrInst>(Scalar))
20273 continue;
20274
20275 Value *Vec = E->VectorizedValue;
20276 assert(Vec && "Can't find vectorizable value");
20277
20278 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20279 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20280 if (Scalar->getType() != Vec->getType()) {
20281 Value *Ex = nullptr;
20282 Value *ExV = nullptr;
20283 auto *Inst = dyn_cast<Instruction>(Scalar);
20284 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20285 auto It = ScalarToEEs.find(Scalar);
20286 if (It != ScalarToEEs.end()) {
20287 // No need to emit many extracts, just move the only one in the
20288 // current block.
20289 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20290 : Builder.GetInsertBlock());
20291 if (EEIt != It->second.end()) {
20292 Value *PrevV = EEIt->second.first;
20293 if (auto *I = dyn_cast<Instruction>(PrevV);
20294 I && !ReplaceInst &&
20295 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20296 Builder.GetInsertPoint()->comesBefore(I)) {
20297 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20298 Builder.GetInsertPoint());
20299 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20300 CI->moveAfter(I);
20301 }
20302 Ex = PrevV;
20303 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20304 }
20305 }
20306 if (!Ex) {
20307 // "Reuse" the existing extract to improve final codegen.
20308 if (ReplaceInst) {
20309 // Leave the instruction as is if extracting is cheaper and all
20310 // operands are scalar.
20311 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20312 IgnoredExtracts.insert(EE);
20313 Ex = EE;
20314 } else {
20315 auto *CloneInst = Inst->clone();
20316 CloneInst->insertBefore(Inst->getIterator());
20317 if (Inst->hasName())
20318 CloneInst->takeName(Inst);
20319 Ex = CloneInst;
20320 }
20321 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20322 ES && isa<Instruction>(Vec)) {
20323 Value *V = ES->getVectorOperand();
20324 auto *IVec = cast<Instruction>(Vec);
20325 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20326 V = ETEs.front()->VectorizedValue;
20327 if (auto *IV = dyn_cast<Instruction>(V);
20328 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20329 IV->comesBefore(IVec))
20330 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20331 else
20332 Ex = Builder.CreateExtractElement(Vec, Lane);
20333 } else if (auto *VecTy =
20334 dyn_cast<FixedVectorType>(Scalar->getType())) {
20335 assert(SLPReVec && "FixedVectorType is not expected.");
20336 unsigned VecTyNumElements = VecTy->getNumElements();
20337 // When REVEC is enabled, we need to extract a vector.
20338 // Note: The element size of Scalar may be different from the
20339 // element size of Vec.
20340 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20341 ExternalUse.Lane * VecTyNumElements);
20342 } else {
20343 Ex = Builder.CreateExtractElement(Vec, Lane);
20344 }
20345 // If necessary, sign-extend or zero-extend ScalarRoot
20346 // to the larger type.
20347 ExV = Ex;
20348 if (Scalar->getType() != Ex->getType())
20349 ExV = Builder.CreateIntCast(
20350 Ex, Scalar->getType(),
20351 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20352 auto *I = dyn_cast<Instruction>(Ex);
20353 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20354 : &F->getEntryBlock(),
20355 std::make_pair(Ex, ExV));
20356 }
20357 // The then-branch of the previous if may produce constants, since
20358 // operand 0 might be a constant.
20359 if (auto *ExI = dyn_cast<Instruction>(Ex);
20360 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20361 GatherShuffleExtractSeq.insert(ExI);
20362 CSEBlocks.insert(ExI->getParent());
20363 }
20364 return ExV;
20365 }
20366 assert(isa<FixedVectorType>(Scalar->getType()) &&
20367 isa<InsertElementInst>(Scalar) &&
20368 "In-tree scalar of vector type is not insertelement?");
20369 auto *IE = cast<InsertElementInst>(Scalar);
20370 VectorToInsertElement.try_emplace(Vec, IE);
20371 return Vec;
20372 };
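// In the common case this lambda boils down to a single
//   %e = extractelement <4 x i16> %vec, i32 <lane>
// plus, when the node was demoted via MinBWs, a sign- or zero-extension of %e
// back to the original scalar type; the result is cached per basic block in
// ScalarToEEs so repeated external uses share one extract.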
20373 // If User == nullptr, the Scalar remains as scalar in vectorized
20374 // instructions or is used as extra arg. Generate ExtractElement instruction
20375 // and update the record for this scalar in ExternallyUsedValues.
20376 if (!User) {
20377 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20378 continue;
20379 assert(
20380 (ExternallyUsedValues.count(Scalar) ||
20381 ExternalUsesWithNonUsers.count(Scalar) ||
20382 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20383 any_of(
20384 Scalar->users(),
20385 [&, TTI = TTI](llvm::User *U) {
20386 if (ExternalUsesAsOriginalScalar.contains(U))
20387 return true;
20388 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20389 return !UseEntries.empty() &&
20390 (E->State == TreeEntry::Vectorize ||
20391 E->State == TreeEntry::StridedVectorize ||
20392 E->State == TreeEntry::CompressVectorize) &&
20393 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20394 return (UseEntry->State == TreeEntry::Vectorize ||
20395 UseEntry->State ==
20396 TreeEntry::StridedVectorize ||
20397 UseEntry->State ==
20398 TreeEntry::CompressVectorize) &&
20399 doesInTreeUserNeedToExtract(
20400 Scalar, getRootEntryInstruction(*UseEntry),
20401 TLI, TTI);
20402 });
20403 })) &&
20404 "Scalar with nullptr User must be registered in "
20405 "ExternallyUsedValues map or remain as scalar in vectorized "
20406 "instructions");
20407 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20408 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20409 if (PHI->getParent()->isLandingPad())
20410 Builder.SetInsertPoint(
20411 PHI->getParent(),
20412 std::next(
20413 PHI->getParent()->getLandingPadInst()->getIterator()));
20414 else
20415 Builder.SetInsertPoint(PHI->getParent(),
20416 PHI->getParent()->getFirstNonPHIIt());
20417 } else {
20418 Builder.SetInsertPoint(VecI->getParent(),
20419 std::next(VecI->getIterator()));
20420 }
20421 } else {
20422 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20423 }
20424 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20425 // Required to update internally referenced instructions.
20426 if (Scalar != NewInst) {
20427 assert((!isa<ExtractElementInst>(Scalar) ||
20428 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20429 "Extractelements should not be replaced.");
20430 Scalar->replaceAllUsesWith(NewInst);
20431 }
20432 continue;
20433 }
20434
20435 if (auto *VU = dyn_cast<InsertElementInst>(User);
20436 VU && VU->getOperand(1) == Scalar) {
20437 // Skip if the scalar is another vector op or Vec is not an instruction.
20438 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20439 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20440 if (!UsedInserts.insert(VU).second)
20441 continue;
20442 // Need to use original vector, if the root is truncated.
20443 auto BWIt = MinBWs.find(E);
20444 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20445 auto *ScalarTy = FTy->getElementType();
20446 auto Key = std::make_pair(Vec, ScalarTy);
20447 auto VecIt = VectorCasts.find(Key);
20448 if (VecIt == VectorCasts.end()) {
20449 IRBuilderBase::InsertPointGuard Guard(Builder);
20450 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20451 if (IVec->getParent()->isLandingPad())
20452 Builder.SetInsertPoint(IVec->getParent(),
20453 std::next(IVec->getParent()
20454 ->getLandingPadInst()
20455 ->getIterator()));
20456 else
20457 Builder.SetInsertPoint(
20458 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20459 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20460 Builder.SetInsertPoint(IVec->getNextNode());
20461 }
20462 Vec = Builder.CreateIntCast(
20463 Vec,
20464 getWidenedType(
20465 ScalarTy,
20466 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20467 BWIt->second.second);
20468 VectorCasts.try_emplace(Key, Vec);
20469 } else {
20470 Vec = VecIt->second;
20471 }
20472 }
20473
20474 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20475 if (InsertIdx) {
20476 auto *It = find_if(
20477 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20478 // Checks if 2 insertelements are from the same buildvector.
20479 InsertElementInst *VecInsert = Data.InsertElements.front();
20480 return areTwoInsertFromSameBuildVector(
20481 VU, VecInsert,
20482 [](InsertElementInst *II) { return II->getOperand(0); });
20483 });
20484 unsigned Idx = *InsertIdx;
20485 if (It == ShuffledInserts.end()) {
20486 (void)ShuffledInserts.emplace_back();
20487 It = std::next(ShuffledInserts.begin(),
20488 ShuffledInserts.size() - 1);
20489 }
20490 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20491 if (Mask.empty())
20492 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20493 Mask[Idx] = ExternalUse.Lane;
20494 It->InsertElements.push_back(cast<InsertElementInst>(User));
20495 continue;
20496 }
20497 }
20498 }
20499 }
20500
20501 // Generate extracts for out-of-tree users.
20502 // Find the insertion point for the extractelement lane.
20503 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20504 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20505 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20506 if (PH->getIncomingValue(I) == Scalar) {
20507 Instruction *IncomingTerminator =
20508 PH->getIncomingBlock(I)->getTerminator();
20509 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20510 Builder.SetInsertPoint(VecI->getParent(),
20511 std::next(VecI->getIterator()));
20512 } else {
20513 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20514 }
20515 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20516 PH->setOperand(I, NewInst);
20517 }
20518 }
20519 } else {
20520 Builder.SetInsertPoint(cast<Instruction>(User));
20521 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20522 User->replaceUsesOfWith(Scalar, NewInst);
20523 }
20524 } else {
20525 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20526 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20527 User->replaceUsesOfWith(Scalar, NewInst);
20528 }
20529
20530 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20531 }
20532
20533 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20534 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20535 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20536 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20537 for (int I = 0, E = Mask.size(); I < E; ++I) {
20538 if (Mask[I] < VF)
20539 CombinedMask1[I] = Mask[I];
20540 else
20541 CombinedMask2[I] = Mask[I] - VF;
20542 }
20543 ShuffleInstructionBuilder ShuffleBuilder(
20544 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20545 ShuffleBuilder.add(V1, CombinedMask1);
20546 if (V2)
20547 ShuffleBuilder.add(V2, CombinedMask2);
20548 return ShuffleBuilder.finalize({}, {}, {});
20549 };
20550
20551 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20552 bool ForSingleMask) {
20553 unsigned VF = Mask.size();
20554 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20555 if (VF != VecVF) {
20556 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20557 Vec = CreateShuffle(Vec, nullptr, Mask);
20558 return std::make_pair(Vec, true);
20559 }
20560 if (!ForSingleMask) {
20561 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20562 for (unsigned I = 0; I < VF; ++I) {
20563 if (Mask[I] != PoisonMaskElem)
20564 ResizeMask[Mask[I]] = Mask[I];
20565 }
20566 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20567 }
20568 }
20569
20570 return std::make_pair(Vec, false);
20571 };
20572 // Perform shuffling of the vectorized tree entries for better handling of
20573 // external extracts.
20574 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20575 // Find the first and the last instruction in the list of insertelements.
20576 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20577 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20578 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20579 Builder.SetInsertPoint(LastInsert);
20580 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20581 Value *NewInst = performExtractsShuffleAction<Value>(
20582 MutableArrayRef(Vector.data(), Vector.size()),
20583 FirstInsert->getOperand(0),
20584 [](Value *Vec) {
20585 return cast<VectorType>(Vec->getType())
20586 ->getElementCount()
20587 .getKnownMinValue();
20588 },
20589 ResizeToVF,
20590 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20591 ArrayRef<Value *> Vals) {
20592 assert((Vals.size() == 1 || Vals.size() == 2) &&
20593 "Expected exactly 1 or 2 input values.");
20594 if (Vals.size() == 1) {
20595 // Do not create shuffle if the mask is a simple identity
20596 // non-resizing mask.
20597 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20598 ->getNumElements() ||
20599 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20600 return CreateShuffle(Vals.front(), nullptr, Mask);
20601 return Vals.front();
20602 }
20603 return CreateShuffle(Vals.front() ? Vals.front()
20604 : FirstInsert->getOperand(0),
20605 Vals.back(), Mask);
20606 });
20607 auto It = ShuffledInserts[I].InsertElements.rbegin();
20608 // Rebuild buildvector chain.
20609 InsertElementInst *II = nullptr;
20610 if (It != ShuffledInserts[I].InsertElements.rend())
20611 II = *It;
20612 SmallVector<Instruction *> Inserts;
20613 while (It != ShuffledInserts[I].InsertElements.rend()) {
20614 assert(II && "Must be an insertelement instruction.");
20615 if (*It == II)
20616 ++It;
20617 else
20618 Inserts.push_back(cast<Instruction>(II));
20619 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20620 }
20621 for (Instruction *II : reverse(Inserts)) {
20622 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20623 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20624 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20625 II->moveAfter(NewI);
20626 NewInst = II;
20627 }
20628 LastInsert->replaceAllUsesWith(NewInst);
20629 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20630 IE->replaceUsesOfWith(IE->getOperand(0),
20631 PoisonValue::get(IE->getOperand(0)->getType()));
20632 IE->replaceUsesOfWith(IE->getOperand(1),
20633 PoisonValue::get(IE->getOperand(1)->getType()));
20634 eraseInstruction(IE);
20635 }
20636 CSEBlocks.insert(LastInsert->getParent());
20637 }
20638
20639 SmallVector<Instruction *> RemovedInsts;
20640 // For each vectorized value:
20641 for (auto &TEPtr : VectorizableTree) {
20642 TreeEntry *Entry = TEPtr.get();
20643
20644 // No need to handle users of gathered values.
20645 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20646 continue;
20647
20648 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20649
20650 // For each lane:
20651 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20652 Value *Scalar = Entry->Scalars[Lane];
20653
20654 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20655 !isa<GetElementPtrInst>(Scalar))
20656 continue;
20657 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20658 EE && IgnoredExtracts.contains(EE))
20659 continue;
20660 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20661 continue;
20662#ifndef NDEBUG
20663 Type *Ty = Scalar->getType();
20664 if (!Ty->isVoidTy()) {
20665 for (User *U : Scalar->users()) {
20666 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20667
20668 // It is legal to delete users in the ignorelist.
20669 assert((isVectorized(U) ||
20670 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20671 (isa_and_nonnull<Instruction>(U) &&
20672 isDeleted(cast<Instruction>(U)))) &&
20673 "Deleting out-of-tree value");
20674 }
20675 }
20676#endif
20677 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20678 auto *I = cast<Instruction>(Scalar);
20679 RemovedInsts.push_back(I);
20680 }
20681 }
20682
20683 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20684 // new vector instruction.
20685 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20686 V->mergeDIAssignID(RemovedInsts);
20687
20688 // Clear up reduction references, if any.
20689 if (UserIgnoreList) {
20690 for (Instruction *I : RemovedInsts) {
20691 const TreeEntry *IE = getTreeEntries(I).front();
20692 if (IE->Idx != 0 &&
20693 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20694 (ValueToGatherNodes.lookup(I).contains(
20695 VectorizableTree.front().get()) ||
20696 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20697 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20698 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20699 IE->UserTreeIndex &&
20700 is_contained(VectorizableTree.front()->Scalars, I)) &&
20701 !(GatheredLoadsEntriesFirst.has_value() &&
20702 IE->Idx >= *GatheredLoadsEntriesFirst &&
20703 VectorizableTree.front()->isGather() &&
20704 is_contained(VectorizableTree.front()->Scalars, I)) &&
20705 !(!VectorizableTree.front()->isGather() &&
20706 VectorizableTree.front()->isCopyableElement(I)))
20707 continue;
20708 SmallVector<SelectInst *> LogicalOpSelects;
20709 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20710 // Do not replace condition of the logical op in form select <cond>.
20711 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20712 (match(U.getUser(), m_LogicalAnd()) ||
20713 match(U.getUser(), m_LogicalOr())) &&
20714 U.getOperandNo() == 0;
20715 if (IsPoisoningLogicalOp) {
20716 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20717 return false;
20718 }
20719 return UserIgnoreList->contains(U.getUser());
20720 });
20721 // Replace conditions of the poisoning logical ops with the non-poison
20722 // constant value.
20723 for (SelectInst *SI : LogicalOpSelects)
20724 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20725 }
20726 }
20727 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20728 // cache correctness.
20729 // NOTE: removeInstructionsAndOperands only marks the instructions for
20730 // deletion - they are not actually deleted until later.
20731 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20732
20733 Builder.ClearInsertionPoint();
20734 InstrElementSize.clear();
20735
20736 const TreeEntry &RootTE = *VectorizableTree.front();
20737 Value *Vec = RootTE.VectorizedValue;
20738 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20739 It != MinBWs.end() &&
20740 ReductionBitWidth != It->second.first) {
20741 IRBuilder<>::InsertPointGuard Guard(Builder);
20742 Builder.SetInsertPoint(ReductionRoot->getParent(),
20743 ReductionRoot->getIterator());
20744 Vec = Builder.CreateIntCast(
20745 Vec,
20746 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20747 cast<VectorType>(Vec->getType())->getElementCount()),
20748 It->second.second);
20749 }
20750 return Vec;
20751}
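// Editor's illustrative sketch (not part of the original source): the cast
// above adjusts the root vector to the bit width the reduction expects when
// MinBWs recorded a narrower type. A stand-alone analogue using only
// documented IRBuilder API; the helper name is invented for illustration.
static Value *castVecToReductionWidthSketch(IRBuilderBase &Builder, Value *Vec,
                                            unsigned ReductionBitWidth,
                                            bool IsSigned) {
  auto *VecTy = cast<VectorType>(Vec->getType());
  auto *DestTy = VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                                 VecTy->getElementCount());
  // CreateIntCast emits sext, zext or trunc (or nothing) as required.
  return Builder.CreateIntCast(Vec, DestTy, IsSigned);
}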
20752
20753void BoUpSLP::optimizeGatherSequence() {
20754 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20755 << " gather sequences instructions.\n");
20756 // LICM InsertElementInst sequences.
20757 for (Instruction *I : GatherShuffleExtractSeq) {
20758 if (isDeleted(I))
20759 continue;
20760
20761 // Check if this block is inside a loop.
20762 Loop *L = LI->getLoopFor(I->getParent());
20763 if (!L)
20764 continue;
20765
20766 // Check if it has a preheader.
20767 BasicBlock *PreHeader = L->getLoopPreheader();
20768 if (!PreHeader)
20769 continue;
20770
20771 // If the vector or the element that we insert into it are
20772 // instructions that are defined in this basic block then we can't
20773 // hoist this instruction.
20774 if (any_of(I->operands(), [L](Value *V) {
20775 auto *OpI = dyn_cast<Instruction>(V);
20776 return OpI && L->contains(OpI);
20777 }))
20778 continue;
20779
20780 // We can hoist this instruction. Move it to the pre-header.
20781 I->moveBefore(PreHeader->getTerminator()->getIterator());
20782 CSEBlocks.insert(PreHeader);
20783 }
20784
20785 // Make a list of all reachable blocks in our CSE queue.
20786 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20787 CSEWorkList.reserve(CSEBlocks.size());
20788 for (BasicBlock *BB : CSEBlocks)
20789 if (DomTreeNode *N = DT->getNode(BB)) {
20790 assert(DT->isReachableFromEntry(N));
20791 CSEWorkList.push_back(N);
20792 }
20793
20794 // Sort blocks by domination. This ensures we visit a block after all blocks
20795 // dominating it are visited.
20796 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20797 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20798 "Different nodes should have different DFS numbers");
20799 return A->getDFSNumIn() < B->getDFSNumIn();
20800 });
20801
20802 // Less defined shuffles can be replaced by the more defined copies.
20803 // Between two shuffles one is less defined if it has the same vector operands
20804 // and its mask indices are the same as in the first one or undefs. E.g.
20805 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20806 // poison, <0, 0, 0, 0>.
20807 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20808 Instruction *I2,
20809 SmallVectorImpl<int> &NewMask) {
20810 if (I1->getType() != I2->getType())
20811 return false;
20812 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20813 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20814 if (!SI1 || !SI2)
20815 return I1->isIdenticalTo(I2);
20816 if (SI1->isIdenticalTo(SI2))
20817 return true;
20818 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20819 if (SI1->getOperand(I) != SI2->getOperand(I))
20820 return false;
20821 // Check if the second instruction is more defined than the first one.
20822 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20823 ArrayRef<int> SM1 = SI1->getShuffleMask();
20824 // Count trailing undefs in the mask to check the final number of used
20825 // registers.
20826 unsigned LastUndefsCnt = 0;
20827 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20828 if (SM1[I] == PoisonMaskElem)
20829 ++LastUndefsCnt;
20830 else
20831 LastUndefsCnt = 0;
20832 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20833 NewMask[I] != SM1[I])
20834 return false;
20835 if (NewMask[I] == PoisonMaskElem)
20836 NewMask[I] = SM1[I];
20837 }
20838 // Check if the last undefs actually change the final number of used vector
20839 // registers.
20840 return SM1.size() - LastUndefsCnt > 1 &&
20841 ::getNumberOfParts(*TTI, SI1->getType()) ==
20842 ::getNumberOfParts(
20843 *TTI, getWidenedType(SI1->getType()->getElementType(),
20844 SM1.size() - LastUndefsCnt));
20845 };
20846 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20847 // instructions. TODO: We can further optimize this scan if we split the
20848 // instructions into different buckets based on the insert lane.
20849 SmallVector<Instruction *, 16> Visited;
20850 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20851 assert(*I &&
20852 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20853 "Worklist not sorted properly!");
20854 BasicBlock *BB = (*I)->getBlock();
20855 // For all instructions in blocks containing gather sequences:
20856 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20857 if (isDeleted(&In))
20858 continue;
20859 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20860 !GatherShuffleExtractSeq.contains(&In))
20861 continue;
20862
20863 // Check if we can replace this instruction with any of the
20864 // visited instructions.
20865 bool Replaced = false;
20866 for (Instruction *&V : Visited) {
20867 SmallVector<int> NewMask;
20868 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20869 DT->dominates(V->getParent(), In.getParent())) {
20870 In.replaceAllUsesWith(V);
20871 eraseInstruction(&In);
20872 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20873 if (!NewMask.empty())
20874 SI->setShuffleMask(NewMask);
20875 Replaced = true;
20876 break;
20877 }
20878 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20879 GatherShuffleExtractSeq.contains(V) &&
20880 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20881 DT->dominates(In.getParent(), V->getParent())) {
20882 In.moveAfter(V);
20883 V->replaceAllUsesWith(&In);
20884 eraseInstruction(V);
20885 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20886 if (!NewMask.empty())
20887 SI->setShuffleMask(NewMask);
20888 V = &In;
20889 Replaced = true;
20890 break;
20891 }
20892 }
20893 if (!Replaced) {
20894 assert(!is_contained(Visited, &In));
20895 Visited.push_back(&In);
20896 }
20897 }
20898 }
20899 CSEBlocks.clear();
20900 GatherShuffleExtractSeq.clear();
20901}
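// Editor's illustrative sketch (not part of the original source): the mask
// merging performed by IsIdenticalOrLessDefined above, reduced to its core.
// A lane that is poison in one mask may take its value from the other mask;
// if two defined lanes disagree, neither shuffle subsumes the other. The
// helper name is invented for illustration.
static bool mergeShuffleMasksSketch(ArrayRef<int> OtherMask,
                                    MutableArrayRef<int> Mask) {
  assert(OtherMask.size() == Mask.size() && "mask size mismatch");
  for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
    if (Mask[I] != PoisonMaskElem && OtherMask[I] != PoisonMaskElem &&
        Mask[I] != OtherMask[I])
      return false; // Defined lanes disagree, e.g. <0,1> vs. <0,2>.
    if (Mask[I] == PoisonMaskElem)
      Mask[I] = OtherMask[I];
  }
  // Example: <0,-1,2,-1> merged into <-1,1,2,-1> yields <0,1,2,-1>.
  return true;
}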
20902
20903BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20904 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20905 auto &BundlePtr =
20906 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20907 for (Value *V : VL) {
20908 if (S.isNonSchedulable(V))
20909 continue;
20910 auto *I = cast<Instruction>(V);
20911 if (S.isCopyableElement(V)) {
20912 // Add a copyable element model.
20913 ScheduleCopyableData &SD =
20914 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20915 // Group the instructions to a bundle.
20916 BundlePtr->add(&SD);
20917 continue;
20918 }
20919 ScheduleData *BundleMember = getScheduleData(V);
20920 assert(BundleMember && "no ScheduleData for bundle member "
20921 "(maybe not in same basic block)");
20922 // Group the instructions to a bundle.
20923 BundlePtr->add(BundleMember);
20924 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20925 BundlePtr.get());
20926 }
20927 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20928 return *BundlePtr;
20929}
20930
20931 // Groups the instructions into a bundle (which is then a single scheduling
20932 // entity) and schedules instructions until the bundle gets ready.
20933std::optional<BoUpSLP::ScheduleBundle *>
20934BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20935 const InstructionsState &S,
20936 const EdgeInfo &EI) {
20937 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20938 // instructions.
20939 if (isa<PHINode>(S.getMainOp()) ||
20940 isVectorLikeInstWithConstOps(S.getMainOp()))
20941 return nullptr;
20942 bool HasCopyables = S.areInstructionsWithCopyableElements();
20943 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
20944 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
20945 // If all operands were replaced by copyables, the operands of this node
20946 // might not have been, so we need to recalculate dependencies for the
20947 // schedule data that was replaced by copyable schedule data.
20948 SmallVector<ScheduleData *> ControlDependentMembers;
20949 for (Value *V : VL) {
20950 auto *I = dyn_cast<Instruction>(V);
20951 if (!I || (HasCopyables && S.isCopyableElement(V)))
20952 continue;
20953 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20954 for (const Use &U : I->operands()) {
20955 unsigned &NumOps =
20956 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
20957 .first->getSecond();
20958 ++NumOps;
20959 if (auto *Op = dyn_cast<Instruction>(U.get());
20960 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
20961 if (ScheduleData *OpSD = getScheduleData(Op);
20962 OpSD && OpSD->hasValidDependencies()) {
20963 OpSD->clearDirectDependencies();
20964 if (RegionHasStackSave ||
20965 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20966 ControlDependentMembers.push_back(OpSD);
20967 }
20968 }
20969 }
20970 }
20971 if (!ControlDependentMembers.empty()) {
20972 ScheduleBundle Invalid = ScheduleBundle::invalid();
20973 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
20974 ControlDependentMembers);
20975 }
20976 return nullptr;
20977 }
20978
20979 // Initialize the instruction bundle.
20980 Instruction *OldScheduleEnd = ScheduleEnd;
20981 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20982
20983 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20984 // Clear deps or recalculate the region, if the memory instruction is a
20985 // copyable. It may have memory deps, which must be recalculated.
20986 SmallVector<ScheduleData *> ControlDependentMembers;
20987 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20988 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20989 for (ScheduleEntity *SE : Bundle.getBundle()) {
20990 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20991 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20992 BundleMember && BundleMember->hasValidDependencies()) {
20993 BundleMember->clearDirectDependencies();
20994 if (RegionHasStackSave ||
20995 !isGuaranteedToTransferExecutionToSuccessor(
20996 BundleMember->getInst()))
20997 ControlDependentMembers.push_back(BundleMember);
20998 }
20999 continue;
21000 }
21001 auto *SD = cast<ScheduleData>(SE);
21002 if (SD->hasValidDependencies() &&
21003 (!S.areInstructionsWithCopyableElements() ||
21004 !S.isCopyableElement(SD->getInst())) &&
21005 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21006 EI.UserTE->hasState() &&
21007 (!EI.UserTE->hasCopyableElements() ||
21008 !EI.UserTE->isCopyableElement(SD->getInst())))
21009 SD->clearDirectDependencies();
21010 for (const Use &U : SD->getInst()->operands()) {
21011 unsigned &NumOps =
21012 UserOpToNumOps
21013 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21014 .first->getSecond();
21015 ++NumOps;
21016 if (auto *Op = dyn_cast<Instruction>(U.get());
21017 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21018 *SLP, NumOps)) {
21019 if (ScheduleData *OpSD = getScheduleData(Op);
21020 OpSD && OpSD->hasValidDependencies()) {
21021 OpSD->clearDirectDependencies();
21022 if (RegionHasStackSave ||
21023 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
21024 ControlDependentMembers.push_back(OpSD);
21025 }
21026 }
21027 }
21028 }
21029 };
21030 // The scheduling region got new instructions at the lower end (or it is a
21031 // new region for the first bundle). This makes it necessary to
21032 // recalculate all dependencies.
21033 // It is seldom that this needs to be done a second time after adding the
21034 // initial bundle to the region.
21035 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21036 for_each(ScheduleDataMap, [&](auto &P) {
21037 if (BB != P.first->getParent())
21038 return;
21039 ScheduleData *SD = P.second;
21040 if (isInSchedulingRegion(*SD))
21041 SD->clearDependencies();
21042 });
21043 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21044 for_each(P.second, [&](ScheduleCopyableData *SD) {
21045 if (isInSchedulingRegion(*SD))
21046 SD->clearDependencies();
21047 });
21048 });
21049 ReSchedule = true;
21050 }
21051 // Check if the bundle data already has deps for copyable elements. In
21052 // this case we need to reset the deps and recalculate them.
21053 if (Bundle && !Bundle.getBundle().empty()) {
21054 if (S.areInstructionsWithCopyableElements() ||
21055 !ScheduleCopyableDataMap.empty())
21056 CheckIfNeedToClearDeps(Bundle);
21057 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21058 << BB->getName() << "\n");
21059 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21060 ControlDependentMembers);
21061 } else if (!ControlDependentMembers.empty()) {
21062 ScheduleBundle Invalid = ScheduleBundle::invalid();
21063 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21064 ControlDependentMembers);
21065 }
21066
21067 if (ReSchedule) {
21068 resetSchedule();
21069 initialFillReadyList(ReadyInsts);
21070 }
21071
21072 // Now try to schedule the new bundle or (if no bundle) just calculate
21073 // dependencies. As soon as the bundle is "ready" it means that there are no
21074 // cyclic dependencies and we can schedule it. Note that it's important that we
21075 // don't "schedule" the bundle yet.
21076 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21077 !ReadyInsts.empty()) {
21078 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21079 assert(Picked->isReady() && "must be ready to schedule");
21080 schedule(*SLP, S, EI, Picked, ReadyInsts);
21081 if (Picked == &Bundle)
21082 break;
21083 }
21084 };
21085
21086 // Make sure that the scheduling region contains all
21087 // instructions of the bundle.
21088 for (Value *V : VL) {
21089 if (S.isNonSchedulable(V))
21090 continue;
21091 if (!extendSchedulingRegion(V, S)) {
21092 // If the scheduling region got new instructions at the lower end (or it
21093 // is a new region for the first bundle), all dependencies must be
21094 // recalculated.
21095 // Otherwise the compiler may crash trying to calculate dependencies
21096 // incorrectly and emit instructions in the wrong order during the actual
21097 // scheduling.
21098 ScheduleBundle Invalid = ScheduleBundle::invalid();
21099 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21100 return std::nullopt;
21101 }
21102 }
21103
21104 bool ReSchedule = false;
21105 for (Value *V : VL) {
21106 if (S.isNonSchedulable(V))
21107 continue;
21108 SmallVector<ScheduleCopyableData *> CopyableData =
21109 getScheduleCopyableData(cast<Instruction>(V));
21110 if (!CopyableData.empty()) {
21111 for (ScheduleCopyableData *SD : CopyableData)
21112 ReadyInsts.remove(SD);
21113 }
21114 ScheduleData *BundleMember = getScheduleData(V);
21115 assert((BundleMember || S.isCopyableElement(V)) &&
21116 "no ScheduleData for bundle member (maybe not in same basic block)");
21117 if (!BundleMember)
21118 continue;
21119
21120 // Make sure we don't leave the pieces of the bundle in the ready list when
21121 // the whole bundle might not be ready.
21122 ReadyInsts.remove(BundleMember);
21123 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21124 !Bundles.empty()) {
21125 for (ScheduleBundle *B : Bundles)
21126 ReadyInsts.remove(B);
21127 }
21128
21129 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21130 continue;
21131 // A bundle member was scheduled as a single instruction before and now
21132 // needs to be scheduled as part of the bundle. We just get rid of the
21133 // existing schedule.
21134 // A bundle member had its deps calculated before it became a copyable
21135 // element - we need to reschedule.
21136 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21137 << " was already scheduled\n");
21138 ReSchedule = true;
21139 }
21140
21141 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21142 TryScheduleBundleImpl(ReSchedule, Bundle);
21143 if (!Bundle.isReady()) {
21144 for (ScheduleEntity *BD : Bundle.getBundle()) {
21145 // Copyable data scheduling is just removed.
21146 if (isa<ScheduleCopyableData>(BD))
21147 continue;
21148 if (BD->isReady()) {
21149 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21150 if (Bundles.empty()) {
21151 ReadyInsts.insert(BD);
21152 continue;
21153 }
21154 for (ScheduleBundle *B : Bundles)
21155 if (B->isReady())
21156 ReadyInsts.insert(B);
21157 }
21158 }
21159 ScheduledBundlesList.pop_back();
21160 SmallVector<ScheduleData *> ControlDependentMembers;
21161 SmallPtrSet<Instruction *, 4> Visited;
21162 for (Value *V : VL) {
21163 if (S.isNonSchedulable(V))
21164 continue;
21165 auto *I = cast<Instruction>(V);
21166 if (S.isCopyableElement(I)) {
21167 // Remove the copyable data from the scheduling region and restore
21168 // previous mappings.
21169 auto KV = std::make_pair(EI, I);
21170 assert(ScheduleCopyableDataMap.contains(KV) &&
21171 "no ScheduleCopyableData for copyable element");
21172 ScheduleCopyableData *SD =
21173 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21174 ScheduleCopyableDataMapByUsers[I].remove(SD);
21175 if (EI.UserTE) {
21176 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21177 const auto *It = find(Op, I);
21178 assert(It != Op.end() && "Lane not set");
21179 SmallPtrSet<Instruction *, 4> Visited;
21180 do {
21181 int Lane = std::distance(Op.begin(), It);
21182 assert(Lane >= 0 && "Lane not set");
21183 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21184 !EI.UserTE->ReorderIndices.empty())
21185 Lane = EI.UserTE->ReorderIndices[Lane];
21186 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21187 "Couldn't find extract lane");
21188 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21189 if (!Visited.insert(In).second) {
21190 It = find(make_range(std::next(It), Op.end()), I);
21191 break;
21192 }
21193 ScheduleCopyableDataMapByInstUser
21194 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21195 .pop_back();
21196 It = find(make_range(std::next(It), Op.end()), I);
21197 } while (It != Op.end());
21198 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21199 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21200 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21201 }
21202 if (ScheduleCopyableDataMapByUsers[I].empty())
21203 ScheduleCopyableDataMapByUsers.erase(I);
21204 ScheduleCopyableDataMap.erase(KV);
21205 // Need to recalculate dependencies for the actual schedule data.
21206 if (ScheduleData *OpSD = getScheduleData(I);
21207 OpSD && OpSD->hasValidDependencies()) {
21208 OpSD->clearDirectDependencies();
21209 if (RegionHasStackSave ||
21210 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
21211 ControlDependentMembers.push_back(OpSD);
21212 }
21213 continue;
21214 }
21215 ScheduledBundles.find(I)->getSecond().pop_back();
21216 }
21217 if (!ControlDependentMembers.empty()) {
21218 ScheduleBundle Invalid = ScheduleBundle::invalid();
21219 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21220 ControlDependentMembers);
21221 }
21222 return std::nullopt;
21223 }
21224 return &Bundle;
21225}
21226
21227BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21228 // Allocate a new ScheduleData for the instruction.
21229 if (ChunkPos >= ChunkSize) {
21230 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21231 ChunkPos = 0;
21232 }
21233 return &(ScheduleDataChunks.back()[ChunkPos++]);
21234}
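// Editor's illustrative sketch (not part of the original source): the chunked
// allocation above expressed as a small template, to show the amortization
// idea. Objects are handed out from fixed-size arrays, so growing the
// scheduling region costs one heap allocation per ChunkSize requests rather
// than one per request. The class name is invented for illustration.
template <typename T, unsigned ChunkSizeT = 256> class ChunkAllocatorSketch {
  SmallVector<std::unique_ptr<T[]>> Chunks;
  unsigned Pos = ChunkSizeT; // Forces a fresh chunk on the first request.

public:
  T *allocate() {
    if (Pos >= ChunkSizeT) {
      Chunks.push_back(std::make_unique<T[]>(ChunkSizeT));
      Pos = 0;
    }
    return &Chunks.back()[Pos++];
  }
};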
21235
21236bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21237 Value *V, const InstructionsState &S) {
21238 auto *I = dyn_cast<Instruction>(V);
21239 assert(I && "bundle member must be an instruction");
21240 if (getScheduleData(I))
21241 return true;
21242 if (!ScheduleStart) {
21243 // It's the first instruction in the new region.
21244 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21245 ScheduleStart = I;
21246 ScheduleEnd = I->getNextNode();
21247 assert(ScheduleEnd && "tried to vectorize a terminator?");
21248 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21249 return true;
21250 }
21251 // Search up and down at the same time, because we don't know if the new
21252 // instruction is above or below the existing scheduling region.
21253 // Ignore debug info (and other "AssumeLike" intrinsics) so it's not counted
21254 // against the budget. Otherwise debug info could affect codegen.
21255 BasicBlock::reverse_iterator UpIter =
21256 ++ScheduleStart->getIterator().getReverse();
21257 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21258 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21259 BasicBlock::iterator LowerEnd = BB->end();
21260 auto IsAssumeLikeIntr = [](const Instruction &I) {
21261 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21262 return II->isAssumeLikeIntrinsic();
21263 return false;
21264 };
21265 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21266 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21267 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21268 &*DownIter != I) {
21269 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21270 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21271 return false;
21272 }
21273
21274 ++UpIter;
21275 ++DownIter;
21276
21277 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21278 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21279 }
21280 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21281 assert(I->getParent() == ScheduleStart->getParent() &&
21282 "Instruction is in wrong basic block.");
21283 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21284 ScheduleStart = I;
21285 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21286 << "\n");
21287 return true;
21288 }
21289 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21290 "Expected to reach top of the basic block or instruction down the "
21291 "lower end.");
21292 assert(I->getParent() == ScheduleEnd->getParent() &&
21293 "Instruction is in wrong basic block.");
21294 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21295 nullptr);
21296 ScheduleEnd = I->getNextNode();
21297 assert(ScheduleEnd && "tried to vectorize a terminator?");
21298 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21299 return true;
21300}
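// Editor's illustrative sketch (not part of the original source): the
// bidirectional walk above, modeled on plain indices. The region [Begin, End]
// grows towards Target, charging one unit of budget per simultaneous up/down
// step, so the cost is the distance to the nearer boundary, capped by the
// limit. The helper name is invented for illustration.
static std::optional<unsigned> regionGrowthStepsSketch(int Begin, int End,
                                                       int Target,
                                                       unsigned Budget) {
  for (unsigned Step = 1; Step <= Budget; ++Step)
    if (Target == Begin - static_cast<int>(Step) ||
        Target == End + static_cast<int>(Step))
      return Step; // Extend the region on the side where Target was found.
  return std::nullopt; // Analogue of exceeding ScheduleRegionSizeLimit.
}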
21301
21302void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21303 Instruction *ToI,
21304 ScheduleData *PrevLoadStore,
21305 ScheduleData *NextLoadStore) {
21306 ScheduleData *CurrentLoadStore = PrevLoadStore;
21307 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21308 // No need to allocate data for non-schedulable instructions.
21309 if (isa<PHINode>(I))
21310 continue;
21311 ScheduleData *SD = ScheduleDataMap.lookup(I);
21312 if (!SD) {
21313 SD = allocateScheduleDataChunks();
21314 ScheduleDataMap[I] = SD;
21315 }
21316 assert(!isInSchedulingRegion(*SD) &&
21317 "new ScheduleData already in scheduling region");
21318 SD->init(SchedulingRegionID, I);
21319
21320 if (I->mayReadOrWriteMemory() &&
21321 (!isa<IntrinsicInst>(I) ||
21322 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21323 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21324 Intrinsic::pseudoprobe))) {
21325 // Update the linked list of memory accessing instructions.
21326 if (CurrentLoadStore) {
21327 CurrentLoadStore->setNextLoadStore(SD);
21328 } else {
21329 FirstLoadStoreInRegion = SD;
21330 }
21331 CurrentLoadStore = SD;
21332 }
21333
21334 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21335 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21336 RegionHasStackSave = true;
21337 }
21338 if (NextLoadStore) {
21339 if (CurrentLoadStore)
21340 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21341 } else {
21342 LastLoadStoreInRegion = CurrentLoadStore;
21343 }
21344}
21345
21346void BoUpSLP::BlockScheduling::calculateDependencies(
21347 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21348 ArrayRef<ScheduleData *> ControlDeps) {
21349 SmallVector<ScheduleEntity *> WorkList;
21350 auto ProcessNode = [&](ScheduleEntity *SE) {
21351 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21352 if (CD->hasValidDependencies())
21353 return;
21354 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21355 CD->initDependencies();
21356 CD->resetUnscheduledDeps();
21357 const EdgeInfo &EI = CD->getEdgeInfo();
21358 if (EI.UserTE) {
21359 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21360 const auto *It = find(Op, CD->getInst());
21361 assert(It != Op.end() && "Lane not set");
21362 SmallPtrSet<Instruction *, 4> Visited;
21363 do {
21364 int Lane = std::distance(Op.begin(), It);
21365 assert(Lane >= 0 && "Lane not set");
21366 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21367 !EI.UserTE->ReorderIndices.empty())
21368 Lane = EI.UserTE->ReorderIndices[Lane];
21369 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21370 "Couldn't find extract lane");
21371 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21372 if (EI.UserTE->isCopyableElement(In)) {
21373 // We may not have related copyable scheduling data if the
21374 // instruction is non-schedulable.
21375 if (ScheduleCopyableData *UseSD =
21376 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21377 CD->incDependencies();
21378 if (!UseSD->isScheduled())
21379 CD->incrementUnscheduledDeps(1);
21380 if (!UseSD->hasValidDependencies() ||
21381 (InsertInReadyList && UseSD->isReady()))
21382 WorkList.push_back(UseSD);
21383 }
21384 } else if (Visited.insert(In).second) {
21385 if (ScheduleData *UseSD = getScheduleData(In)) {
21386 CD->incDependencies();
21387 if (!UseSD->isScheduled())
21388 CD->incrementUnscheduledDeps(1);
21389 if (!UseSD->hasValidDependencies() ||
21390 (InsertInReadyList && UseSD->isReady()))
21391 WorkList.push_back(UseSD);
21392 }
21393 }
21394 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21395 } while (It != Op.end());
21396 if (CD->isReady() && CD->getDependencies() == 0 &&
21397 (EI.UserTE->hasState() &&
21398 (EI.UserTE->getMainOp()->getParent() !=
21399 CD->getInst()->getParent() ||
21400 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21401 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21402 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21403 auto *IU = dyn_cast<Instruction>(U);
21404 if (!IU)
21405 return true;
21406 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21407 })))))) {
21408 // If there are no uses in the block, mark the node as having a
21409 // pseudo-use, which cannot be scheduled.
21410 // This prevents incorrect def-use tracking between an external user and
21411 // the actual instruction.
21412 CD->incDependencies();
21413 CD->incrementUnscheduledDeps(1);
21414 }
21415 }
21416 return;
21417 }
21418 auto *BundleMember = cast<ScheduleData>(SE);
21419 if (BundleMember->hasValidDependencies())
21420 return;
21421 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21422 BundleMember->initDependencies();
21423 BundleMember->resetUnscheduledDeps();
21424 // Handle def-use chain dependencies.
21425 SmallDenseMap<Value *, unsigned> UserToNumOps;
21426 for (User *U : BundleMember->getInst()->users()) {
21427 if (isa<PHINode>(U))
21428 continue;
21429 if (ScheduleData *UseSD = getScheduleData(U)) {
21430 // The operand is a copyable element - skip.
21431 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21432 ++NumOps;
21433 if (areAllOperandsReplacedByCopyableData(
21434 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21435 continue;
21436 BundleMember->incDependencies();
21437 if (!UseSD->isScheduled())
21438 BundleMember->incrementUnscheduledDeps(1);
21439 if (!UseSD->hasValidDependencies() ||
21440 (InsertInReadyList && UseSD->isReady()))
21441 WorkList.push_back(UseSD);
21442 }
21443 }
21444 for (ScheduleCopyableData *UseSD :
21445 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21446 BundleMember->incDependencies();
21447 if (!UseSD->isScheduled())
21448 BundleMember->incrementUnscheduledDeps(1);
21449 if (!UseSD->hasValidDependencies() ||
21450 (InsertInReadyList && UseSD->isReady()))
21451 WorkList.push_back(UseSD);
21452 }
21453
21454 SmallPtrSet<const Instruction *, 4> Visited;
21455 auto MakeControlDependent = [&](Instruction *I) {
21456 // Do not mark control dependent twice.
21457 if (!Visited.insert(I).second)
21458 return;
21459 auto *DepDest = getScheduleData(I);
21460 assert(DepDest && "must be in schedule window");
21461 DepDest->addControlDependency(BundleMember);
21462 BundleMember->incDependencies();
21463 if (!DepDest->isScheduled())
21464 BundleMember->incrementUnscheduledDeps(1);
21465 if (!DepDest->hasValidDependencies() ||
21466 (InsertInReadyList && DepDest->isReady()))
21467 WorkList.push_back(DepDest);
21468 };
21469
21470 // Any instruction which isn't safe to speculate at the beginning of the
21471 // block is control dependent on any early exit or non-willreturn call
21472 // which precedes it.
21473 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21474 for (Instruction *I = BundleMember->getInst()->getNextNode();
21475 I != ScheduleEnd; I = I->getNextNode()) {
21476 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21477 continue;
21478
21479 // Add the dependency
21480 MakeControlDependent(I);
21481
21483 // Everything past here must be control dependent on I.
21484 break;
21485 }
21486 }
21487
21488 if (RegionHasStackSave) {
21489 // If we have an inalloca alloca instruction, it needs to be scheduled
21490 // after any preceding stacksave. We also need to prevent any alloca
21491 // from reordering above a preceding stackrestore.
21492 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21493 match(BundleMember->getInst(),
21494 m_Intrinsic<Intrinsic::stackrestore>())) {
21495 for (Instruction *I = BundleMember->getInst()->getNextNode();
21496 I != ScheduleEnd; I = I->getNextNode()) {
21497 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21498 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21499 // Any allocas past here must be control dependent on I, and I
21500 // must be memory dependent on BundleMember->Inst.
21501 break;
21502
21503 if (!isa<AllocaInst>(I))
21504 continue;
21505
21506 // Add the dependency
21507 MakeControlDependent(I);
21508 }
21509 }
21510
21511 // In addition to the cases handled just above, we need to prevent
21512 // allocas and loads/stores from moving below a stacksave or a
21513 // stackrestore. Avoiding moving allocas below stackrestore is currently
21514 // thought to be conservatism. Moving loads/stores below a stackrestore
21515 // can lead to incorrect code.
21516 if (isa<AllocaInst>(BundleMember->getInst()) ||
21517 BundleMember->getInst()->mayReadOrWriteMemory()) {
21518 for (Instruction *I = BundleMember->getInst()->getNextNode();
21519 I != ScheduleEnd; I = I->getNextNode()) {
21520 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21521 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21522 continue;
21523
21524 // Add the dependency
21525 MakeControlDependent(I);
21526 break;
21527 }
21528 }
21529 }
21530
21531 // Handle the memory dependencies (if any).
21532 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21533 if (!NextLoadStore)
21534 return;
21535 Instruction *SrcInst = BundleMember->getInst();
21536 assert(SrcInst->mayReadOrWriteMemory() &&
21537 "NextLoadStore list for non memory effecting bundle?");
21538 MemoryLocation SrcLoc = getLocation(SrcInst);
21539 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21540 unsigned NumAliased = 0;
21541 unsigned DistToSrc = 1;
21542 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21543
21544 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21545 DepDest = DepDest->getNextLoadStore()) {
21546 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21547
21548 // We have two limits to reduce the complexity:
21549 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21550 // SLP->isAliased (which is the expensive part in this loop).
21551 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21552 // the whole loop (even if the loop is fast, it's quadratic).
21553 // It's important for the loop break condition (see below) to
21554 // check this limit even between two read-only instructions.
21555 if (DistToSrc >= MaxMemDepDistance ||
21556 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21557 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21558 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21559
21560 // We increment the counter only if the locations are aliased
21561 // (instead of counting all alias checks). This gives a better
21562 // balance between reduced runtime and accurate dependencies.
21563 NumAliased++;
21564
21565 DepDest->addMemoryDependency(BundleMember);
21566 BundleMember->incDependencies();
21567 if (!DepDest->isScheduled())
21568 BundleMember->incrementUnscheduledDeps(1);
21569 if (!DepDest->hasValidDependencies() ||
21570 (InsertInReadyList && DepDest->isReady()))
21571 WorkList.push_back(DepDest);
21572 }
21573
21574 // Example, explaining the loop break condition: Let's assume our
21575 // starting instruction is i0 and MaxMemDepDistance = 3.
21576 //
21577 // +--------v--v--v
21578 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21579 // +--------^--^--^
21580 //
21581 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21582 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21583 // Previously we already added dependencies from i3 to i6,i7,i8
21584 // (because of MaxMemDepDistance). As we added a dependency from
21585 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21586 // and we can abort this loop at i6.
21587 if (DistToSrc >= 2 * MaxMemDepDistance)
21588 break;
21589 DistToSrc++;
21590 }
21591 };
21592
21593 assert((Bundle || !ControlDeps.empty()) &&
21594 "expected at least one instruction to schedule");
21595 if (Bundle)
21596 WorkList.push_back(Bundle.getBundle().front());
21597 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21598 SmallPtrSet<ScheduleBundle *, 16> Visited;
21599 while (!WorkList.empty()) {
21600 ScheduleEntity *SD = WorkList.pop_back_val();
21601 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21602 ArrayRef<ScheduleBundle *> Bundles;
21603 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21604 CopyableBundle.push_back(&CD->getBundle());
21605 Bundles = CopyableBundle;
21606 } else {
21607 Bundles = getScheduleBundles(SD->getInst());
21608 }
21609 if (Bundles.empty()) {
21610 if (!SD->hasValidDependencies())
21611 ProcessNode(SD);
21612 if (InsertInReadyList && SD->isReady()) {
21613 ReadyInsts.insert(SD);
21614 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21615 }
21616 continue;
21617 }
21618 for (ScheduleBundle *Bundle : Bundles) {
21619 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21620 continue;
21621 assert(isInSchedulingRegion(*Bundle) &&
21622 "ScheduleData not in scheduling region");
21623 for_each(Bundle->getBundle(), ProcessNode);
21624 }
21625 if (InsertInReadyList && SD->isReady()) {
21626 for (ScheduleBundle *Bundle : Bundles) {
21627 assert(isInSchedulingRegion(*Bundle) &&
21628 "ScheduleData not in scheduling region");
21629 if (!Bundle->isReady())
21630 continue;
21631 ReadyInsts.insert(Bundle);
21632 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21633 << "\n");
21634 }
21635 }
21636 }
21637}
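// Editor's illustrative sketch (not part of the original source): the counting
// scheme the worklist above maintains, stripped of bundles, memory and control
// edges. Each node counts the in-region entities that depend on it; it becomes
// ready once that count drops to zero, and scheduling a node decrements the
// counter of every value it reads. The names are invented for illustration.
namespace {
struct ToyDepNode {
  unsigned UnscheduledUsers = 0;
  SmallVector<ToyDepNode *, 4> Operands;
  bool isReady() const { return UnscheduledUsers == 0; }
};
} // namespace

static void toySchedule(ToyDepNode &N, SmallVectorImpl<ToyDepNode *> &Ready) {
  for (ToyDepNode *Op : N.Operands)
    if (--Op->UnscheduledUsers == 0)
      Ready.push_back(Op); // Analogue of ReadyInsts.insert(...).
}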
21638
21639void BoUpSLP::BlockScheduling::resetSchedule() {
21640 assert(ScheduleStart &&
21641 "tried to reset schedule on block which has not been scheduled");
21642 for_each(ScheduleDataMap, [&](auto &P) {
21643 if (BB != P.first->getParent())
21644 return;
21645 ScheduleData *SD = P.second;
21646 if (isInSchedulingRegion(*SD)) {
21647 SD->setScheduled(/*Scheduled=*/false);
21648 SD->resetUnscheduledDeps();
21649 }
21650 });
21651 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21652 for_each(P.second, [&](ScheduleCopyableData *SD) {
21653 if (isInSchedulingRegion(*SD)) {
21654 SD->setScheduled(/*Scheduled=*/false);
21655 SD->resetUnscheduledDeps();
21656 }
21657 });
21658 });
21659 for_each(ScheduledBundles, [&](auto &P) {
21660 for_each(P.second, [&](ScheduleBundle *Bundle) {
21661 if (isInSchedulingRegion(*Bundle))
21662 Bundle->setScheduled(/*Scheduled=*/false);
21663 });
21664 });
21665 // Reset schedule data for copyable elements.
21666 for (auto &P : ScheduleCopyableDataMap) {
21667 if (isInSchedulingRegion(*P.second)) {
21668 P.second->setScheduled(/*Scheduled=*/false);
21669 P.second->resetUnscheduledDeps();
21670 }
21671 }
21672 ReadyInsts.clear();
21673}
21674
21675void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21676 if (!BS->ScheduleStart)
21677 return;
21678
21679 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21680
21681 // A key point - if we got here, pre-scheduling was able to find a valid
21682 // scheduling of the sub-graph of the scheduling window which consists
21683 // of all vector bundles and their transitive users. As such, we do not
21684 // need to reschedule anything *outside of* that subgraph.
21685
21686 BS->resetSchedule();
21687
21688 // For the real scheduling we use a more sophisticated ready-list: it is
21689 // sorted by the original instruction location. This lets the final schedule
21690 // be as close as possible to the original instruction order.
21691 // WARNING: If changing this order causes a correctness issue, that means
21692 // there is some missing dependence edge in the schedule data graph.
21693 struct ScheduleDataCompare {
21694 bool operator()(const ScheduleEntity *SD1,
21695 const ScheduleEntity *SD2) const {
21696 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21697 }
21698 };
21699 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21700
21701 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21702 // and fill the ready-list with initial instructions.
21703 int Idx = 0;
21704 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21705 I = I->getNextNode()) {
21706 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21707 if (!Bundles.empty()) {
21708 for (ScheduleBundle *Bundle : Bundles) {
21709 Bundle->setSchedulingPriority(Idx++);
21710 if (!Bundle->hasValidDependencies())
21711 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21712 }
21713 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21714 for (ScheduleCopyableData *SD : reverse(SDs)) {
21715 ScheduleBundle &Bundle = SD->getBundle();
21716 Bundle.setSchedulingPriority(Idx++);
21717 if (!Bundle.hasValidDependencies())
21718 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21719 }
21720 continue;
21721 }
21722 SmallVector<ScheduleCopyableData *> CopyableData =
21723 BS->getScheduleCopyableDataUsers(I);
21724 if (ScheduleData *SD = BS->getScheduleData(I)) {
21725 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21726 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21727 SDTEs.front()->doesNotNeedToSchedule() ||
21729 "scheduler and vectorizer bundle mismatch");
21730 SD->setSchedulingPriority(Idx++);
21731 if (!SD->hasValidDependencies() &&
21732 (!CopyableData.empty() ||
21733 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21734 assert(TE->isGather() && "expected gather node");
21735 return TE->hasState() && TE->hasCopyableElements() &&
21736 TE->isCopyableElement(I);
21737 }))) {
21738 // Need to calculate deps for these nodes to correctly handle copyable
21739 // dependencies, even if they were cancelled.
21740 // If the copyables bundle was cancelled, the deps were cleared and need
21741 // to be recalculated.
21742 ScheduleBundle Bundle;
21743 Bundle.add(SD);
21744 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21745 }
21746 }
21747 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21748 ScheduleBundle &Bundle = SD->getBundle();
21749 Bundle.setSchedulingPriority(Idx++);
21750 if (!Bundle.hasValidDependencies())
21751 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21752 }
21753 }
21754 BS->initialFillReadyList(ReadyInsts);
21755
21756 Instruction *LastScheduledInst = BS->ScheduleEnd;
21757
21758 // Do the "real" scheduling.
21759 SmallPtrSet<Instruction *, 16> Scheduled;
21760 while (!ReadyInsts.empty()) {
21761 auto *Picked = *ReadyInsts.begin();
21762 ReadyInsts.erase(ReadyInsts.begin());
21763
21764 // Move the scheduled instruction(s) to their dedicated places, if not
21765 // there yet.
21766 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21767 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21768 Instruction *PickedInst = BundleMember->getInst();
21769 // If a copyable must be scheduled as part of something else, skip it.
21770 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21771 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21772 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21773 continue;
21774 if (PickedInst->getNextNode() != LastScheduledInst)
21775 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21776 LastScheduledInst = PickedInst;
21777 }
21778 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21779 LastScheduledInst);
21780 } else {
21781 auto *SD = cast<ScheduleData>(Picked);
21782 Instruction *PickedInst = SD->getInst();
21783 if (PickedInst->getNextNode() != LastScheduledInst)
21784 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21785 LastScheduledInst = PickedInst;
21786 }
21787 auto Invalid = InstructionsState::invalid();
21788 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21789 }
21790
21791 // Check that we didn't break any of our invariants.
21792#ifdef EXPENSIVE_CHECKS
21793 BS->verify();
21794#endif
21795
21796#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21797 // Check that all schedulable entities got scheduled
21798 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21799 I = I->getNextNode()) {
21800 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21801 assert(all_of(Bundles,
21802 [](const ScheduleBundle *Bundle) {
21803 return Bundle->isScheduled();
21804 }) &&
21805 "must be scheduled at this point");
21806 }
21807#endif
21808
21809 // Avoid duplicate scheduling of the block.
21810 BS->ScheduleStart = nullptr;
21811}
21812
21813unsigned BoUpSLP::getVectorElementSize(Value *V) {
21814 // If V is a store, just return the width of the stored value (or value
21815 // truncated just before storing) without traversing the expression tree.
21816 // This is the common case.
21817 if (auto *Store = dyn_cast<StoreInst>(V))
21818 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21819
21820 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21821 return getVectorElementSize(IEI->getOperand(1));
21822
21823 auto E = InstrElementSize.find(V);
21824 if (E != InstrElementSize.end())
21825 return E->second;
21826
21827 // If V is not a store, we can traverse the expression tree to find loads
21828 // that feed it. The type of the loaded value may indicate a more suitable
21829 // width than V's type. We want to base the vector element size on the width
21830 // of memory operations where possible.
21831 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21832 SmallPtrSet<Instruction *, 16> Visited;
21833 if (auto *I = dyn_cast<Instruction>(V)) {
21834 Worklist.emplace_back(I, I->getParent(), 0);
21835 Visited.insert(I);
21836 }
21837
21838 // Traverse the expression tree in bottom-up order looking for loads. If we
21839 // encounter an instruction we don't yet handle, we give up.
21840 auto Width = 0u;
21841 Value *FirstNonBool = nullptr;
21842 while (!Worklist.empty()) {
21843 auto [I, Parent, Level] = Worklist.pop_back_val();
21844
21845 // We should only be looking at scalar instructions here. If the current
21846 // instruction has a vector type, skip.
21847 auto *Ty = I->getType();
21848 if (isa<VectorType>(Ty))
21849 continue;
21850 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21851 FirstNonBool = I;
21852 if (Level > RecursionMaxDepth)
21853 continue;
21854
21855 // If the current instruction is a load, update MaxWidth to reflect the
21856 // width of the loaded value.
21857 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
21858 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21859
21860 // Otherwise, we need to visit the operands of the instruction. We only
21861 // handle the interesting cases from buildTree here. If an operand is an
21862 // instruction we haven't yet visited and from the same basic block as the
21863 // user or the use is a PHI node, we add it to the worklist.
21864 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
21865 BinaryOperator, UnaryOperator>(I)) {
21866 for (Use &U : I->operands()) {
21867 if (auto *J = dyn_cast<Instruction>(U.get()))
21868 if (Visited.insert(J).second &&
21869 (isa<PHINode>(I) || J->getParent() == Parent)) {
21870 Worklist.emplace_back(J, J->getParent(), Level + 1);
21871 continue;
21872 }
21873 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21874 FirstNonBool = U.get();
21875 }
21876 } else {
21877 break;
21878 }
21879 }
21880
21881 // If we didn't encounter a memory access in the expression tree, or if we
21882 // gave up for some reason, just return the width of V. Otherwise, return the
21883 // maximum width we found.
21884 if (!Width) {
21885 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21886 V = FirstNonBool;
21887 Width = DL->getTypeSizeInBits(V->getType());
21888 }
21889
21890 for (Instruction *I : Visited)
21891 InstrElementSize[I] = Width;
21892
21893 return Width;
21894}
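// Editor's illustrative example (not from the original source): why the
// element size is derived from memory accesses. For
//   %a = load i8, ptr %p
//   %b = load i8, ptr %q
//   %c = zext i8 %a to i32
//   %d = zext i8 %b to i32
//   %e = add i32 %c, %d
// asking for the element size of %e walks back through the zexts to the i8
// loads and answers 8, so the vector width is based on the narrow memory type
// rather than on the i32 arithmetic type; a store short-circuits to the width
// of its stored value instead.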
21895
21896bool BoUpSLP::collectValuesToDemote(
21897 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21898 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21899 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21900 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21901 // We can always demote constants.
21902 if (all_of(E.Scalars, IsaPred<Constant>))
21903 return true;
21904
21905 unsigned OrigBitWidth =
21906 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21907 if (OrigBitWidth == BitWidth) {
21908 MaxDepthLevel = 1;
21909 return true;
21910 }
21911
21912 // Check if the node was analyzed already and must keep its original bitwidth.
21913 if (NodesToKeepBWs.contains(E.Idx))
21914 return false;
21915
21916 // If the value is not a vectorized instruction in the expression and not used
21917 // by the insertelement instruction and not used in multiple vector nodes, it
21918 // cannot be demoted.
21919 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21920 if (isa<PoisonValue>(R))
21921 return false;
21922 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21923 });
21924 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21925 if (isa<PoisonValue>(V))
21926 return true;
21927 if (getTreeEntries(V).size() > 1)
21928 return false;
21929 // For the last shuffle of sext/zext with many uses we need to check the
21930 // extra bit for unsigned values, otherwise we may have incorrect casting
21931 // for reused scalars.
21932 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21933 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21934 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21935 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21936 return true;
21937 }
21938 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21939 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21940 if (IsSignedNode)
21941 ++BitWidth1;
21942 if (auto *I = dyn_cast<Instruction>(V)) {
21943 APInt Mask = DB->getDemandedBits(I);
21944 unsigned BitWidth2 =
21945 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21946 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21947 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21948 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21949 break;
21950 BitWidth2 *= 2;
21951 }
21952 BitWidth1 = std::min(BitWidth1, BitWidth2);
21953 }
21954 BitWidth = std::max(BitWidth, BitWidth1);
21955 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21956 };
21957 auto FinalAnalysis = [&, TTI = TTI]() {
21958 if (!IsProfitableToDemote)
21959 return false;
21960 bool Res = all_of(
21961 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21962 // Demote gathers.
21963 if (Res && E.isGather()) {
21964 if (E.hasState()) {
21965 if (const TreeEntry *SameTE =
21966 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21967 SameTE)
21968 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21969 ToDemote, Visited, NodesToKeepBWs,
21970 MaxDepthLevel, IsProfitableToDemote,
21971 IsTruncRoot)) {
21972 ToDemote.push_back(E.Idx);
21973 return true;
21974 }
21975 }
21976 // Check possible extractelement instructions bases and final vector
21977 // length.
21978 SmallPtrSet<Value *, 4> UniqueBases;
21979 for (Value *V : E.Scalars) {
21980 auto *EE = dyn_cast<ExtractElementInst>(V);
21981 if (!EE)
21982 continue;
21983 UniqueBases.insert(EE->getVectorOperand());
21984 }
21985 const unsigned VF = E.Scalars.size();
21986 Type *OrigScalarTy = E.Scalars.front()->getType();
21987 if (UniqueBases.size() <= 2 ||
21988 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21989 ::getNumberOfParts(
21990 *TTI,
21991 getWidenedType(
21992 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21993 VF))) {
21994 ToDemote.push_back(E.Idx);
21995 return true;
21996 }
21997 }
21998 return Res;
21999 };
22000 if (E.isGather() || !Visited.insert(&E).second ||
22001 any_of(E.Scalars, [&](Value *V) {
22002 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22003 return isa<InsertElementInst>(U) && !isVectorized(U);
22004 });
22005 }))
22006 return FinalAnalysis();
22007
22008 if (any_of(E.Scalars, [&](Value *V) {
22009 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22010 return isVectorized(U) ||
22011 (E.Idx == 0 && UserIgnoreList &&
22012 UserIgnoreList->contains(U)) ||
22013 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22014 !U->getType()->isScalableTy() &&
22015 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22016 }) && !IsPotentiallyTruncated(V, BitWidth);
22017 }))
22018 return false;
22019
22020 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22021 bool &NeedToExit) {
22022 NeedToExit = false;
22023 unsigned InitLevel = MaxDepthLevel;
22024 for (const TreeEntry *Op : Operands) {
22025 unsigned Level = InitLevel;
22026 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22027 ToDemote, Visited, NodesToKeepBWs, Level,
22028 IsProfitableToDemote, IsTruncRoot)) {
22029 if (!IsProfitableToDemote)
22030 return false;
22031 NeedToExit = true;
22032 if (!FinalAnalysis())
22033 return false;
22034 continue;
22035 }
22036 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22037 }
22038 return true;
22039 };
22040 auto AttemptCheckBitwidth =
22041 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22042 // Try all bitwidth < OrigBitWidth.
22043 NeedToExit = false;
22044 unsigned BestFailBitwidth = 0;
22045 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22046 if (Checker(BitWidth, OrigBitWidth))
22047 return true;
22048 if (BestFailBitwidth == 0 && FinalAnalysis())
22049 BestFailBitwidth = BitWidth;
22050 }
22051 if (BitWidth >= OrigBitWidth) {
22052 if (BestFailBitwidth == 0) {
22053 BitWidth = OrigBitWidth;
22054 return false;
22055 }
22056 MaxDepthLevel = 1;
22057 BitWidth = BestFailBitwidth;
22058 NeedToExit = true;
22059 return true;
22060 }
22061 return false;
22062 };
22063 auto TryProcessInstruction =
22064 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22065 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22066 if (Operands.empty()) {
22067 if (!IsTruncRoot)
22068 MaxDepthLevel = 1;
22069 for (Value *V : E.Scalars)
22070 (void)IsPotentiallyTruncated(V, BitWidth);
22071 } else {
22072 // Several vectorized uses? Check if we can truncate it, otherwise -
22073 // exit.
22074 if (any_of(E.Scalars, [&](Value *V) {
22075 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22076 }))
22077 return false;
22078 bool NeedToExit = false;
22079 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22080 return false;
22081 if (NeedToExit)
22082 return true;
22083 if (!ProcessOperands(Operands, NeedToExit))
22084 return false;
22085 if (NeedToExit)
22086 return true;
22087 }
22088
22089 ++MaxDepthLevel;
22090 // Record the entry that we can demote.
22091 ToDemote.push_back(E.Idx);
22092 return IsProfitableToDemote;
22093 };
22094
22095 if (E.State == TreeEntry::SplitVectorize)
22096 return TryProcessInstruction(
22097 BitWidth,
22098 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22099 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22100
22101 switch (E.getOpcode()) {
22102
22103 // We can always demote truncations and extensions. Since truncations can
22104 // seed additional demotion, we save the truncated value.
22105 case Instruction::Trunc:
22106 if (IsProfitableToDemoteRoot)
22107 IsProfitableToDemote = true;
22108 return TryProcessInstruction(BitWidth);
22109 case Instruction::ZExt:
22110 case Instruction::SExt:
22111 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22112 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22113 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22114 return false;
22115 IsProfitableToDemote = true;
22116 return TryProcessInstruction(BitWidth);
22117
22118 // We can demote certain binary operations if we can demote both of their
22119 // operands.
22120 case Instruction::Add:
22121 case Instruction::Sub:
22122 case Instruction::Mul:
22123 case Instruction::And:
22124 case Instruction::Or:
22125 case Instruction::Xor: {
22126 return TryProcessInstruction(
22127 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22128 }
22129 case Instruction::Freeze:
22130 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22131 case Instruction::Shl: {
22132 // If we are truncating the result of this SHL, and if it's a shift of an
22133 // in-range amount, we can always perform a SHL in a smaller type.
22134 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22135 return all_of(E.Scalars, [&](Value *V) {
22136 if (isa<PoisonValue>(V))
22137 return true;
22138 if (E.isCopyableElement(V))
22139 return true;
22140 auto *I = cast<Instruction>(V);
22141 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22142 return AmtKnownBits.getMaxValue().ult(BitWidth);
22143 });
22144 };
22145 return TryProcessInstruction(
22146 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22147 }
22148 case Instruction::LShr: {
22149 // If this is a truncate of a logical shr, we can truncate it to a smaller
22150 // lshr iff we know that the bits we would otherwise be shifting in are
22151 // already zeros.
22152 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22153 return all_of(E.Scalars, [&](Value *V) {
22154 if (isa<PoisonValue>(V))
22155 return true;
22156 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22157 if (E.isCopyableElement(V))
22158 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22159 auto *I = cast<Instruction>(V);
22160 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22161 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22162 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22163 SimplifyQuery(*DL));
22164 });
22165 };
22166 return TryProcessInstruction(
22167 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22168 LShrChecker);
22169 }
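  // Editor's illustrative example (not from the original source): with
  // OrigBitWidth = 32 and a candidate BitWidth of 16, rewriting
  //   trunc i32 (lshr i32 %x, 4) to i16
  // as
  //   lshr i16 (trunc i32 %x to i16), 4
  // is sound provided the shift amount is known to be < 16 and bits 16..31 of
  // %x are known zero, which is what the checker above establishes via
  // computeKnownBits and MaskedValueIsZero.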
22170 case Instruction::AShr: {
22171 // If this is a truncate of an arithmetic shr, we can truncate it to a
22172 // smaller ashr iff we know that all the bits from the sign bit of the
22173 // original type and the sign bit of the truncate type are similar.
22174 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22175 return all_of(E.Scalars, [&](Value *V) {
22176 if (isa<PoisonValue>(V))
22177 return true;
22178 auto *I = cast<Instruction>(V);
22179 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22180 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22181 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22182 ShiftedBits <
22183 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22184 });
22185 };
22186 return TryProcessInstruction(
22187 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22188 AShrChecker);
22189 }
22190 case Instruction::UDiv:
22191 case Instruction::URem: {
22192 // UDiv and URem can be truncated if all the truncated bits are zero.
22193 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22194 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22195 return all_of(E.Scalars, [&](Value *V) {
22196 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22197 if (E.hasCopyableElements() && E.isCopyableElement(V))
22198 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22199 auto *I = cast<Instruction>(V);
22200 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22201 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22202 });
22203 };
22204 return TryProcessInstruction(
22205 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22206 }
22207
22208 // We can demote selects if we can demote their true and false values.
22209 case Instruction::Select: {
22210 return TryProcessInstruction(
22211 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22212 }
22213
22214 // We can demote phis if we can demote all their incoming operands.
22215 case Instruction::PHI: {
22216 const unsigned NumOps = E.getNumOperands();
22217 SmallVector<const TreeEntry *> Ops(NumOps);
22218 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22219 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22220
22221 return TryProcessInstruction(BitWidth, Ops);
22222 }
22223
22224 case Instruction::Call: {
22225 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22226 if (!IC)
22227 break;
22228 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22229 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22230 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22231 break;
22232 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22233 function_ref<bool(unsigned, unsigned)> CallChecker;
22234 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22235 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22236 return all_of(E.Scalars, [&](Value *V) {
22237 auto *I = cast<Instruction>(V);
22238 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22239 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22240 return MaskedValueIsZero(I->getOperand(0), Mask,
22241 SimplifyQuery(*DL)) &&
22242 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22243 }
22244 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22245 "Expected min/max intrinsics only.");
22246 unsigned SignBits = OrigBitWidth - BitWidth;
22247 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22248 unsigned Op0SignBits =
22249 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22250 unsigned Op1SignBits =
22251 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22252 return SignBits <= Op0SignBits &&
22253 ((SignBits != Op0SignBits &&
22254 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22255 MaskedValueIsZero(I->getOperand(0), Mask,
22256 SimplifyQuery(*DL))) &&
22257 SignBits <= Op1SignBits &&
22258 ((SignBits != Op1SignBits &&
22259 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22260 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22261 });
22262 };
22263 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22264 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22265 return all_of(E.Scalars, [&](Value *V) {
22266 auto *I = cast<Instruction>(V);
22267 unsigned SignBits = OrigBitWidth - BitWidth;
22268 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22269 unsigned Op0SignBits =
22270 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22271 return SignBits <= Op0SignBits &&
22272 ((SignBits != Op0SignBits &&
22273 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22274 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22275 });
22276 };
22277 if (ID != Intrinsic::abs) {
22278 Operands.push_back(getOperandEntry(&E, 1));
22279 CallChecker = CompChecker;
22280 } else {
22281 CallChecker = AbsChecker;
22282 }
22283 InstructionCost BestCost =
22284 std::numeric_limits<InstructionCost::CostType>::max();
22285 unsigned BestBitWidth = BitWidth;
22286 unsigned VF = E.Scalars.size();
22287 // Choose the best bitwidth based on cost estimations.
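// For example (roughly): for eight i32 smax values whose operands fit into 16
// bits, the checker compares the vector-call cost of the intrinsic at
// <8 x i16> against wider widths and keeps the cheapest bit width.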
22288 auto Checker = [&](unsigned BitWidth, unsigned) {
22289 unsigned MinBW = PowerOf2Ceil(BitWidth);
22290 SmallVector<Type *> ArgTys =
22291 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22292 auto VecCallCosts = getVectorCallCosts(
22293 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22294 TTI, TLI, ArgTys);
22295 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22296 if (Cost < BestCost) {
22297 BestCost = Cost;
22298 BestBitWidth = BitWidth;
22299 }
22300 return false;
22301 };
22302 [[maybe_unused]] bool NeedToExit;
22303 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22304 BitWidth = BestBitWidth;
22305 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22306 }
22307
22308 // Otherwise, conservatively give up.
22309 default:
22310 break;
22311 }
22312 MaxDepthLevel = 1;
22313 return FinalAnalysis();
22314}
22315
22316static RecurKind getRdxKind(Value *V);
22317
22318void BoUpSLP::computeMinimumValueSizes() {
22319 // We only attempt to truncate integer expressions.
22320 bool IsStoreOrInsertElt =
22321 VectorizableTree.front()->hasState() &&
22322 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22323 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22324 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22325 ExtraBitWidthNodes.size() <= 1 &&
22326 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22327 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22328 return;
22329
22330 unsigned NodeIdx = 0;
22331 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22332 NodeIdx = 1;
22333
22334 // Ensure the roots of the vectorizable tree don't form a cycle.
22335 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22336 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22337 "Unexpected tree is graph.");
22338
22339 // If the first value node for a store/insertelement is a sext/zext/trunc, skip
22340 // it and resize to the final type.
22341 bool IsTruncRoot = false;
22342 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22343 SmallVector<unsigned> RootDemotes;
22344 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22345 if (NodeIdx != 0 &&
22346 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22347 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22348 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22349 IsTruncRoot = true;
22350 RootDemotes.push_back(NodeIdx);
22351 IsProfitableToDemoteRoot = true;
22352 ++NodeIdx;
22353 }
22354
22355 // The reduction was already analyzed and found not profitable - exit.
22356 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22357 return;
22358
22359 SmallVector<unsigned> ToDemote;
22360 auto ComputeMaxBitWidth =
22361 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22362 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22363 ToDemote.clear();
22364 // If the root is a trunc and the next node is a gather/buildvector, keep the
22365 // trunc in scalars, which is free in most cases.
22366 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22367 !NodesToKeepBWs.contains(E.Idx) &&
22368 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22369 all_of(E.Scalars, [&](Value *V) {
22370 return V->hasOneUse() || isa<Constant>(V) ||
22371 (!V->hasNUsesOrMore(UsesLimit) &&
22372 none_of(V->users(), [&](User *U) {
22373 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22374 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22375 if (TEs.empty() || is_contained(TEs, UserTE))
22376 return false;
22377 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22378 SelectInst>(U) ||
22379 isa<SIToFPInst, UIToFPInst>(U) ||
22380 (UserTE->hasState() &&
22381 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22382 SelectInst>(UserTE->getMainOp()) ||
22383 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22384 return true;
22385 unsigned UserTESz = DL->getTypeSizeInBits(
22386 UserTE->Scalars.front()->getType());
22387 if (all_of(TEs, [&](const TreeEntry *TE) {
22388 auto It = MinBWs.find(TE);
22389 return It != MinBWs.end() &&
22390 It->second.first > UserTESz;
22391 }))
22392 return true;
22393 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22394 }));
22395 })) {
22396 ToDemote.push_back(E.Idx);
22397 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22398 auto It = MinBWs.find(UserTE);
22399 if (It != MinBWs.end())
22400 return It->second.first;
22401 unsigned MaxBitWidth =
22402 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22403 MaxBitWidth = bit_ceil(MaxBitWidth);
22404 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22405 MaxBitWidth = 8;
22406 return MaxBitWidth;
22407 }
22408
22409 if (!E.hasState())
22410 return 0u;
22411
22412 unsigned VF = E.getVectorFactor();
22413 Type *ScalarTy = E.Scalars.front()->getType();
22414 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22415 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22416 if (!TreeRootIT)
22417 return 0u;
22418
22419 if (any_of(E.Scalars,
22420 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22421 return 0u;
22422
22423 unsigned NumParts = ::getNumberOfParts(
22424 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22425
22426 // The maximum bit width required to represent all the values that can be
22427 // demoted without loss of precision. It would be safe to truncate the roots
22428 // of the expression to this width.
22429 unsigned MaxBitWidth = 1u;
22430
22431 // True if the roots can be zero-extended back to their original type,
22432 // rather than sign-extended. We know that if the leading bits are not
22433 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22434 // True.
22435 // Determine if the sign bit of all the roots is known to be zero. If not,
22436 // IsKnownPositive is set to False.
22437 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22438 if (isa<PoisonValue>(R))
22439 return true;
22440 KnownBits Known = computeKnownBits(R, *DL);
22441 return Known.isNonNegative();
22442 });
22443
22444 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22445 E.UserTreeIndex.UserTE->hasState() &&
22446 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22447 MaxBitWidth =
22448 std::min(DL->getTypeSizeInBits(
22449 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22450 DL->getTypeSizeInBits(ScalarTy));
22451
22452 // We first check if all the bits of the roots are demanded. If they're not,
22453 // we can truncate the roots to this narrower type.
22454 for (Value *Root : E.Scalars) {
22455 if (isa<PoisonValue>(Root))
22456 continue;
22457 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22458 TypeSize NumTypeBits =
22459 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22460 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22461 // If we can't prove that the sign bit is zero, we must add one to the
22462 // maximum bit width to account for the unknown sign bit. This preserves
22463 // the existing sign bit so we can safely sign-extend the root back to the
22464 // original type. Otherwise, if we know the sign bit is zero, we will
22465 // zero-extend the root instead.
22466 //
22467 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22468 // one to the maximum bit width will yield a larger-than-necessary
22469 // type. In general, we need to add an extra bit only if we can't
22470 // prove that the upper bit of the original type is equal to the
22471 // upper bit of the proposed smaller type. If these two bits are
22472 // the same (either zero or one) we know that sign-extending from
22473 // the smaller type will result in the same value. Here, since we
22474 // can't yet prove this, we are just making the proposed smaller
22475 // type larger to ensure correctness.
22476 if (!IsKnownPositive)
22477 ++BitWidth1;
22478
22479 auto *I = dyn_cast<Instruction>(Root);
22480 if (!I) {
22481 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22482 continue;
22483 }
22484 APInt Mask = DB->getDemandedBits(I);
22485 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22486 MaxBitWidth =
22487 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22488 }
22489
22490 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22491 MaxBitWidth = 8;
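// Worked example (illustrative): for an i32 root with 26 known sign bits,
// BitWidth1 = 32 - 26 = 6 (7 if the sign bit is not known to be zero); if the
// demanded-bits analysis says only the low 5 bits are used, BitWidth2 = 5 and
// this root contributes min(BitWidth1, BitWidth2) = 5 bits, which the code
// above then rounds up to 8.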
22492
22493 // If the original type is large but the reduced type does not improve register
22494 // usage - ignore it.
22495 if (NumParts > 1 &&
22496 NumParts ==
22497 ::getNumberOfParts(
22498 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22499 bit_ceil(MaxBitWidth)),
22500 VF)))
22501 return 0u;
22502
22503 unsigned Opcode = E.getOpcode();
22504 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22505 Opcode == Instruction::SExt ||
22506 Opcode == Instruction::ZExt || NumParts > 1;
22507 // Conservatively determine if we can actually truncate the roots of the
22508 // expression. Collect the values that can be demoted in ToDemote and
22509 // additional roots that require investigating in Roots.
22510 DenseSet<const TreeEntry *> Visited;
22511 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22512 bool NeedToDemote = IsProfitableToDemote;
22513
22514 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22515 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22516 NeedToDemote, IsTruncRoot) ||
22517 (MaxDepthLevel <= Limit &&
22518 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22519 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22520 DL->getTypeSizeInBits(TreeRootIT) /
22521 DL->getTypeSizeInBits(
22522 E.getMainOp()->getOperand(0)->getType()) >
22523 2)))))
22524 return 0u;
22525 // Round MaxBitWidth up to the next power-of-two.
22526 MaxBitWidth = bit_ceil(MaxBitWidth);
22527
22528 return MaxBitWidth;
22529 };
22530
22531 // If we can truncate the root, we must collect additional values that might
22532 // be demoted as a result. That is, those seeded by truncations we will
22533 // modify.
22534 // Add reduction ops sizes, if any.
22535 if (UserIgnoreList &&
22536 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22537 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22538 // x i1> to i<n>)).
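// E.g. (illustrative IR):
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// is effectively a popcount of the mask:
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %c to i32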
22539 if (all_of(*UserIgnoreList,
22540 [](Value *V) {
22541 return isa<PoisonValue>(V) ||
22542 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22543 }) &&
22544 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22545 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22546 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22547 Builder.getInt1Ty()) {
22548 ReductionBitWidth = 1;
22549 } else {
22550 for (Value *V : *UserIgnoreList) {
22551 if (isa<PoisonValue>(V))
22552 continue;
22553 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22554 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22555 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22556 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22557 ++BitWidth1;
22558 unsigned BitWidth2 = BitWidth1;
22559 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
22560 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22561 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22562 }
22563 ReductionBitWidth =
22564 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22565 }
22566 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22567 ReductionBitWidth = 8;
22568
22569 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22570 }
22571 }
22572 bool IsTopRoot = NodeIdx == 0;
22573 while (NodeIdx < VectorizableTree.size() &&
22574 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22575 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22576 RootDemotes.push_back(NodeIdx);
22577 ++NodeIdx;
22578 IsTruncRoot = true;
22579 }
22580 bool IsSignedCmp = false;
22581 if (UserIgnoreList &&
22582 all_of(*UserIgnoreList,
22584 m_SMax(m_Value(), m_Value())))))
22585 IsSignedCmp = true;
22586 while (NodeIdx < VectorizableTree.size()) {
22587 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22588 unsigned Limit = 2;
22589 if (IsTopRoot &&
22590 ReductionBitWidth ==
22591 DL->getTypeSizeInBits(
22592 VectorizableTree.front()->Scalars.front()->getType()))
22593 Limit = 3;
22594 unsigned MaxBitWidth = ComputeMaxBitWidth(
22595 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22596 IsTruncRoot, IsSignedCmp);
22597 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22598 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22599 ReductionBitWidth = bit_ceil(MaxBitWidth);
22600 else if (MaxBitWidth == 0)
22601 ReductionBitWidth = 0;
22602 }
22603
22604 for (unsigned Idx : RootDemotes) {
22605 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22606 uint32_t OrigBitWidth =
22607 DL->getTypeSizeInBits(V->getType()->getScalarType());
22608 if (OrigBitWidth > MaxBitWidth) {
22609 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22610 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22611 }
22612 return false;
22613 }))
22614 ToDemote.push_back(Idx);
22615 }
22616 RootDemotes.clear();
22617 IsTopRoot = false;
22618 IsProfitableToDemoteRoot = true;
22619
22620 if (ExtraBitWidthNodes.empty()) {
22621 NodeIdx = VectorizableTree.size();
22622 } else {
22623 unsigned NewIdx = 0;
22624 do {
22625 NewIdx = *ExtraBitWidthNodes.begin();
22626 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22627 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22628 NodeIdx = NewIdx;
22629 IsTruncRoot =
22630 NodeIdx < VectorizableTree.size() &&
22631 VectorizableTree[NodeIdx]->UserTreeIndex &&
22632 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22633 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22634 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22635 Instruction::Trunc &&
22636 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22637 IsSignedCmp =
22638 NodeIdx < VectorizableTree.size() &&
22639 VectorizableTree[NodeIdx]->UserTreeIndex &&
22640 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22641 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22642 Instruction::ICmp &&
22643 any_of(
22644 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22645 [&](Value *V) {
22646 auto *IC = dyn_cast<ICmpInst>(V);
22647 return IC && (IC->isSigned() ||
22648 !isKnownNonNegative(IC->getOperand(0),
22649 SimplifyQuery(*DL)) ||
22650 !isKnownNonNegative(IC->getOperand(1),
22651 SimplifyQuery(*DL)));
22652 });
22653 }
22654
22655 // If the maximum bit width we compute is less than the width of the roots'
22656 // type, we can proceed with the narrowing. Otherwise, do nothing.
22657 if (MaxBitWidth == 0 ||
22658 MaxBitWidth >=
22659 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22660 ->getBitWidth()) {
22661 if (UserIgnoreList)
22662 AnalyzedMinBWVals.insert_range(TreeRoot);
22663 NodesToKeepBWs.insert_range(ToDemote);
22664 continue;
22665 }
22666
22667 // Finally, map the values we can demote to the maximum bit width we
22668 // computed.
22669 for (unsigned Idx : ToDemote) {
22670 TreeEntry *TE = VectorizableTree[Idx].get();
22671 if (MinBWs.contains(TE))
22672 continue;
22673 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22674 if (isa<PoisonValue>(R))
22675 return false;
22676 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22677 });
22678 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22679 }
22680 }
22681}
22682
22683PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22684 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22685 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22686 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22687 auto *AA = &AM.getResult<AAManager>(F);
22688 auto *LI = &AM.getResult<LoopAnalysis>(F);
22689 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22690 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22691 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22692 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22693
22694 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22695 if (!Changed)
22696 return PreservedAnalyses::all();
22697
22698 PreservedAnalyses PA;
22699 PA.preserveSet<CFGAnalyses>();
22700 return PA;
22701}
22702
22703bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22704 TargetTransformInfo *TTI_,
22705 TargetLibraryInfo *TLI_, AAResults *AA_,
22706 LoopInfo *LI_, DominatorTree *DT_,
22707 AssumptionCache *AC_, DemandedBits *DB_,
22708 OptimizationRemarkEmitter *ORE_) {
22709 if (!RunSLPVectorization)
22710 return false;
22711 SE = SE_;
22712 TTI = TTI_;
22713 TLI = TLI_;
22714 AA = AA_;
22715 LI = LI_;
22716 DT = DT_;
22717 AC = AC_;
22718 DB = DB_;
22719 DL = &F.getDataLayout();
22720
22721 Stores.clear();
22722 GEPs.clear();
22723 bool Changed = false;
22724
22725 // If the target claims to have no vector registers don't attempt
22726 // vectorization.
22727 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22728 LLVM_DEBUG(
22729 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22730 return false;
22731 }
22732
22733 // Don't vectorize when the attribute NoImplicitFloat is used.
22734 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22735 return false;
22736
22737 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22738
22739 // Use the bottom up slp vectorizer to construct chains that start with
22740 // store instructions.
22741 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22742
22743 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22744 // delete instructions.
22745
22746 // Update DFS numbers now so that we can use them for ordering.
22747 DT->updateDFSNumbers();
22748
22749 // Scan the blocks in the function in post order.
22750 for (auto *BB : post_order(&F.getEntryBlock())) {
22751 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
22752 continue;
22753
22754 // Start new block - clear the list of reduction roots.
22755 R.clearReductionData();
22756 collectSeedInstructions(BB);
22757
22758 // Vectorize trees that end at stores.
22759 if (!Stores.empty()) {
22760 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22761 << " underlying objects.\n");
22762 Changed |= vectorizeStoreChains(R);
22763 }
22764
22765 // Vectorize trees that end at reductions.
22766 Changed |= vectorizeChainsInBlock(BB, R);
22767
22768 // Vectorize the index computations of getelementptr instructions. This
22769 // is primarily intended to catch gather-like idioms ending at
22770 // non-consecutive loads.
22771 if (!GEPs.empty()) {
22772 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22773 << " underlying objects.\n");
22774 Changed |= vectorizeGEPIndices(BB, R);
22775 }
22776 }
22777
22778 if (Changed) {
22779 R.optimizeGatherSequence();
22780 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22781 }
22782 return Changed;
22783}
22784
22785std::optional<bool>
22786SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22787 unsigned Idx, unsigned MinVF,
22788 unsigned &Size) {
22789 Size = 0;
22790 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22791 << "\n");
22792 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22793 unsigned VF = Chain.size();
22794
22795 if (!has_single_bit(Sz) ||
22796 !hasFullVectorsOrPowerOf2(
22797 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22798 VF) ||
22799 VF < 2 || VF < MinVF) {
22800 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22801 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22802 // all vector lanes are used.
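// E.g. (illustrative): with this option enabled, a chain of 7 stores with
// MinVF == 8 is still analyzed, since 7 + 1 is a power of two and only a
// single lane stays unused.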
22803 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22804 return false;
22805 }
22806
22807 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22808 << "\n");
22809
22810 SetVector<Value *> ValOps;
22811 for (Value *V : Chain)
22812 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22813 // Operands are not same/alternate opcodes or a non-power-of-2 number of unique values - exit.
22814 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22815 InstructionsState S = Analysis.buildInstructionsState(
22816 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22817 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22818 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22819 bool IsAllowedSize =
22820 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22821 ValOps.size()) ||
22822 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22823 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22824 (!S.getMainOp()->isSafeToRemove() ||
22825 any_of(ValOps.getArrayRef(),
22826 [&](Value *V) {
22827 return !isa<ExtractElementInst>(V) &&
22828 (V->getNumUses() > Chain.size() ||
22829 any_of(V->users(), [&](User *U) {
22830 return !Stores.contains(U);
22831 }));
22832 }))) ||
22833 (ValOps.size() > Chain.size() / 2 && !S)) {
22834 Size = (!IsAllowedSize && S) ? 1 : 2;
22835 return false;
22836 }
22837 }
22838 if (R.isLoadCombineCandidate(Chain))
22839 return true;
22840 R.buildTree(Chain);
22841 // Check if the tree is tiny and the store itself or its value is not vectorized.
22842 if (R.isTreeTinyAndNotFullyVectorizable()) {
22843 if (R.isGathered(Chain.front()) ||
22844 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22845 return std::nullopt;
22846 Size = R.getCanonicalGraphSize();
22847 return false;
22848 }
22849 if (R.isProfitableToReorder()) {
22850 R.reorderTopToBottom();
22851 R.reorderBottomToTop();
22852 }
22853 R.transformNodes();
22854 R.buildExternalUses();
22855
22856 R.computeMinimumValueSizes();
22857
22858 Size = R.getCanonicalGraphSize();
22859 if (S && S.getOpcode() == Instruction::Load)
22860 Size = 2; // cut off masked gather small trees
22861 InstructionCost Cost = R.getTreeCost();
22862
22863 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22864 if (Cost < -SLPCostThreshold) {
22865 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22866
22867 using namespace ore;
22868
22869 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22870 cast<StoreInst>(Chain[0]))
22871 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22872 << " and with tree size "
22873 << NV("TreeSize", R.getTreeSize()));
22874
22875 R.vectorizeTree();
22876 return true;
22877 }
22878
22879 return false;
22880}
22881
22882/// Checks if the quadratic mean deviation is less than 90% of the mean size.
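/// For instance (illustrative numbers): sizes {4, 4, 4, 4} give Mean = 4 and
/// Dev = 0, so the check passes; sizes {2, 8} give Mean = 5 and Dev = 9, so it
/// fails.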
22883static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22884 bool First) {
22885 unsigned Num = 0;
22886 uint64_t Sum = std::accumulate(
22887 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22888 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22889 unsigned Size = First ? Val.first : Val.second;
22890 if (Size == 1)
22891 return V;
22892 ++Num;
22893 return V + Size;
22894 });
22895 if (Num == 0)
22896 return true;
22897 uint64_t Mean = Sum / Num;
22898 if (Mean == 0)
22899 return true;
22900 uint64_t Dev = std::accumulate(
22901 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22902 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22903 unsigned P = First ? Val.first : Val.second;
22904 if (P == 1)
22905 return V;
22906 return V + (P - Mean) * (P - Mean);
22907 }) /
22908 Num;
22909 return Dev * 96 / (Mean * Mean) == 0;
22910}
22911
22912namespace {
22913
22914/// A group of stores that we'll try to bundle together using vector ops.
22915/// They are ordered using the signed distance of their address operand to the
22916/// address of this group's BaseInstr.
22917class RelatedStoreInsts {
22918public:
22919 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22920 : AllStores(AllStores) {
22921 reset(BaseInstrIdx);
22922 }
22923
22924 void reset(unsigned NewBaseInstr) {
22925 assert(NewBaseInstr < AllStores.size() &&
22926 "Instruction index out of bounds");
22927 BaseInstrIdx = NewBaseInstr;
22928 Instrs.clear();
22929 insertOrLookup(NewBaseInstr, 0);
22930 }
22931
22932 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22933 /// \p PtrDist.
22934 /// Does nothing if there is already a store with that \p PtrDist.
22935 /// \returns The previously associated Instruction index, or std::nullopt
22936 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22937 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22938 return Inserted ? std::nullopt : std::make_optional(It->second);
22939 }
22940
22941 using DistToInstMap = std::map<int64_t, unsigned>;
22942 const DistToInstMap &getStores() const { return Instrs; }
22943
22944 /// If \p SI is related to this group of stores, return the distance of its
22945 /// pointer operand to that of the group's BaseInstr.
22946 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22947 ScalarEvolution &SE) const {
22948 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22949 return getPointersDiff(
22950 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22951 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22952 /*StrictCheck=*/true);
22953 }
22954
22955 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22956 /// Stores whose index is less than \p MinSafeIdx will be dropped.
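/// For example (illustrative): if the new base was recorded at distance 3
/// from the current base, a store previously at distance 5 is re-inserted at
/// distance 5 - 3 = 2.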
22957 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22958 int64_t DistFromCurBase) {
22959 DistToInstMap PrevSet = std::move(Instrs);
22960 reset(NewBaseInstIdx);
22961
22962 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22963 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22964 // reference.
22965 for (auto [Dist, InstIdx] : PrevSet) {
22966 if (InstIdx >= MinSafeIdx)
22967 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22968 }
22969 }
22970
22971 /// Remove all stores that have been vectorized from this group.
22972 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22973 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22974 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22975 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22976 });
22977
22978 // Get a forward iterator pointing after the last vectorized store and erase
22979 // all stores before it so we don't try to vectorize them again.
22980 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22981 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22982 }
22983
22984private:
22985 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22986 unsigned BaseInstrIdx;
22987
22988 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22989 DistToInstMap Instrs;
22990
22991 /// Reference to all the stores in the BB being analyzed.
22992 ArrayRef<StoreInst *> AllStores;
22993};
22994
22995} // end anonymous namespace
22996
22997bool SLPVectorizerPass::vectorizeStores(
22998 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22999 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23000 &Visited) {
23001 // We may run into multiple chains that merge into a single chain. We mark the
23002 // stores that we vectorized so that we don't visit the same store twice.
23003 BoUpSLP::ValueSet VectorizedStores;
23004 bool Changed = false;
23005
23006 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23007 int64_t PrevDist = -1;
23008 BoUpSLP::ValueList Operands;
23009 // Collect the chain into a list.
23010 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23011 auto &[Dist, InstIdx] = Data;
23012 if (Operands.empty() || Dist - PrevDist == 1) {
23013 Operands.push_back(Stores[InstIdx]);
23014 PrevDist = Dist;
23015 if (Idx != StoreSeq.size() - 1)
23016 continue;
23017 }
23018 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
23019 Operands.clear();
23020 Operands.push_back(Stores[InstIdx]);
23021 PrevDist = Dist;
23022 });
23023
23024 if (Operands.size() <= 1 ||
23025 !Visited
23026 .insert({Operands.front(),
23027 cast<StoreInst>(Operands.front())->getValueOperand(),
23028 Operands.back(),
23029 cast<StoreInst>(Operands.back())->getValueOperand(),
23030 Operands.size()})
23031 .second)
23032 continue;
23033
23034 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23035 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23036 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23037
23038 unsigned MaxVF =
23039 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23040 auto *Store = cast<StoreInst>(Operands[0]);
23041 Type *StoreTy = Store->getValueOperand()->getType();
23042 Type *ValueTy = StoreTy;
23043 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23044 ValueTy = Trunc->getSrcTy();
23045 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23046 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23047 // we need to use the element type of StoreTy and ValueTy to retrieve the
23048 // VF and then transform it back.
23049 // Remember: VF is defined as the number of values we want to vectorize, not the
23050 // number of elements in the final vector.
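// E.g. (illustrative): with REVEC, if StoreTy is <4 x i16> and the target's
// minimum store VF for i16 is 8 scalar elements, MinVF becomes 8 / 4 = 2
// vector-typed stores.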
23051 Type *StoreScalarTy = StoreTy->getScalarType();
23052 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23053 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23054 ValueTy->getScalarType()));
23055 MinVF /= getNumElements(StoreTy);
23056 MinVF = std::max<unsigned>(2, MinVF);
23057
23058 if (MaxVF < MinVF) {
23059 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23060 << ") < "
23061 << "MinVF (" << MinVF << ")\n");
23062 continue;
23063 }
23064
23065 unsigned NonPowerOf2VF = 0;
23066 if (VectorizeNonPowerOf2) {
23067 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23068 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23069 // lanes are used.
23070 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23071 if (has_single_bit(CandVF + 1)) {
23072 NonPowerOf2VF = CandVF;
23073 assert(NonPowerOf2VF != MaxVF &&
23074 "Non-power-of-2 VF should not be equal to MaxVF");
23075 }
23076 }
23077
23078 // MaxRegVF represents the number of instructions (scalar, or vector in
23079 // case of revec) that can be vectorized to naturally fit in a vector
23080 // register.
23081 unsigned MaxRegVF = MaxVF;
23082
23083 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23084 if (MaxVF < MinVF) {
23085 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23086 << ") < "
23087 << "MinVF (" << MinVF << ")\n");
23088 continue;
23089 }
23090
23091 SmallVector<unsigned> CandidateVFs;
23092 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23093 VF = divideCeil(VF, 2))
23094 CandidateVFs.push_back(VF);
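// E.g. (illustrative): with MaxVF = 16 and MinVF = 2 the candidates are
// 16, 8, 4, 2; if a non-power-of-2 candidate such as 15 was chosen above and
// MaxVF is 8, the candidates are 15, 8, 4, 2.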
23095
23096 unsigned End = Operands.size();
23097 unsigned Repeat = 0;
23098 constexpr unsigned MaxAttempts = 4;
23099 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23100 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23101 P.first = P.second = 1;
23102 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23103 auto IsNotVectorized = [](bool First,
23104 const std::pair<unsigned, unsigned> &P) {
23105 return First ? P.first > 0 : P.second > 0;
23106 };
23107 auto IsVectorized = [](bool First,
23108 const std::pair<unsigned, unsigned> &P) {
23109 return First ? P.first == 0 : P.second == 0;
23110 };
23111 auto VFIsProfitable = [](bool First, unsigned Size,
23112 const std::pair<unsigned, unsigned> &P) {
23113 return First ? Size >= P.first : Size >= P.second;
23114 };
23115 auto FirstSizeSame = [](unsigned Size,
23116 const std::pair<unsigned, unsigned> &P) {
23117 return Size == P.first;
23118 };
23119 while (true) {
23120 ++Repeat;
23121 bool RepeatChanged = false;
23122 bool AnyProfitableGraph = false;
23123 for (unsigned VF : CandidateVFs) {
23124 AnyProfitableGraph = false;
23125 unsigned FirstUnvecStore =
23126 std::distance(RangeSizes.begin(),
23127 find_if(RangeSizes, std::bind(IsNotVectorized,
23128 VF >= MaxRegVF, _1)));
23129
23130 // Form slices of size VF starting from FirstUnvecStore and try to
23131 // vectorize them.
23132 while (FirstUnvecStore < End) {
23133 unsigned FirstVecStore = std::distance(
23134 RangeSizes.begin(),
23135 find_if(RangeSizes.drop_front(FirstUnvecStore),
23136 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23137 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23138 for (unsigned SliceStartIdx = FirstUnvecStore;
23139 SliceStartIdx + VF <= MaxSliceEnd;) {
23140 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23141 VF >= MaxRegVF)) {
23142 ++SliceStartIdx;
23143 continue;
23144 }
23145 ArrayRef<Value *> Slice =
23146 ArrayRef(Operands).slice(SliceStartIdx, VF);
23147 assert(all_of(Slice,
23148 [&](Value *V) {
23149 return cast<StoreInst>(V)
23150 ->getValueOperand()
23151 ->getType() ==
23152 cast<StoreInst>(Slice.front())
23153 ->getValueOperand()
23154 ->getType();
23155 }) &&
23156 "Expected all operands of same type.");
23157 if (!NonSchedulable.empty()) {
23158 auto [NonSchedSizeMax, NonSchedSizeMin] =
23159 NonSchedulable.lookup(Slice.front());
23160 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23161 // VF is too ambitious. Try to vectorize another slice before
23162 // trying a smaller VF.
23163 SliceStartIdx += NonSchedSizeMax;
23164 continue;
23165 }
23166 }
23167 unsigned TreeSize;
23168 std::optional<bool> Res =
23169 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23170 if (!Res) {
23171 // Update the range of non schedulable VFs for slices starting
23172 // at SliceStartIdx.
23173 NonSchedulable
23174 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23175 .first->getSecond()
23176 .second = VF;
23177 } else if (*Res) {
23178 // Mark the vectorized stores so that we don't vectorize them
23179 // again.
23180 VectorizedStores.insert_range(Slice);
23183 AnyProfitableGraph = RepeatChanged = Changed = true;
23184 // If we vectorized initial block, no need to try to vectorize
23185 // it again.
23186 for (std::pair<unsigned, unsigned> &P :
23187 RangeSizes.slice(SliceStartIdx, VF))
23188 P.first = P.second = 0;
23189 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23190 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23191 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23192 P.first = P.second = 0;
23193 FirstUnvecStore = SliceStartIdx + VF;
23194 }
23195 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23196 for (std::pair<unsigned, unsigned> &P :
23197 RangeSizes.slice(SliceStartIdx + VF,
23198 MaxSliceEnd - (SliceStartIdx + VF)))
23199 P.first = P.second = 0;
23200 if (MaxSliceEnd == End)
23201 End = SliceStartIdx;
23202 MaxSliceEnd = SliceStartIdx;
23203 }
23204 SliceStartIdx += VF;
23205 continue;
23206 }
23207 if (VF > 2 && Res &&
23208 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23209 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23210 _1))) {
23211 SliceStartIdx += VF;
23212 continue;
23213 }
23214 // For very big VFs, check that we're not rebuilding the same
23215 // trees, just with a larger number of elements.
23216 if (VF > MaxRegVF && TreeSize > 1 &&
23217 all_of(RangeSizes.slice(SliceStartIdx, VF),
23218 std::bind(FirstSizeSame, TreeSize, _1))) {
23219 SliceStartIdx += VF;
23220 while (SliceStartIdx != MaxSliceEnd &&
23221 RangeSizes[SliceStartIdx].first == TreeSize)
23222 ++SliceStartIdx;
23223 continue;
23224 }
23225 if (TreeSize > 1) {
23226 for (std::pair<unsigned, unsigned> &P :
23227 RangeSizes.slice(SliceStartIdx, VF)) {
23228 if (VF >= MaxRegVF)
23229 P.second = std::max(P.second, TreeSize);
23230 else
23231 P.first = std::max(P.first, TreeSize);
23232 }
23233 }
23234 ++SliceStartIdx;
23235 AnyProfitableGraph = true;
23236 }
23237 if (FirstUnvecStore >= End)
23238 break;
23239 if (MaxSliceEnd - FirstUnvecStore < VF &&
23240 MaxSliceEnd - FirstUnvecStore >= MinVF)
23241 AnyProfitableGraph = true;
23242 FirstUnvecStore = std::distance(
23243 RangeSizes.begin(),
23244 find_if(RangeSizes.drop_front(MaxSliceEnd),
23245 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23246 }
23247 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23248 break;
23249 }
23250 // All values vectorized - exit.
23251 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23252 return P.first == 0 && P.second == 0;
23253 }))
23254 break;
23255 // Check if we tried all attempts or there is no need for the last attempts at all.
23256 if (Repeat >= MaxAttempts ||
23257 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23258 break;
23259 constexpr unsigned StoresLimit = 64;
23260 const unsigned MaxTotalNum = std::min<unsigned>(
23261 Operands.size(),
23262 static_cast<unsigned>(
23263 End -
23264 std::distance(
23265 RangeSizes.begin(),
23266 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23267 1));
23268 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23269 unsigned Limit =
23270 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23271 CandidateVFs.clear();
23272 if (bit_floor(Limit) == VF)
23273 CandidateVFs.push_back(Limit);
23274 if (VF > MaxTotalNum || VF >= StoresLimit)
23275 break;
23276 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23277 if (P.first != 0)
23278 P.first = std::max(P.second, P.first);
23279 }
23280 // Last attempt to vectorize the max number of elements, if all previous
23281 // attempts were unsuccessful because of cost issues.
23282 CandidateVFs.push_back(VF);
23283 }
23284 }
23285 };
23286
23287 /// Groups of stores to vectorize
23288 SmallVector<RelatedStoreInsts> SortedStores;
23289
23290 // Inserts the specified store SI with the given index Idx into the set of
23291 // stores. If a store with the same distance has already been found - stop
23292 // inserting and try to vectorize the stores found so far. If some stores from
23293 // this sequence were not vectorized - try to vectorize them together with the
23294 // new store later. But this logic is applied only to the stores that come
23295 // before the previous store with the same distance.
23296 // Example:
23297 // 1. store x, %p
23298 // 2. store y, %p+1
23299 // 3. store z, %p+2
23300 // 4. store a, %p
23301 // 5. store b, %p+3
23302 // - Scan this from the last to first store. The very first bunch of stores is
23303 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23304 // vector).
23305 // - The next store in the list - #1 - has the same distance from store #5 as
23306 // the store #4.
23307 // - Try to vectorize sequence of stores 4,2,3,5.
23308 // - If all these stores are vectorized - just drop them.
23309 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23310 // - Start new stores sequence.
23311 // The new bunch of stores is {1, {1, 0}}.
23312 // - Add the stores from previous sequence, that were not vectorized.
23313 // Here we consider the stores in reversed order, rather than the order in which
23314 // they are used in the IR (Stores are reversed already, see vectorizeStoreChains()).
23315 // Store #3 can be added -> comes after store #4 with the same distance as
23316 // store #1.
23317 // Store #5 cannot be added - comes before store #4.
23318 // This logic helps to improve compile time: we assume that the stores after a
23319 // previous store with the same distance most likely have memory dependencies,
23320 // so there is no need to waste compile time trying to vectorize them.
23321 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23322 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23323 std::optional<int64_t> PtrDist;
23324 auto *RelatedStores = find_if(
23325 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23326 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23327 return PtrDist.has_value();
23328 });
23329
23330 // We did not find a comparable store, start a new group.
23331 if (RelatedStores == SortedStores.end()) {
23332 SortedStores.emplace_back(Idx, Stores);
23333 return;
23334 }
23335
23336 // If there is already a store in the group with the same PtrDiff, try to
23337 // vectorize the existing instructions before adding the current store.
23338 // Otherwise, insert this store and keep collecting.
23339 if (std::optional<unsigned> PrevInst =
23340 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23341 TryToVectorize(RelatedStores->getStores());
23342 RelatedStores->clearVectorizedStores(VectorizedStores);
23343 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23344 /*NewBaseInstIdx=*/Idx,
23345 /*DistFromCurBase=*/*PtrDist);
23346 }
23347 };
23348 Type *PrevValTy = nullptr;
23349 for (auto [I, SI] : enumerate(Stores)) {
23350 if (R.isDeleted(SI))
23351 continue;
23352 if (!PrevValTy)
23353 PrevValTy = SI->getValueOperand()->getType();
23354 // Check that we do not try to vectorize stores of different types.
23355 if (PrevValTy != SI->getValueOperand()->getType()) {
23356 for (RelatedStoreInsts &StoreSeq : SortedStores)
23357 TryToVectorize(StoreSeq.getStores());
23358 SortedStores.clear();
23359 PrevValTy = SI->getValueOperand()->getType();
23360 }
23361 FillStoresSet(I, SI);
23362 }
23363
23364 // Final vectorization attempt.
23365 for (RelatedStoreInsts &StoreSeq : SortedStores)
23366 TryToVectorize(StoreSeq.getStores());
23367
23368 return Changed;
23369}
23370
23371void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23372 // Initialize the collections. We will make a single pass over the block.
23373 Stores.clear();
23374 GEPs.clear();
23375
23376 // Visit the store and getelementptr instructions in BB and organize them in
23377 // Stores and GEPs according to the underlying objects of their pointer
23378 // operands.
23379 for (Instruction &I : *BB) {
23380 // Ignore store instructions that are volatile or have a pointer operand
23381 // that doesn't point to a scalar type.
23382 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23383 if (!SI->isSimple())
23384 continue;
23385 if (!isValidElementType(SI->getValueOperand()->getType()))
23386 continue;
23387 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23388 }
23389
23390 // Ignore getelementptr instructions that have more than one index, a
23391 // constant index, or a pointer operand that doesn't point to a scalar
23392 // type.
23393 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23394 if (GEP->getNumIndices() != 1)
23395 continue;
23396 Value *Idx = GEP->idx_begin()->get();
23397 if (isa<Constant>(Idx))
23398 continue;
23399 if (!isValidElementType(Idx->getType()))
23400 continue;
23401 if (GEP->getType()->isVectorTy())
23402 continue;
23403 GEPs[GEP->getPointerOperand()].push_back(GEP);
23404 }
23405 }
23406}
23407
23408bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23409 bool MaxVFOnly) {
23410 if (VL.size() < 2)
23411 return false;
23412
23413 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23414 << VL.size() << ".\n");
23415
23416 // Check that all of the parts are instructions of the same type;
23417 // we permit an alternate opcode via InstructionsState.
23418 InstructionsState S = getSameOpcode(VL, *TLI);
23419 if (!S)
23420 return false;
23421
23422 Instruction *I0 = S.getMainOp();
23423 // Make sure invalid types (including vector type) are rejected before
23424 // determining vectorization factor for scalar instructions.
23425 for (Value *V : VL) {
23426 Type *Ty = V->getType();
23427 if (!isValidElementType(Ty)) {
23428 // NOTE: the following will give the user an internal LLVM type name, which
23429 // may not be useful.
23430 R.getORE()->emit([&]() {
23431 std::string TypeStr;
23432 llvm::raw_string_ostream OS(TypeStr);
23433 Ty->print(OS);
23434 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23435 << "Cannot SLP vectorize list: type "
23436 << TypeStr + " is unsupported by vectorizer";
23437 });
23438 return false;
23439 }
23440 }
23441
23442 Type *ScalarTy = getValueType(VL[0]);
23443 unsigned Sz = R.getVectorElementSize(I0);
23444 unsigned MinVF = R.getMinVF(Sz);
23445 unsigned MaxVF = std::max<unsigned>(
23446 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23447 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23448 if (MaxVF < 2) {
23449 R.getORE()->emit([&]() {
23450 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23451 << "Cannot SLP vectorize list: vectorization factor "
23452 << "less than 2 is not supported";
23453 });
23454 return false;
23455 }
23456
23457 bool Changed = false;
23458 bool CandidateFound = false;
23459 InstructionCost MinCost = SLPCostThreshold.getValue();
23460
23461 unsigned NextInst = 0, MaxInst = VL.size();
23462 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23463 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23464 // No actual vectorization should happen if the number of parts is the same as
23465 // the provided vectorization factor (i.e. the scalar type is used for vector
23466 // code during codegen).
23467 auto *VecTy = getWidenedType(ScalarTy, VF);
23468 if (TTI->getNumberOfParts(VecTy) == VF)
23469 continue;
23470 for (unsigned I = NextInst; I < MaxInst; ++I) {
23471 unsigned ActualVF = std::min(MaxInst - I, VF);
23472
23473 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23474 continue;
23475
23476 if (MaxVFOnly && ActualVF < MaxVF)
23477 break;
23478 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23479 break;
23480
23481 SmallVector<Value *> Ops(ActualVF, nullptr);
23482 unsigned Idx = 0;
23483 for (Value *V : VL.drop_front(I)) {
23484 // Check that a previous iteration of this loop did not delete the
23485 // Value.
23486 if (auto *Inst = dyn_cast<Instruction>(V);
23487 !Inst || !R.isDeleted(Inst)) {
23488 Ops[Idx] = V;
23489 ++Idx;
23490 if (Idx == ActualVF)
23491 break;
23492 }
23493 }
23494 // Not enough vectorizable instructions - exit.
23495 if (Idx != ActualVF)
23496 break;
23497
23498 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23499 << "\n");
23500
23501 R.buildTree(Ops);
23502 if (R.isTreeTinyAndNotFullyVectorizable())
23503 continue;
23504 if (R.isProfitableToReorder()) {
23505 R.reorderTopToBottom();
23506 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23507 }
23508 R.transformNodes();
23509 R.buildExternalUses();
23510
23511 R.computeMinimumValueSizes();
23512 InstructionCost Cost = R.getTreeCost();
23513 CandidateFound = true;
23514 MinCost = std::min(MinCost, Cost);
23515
23516 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23517 << " for VF=" << ActualVF << "\n");
23518 if (Cost < -SLPCostThreshold) {
23519 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23520 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23521 cast<Instruction>(Ops[0]))
23522 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23523 << " and with tree size "
23524 << ore::NV("TreeSize", R.getTreeSize()));
23525
23526 R.vectorizeTree();
23527 // Move to the next bundle.
23528 I += VF - 1;
23529 NextInst = I + 1;
23530 Changed = true;
23531 }
23532 }
23533 }
23534
23535 if (!Changed && CandidateFound) {
23536 R.getORE()->emit([&]() {
23537 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23538 << "List vectorization was possible but not beneficial with cost "
23539 << ore::NV("Cost", MinCost) << " >= "
23540 << ore::NV("Threshold", -SLPCostThreshold);
23541 });
23542 } else if (!Changed) {
23543 R.getORE()->emit([&]() {
23544 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23545 << "Cannot SLP vectorize list: vectorization was impossible"
23546 << " with available vectorization factors";
23547 });
23548 }
23549 return Changed;
23550}
23551
23552namespace {
23553
23554/// Model horizontal reductions.
23555///
23556/// A horizontal reduction is a tree of reduction instructions that has values
23557/// that can be put into a vector as its leaves. For example:
23558///
23559/// mul mul mul mul
23560/// \ / \ /
23561/// + +
23562/// \ /
23563/// +
23564/// This tree has "mul" as its leaf values and "+" as its reduction
23565/// instructions. A reduction can feed into a store or a binary operation
23566/// feeding a phi.
23567/// ...
23568/// \ /
23569/// +
23570/// |
23571/// phi +=
23572///
23573/// Or:
23574/// ...
23575/// \ /
23576/// +
23577/// |
23578/// *p =
23579///
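/// For example (illustrative IR), a chain such as:
///   %r1 = add i32 %a, %b
///   %r2 = add i32 %r1, %c
///   %r3 = add i32 %r2, %d
/// has {%a, %b, %c, %d} as its reduced values and can typically be lowered to
/// a vector build of those values feeding @llvm.vector.reduce.add.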
23580class HorizontalReduction {
23581 using ReductionOpsType = SmallVector<Value *, 16>;
23582 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23583 ReductionOpsListType ReductionOps;
23584 /// List of possibly reduced values.
23585 SmallVector<SmallVector<Value *>> ReducedVals;
23586 /// Maps reduced value to the corresponding reduction operation.
23587 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23588 WeakTrackingVH ReductionRoot;
23589 /// The type of reduction operation.
23590 RecurKind RdxKind;
23591 /// Checks if the optimization of original scalar identity operations on
23592 /// matched horizontal reductions is enabled and allowed.
23593 bool IsSupportedHorRdxIdentityOp = false;
23594 /// The minimum number of the reduced values.
23595 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23596 /// Contains vector values for reduction including their scale factor and
23597 /// signedness.
23598 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23599
23600 static bool isCmpSelMinMax(Instruction *I) {
23601 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23602 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
23603 }
23604
23605 // And/or are potentially poison-safe logical patterns like:
23606 // select x, y, false
23607 // select x, true, y
23608 static bool isBoolLogicOp(Instruction *I) {
23609 return isa<SelectInst>(I) &&
23610 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23611 }
23612
23613 /// Checks if instruction is associative and can be vectorized.
23614 static bool isVectorizable(RecurKind Kind, Instruction *I,
23615 bool TwoElementReduction = false) {
23616 if (Kind == RecurKind::None)
23617 return false;
23618
23619 // Integer ops that map to select instructions or intrinsics are fine.
23620 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23621 isBoolLogicOp(I))
23622 return true;
23623
23624 // No need to check for associativity if there are only 2 reduced values.
23625 if (TwoElementReduction)
23626 return true;
23627
23628 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23629 // FP min/max are associative except for NaN and -0.0. We do not
23630 // have to rule out -0.0 here because the intrinsic semantics do not
23631 // specify a fixed result for it.
23632 return I->getFastMathFlags().noNaNs();
23633 }
23634
23635 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23636 return true;
23637
23638 return I->isAssociative();
23639 }
23640
23641 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23642 // Poison-safe 'or' takes the form: select X, true, Y
23643 // To make that work with the normal operand processing, we skip the
23644 // true value operand.
23645 // TODO: Change the code and data structures to handle this without a hack.
23646 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23647 return I->getOperand(2);
23648 return I->getOperand(Index);
23649 }
23650
23651 /// Creates reduction operation with the current opcode.
23652 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23653 Value *RHS, const Twine &Name, bool UseSelect) {
23654 Type *OpTy = LHS->getType();
23655 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23656 switch (Kind) {
23657 case RecurKind::Or: {
23658 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23659 return Builder.CreateSelectWithUnknownProfile(
23660 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23661 RHS, DEBUG_TYPE, Name);
23662 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23663 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23664 Name);
23665 }
23666 case RecurKind::And: {
23667 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23668 return Builder.CreateSelectWithUnknownProfile(
23669 LHS, RHS,
23670 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
23671 DEBUG_TYPE, Name);
23672 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23673 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23674 Name);
23675 }
23676 case RecurKind::Add:
23677 case RecurKind::Mul:
23678 case RecurKind::Xor:
23679 case RecurKind::FAdd:
23680 case RecurKind::FMul: {
23681 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23682 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23683 Name);
23684 }
23685 case RecurKind::SMax:
23686 case RecurKind::SMin:
23687 case RecurKind::UMax:
23688 case RecurKind::UMin:
23689 if (UseSelect) {
23690 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23691 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23692 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
23693 Name);
23694 }
23695 [[fallthrough]];
23696 case RecurKind::FMax:
23697 case RecurKind::FMin:
23698 case RecurKind::FMaximum:
23699 case RecurKind::FMinimum:
23700 case RecurKind::FMaximumNum:
23701 case RecurKind::FMinimumNum: {
23702 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
23703 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23704 }
23705 default:
23706 llvm_unreachable("Unknown reduction operation.");
23707 }
23708 }
23709
23710 /// Creates reduction operation with the current opcode with the IR flags
23711 /// from \p ReductionOps, dropping nuw/nsw flags.
23712 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23713 Value *RHS, const Twine &Name,
23714 const ReductionOpsListType &ReductionOps) {
23715 bool UseSelect = ReductionOps.size() == 2 ||
23716 // Logical or/and.
23717 (ReductionOps.size() == 1 &&
23718 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23719 assert((!UseSelect || ReductionOps.size() != 2 ||
23720 isa<SelectInst>(ReductionOps[1][0])) &&
23721 "Expected cmp + select pairs for reduction");
23722 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23723 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23724 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23725 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23726 /*IncludeWrapFlags=*/false);
23727 propagateIRFlags(Op, ReductionOps[1], nullptr,
23728 /*IncludeWrapFlags=*/false);
23729 return Op;
23730 }
23731 }
23732 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23733 return Op;
23734 }
23735
23736public:
23737 static RecurKind getRdxKind(Value *V) {
23738 auto *I = dyn_cast<Instruction>(V);
23739 if (!I)
23740 return RecurKind::None;
23741 if (match(I, m_Add(m_Value(), m_Value())))
23742 return RecurKind::Add;
23743 if (match(I, m_Mul(m_Value(), m_Value())))
23744 return RecurKind::Mul;
23745 if (match(I, m_And(m_Value(), m_Value())) ||
23746 match(I, m_LogicalAnd(m_Value(), m_Value())))
23747 return RecurKind::And;
23748 if (match(I, m_Or(m_Value(), m_Value())) ||
23749 match(I, m_LogicalOr(m_Value(), m_Value())))
23750 return RecurKind::Or;
23751 if (match(I, m_Xor(m_Value(), m_Value())))
23752 return RecurKind::Xor;
23753 if (match(I, m_FAdd(m_Value(), m_Value())))
23754 return RecurKind::FAdd;
23755 if (match(I, m_FMul(m_Value(), m_Value())))
23756 return RecurKind::FMul;
23757 
23758 if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
23759 return RecurKind::FMax;
23760 if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
23761 return RecurKind::FMin;
23762
23763 if (match(I, m_FMaximum(m_Value(), m_Value())))
23764 return RecurKind::FMaximum;
23765 if (match(I, m_FMinimum(m_Value(), m_Value())))
23766 return RecurKind::FMinimum;
23767 // This matches either cmp+select or intrinsics. SLP is expected to handle
23768 // either form.
23769 // TODO: If we are canonicalizing to intrinsics, we can remove several
23770 // special-case paths that deal with selects.
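// For example, both of the following forms are treated as a signed-max step:
//   %m = select i1 (icmp sgt i32 %a, %b), i32 %a, i32 %b
//   %m = call i32 @llvm.smax.i32(i32 %a, i32 %b)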
23771 if (match(I, m_SMax(m_Value(), m_Value())))
23772 return RecurKind::SMax;
23773 if (match(I, m_SMin(m_Value(), m_Value())))
23774 return RecurKind::SMin;
23775 if (match(I, m_UMax(m_Value(), m_Value())))
23776 return RecurKind::UMax;
23777 if (match(I, m_UMin(m_Value(), m_Value())))
23778 return RecurKind::UMin;
23779
23780 if (auto *Select = dyn_cast<SelectInst>(I)) {
23781 // Try harder: look for min/max pattern based on instructions producing
23782 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23783 // During the intermediate stages of SLP, it's very common to have
23784 // pattern like this (since optimizeGatherSequence is run only once
23785 // at the end):
23786 // %1 = extractelement <2 x i32> %a, i32 0
23787 // %2 = extractelement <2 x i32> %a, i32 1
23788 // %cond = icmp sgt i32 %1, %2
23789 // %3 = extractelement <2 x i32> %a, i32 0
23790 // %4 = extractelement <2 x i32> %a, i32 1
23791 // %select = select i1 %cond, i32 %3, i32 %4
23792 CmpPredicate Pred;
23793 Instruction *L1;
23794 Instruction *L2;
23795
23796 Value *LHS = Select->getTrueValue();
23797 Value *RHS = Select->getFalseValue();
23798 Value *Cond = Select->getCondition();
23799
23800 // TODO: Support inverse predicates.
23801 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23802 if (!isa<ExtractElementInst>(RHS) ||
23803 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23804 return RecurKind::None;
23805 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23806 if (!isa<ExtractElementInst>(LHS) ||
23807 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23808 return RecurKind::None;
23809 } else {
23810 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23811 return RecurKind::None;
23812 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23813 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23814 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23815 return RecurKind::None;
23816 }
23817
23818 switch (Pred) {
23819 default:
23820 return RecurKind::None;
23821 case CmpInst::ICMP_SGT:
23822 case CmpInst::ICMP_SGE:
23823 return RecurKind::SMax;
23824 case CmpInst::ICMP_SLT:
23825 case CmpInst::ICMP_SLE:
23826 return RecurKind::SMin;
23827 case CmpInst::ICMP_UGT:
23828 case CmpInst::ICMP_UGE:
23829 return RecurKind::UMax;
23830 case CmpInst::ICMP_ULT:
23831 case CmpInst::ICMP_ULE:
23832 return RecurKind::UMin;
23833 }
23834 }
23835 return RecurKind::None;
23836 }
23837
23838 /// Get the index of the first operand.
23839 static unsigned getFirstOperandIndex(Instruction *I) {
23840 return isCmpSelMinMax(I) ? 1 : 0;
23841 }
23842
23843private:
23844 /// Total number of operands in the reduction operation.
23845 static unsigned getNumberOfOperands(Instruction *I) {
23846 return isCmpSelMinMax(I) ? 3 : 2;
23847 }
23848
23849 /// Checks if the instruction is in basic block \p BB.
23850 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23851 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23852 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23853 auto *Sel = cast<SelectInst>(I);
23854 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23855 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23856 }
23857 return I->getParent() == BB;
23858 }
23859
23860 /// Expected number of uses for reduction operations/reduced values.
23861 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23862 if (IsCmpSelMinMax) {
23863 // SelectInst must be used twice, while the condition op must have a
23864 // single use only.
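// For example, in a chained smax reduction the intermediate select feeds both
// the next compare and the next select, while its compare has a single use:
//   %c1 = icmp sgt i32 %a, %b
//   %m1 = select i1 %c1, i32 %a, i32 %b
//   %c2 = icmp sgt i32 %m1, %d
//   %m2 = select i1 %c2, i32 %m1, i32 %d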
23865 if (auto *Sel = dyn_cast<SelectInst>(I))
23866 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23867 return I->hasNUses(2);
23868 }
23869
23870 // Arithmetic reduction operation must be used once only.
23871 return I->hasOneUse();
23872 }
23873
23874 /// Initializes the list of reduction operations.
23875 void initReductionOps(Instruction *I) {
23876 if (isCmpSelMinMax(I))
23877 ReductionOps.assign(2, ReductionOpsType());
23878 else
23879 ReductionOps.assign(1, ReductionOpsType());
23880 }
23881
23882 /// Add all reduction operations for the reduction instruction \p I.
23883 void addReductionOps(Instruction *I) {
23884 if (isCmpSelMinMax(I)) {
23885 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23886 ReductionOps[1].emplace_back(I);
23887 } else {
23888 ReductionOps[0].emplace_back(I);
23889 }
23890 }
23891
23892 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23893 int Sz = Data.size();
23894 auto *I = dyn_cast<Instruction>(Data.front());
23895 return Sz > 1 || isConstant(Data.front()) ||
23896 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23897 }
23898
23899public:
23900 HorizontalReduction() = default;
23901 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23902 : ReductionRoot(I), ReductionLimit(2) {
23903 RdxKind = HorizontalReduction::getRdxKind(I);
23904 ReductionOps.emplace_back().push_back(I);
23905 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23906 for (Value *V : Ops)
23907 ReducedValsToOps[V].push_back(I);
23908 }
23909
23910 bool matchReductionForOperands() const {
23911 // Analyze "regular" integer/FP types for reductions - no target-specific
23912 // types or pointers.
23913 assert(ReductionRoot && "Reduction root is not set!");
23914 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23915 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23916 return Ops.size() == 2;
23917 })))
23918 return false;
23919
23920 return true;
23921 }
23922
23923 /// Try to find a reduction tree.
23924 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23925 ScalarEvolution &SE, const DataLayout &DL,
23926 const TargetLibraryInfo &TLI) {
23927 RdxKind = HorizontalReduction::getRdxKind(Root);
23928 if (!isVectorizable(RdxKind, Root))
23929 return false;
23930
23931 // Analyze "regular" integer/FP types for reductions - no target-specific
23932 // types or pointers.
23933 Type *Ty = Root->getType();
23934 if (!isValidElementType(Ty) || Ty->isPointerTy())
23935 return false;
23936
23937 // Though the ultimate reduction may have multiple uses, its condition must
23938 // have only a single use.
23939 if (auto *Sel = dyn_cast<SelectInst>(Root))
23940 if (!Sel->getCondition()->hasOneUse())
23941 return false;
23942
23943 ReductionRoot = Root;
23944
23945 // Iterate through all the operands of the possible reduction tree and
23946 // gather all the reduced values, sorting them by their value id.
23947 BasicBlock *BB = Root->getParent();
23948 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23949 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23950 1, std::make_pair(Root, 0));
23951 // Checks if the operands of the \p TreeN instruction are also reduction
23952 // operations or should be treated as reduced values or an extra argument,
23953 // which is not part of the reduction.
23954 auto CheckOperands = [&](Instruction *TreeN,
23955 SmallVectorImpl<Value *> &PossibleReducedVals,
23956 SmallVectorImpl<Instruction *> &ReductionOps,
23957 unsigned Level) {
23958 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23959 getNumberOfOperands(TreeN)))) {
23960 Value *EdgeVal = getRdxOperand(TreeN, I);
23961 ReducedValsToOps[EdgeVal].push_back(TreeN);
23962 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23963 // If the edge is not an instruction, or it differs from the main
23964 // reduction opcode or has too many uses, treat it as a possible reduced
23965 // value. Also, do not try to reduce constant values if the operation is
23966 // not foldable.
23967 if (!EdgeInst || Level > RecursionMaxDepth ||
23968 getRdxKind(EdgeInst) != RdxKind ||
23969 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23970 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23971 !isVectorizable(RdxKind, EdgeInst) ||
23972 (R.isAnalyzedReductionRoot(EdgeInst) &&
23973 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23974 PossibleReducedVals.push_back(EdgeVal);
23975 continue;
23976 }
23977 ReductionOps.push_back(EdgeInst);
23978 }
23979 };
23980 // Try to regroup reduced values so that reducing them becomes more
23981 // profitable. Values are grouped by their value ids, instructions - by
23982 // instruction op id and/or alternate op id, plus do extra analysis for
23983 // loads (grouping them by the distance between pointers) and cmp
23984 // instructions (grouping them by the predicate).
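// For instance, loads from %p, %p+4 and %p+8 would typically land in the same
// group (keyed by their pointer distance), while 'icmp slt' and 'icmp sgt'
// values would be keyed into different groups.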
23985 SmallMapVector<
23986 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23987 8>
23988 PossibleReducedVals;
23989 initReductionOps(Root);
23990 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23991 SmallSet<size_t, 2> LoadKeyUsed;
23992
23993 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23994 Key = hash_combine(hash_value(LI->getParent()), Key);
23995 Value *Ptr =
23996 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
23997 if (!LoadKeyUsed.insert(Key).second) {
23998 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23999 if (LIt != LoadsMap.end()) {
24000 for (LoadInst *RLI : LIt->second) {
24001 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24002 LI->getType(), LI->getPointerOperand(), DL, SE,
24003 /*StrictCheck=*/true))
24004 return hash_value(RLI->getPointerOperand());
24005 }
24006 for (LoadInst *RLI : LIt->second) {
24007 if (arePointersCompatible(RLI->getPointerOperand(),
24008 LI->getPointerOperand(), TLI)) {
24009 hash_code SubKey = hash_value(RLI->getPointerOperand());
24010 return SubKey;
24011 }
24012 }
24013 if (LIt->second.size() > 2) {
24014 hash_code SubKey =
24015 hash_value(LIt->second.back()->getPointerOperand());
24016 return SubKey;
24017 }
24018 }
24019 }
24020 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24021 .first->second.push_back(LI);
24022 return hash_value(LI->getPointerOperand());
24023 };
24024
24025 while (!Worklist.empty()) {
24026 auto [TreeN, Level] = Worklist.pop_back_val();
24027 SmallVector<Value *> PossibleRedVals;
24028 SmallVector<Instruction *> PossibleReductionOps;
24029 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24030 addReductionOps(TreeN);
24031 // Add reduction values. The values are sorted for better vectorization
24032 // results.
24033 for (Value *V : PossibleRedVals) {
24034 size_t Key, Idx;
24035 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24036 /*AllowAlternate=*/false);
24037 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24038 }
24039 for (Instruction *I : reverse(PossibleReductionOps))
24040 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24041 }
24042 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24043 // Sort values by the total number of value kinds to start the reduction
24044 // from the longest possible sequences of reduced values.
24045 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24046 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24047 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24048 for (auto &Slice : PossibleRedVals) {
24049 PossibleRedValsVect.emplace_back();
24050 auto RedValsVect = Slice.second.takeVector();
24051 stable_sort(RedValsVect, llvm::less_second());
24052 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24053 PossibleRedValsVect.back().append(Data.second, Data.first);
24054 }
24055 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24056 return P1.size() > P2.size();
24057 });
24058 bool First = true;
24059 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24060 if (First) {
24061 First = false;
24062 ReducedVals.emplace_back();
24063 } else if (!isGoodForReduction(Data)) {
24064 auto *LI = dyn_cast<LoadInst>(Data.front());
24065 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24066 if (!LI || !LastLI ||
24067 getUnderlyingObject(LI->getPointerOperand()) !=
24068 getUnderlyingObject(LastLI->getPointerOperand()))
24069 ReducedVals.emplace_back();
24070 }
24071 ReducedVals.back().append(Data.rbegin(), Data.rend());
24072 }
24073 }
24074 // Sort the reduced values by number of same/alternate opcode and/or pointer
24075 // operand.
24076 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24077 return P1.size() > P2.size();
24078 });
24079 return true;
24080 }
24081
24082 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24083 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24084 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24085 DominatorTree &DT) {
24086 constexpr unsigned RegMaxNumber = 4;
24087 constexpr unsigned RedValsMaxNumber = 128;
24088 // If there are a sufficient number of reduction values, reduce
24089 // to a nearby power-of-2. We can safely generate oversized
24090 // vectors and rely on the backend to split them to legal sizes.
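// For instance, 24 reduced i32 values may first be attempted with a vector
// factor of 16; even though <16 x i32> is wider than a single 128-bit
// register, the backend is expected to legalize the oversized operations
// into several register-sized pieces.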
24091 if (unsigned NumReducedVals = std::accumulate(
24092 ReducedVals.begin(), ReducedVals.end(), 0,
24093 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24094 if (!isGoodForReduction(Vals))
24095 return Num;
24096 return Num + Vals.size();
24097 });
24098 NumReducedVals < ReductionLimit &&
24099 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24100 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24101 })) {
24102 for (ReductionOpsType &RdxOps : ReductionOps)
24103 for (Value *RdxOp : RdxOps)
24104 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24105 return nullptr;
24106 }
24107
24108 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24109 TargetFolder(DL));
24110 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24111
24112 // Track the reduced values in case they are replaced by extractelement
24113 // because of the vectorization.
24114 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24115 ReducedVals.front().size());
24116
24117 // The compare instruction of a min/max is the insertion point for new
24118 // instructions and may be replaced with a new compare instruction.
24119 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24120 assert(isa<SelectInst>(RdxRootInst) &&
24121 "Expected min/max reduction to have select root instruction");
24122 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24123 assert(isa<Instruction>(ScalarCond) &&
24124 "Expected min/max reduction to have compare condition");
24125 return cast<Instruction>(ScalarCond);
24126 };
24127
24128 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24129 return isBoolLogicOp(cast<Instruction>(V));
24130 });
24131 // Return new VectorizedTree, based on previous value.
24132 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24133 if (VectorizedTree) {
24134 // Update the final value in the reduction.
24135 Builder.SetCurrentDebugLocation(
24136 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24137 if (AnyBoolLogicOp) {
24138 auto It = ReducedValsToOps.find(VectorizedTree);
24139 auto It1 = ReducedValsToOps.find(Res);
24140 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24141 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24142 (It != ReducedValsToOps.end() &&
24143 any_of(It->getSecond(), [&](Instruction *I) {
24144 return isBoolLogicOp(I) &&
24145 getRdxOperand(I, 0) == VectorizedTree;
24146 }))) {
24147 ;
24148 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24149 (It1 != ReducedValsToOps.end() &&
24150 any_of(It1->getSecond(), [&](Instruction *I) {
24151 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24152 }))) {
24153 std::swap(VectorizedTree, Res);
24154 } else {
24155 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24156 }
24157 }
24158
24159 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24160 ReductionOps);
24161 }
24162 // Initialize the final value in the reduction.
24163 return Res;
24164 };
24165 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24166 ReductionOps.front().size());
24167 for (ReductionOpsType &RdxOps : ReductionOps)
24168 for (Value *RdxOp : RdxOps) {
24169 if (!RdxOp)
24170 continue;
24171 IgnoreList.insert(RdxOp);
24172 }
24173 // Intersect the fast-math-flags from all reduction operations.
24174 FastMathFlags RdxFMF;
24175 RdxFMF.set();
24176 for (Value *U : IgnoreList)
24177 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24178 RdxFMF &= FPMO->getFastMathFlags();
24179 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24180
24181 // Need to track reduced vals, they may be changed during vectorization of
24182 // subvectors.
24183 for (ArrayRef<Value *> Candidates : ReducedVals)
24184 for (Value *V : Candidates)
24185 TrackedVals.try_emplace(V, V);
24186
24187 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24188 Value *V) -> unsigned & {
24189 auto *It = MV.find(V);
24190 assert(It != MV.end() && "Unable to find given key.");
24191 return It->second;
24192 };
24193
24194 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24195 // List of the values that were reduced in other trees as part of gather
24196 // nodes and thus require an extract if fully vectorized in other trees.
24197 SmallPtrSet<Value *, 4> RequiredExtract;
24198 WeakTrackingVH VectorizedTree = nullptr;
24199 bool CheckForReusedReductionOps = false;
24200 // Try to vectorize elements based on their type.
24201 SmallVector<InstructionsState> States;
24202 for (ArrayRef<Value *> RV : ReducedVals)
24203 States.push_back(getSameOpcode(RV, TLI));
24204 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24205 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24206 InstructionsState S = States[I];
24207 SmallVector<Value *> Candidates;
24208 Candidates.reserve(2 * OrigReducedVals.size());
24209 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24210 for (Value *ReducedVal : OrigReducedVals) {
24211 Value *RdxVal = TrackedVals.at(ReducedVal);
24212 // Check if the reduction value was not overridden by the extractelement
24213 // instruction because of the vectorization and exclude it, if it is not
24214 // compatible with other values.
24215 // Also check if the instruction was folded to constant/other value.
24216 auto *Inst = dyn_cast<Instruction>(RdxVal);
24217 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24218 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24219 (S && !Inst))
24220 continue;
24221 Candidates.push_back(RdxVal);
24222 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24223 }
24224 bool ShuffledExtracts = false;
24225 // Try to handle shuffled extractelements.
24226 if (S && S.getOpcode() == Instruction::ExtractElement &&
24227 !S.isAltShuffle() && I + 1 < E) {
24228 SmallVector<Value *> CommonCandidates(Candidates);
24229 for (Value *RV : ReducedVals[I + 1]) {
24230 Value *RdxVal = TrackedVals.at(RV);
24231 // Check if the reduction value was not overridden by the
24232 // extractelement instruction because of the vectorization and
24233 // exclude it, if it is not compatible with other values.
24234 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24235 if (!Inst)
24236 continue;
24237 CommonCandidates.push_back(RdxVal);
24238 TrackedToOrig.try_emplace(RdxVal, RV);
24239 }
24240 SmallVector<int> Mask;
24241 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24242 ++I;
24243 Candidates.swap(CommonCandidates);
24244 ShuffledExtracts = true;
24245 }
24246 }
24247
24248 // Emit code for constant values.
24249 if (Candidates.size() > 1 && allConstant(Candidates)) {
24250 Value *Res = Candidates.front();
24251 Value *OrigV = TrackedToOrig.at(Candidates.front());
24252 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24253 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24254 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24255 Value *OrigV = TrackedToOrig.at(VC);
24256 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24257 if (auto *ResI = dyn_cast<Instruction>(Res))
24258 V.analyzedReductionRoot(ResI);
24259 }
24260 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24261 continue;
24262 }
24263
24264 unsigned NumReducedVals = Candidates.size();
24265 if (NumReducedVals < ReductionLimit &&
24266 (NumReducedVals < 2 || !isSplat(Candidates)))
24267 continue;
24268
24269 // Check if we support repeated scalar values processing (optimization of
24270 // original scalar identity operations on matched horizontal reductions).
24271 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24272 RdxKind != RecurKind::FMul &&
24273 RdxKind != RecurKind::FMulAdd;
24274 // Gather same values.
24275 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24276 if (IsSupportedHorRdxIdentityOp)
24277 for (Value *V : Candidates) {
24278 Value *OrigV = TrackedToOrig.at(V);
24279 ++SameValuesCounter.try_emplace(OrigV).first->second;
24280 }
24281 // Used to check if the reduced values are used the same number of times. In this
24282 // case the compiler may produce better code. E.g. if reduced values are
24283 // aabbccdd (8 x values), then the first node of the tree will have a node
24284 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24285 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24286 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
24287 // x abcd) * 2.
24288 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24289 // this analysis, other operations may require an extra estimation of
24290 // the profitability.
24291 bool SameScaleFactor = false;
24292 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24293 SameValuesCounter.size() != Candidates.size();
24294 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24295 if (OptReusedScalars) {
24296 SameScaleFactor =
24297 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24298 RdxKind == RecurKind::Xor) &&
24299 all_of(drop_begin(SameValuesCounter),
24300 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24301 return P.second == SameValuesCounter.front().second;
24302 });
24303 Candidates.resize(SameValuesCounter.size());
24304 transform(SameValuesCounter, Candidates.begin(),
24305 [&](const auto &P) { return TrackedVals.at(P.first); });
24306 NumReducedVals = Candidates.size();
24307 // Have a reduction of the same element.
24308 if (NumReducedVals == 1) {
24309 Value *OrigV = TrackedToOrig.at(Candidates.front());
24310 unsigned Cnt = At(SameValuesCounter, OrigV);
24311 Value *RedVal =
24312 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24313 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24314 VectorizedVals.try_emplace(OrigV, Cnt);
24315 ExternallyUsedValues.insert(OrigV);
24316 continue;
24317 }
24318 }
24319
24320 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24321 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24322 const unsigned MaxElts = std::clamp<unsigned>(
24323 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24324 RegMaxNumber * RedValsMaxNumber);
24325
24326 unsigned ReduxWidth = NumReducedVals;
24327 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24328 unsigned NumParts, NumRegs;
24329 Type *ScalarTy = Candidates.front()->getType();
24330 ReduxWidth =
24331 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24332 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24333 NumParts = ::getNumberOfParts(TTI, Tp);
24334 NumRegs =
24336 while (NumParts > NumRegs) {
24337 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24338 ReduxWidth = bit_floor(ReduxWidth - 1);
24339 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24340 NumParts = ::getNumberOfParts(TTI, Tp);
24341 NumRegs =
24343 }
24344 if (NumParts > NumRegs / 2)
24345 ReduxWidth = bit_floor(ReduxWidth);
24346 return ReduxWidth;
24347 };
24348 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24349 ReduxWidth = GetVectorFactor(ReduxWidth);
24350 ReduxWidth = std::min(ReduxWidth, MaxElts);
24351
24352 unsigned Start = 0;
24353 unsigned Pos = Start;
24354 // Restarts vectorization attempt with lower vector factor.
24355 unsigned PrevReduxWidth = ReduxWidth;
24356 bool CheckForReusedReductionOpsLocal = false;
24357 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24358 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24359 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24360 // Check if any of the reduction ops are gathered. If so, it is worth
24361 // trying again with a smaller number of reduction ops.
24362 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24363 }
24364 ++Pos;
24365 if (Pos < NumReducedVals - ReduxWidth + 1)
24366 return IsAnyRedOpGathered;
24367 Pos = Start;
24368 --ReduxWidth;
24369 if (ReduxWidth > 1)
24370 ReduxWidth = GetVectorFactor(ReduxWidth);
24371 return IsAnyRedOpGathered;
24372 };
24373 bool AnyVectorized = false;
24374 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24375 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24376 ReduxWidth >= ReductionLimit) {
24377 // Dependency in tree of the reduction ops - drop this attempt, try
24378 // later.
24379 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24380 Start == 0) {
24381 CheckForReusedReductionOps = true;
24382 break;
24383 }
24384 PrevReduxWidth = ReduxWidth;
24385 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24386 // Been analyzed already - skip.
24387 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24388 (!has_single_bit(ReduxWidth) &&
24389 (IgnoredCandidates.contains(
24390 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24391 IgnoredCandidates.contains(
24392 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24393 bit_floor(ReduxWidth))))) ||
24394 V.areAnalyzedReductionVals(VL)) {
24395 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24396 continue;
24397 }
24398 // Early exit if any of the reduction values were deleted during
24399 // previous vectorization attempts.
24400 if (any_of(VL, [&V](Value *RedVal) {
24401 auto *RedValI = dyn_cast<Instruction>(RedVal);
24402 return RedValI && V.isDeleted(RedValI);
24403 }))
24404 break;
24405 V.buildTree(VL, IgnoreList);
24406 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24407 if (!AdjustReducedVals())
24408 V.analyzedReductionVals(VL);
24409 continue;
24410 }
24411 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24412 if (!AdjustReducedVals())
24413 V.analyzedReductionVals(VL);
24414 continue;
24415 }
24416 V.reorderTopToBottom();
24417 // No need to reorder the root node at all for reassociative reduction.
24418 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24419 VL.front()->getType()->isIntOrIntVectorTy() ||
24420 ReductionLimit > 2);
24421 // Keep extracted other reduction values, if they are used in the
24422 // vectorization trees.
24423 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24424 ExternallyUsedValues);
24425 // The reduction root is used as the insertion point for new
24426 // instructions, so set it as externally used to prevent it from being
24427 // deleted.
24428 LocalExternallyUsedValues.insert(ReductionRoot);
24429 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24430 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24431 continue;
24432 for (Value *V : ReducedVals[Cnt])
24433 if (isa<Instruction>(V))
24434 LocalExternallyUsedValues.insert(TrackedVals[V]);
24435 }
24436 if (!IsSupportedHorRdxIdentityOp) {
24437 // Number of uses of the candidates in the vector of values.
24438 assert(SameValuesCounter.empty() &&
24439 "Reused values counter map is not empty");
24440 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24441 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24442 continue;
24443 Value *V = Candidates[Cnt];
24444 Value *OrigV = TrackedToOrig.at(V);
24445 ++SameValuesCounter.try_emplace(OrigV).first->second;
24446 }
24447 }
24448 V.transformNodes();
24449 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24450 // Gather externally used values.
24451 SmallPtrSet<Value *, 4> Visited;
24452 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24453 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24454 continue;
24455 Value *RdxVal = Candidates[Cnt];
24456 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24457 RdxVal = It->second;
24458 if (!Visited.insert(RdxVal).second)
24459 continue;
24460 // Check if the scalar was vectorized as part of the vectorization
24461 // tree but not the top node.
24462 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24463 LocalExternallyUsedValues.insert(RdxVal);
24464 continue;
24465 }
24466 Value *OrigV = TrackedToOrig.at(RdxVal);
24467 unsigned NumOps =
24468 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24469 if (NumOps != ReducedValsToOps.at(OrigV).size())
24470 LocalExternallyUsedValues.insert(RdxVal);
24471 }
24472 // Do not need the list of reused scalars in regular mode anymore.
24473 if (!IsSupportedHorRdxIdentityOp)
24474 SameValuesCounter.clear();
24475 for (Value *RdxVal : VL)
24476 if (RequiredExtract.contains(RdxVal))
24477 LocalExternallyUsedValues.insert(RdxVal);
24478 V.buildExternalUses(LocalExternallyUsedValues);
24479
24480 V.computeMinimumValueSizes();
24481
24482 // Estimate cost.
24483 InstructionCost ReductionCost =
24484 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24485 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24486 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24487 << " for reduction\n");
24488 if (!Cost.isValid())
24489 break;
24490 if (Cost >= -SLPCostThreshold) {
24491 V.getORE()->emit([&]() {
24492 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24493 ReducedValsToOps.at(VL[0]).front())
24494 << "Vectorizing horizontal reduction is possible "
24495 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24496 << " and threshold "
24497 << ore::NV("Threshold", -SLPCostThreshold);
24498 });
24499 if (!AdjustReducedVals()) {
24500 V.analyzedReductionVals(VL);
24501 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24502 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24503 // Add subvectors of VL to the list of the analyzed values.
24504 for (unsigned VF = getFloorFullVectorNumberOfElements(
24505 *TTI, VL.front()->getType(), ReduxWidth - 1);
24506 VF >= ReductionLimit;
24507 VF = getFloorFullVectorNumberOfElements(
24508 *TTI, VL.front()->getType(), VF - 1)) {
24509 if (has_single_bit(VF) &&
24510 V.getCanonicalGraphSize() != V.getTreeSize())
24511 continue;
24512 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24513 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24514 }
24515 }
24516 }
24517 continue;
24518 }
24519
24520 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24521 << Cost << ". (HorRdx)\n");
24522 V.getORE()->emit([&]() {
24523 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24524 ReducedValsToOps.at(VL[0]).front())
24525 << "Vectorized horizontal reduction with cost "
24526 << ore::NV("Cost", Cost) << " and with tree size "
24527 << ore::NV("TreeSize", V.getTreeSize());
24528 });
24529
24530 Builder.setFastMathFlags(RdxFMF);
24531
24532 // Emit a reduction. If the root is a select (min/max idiom), the insert
24533 // point is the compare condition of that select.
24534 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24535 Instruction *InsertPt = RdxRootInst;
24536 if (IsCmpSelMinMax)
24537 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24538
24539 // Vectorize a tree.
24540 Value *VectorizedRoot = V.vectorizeTree(
24541 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24542 // Update TrackedToOrig mapping, since the tracked values might be
24543 // updated.
24544 for (Value *RdxVal : Candidates) {
24545 Value *OrigVal = TrackedToOrig.at(RdxVal);
24546 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24547 if (TransformedRdxVal != RdxVal)
24548 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24549 }
24550
24551 Builder.SetInsertPoint(InsertPt);
24552
24553 // To prevent poison from leaking across what used to be sequential,
24554 // safe, scalar boolean logic operations, the reduction operand must be
24555 // frozen.
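// For example, the scalar form 'select i1 %b, i1 %c, i1 false' does not
// propagate poison from %c when %b is false, but a vectorized and-reduction
// combines every lane directly, so a possibly-poison operand must be frozen
// first.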
24556 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24557 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24558
24559 // Emit code to correctly handle reused reduced values, if required.
24560 if (OptReusedScalars && !SameScaleFactor) {
24561 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24562 SameValuesCounter, TrackedToOrig);
24563 }
24564
24565 Type *ScalarTy = VL.front()->getType();
24566 Type *VecTy = VectorizedRoot->getType();
24567 Type *RedScalarTy = VecTy->getScalarType();
24568 VectorValuesAndScales.emplace_back(
24569 VectorizedRoot,
24570 OptReusedScalars && SameScaleFactor
24571 ? SameValuesCounter.front().second
24572 : 1,
24573 RedScalarTy != ScalarTy->getScalarType()
24574 ? V.isSignedMinBitwidthRootNode()
24575 : true);
24576
24577 // Count vectorized reduced values to exclude them from final reduction.
24578 for (Value *RdxVal : VL) {
24579 Value *OrigV = TrackedToOrig.at(RdxVal);
24580 if (IsSupportedHorRdxIdentityOp) {
24581 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24582 continue;
24583 }
24584 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24585 if (!V.isVectorized(RdxVal))
24586 RequiredExtract.insert(RdxVal);
24587 }
24588 Pos += ReduxWidth;
24589 Start = Pos;
24590 ReduxWidth = NumReducedVals - Pos;
24591 if (ReduxWidth > 1)
24592 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24593 AnyVectorized = true;
24594 }
24595 if (OptReusedScalars && !AnyVectorized) {
24596 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24597 Value *RdxVal = TrackedVals.at(P.first);
24598 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24599 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24600 VectorizedVals.try_emplace(P.first, P.second);
24601 }
24602 continue;
24603 }
24604 }
24605 if (!VectorValuesAndScales.empty())
24606 VectorizedTree = GetNewVectorizedTree(
24607 VectorizedTree,
24608 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24609
24610 if (!VectorizedTree) {
24611 if (!CheckForReusedReductionOps) {
24612 for (ReductionOpsType &RdxOps : ReductionOps)
24613 for (Value *RdxOp : RdxOps)
24614 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24615 }
24616 return nullptr;
24617 }
24618
24619 // Reorder operands of bool logical op in the natural order to avoid
24620 // possible problems with poison propagation. If reordering is not possible
24621 // (both operands are originally RHS), emit an extra freeze instruction
24622 // for the LHS operand.
24623 // I.e., if we have original code like this:
24624 // RedOp1 = select i1 ?, i1 LHS, i1 false
24625 // RedOp2 = select i1 RHS, i1 ?, i1 false
24626
24627 // Then, we swap LHS/RHS to create a new op that matches the poison
24628 // semantics of the original code.
24629
24630 // If we have original code like this and both values could be poison:
24631 // RedOp1 = select i1 ?, i1 LHS, i1 false
24632 // RedOp2 = select i1 ?, i1 RHS, i1 false
24633
24634 // Then, we must freeze LHS in the new op.
24635 auto FixBoolLogicalOps =
24636 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24637 Instruction *RedOp2, bool InitStep) {
24638 if (!AnyBoolLogicOp)
24639 return;
24640 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24641 getRdxOperand(RedOp1, 0) == LHS ||
24642 isGuaranteedNotToBePoison(LHS, AC)))
24643 return;
24644 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24645 getRdxOperand(RedOp2, 0) == RHS ||
24646 isGuaranteedNotToBePoison(RHS, AC))) {
24647 std::swap(LHS, RHS);
24648 return;
24649 }
24650 if (LHS != VectorizedTree)
24651 LHS = Builder.CreateFreeze(LHS);
24652 };
24653 // Finish the reduction.
24654 // Need to add extra arguments and not vectorized possible reduction values.
24655 // Try to avoid dependencies between the scalar remainders after reductions.
24656 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24657 bool InitStep) {
24658 unsigned Sz = InstVals.size();
24659 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24660 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24661 Instruction *RedOp = InstVals[I + 1].first;
24662 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24663 Value *RdxVal1 = InstVals[I].second;
24664 Value *StableRdxVal1 = RdxVal1;
24665 auto It1 = TrackedVals.find(RdxVal1);
24666 if (It1 != TrackedVals.end())
24667 StableRdxVal1 = It1->second;
24668 Value *RdxVal2 = InstVals[I + 1].second;
24669 Value *StableRdxVal2 = RdxVal2;
24670 auto It2 = TrackedVals.find(RdxVal2);
24671 if (It2 != TrackedVals.end())
24672 StableRdxVal2 = It2->second;
24673 // To prevent poison from leaking across what used to be sequential,
24674 // safe, scalar boolean logic operations, the reduction operand must be
24675 // frozen.
24676 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24677 RedOp, InitStep);
24678 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24679 StableRdxVal2, "op.rdx", ReductionOps);
24680 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24681 }
24682 if (Sz % 2 == 1)
24683 ExtraReds[Sz / 2] = InstVals.back();
24684 return ExtraReds;
24685 };
24686 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24687 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24688 VectorizedTree);
24689 SmallPtrSet<Value *, 8> Visited;
24690 for (ArrayRef<Value *> Candidates : ReducedVals) {
24691 for (Value *RdxVal : Candidates) {
24692 if (!Visited.insert(RdxVal).second)
24693 continue;
24694 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24695 for (Instruction *RedOp :
24696 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24697 ExtraReductions.emplace_back(RedOp, RdxVal);
24698 }
24699 }
24700 // Iterate through all not-vectorized reduction values/extra arguments.
24701 bool InitStep = true;
24702 while (ExtraReductions.size() > 1) {
24703 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24704 FinalGen(ExtraReductions, InitStep);
24705 ExtraReductions.swap(NewReds);
24706 InitStep = false;
24707 }
24708 VectorizedTree = ExtraReductions.front().second;
24709
24710 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24711
24712 // The original scalar reduction is expected to have no remaining
24713 // uses outside the reduction tree itself. Assert that we got this
24714 // correct, replace internal uses with poison, and mark for eventual
24715 // deletion.
24716#ifndef NDEBUG
24717 SmallPtrSet<Value *, 4> IgnoreSet;
24718 for (ArrayRef<Value *> RdxOps : ReductionOps)
24719 IgnoreSet.insert_range(RdxOps);
24720#endif
24721 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24722 for (Value *Ignore : RdxOps) {
24723 if (!Ignore)
24724 continue;
24725#ifndef NDEBUG
24726 for (auto *U : Ignore->users()) {
24727 assert(IgnoreSet.count(U) &&
24728 "All users must be either in the reduction ops list.");
24729 }
24730#endif
24731 if (!Ignore->use_empty()) {
24732 Value *P = PoisonValue::get(Ignore->getType());
24733 Ignore->replaceAllUsesWith(P);
24734 }
24735 }
24736 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24737 }
24738 return VectorizedTree;
24739 }
24740
24741private:
24742 /// Creates the reduction from the given \p Vec vector value with the given
24743 /// scale \p Scale and signedness \p IsSigned.
24744 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24745 Value *Vec, unsigned Scale, bool IsSigned,
24746 Type *DestTy) {
24747 Value *Rdx;
24748 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24749 unsigned DestTyNumElements = getNumElements(VecTy);
24750 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24751 Rdx = PoisonValue::get(
24752 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24753 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24754 // Do reduction for each lane.
24755 // e.g., do reduce add for
24756 // VL[0] = <4 x Ty> <a, b, c, d>
24757 // VL[1] = <4 x Ty> <e, f, g, h>
24758 // Lane[0] = <2 x Ty> <a, e>
24759 // Lane[1] = <2 x Ty> <b, f>
24760 // Lane[2] = <2 x Ty> <c, g>
24761 // Lane[3] = <2 x Ty> <d, h>
24762 // result[0] = reduce add Lane[0]
24763 // result[1] = reduce add Lane[1]
24764 // result[2] = reduce add Lane[2]
24765 // result[3] = reduce add Lane[3]
24766 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24767 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24768 Rdx = Builder.CreateInsertElement(
24769 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24770 }
24771 } else {
24772 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24773 }
24774 if (Rdx->getType() != DestTy)
24775 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24776 // Improved analysis for add/fadd/xor reductions with same scale
24777 // factor for all operands of reductions. We can emit scalar ops for
24778 // them instead.
24779 if (Scale > 1)
24780 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24781 return Rdx;
24782 }
24783
24784 /// Calculate the cost of a reduction.
24785 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24786 ArrayRef<Value *> ReducedVals,
24787 bool IsCmpSelMinMax, FastMathFlags FMF,
24788 const BoUpSLP &R, DominatorTree &DT,
24789 const DataLayout &DL,
24790 const TargetLibraryInfo &TLI) {
24791 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24792 Type *ScalarTy = ReducedVals.front()->getType();
24793 unsigned ReduxWidth = ReducedVals.size();
24794 FixedVectorType *VectorTy = R.getReductionType();
24795 InstructionCost VectorCost = 0, ScalarCost;
24796 // If all of the reduced values are constant, the vector cost is 0, since
24797 // the reduction value can be calculated at compile time.
24798 bool AllConsts = allConstant(ReducedVals);
24799 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24800 InstructionCost Cost = 0;
24801 // Scalar cost is repeated for N-1 elements.
24802 int Cnt = ReducedVals.size();
24803 for (Value *RdxVal : ReducedVals) {
24804 if (Cnt == 1)
24805 break;
24806 --Cnt;
24807 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24808 Cost += GenCostFn();
24809 continue;
24810 }
24811 InstructionCost ScalarCost = 0;
24812 for (User *U : RdxVal->users()) {
24813 auto *RdxOp = cast<Instruction>(U);
24814 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24815 if (RdxKind == RecurKind::FAdd) {
24816 InstructionCost FMACost = canConvertToFMA(
24817 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24818 if (FMACost.isValid()) {
24819 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24820 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24821 // Also, exclude scalar fmul cost.
24822 InstructionCost FMulCost =
24823 TTI->getInstructionCost(I, CostKind);
24824 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24825 FMACost -= FMulCost;
24826 }
24827 ScalarCost += FMACost;
24828 continue;
24829 }
24830 }
24831 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24832 continue;
24833 }
24834 ScalarCost = InstructionCost::getInvalid();
24835 break;
24836 }
24837 if (ScalarCost.isValid())
24838 Cost += ScalarCost;
24839 else
24840 Cost += GenCostFn();
24841 }
24842 return Cost;
24843 };
24844 // Require reduction cost if:
24845 // 1. This type is not a full register type and there is no other vector
24846 // with the same type in the storage (first vector with a small type).
24847 // 2. The storage does not yet contain any vector with full register use
24848 // (this is the first vector with full register use).
24849 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24850 switch (RdxKind) {
24851 case RecurKind::Add:
24852 case RecurKind::Mul:
24853 case RecurKind::Or:
24854 case RecurKind::And:
24855 case RecurKind::Xor:
24856 case RecurKind::FAdd:
24857 case RecurKind::FMul: {
24858 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24859 if (!AllConsts) {
24860 if (DoesRequireReductionOp) {
24861 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24862 assert(SLPReVec && "FixedVectorType is not expected.");
24863 unsigned ScalarTyNumElements = VecTy->getNumElements();
24864 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24865 VectorCost += TTI->getShuffleCost(
24868 ReducedVals.size()),
24869 VectorTy,
24870 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24871 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24872 FMF, CostKind);
24873 }
24874 VectorCost += TTI->getScalarizationOverhead(
24875 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24876 /*Extract*/ false, TTI::TCK_RecipThroughput);
24877 } else {
24878 Type *RedTy = VectorTy->getElementType();
24879 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24880 std::make_pair(RedTy, true));
24881 if (RType == RedTy) {
24882 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24883 FMF, CostKind);
24884 } else {
24885 VectorCost = TTI->getExtendedReductionCost(
24886 RdxOpcode, !IsSigned, RedTy,
24887 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24888 }
24889 }
24890 } else {
24891 Type *RedTy = VectorTy->getElementType();
24892 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24893 std::make_pair(RedTy, true));
24894 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24895 InstructionCost FMACost = InstructionCost::getInvalid();
24896 if (RdxKind == RecurKind::FAdd) {
24897 // Check if the reduction operands can be converted to FMA.
24898 SmallVector<Value *> Ops;
24899 FastMathFlags FMF;
24900 FMF.set();
24901 for (Value *RdxVal : ReducedVals) {
24902 if (!RdxVal->hasOneUse()) {
24903 Ops.clear();
24904 break;
24905 }
24906 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24907 FMF &= FPCI->getFastMathFlags();
24908 Ops.push_back(RdxVal->user_back());
24909 }
24910 if (!Ops.empty()) {
24911 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24912 *TTI, TLI);
24913 if (FMACost.isValid()) {
24914 // Calculate actual FMAD cost.
24915 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
24916 {RVecTy, RVecTy, RVecTy}, FMF);
24917 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
24918
24919 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
24920 // Also, exclude vector fmul cost.
24921 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
24922 Instruction::FMul, RVecTy, CostKind);
24923 LLVM_DEBUG(dbgs()
24924 << "Minus vector FMul cost: " << FMulCost << "\n");
24925 FMACost -= FMulCost;
24926 }
24927 }
24928 }
24929 if (FMACost.isValid())
24930 VectorCost += FMACost;
24931 else
24932 VectorCost +=
24933 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24934 if (RType != RedTy) {
24935 unsigned Opcode = Instruction::Trunc;
24936 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24937 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24938 VectorCost += TTI->getCastInstrCost(
24939 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24940 }
24941 }
24942 }
24943 ScalarCost = EvaluateScalarCost([&]() {
24944 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24945 });
24946 break;
24947 }
24948 case RecurKind::FMax:
24949 case RecurKind::FMin:
24950 case RecurKind::FMaximum:
24951 case RecurKind::FMinimum:
24952 case RecurKind::SMax:
24953 case RecurKind::SMin:
24954 case RecurKind::UMax:
24955 case RecurKind::UMin: {
24956 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24957 if (!AllConsts) {
24958 if (DoesRequireReductionOp) {
24959 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24960 } else {
24961 // Check if the previous reduction already exists and account for it as
24962 // a series of operations + a single reduction.
24963 Type *RedTy = VectorTy->getElementType();
24964 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24965 std::make_pair(RedTy, true));
24966 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24967 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24968 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24969 if (RType != RedTy) {
24970 unsigned Opcode = Instruction::Trunc;
24971 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24972 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24973 VectorCost += TTI->getCastInstrCost(
24974 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24975 }
24976 }
24977 }
24978 ScalarCost = EvaluateScalarCost([&]() {
24979 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24980 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24981 });
24982 break;
24983 }
24984 default:
24985 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24986 }
24987
24988 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24989 << " for reduction of " << shortBundleName(ReducedVals)
24990 << " (It is a splitting reduction)\n");
24991 return VectorCost - ScalarCost;
24992 }
24993
24994 /// Splits the values stored in VectorValuesAndScales into registers/free
24995 /// sub-registers, combines them with the given reduction operation as a
24996 /// vector operation, and then performs a single (small enough) reduction.
24997 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24998 Type *DestTy) {
24999 Value *ReducedSubTree = nullptr;
25000 // Creates reduction and combines with the previous reduction.
25001 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25002 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25003 if (ReducedSubTree)
25004 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25005 "op.rdx", ReductionOps);
25006 else
25007 ReducedSubTree = Rdx;
25008 };
25009 if (VectorValuesAndScales.size() == 1) {
25010 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25011 CreateSingleOp(Vec, Scale, IsSigned);
25012 return ReducedSubTree;
25013 }
25014 // Scales Vec using the given Cnt scale factor and then combines it with
25015 // the previous value of VecRes.
25016 Value *VecRes = nullptr;
25017 bool VecResSignedness = false;
25018 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25019 Type *ScalarTy = Vec->getType()->getScalarType();
25020 // Scale Vec using given Cnt scale factor.
25021 if (Cnt > 1) {
25022 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25023 switch (RdxKind) {
25024 case RecurKind::Add: {
25025 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25026 unsigned VF = getNumElements(Vec->getType());
25027 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
25028 << ". (HorRdx)\n");
25029 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25030 for (unsigned I : seq<unsigned>(Cnt))
25031 std::iota(std::next(Mask.begin(), VF * I),
25032 std::next(Mask.begin(), VF * (I + 1)), 0);
25033 ++NumVectorInstructions;
25034 Vec = Builder.CreateShuffleVector(Vec, Mask);
25035 break;
25036 }
25037 // res = mul vv, n
25038 if (ScalarTy != DestTy->getScalarType())
25039 Vec = Builder.CreateIntCast(
25040 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25041 IsSigned);
25042 Value *Scale = ConstantVector::getSplat(
25043 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
25044 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25045 << ". (HorRdx)\n");
25046 ++NumVectorInstructions;
25047 Vec = Builder.CreateMul(Vec, Scale);
25048 break;
25049 }
25050 case RecurKind::Xor: {
25051 // res = n % 2 ? 0 : vv
25052 LLVM_DEBUG(dbgs()
25053 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25054 if (Cnt % 2 == 0)
25055 Vec = Constant::getNullValue(Vec->getType());
25056 break;
25057 }
25058 case RecurKind::FAdd: {
25059 // res = fmul v, n
25060 Value *Scale =
25061 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
25062 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
25063 << ". (HorRdx)\n");
25064 ++NumVectorInstructions;
25065 Vec = Builder.CreateFMul(Vec, Scale);
25066 break;
25067 }
25068 case RecurKind::And:
25069 case RecurKind::Or:
25070 case RecurKind::SMax:
25071 case RecurKind::SMin:
25072 case RecurKind::UMax:
25073 case RecurKind::UMin:
25074 case RecurKind::FMax:
25075 case RecurKind::FMin:
25076 case RecurKind::FMaximum:
25077 case RecurKind::FMinimum:
25078 // res = vv
25079 break;
25080 case RecurKind::Sub:
25081 case RecurKind::AddChainWithSubs:
25082 case RecurKind::Mul:
25083 case RecurKind::FMul:
25084 case RecurKind::FMulAdd:
25085 case RecurKind::AnyOf:
25086 case RecurKind::FindFirstIVSMin:
25087 case RecurKind::FindFirstIVUMin:
25088 case RecurKind::FindLastIVSMax:
25089 case RecurKind::FindLastIVUMax:
25090 case RecurKind::FMaxNum:
25091 case RecurKind::FMinNum:
25092 case RecurKind::FMaximumNum:
25093 case RecurKind::FMinimumNum:
25094 case RecurKind::None:
25095 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25096 }
25097 }
25098 // Combine Vec with the previous VecRes.
25099 if (!VecRes) {
25100 VecRes = Vec;
25101 VecResSignedness = IsSigned;
25102 } else {
25103 ++NumVectorInstructions;
25104 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25105 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
25106 // Handle ctpop.
25107 unsigned VecResVF = getNumElements(VecRes->getType());
25108 unsigned VecVF = getNumElements(Vec->getType());
25109 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
25110 std::iota(Mask.begin(), Mask.end(), 0);
25111 // Ensure that VecRes is always larger than Vec
25112 if (VecResVF < VecVF) {
25113 std::swap(VecRes, Vec);
25114 std::swap(VecResVF, VecVF);
25115 }
25116 if (VecResVF != VecVF) {
25117 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
25118 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25119 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
25120 }
25121 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
25122 return;
25123 }
25124 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
25125 VecRes = Builder.CreateIntCast(
25126 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
25127 VecResSignedness);
25128 if (ScalarTy != DestTy->getScalarType())
25129 Vec = Builder.CreateIntCast(
25130 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25131 IsSigned);
25132 unsigned VecResVF = getNumElements(VecRes->getType());
25133 unsigned VecVF = getNumElements(Vec->getType());
25134 // Ensure that VecRes is always larger than Vec
25135 if (VecResVF < VecVF) {
25136 std::swap(VecRes, Vec);
25137 std::swap(VecResVF, VecVF);
25138 }
25139 // extract + op + insert
25140 Value *Op = VecRes;
25141 if (VecResVF != VecVF)
25142 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25143 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25144 if (VecResVF != VecVF)
25145 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25146 VecRes = Op;
25147 }
25148 };
25149 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25150 CreateVecOp(Vec, Scale, IsSigned);
25151 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25152
25153 return ReducedSubTree;
25154 }
25155
25156 /// Emit a horizontal reduction of the vectorized value.
25157 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25158 const TargetTransformInfo *TTI, Type *DestTy) {
25159 assert(VectorizedValue && "Need to have a vectorized tree node");
25160 assert(RdxKind != RecurKind::FMulAdd &&
25161 "A call to the llvm.fmuladd intrinsic is not handled yet");
25162
25163 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25164 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25165 RdxKind == RecurKind::Add &&
25166 DestTy->getScalarType() != FTy->getScalarType()) {
25167 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25168 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
25169 Value *V = Builder.CreateBitCast(
25170 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25171 ++NumVectorInstructions;
25172 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25173 }
25174 ++NumVectorInstructions;
25175 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25176 }
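// Editor's note (illustrative sketch, not part of the original source): for
// the i1 special case handled above, a reduction such as
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// can instead be emitted as a population count over the mask bits:
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
// followed by a zext/trunc of %c to the destination type.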
25177
25178 /// Emits optimized code for unique scalar value reused \p Cnt times.
25179 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25180 unsigned Cnt) {
25181 assert(IsSupportedHorRdxIdentityOp &&
25182 "The optimization of matched scalar identity horizontal reductions "
25183 "must be supported.");
25184 if (Cnt == 1)
25185 return VectorizedValue;
25186 switch (RdxKind) {
25187 case RecurKind::Add: {
25188 // res = mul vv, n
25189 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25190 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25191 << VectorizedValue << ". (HorRdx)\n");
25192 return Builder.CreateMul(VectorizedValue, Scale);
25193 }
25194 case RecurKind::Xor: {
25195 // res = n % 2 ? 0 : vv
25196 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25197 << ". (HorRdx)\n");
25198 if (Cnt % 2 == 0)
25199 return Constant::getNullValue(VectorizedValue->getType());
25200 return VectorizedValue;
25201 }
25202 case RecurKind::FAdd: {
25203 // res = fmul v, n
25204 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25205 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25206 << VectorizedValue << ". (HorRdx)\n");
25207 return Builder.CreateFMul(VectorizedValue, Scale);
25208 }
25209 case RecurKind::And:
25210 case RecurKind::Or:
25211 case RecurKind::SMax:
25212 case RecurKind::SMin:
25213 case RecurKind::UMax:
25214 case RecurKind::UMin:
25215 case RecurKind::FMax:
25216 case RecurKind::FMin:
25217 case RecurKind::FMaximum:
25218 case RecurKind::FMinimum:
25219 // res = vv
25220 return VectorizedValue;
25221 case RecurKind::Sub:
25222 case RecurKind::AddChainWithSubs:
25223 case RecurKind::Mul:
25224 case RecurKind::FMul:
25225 case RecurKind::FMulAdd:
25226 case RecurKind::AnyOf:
25227 case RecurKind::FindFirstIVSMin:
25228 case RecurKind::FindFirstIVUMin:
25229 case RecurKind::FindLastIVSMax:
25230 case RecurKind::FindLastIVUMax:
25231 case RecurKind::FMaxNum:
25232 case RecurKind::FMinNum:
25233 case RecurKind::FMaximumNum:
25234 case RecurKind::FMinimumNum:
25235 case RecurKind::None:
25236 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25237 }
25238 return nullptr;
25239 }
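// Editor's note (illustrative, not part of the original source): examples of
// the scaling performed above for a scalar %v that appears Cnt times in the
// reduced sequence:
//   Add,  Cnt = 4: %v + %v + %v + %v  ==>  mul %v, 4
//   FAdd, Cnt = 4:                    ==>  fmul %v, 4.0
//   Xor,  Cnt = 4: %v ^ %v ^ %v ^ %v  ==>  0   (even count cancels out)
//   Xor,  Cnt = 3:                    ==>  %v  (odd count leaves one copy)
//   And/Or/Min/Max: the operation is idempotent, so the value is reused as is.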
25240
25241 /// Emits actual operation for the scalar identity values, found during
25242 /// horizontal reduction analysis.
25243 Value *
25244 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25245 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25246 const DenseMap<Value *, Value *> &TrackedToOrig) {
25247 assert(IsSupportedHorRdxIdentityOp &&
25248 "The optimization of matched scalar identity horizontal reductions "
25249 "must be supported.");
25250 ArrayRef<Value *> VL = R.getRootNodeScalars();
25251 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25252 if (VTy->getElementType() != VL.front()->getType()) {
25253 VectorizedValue = Builder.CreateIntCast(
25254 VectorizedValue,
25255 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25256 R.isSignedMinBitwidthRootNode());
25257 }
25258 switch (RdxKind) {
25259 case RecurKind::Add: {
25260 // root = mul prev_root, <1, 1, n, 1>
25261 SmallVector<Constant *> Vals;
25262 for (Value *V : VL) {
25263 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25264 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25265 }
25266 auto *Scale = ConstantVector::get(Vals);
25267 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25268 << VectorizedValue << ". (HorRdx)\n");
25269 return Builder.CreateMul(VectorizedValue, Scale);
25270 }
25271 case RecurKind::And:
25272 case RecurKind::Or:
25273 // No need for multiple or/and(s).
25274 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25275 << ". (HorRdx)\n");
25276 return VectorizedValue;
25277 case RecurKind::SMax:
25278 case RecurKind::SMin:
25279 case RecurKind::UMax:
25280 case RecurKind::UMin:
25281 case RecurKind::FMax:
25282 case RecurKind::FMin:
25283 case RecurKind::FMaximum:
25284 case RecurKind::FMinimum:
25285 // No need for multiple min/max(s) of the same value.
25286 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25287 << ". (HorRdx)\n");
25288 return VectorizedValue;
25289 case RecurKind::Xor: {
25290 // Replace values with even number of repeats with 0, since
25291 // x xor x = 0.
25292 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25293 // 7>, if the 4th and 6th elements have an even number of repeats.
25294 SmallVector<int> Mask(
25295 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25296 PoisonMaskElem);
25297 std::iota(Mask.begin(), Mask.end(), 0);
25298 bool NeedShuffle = false;
25299 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25300 Value *V = VL[I];
25301 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25302 if (Cnt % 2 == 0) {
25303 Mask[I] = VF;
25304 NeedShuffle = true;
25305 }
25306 }
25307 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25308 : Mask) dbgs()
25309 << I << " ";
25310 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25311 if (NeedShuffle)
25312 VectorizedValue = Builder.CreateShuffleVector(
25313 VectorizedValue,
25314 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25315 return VectorizedValue;
25316 }
25317 case RecurKind::FAdd: {
25318 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25319 SmallVector<Constant *> Vals;
25320 for (Value *V : VL) {
25321 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25322 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25323 }
25324 auto *Scale = ConstantVector::get(Vals);
25325 return Builder.CreateFMul(VectorizedValue, Scale);
25326 }
25327 case RecurKind::Sub:
25328 case RecurKind::AddChainWithSubs:
25329 case RecurKind::Mul:
25330 case RecurKind::FMul:
25331 case RecurKind::FMulAdd:
25332 case RecurKind::AnyOf:
25333 case RecurKind::FindFirstIVSMin:
25334 case RecurKind::FindFirstIVUMin:
25335 case RecurKind::FindLastIVSMax:
25336 case RecurKind::FindLastIVUMax:
25337 case RecurKind::FMaxNum:
25338 case RecurKind::FMinNum:
25339 case RecurKind::FMaximumNum:
25340 case RecurKind::FMinimumNum:
25341 case RecurKind::None:
25342 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25343 }
25344 return nullptr;
25345 }
25346};
25347} // end anonymous namespace
25348
25349/// Gets recurrence kind from the specified value.
25350static RecurKind getRdxKind(Value *V) {
25351 return HorizontalReduction::getRdxKind(V);
25352}
25353static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25354 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25355 return cast<FixedVectorType>(IE->getType())->getNumElements();
25356
25357 unsigned AggregateSize = 1;
25358 auto *IV = cast<InsertValueInst>(InsertInst);
25359 Type *CurrentType = IV->getType();
25360 do {
25361 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25362 for (auto *Elt : ST->elements())
25363 if (Elt != ST->getElementType(0)) // check homogeneity
25364 return std::nullopt;
25365 AggregateSize *= ST->getNumElements();
25366 CurrentType = ST->getElementType(0);
25367 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25368 AggregateSize *= AT->getNumElements();
25369 CurrentType = AT->getElementType();
25370 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25371 AggregateSize *= VT->getNumElements();
25372 return AggregateSize;
25373 } else if (CurrentType->isSingleValueType()) {
25374 return AggregateSize;
25375 } else {
25376 return std::nullopt;
25377 }
25378 } while (true);
25379}
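// Editor's note (illustrative, not part of the original source): for a
// homogeneous aggregate the helper above multiplies the element counts of all
// nesting levels, e.g.
//   [2 x {float, float}]       -> 2 * 2 = 4
//   {<2 x float>, <2 x float>} -> 2 * 2 = 4
//   {i32, float}               -> not homogeneous, returns std::nullopt.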
25380
25381static void findBuildAggregateRec(Instruction *LastInsertInst,
25382 TargetTransformInfo *TTI,
25383 SmallVectorImpl<Value *> &BuildVectorOpds,
25384 SmallVectorImpl<Value *> &InsertElts,
25385 unsigned OperandOffset, const BoUpSLP &R) {
25386 do {
25387 Value *InsertedOperand = LastInsertInst->getOperand(1);
25388 std::optional<unsigned> OperandIndex =
25389 getElementIndex(LastInsertInst, OperandOffset);
25390 if (!OperandIndex || R.isDeleted(LastInsertInst))
25391 return;
25392 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25393 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25394 BuildVectorOpds, InsertElts, *OperandIndex, R);
25395
25396 } else {
25397 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25398 InsertElts[*OperandIndex] = LastInsertInst;
25399 }
25400 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25401 } while (LastInsertInst != nullptr &&
25402 isa<InsertElementInst, InsertValueInst>(LastInsertInst) &&
25403 LastInsertInst->hasOneUse());
25404}
25405
25406/// Recognize construction of vectors like
25407/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25408/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25409/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25410/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25411/// starting from the last insertelement or insertvalue instruction.
25412///
25413/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25414/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25415/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25416///
25417/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25418///
25419/// \return true if it matches.
25420static bool findBuildAggregate(Instruction *LastInsertInst,
25421 TargetTransformInfo *TTI,
25422 SmallVectorImpl<Value *> &BuildVectorOpds,
25423 SmallVectorImpl<Value *> &InsertElts,
25424 const BoUpSLP &R) {
25425
25426 assert((isa<InsertElementInst>(LastInsertInst) ||
25427 isa<InsertValueInst>(LastInsertInst)) &&
25428 "Expected insertelement or insertvalue instruction!");
25429
25430 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25431 "Expected empty result vectors!");
25432
25433 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25434 if (!AggregateSize)
25435 return false;
25436 BuildVectorOpds.resize(*AggregateSize);
25437 InsertElts.resize(*AggregateSize);
25438
25439 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25440 llvm::erase(BuildVectorOpds, nullptr);
25441 llvm::erase(InsertElts, nullptr);
25442 if (BuildVectorOpds.size() >= 2)
25443 return true;
25444
25445 return false;
25446}
25447
25448/// Try and get a reduction instruction from a phi node.
25449///
25450/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25451/// if they come from either \p ParentBB or a containing loop latch.
25452///
25453/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25454/// if not possible.
25455static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25456 BasicBlock *ParentBB, LoopInfo *LI) {
25457 // There are situations where the reduction value is not dominated by the
25458 // reduction phi. Vectorizing such cases has been reported to cause
25459 // miscompiles. See PR25787.
25460 auto DominatedReduxValue = [&](Value *R) {
25461 return isa<Instruction>(R) &&
25462 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25463 };
25464
25465 Instruction *Rdx = nullptr;
25466
25467 // Return the incoming value if it comes from the same BB as the phi node.
25468 if (P->getIncomingBlock(0) == ParentBB) {
25469 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25470 } else if (P->getIncomingBlock(1) == ParentBB) {
25471 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25472 }
25473
25474 if (Rdx && DominatedReduxValue(Rdx))
25475 return Rdx;
25476
25477 // Otherwise, check whether we have a loop latch to look at.
25478 Loop *BBL = LI->getLoopFor(ParentBB);
25479 if (!BBL)
25480 return nullptr;
25481 BasicBlock *BBLatch = BBL->getLoopLatch();
25482 if (!BBLatch)
25483 return nullptr;
25484
25485 // There is a loop latch, return the incoming value if it comes from
25486 // that. This reduction pattern occasionally turns up.
25487 if (P->getIncomingBlock(0) == BBLatch) {
25488 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25489 } else if (P->getIncomingBlock(1) == BBLatch) {
25490 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25491 }
25492
25493 if (Rdx && DominatedReduxValue(Rdx))
25494 return Rdx;
25495
25496 return nullptr;
25497}
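// Editor's note (illustrative, not part of the original source): the shape
// matched above is a two-incoming reduction PHI, e.g.
//   loop:
//     %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
//     ...
//     %sum.next = add i32 %sum, %x
// where %sum.next is returned as the candidate because it is the incoming
// value from the PHI's own block (or from the enclosing loop latch) and its
// parent block is dominated by the PHI's block.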
25498
25499static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25500 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25501 return true;
25502 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25503 return true;
25504 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25505 return true;
25506 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25507 return true;
25508 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25509 return true;
25510 if (match(I, m_SMax(m_Value(V0), m_Value(V1))))
25511 return true;
25512 if (match(I, m_SMin(m_Value(V0), m_Value(V1))))
25513 return true;
25514 if (match(I, m_UMax(m_Value(V0), m_Value(V1))))
25515 return true;
25516 if (match(I, m_UMin(m_Value(V0), m_Value(V1))))
25517 return true;
25518 return false;
25519}
25520
25521/// We could have an initial reduction that is not an add.
25522/// r *= v1 + v2 + v3 + v4
25523/// In such a case start looking for a tree rooted in the first '+'.
25524/// \Returns the new root if found, which may be nullptr if not an instruction.
25525static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25526 Instruction *Root) {
25527 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25528 isa<IntrinsicInst>(Root)) &&
25529 "Expected binop, select, or intrinsic for reduction matching");
25530 Value *LHS =
25531 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25532 Value *RHS =
25533 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25534 if (LHS == Phi)
25535 return dyn_cast<Instruction>(RHS);
25536 if (RHS == Phi)
25537 return dyn_cast<Instruction>(LHS);
25538 return nullptr;
25539}
25540
25541/// \p Returns the first operand of \p I that does not match \p Phi. If
25542/// operand is not an instruction it returns nullptr.
25543static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25544 Value *Op0 = nullptr;
25545 Value *Op1 = nullptr;
25546 if (!matchRdxBop(I, Op0, Op1))
25547 return nullptr;
25548 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25549}
25550
25551/// \Returns true if \p I is a candidate instruction for reduction vectorization.
25552static bool isReductionCandidate(Instruction *I) {
25553 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25554 Value *B0 = nullptr, *B1 = nullptr;
25555 bool IsBinop = matchRdxBop(I, B0, B1);
25556 return IsBinop || IsSelect;
25557}
25558
25559bool SLPVectorizerPass::vectorizeHorReduction(
25560 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25561 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25562 if (!ShouldVectorizeHor)
25563 return false;
25564 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25565
25566 if (Root->getParent() != BB || isa<PHINode>(Root))
25567 return false;
25568
25569 // If we can find a secondary reduction root, use that instead.
25570 auto SelectRoot = [&]() {
25571 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25572 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25573 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25574 return NewRoot;
25575 return Root;
25576 };
25577
25578 // Start the analysis from the Root instruction. If a horizontal reduction is
25579 // found, try to vectorize it. If it is not a horizontal reduction or
25580 // vectorization is not possible or not effective, and currently analyzed
25581 // instruction is a binary operation, try to vectorize the operands, using
25582 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25583 // the same procedure considering each operand as a possible root of the
25584 // horizontal reduction.
25585 // Interrupt the process if the Root instruction itself was vectorized or all
25586 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
25587 // If a horizontal reduction was not matched or vectorized, we collect
25588 // instructions for possible later vectorization attempts.
25589 std::queue<std::pair<Instruction *, unsigned>> Stack;
25590 Stack.emplace(SelectRoot(), 0);
25591 SmallPtrSet<Value *, 8> VisitedInstrs;
25592 bool Res = false;
25593 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25594 if (R.isAnalyzedReductionRoot(Inst))
25595 return nullptr;
25596 if (!isReductionCandidate(Inst))
25597 return nullptr;
25598 HorizontalReduction HorRdx;
25599 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25600 return nullptr;
25601 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25602 };
25603 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25604 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25605 FutureSeed = getNonPhiOperand(Root, P);
25606 if (!FutureSeed)
25607 return false;
25608 }
25609 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25610 // analysis is done separately.
25611 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25612 PostponedInsts.push_back(FutureSeed);
25613 return true;
25614 };
25615
25616 while (!Stack.empty()) {
25617 Instruction *Inst;
25618 unsigned Level;
25619 std::tie(Inst, Level) = Stack.front();
25620 Stack.pop();
25621 // Do not try to analyze instruction that has already been vectorized.
25622 // This may happen when we vectorize instruction operands on a previous
25623 // iteration while stack was populated before that happened.
25624 if (R.isDeleted(Inst))
25625 continue;
25626 if (Value *VectorizedV = TryToReduce(Inst)) {
25627 Res = true;
25628 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25629 // Try to find another reduction.
25630 Stack.emplace(I, Level);
25631 continue;
25632 }
25633 if (R.isDeleted(Inst))
25634 continue;
25635 } else {
25636 // We could not vectorize `Inst` so try to use it as a future seed.
25637 if (!TryAppendToPostponedInsts(Inst)) {
25638 assert(Stack.empty() && "Expected empty stack");
25639 break;
25640 }
25641 }
25642
25643 // Try to vectorize operands.
25644 // Continue analysis for the instruction from the same basic block only to
25645 // save compile time.
25646 if (++Level < RecursionMaxDepth)
25647 for (auto *Op : Inst->operand_values())
25648 if (VisitedInstrs.insert(Op).second)
25649 if (auto *I = dyn_cast<Instruction>(Op))
25650 // Do not try to vectorize CmpInst operands, this is done
25651 // separately.
25652 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
25653 !R.isDeleted(I) && I->getParent() == BB)
25654 Stack.emplace(I, Level);
25655 }
25656 return Res;
25657}
25658
25659bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25660 if (!I)
25661 return false;
25662
25663 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25664 return false;
25665 // Skip potential FMA candidates.
25666 if ((I->getOpcode() == Instruction::FAdd ||
25667 I->getOpcode() == Instruction::FSub) &&
25668 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25669 .isValid())
25670 return false;
25671
25672 Value *P = I->getParent();
25673
25674 // Vectorize in current basic block only.
25675 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25676 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25677 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25678 R.isDeleted(Op0) || R.isDeleted(Op1))
25679 return false;
25680
25681 // First collect all possible candidates
25682 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25683 Candidates.emplace_back(Op0, Op1);
25684
25685 auto *A = dyn_cast<BinaryOperator>(Op0);
25686 auto *B = dyn_cast<BinaryOperator>(Op1);
25687 // Try to skip B.
25688 if (A && B && B->hasOneUse()) {
25689 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25690 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25691 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25692 Candidates.emplace_back(A, B0);
25693 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25694 Candidates.emplace_back(A, B1);
25695 }
25696 // Try to skip A.
25697 if (B && A && A->hasOneUse()) {
25698 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25699 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25700 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25701 Candidates.emplace_back(A0, B);
25702 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25703 Candidates.emplace_back(A1, B);
25704 }
25705
25706 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25707 ArrayRef<Value *> Ops) {
25708 if (!isReductionCandidate(Inst))
25709 return false;
25710 Type *Ty = Inst->getType();
25711 if (!isValidElementType(Ty) || Ty->isPointerTy())
25712 return false;
25713 HorizontalReduction HorRdx(Inst, Ops);
25714 if (!HorRdx.matchReductionForOperands())
25715 return false;
25716 // Check the cost of operations.
25717 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25718 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25719 InstructionCost ScalarCost =
25720 TTI.getScalarizationOverhead(
25721 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25722 /*Extract=*/true, CostKind) +
25723 TTI.getInstructionCost(Inst, CostKind);
25724 InstructionCost RedCost;
25725 switch (::getRdxKind(Inst)) {
25726 case RecurKind::Add:
25727 case RecurKind::Mul:
25728 case RecurKind::Or:
25729 case RecurKind::And:
25730 case RecurKind::Xor:
25731 case RecurKind::FAdd:
25732 case RecurKind::FMul: {
25733 FastMathFlags FMF;
25734 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25735 FMF = FPCI->getFastMathFlags();
25736 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25737 CostKind);
25738 break;
25739 }
25740 default:
25741 return false;
25742 }
25743 if (RedCost >= ScalarCost)
25744 return false;
25745
25746 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25747 };
25748 if (Candidates.size() == 1)
25749 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25750
25751 // We have multiple options. Try to pick the single best.
25752 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25753 if (!BestCandidate)
25754 return false;
25755 return (*BestCandidate == 0 &&
25756 TryToReduce(I, {Candidates[*BestCandidate].first,
25757 Candidates[*BestCandidate].second})) ||
25758 tryToVectorizeList({Candidates[*BestCandidate].first,
25759 Candidates[*BestCandidate].second},
25760 R);
25761}
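// Editor's note (illustrative, not part of the original source): for a root
//   %i = add %A, %B   ; where %A and %B are themselves binary operators
// the candidate pairs built above are (%A, %B) plus, when %B (resp. %A) has a
// single use and its operands are binary operators in the same block, the
// "skip" pairs (%A, %B0), (%A, %B1), (%A0, %B) and (%A1, %B); findBestRootPair
// then selects the most promising pair to seed a reduction or an SLP tree.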
25762
25763bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25764 BasicBlock *BB, BoUpSLP &R) {
25765 SmallVector<WeakTrackingVH> PostponedInsts;
25766 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25767 Res |= tryToVectorize(PostponedInsts, R);
25768 return Res;
25769}
25770
25771bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25772 BoUpSLP &R) {
25773 bool Res = false;
25774 for (Value *V : Insts)
25775 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25776 Res |= tryToVectorize(Inst, R);
25777 return Res;
25778}
25779
25780bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25781 BasicBlock *BB, BoUpSLP &R,
25782 bool MaxVFOnly) {
25783 if (!R.canMapToVector(IVI->getType()))
25784 return false;
25785
25786 SmallVector<Value *, 16> BuildVectorOpds;
25787 SmallVector<Value *, 16> BuildVectorInsts;
25788 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25789 return false;
25790
25791 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25792 R.getORE()->emit([&]() {
25793 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25794 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25795 "trying reduction first.";
25796 });
25797 return false;
25798 }
25799 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25800 // An aggregate value is unlikely to be processed in a vector register.
25801 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25802}
25803
25804bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25805 BasicBlock *BB, BoUpSLP &R,
25806 bool MaxVFOnly) {
25807 SmallVector<Value *, 16> BuildVectorInsts;
25808 SmallVector<Value *, 16> BuildVectorOpds;
25809 SmallVector<int> Mask;
25810 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25811 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25812 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25813 return false;
25814
25815 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25816 R.getORE()->emit([&]() {
25817 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25818 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25819 "trying reduction first.";
25820 });
25821 return false;
25822 }
25823 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25824 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25825}
25826
25827template <typename T>
25828static bool tryToVectorizeSequence(
25829 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25830 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25831 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25832 bool MaxVFOnly, BoUpSLP &R) {
25833 bool Changed = false;
25834 // Sort by type, parent, operands.
25835 stable_sort(Incoming, Comparator);
25836
25837 // Try to vectorize elements based on their type.
25838 SmallVector<T *> Candidates;
25839 SmallVector<T *> VL;
25840 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25841 VL.clear()) {
25842 // Look for the next elements with the same type, parent and operand
25843 // kinds.
25844 auto *I = dyn_cast<Instruction>(*IncIt);
25845 if (!I || R.isDeleted(I)) {
25846 ++IncIt;
25847 continue;
25848 }
25849 auto *SameTypeIt = IncIt;
25850 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25851 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25852 AreCompatible(VL, *SameTypeIt))) {
25853 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25854 ++SameTypeIt;
25855 if (I && !R.isDeleted(I))
25856 VL.push_back(cast<T>(I));
25857 }
25858
25859 // Try to vectorize them.
25860 unsigned NumElts = VL.size();
25861 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25862 << NumElts << ")\n");
25863 // The vectorization is a 3-state attempt:
25864 // 1. Try to vectorize instructions with the same/alternate opcodes with the
25865 // size of maximal register at first.
25866 // 2. Try to vectorize remaining instructions with the same type, if
25867 // possible. This may give better vectorization results than trying to
25868 // vectorize only instructions with the same/alternate opcodes.
25869 // 3. Final attempt to try to vectorize all instructions with the
25870 // same/alternate ops only, this may result in some extra final
25871 // vectorization.
25872 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25873 // Success; start over because instructions might have been changed.
25874 Changed = true;
25875 VL.swap(Candidates);
25876 Candidates.clear();
25877 for (T *V : VL) {
25878 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25879 Candidates.push_back(V);
25880 }
25881 } else {
25882 /// \Returns the minimum number of elements that we will attempt to
25883 /// vectorize.
25884 auto GetMinNumElements = [&R](Value *V) {
25885 unsigned EltSize = R.getVectorElementSize(V);
25886 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25887 };
25888 if (NumElts < GetMinNumElements(*IncIt) &&
25889 (Candidates.empty() ||
25890 Candidates.front()->getType() == (*IncIt)->getType())) {
25891 for (T *V : VL) {
25892 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25893 Candidates.push_back(V);
25894 }
25895 }
25896 }
25897 // Final attempt to vectorize instructions with the same types.
25898 if (Candidates.size() > 1 &&
25899 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25900 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25901 // Success; start over because instructions might have been changed.
25902 Changed = true;
25903 } else if (MaxVFOnly) {
25904 // Try to vectorize using small vectors.
25905 SmallVector<T *> VL;
25906 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25907 VL.clear()) {
25908 auto *I = dyn_cast<Instruction>(*It);
25909 if (!I || R.isDeleted(I)) {
25910 ++It;
25911 continue;
25912 }
25913 auto *SameTypeIt = It;
25914 while (SameTypeIt != End &&
25915 (!isa<Instruction>(*SameTypeIt) ||
25916 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25917 AreCompatible(*SameTypeIt, *It))) {
25918 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25919 ++SameTypeIt;
25920 if (I && !R.isDeleted(I))
25921 VL.push_back(cast<T>(I));
25922 }
25923 unsigned NumElts = VL.size();
25924 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25925 /*MaxVFOnly=*/false))
25926 Changed = true;
25927 It = SameTypeIt;
25928 }
25929 }
25930 Candidates.clear();
25931 }
25932
25933 // Start over at the next instruction of a different type (or the end).
25934 IncIt = SameTypeIt;
25935 }
25936 return Changed;
25937}
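// Editor's note (illustrative, not part of the original source): after the
// stable sort above, compatible values become adjacent, so the scan partitions
// the worklist into runs such as
//   [ phi.i32.a, phi.i32.b, phi.i32.c | phi.float.x, phi.float.y ]
// Each run is first offered to TryToVectorizeHelper with MaxVFOnly=true; the
// leftovers of the same type are retried with MaxVFOnly=false, possibly split
// into smaller groups.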
25938
25939/// Compare two cmp instructions. If IsCompatibility is true, function returns
25940/// true if 2 cmps have same/swapped predicates and most compatible corresponding
25941/// operands. If IsCompatibility is false, function implements strict weak
25942/// ordering relation between two cmp instructions, returning true if the first
25943/// instruction is "less" than the second, i.e. its predicate is less than the
25944/// predicate of the second or the operands IDs are less than the operands IDs
25945/// of the second cmp instruction.
25946template <bool IsCompatibility>
25947static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25948 const DominatorTree &DT) {
25949 assert(isValidElementType(V->getType()) &&
25950 isValidElementType(V2->getType()) &&
25951 "Expected valid element types only.");
25952 if (V == V2)
25953 return IsCompatibility;
25954 auto *CI1 = cast<CmpInst>(V);
25955 auto *CI2 = cast<CmpInst>(V2);
25956 if (CI1->getOperand(0)->getType()->getTypeID() <
25957 CI2->getOperand(0)->getType()->getTypeID())
25958 return !IsCompatibility;
25959 if (CI1->getOperand(0)->getType()->getTypeID() >
25960 CI2->getOperand(0)->getType()->getTypeID())
25961 return false;
25962 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25963 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25964 return !IsCompatibility;
25965 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25966 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25967 return false;
25968 CmpInst::Predicate Pred1 = CI1->getPredicate();
25969 CmpInst::Predicate Pred2 = CI2->getPredicate();
25970 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
25971 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
25972 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25973 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25974 if (BasePred1 < BasePred2)
25975 return !IsCompatibility;
25976 if (BasePred1 > BasePred2)
25977 return false;
25978 // Compare operands.
25979 bool CI1Preds = Pred1 == BasePred1;
25980 bool CI2Preds = Pred2 == BasePred1;
25981 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25982 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25983 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25984 if (Op1 == Op2)
25985 continue;
25986 if (Op1->getValueID() < Op2->getValueID())
25987 return !IsCompatibility;
25988 if (Op1->getValueID() > Op2->getValueID())
25989 return false;
25990 if (auto *I1 = dyn_cast<Instruction>(Op1))
25991 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25992 if (IsCompatibility) {
25993 if (I1->getParent() != I2->getParent())
25994 return false;
25995 } else {
25996 // Try to compare nodes with same parent.
25997 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25998 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25999 if (!NodeI1)
26000 return NodeI2 != nullptr;
26001 if (!NodeI2)
26002 return false;
26003 assert((NodeI1 == NodeI2) ==
26004 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26005 "Different nodes should have different DFS numbers");
26006 if (NodeI1 != NodeI2)
26007 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26008 }
26009 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26010 if (S && (IsCompatibility || !S.isAltShuffle()))
26011 continue;
26012 if (IsCompatibility)
26013 return false;
26014 if (I1->getOpcode() != I2->getOpcode())
26015 return I1->getOpcode() < I2->getOpcode();
26016 }
26017 }
26018 return IsCompatibility;
26019}
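// Editor's note (illustrative, not part of the original source): with
// IsCompatibility == true the comparator above treats swapped forms as equal,
// so
//   %c1 = icmp slt i32 %a, %b
//   %c2 = icmp sgt i32 %b, %a
// are considered compatible (same base predicate after swapping and matching
// operands), whereas cmps over operands of different value kinds or from
// different blocks are not.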
26020
26021template <typename ItT>
26022bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26023 BasicBlock *BB, BoUpSLP &R) {
26024 bool Changed = false;
26025 // Try to find reductions first.
26026 for (CmpInst *I : CmpInsts) {
26027 if (R.isDeleted(I))
26028 continue;
26029 for (Value *Op : I->operands())
26030 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26031 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26032 if (R.isDeleted(I))
26033 break;
26034 }
26035 }
26036 // Try to vectorize operands as vector bundles.
26037 for (CmpInst *I : CmpInsts) {
26038 if (R.isDeleted(I))
26039 continue;
26040 Changed |= tryToVectorize(I, R);
26041 }
26042 // Try to vectorize list of compares.
26043 // Sort by type, compare predicate, etc.
26044 auto CompareSorter = [&](Value *V, Value *V2) {
26045 if (V == V2)
26046 return false;
26047 return compareCmp<false>(V, V2, *TLI, *DT);
26048 };
26049
26050 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
26051 if (VL.empty() || VL.back() == V1)
26052 return true;
26053 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
26054 };
26055
26056 SmallVector<Value *> Vals;
26057 for (Instruction *V : CmpInsts)
26058 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
26059 Vals.push_back(V);
26060 if (Vals.size() <= 1)
26061 return Changed;
26062 Changed |= tryToVectorizeSequence<Value>(
26063 Vals, CompareSorter, AreCompatibleCompares,
26064 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26065 // Exclude possible reductions from other blocks.
26066 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26067 return any_of(V->users(), [V](User *U) {
26068 auto *Select = dyn_cast<SelectInst>(U);
26069 return Select &&
26070 Select->getParent() != cast<Instruction>(V)->getParent();
26071 });
26072 });
26073 if (ArePossiblyReducedInOtherBlock)
26074 return false;
26075 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26076 },
26077 /*MaxVFOnly=*/true, R);
26078 return Changed;
26079}
26080
26081bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
26082 BasicBlock *BB, BoUpSLP &R) {
26084 "This function only accepts Insert instructions");
26085 bool OpsChanged = false;
26086 SmallVector<WeakTrackingVH> PostponedInsts;
26087 for (auto *I : reverse(Instructions)) {
26088 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
26089 if (R.isDeleted(I) || isa<CmpInst>(I))
26090 continue;
26091 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26092 OpsChanged |=
26093 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
26094 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26095 OpsChanged |=
26096 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
26097 }
26098 // pass2 - try to vectorize reductions only
26099 if (R.isDeleted(I))
26100 continue;
26101 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26102 if (R.isDeleted(I) || isa<CmpInst>(I))
26103 continue;
26104 // pass3 - try to match and vectorize a buildvector sequence.
26105 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26106 OpsChanged |=
26107 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
26108 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26109 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26110 /*MaxVFOnly=*/false);
26111 }
26112 }
26113 // Now try to vectorize postponed instructions.
26114 OpsChanged |= tryToVectorize(PostponedInsts, R);
26115
26116 Instructions.clear();
26117 return OpsChanged;
26118}
26119
26120bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26121 bool Changed = false;
26122 SmallVector<Value *, 4> Incoming;
26123 SmallPtrSet<Value *, 16> VisitedInstrs;
26124 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
26125 // node. This helps to identify the chains that can be vectorized in a
26126 // better way.
26127 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26128 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
26129 assert(isValidElementType(V1->getType()) &&
26130 isValidElementType(V2->getType()) &&
26131 "Expected vectorizable types only.");
26132 if (V1 == V2)
26133 return false;
26134 // It is fine to compare type IDs here, since we expect only vectorizable
26135 // types, like ints, floats and pointers, we don't care about other type.
26136 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
26137 return true;
26138 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
26139 return false;
26140 if (V1->getType()->getScalarSizeInBits() <
26141 V2->getType()->getScalarSizeInBits())
26142 return true;
26143 if (V1->getType()->getScalarSizeInBits() >
26144 V2->getType()->getScalarSizeInBits())
26145 return false;
26146 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26147 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26148 if (Opcodes1.size() < Opcodes2.size())
26149 return true;
26150 if (Opcodes1.size() > Opcodes2.size())
26151 return false;
26152 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26153 {
26154 // Instructions come first.
26155 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26156 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26157 if (I1 && I2) {
26158 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26159 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26160 if (!NodeI1)
26161 return NodeI2 != nullptr;
26162 if (!NodeI2)
26163 return false;
26164 assert((NodeI1 == NodeI2) ==
26165 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26166 "Different nodes should have different DFS numbers");
26167 if (NodeI1 != NodeI2)
26168 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26169 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26170 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26171 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26172 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26173 if (!E1 || !E2)
26174 continue;
26175
26176 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26177 // program order of the vector operands.
26178 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26179 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26180 if (V1 != V2) {
26181 if (V1 && !V2)
26182 return true;
26183 if (!V1 && V2)
26184 return false;
26185 DomTreeNodeBase<BasicBlock> *NodeI1 =
26186 DT->getNode(V1->getParent());
26187 DomTreeNodeBase<BasicBlock> *NodeI2 =
26188 DT->getNode(V2->getParent());
26189 if (!NodeI1)
26190 return NodeI2 != nullptr;
26191 if (!NodeI2)
26192 return false;
26193 assert((NodeI1 == NodeI2) ==
26194 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26195 "Different nodes should have different DFS numbers");
26196 if (NodeI1 != NodeI2)
26197 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26198 return V1->comesBefore(V2);
26199 }
26200 // If we have the same vector operand, try to sort by constant
26201 // index.
26202 std::optional<unsigned> Id1 = getExtractIndex(E1);
26203 std::optional<unsigned> Id2 = getExtractIndex(E2);
26204 // Bring constants to the top
26205 if (Id1 && !Id2)
26206 return true;
26207 if (!Id1 && Id2)
26208 return false;
26209 // First elements come first.
26210 if (Id1 && Id2)
26211 return *Id1 < *Id2;
26212
26213 continue;
26214 }
26215 if (I1->getOpcode() == I2->getOpcode())
26216 continue;
26217 return I1->getOpcode() < I2->getOpcode();
26218 }
26219 if (I1)
26220 return true;
26221 if (I2)
26222 return false;
26223 }
26224 {
26225 // Non-undef constants come next.
26226 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26227 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26228 if (C1 && C2)
26229 continue;
26230 if (C1)
26231 return true;
26232 if (C2)
26233 return false;
26234 }
26235 bool U1 = isa<UndefValue>(Opcodes1[I]);
26236 bool U2 = isa<UndefValue>(Opcodes2[I]);
26237 {
26238 // Non-constant non-instructions come next.
26239 if (!U1 && !U2) {
26240 auto ValID1 = Opcodes1[I]->getValueID();
26241 auto ValID2 = Opcodes2[I]->getValueID();
26242 if (ValID1 == ValID2)
26243 continue;
26244 if (ValID1 < ValID2)
26245 return true;
26246 if (ValID1 > ValID2)
26247 return false;
26248 }
26249 if (!U1)
26250 return true;
26251 if (!U2)
26252 return false;
26253 }
26254 // Undefs come last.
26255 assert(U1 && U2 && "The only thing left should be undef & undef.");
26256 }
26257 return false;
26258 };
26259 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26260 Value *V1) {
26261 if (VL.empty() || V1 == VL.back())
26262 return true;
26263 Value *V2 = VL.back();
26264 if (V1->getType() != V2->getType())
26265 return false;
26266 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26267 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26268 if (Opcodes1.size() != Opcodes2.size())
26269 return false;
26270 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26271 // Undefs are compatible with any other value.
26272 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26273 continue;
26274 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26275 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26276 if (R.isDeleted(I1) || R.isDeleted(I2))
26277 return false;
26278 if (I1->getParent() != I2->getParent())
26279 return false;
26280 if (getSameOpcode({I1, I2}, *TLI))
26281 continue;
26282 return false;
26283 }
26284 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26285 continue;
26286 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26287 return false;
26288 }
26289 return true;
26290 };
26291
26292 bool HaveVectorizedPhiNodes = false;
26293 do {
26294 // Collect the incoming values from the PHIs.
26295 Incoming.clear();
26296 for (Instruction &I : *BB) {
26297 auto *P = dyn_cast<PHINode>(&I);
26298 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26299 break;
26300
26301 // No need to analyze deleted, vectorized and non-vectorizable
26302 // instructions.
26303 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26304 isValidElementType(P->getType()))
26305 Incoming.push_back(P);
26306 }
26307
26308 if (Incoming.size() <= 1)
26309 break;
26310
26311 // Find the corresponding non-phi nodes for better matching when trying to
26312 // build the tree.
26313 for (Value *V : Incoming) {
26314 SmallVectorImpl<Value *> &Opcodes =
26315 PHIToOpcodes.try_emplace(V).first->getSecond();
26316 if (!Opcodes.empty())
26317 continue;
26318 SmallVector<Value *, 4> Nodes(1, V);
26319 SmallPtrSet<Value *, 4> Visited;
26320 while (!Nodes.empty()) {
26321 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26322 if (!Visited.insert(PHI).second)
26323 continue;
26324 for (Value *V : PHI->incoming_values()) {
26325 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26326 Nodes.push_back(PHI1);
26327 continue;
26328 }
26329 Opcodes.emplace_back(V);
26330 }
26331 }
26332 }
26333
26334 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26335 Incoming, PHICompare, AreCompatiblePHIs,
26336 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26337 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26338 },
26339 /*MaxVFOnly=*/true, R);
26340 Changed |= HaveVectorizedPhiNodes;
26341 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26342 auto *PHI = dyn_cast<PHINode>(P.first);
26343 return !PHI || R.isDeleted(PHI);
26344 }))
26345 PHIToOpcodes.clear();
26346 VisitedInstrs.insert_range(Incoming);
26347 } while (HaveVectorizedPhiNodes);
26348
26349 VisitedInstrs.clear();
26350
26351 InstSetVector PostProcessInserts;
26352 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26353 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26354 // also vectorizes `PostProcessCmps`.
26355 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26356 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26357 if (VectorizeCmps) {
26358 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26359 PostProcessCmps.clear();
26360 }
26361 PostProcessInserts.clear();
26362 return Changed;
26363 };
26364 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26365 auto IsInPostProcessInstrs = [&](Instruction *I) {
26366 if (auto *Cmp = dyn_cast<CmpInst>(I))
26367 return PostProcessCmps.contains(Cmp);
26368 return isa<InsertElementInst, InsertValueInst>(I) &&
26369 PostProcessInserts.contains(I);
26370 };
26371 // Returns true if `I` is an instruction without users, like terminator, or
26372 // function call with an ignored return value, or a store. Unused instructions
26373 // are ignored based on the instruction type (except for CallInst and InvokeInst).
26374 auto HasNoUsers = [](Instruction *I) {
26375 return I->use_empty() &&
26376 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26377 };
26378 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26379 // Skip instructions with scalable types. The number of elements is unknown
26380 // at compile time for scalable types.
26381 if (isa<ScalableVectorType>(It->getType()))
26382 continue;
26383
26384 // Skip instructions marked for the deletion.
26385 if (R.isDeleted(&*It))
26386 continue;
26387 // We may go through BB multiple times, so skip the ones we have already checked.
26388 if (!VisitedInstrs.insert(&*It).second) {
26389 if (HasNoUsers(&*It) &&
26390 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26391 // We would like to start over since some instructions are deleted
26392 // and the iterator may become invalid.
26393 Changed = true;
26394 It = BB->begin();
26395 E = BB->end();
26396 }
26397 continue;
26398 }
26399
26400 // Try to vectorize reductions that use PHINodes.
26401 if (PHINode *P = dyn_cast<PHINode>(It)) {
26402 // Check that the PHI is a reduction PHI.
26403 if (P->getNumIncomingValues() == 2) {
26404 // Try to match and vectorize a horizontal reduction.
26405 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26406 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26407 Changed = true;
26408 It = BB->begin();
26409 E = BB->end();
26410 continue;
26411 }
26412 }
26413 // Try to vectorize the incoming values of the PHI, to catch reductions
26414 // that feed into PHIs.
26415 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26416 // Skip if the incoming block is the current BB for now. Also, bypass
26417 // unreachable IR for efficiency and to avoid crashing.
26418 // TODO: Collect the skipped incoming values and try to vectorize them
26419 // after processing BB.
26420 if (BB == P->getIncomingBlock(I) ||
26421 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26422 continue;
26423
26424 // Postponed instructions should not be vectorized here, delay their
26425 // vectorization.
26426 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26427 PI && !IsInPostProcessInstrs(PI)) {
26428 bool Res =
26429 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26430 Changed |= Res;
26431 if (Res && R.isDeleted(P)) {
26432 It = BB->begin();
26433 E = BB->end();
26434 break;
26435 }
26436 }
26437 }
26438 continue;
26439 }
26440
26441 if (HasNoUsers(&*It)) {
26442 bool OpsChanged = false;
26443 auto *SI = dyn_cast<StoreInst>(It);
26444 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26445 if (SI) {
26446 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26447 // Try to vectorize chain in store, if this is the only store to the
26448 // address in the block.
26449 // TODO: This is just a temporary solution to save compile time. Need
26450 // to investigate if we can safely turn on slp-vectorize-hor-store
26451 // instead to allow lookup for reduction chains in all non-vectorized
26452 // stores (need to check side effects and compile time).
26453 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26454 SI->getValueOperand()->hasOneUse();
26455 }
26456 if (TryToVectorizeRoot) {
26457 for (auto *V : It->operand_values()) {
26458 // Postponed instructions should not be vectorized here, delay their
26459 // vectorization.
26460 if (auto *VI = dyn_cast<Instruction>(V);
26461 VI && !IsInPostProcessInstrs(VI))
26462 // Try to match and vectorize a horizontal reduction.
26463 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26464 }
26465 }
26466 // Start vectorization of post-process list of instructions from the
26467 // top-tree instructions to try to vectorize as many instructions as
26468 // possible.
26469 OpsChanged |=
26470 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26471 if (OpsChanged) {
26472 // We would like to start over since some instructions are deleted
26473 // and the iterator may become invalid.
26474 Changed = true;
26475 It = BB->begin();
26476 E = BB->end();
26477 continue;
26478 }
26479 }
26480
26481 if (isa<InsertElementInst, InsertValueInst>(It))
26482 PostProcessInserts.insert(&*It);
26483 else if (isa<CmpInst>(It))
26484 PostProcessCmps.insert(cast<CmpInst>(&*It));
26485 }
26486
26487 return Changed;
26488}
26489
26490bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26491 auto Changed = false;
26492 for (auto &Entry : GEPs) {
26493 // If the getelementptr list has fewer than two elements, there's nothing
26494 // to do.
26495 if (Entry.second.size() < 2)
26496 continue;
26497
26498 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26499 << Entry.second.size() << ".\n");
26500
26501 // Process the GEP list in chunks suitable for the target's supported
26502 // vector size. If a vector register can't hold 1 element, we are done. We
26503 // are trying to vectorize the index computations, so the maximum number of
26504 // elements is based on the size of the index expression, rather than the
26505 // size of the GEP itself (the target's pointer size).
26506 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26507 return !R.isDeleted(GEP);
26508 });
26509 if (It == Entry.second.end())
26510 continue;
26511 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26512 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26513 if (MaxVecRegSize < EltSize)
26514 continue;
26515
26516 unsigned MaxElts = MaxVecRegSize / EltSize;
26517 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26518 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26519 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26520
26521 // Initialize a set of candidate getelementptrs. Note that we use a
26522 // SetVector here to preserve program order. If the index computations
26523 // are vectorizable and begin with loads, we want to minimize the chance
26524 // of having to reorder them later.
26525 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26526
26527 // Some of the candidates may have already been vectorized after we
26528 // initially collected them, or their index was optimized to a constant value.
26529 // If so, they are marked as deleted, so remove them from the set of
26530 // candidates.
26531 Candidates.remove_if([&R](Value *I) {
26532 return R.isDeleted(cast<Instruction>(I)) ||
26533 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26534 });
26535
26536 // Remove from the set of candidates all pairs of getelementptrs with
26537 // constant differences. Such getelementptrs are likely not good
26538 // candidates for vectorization in a bottom-up phase since one can be
26539 // computed from the other. We also ensure all candidate getelementptr
26540 // indices are unique.
26541 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26542 auto *GEPI = GEPList[I];
26543 if (!Candidates.count(GEPI))
26544 continue;
26545 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26546 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26547 auto *GEPJ = GEPList[J];
26548 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26549 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26550 Candidates.remove(GEPI);
26551 Candidates.remove(GEPJ);
26552 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26553 Candidates.remove(GEPJ);
26554 }
26555 }
26556 }
26557
26558 // We break out of the above computation as soon as we know there are
26559 // fewer than two candidates remaining.
26560 if (Candidates.size() < 2)
26561 continue;
26562
26563 // Add the single, non-constant index of each candidate to the bundle. We
26564 // ensured the indices met these constraints when we originally collected
26565 // the getelementptrs.
26566 SmallVector<Value *, 16> Bundle(Candidates.size());
26567 auto BundleIndex = 0u;
26568 for (auto *V : Candidates) {
26569 auto *GEP = cast<GetElementPtrInst>(V);
26570 auto *GEPIdx = GEP->idx_begin()->get();
26571 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26572 Bundle[BundleIndex++] = GEPIdx;
26573 }
26574
26575 // Try and vectorize the indices. We are currently only interested in
26576 // gather-like cases of the form:
26577 //
26578 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26579 //
26580 // where the loads of "a", the loads of "b", and the subtractions can be
26581 // performed in parallel. It's likely that detecting this pattern in a
26582 // bottom-up phase will be simpler and less costly than building a
26583 // full-blown top-down phase beginning at the consecutive loads.
26584 Changed |= tryToVectorizeList(Bundle, R);
26585 }
26586 }
26587 return Changed;
26588}
26589
26590bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26591 bool Changed = false;
26592 // Sort by type, base pointers and value operands. Value operands must be
26593 // compatible (have the same opcode, same parent), otherwise it is
26594 // definitely not profitable to try to vectorize them.
26595 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26596 if (V->getValueOperand()->getType()->getTypeID() <
26597 V2->getValueOperand()->getType()->getTypeID())
26598 return true;
26599 if (V->getValueOperand()->getType()->getTypeID() >
26600 V2->getValueOperand()->getType()->getTypeID())
26601 return false;
26602 if (V->getPointerOperandType()->getTypeID() <
26603 V2->getPointerOperandType()->getTypeID())
26604 return true;
26605 if (V->getPointerOperandType()->getTypeID() >
26606 V2->getPointerOperandType()->getTypeID())
26607 return false;
26608 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26609 V2->getValueOperand()->getType()->getScalarSizeInBits())
26610 return true;
26611 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26612 V2->getValueOperand()->getType()->getScalarSizeInBits())
26613 return false;
26614 // UndefValues are compatible with all other values.
26615 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26616 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26617 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26618 DT->getNode(I1->getParent());
26619 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26620 DT->getNode(I2->getParent());
26621 assert(NodeI1 && "Should only process reachable instructions");
26622 assert(NodeI2 && "Should only process reachable instructions");
26623 assert((NodeI1 == NodeI2) ==
26624 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26625 "Different nodes should have different DFS numbers");
26626 if (NodeI1 != NodeI2)
26627 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26628 return I1->getOpcode() < I2->getOpcode();
26629 }
26630 return V->getValueOperand()->getValueID() <
26631 V2->getValueOperand()->getValueID();
26632 };
26633
26634 bool SameParent = true;
26635 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26636 if (VL.empty()) {
26637 SameParent = true;
26638 return true;
26639 }
26640 StoreInst *V2 = VL.back();
26641 if (V1 == V2)
26642 return true;
26643 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26644 return false;
26645 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26646 return false;
26647 // Undefs are compatible with any other value.
26648 if (isa<UndefValue>(V1->getValueOperand()) ||
26649 isa<UndefValue>(V2->getValueOperand()))
26650 return true;
26651 if (isa<Constant>(V1->getValueOperand()) &&
26652 isa<Constant>(V2->getValueOperand()))
26653 return true;
26654 // Check if the operands of the stores can be vectorized. They can be
26655 // vectorized if they have compatible operands, or operands that can be
26656 // vectorized as copyables.
26657 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26658 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26659 if (I1 || I2) {
26660 // Accept only tail-following non-compatible values for now.
26661 // TODO: investigate if it is possible to vectorize incompatible values,
26662 // if the copyables are first in the list.
26663 if (I1 && !I2)
26664 return false;
26665 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26666 SmallVector<Value *> NewVL(VL.size() + 1);
26667 for (auto [SI, V] : zip(VL, NewVL))
26668 V = SI->getValueOperand();
26669 NewVL.back() = V1->getValueOperand();
26670 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26671 InstructionsState S = Analysis.buildInstructionsState(
26672 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26673 /*SkipSameCodeCheck=*/!SameParent);
26674 if (S)
26675 return true;
26676 if (!SameParent)
26677 return false;
26678 }
26679 return V1->getValueOperand()->getValueID() ==
26680 V2->getValueOperand()->getValueID();
26681 };
26682
26683 // Attempt to sort and vectorize each of the store-groups.
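 // Stores buckets the seed stores by their underlying base object. Each
 // bucket is re-sorted bottom-up and clustered by StoreSorter /
 // AreCompatibleStores, and every compatible run is handed to
 // vectorizeStores. The Attempted set is shared across all buckets so that
 // candidate slices which were already examined are not re-analyzed.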
26684 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26685 for (auto &Pair : Stores) {
26686 if (Pair.second.size() < 2)
26687 continue;
26688
26689 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26690 << Pair.second.size() << ".\n");
26691
26692 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26693 continue;
26694
 26695 // Reverse the stores to do bottom-to-top analysis. This is important when
 26696 // the same addresses are stored to several times; in that case we need to
 26697 // follow the store order (reversed, to honor the memory dependencies).
26698 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
 26699 Pair.second.rend());
 26700 Changed |= tryToVectorizeSequence<StoreInst>(
 26701 ReversedStores, StoreSorter, AreCompatibleStores,
26702 [&](ArrayRef<StoreInst *> Candidates, bool) {
26703 return vectorizeStores(Candidates, R, Attempted);
26704 },
26705 /*MaxVFOnly=*/false, R);
26706 }
26707 return Changed;
26708}
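The loop above is an instance of the generic sort-then-group pattern used by this pass: order a worklist with a comparator, grow runs that a compatibility predicate accepts, and hand each run to a vectorization callback (here tryToVectorizeSequence<StoreInst> with vectorizeStores as the helper). The standalone sketch below models only that skeleton under simplified assumptions; groupAndVectorize, its callback parameters, and the toy main are illustrative names rather than LLVM APIs, and the real tryToVectorizeSequence layers additional logic (for example the MaxVFOnly handling visible in the call above) on top of this shape.

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Illustrative skeleton of the sort-then-group driver (not the LLVM code):
// sort the worklist, grow runs the compatibility predicate accepts, and call
// TryToVectorize on every run of at least two elements.
template <typename T>
bool groupAndVectorize(
    std::vector<T> &Worklist, std::function<bool(const T &, const T &)> Less,
    std::function<bool(const std::vector<T> &, const T &)> AreCompatible,
    std::function<bool(const std::vector<T> &)> TryToVectorize) {
  // A stable sort keeps the original order among elements the comparator
  // treats as equivalent, which matters when that order encodes dependencies.
  std::stable_sort(Worklist.begin(), Worklist.end(), Less);

  bool Changed = false;
  std::vector<T> Run;
  auto Flush = [&] {
    if (Run.size() >= 2)
      Changed |= TryToVectorize(Run);
    Run.clear();
  };
  for (const T &Elt : Worklist) {
    if (!Run.empty() && !AreCompatible(Run, Elt))
      Flush();
    Run.push_back(Elt);
  }
  Flush();
  return Changed;
}

int main() {
  // Toy usage: group integers by sign and "vectorize" (print) each group.
  std::vector<int> Vals = {3, -1, 4, -5, 9, -2, 6};
  groupAndVectorize<int>(
      Vals, [](int A, int B) { return A < B; },
      [](const std::vector<int> &Run, int V) {
        return (Run.back() < 0) == (V < 0);
      },
      [](const std::vector<int> &Run) {
        std::printf("run of %zu elements\n", Run.size());
        return true;
      });
  return 0;
}

The stable sort plays the role StoreSorter plays above: a strict weak ordering that only brings potentially compatible elements next to each other, while the compatibility predicate and the callback decide what actually gets vectorized.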
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
Definition ExpandFp.cpp:993
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void negate()
Negate this APInt in place.
Definition APInt.h:1468
unsigned logBase2() const
Definition APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:162
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
bool erase(const KeyT &Val)
Definition DenseMap.h:311
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:163
iterator end()
Definition DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:213
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:158
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:149
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:111
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
T & front() const
front - Get the first element.
Definition ArrayRef.h:354
iterator end() const
Definition ArrayRef.h:348
iterator begin() const
Definition ArrayRef.h:347
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:131
void insert_range(Range &&R)
Definition SetVector.h:175
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:93
void clear()
Completely clear the SetVector.
Definition SetVector.h:266
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:251
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:181
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
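A hedged sketch of how the Vector Function Database is typically consulted for a call (CI is an assumed CallInst; not code from this file):

  // Sketch: check whether any vector variant is registered for CI.
  llvm::SmallVector<llvm::VFInfo, 8> Mappings = llvm::VFDatabase::getMappings(CI);
  bool HasVectorVariant = !Mappings.empty();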
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
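A short sketch tying a few of the Value use-list queries above together (the helper itself is an assumption for illustration):

  // Sketch: a value qualifies here only if it has uses and every user is an
  // instruction.
  static bool allUsersAreInstructions(const llvm::Value *V) {
    return !V->use_empty() &&
           llvm::all_of(V->users(), [](const llvm::User *U) {
             return llvm::isa<llvm::Instruction>(U);
           });
  }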
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
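As an illustrative sketch of the VectorType factory above (the helper name and the fixed element count are assumptions):

  // Sketch: build a fixed-width vector type only for valid element types.
  static llvm::VectorType *makeVecTy(llvm::Type *EltTy, unsigned NumElts) {
    if (!llvm::VectorType::isValidElementType(EltTy))
      return nullptr;
    return llvm::VectorType::get(EltTy, llvm::ElementCount::getFixed(NumElts));
  }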
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
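The BoUpSLP entry points above compose into a build/cost/codegen cycle; the following is a simplified, hedged sketch of that sequence (it omits reordering, transformNodes and the cost-threshold handling of the real driver, and assumes the in-file context where BoUpSLP is visible; R, Roots and Ignored are assumed to be set up by the caller):

  // Sketch: only vectorize when the modeled cost of the tree is negative.
  static bool trySLPVectorize(BoUpSLP &R, llvm::ArrayRef<llvm::Value *> Roots,
                              const llvm::SmallDenseSet<llvm::Value *> &Ignored) {
    R.deleteTree();                                  // reset internal state
    R.buildTree(Roots, Ignored);                     // build the use-def tree
    if (R.isTreeTinyAndNotFullyVectorizable())
      return false;                                  // not worth costing
    R.buildExternalUses();                           // record externally used scalars
    R.computeMinimumValueSizes();                    // min-bitwidth analysis
    llvm::InstructionCost Cost = R.getTreeCost();
    if (!Cost.isValid() || Cost >= 0)
      return false;                                  // not profitable
    R.vectorizeTree();                               // emit vector code
    return true;
  }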
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
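To show how these PatternMatch combinators are used in practice, a minimal hedged sketch (the matched shape and the bound names are illustrative, not a pattern this pass necessarily looks for):

  // Sketch: recognise "or (one-use shl X, C), Y" and bind the pieces.
  #include "llvm/IR/PatternMatch.h"
  static bool matchShlOr(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y,
                         const llvm::APInt *&C) {
    using namespace llvm::PatternMatch;
    return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(C))), m_Value(Y)));
  }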
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1718
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
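A short hedged sketch of the range helpers listed here, applied to an assumed operand list VL of Value pointers:

  // Sketch: range-based checks and iteration instead of explicit begin()/end().
  bool AllLoads =
      llvm::all_of(VL, [](llvm::Value *V) { return llvm::isa<llvm::LoadInst>(V); });
  for (auto [Idx, V] : llvm::enumerate(VL))
    llvm::dbgs() << "lane " << Idx << ": " << *V << "\n";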
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1724
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is equal to Skew mod Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1981
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2128
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1968
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1763
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:339
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1920
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
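As a hedged sketch of how the pointer-distance helper above is typically used to test for consecutive accesses (EltTy, PtrA, PtrB, DL and SE are assumed to be in scope):

  // Sketch: two pointers are consecutive when their element-wise distance is 1.
  std::optional<int64_t> Dist =
      llvm::getPointersDiff(EltTy, PtrA, EltTy, PtrB, DL, SE, /*StrictCheck=*/true);
  bool Consecutive = Dist && *Dist == 1;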
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
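A minimal hedged sketch of emitting one of these recurrence kinds with the reduction helper listed above (Builder and Src are assumed to be set up by the caller; the wrapper name is illustrative):

  // Sketch: lower an integer-add reduction of the vector Src.
  static llvm::Value *emitAddReduction(llvm::IRBuilderBase &Builder,
                                       llvm::Value *Src) {
    return llvm::createSimpleReduction(Builder, Src, llvm::RecurKind::Add);
  }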
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2030
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1835
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:830
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:257
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1425
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1434
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const